src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11 from ieee754.pipeline import PipelineSpec
  12 from nmutil.pipemodbase import PipeModBase
  13
  14
  15 class PartitionPoints(dict):
  16     """Partition points and corresponding ``Value``s.
  17
  18     The points at where an ALU is partitioned along with ``Value``s that
  19     specify if the corresponding partition points are enabled.
  20
  21     For example: ``{1: True, 5: True, 10: True}`` with
  22     ``width == 16`` specifies that the ALU is split into 4 sections:
  23     * bits 0 <= ``i`` < 1
  24     * bits 1 <= ``i`` < 5
  25     * bits 5 <= ``i`` < 10
  26     * bits 10 <= ``i`` < 16
  27
  28     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  29     where ``a`` is a 1-bit ``Signal``:
  30     * If ``a`` is asserted:
  31         * bits 0 <= ``i`` < 1
  32         * bits 1 <= ``i`` < 5
  33         * bits 5 <= ``i`` < 10
  34         * bits 10 <= ``i`` < 16
  35     * Otherwise
  36         * bits 0 <= ``i`` < 1
  37         * bits 1 <= ``i`` < 10
  38         * bits 10 <= ``i`` < 16
  39     """
  40
  41     def __init__(self, partition_points=None):
  42         """Create a new ``PartitionPoints``.
  43
  44         :param partition_points: the input partition points to values mapping.
  45         """
  46         super().__init__()
  47         if partition_points is not None:
  48             for point, enabled in partition_points.items():
  49                 if not isinstance(point, int):
  50                     raise TypeError("point must be a non-negative integer")
  51                 if point < 0:
  52                     raise ValueError("point must be a non-negative integer")
  53                 self[point] = Value.wrap(enabled)
  54
  55     def like(self, name=None, src_loc_at=0, mul=1):
  56         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  57
  58         :param name: the base name for the new ``Signal``s.
  59         :param mul: a multiplication factor on the indices
  60         """
  61         if name is None:
  62             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  63         retval = PartitionPoints()
  64         for point, enabled in self.items():
  65             point *= mul
  66             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  67         return retval
  68
  69     def eq(self, rhs):
  70         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  71         if set(self.keys()) != set(rhs.keys()):
  72             raise ValueError("incompatible point set")
  73         for point, enabled in self.items():
  74             yield enabled.eq(rhs[point])
  75
  76     def as_mask(self, width, mul=1):
  77         """Create a bit-mask from `self`.
  78
  79         Each bit in the returned mask is clear only if the partition point at
  80         the same bit-index is enabled.
  81
  82         :param width: the bit width of the resulting mask
  83         :param mul: a "multiplier" which in-place expands the partition points
  84                     typically set to "2" when used for multipliers
  85         """
  86         bits = []
  87         for i in range(width):
  88             i /= mul
  89             if i.is_integer() and int(i) in self:
  90                 bits.append(~self[i])
  91             else:
  92                 bits.append(True)
  93         return Cat(*bits)
  94
  95     def get_max_partition_count(self, width):
  96         """Get the maximum number of partitions.
  97
  98         Gets the number of partitions when all partition points are enabled.
  99         """
 100         retval = 1
 101         for point in self.keys():
 102             if point < width:
 103                 retval += 1
 104         return retval
 105
 106     def fits_in_width(self, width):
 107         """Check if all partition points are smaller than `width`."""
 108         for point in self.keys():
 109             if point >= width:
 110                 return False
 111         return True
 112
 113     def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
 114         if index == -1 or index == 7:
 115             return C(True, 1)
 116         assert index >= 0 and index < 8
 117         return self[(index * 8 + 8)*mfactor]
 118
 119
 120 class FullAdder(Elaboratable):
 121     """Full Adder.
 122
 123     :attribute in0: the first input
 124     :attribute in1: the second input
 125     :attribute in2: the third input
 126     :attribute sum: the sum output
 127     :attribute carry: the carry output
 128
 129     Rather than do individual full adders (and have an array of them,
 130     which would be very slow to simulate), this module can specify the
 131     bit width of the inputs and outputs: in effect it performs multiple
 132     Full 3-2 Add operations "in parallel".
 133     """
 134
 135     def __init__(self, width):
 136         """Create a ``FullAdder``.
 137
 138         :param width: the bit width of the input and output
 139         """
 140         self.in0 = Signal(width, reset_less=True)
 141         self.in1 = Signal(width, reset_less=True)
 142         self.in2 = Signal(width, reset_less=True)
 143         self.sum = Signal(width, reset_less=True)
 144         self.carry = Signal(width, reset_less=True)
 145
 146     def elaborate(self, platform):
 147         """Elaborate this module."""
 148         m = Module()
 149         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 150         m.d.comb += self.carry.eq((self.in0 & self.in1)
 151                                   | (self.in1 & self.in2)
 152                                   | (self.in2 & self.in0))
 153         return m
 154
 155
 156 class MaskedFullAdder(Elaboratable):
 157     """Masked Full Adder.
 158
 159     :attribute mask: the carry partition mask
 160     :attribute in0: the first input
 161     :attribute in1: the second input
 162     :attribute in2: the third input
 163     :attribute sum: the sum output
 164     :attribute mcarry: the masked carry output
 165
 166     FullAdders are always used with a "mask" on the output.  To keep
 167     the graphviz "clean", this class performs the masking here rather
 168     than inside a large for-loop.
 169
 170     See the following discussion as to why this is no longer derived
 171     from FullAdder.  Each carry is shifted here *before* being ANDed
 172     with the mask, so that an AOI cell may be used (which is more
 173     gate-efficient)
 174     https://en.wikipedia.org/wiki/AND-OR-Invert
 175     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
 176     """
 177
 178     def __init__(self, width):
 179         """Create a ``MaskedFullAdder``.
 180
 181         :param width: the bit width of the input and output
 182         """
 183         self.width = width
 184         self.mask = Signal(width, reset_less=True)
 185         self.mcarry = Signal(width, reset_less=True)
 186         self.in0 = Signal(width, reset_less=True)
 187         self.in1 = Signal(width, reset_less=True)
 188         self.in2 = Signal(width, reset_less=True)
 189         self.sum = Signal(width, reset_less=True)
 190
 191     def elaborate(self, platform):
 192         """Elaborate this module."""
 193         m = Module()
 194         s1 = Signal(self.width, reset_less=True)
 195         s2 = Signal(self.width, reset_less=True)
 196         s3 = Signal(self.width, reset_less=True)
 197         c1 = Signal(self.width, reset_less=True)
 198         c2 = Signal(self.width, reset_less=True)
 199         c3 = Signal(self.width, reset_less=True)
 200         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 201         m.d.comb += s1.eq(Cat(0, self.in0))
 202         m.d.comb += s2.eq(Cat(0, self.in1))
 203         m.d.comb += s3.eq(Cat(0, self.in2))
 204         m.d.comb += c1.eq(s1 & s2 & self.mask)
 205         m.d.comb += c2.eq(s2 & s3 & self.mask)
 206         m.d.comb += c3.eq(s3 & s1 & self.mask)
 207         m.d.comb += self.mcarry.eq(c1 | c2 | c3)
 208         return m
 209
 210
 211 class PartitionedAdder(Elaboratable):
 212     """Partitioned Adder.
 213
 214     Performs the final add.  The partition points are included in the
 215     actual add (in one of the operands only), which causes a carry over
 216     to the next bit.  Then the final output *removes* the extra bits from
 217     the result.
 218
 219     partition: .... P... P... P... P... (32 bits)
 220     a        : .... .... .... .... .... (32 bits)
 221     b        : .... .... .... .... .... (32 bits)
 222     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 223     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 224     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 225     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 226
 227     :attribute width: the bit width of the input and output. Read-only.
 228     :attribute a: the first input to the adder
 229     :attribute b: the second input to the adder
 230     :attribute output: the sum output
 231     :attribute partition_points: the input partition points. Modification not
 232         supported, except for by ``Signal.eq``.
 233     """
 234
 235     def __init__(self, width, partition_points, partition_step=1):
 236         """Create a ``PartitionedAdder``.
 237
 238         :param width: the bit width of the input and output
 239         :param partition_points: the input partition points
 240         :param partition_step: a multiplier (typically double) step
 241                                which in-place "expands" the partition points
 242         """
 243         self.width = width
 244         self.pmul = partition_step
 245         self.a = Signal(width, reset_less=True)
 246         self.b = Signal(width, reset_less=True)
 247         self.output = Signal(width, reset_less=True)
 248         self.partition_points = PartitionPoints(partition_points)
 249         if not self.partition_points.fits_in_width(width):
 250             raise ValueError("partition_points doesn't fit in width")
 251         expanded_width = 0
 252         for i in range(self.width):
 253             if i in self.partition_points:
 254                 expanded_width += 1
 255             expanded_width += 1
 256         self._expanded_width = expanded_width
 257
 258     def elaborate(self, platform):
 259         """Elaborate this module."""
 260         m = Module()
 261         expanded_a = Signal(self._expanded_width, reset_less=True)
 262         expanded_b = Signal(self._expanded_width, reset_less=True)
 263         expanded_o = Signal(self._expanded_width, reset_less=True)
 264
 265         expanded_index = 0
 266         # store bits in a list, use Cat later.  graphviz is much cleaner
 267         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 268
 269         # partition points are "breaks" (extra zeros or 1s) in what would
 270         # otherwise be a massive long add.  when the "break" points are 0,
 271         # whatever is in it (in the output) is discarded.  however when
 272         # there is a "1", it causes a roll-over carry to the *next* bit.
 273         # we still ignore the "break" bit in the [intermediate] output,
 274         # however by that time we've got the effect that we wanted: the
 275         # carry has been carried *over* the break point.
 276
 277         for i in range(self.width):
 278             pi = i/self.pmul # double the range of the partition point test
 279             if pi.is_integer() and pi in self.partition_points:
 280                 # add extra bit set to 0 + 0 for enabled partition points
 281                 # and 1 + 0 for disabled partition points
 282                 ea.append(expanded_a[expanded_index])
 283                 al.append(~self.partition_points[pi]) # add extra bit in a
 284                 eb.append(expanded_b[expanded_index])
 285                 bl.append(C(0)) # yes, add a zero
 286                 expanded_index += 1 # skip the extra point.  NOT in the output
 287             ea.append(expanded_a[expanded_index])
 288             eb.append(expanded_b[expanded_index])
 289             eo.append(expanded_o[expanded_index])
 290             al.append(self.a[i])
 291             bl.append(self.b[i])
 292             ol.append(self.output[i])
 293             expanded_index += 1
 294
 295         # combine above using Cat
 296         m.d.comb += Cat(*ea).eq(Cat(*al))
 297         m.d.comb += Cat(*eb).eq(Cat(*bl))
 298         m.d.comb += Cat(*ol).eq(Cat(*eo))
 299
 300         # use only one addition to take advantage of look-ahead carry and
 301         # special hardware on FPGAs
 302         m.d.comb += expanded_o.eq(expanded_a + expanded_b)
 303         return m
 304
 305
# Number of input terms consumed by one full adder.  The add-reduce
# grouping and remainder arithmetic in this file is expressed in terms of it.
FULL_ADDER_INPUT_COUNT = 3
 307
 308 class AddReduceData:
 309
 310     def __init__(self, part_pts, n_inputs, output_width, n_parts):
 311         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 312                           for i in range(n_parts)]
 313         self.terms = [Signal(output_width, name=f"inputs_{i}",
 314                               reset_less=True)
 315                         for i in range(n_inputs)]
 316         self.part_pts = part_pts.like()
 317
 318     def eq_from(self, part_pts, inputs, part_ops):
 319         return [self.part_pts.eq(part_pts)] + \
 320                [self.terms[i].eq(inputs[i])
 321                                      for i in range(len(self.terms))] + \
 322                [self.part_ops[i].eq(part_ops[i])
 323                                      for i in range(len(self.part_ops))]
 324
 325     def eq(self, rhs):
 326         return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
 327
 328
 329 class FinalReduceData:
 330
 331     def __init__(self, part_pts, output_width, n_parts):
 332         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 333                           for i in range(n_parts)]
 334         self.output = Signal(output_width, reset_less=True)
 335         self.part_pts = part_pts.like()
 336
 337     def eq_from(self, part_pts, output, part_ops):
 338         return [self.part_pts.eq(part_pts)] + \
 339                [self.output.eq(output)] + \
 340                [self.part_ops[i].eq(part_ops[i])
 341                                      for i in range(len(self.part_ops))]
 342
 343     def eq(self, rhs):
 344         return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
 345
 346
 347 class FinalAdd(PipeModBase):
 348     """ Final stage of add reduce
 349     """
 350
 351     def __init__(self, pspec, lidx, n_inputs, partition_points,
 352                        partition_step=1):
 353         self.lidx = lidx
 354         self.partition_step = partition_step
 355         self.output_width = pspec.width * 2
 356         self.n_inputs = n_inputs
 357         self.n_parts = pspec.n_parts
 358         self.partition_points = PartitionPoints(partition_points)
 359         if not self.partition_points.fits_in_width(self.output_width):
 360             raise ValueError("partition_points doesn't fit in output_width")
 361
 362         super().__init__(pspec, "finaladd")
 363
 364     def ispec(self):
 365         return AddReduceData(self.partition_points, self.n_inputs,
 366                              self.output_width, self.n_parts)
 367
 368     def ospec(self):
 369         return FinalReduceData(self.partition_points,
 370                                  self.output_width, self.n_parts)
 371
 372     def elaborate(self, platform):
 373         """Elaborate this module."""
 374         m = Module()
 375
 376         output_width = self.output_width
 377         output = Signal(output_width, reset_less=True)
 378         if self.n_inputs == 0:
 379             # use 0 as the default output value
 380             m.d.comb += output.eq(0)
 381         elif self.n_inputs == 1:
 382             # handle single input
 383             m.d.comb += output.eq(self.i.terms[0])
 384         else:
 385             # base case for adding 2 inputs
 386             assert self.n_inputs == 2
 387             adder = PartitionedAdder(output_width,
 388                                      self.i.part_pts, self.partition_step)
 389             m.submodules.final_adder = adder
 390             m.d.comb += adder.a.eq(self.i.terms[0])
 391             m.d.comb += adder.b.eq(self.i.terms[1])
 392             m.d.comb += output.eq(adder.output)
 393
 394         # create output
 395         m.d.comb += self.o.eq_from(self.i.part_pts, output,
 396                                    self.i.part_ops)
 397
 398         return m
 399
 400
 401 class AddReduceSingle(PipeModBase):
 402     """Add list of numbers together.
 403
 404     :attribute inputs: input ``Signal``s to be summed. Modification not
 405         supported, except for by ``Signal.eq``.
 406     :attribute register_levels: List of nesting levels that should have
 407         pipeline registers.
 408     :attribute output: output sum.
 409     :attribute partition_points: the input partition points. Modification not
 410         supported, except for by ``Signal.eq``.
 411     """
 412
 413     def __init__(self, pspec, lidx, n_inputs, partition_points,
 414                        partition_step=1):
 415         """Create an ``AddReduce``.
 416
 417         :param inputs: input ``Signal``s to be summed.
 418         :param output_width: bit-width of ``output``.
 419         :param partition_points: the input partition points.
 420         """
 421         self.lidx = lidx
 422         self.partition_step = partition_step
 423         self.n_inputs = n_inputs
 424         self.n_parts = pspec.n_parts
 425         self.output_width = pspec.width * 2
 426         self.partition_points = PartitionPoints(partition_points)
 427         if not self.partition_points.fits_in_width(self.output_width):
 428             raise ValueError("partition_points doesn't fit in output_width")
 429
 430         self.groups = AddReduceSingle.full_adder_groups(n_inputs)
 431         self.n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
 432
 433         super().__init__(pspec, "addreduce_%d" % lidx)
 434
 435     def ispec(self):
 436         return AddReduceData(self.partition_points, self.n_inputs,
 437                              self.output_width, self.n_parts)
 438
 439     def ospec(self):
 440         return AddReduceData(self.partition_points, self.n_terms,
 441                              self.output_width, self.n_parts)
 442
 443     @staticmethod
 444     def calc_n_inputs(n_inputs, groups):
 445         retval = len(groups)*2
 446         if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 447             retval += 1
 448         elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 449             retval += 2
 450         else:
 451             assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
 452         return retval
 453
 454     @staticmethod
 455     def get_max_level(input_count):
 456         """Get the maximum level.
 457
 458         All ``register_levels`` must be less than or equal to the maximum
 459         level.
 460         """
 461         retval = 0
 462         while True:
 463             groups = AddReduceSingle.full_adder_groups(input_count)
 464             if len(groups) == 0:
 465                 return retval
 466             input_count %= FULL_ADDER_INPUT_COUNT
 467             input_count += 2 * len(groups)
 468             retval += 1
 469
 470     @staticmethod
 471     def full_adder_groups(input_count):
 472         """Get ``inputs`` indices for which a full adder should be built."""
 473         return range(0,
 474                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 475                      FULL_ADDER_INPUT_COUNT)
 476
 477     def create_next_terms(self):
 478         """ create next intermediate terms, for linking up in elaborate, below
 479         """
 480         terms = []
 481         adders = []
 482
 483         # create full adders for this recursive level.
 484         # this shrinks N terms to 2 * (N // 3) plus the remainder
 485         for i in self.groups:
 486             adder_i = MaskedFullAdder(self.output_width)
 487             adders.append((i, adder_i))
 488             # add both the sum and the masked-carry to the next level.
 489             # 3 inputs have now been reduced to 2...
 490             terms.append(adder_i.sum)
 491             terms.append(adder_i.mcarry)
 492         # handle the remaining inputs.
 493         if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 494             terms.append(self.i.terms[-1])
 495         elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 496             # Just pass the terms to the next layer, since we wouldn't gain
 497             # anything by using a half adder since there would still be 2 terms
 498             # and just passing the terms to the next layer saves gates.
 499             terms.append(self.i.terms[-2])
 500             terms.append(self.i.terms[-1])
 501         else:
 502             assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
 503
 504         return terms, adders
 505
 506     def elaborate(self, platform):
 507         """Elaborate this module."""
 508         m = Module()
 509
 510         terms, adders = self.create_next_terms()
 511
 512         # copy the intermediate terms to the output
 513         for i, value in enumerate(terms):
 514             m.d.comb += self.o.terms[i].eq(value)
 515
 516         # copy reg part points and part ops to output
 517         m.d.comb += self.o.part_pts.eq(self.i.part_pts)
 518         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
 519                                      for i in range(len(self.i.part_ops))]
 520
 521         # set up the partition mask (for the adders)
 522         part_mask = Signal(self.output_width, reset_less=True)
 523
 524         # get partition points as a mask
 525         mask = self.i.part_pts.as_mask(self.output_width,
 526                                        mul=self.partition_step)
 527         m.d.comb += part_mask.eq(mask)
 528
 529         # add and link the intermediate term modules
 530         for i, (iidx, adder_i) in enumerate(adders):
 531             setattr(m.submodules, f"adder_{i}", adder_i)
 532
 533             m.d.comb += adder_i.in0.eq(self.i.terms[iidx])
 534             m.d.comb += adder_i.in1.eq(self.i.terms[iidx + 1])
 535             m.d.comb += adder_i.in2.eq(self.i.terms[iidx + 2])
 536             m.d.comb += adder_i.mask.eq(part_mask)
 537
 538         return m
 539
 540
 541 class AddReduceInternal:
 542     """Recursively Add list of numbers together.
 543
 544     :attribute inputs: input ``Signal``s to be summed. Modification not
 545         supported, except for by ``Signal.eq``.
 546     :attribute register_levels: List of nesting levels that should have
 547         pipeline registers.
 548     :attribute output: output sum.
 549     :attribute partition_points: the input partition points. Modification not
 550         supported, except for by ``Signal.eq``.
 551     """
 552
 553     def __init__(self, i, pspec, partition_step=1):
 554         """Create an ``AddReduce``.
 555
 556         :param inputs: input ``Signal``s to be summed.
 557         :param output_width: bit-width of ``output``.
 558         :param partition_points: the input partition points.
 559         """
 560         self.i = i
 561         self.pspec = pspec
 562         self.inputs = i.terms
 563         self.part_ops = i.part_ops
 564         self.output_width = pspec.width * 2
 565         self.partition_points = i.part_pts
 566         self.partition_step = partition_step
 567
 568         self.create_levels()
 569
 570     def create_levels(self):
 571         """creates reduction levels"""
 572
 573         mods = []
 574         partition_points = self.partition_points
 575         part_ops = self.part_ops
 576         n_parts = len(part_ops)
 577         inputs = self.inputs
 578         ilen = len(inputs)
 579         while True:
 580             groups = AddReduceSingle.full_adder_groups(len(inputs))
 581             if len(groups) == 0:
 582                 break
 583             lidx = len(mods)
 584             next_level = AddReduceSingle(self.pspec, lidx, ilen,
 585                                          partition_points,
 586                                          self.partition_step)
 587             mods.append(next_level)
 588             partition_points = next_level.i.part_pts
 589             inputs = next_level.o.terms
 590             ilen = len(inputs)
 591             part_ops = next_level.i.part_ops
 592
 593         lidx = len(mods)
 594         next_level = FinalAdd(self.pspec, lidx, ilen,
 595                               partition_points, self.partition_step)
 596         mods.append(next_level)
 597
 598         self.levels = mods
 599
 600
 601 class AddReduce(AddReduceInternal, Elaboratable):
 602     """Recursively Add list of numbers together.
 603
 604     :attribute inputs: input ``Signal``s to be summed. Modification not
 605         supported, except for by ``Signal.eq``.
 606     :attribute register_levels: List of nesting levels that should have
 607         pipeline registers.
 608     :attribute output: output sum.
 609     :attribute partition_points: the input partition points. Modification not
 610         supported, except for by ``Signal.eq``.
 611     """
 612
 613     def __init__(self, inputs, output_width, register_levels, part_pts,
 614                        part_ops, partition_step=1):
 615         """Create an ``AddReduce``.
 616
 617         :param inputs: input ``Signal``s to be summed.
 618         :param output_width: bit-width of ``output``.
 619         :param register_levels: List of nesting levels that should have
 620             pipeline registers.
 621         :param partition_points: the input partition points.
 622         """
 623         self._inputs = inputs
 624         self._part_pts = part_pts
 625         self._part_ops = part_ops
 626         n_parts = len(part_ops)
 627         self.i = AddReduceData(part_pts, len(inputs),
 628                              output_width, n_parts)
 629         AddReduceInternal.__init__(self, self.i, output_width, partition_step)
 630         self.o = FinalReduceData(part_pts, output_width, n_parts)
 631         self.register_levels = register_levels
 632
 633     @staticmethod
 634     def get_max_level(input_count):
 635         return AddReduceSingle.get_max_level(input_count)
 636
 637     @staticmethod
 638     def next_register_levels(register_levels):
 639         """``Iterable`` of ``register_levels`` for next recursive level."""
 640         for level in register_levels:
 641             if level > 0:
 642                 yield level - 1
 643
 644     def elaborate(self, platform):
 645         """Elaborate this module."""
 646         m = Module()
 647
 648         m.d.comb += self.i.eq_from(self._part_pts, self._inputs, self._part_ops)
 649
 650         for i, next_level in enumerate(self.levels):
 651             setattr(m.submodules, "next_level%d" % i, next_level)
 652
 653         i = self.i
 654         for idx in range(len(self.levels)):
 655             mcur = self.levels[idx]
 656             if idx in self.register_levels:
 657                 m.d.sync += mcur.i.eq(i)
 658             else:
 659                 m.d.comb += mcur.i.eq(i)
 660             i = mcur.o # for next loop
 661
 662         # output comes from last module
 663         m.d.comb += self.o.eq(i)
 664
 665         return m
 666
 667
# Multiply operation codes.
# NOTE(review): the names suggest that LOW selects the low half of the
# product and the *_HIGH variants the high half for the given operand
# signedness -- confirm against the code that consumes these values.
OP_MUL_LOW = 0
OP_MUL_SIGNED_HIGH = 1
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3
 672
 673
 674 def get_term(value, shift=0, enabled=None):
 675     if enabled is not None:
 676         value = Mux(enabled, value, 0)
 677     if shift > 0:
 678         value = Cat(Repl(C(0, 1), shift), value)
 679     else:
 680         assert shift == 0
 681     return value
 682
 683
 684 class ProductTerm(Elaboratable):
 685     """ this class creates a single product term (a[..]*b[..]).
 686         it has a design flaw in that is the *output* that is selected,
 687         where the multiplication(s) are combinatorially generated
 688         all the time.
 689     """
 690
 691     def __init__(self, width, twidth, pbwid, a_index, b_index):
 692         self.a_index = a_index
 693         self.b_index = b_index
 694         shift = 8 * (self.a_index + self.b_index)
 695         self.pwidth = width
 696         self.twidth = twidth
 697         self.width = width*2
 698         self.shift = shift
 699
 700         self.ti = Signal(self.width, reset_less=True)
 701         self.term = Signal(twidth, reset_less=True)
 702         self.a = Signal(twidth//2, reset_less=True)
 703         self.b = Signal(twidth//2, reset_less=True)
 704         self.pb_en = Signal(pbwid, reset_less=True)
 705
 706         self.tl = tl = []
 707         min_index = min(self.a_index, self.b_index)
 708         max_index = max(self.a_index, self.b_index)
 709         for i in range(min_index, max_index):
 710             tl.append(self.pb_en[i])
 711         name = "te_%d_%d" % (self.a_index, self.b_index)
 712         if len(tl) > 0:
 713             term_enabled = Signal(name=name, reset_less=True)
 714         else:
 715             term_enabled = None
 716         self.enabled = term_enabled
 717         self.term.name = "term_%d_%d" % (a_index, b_index) # rename
 718
 719     def elaborate(self, platform):
 720
 721         m = Module()
 722         if self.enabled is not None:
 723             m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
 724
 725         bsa = Signal(self.width, reset_less=True)
 726         bsb = Signal(self.width, reset_less=True)
 727         a_index, b_index = self.a_index, self.b_index
 728         pwidth = self.pwidth
 729         m.d.comb += bsa.eq(self.a.bit_select(a_index * pwidth, pwidth))
 730         m.d.comb += bsb.eq(self.b.bit_select(b_index * pwidth, pwidth))
 731         m.d.comb += self.ti.eq(bsa * bsb)
 732         m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
 733         """
 734         #TODO: sort out width issues, get inputs a/b switched on/off.
 735         #data going into Muxes is 1/2 the required width
 736
 737         pwidth = self.pwidth
 738         width = self.width
 739         bsa = Signal(self.twidth//2, reset_less=True)
 740         bsb = Signal(self.twidth//2, reset_less=True)
 741         asel = Signal(width, reset_less=True)
 742         bsel = Signal(width, reset_less=True)
 743         a_index, b_index = self.a_index, self.b_index
 744         m.d.comb += asel.eq(self.a.bit_select(a_index * pwidth, pwidth))
 745         m.d.comb += bsel.eq(self.b.bit_select(b_index * pwidth, pwidth))
 746         m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
 747         m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
 748         m.d.comb += self.ti.eq(bsa * bsb)
 749         m.d.comb += self.term.eq(self.ti)
 750         """
 751
 752         return m
 753
 754
 755 class ProductTerms(Elaboratable):
 756     """ creates a bank of product terms.  also performs the actual bit-selection
 757         this class is to be wrapped with a for-loop on the "a" operand.
 758         it creates a second-level for-loop on the "b" operand.
 759     """
 760     def __init__(self, width, twidth, pbwid, a_index, blen):
 761         self.a_index = a_index
 762         self.blen = blen
 763         self.pwidth = width
 764         self.twidth = twidth
 765         self.pbwid = pbwid
 766         self.a = Signal(twidth//2, reset_less=True)
 767         self.b = Signal(twidth//2, reset_less=True)
 768         self.pb_en = Signal(pbwid, reset_less=True)
 769         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 770                             for i in range(blen)]
 771
 772     def elaborate(self, platform):
 773
 774         m = Module()
 775
 776         for b_index in range(self.blen):
 777             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 778                             self.a_index, b_index)
 779             setattr(m.submodules, "term_%d" % b_index, t)
 780
 781             m.d.comb += t.a.eq(self.a)
 782             m.d.comb += t.b.eq(self.b)
 783             m.d.comb += t.pb_en.eq(self.pb_en)
 784
 785             m.d.comb += self.terms[b_index].eq(t.term)
 786
 787         return m
 788
 789
 790 class LSBNegTerm(Elaboratable):
 791
 792     def __init__(self, bit_width):
 793         self.bit_width = bit_width
 794         self.part = Signal(reset_less=True)
 795         self.signed = Signal(reset_less=True)
 796         self.op = Signal(bit_width, reset_less=True)
 797         self.msb = Signal(reset_less=True)
 798         self.nt = Signal(bit_width*2, reset_less=True)
 799         self.nl = Signal(bit_width*2, reset_less=True)
 800
 801     def elaborate(self, platform):
 802         m = Module()
 803         comb = m.d.comb
 804         bit_wid = self.bit_width
 805         ext = Repl(0, bit_wid) # extend output to HI part
 806
 807         # determine sign of each incoming number *in this partition*
 808         enabled = Signal(reset_less=True)
 809         m.d.comb += enabled.eq(self.part & self.msb & self.signed)
 810
 811         # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 812         # negation operation is split into a bitwise not and a +1.
 813         # likewise for 16, 32, and 64-bit values.
 814
 815         # width-extended 1s complement if a is signed, otherwise zero
 816         comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
 817
 818         # add 1 if signed, otherwise add zero
 819         comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
 820
 821         return m
 822
 823
 824 class Parts(Elaboratable):
 825
 826     def __init__(self, pbwid, part_pts, n_parts):
 827         self.pbwid = pbwid
 828         # inputs
 829         self.part_pts = PartitionPoints.like(part_pts)
 830         # outputs
 831         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 832                       for i in range(n_parts)]
 833
 834     def elaborate(self, platform):
 835         m = Module()
 836
 837         part_pts, parts = self.part_pts, self.parts
 838         # collect part-bytes (double factor because the input is extended)
 839         pbs = Signal(self.pbwid, reset_less=True)
 840         tl = []
 841         for i in range(self.pbwid):
 842             pb = Signal(name="pb%d" % i, reset_less=True)
 843             m.d.comb += pb.eq(part_pts.part_byte(i))
 844             tl.append(pb)
 845         m.d.comb += pbs.eq(Cat(*tl))
 846
 847         # negated-temporary copy of partition bits
 848         npbs = Signal.like(pbs, reset_less=True)
 849         m.d.comb += npbs.eq(~pbs)
 850         byte_count = 8 // len(parts)
 851         for i in range(len(parts)):
 852             pbl = []
 853             pbl.append(npbs[i * byte_count - 1])
 854             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 855                 pbl.append(pbs[j])
 856             pbl.append(npbs[(i + 1) * byte_count - 1])
 857             value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
 858             m.d.comb += value.eq(Cat(*pbl))
 859             m.d.comb += parts[i].eq(~(value).bool())
 860
 861         return m
 862
 863
 864 class Part(Elaboratable):
 865     """ a key class which, depending on the partitioning, will determine
 866         what action to take when parts of the output are signed or unsigned.
 867
 868         this requires 2 pieces of data *per operand, per partition*:
 869         whether the MSB is HI/LO (per partition!), and whether a signed
 870         or unsigned operation has been *requested*.
 871
 872         once that is determined, signed is basically carried out
 873         by splitting 2's complement into 1's complement plus one.
 874         1's complement is just a bit-inversion.
 875
 876         the extra terms - as separate terms - are then thrown at the
 877         AddReduce alongside the multiplication part-results.
 878     """
 879     def __init__(self, part_pts, width, n_parts, pbwid):
 880
 881         self.pbwid = pbwid
 882         self.part_pts = part_pts
 883
 884         # inputs
 885         self.a = Signal(64, reset_less=True)
 886         self.b = Signal(64, reset_less=True)
 887         self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
 888                             for i in range(8)]
 889         self.b_signed = [Signal(name=f"_b_signed_{i}", reset_less=True)
 890                             for i in range(8)]
 891         self.pbs = Signal(pbwid, reset_less=True)
 892
 893         # outputs
 894         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 895                             for i in range(n_parts)]
 896
 897         self.not_a_term = Signal(width, reset_less=True)
 898         self.neg_lsb_a_term = Signal(width, reset_less=True)
 899         self.not_b_term = Signal(width, reset_less=True)
 900         self.neg_lsb_b_term = Signal(width, reset_less=True)
 901
 902     def elaborate(self, platform):
 903         m = Module()
 904
 905         pbs, parts = self.pbs, self.parts
 906         part_pts = self.part_pts
 907         m.submodules.p = p = Parts(self.pbwid, part_pts, len(parts))
 908         m.d.comb += p.part_pts.eq(part_pts)
 909         parts = p.parts
 910
 911         byte_count = 8 // len(parts)
 912
 913         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
 914                 self.not_a_term, self.neg_lsb_a_term,
 915                 self.not_b_term, self.neg_lsb_b_term)
 916
 917         byte_width = 8 // len(parts) # byte width
 918         bit_wid = 8 * byte_width     # bit width
 919         nat, nbt, nla, nlb = [], [], [], []
 920         for i in range(len(parts)):
 921             # work out bit-inverted and +1 term for a.
 922             pa = LSBNegTerm(bit_wid)
 923             setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
 924             m.d.comb += pa.part.eq(parts[i])
 925             m.d.comb += pa.op.eq(self.a.bit_select(bit_wid * i, bit_wid))
 926             m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
 927             m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
 928             nat.append(pa.nt)
 929             nla.append(pa.nl)
 930
 931             # work out bit-inverted and +1 term for b
 932             pb = LSBNegTerm(bit_wid)
 933             setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
 934             m.d.comb += pb.part.eq(parts[i])
 935             m.d.comb += pb.op.eq(self.b.bit_select(bit_wid * i, bit_wid))
 936             m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
 937             m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
 938             nbt.append(pb.nt)
 939             nlb.append(pb.nl)
 940
 941         # concatenate together and return all 4 results.
 942         m.d.comb += [not_a_term.eq(Cat(*nat)),
 943                      not_b_term.eq(Cat(*nbt)),
 944                      neg_lsb_a_term.eq(Cat(*nla)),
 945                      neg_lsb_b_term.eq(Cat(*nlb)),
 946                     ]
 947
 948         return m
 949
 950
 951 class IntermediateOut(Elaboratable):
 952     """ selects the HI/LO part of the multiplication, for a given bit-width
 953         the output is also reconstructed in its SIMD (partition) lanes.
 954     """
 955     def __init__(self, width, out_wid, n_parts):
 956         self.width = width
 957         self.n_parts = n_parts
 958         self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 959                                      for i in range(8)]
 960         self.intermed = Signal(out_wid, reset_less=True)
 961         self.output = Signal(out_wid//2, reset_less=True)
 962
 963     def elaborate(self, platform):
 964         m = Module()
 965
 966         ol = []
 967         w = self.width
 968         sel = w // 8
 969         for i in range(self.n_parts):
 970             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 971             m.d.comb += op.eq(
 972                 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
 973                     self.intermed.bit_select(i * w*2, w),
 974                     self.intermed.bit_select(i * w*2 + w, w)))
 975             ol.append(op)
 976         m.d.comb += self.output.eq(Cat(*ol))
 977
 978         return m
 979
 980
 981 class FinalOut(PipeModBase):
 982     """ selects the final output based on the partitioning.
 983
 984         each byte is selectable independently, i.e. it is possible
 985         that some partitions requested 8-bit computation whilst others
 986         requested 16 or 32 bit.
 987     """
 988     def __init__(self, pspec, part_pts):
 989
 990         self.part_pts = part_pts
 991         self.output_width = pspec.width * 2
 992         self.n_parts = pspec.n_parts
 993         self.out_wid = pspec.width
 994
 995         super().__init__(pspec, "finalout")
 996
 997     def ispec(self):
 998         return IntermediateData(self.part_pts, self.output_width, self.n_parts)
 999
1000     def ospec(self):
1001         return OutputData()
1002
1003     def elaborate(self, platform):
1004         m = Module()
1005
1006         part_pts = self.part_pts
1007         m.submodules.p_8 = p_8 = Parts(8, part_pts, 8)
1008         m.submodules.p_16 = p_16 = Parts(8, part_pts, 4)
1009         m.submodules.p_32 = p_32 = Parts(8, part_pts, 2)
1010         m.submodules.p_64 = p_64 = Parts(8, part_pts, 1)
1011
1012         out_part_pts = self.i.part_pts
1013
1014         # temporaries
1015         d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
1016         d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
1017         d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
1018
1019         i8 = Signal(self.out_wid, reset_less=True)
1020         i16 = Signal(self.out_wid, reset_less=True)
1021         i32 = Signal(self.out_wid, reset_less=True)
1022         i64 = Signal(self.out_wid, reset_less=True)
1023
1024         m.d.comb += p_8.part_pts.eq(out_part_pts)
1025         m.d.comb += p_16.part_pts.eq(out_part_pts)
1026         m.d.comb += p_32.part_pts.eq(out_part_pts)
1027         m.d.comb += p_64.part_pts.eq(out_part_pts)
1028
1029         for i in range(len(p_8.parts)):
1030             m.d.comb += d8[i].eq(p_8.parts[i])
1031         for i in range(len(p_16.parts)):
1032             m.d.comb += d16[i].eq(p_16.parts[i])
1033         for i in range(len(p_32.parts)):
1034             m.d.comb += d32[i].eq(p_32.parts[i])
1035         m.d.comb += i8.eq(self.i.outputs[0])
1036         m.d.comb += i16.eq(self.i.outputs[1])
1037         m.d.comb += i32.eq(self.i.outputs[2])
1038         m.d.comb += i64.eq(self.i.outputs[3])
1039
1040         ol = []
1041         for i in range(8):
1042             # select one of the outputs: d8 selects i8, d16 selects i16
1043             # d32 selects i32, and the default is i64.
1044             # d8 and d16 are ORed together in the first Mux
1045             # then the 2nd selects either i8 or i16.
1046             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
1047             op = Signal(8, reset_less=True, name="op_%d" % i)
1048             m.d.comb += op.eq(
1049                 Mux(d8[i] | d16[i // 2],
1050                     Mux(d8[i], i8.bit_select(i * 8, 8),
1051                                i16.bit_select(i * 8, 8)),
1052                     Mux(d32[i // 4], i32.bit_select(i * 8, 8),
1053                                       i64.bit_select(i * 8, 8))))
1054             ol.append(op)
1055
1056         # create outputs
1057         m.d.comb += self.o.output.eq(Cat(*ol))
1058         m.d.comb += self.o.intermediate_output.eq(self.i.intermediate_output)
1059
1060         return m
1061
1062
class OrMod(Elaboratable):
    """ bitwise-ORs four ``wid``-bit inputs as a two-level tree:
        (orin[0] | orin[1]) | (orin[2] | orin[3])
    """
    def __init__(self, wid):
        self.wid = wid
        # inputs
        self.orin = []
        for i in range(4):
            self.orin.append(Signal(wid, name="orin%d" % i, reset_less=True))
        # output
        self.orout = Signal(wid, reset_less=True)

    def elaborate(self, platform):
        m = Module()
        # first level: two pairwise ORs
        or1 = Signal(self.wid, reset_less=True)
        or2 = Signal(self.wid, reset_less=True)
        m.d.comb += [
            or1.eq(self.orin[0] | self.orin[1]),
            or2.eq(self.orin[2] | self.orin[3]),
            # second level: combine the pairs
            self.orout.eq(or1 | or2),
        ]

        return m
1081
1082
class Signs(Elaboratable):
    """ decodes an operation code (OP_MUL_*) into operand signedness.

        ``a_signed`` is set for every operation except
        ``OP_MUL_UNSIGNED_HIGH``.  ``b_signed`` is set for ``OP_MUL_LOW``
        and ``OP_MUL_SIGNED_HIGH`` only.
    """

    def __init__(self):
        # input
        self.part_ops = Signal(2, reset_less=True)
        # outputs
        self.a_signed = Signal(reset_less=True)
        self.b_signed = Signal(reset_less=True)

    def elaborate(self, platform):

        m = Module()

        op = self.part_ops
        a_is_signed = op != OP_MUL_UNSIGNED_HIGH
        b_is_signed = (op == OP_MUL_LOW) | (op == OP_MUL_SIGNED_HIGH)
        m.d.comb += [
            self.a_signed.eq(a_is_signed),
            self.b_signed.eq(b_is_signed),
        ]

        return m
1104
1105
class IntermediateData:
    """ data record carrying the four candidate outputs, the partition
        points, the per-partition operations and the raw intermediate
        output.
    """

    def __init__(self, part_pts, output_width, n_parts):
        self.part_ops = [Signal(2, name="part_ops_%d" % i, reset_less=True)
                          for i in range(n_parts)]
        self.part_pts = part_pts.like()
        self.outputs = [Signal(output_width, name=f"io{i}", reset_less=True)
                          for i in range(4)]
        # intermediates (needed for unit tests)
        self.intermediate_output = Signal(output_width)

    def eq_from(self, part_pts, outputs, intermediate_output,
                      part_ops):
        """ returns the list of assignments that copy the given values
            into this record: partition points first, then the
            intermediate output, the four outputs and finally one entry
            per element of ``self.part_ops``.
        """
        assigns = [self.part_pts.eq(part_pts),
                   self.intermediate_output.eq(intermediate_output)]
        for i in range(4):
            assigns.append(self.outputs[i].eq(outputs[i]))
        for i, dst in enumerate(self.part_ops):
            assigns.append(dst.eq(part_ops[i]))
        return assigns

    def eq(self, rhs):
        """ same as ``eq_from``, taking the values from another record """
        return self.eq_from(rhs.part_pts, rhs.outputs,
                            rhs.intermediate_output, rhs.part_ops)
1129
1130
class InputData:
    """ data record for the multiplier inputs: the two 64-bit operands,
        a partition point at every multiple of 8 in 0 < i < 64, and one
        2-bit operation per byte.
    """

    def __init__(self):
        self.a = Signal(64)
        self.b = Signal(64)
        self.part_pts = PartitionPoints()
        for point in range(8, 64, 8):
            self.part_pts[point] = Signal(name="part_pts_%d" % point)
        self.part_ops = [Signal(2, name="part_ops_%d" % i) for i in range(8)]

    def eq_from(self, part_pts, a, b, part_ops):
        """ returns the list of assignments that copy the given values
            into this record: partition points first, then ``a``, ``b``
            and one entry per element of ``self.part_ops``.
        """
        assigns = [self.part_pts.eq(part_pts), self.a.eq(a), self.b.eq(b)]
        for i, dst in enumerate(self.part_ops):
            assigns.append(dst.eq(part_ops[i]))
        return assigns

    def eq(self, rhs):
        """ same as ``eq_from``, taking the values from another record """
        return self.eq_from(rhs.part_pts, rhs.a, rhs.b, rhs.part_ops)
1149
1150
class OutputData:
    """ data record for the multiplier result: the 64-bit output plus the
        128-bit intermediate output.
    """

    def __init__(self):
        self.intermediate_output = Signal(128) # needed for unit tests
        self.output = Signal(64)

    def eq(self, rhs):
        """ returns the assignments copying ``rhs`` into this record,
            intermediate output first.
        """
        pairs = ((self.intermediate_output, rhs.intermediate_output),
                 (self.output, rhs.output))
        return [dst.eq(src) for dst, src in pairs]
1160
1161
class AllTerms(PipeModBase):
    """Set of terms to be added together.

    Produces the 8x8 byte-by-byte product terms followed by four extra
    terms (the ORed ``not_a``, ``not_b``, ``neg_lsb_a`` and ``neg_lsb_b``
    outputs of the 8/16/32/64-bit ``Part`` modules), and passes the
    partition points and operations through to the output.
    """

    def __init__(self, pspec, n_inputs):
        """Create an ``AllTerms``.

        :param pspec: pipeline specification; ``n_parts`` and ``width``
            are read from it.
        :param n_inputs: handed to ``AddReduceData`` in ``ospec``.
        """
        self.n_inputs = n_inputs
        self.n_parts = pspec.n_parts
        self.output_width = pspec.width * 2
        super().__init__(pspec, "allterms")

    def ispec(self):
        return InputData()

    def ospec(self):
        return AddReduceData(self.i.part_pts, self.n_inputs,
                             self.output_width, self.n_parts)

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        eps = self.i.part_pts

        # collect part-bytes
        pbs = Signal(8, reset_less=True)
        tl = []
        for i in range(8):
            pb = Signal(name="pb%d" % i, reset_less=True)
            comb += pb.eq(eps.part_byte(i))
            tl.append(pb)
        comb += pbs.eq(Cat(*tl))

        # one signedness decoder per byte
        signs = []
        for i in range(8):
            s = Signs()
            signs.append(s)
            setattr(m.submodules, "signs%d" % i, s)
            comb += s.part_ops.eq(self.i.part_ops[i])

        # one Part per lane size: part_8, part_16, part_32, part_64
        part_mods = []
        for n_lanes in (8, 4, 2, 1):
            part_mod = Part(eps, 128, n_lanes, 8)
            setattr(m.submodules, "part_%d" % (64 // n_lanes), part_mod)
            part_mods.append(part_mod)

        nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
        for mod in part_mods:
            comb += mod.a.eq(self.i.a)
            comb += mod.b.eq(self.i.b)
            for i, s in enumerate(signs):
                comb += mod.a_signed[i].eq(s.a_signed)
                comb += mod.b_signed[i].eq(s.b_signed)
            comb += mod.pbs.eq(pbs)
            nat_l.append(mod.not_a_term)
            nbt_l.append(mod.not_b_term)
            nla_l.append(mod.neg_lsb_a_term)
            nlb_l.append(mod.neg_lsb_b_term)

        # the product terms themselves, one bank per "a" byte
        terms = []
        for a_index in range(8):
            t = ProductTerms(8, 128, 8, a_index, 8)
            setattr(m.submodules, "terms_%d" % a_index, t)

            comb += [t.a.eq(self.i.a),
                     t.b.eq(self.i.b),
                     t.pb_en.eq(pbs)]

            terms.extend(t.terms)

        # it's fine to bitwise-or data together since they are never enabled
        # at the same time
        m.submodules.nat_or = nat_or = OrMod(128)
        m.submodules.nbt_or = nbt_or = OrMod(128)
        m.submodules.nla_or = nla_or = OrMod(128)
        m.submodules.nlb_or = nlb_or = OrMod(128)
        or_wiring = ((nat_l, nat_or), (nbt_l, nbt_or),
                     (nla_l, nla_or), (nlb_l, nlb_or))
        for sources, mod in or_wiring:
            for dst, src in zip(mod.orin, sources):
                comb += dst.eq(src)
            terms.append(mod.orout)

        # copy the intermediate terms to the output
        for i, value in enumerate(terms):
            comb += self.o.terms[i].eq(value)

        # copy reg part points and part ops to output
        comb += self.o.part_pts.eq(eps)
        comb += [self.o.part_ops[i].eq(op)
                 for i, op in enumerate(self.i.part_ops)]

        return m
1257
1258
class Intermediates(PipeModBase):
    """ Intermediate output modules.

        runs the reduced sum through one ``IntermediateOut`` per lane size
        (64, 32, 16 and 8 bit) and stores the results in ``outputs[3]``
        down to ``outputs[0]``.  partition points, operations and the raw
        sum are passed through unchanged.
    """

    def __init__(self, pspec, part_pts):
        # these must be in place before the base class constructor runs
        self.part_pts = part_pts
        self.n_parts = pspec.n_parts
        self.output_width = pspec.width * 2

        super().__init__(pspec, "intermediates")

    def ispec(self):
        return FinalReduceData(self.part_pts, self.output_width, self.n_parts)

    def ospec(self):
        return IntermediateData(self.part_pts, self.output_width, self.n_parts)

    def elaborate(self, platform):
        m = Module()

        out_part_ops = self.i.part_ops
        out_part_pts = self.i.part_pts

        # (lane width, number of lanes, slot in self.o.outputs),
        # widest first: creates submodules io64, io32, io16 and io8
        lane_table = ((64, 1, 3), (32, 2, 2), (16, 4, 1), (8, 8, 0))
        for width, n_lanes, slot in lane_table:
            io = IntermediateOut(width, 128, n_lanes)
            setattr(m.submodules, "io%d" % width, io)
            m.d.comb += io.intermed.eq(self.i.output)
            for i in range(8):
                m.d.comb += io.part_ops[i].eq(out_part_ops[i])
            m.d.comb += self.o.outputs[slot].eq(io.output)

        # pass-through data
        for i in range(8):
            m.d.comb += self.o.part_ops[i].eq(out_part_ops[i])
        m.d.comb += self.o.part_pts.eq(out_part_pts)
        m.d.comb += self.o.intermediate_output.eq(self.i.output)

        return m
1316
1317
class Mul8_16_32_64(Elaboratable):
    """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.

    Any combination of 8, 16, 32 and 64-bit partitions on
    naturally-aligned boundaries is supported, and the operation can be
    chosen independently for each partition.

    :attribute part_pts: the input partition points. There is a partition
        point at every multiple of 8 in 0 < i < 64, and the ``Value``
        associated with each one is a ``Signal``. Modification is not
        supported, except by ``Signal.eq``.
    :attribute part_ops: the operation for each byte. A partition's
        operation is selected by assigning the same operation code to
        every byte of that partition. The allowed operation codes are:

        :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
            RISC-V's `mul` instruction.
        :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where
            both ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
            instruction.
        :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
            where ``a`` is signed and ``b`` is unsigned. Equivalent to
            RISC-V's `mulhsu` instruction.
        :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where
            both ``a`` and ``b`` are unsigned. Equivalent to RISC-V's
            `mulhu` instruction.
    """

    def __init__(self, register_levels=()):
        """ register_levels: the indices of the add-reduce levels whose
            output is to be registered (flip-flops inserted) rather than
            passed on combinatorially.
        """

        self.id_wid = 0 # num_bits(num_rows)
        self.op_wid = 0
        self.pspec = PipelineSpec(64, self.id_wid, self.op_wid, n_ops=3)
        self.pspec.n_parts = 8

        # parameter(s)
        self.register_levels = list(register_levels)

        self.i = self.ispec()
        self.o = self.ospec()

        # inputs (aliases into the input record)
        self.part_pts = self.i.part_pts
        self.part_ops = self.i.part_ops
        self.a = self.i.a
        self.b = self.i.b

        # outputs (aliases into the output record)
        self.intermediate_output = self.o.intermediate_output
        self.output = self.o.output

    def ispec(self):
        return InputData()

    def ospec(self):
        return OutputData()

    def elaborate(self, platform):
        m = Module()

        part_pts = self.part_pts

        # 64 product terms plus the 4 extra sign-handling terms
        n_inputs = 64 + 4
        all_terms = AllTerms(self.pspec, n_inputs)
        all_terms.setup(m, self.i)

        add_reduce = AddReduceInternal(all_terms.process(self.i), self.pspec,
                                       partition_step=2)

        # chain the add-reduce levels, registering the chosen ones
        stage_in = add_reduce.i
        for idx, level in enumerate(add_reduce.levels):
            level.setup(m, stage_in)
            stage_out = level.ospec()
            if idx in self.register_levels:
                m.d.sync += stage_out.eq(level.process(stage_in))
            else:
                m.d.comb += stage_out.eq(level.process(stage_in))
            stage_in = stage_out # for next loop

        interm = Intermediates(self.pspec, part_pts)
        interm.setup(m, stage_in)
        interm_out = interm.process(interm.i)

        # final output
        finalout = FinalOut(self.pspec, part_pts)
        finalout.setup(m, interm_out)
        m.d.comb += self.o.eq(finalout.process(interm_out))

        return m
1412
1413
if __name__ == "__main__":
    # command-line entry point: hand the multiplier and its ports (the
    # operands, both outputs, the per-byte operations and the
    # partition-point signals) to the nmigen CLI
    m = Mul8_16_32_64()
    main(m, ports=[m.a, m.b, m.intermediate_output, m.output]
                  + list(m.part_ops)
                  + list(m.part_pts.values()))
1420                    *m.part_ops,
1421                    *m.part_pts.values()])