src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11
  12
  13 class PartitionPoints(dict):
  14     """Partition points and corresponding ``Value``s.
  15
  16     The points at where an ALU is partitioned along with ``Value``s that
  17     specify if the corresponding partition points are enabled.
  18
  19     For example: ``{1: True, 5: True, 10: True}`` with
  20     ``width == 16`` specifies that the ALU is split into 4 sections:
  21     * bits 0 <= ``i`` < 1
  22     * bits 1 <= ``i`` < 5
  23     * bits 5 <= ``i`` < 10
  24     * bits 10 <= ``i`` < 16
  25
  26     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  27     where ``a`` is a 1-bit ``Signal``:
  28     * If ``a`` is asserted:
  29         * bits 0 <= ``i`` < 1
  30         * bits 1 <= ``i`` < 5
  31         * bits 5 <= ``i`` < 10
  32         * bits 10 <= ``i`` < 16
  33     * Otherwise
  34         * bits 0 <= ``i`` < 1
  35         * bits 1 <= ``i`` < 10
  36         * bits 10 <= ``i`` < 16
  37     """
  38
  39     def __init__(self, partition_points=None):
  40         """Create a new ``PartitionPoints``.
  41
  42         :param partition_points: the input partition points to values mapping.
  43         """
  44         super().__init__()
  45         if partition_points is not None:
  46             for point, enabled in partition_points.items():
  47                 if not isinstance(point, int):
  48                     raise TypeError("point must be a non-negative integer")
  49                 if point < 0:
  50                     raise ValueError("point must be a non-negative integer")
  51                 self[point] = Value.wrap(enabled)
  52
  53     def like(self, name=None, src_loc_at=0, mul=1):
  54         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  55
  56         :param name: the base name for the new ``Signal``s.
  57         :param mul: a multiplication factor on the indices
  58         """
  59         if name is None:
  60             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  61         retval = PartitionPoints()
  62         for point, enabled in self.items():
  63             point *= mul
  64             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  65         return retval
  66
  67     def eq(self, rhs):
  68         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  69         if set(self.keys()) != set(rhs.keys()):
  70             raise ValueError("incompatible point set")
  71         for point, enabled in self.items():
  72             yield enabled.eq(rhs[point])
  73
  74     def as_mask(self, width, mul=1):
  75         """Create a bit-mask from `self`.
  76
  77         Each bit in the returned mask is clear only if the partition point at
  78         the same bit-index is enabled.
  79
  80         :param width: the bit width of the resulting mask
  81         :param mul: a "multiplier" which in-place expands the partition points
  82                     typically set to "2" when used for multipliers
  83         """
  84         bits = []
  85         for i in range(width):
  86             i /= mul
  87             if i.is_integer() and int(i) in self:
  88                 bits.append(~self[i])
  89             else:
  90                 bits.append(True)
  91         return Cat(*bits)
  92
  93     def get_max_partition_count(self, width):
  94         """Get the maximum number of partitions.
  95
  96         Gets the number of partitions when all partition points are enabled.
  97         """
  98         retval = 1
  99         for point in self.keys():
 100             if point < width:
 101                 retval += 1
 102         return retval
 103
 104     def fits_in_width(self, width):
 105         """Check if all partition points are smaller than `width`."""
 106         for point in self.keys():
 107             if point >= width:
 108                 return False
 109         return True
 110
 111     def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
 112         if index == -1 or index == 7:
 113             return C(True, 1)
 114         assert index >= 0 and index < 8
 115         return self[(index * 8 + 8)*mfactor]
 116
 117
 118 class FullAdder(Elaboratable):
 119     """Full Adder.
 120
 121     :attribute in0: the first input
 122     :attribute in1: the second input
 123     :attribute in2: the third input
 124     :attribute sum: the sum output
 125     :attribute carry: the carry output
 126
 127     Rather than do individual full adders (and have an array of them,
 128     which would be very slow to simulate), this module can specify the
 129     bit width of the inputs and outputs: in effect it performs multiple
 130     Full 3-2 Add operations "in parallel".
 131     """
 132
 133     def __init__(self, width):
 134         """Create a ``FullAdder``.
 135
 136         :param width: the bit width of the input and output
 137         """
 138         self.in0 = Signal(width, reset_less=True)
 139         self.in1 = Signal(width, reset_less=True)
 140         self.in2 = Signal(width, reset_less=True)
 141         self.sum = Signal(width, reset_less=True)
 142         self.carry = Signal(width, reset_less=True)
 143
 144     def elaborate(self, platform):
 145         """Elaborate this module."""
 146         m = Module()
 147         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 148         m.d.comb += self.carry.eq((self.in0 & self.in1)
 149                                   | (self.in1 & self.in2)
 150                                   | (self.in2 & self.in0))
 151         return m
 152
 153
 154 class MaskedFullAdder(Elaboratable):
 155     """Masked Full Adder.
 156
 157     :attribute mask: the carry partition mask
 158     :attribute in0: the first input
 159     :attribute in1: the second input
 160     :attribute in2: the third input
 161     :attribute sum: the sum output
 162     :attribute mcarry: the masked carry output
 163
 164     FullAdders are always used with a "mask" on the output.  To keep
 165     the graphviz "clean", this class performs the masking here rather
 166     than inside a large for-loop.
 167
 168     See the following discussion as to why this is no longer derived
 169     from FullAdder.  Each carry is shifted here *before* being ANDed
 170     with the mask, so that an AOI cell may be used (which is more
 171     gate-efficient)
 172     https://en.wikipedia.org/wiki/AND-OR-Invert
 173     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
 174     """
 175
 176     def __init__(self, width):
 177         """Create a ``MaskedFullAdder``.
 178
 179         :param width: the bit width of the input and output
 180         """
 181         self.width = width
 182         self.mask = Signal(width, reset_less=True)
 183         self.mcarry = Signal(width, reset_less=True)
 184         self.in0 = Signal(width, reset_less=True)
 185         self.in1 = Signal(width, reset_less=True)
 186         self.in2 = Signal(width, reset_less=True)
 187         self.sum = Signal(width, reset_less=True)
 188
 189     def elaborate(self, platform):
 190         """Elaborate this module."""
 191         m = Module()
 192         s1 = Signal(self.width, reset_less=True)
 193         s2 = Signal(self.width, reset_less=True)
 194         s3 = Signal(self.width, reset_less=True)
 195         c1 = Signal(self.width, reset_less=True)
 196         c2 = Signal(self.width, reset_less=True)
 197         c3 = Signal(self.width, reset_less=True)
 198         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 199         m.d.comb += s1.eq(Cat(0, self.in0))
 200         m.d.comb += s2.eq(Cat(0, self.in1))
 201         m.d.comb += s3.eq(Cat(0, self.in2))
 202         m.d.comb += c1.eq(s1 & s2 & self.mask)
 203         m.d.comb += c2.eq(s2 & s3 & self.mask)
 204         m.d.comb += c3.eq(s3 & s1 & self.mask)
 205         m.d.comb += self.mcarry.eq(c1 | c2 | c3)
 206         return m
 207
 208
 209 class PartitionedAdder(Elaboratable):
 210     """Partitioned Adder.
 211
 212     Performs the final add.  The partition points are included in the
 213     actual add (in one of the operands only), which causes a carry over
 214     to the next bit.  Then the final output *removes* the extra bits from
 215     the result.
 216
 217     partition: .... P... P... P... P... (32 bits)
 218     a        : .... .... .... .... .... (32 bits)
 219     b        : .... .... .... .... .... (32 bits)
 220     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 221     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 222     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 223     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 224
 225     :attribute width: the bit width of the input and output. Read-only.
 226     :attribute a: the first input to the adder
 227     :attribute b: the second input to the adder
 228     :attribute output: the sum output
 229     :attribute partition_points: the input partition points. Modification not
 230         supported, except for by ``Signal.eq``.
 231     """
 232
 233     def __init__(self, width, partition_points, partition_step=1):
 234         """Create a ``PartitionedAdder``.
 235
 236         :param width: the bit width of the input and output
 237         :param partition_points: the input partition points
 238         :param partition_step: a multiplier (typically double) step
 239                                which in-place "expands" the partition points
 240         """
 241         self.width = width
 242         self.pmul = partition_step
 243         self.a = Signal(width, reset_less=True)
 244         self.b = Signal(width, reset_less=True)
 245         self.output = Signal(width, reset_less=True)
 246         self.partition_points = PartitionPoints(partition_points)
 247         if not self.partition_points.fits_in_width(width):
 248             raise ValueError("partition_points doesn't fit in width")
 249         expanded_width = 0
 250         for i in range(self.width):
 251             if i in self.partition_points:
 252                 expanded_width += 1
 253             expanded_width += 1
 254         self._expanded_width = expanded_width
 255
 256     def elaborate(self, platform):
 257         """Elaborate this module."""
 258         m = Module()
 259         expanded_a = Signal(self._expanded_width, reset_less=True)
 260         expanded_b = Signal(self._expanded_width, reset_less=True)
 261         expanded_o = Signal(self._expanded_width, reset_less=True)
 262
 263         expanded_index = 0
 264         # store bits in a list, use Cat later.  graphviz is much cleaner
 265         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 266
 267         # partition points are "breaks" (extra zeros or 1s) in what would
 268         # otherwise be a massive long add.  when the "break" points are 0,
 269         # whatever is in it (in the output) is discarded.  however when
 270         # there is a "1", it causes a roll-over carry to the *next* bit.
 271         # we still ignore the "break" bit in the [intermediate] output,
 272         # however by that time we've got the effect that we wanted: the
 273         # carry has been carried *over* the break point.
 274
 275         for i in range(self.width):
 276             pi = i/self.pmul # double the range of the partition point test
 277             if pi.is_integer() and pi in self.partition_points:
 278                 # add extra bit set to 0 + 0 for enabled partition points
 279                 # and 1 + 0 for disabled partition points
 280                 ea.append(expanded_a[expanded_index])
 281                 al.append(~self.partition_points[pi]) # add extra bit in a
 282                 eb.append(expanded_b[expanded_index])
 283                 bl.append(C(0)) # yes, add a zero
 284                 expanded_index += 1 # skip the extra point.  NOT in the output
 285             ea.append(expanded_a[expanded_index])
 286             eb.append(expanded_b[expanded_index])
 287             eo.append(expanded_o[expanded_index])
 288             al.append(self.a[i])
 289             bl.append(self.b[i])
 290             ol.append(self.output[i])
 291             expanded_index += 1
 292
 293         # combine above using Cat
 294         m.d.comb += Cat(*ea).eq(Cat(*al))
 295         m.d.comb += Cat(*eb).eq(Cat(*bl))
 296         m.d.comb += Cat(*ol).eq(Cat(*eo))
 297
 298         # use only one addition to take advantage of look-ahead carry and
 299         # special hardware on FPGAs
 300         m.d.comb += expanded_o.eq(expanded_a + expanded_b)
 301         return m
 302
 303
# Number of terms consumed by one full adder.  Each reduction level takes
# the terms in groups of this size and replaces every group with two terms
# (the adder's sum and its masked carry).
FULL_ADDER_INPUT_COUNT = 3
 305
 306 class AddReduceData:
 307
 308     def __init__(self, part_pts, n_inputs, output_width, n_parts):
 309         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 310                           for i in range(n_parts)]
 311         self.terms = [Signal(output_width, name=f"inputs_{i}",
 312                               reset_less=True)
 313                         for i in range(n_inputs)]
 314         self.part_pts = part_pts.like()
 315
 316     def eq_from(self, part_pts, inputs, part_ops):
 317         return [self.part_pts.eq(part_pts)] + \
 318                [self.terms[i].eq(inputs[i])
 319                                      for i in range(len(self.terms))] + \
 320                [self.part_ops[i].eq(part_ops[i])
 321                                      for i in range(len(self.part_ops))]
 322
 323     def eq(self, rhs):
 324         return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
 325
 326
 327 class FinalReduceData:
 328
 329     def __init__(self, part_pts, output_width, n_parts):
 330         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 331                           for i in range(n_parts)]
 332         self.output = Signal(output_width, reset_less=True)
 333         self.part_pts = part_pts.like()
 334
 335     def eq_from(self, part_pts, output, part_ops):
 336         return [self.part_pts.eq(part_pts)] + \
 337                [self.output.eq(output)] + \
 338                [self.part_ops[i].eq(part_ops[i])
 339                                      for i in range(len(self.part_ops))]
 340
 341     def eq(self, rhs):
 342         return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
 343
 344
 345 class FinalAdd(Elaboratable):
 346     """ Final stage of add reduce
 347     """
 348
 349     def __init__(self, n_inputs, output_width, n_parts, partition_points):
 350         self.i = AddReduceData(partition_points, n_inputs,
 351                                output_width, n_parts)
 352         self.o = FinalReduceData(partition_points, output_width, n_parts)
 353         self.output_width = output_width
 354         self.n_inputs = n_inputs
 355         self.n_parts = n_parts
 356         self.partition_points = PartitionPoints(partition_points)
 357         if not self.partition_points.fits_in_width(output_width):
 358             raise ValueError("partition_points doesn't fit in output_width")
 359
 360     def elaborate(self, platform):
 361         """Elaborate this module."""
 362         m = Module()
 363
 364         output_width = self.output_width
 365         output = Signal(output_width, reset_less=True)
 366         if self.n_inputs == 0:
 367             # use 0 as the default output value
 368             m.d.comb += output.eq(0)
 369         elif self.n_inputs == 1:
 370             # handle single input
 371             m.d.comb += output.eq(self.i.terms[0])
 372         else:
 373             # base case for adding 2 inputs
 374             assert self.n_inputs == 2
 375             adder = PartitionedAdder(output_width,
 376                                      self.i.part_pts, 2)
 377             m.submodules.final_adder = adder
 378             m.d.comb += adder.a.eq(self.i.terms[0])
 379             m.d.comb += adder.b.eq(self.i.terms[1])
 380             m.d.comb += output.eq(adder.output)
 381
 382         # create output
 383         m.d.comb += self.o.eq_from(self.i.part_pts, output,
 384                                    self.i.part_ops)
 385
 386         return m
 387
 388
 389 class AddReduceSingle(Elaboratable):
 390     """Add list of numbers together.
 391
 392     :attribute inputs: input ``Signal``s to be summed. Modification not
 393         supported, except for by ``Signal.eq``.
 394     :attribute register_levels: List of nesting levels that should have
 395         pipeline registers.
 396     :attribute output: output sum.
 397     :attribute partition_points: the input partition points. Modification not
 398         supported, except for by ``Signal.eq``.
 399     """
 400
 401     def __init__(self, n_inputs, output_width, n_parts, partition_points):
 402         """Create an ``AddReduce``.
 403
 404         :param inputs: input ``Signal``s to be summed.
 405         :param output_width: bit-width of ``output``.
 406         :param partition_points: the input partition points.
 407         """
 408         self.n_inputs = n_inputs
 409         self.n_parts = n_parts
 410         self.output_width = output_width
 411         self.i = AddReduceData(partition_points, n_inputs,
 412                                output_width, n_parts)
 413         self.partition_points = PartitionPoints(partition_points)
 414         if not self.partition_points.fits_in_width(output_width):
 415             raise ValueError("partition_points doesn't fit in output_width")
 416
 417         self.groups = AddReduceSingle.full_adder_groups(n_inputs)
 418         n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
 419         self.o = AddReduceData(partition_points, n_terms, output_width, n_parts)
 420
 421     @staticmethod
 422     def calc_n_inputs(n_inputs, groups):
 423         retval = len(groups)*2
 424         if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 425             retval += 1
 426         elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 427             retval += 2
 428         else:
 429             assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
 430         return retval
 431
 432     @staticmethod
 433     def get_max_level(input_count):
 434         """Get the maximum level.
 435
 436         All ``register_levels`` must be less than or equal to the maximum
 437         level.
 438         """
 439         retval = 0
 440         while True:
 441             groups = AddReduceSingle.full_adder_groups(input_count)
 442             if len(groups) == 0:
 443                 return retval
 444             input_count %= FULL_ADDER_INPUT_COUNT
 445             input_count += 2 * len(groups)
 446             retval += 1
 447
 448     @staticmethod
 449     def full_adder_groups(input_count):
 450         """Get ``inputs`` indices for which a full adder should be built."""
 451         return range(0,
 452                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 453                      FULL_ADDER_INPUT_COUNT)
 454
 455     def create_next_terms(self):
 456         """ create next intermediate terms, for linking up in elaborate, below
 457         """
 458         terms = []
 459         adders = []
 460
 461         # create full adders for this recursive level.
 462         # this shrinks N terms to 2 * (N // 3) plus the remainder
 463         for i in self.groups:
 464             adder_i = MaskedFullAdder(self.output_width)
 465             adders.append((i, adder_i))
 466             # add both the sum and the masked-carry to the next level.
 467             # 3 inputs have now been reduced to 2...
 468             terms.append(adder_i.sum)
 469             terms.append(adder_i.mcarry)
 470         # handle the remaining inputs.
 471         if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 472             terms.append(self.i.terms[-1])
 473         elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 474             # Just pass the terms to the next layer, since we wouldn't gain
 475             # anything by using a half adder since there would still be 2 terms
 476             # and just passing the terms to the next layer saves gates.
 477             terms.append(self.i.terms[-2])
 478             terms.append(self.i.terms[-1])
 479         else:
 480             assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
 481
 482         return terms, adders
 483
 484     def elaborate(self, platform):
 485         """Elaborate this module."""
 486         m = Module()
 487
 488         terms, adders = self.create_next_terms()
 489
 490         # copy the intermediate terms to the output
 491         for i, value in enumerate(terms):
 492             m.d.comb += self.o.terms[i].eq(value)
 493
 494         # copy reg part points and part ops to output
 495         m.d.comb += self.o.part_pts.eq(self.i.part_pts)
 496         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
 497                                      for i in range(len(self.i.part_ops))]
 498
 499         # set up the partition mask (for the adders)
 500         part_mask = Signal(self.output_width, reset_less=True)
 501
 502         # get partition points as a mask
 503         mask = self.i.part_pts.as_mask(self.output_width, mul=2)
 504         m.d.comb += part_mask.eq(mask)
 505
 506         # add and link the intermediate term modules
 507         for i, (iidx, adder_i) in enumerate(adders):
 508             setattr(m.submodules, f"adder_{i}", adder_i)
 509
 510             m.d.comb += adder_i.in0.eq(self.i.terms[iidx])
 511             m.d.comb += adder_i.in1.eq(self.i.terms[iidx + 1])
 512             m.d.comb += adder_i.in2.eq(self.i.terms[iidx + 2])
 513             m.d.comb += adder_i.mask.eq(part_mask)
 514
 515         return m
 516
 517
 518 class AddReduceInternal:
 519     """Recursively Add list of numbers together.
 520
 521     :attribute inputs: input ``Signal``s to be summed. Modification not
 522         supported, except for by ``Signal.eq``.
 523     :attribute register_levels: List of nesting levels that should have
 524         pipeline registers.
 525     :attribute output: output sum.
 526     :attribute partition_points: the input partition points. Modification not
 527         supported, except for by ``Signal.eq``.
 528     """
 529
 530     def __init__(self, inputs, output_width, partition_points,
 531                        part_ops):
 532         """Create an ``AddReduce``.
 533
 534         :param inputs: input ``Signal``s to be summed.
 535         :param output_width: bit-width of ``output``.
 536         :param partition_points: the input partition points.
 537         """
 538         self.inputs = inputs
 539         self.part_ops = part_ops
 540         self.output_width = output_width
 541         self.partition_points = partition_points
 542
 543         self.create_levels()
 544
 545     def create_levels(self):
 546         """creates reduction levels"""
 547
 548         mods = []
 549         partition_points = self.partition_points
 550         part_ops = self.part_ops
 551         n_parts = len(part_ops)
 552         inputs = self.inputs
 553         ilen = len(inputs)
 554         while True:
 555             groups = AddReduceSingle.full_adder_groups(len(inputs))
 556             if len(groups) == 0:
 557                 break
 558             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 559                                          partition_points)
 560             mods.append(next_level)
 561             partition_points = next_level.i.part_pts
 562             inputs = next_level.o.terms
 563             ilen = len(inputs)
 564             part_ops = next_level.i.part_ops
 565
 566         next_level = FinalAdd(ilen, self.output_width, n_parts,
 567                               partition_points)
 568         mods.append(next_level)
 569
 570         self.levels = mods
 571
 572
 573 class AddReduce(AddReduceInternal, Elaboratable):
 574     """Recursively Add list of numbers together.
 575
 576     :attribute inputs: input ``Signal``s to be summed. Modification not
 577         supported, except for by ``Signal.eq``.
 578     :attribute register_levels: List of nesting levels that should have
 579         pipeline registers.
 580     :attribute output: output sum.
 581     :attribute partition_points: the input partition points. Modification not
 582         supported, except for by ``Signal.eq``.
 583     """
 584
 585     def __init__(self, inputs, output_width, register_levels, partition_points,
 586                        part_ops):
 587         """Create an ``AddReduce``.
 588
 589         :param inputs: input ``Signal``s to be summed.
 590         :param output_width: bit-width of ``output``.
 591         :param register_levels: List of nesting levels that should have
 592             pipeline registers.
 593         :param partition_points: the input partition points.
 594         """
 595         AddReduceInternal.__init__(self, inputs, output_width,
 596                                    partition_points, part_ops)
 597         n_parts = len(part_ops)
 598         self.o = FinalReduceData(partition_points, output_width, n_parts)
 599         self.register_levels = register_levels
 600
 601     @staticmethod
 602     def get_max_level(input_count):
 603         return AddReduceSingle.get_max_level(input_count)
 604
 605     @staticmethod
 606     def next_register_levels(register_levels):
 607         """``Iterable`` of ``register_levels`` for next recursive level."""
 608         for level in register_levels:
 609             if level > 0:
 610                 yield level - 1
 611
 612     def create_levels(self):
 613         """creates reduction levels"""
 614
 615         mods = []
 616         partition_points = self.partition_points
 617         part_ops = self.part_ops
 618         n_parts = len(part_ops)
 619         inputs = self.inputs
 620         ilen = len(inputs)
 621         while True:
 622             groups = AddReduceSingle.full_adder_groups(len(inputs))
 623             if len(groups) == 0:
 624                 break
 625             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 626                                          partition_points)
 627             mods.append(next_level)
 628             partition_points = next_level.i.part_pts
 629             inputs = next_level.o.terms
 630             ilen = len(inputs)
 631             part_ops = next_level.i.part_ops
 632
 633         next_level = FinalAdd(ilen, self.output_width, n_parts,
 634                               partition_points)
 635         mods.append(next_level)
 636
 637         self.levels = mods
 638
 639     def elaborate(self, platform):
 640         """Elaborate this module."""
 641         m = Module()
 642
 643         for i, next_level in enumerate(self.levels):
 644             setattr(m.submodules, "next_level%d" % i, next_level)
 645
 646         partition_points = self.partition_points
 647         inputs = self.inputs
 648         part_ops = self.part_ops
 649         n_parts = len(part_ops)
 650         n_inputs = len(inputs)
 651         output_width = self.output_width
 652         i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
 653         m.d.comb += i.eq_from(partition_points, inputs, part_ops)
 654         for idx in range(len(self.levels)):
 655             mcur = self.levels[idx]
 656             if idx in self.register_levels:
 657                 m.d.sync += mcur.i.eq(i)
 658             else:
 659                 m.d.comb += mcur.i.eq(i)
 660             i = mcur.o # for next loop
 661
 662         # output comes from last module
 663         m.d.comb += self.o.eq(i)
 664
 665         return m
 666
 667
# Multiply operation codes.
# NOTE(review): presumably these are the values carried by the 2-bit
# ``part_ops`` signals and choose between the low and high half of the
# product -- confirm against the code that decodes them.
OP_MUL_LOW = 0
OP_MUL_SIGNED_HIGH = 1
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3
 672
 673
 674 def get_term(value, shift=0, enabled=None):
 675     if enabled is not None:
 676         value = Mux(enabled, value, 0)
 677     if shift > 0:
 678         value = Cat(Repl(C(0, 1), shift), value)
 679     else:
 680         assert shift == 0
 681     return value
 682
 683
 684 class ProductTerm(Elaboratable):
 685     """ this class creates a single product term (a[..]*b[..]).
 686         it has a design flaw in that is the *output* that is selected,
 687         where the multiplication(s) are combinatorially generated
 688         all the time.
 689     """
 690
 691     def __init__(self, width, twidth, pbwid, a_index, b_index):
 692         self.a_index = a_index
 693         self.b_index = b_index
 694         shift = 8 * (self.a_index + self.b_index)
 695         self.pwidth = width
 696         self.twidth = twidth
 697         self.width = width*2
 698         self.shift = shift
 699
 700         self.ti = Signal(self.width, reset_less=True)
 701         self.term = Signal(twidth, reset_less=True)
 702         self.a = Signal(twidth//2, reset_less=True)
 703         self.b = Signal(twidth//2, reset_less=True)
 704         self.pb_en = Signal(pbwid, reset_less=True)
 705
 706         self.tl = tl = []
 707         min_index = min(self.a_index, self.b_index)
 708         max_index = max(self.a_index, self.b_index)
 709         for i in range(min_index, max_index):
 710             tl.append(self.pb_en[i])
 711         name = "te_%d_%d" % (self.a_index, self.b_index)
 712         if len(tl) > 0:
 713             term_enabled = Signal(name=name, reset_less=True)
 714         else:
 715             term_enabled = None
 716         self.enabled = term_enabled
 717         self.term.name = "term_%d_%d" % (a_index, b_index) # rename
 718
 719     def elaborate(self, platform):
 720
 721         m = Module()
 722         if self.enabled is not None:
 723             m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
 724
 725         bsa = Signal(self.width, reset_less=True)
 726         bsb = Signal(self.width, reset_less=True)
 727         a_index, b_index = self.a_index, self.b_index
 728         pwidth = self.pwidth
 729         m.d.comb += bsa.eq(self.a.bit_select(a_index * pwidth, pwidth))
 730         m.d.comb += bsb.eq(self.b.bit_select(b_index * pwidth, pwidth))
 731         m.d.comb += self.ti.eq(bsa * bsb)
 732         m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
 733         """
 734         #TODO: sort out width issues, get inputs a/b switched on/off.
 735         #data going into Muxes is 1/2 the required width
 736
 737         pwidth = self.pwidth
 738         width = self.width
 739         bsa = Signal(self.twidth//2, reset_less=True)
 740         bsb = Signal(self.twidth//2, reset_less=True)
 741         asel = Signal(width, reset_less=True)
 742         bsel = Signal(width, reset_less=True)
 743         a_index, b_index = self.a_index, self.b_index
 744         m.d.comb += asel.eq(self.a.bit_select(a_index * pwidth, pwidth))
 745         m.d.comb += bsel.eq(self.b.bit_select(b_index * pwidth, pwidth))
 746         m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
 747         m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
 748         m.d.comb += self.ti.eq(bsa * bsb)
 749         m.d.comb += self.term.eq(self.ti)
 750         """
 751
 752         return m
 753
 754
 755 class ProductTerms(Elaboratable):
 756     """ creates a bank of product terms.  also performs the actual bit-selection
 757         this class is to be wrapped with a for-loop on the "a" operand.
 758         it creates a second-level for-loop on the "b" operand.
 759     """
 760     def __init__(self, width, twidth, pbwid, a_index, blen):
 761         self.a_index = a_index
 762         self.blen = blen
 763         self.pwidth = width
 764         self.twidth = twidth
 765         self.pbwid = pbwid
 766         self.a = Signal(twidth//2, reset_less=True)
 767         self.b = Signal(twidth//2, reset_less=True)
 768         self.pb_en = Signal(pbwid, reset_less=True)
 769         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 770                             for i in range(blen)]
 771
 772     def elaborate(self, platform):
 773
 774         m = Module()
 775
 776         for b_index in range(self.blen):
 777             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 778                             self.a_index, b_index)
 779             setattr(m.submodules, "term_%d" % b_index, t)
 780
 781             m.d.comb += t.a.eq(self.a)
 782             m.d.comb += t.b.eq(self.b)
 783             m.d.comb += t.pb_en.eq(self.pb_en)
 784
 785             m.d.comb += self.terms[b_index].eq(t.term)
 786
 787         return m
 788
 789
 790 class LSBNegTerm(Elaboratable):
 791
 792     def __init__(self, bit_width):
 793         self.bit_width = bit_width
 794         self.part = Signal(reset_less=True)
 795         self.signed = Signal(reset_less=True)
 796         self.op = Signal(bit_width, reset_less=True)
 797         self.msb = Signal(reset_less=True)
 798         self.nt = Signal(bit_width*2, reset_less=True)
 799         self.nl = Signal(bit_width*2, reset_less=True)
 800
 801     def elaborate(self, platform):
 802         m = Module()
 803         comb = m.d.comb
 804         bit_wid = self.bit_width
 805         ext = Repl(0, bit_wid) # extend output to HI part
 806
 807         # determine sign of each incoming number *in this partition*
 808         enabled = Signal(reset_less=True)
 809         m.d.comb += enabled.eq(self.part & self.msb & self.signed)
 810
 811         # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 812         # negation operation is split into a bitwise not and a +1.
 813         # likewise for 16, 32, and 64-bit values.
 814
 815         # width-extended 1s complement if a is signed, otherwise zero
 816         comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
 817
 818         # add 1 if signed, otherwise add zero
 819         comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
 820
 821         return m
 822
 823
 824 class Parts(Elaboratable):
 825
 826     def __init__(self, pbwid, part_pts, n_parts):
 827         self.pbwid = pbwid
 828         # inputs
 829         self.part_pts = PartitionPoints.like(part_pts)
 830         # outputs
 831         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 832                       for i in range(n_parts)]
 833
 834     def elaborate(self, platform):
 835         m = Module()
 836
 837         part_pts, parts = self.part_pts, self.parts
 838         # collect part-bytes (double factor because the input is extended)
 839         pbs = Signal(self.pbwid, reset_less=True)
 840         tl = []
 841         for i in range(self.pbwid):
 842             pb = Signal(name="pb%d" % i, reset_less=True)
 843             m.d.comb += pb.eq(part_pts.part_byte(i))
 844             tl.append(pb)
 845         m.d.comb += pbs.eq(Cat(*tl))
 846
 847         # negated-temporary copy of partition bits
 848         npbs = Signal.like(pbs, reset_less=True)
 849         m.d.comb += npbs.eq(~pbs)
 850         byte_count = 8 // len(parts)
 851         for i in range(len(parts)):
 852             pbl = []
 853             pbl.append(npbs[i * byte_count - 1])
 854             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 855                 pbl.append(pbs[j])
 856             pbl.append(npbs[(i + 1) * byte_count - 1])
 857             value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
 858             m.d.comb += value.eq(Cat(*pbl))
 859             m.d.comb += parts[i].eq(~(value).bool())
 860
 861         return m
 862
 863
 864 class Part(Elaboratable):
 865     """ a key class which, depending on the partitioning, will determine
 866         what action to take when parts of the output are signed or unsigned.
 867
 868         this requires 2 pieces of data *per operand, per partition*:
 869         whether the MSB is HI/LO (per partition!), and whether a signed
 870         or unsigned operation has been *requested*.
 871
 872         once that is determined, signed is basically carried out
 873         by splitting 2's complement into 1's complement plus one.
 874         1's complement is just a bit-inversion.
 875
 876         the extra terms - as separate terms - are then thrown at the
 877         AddReduce alongside the multiplication part-results.
 878     """
 879     def __init__(self, part_pts, width, n_parts, n_levels, pbwid):
 880
 881         self.pbwid = pbwid
 882         self.part_pts = part_pts
 883
 884         # inputs
 885         self.a = Signal(64, reset_less=True)
 886         self.b = Signal(64, reset_less=True)
 887         self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
 888                             for i in range(8)]
 889         self.b_signed = [Signal(name=f"_b_signed_{i}", reset_less=True)
 890                             for i in range(8)]
 891         self.pbs = Signal(pbwid, reset_less=True)
 892
 893         # outputs
 894         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 895                             for i in range(n_parts)]
 896
 897         self.not_a_term = Signal(width, reset_less=True)
 898         self.neg_lsb_a_term = Signal(width, reset_less=True)
 899         self.not_b_term = Signal(width, reset_less=True)
 900         self.neg_lsb_b_term = Signal(width, reset_less=True)
 901
 902     def elaborate(self, platform):
 903         m = Module()
 904
 905         pbs, parts = self.pbs, self.parts
 906         part_pts = self.part_pts
 907         m.submodules.p = p = Parts(self.pbwid, part_pts, len(parts))
 908         m.d.comb += p.part_pts.eq(part_pts)
 909         parts = p.parts
 910
 911         byte_count = 8 // len(parts)
 912
 913         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
 914                 self.not_a_term, self.neg_lsb_a_term,
 915                 self.not_b_term, self.neg_lsb_b_term)
 916
 917         byte_width = 8 // len(parts) # byte width
 918         bit_wid = 8 * byte_width     # bit width
 919         nat, nbt, nla, nlb = [], [], [], []
 920         for i in range(len(parts)):
 921             # work out bit-inverted and +1 term for a.
 922             pa = LSBNegTerm(bit_wid)
 923             setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
 924             m.d.comb += pa.part.eq(parts[i])
 925             m.d.comb += pa.op.eq(self.a.bit_select(bit_wid * i, bit_wid))
 926             m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
 927             m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
 928             nat.append(pa.nt)
 929             nla.append(pa.nl)
 930
 931             # work out bit-inverted and +1 term for b
 932             pb = LSBNegTerm(bit_wid)
 933             setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
 934             m.d.comb += pb.part.eq(parts[i])
 935             m.d.comb += pb.op.eq(self.b.bit_select(bit_wid * i, bit_wid))
 936             m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
 937             m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
 938             nbt.append(pb.nt)
 939             nlb.append(pb.nl)
 940
 941         # concatenate together and return all 4 results.
 942         m.d.comb += [not_a_term.eq(Cat(*nat)),
 943                      not_b_term.eq(Cat(*nbt)),
 944                      neg_lsb_a_term.eq(Cat(*nla)),
 945                      neg_lsb_b_term.eq(Cat(*nlb)),
 946                     ]
 947
 948         return m
 949
 950
 951 class IntermediateOut(Elaboratable):
 952     """ selects the HI/LO part of the multiplication, for a given bit-width
 953         the output is also reconstructed in its SIMD (partition) lanes.
 954     """
 955     def __init__(self, width, out_wid, n_parts):
 956         self.width = width
 957         self.n_parts = n_parts
 958         self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 959                                      for i in range(8)]
 960         self.intermed = Signal(out_wid, reset_less=True)
 961         self.output = Signal(out_wid//2, reset_less=True)
 962
 963     def elaborate(self, platform):
 964         m = Module()
 965
 966         ol = []
 967         w = self.width
 968         sel = w // 8
 969         for i in range(self.n_parts):
 970             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 971             m.d.comb += op.eq(
 972                 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
 973                     self.intermed.bit_select(i * w*2, w),
 974                     self.intermed.bit_select(i * w*2 + w, w)))
 975             ol.append(op)
 976         m.d.comb += self.output.eq(Cat(*ol))
 977
 978         return m
 979
 980
 981 class FinalOut(Elaboratable):
 982     """ selects the final output based on the partitioning.
 983
 984         each byte is selectable independently, i.e. it is possible
 985         that some partitions requested 8-bit computation whilst others
 986         requested 16 or 32 bit.
 987     """
 988     def __init__(self, output_width, n_parts, part_pts):
 989         self.part_pts = part_pts
 990         self.output_width = output_width
 991         self.n_parts = n_parts
 992         self.out_wid = output_width//2
 993
 994         self.i = self.ispec()
 995         self.o = self.ospec()
 996
 997     def ispec(self):
 998         return IntermediateData(self.part_pts, self.output_width, self.n_parts)
 999
1000     def ospec(self):
1001         return OutputData()
1002
1003     def elaborate(self, platform):
1004         m = Module()
1005
1006         part_pts = self.part_pts
1007         m.submodules.p_8 = p_8 = Parts(8, part_pts, 8)
1008         m.submodules.p_16 = p_16 = Parts(8, part_pts, 4)
1009         m.submodules.p_32 = p_32 = Parts(8, part_pts, 2)
1010         m.submodules.p_64 = p_64 = Parts(8, part_pts, 1)
1011
1012         out_part_pts = self.i.part_pts
1013
1014         # temporaries
1015         d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
1016         d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
1017         d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
1018
1019         i8 = Signal(self.out_wid, reset_less=True)
1020         i16 = Signal(self.out_wid, reset_less=True)
1021         i32 = Signal(self.out_wid, reset_less=True)
1022         i64 = Signal(self.out_wid, reset_less=True)
1023
1024         m.d.comb += p_8.part_pts.eq(out_part_pts)
1025         m.d.comb += p_16.part_pts.eq(out_part_pts)
1026         m.d.comb += p_32.part_pts.eq(out_part_pts)
1027         m.d.comb += p_64.part_pts.eq(out_part_pts)
1028
1029         for i in range(len(p_8.parts)):
1030             m.d.comb += d8[i].eq(p_8.parts[i])
1031         for i in range(len(p_16.parts)):
1032             m.d.comb += d16[i].eq(p_16.parts[i])
1033         for i in range(len(p_32.parts)):
1034             m.d.comb += d32[i].eq(p_32.parts[i])
1035         m.d.comb += i8.eq(self.i.outputs[0])
1036         m.d.comb += i16.eq(self.i.outputs[1])
1037         m.d.comb += i32.eq(self.i.outputs[2])
1038         m.d.comb += i64.eq(self.i.outputs[3])
1039
1040         ol = []
1041         for i in range(8):
1042             # select one of the outputs: d8 selects i8, d16 selects i16
1043             # d32 selects i32, and the default is i64.
1044             # d8 and d16 are ORed together in the first Mux
1045             # then the 2nd selects either i8 or i16.
1046             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
1047             op = Signal(8, reset_less=True, name="op_%d" % i)
1048             m.d.comb += op.eq(
1049                 Mux(d8[i] | d16[i // 2],
1050                     Mux(d8[i], i8.bit_select(i * 8, 8),
1051                                i16.bit_select(i * 8, 8)),
1052                     Mux(d32[i // 4], i32.bit_select(i * 8, 8),
1053                                       i64.bit_select(i * 8, 8))))
1054             ol.append(op)
1055
1056         # create outputs
1057         m.d.comb += self.o.output.eq(Cat(*ol))
1058         m.d.comb += self.o.intermediate_output.eq(self.i.intermediate_output)
1059
1060         return m
1061
1062
class OrMod(Elaboratable):
    """Bitwise-OR four equally wide inputs as a two-level tree.

    :param wid: width of the four ``orin`` inputs and of ``orout``.
    """

    def __init__(self, wid):
        self.wid = wid
        # inputs
        self.orin = [Signal(wid, name=f"orin{i}", reset_less=True)
                     for i in range(4)]
        # output
        self.orout = Signal(wid, reset_less=True)

    def elaborate(self, platform):
        """Generate the OR tree: two pair-wise ORs, then one final OR."""
        m = Module()
        in0, in1, in2, in3 = self.orin
        or1 = Signal(self.wid, reset_less=True)
        or2 = Signal(self.wid, reset_less=True)
        m.d.comb += [
            or1.eq(in0 | in1),
            or2.eq(in2 | in3),
            self.orout.eq(or1 | or2),
        ]

        return m
1081
1082
class Signs(Elaboratable):
    """Decode an operation code (OP_MUL_*) into operand signedness.

    * ``a_signed`` is set for every op except ``OP_MUL_UNSIGNED_HIGH``.
    * ``b_signed`` is set for ``OP_MUL_LOW`` and ``OP_MUL_SIGNED_HIGH``.
    """

    def __init__(self):
        # input
        self.part_ops = Signal(2, reset_less=True)
        # outputs
        self.a_signed = Signal(reset_less=True)
        self.b_signed = Signal(reset_less=True)

    def elaborate(self, platform):
        """Generate the two decode expressions."""
        m = Module()

        op = self.part_ops
        a_is_signed = op != OP_MUL_UNSIGNED_HIGH
        b_is_signed = (op == OP_MUL_LOW) | (op == OP_MUL_SIGNED_HIGH)
        m.d.comb += [
            self.a_signed.eq(a_is_signed),
            self.b_signed.eq(b_is_signed),
        ]

        return m
1104
1105
class IntermediateData:
    """Bundle of signals passed from ``Intermediates`` to ``FinalOut``.

    :param part_pts: partition points; ``part_pts.like()`` is stored.
    :param output_width: width of each entry of ``outputs`` and of
        ``intermediate_output``.
    :param n_parts: number of 2-bit ``part_ops`` signals.
    """

    def __init__(self, part_pts, output_width, n_parts):
        self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
                         for i in range(n_parts)]
        self.part_pts = part_pts.like()
        self.outputs = [Signal(output_width, name=f"io{i}", reset_less=True)
                        for i in range(4)]
        # intermediates (needed for unit tests)
        self.intermediate_output = Signal(output_width)

    def eq_from(self, part_pts, outputs, intermediate_output,
                      part_ops):
        """Return the assignments that copy the given values into ``self``.

        The list is ordered: partition points, intermediate output, the
        four outputs, then one entry per element of ``self.part_ops``.
        """
        assigns = [self.part_pts.eq(part_pts),
                   self.intermediate_output.eq(intermediate_output)]
        assigns.extend(self.outputs[i].eq(outputs[i]) for i in range(4))
        assigns.extend(dst.eq(part_ops[i])
                       for i, dst in enumerate(self.part_ops))
        return assigns

    def eq(self, rhs):
        """Return the assignments that copy every field of ``rhs``."""
        return self.eq_from(rhs.part_pts, rhs.outputs,
                            rhs.intermediate_output, rhs.part_ops)
1129
1130
class InputData:
    """Bundle of multiplier inputs.

    Holds the two 64-bit operands, a ``PartitionPoints`` with one signal at
    every multiple of 8 in 8..56, and eight 2-bit operation codes.
    """

    def __init__(self):
        self.a = Signal(64)
        self.b = Signal(64)
        self.part_pts = PartitionPoints()
        for point in range(8, 64, 8):
            self.part_pts[point] = Signal(name=f"part_pts_{point}")
        self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]

    def eq_from(self, part_pts, a, b, part_ops):
        """Return the assignments that copy the given values into ``self``.

        The list is ordered: partition points, ``a``, ``b``, then one entry
        per element of ``self.part_ops``.
        """
        assigns = [self.part_pts.eq(part_pts), self.a.eq(a), self.b.eq(b)]
        assigns.extend(dst.eq(part_ops[i])
                       for i, dst in enumerate(self.part_ops))
        return assigns

    def eq(self, rhs):
        """Return the assignments that copy every field of ``rhs``."""
        return self.eq_from(rhs.part_pts, rhs.a, rhs.b, rhs.part_ops)
1149
1150
class OutputData:
    """Bundle of multiplier outputs: the 64-bit result plus the 128-bit
    intermediate value that the unit tests need.
    """

    def __init__(self):
        self.intermediate_output = Signal(128) # needed for unit tests
        self.output = Signal(64)

    def eq(self, rhs):
        """Return the assignments that copy both fields of ``rhs``."""
        copy_intermediate = self.intermediate_output.eq(
            rhs.intermediate_output)
        copy_output = self.output.eq(rhs.output)
        return [copy_intermediate, copy_output]
1160
1161
class AllTerms(Elaboratable):
    """Generate the full set of terms to be added together.

    ``o.terms`` receives the 8 x 8 product terms first, followed by the
    four OR-combined sign-correction terms (not-a, not-b, neg-lsb-a,
    neg-lsb-b, in that order).
    """

    def __init__(self, n_inputs, output_width, n_parts, register_levels):
        """Create an ``AllTerms``.

        :param n_inputs: number of terms in the output ``AddReduceData``.
        :param output_width: bit-width passed to the output
            ``AddReduceData``.
        :param n_parts: number of parts passed to the output
            ``AddReduceData``.
        :param register_levels: List of nesting levels that should have
            pipeline registers; here only its length is used.
        """
        self.register_levels = register_levels
        self.n_inputs = n_inputs
        self.n_parts = n_parts
        self.output_width = output_width

        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        """Return a fresh input data object."""
        return InputData()

    def ospec(self):
        """Return a fresh output data object tied to the input part points."""
        return AddReduceData(self.i.part_pts, self.n_inputs,
                             self.output_width, self.n_parts)

    def elaborate(self, platform):
        """Generate product terms, sign-correction terms and the outputs."""
        m = Module()
        comb = m.d.comb

        eps = self.i.part_pts

        # collect part-bytes
        pbs = Signal(8, reset_less=True)
        tl = [Signal(name="pb%d" % i, reset_less=True) for i in range(8)]
        comb += [pb.eq(eps.part_byte(i)) for i, pb in enumerate(tl)]
        comb += pbs.eq(Cat(*tl))

        # one sign decoder per byte's operation code
        signs = []
        for i in range(8):
            s = Signs()
            signs.append(s)
            setattr(m.submodules, "signs%d" % i, s)
            comb += s.part_ops.eq(self.i.part_ops[i])

        # sign-correction generators for each lane width
        n_levels = len(self.register_levels)+1
        m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
        m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
        m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
        m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
        nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
        for mod in (part_8, part_16, part_32, part_64):
            comb += mod.a.eq(self.i.a)
            comb += mod.b.eq(self.i.b)
            for i, s in enumerate(signs):
                comb += mod.a_signed[i].eq(s.a_signed)
                comb += mod.b_signed[i].eq(s.b_signed)
            comb += mod.pbs.eq(pbs)
            nat_l.append(mod.not_a_term)
            nbt_l.append(mod.not_b_term)
            nla_l.append(mod.neg_lsb_a_term)
            nlb_l.append(mod.neg_lsb_b_term)

        # the 8 x 8 byte-wise product terms
        terms = []
        for a_index in range(8):
            t = ProductTerms(8, 128, 8, a_index, 8)
            setattr(m.submodules, "terms_%d" % a_index, t)

            comb += t.a.eq(self.i.a)
            comb += t.b.eq(self.i.b)
            comb += t.pb_en.eq(pbs)

            terms.extend(t.terms)

        # it's fine to bitwise-or data together since they are never enabled
        # at the same time
        m.submodules.nat_or = nat_or = OrMod(128)
        m.submodules.nbt_or = nbt_or = OrMod(128)
        m.submodules.nla_or = nla_or = OrMod(128)
        m.submodules.nlb_or = nlb_or = OrMod(128)
        for sources, or_mod in ((nat_l, nat_or),
                                (nbt_l, nbt_or),
                                (nla_l, nla_or),
                                (nlb_l, nlb_or)):
            for i, source in enumerate(sources):
                comb += or_mod.orin[i].eq(source)
            terms.append(or_mod.orout)

        # copy the intermediate terms to the output
        comb += [self.o.terms[i].eq(value) for i, value in enumerate(terms)]

        # copy reg part points and part ops to output
        comb += self.o.part_pts.eq(eps)
        comb += [self.o.part_ops[i].eq(op)
                 for i, op in enumerate(self.i.part_ops)]

        return m
1267
1268
class Intermediates(Elaboratable):
    """Produce the HI/LO-selected result for every supported lane width.

    One ``IntermediateOut`` is instantiated per lane width (64, 32, 16 and
    8 bits); their results land in ``o.outputs[3]`` down to
    ``o.outputs[0]`` respectively.

    :param output_width: width passed to the input and output data objects.
    :param n_parts: number of parts passed to the data objects.
    :param partition_points: partition points passed to the data objects.
    """

    def __init__(self, output_width, n_parts, partition_points):
        self.i = FinalReduceData(partition_points, output_width, n_parts)
        self.o = IntermediateData(partition_points, output_width, n_parts)

    def elaborate(self, platform):
        """Generate the four selectors and pass the side-band data through."""
        m = Module()
        comb = m.d.comb

        out_part_ops = self.i.part_ops
        out_part_pts = self.i.part_pts

        # (lane width, number of lanes, index into self.o.outputs)
        selectors = ((64, 1, 3), (32, 2, 2), (16, 4, 1), (8, 8, 0))
        for lane_width, n_lanes, out_index in selectors:
            io = IntermediateOut(lane_width, 128, n_lanes)
            setattr(m.submodules, "io%d" % lane_width, io)
            comb += io.intermed.eq(self.i.output)
            for i in range(8):
                comb += io.part_ops[i].eq(out_part_ops[i])
            comb += self.o.outputs[out_index].eq(io.output)

        # pass the operation codes, partition points and the raw
        # (unselected) result straight through
        for i in range(8):
            comb += self.o.part_ops[i].eq(out_part_ops[i])
        comb += self.o.part_pts.eq(out_part_pts)
        comb += self.o.intermediate_output.eq(self.i.output)

        return m
1317
1318
class Mul8_16_32_64(Elaboratable):
    """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.

    Supports partitioning into any combination of 8, 16, 32, and 64-bit
    partitions on naturally-aligned boundaries. Supports the operation being
    set for each partition independently.

    :attribute part_pts: the input partition points. Has a partition point at
        multiples of 8 in 0 < i < 64. Each partition point's associated
        ``Value`` is a ``Signal``. Modification not supported, except for by
        ``Signal.eq``.
    :attribute part_ops: the operation for each byte. The operation for a
        particular partition is selected by assigning the selected operation
        code to each byte in the partition. The allowed operation codes are:

        :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
            RISC-V's `mul` instruction.
        :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
            instruction.
        :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
            where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
            `mulhsu` instruction.
        :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
            instruction.
    :attribute a: first operand (alias of ``i.a``).
    :attribute b: second operand (alias of ``i.b``).
    :attribute output: the result (alias of ``o.output``).
    :attribute intermediate_output: alias of ``o.intermediate_output``.
    """

    def __init__(self, register_levels=()):
        """ register_levels: specifies the points in the cascade at which
            flip-flops are to be inserted.
        """

        # parameter(s): stored as a private list copy
        self.register_levels = list(register_levels)

        self.i = self.ispec()
        self.o = self.ospec()

        # convenience aliases for the inputs
        self.part_pts = self.i.part_pts
        self.part_ops = self.i.part_ops
        self.a = self.i.a
        self.b = self.i.b

        # convenience aliases for the outputs
        self.intermediate_output = self.o.intermediate_output
        self.output = self.o.output

    def ispec(self):
        """Return a fresh input data object."""
        return InputData()

    def ospec(self):
        """Return a fresh output data object."""
        return OutputData()

    def elaborate(self, platform):
        """Chain the four stages: terms, add-reduce, intermediates, final."""
        m = Module()
        comb = m.d.comb

        part_pts = self.part_pts

        # stage 1: generate all terms (64 products + 4 sign corrections)
        n_inputs = 64 + 4
        n_parts = 8
        t = AllTerms(n_inputs, 128, n_parts, self.register_levels)
        m.submodules.allterms = t
        comb += t.i.eq(self.i)

        # stage 2: sum the terms
        add_reduce = AddReduce(t.o.terms,
                               128,
                               self.register_levels,
                               t.o.part_pts,
                               t.o.part_ops)
        m.submodules.add_reduce = add_reduce

        # stage 3: per-lane-width HI/LO selection
        interm = Intermediates(128, 8, part_pts)
        m.submodules.intermediates = interm
        comb += interm.i.eq(add_reduce.o)

        # stage 4: final output
        m.submodules.finalout = finalout = FinalOut(128, 8, part_pts)
        comb += finalout.i.eq(interm.o)
        comb += self.o.eq(finalout.o)

        return m
1408
1409
if __name__ == "__main__":
    # Hand the multiplier to the nmigen command-line driver, exposing the
    # operands, both outputs, every per-byte op code and every
    # partition-point signal as ports.
    m = Mul8_16_32_64()
    ports = [m.a, m.b, m.intermediate_output, m.output]
    ports.extend(m.part_ops)
    ports.extend(m.part_pts.values())
    main(m, ports=ports)
1416                    *m.part_ops,
1417                    *m.part_pts.values()])