1 # SPDX-License-Identifier: LGPL-2.1-or-later
2 # See Notices.txt for copyright information
3 """Integer Multiplication."""
4
5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
6 from nmigen.hdl.ast import Assign
7 from abc import ABCMeta, abstractmethod
8 from nmigen.cli import main
9 from functools import reduce
10 from operator import or_
11
12
13 class PartitionPoints(dict):
14 """Partition points and corresponding ``Value``s.
15
16 The points at which an ALU is partitioned, along with ``Value``s that
17 specify whether the corresponding partition points are enabled.
18
19 For example: ``{1: True, 5: True, 10: True}`` with
20 ``width == 16`` specifies that the ALU is split into 4 sections:
21 * bits 0 <= ``i`` < 1
22 * bits 1 <= ``i`` < 5
23 * bits 5 <= ``i`` < 10
24 * bits 10 <= ``i`` < 16
25
26 If the partition_points were instead ``{1: True, 5: a, 10: True}``
27 where ``a`` is a 1-bit ``Signal``:
28 * If ``a`` is asserted:
29 * bits 0 <= ``i`` < 1
30 * bits 1 <= ``i`` < 5
31 * bits 5 <= ``i`` < 10
32 * bits 10 <= ``i`` < 16
33 * Otherwise
34 * bits 0 <= ``i`` < 1
35 * bits 1 <= ``i`` < 10
36 * bits 10 <= ``i`` < 16
37 """
38
39 def __init__(self, partition_points=None):
40 """Create a new ``PartitionPoints``.
41
42 :param partition_points: the input partition points to values mapping.
43 """
44 super().__init__()
45 if partition_points is not None:
46 for point, enabled in partition_points.items():
47 if not isinstance(point, int):
48 raise TypeError("point must be a non-negative integer")
49 if point < 0:
50 raise ValueError("point must be a non-negative integer")
51 self[point] = Value.wrap(enabled)
52
53 def like(self, name=None, src_loc_at=0, mul=1):
54 """Create a new ``PartitionPoints`` with ``Signal``s for all values.
55
56 :param name: the base name for the new ``Signal``s.
57 :param mul: a multiplication factor on the indices
58 """
59 if name is None:
60 name = Signal(src_loc_at=1+src_loc_at).name # get variable name
61 retval = PartitionPoints()
62 for point, enabled in self.items():
63 point *= mul
64 retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
65 return retval
66
67 def eq(self, rhs):
68 """Assign ``PartitionPoints`` using ``Signal.eq``."""
69 if set(self.keys()) != set(rhs.keys()):
70 raise ValueError("incompatible point set")
71 for point, enabled in self.items():
72 yield enabled.eq(rhs[point])
73
74 def as_mask(self, width, mul=1):
75 """Create a bit-mask from `self`.
76
77 Each bit in the returned mask is clear only if the partition point at
78 the same bit-index is enabled.
79
80 :param width: the bit width of the resulting mask
81 :param mul: a "multiplier" which in-place expands the partition point
82 indices; typically set to 2 when the mask is used for multipliers
83 """
84 bits = []
85 for i in range(width):
86 i /= mul
87 if i.is_integer() and int(i) in self:
88 bits.append(~self[i])
89 else:
90 bits.append(True)
91 return Cat(*bits)
92
93 def get_max_partition_count(self, width):
94 """Get the maximum number of partitions.
95
96 Gets the number of partitions when all partition points are enabled.
97 """
98 retval = 1
99 for point in self.keys():
100 if point < width:
101 retval += 1
102 return retval
103
104 def fits_in_width(self, width):
105 """Check if all partition points are smaller than `width`."""
106 for point in self.keys():
107 if point >= width:
108 return False
109 return True
110
111 def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
112 if index == -1 or index == 7:
113 return C(True, 1)
114 assert index >= 0 and index < 8
115 return self[(index * 8 + 8)*mfactor]
116
117
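# a rough plain-Python sketch (not nmigen, helper name purely illustrative)
# of the mask that ``as_mask`` builds, assuming width=16, mul=1 and
# *enabled* partition points at {4, 8, 12}: every mask bit is set except
# at an enabled partition point, where it is clear.
def _mask_sketch(width, enabled_points):
    return sum(0 if i in enabled_points else (1 << i) for i in range(width))

assert _mask_sketch(16, {4, 8, 12}) == 0b1110111011101111
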
118 class FullAdder(Elaboratable):
119 """Full Adder.
120
121 :attribute in0: the first input
122 :attribute in1: the second input
123 :attribute in2: the third input
124 :attribute sum: the sum output
125 :attribute carry: the carry output
126
127 Rather than instantiate individual single-bit full adders (and have an
128 array of them, which would be very slow to simulate), this module takes
129 the bit width of the inputs and outputs: in effect it performs multiple
130 full 3:2 add operations "in parallel".
131 """
132
133 def __init__(self, width):
134 """Create a ``FullAdder``.
135
136 :param width: the bit width of the input and output
137 """
138 self.in0 = Signal(width, reset_less=True)
139 self.in1 = Signal(width, reset_less=True)
140 self.in2 = Signal(width, reset_less=True)
141 self.sum = Signal(width, reset_less=True)
142 self.carry = Signal(width, reset_less=True)
143
144 def elaborate(self, platform):
145 """Elaborate this module."""
146 m = Module()
147 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
148 m.d.comb += self.carry.eq((self.in0 & self.in1)
149 | (self.in1 & self.in2)
150 | (self.in2 & self.in0))
151 return m
152
153
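# quick plain-Python sanity sketch (illustrative only, not part of the
# design) of the 3:2 compressor identity used above:
# in0 + in1 + in2 == sum + 2*carry, bit-for-bit, for any bit width.
def _add3_sketch(in0, in1, in2):
    s = in0 ^ in1 ^ in2
    c = (in0 & in1) | (in1 & in2) | (in2 & in0)
    return s, c

_s, _c = _add3_sketch(5, 9, 12)
assert _s + 2*_c == 5 + 9 + 12
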
154 class MaskedFullAdder(Elaboratable):
155 """Masked Full Adder.
156
157 :attribute mask: the carry partition mask
158 :attribute in0: the first input
159 :attribute in1: the second input
160 :attribute in2: the third input
161 :attribute sum: the sum output
162 :attribute mcarry: the masked carry output
163
164 FullAdders are always used with a "mask" on the carry output. To keep
165 the graphviz output "clean", the masking is performed in this class
166 rather than inside a large for-loop.
167
168 See the following discussion as to why this is no longer derived
169 from FullAdder. Each carry is shifted here *before* being ANDed
170 with the mask, so that an AOI cell may be used (which is more
171 gate-efficient)
172 https://en.wikipedia.org/wiki/AND-OR-Invert
173 https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
174 """
175
176 def __init__(self, width):
177 """Create a ``MaskedFullAdder``.
178
179 :param width: the bit width of the input and output
180 """
181 self.width = width
182 self.mask = Signal(width, reset_less=True)
183 self.mcarry = Signal(width, reset_less=True)
184 self.in0 = Signal(width, reset_less=True)
185 self.in1 = Signal(width, reset_less=True)
186 self.in2 = Signal(width, reset_less=True)
187 self.sum = Signal(width, reset_less=True)
188
189 def elaborate(self, platform):
190 """Elaborate this module."""
191 m = Module()
192 s1 = Signal(self.width, reset_less=True)
193 s2 = Signal(self.width, reset_less=True)
194 s3 = Signal(self.width, reset_less=True)
195 c1 = Signal(self.width, reset_less=True)
196 c2 = Signal(self.width, reset_less=True)
197 c3 = Signal(self.width, reset_less=True)
198 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
199 m.d.comb += s1.eq(Cat(0, self.in0))
200 m.d.comb += s2.eq(Cat(0, self.in1))
201 m.d.comb += s3.eq(Cat(0, self.in2))
202 m.d.comb += c1.eq(s1 & s2 & self.mask)
203 m.d.comb += c2.eq(s2 & s3 & self.mask)
204 m.d.comb += c3.eq(s3 & s1 & self.mask)
205 m.d.comb += self.mcarry.eq(c1 | c2 | c3)
206 return m
207
208
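# rough scalar sketch (plain Python, illustrative only; helper name is
# hypothetical) of the masked 3:2 compress performed above, assuming an
# 8-bit width and a mask with bit 4 cleared (an enabled partition boundary
# between the two nibbles): the carry is shifted up one column and ANDed
# with the mask, so no carry crosses bit 4.
def _masked_compress_sketch(in0, in1, in2, mask, width=8):
    s = in0 ^ in1 ^ in2
    c = (((in0 & in1) | (in1 & in2) | (in2 & in0)) << 1) & ((1 << width) - 1)
    return s, c & mask

_s, _mc = _masked_compress_sketch(0x0F, 0x0F, 0x00, mask=0b11101111)
assert _mc == 0b00001110   # the carry out of bit 3 into bit 4 is masked off
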
209 class PartitionedAdder(Elaboratable):
210 """Partitioned Adder.
211
212 Performs the final add. The partition points are included in the
213 actual add (in one of the operands only), which causes a carry over
214 to the next bit. Then the final output *removes* the extra bits from
215 the result.
216
217 partition: .... P... P... P... P... (32 bits)
218 a : .... .... .... .... .... (32 bits)
219 b : .... .... .... .... .... (32 bits)
220 exp-a : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
221 exp-b : ....0....0....0....0.... (32 bits plus 4 zeros)
222 exp-o : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
223 o : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
224
225 :attribute width: the bit width of the input and output. Read-only.
226 :attribute a: the first input to the adder
227 :attribute b: the second input to the adder
228 :attribute output: the sum output
229 :attribute partition_points: the input partition points. Modification not
230 supported, except for by ``Signal.eq``.
231 """
232
233 def __init__(self, width, partition_points, partition_step=1):
234 """Create a ``PartitionedAdder``.
235
236 :param width: the bit width of the input and output
237 :param partition_points: the input partition points
238 :param partition_step: a multiplier (typically double) step
239 which in-place "expands" the partition points
240 """
241 self.width = width
242 self.pmul = partition_step
243 self.a = Signal(width, reset_less=True)
244 self.b = Signal(width, reset_less=True)
245 self.output = Signal(width, reset_less=True)
246 self.partition_points = PartitionPoints(partition_points)
247 if not self.partition_points.fits_in_width(width):
248 raise ValueError("partition_points doesn't fit in width")
249 expanded_width = 0
250 for i in range(self.width):
251 if i in self.partition_points:
252 expanded_width += 1
253 expanded_width += 1
254 self._expanded_width = expanded_width
255
256 def elaborate(self, platform):
257 """Elaborate this module."""
258 m = Module()
259 expanded_a = Signal(self._expanded_width, reset_less=True)
260 expanded_b = Signal(self._expanded_width, reset_less=True)
261 expanded_o = Signal(self._expanded_width, reset_less=True)
262
263 expanded_index = 0
264 # store bits in a list, use Cat later. graphviz is much cleaner
265 al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
266
267 # partition points are "breaks" (extra zero or one bits) inserted into
268 # what would otherwise be one massive long add. when a "break" bit is 0
269 # (partition enabled), any carry arriving there is absorbed and then
270 # discarded from the output. when it is 1 (partition disabled), an
271 # incoming carry rolls over it into the *next* real bit. the "break"
272 # bit itself is always dropped from the [intermediate] output, but by
273 # then the carry has (or has not) been carried *over* the break point.
274
275 for i in range(self.width):
276 pi = i/self.pmul # scale (e.g. double) the range of the partition point test
277 if pi.is_integer() and pi in self.partition_points:
278 # add extra bit set to 0 + 0 for enabled partition points
279 # and 1 + 0 for disabled partition points
280 ea.append(expanded_a[expanded_index])
281 al.append(~self.partition_points[pi]) # add extra bit in a
282 eb.append(expanded_b[expanded_index])
283 bl.append(C(0)) # yes, add a zero
284 expanded_index += 1 # skip the extra point. NOT in the output
285 ea.append(expanded_a[expanded_index])
286 eb.append(expanded_b[expanded_index])
287 eo.append(expanded_o[expanded_index])
288 al.append(self.a[i])
289 bl.append(self.b[i])
290 ol.append(self.output[i])
291 expanded_index += 1
292
293 # combine above using Cat
294 m.d.comb += Cat(*ea).eq(Cat(*al))
295 m.d.comb += Cat(*eb).eq(Cat(*bl))
296 m.d.comb += Cat(*ol).eq(Cat(*eo))
297
298 # use only one addition to take advantage of look-ahead carry and
299 # special hardware on FPGAs
300 m.d.comb += expanded_o.eq(expanded_a + expanded_b)
301 return m
302
303
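# toy scalar model (plain Python, illustrative only; helper name is
# hypothetical) of the "expanded add" trick above, for an 8-bit add with a
# single partition point at bit 4.  a "break" bit is spliced into bit 4 of
# the expanded operands: 0 when the partition is enabled (carry absorbed),
# 1 when disabled (carry rolls over into the next lane).
def _expanded_add8_sketch(a, b, partition_enabled):
    brk = 0 if partition_enabled else 1
    exp_a = (a & 0xF) | (brk << 4) | ((a >> 4) << 5)
    exp_b = (b & 0xF) | (0 << 4) | ((b >> 4) << 5)
    exp_o = exp_a + exp_b                   # one single full-width add
    return (exp_o & 0xF) | (((exp_o >> 5) & 0xF) << 4)

assert _expanded_add8_sketch(0x0F, 0x01, True) == 0x00    # carry blocked
assert _expanded_add8_sketch(0x0F, 0x01, False) == 0x10   # carry crosses
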
304 FULL_ADDER_INPUT_COUNT = 3
305
306 class AddReduceData:
307
308 def __init__(self, part_pts, n_inputs, output_width, n_parts):
309 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
310 for i in range(n_parts)]
311 self.terms = [Signal(output_width, name=f"inputs_{i}",
312 reset_less=True)
313 for i in range(n_inputs)]
314 self.part_pts = part_pts.like()
315
316 def eq_from(self, part_pts, inputs, part_ops):
317 return [self.part_pts.eq(part_pts)] + \
318 [self.terms[i].eq(inputs[i])
319 for i in range(len(self.terms))] + \
320 [self.part_ops[i].eq(part_ops[i])
321 for i in range(len(self.part_ops))]
322
323 def eq(self, rhs):
324 return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
325
326
327 class FinalReduceData:
328
329 def __init__(self, part_pts, output_width, n_parts):
330 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
331 for i in range(n_parts)]
332 self.output = Signal(output_width, reset_less=True)
333 self.part_pts = part_pts.like()
334
335 def eq_from(self, part_pts, output, part_ops):
336 return [self.part_pts.eq(part_pts)] + \
337 [self.output.eq(output)] + \
338 [self.part_ops[i].eq(part_ops[i])
339 for i in range(len(self.part_ops))]
340
341 def eq(self, rhs):
342 return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
343
344
345 class FinalAdd(Elaboratable):
346 """ Final stage of add reduce
347 """
348
349 def __init__(self, n_inputs, output_width, n_parts, partition_points):
350 self.i = AddReduceData(partition_points, n_inputs,
351 output_width, n_parts)
352 self.o = FinalReduceData(partition_points, output_width, n_parts)
353 self.output_width = output_width
354 self.n_inputs = n_inputs
355 self.n_parts = n_parts
356 self.partition_points = PartitionPoints(partition_points)
357 if not self.partition_points.fits_in_width(output_width):
358 raise ValueError("partition_points doesn't fit in output_width")
359
360 def elaborate(self, platform):
361 """Elaborate this module."""
362 m = Module()
363
364 output_width = self.output_width
365 output = Signal(output_width, reset_less=True)
366 if self.n_inputs == 0:
367 # use 0 as the default output value
368 m.d.comb += output.eq(0)
369 elif self.n_inputs == 1:
370 # handle single input
371 m.d.comb += output.eq(self.i.terms[0])
372 else:
373 # base case for adding 2 inputs
374 assert self.n_inputs == 2
375 adder = PartitionedAdder(output_width,
376 self.i.part_pts, 2)
377 m.submodules.final_adder = adder
378 m.d.comb += adder.a.eq(self.i.terms[0])
379 m.d.comb += adder.b.eq(self.i.terms[1])
380 m.d.comb += output.eq(adder.output)
381
382 # create output
383 m.d.comb += self.o.eq_from(self.i.part_pts, output,
384 self.i.part_ops)
385
386 return m
387
388
389 class AddReduceSingle(Elaboratable):
390 """Add list of numbers together.
391
392 :attribute i: the input ``AddReduceData``: the terms to be summed, the
393 partition points and the part_ops. Modification not supported,
394 except for by ``Signal.eq``.
395 :attribute o: the output ``AddReduceData``: the reduced (shorter) list
396 of terms, with partition points and part_ops passed through.
397 :attribute partition_points: the input partition points. Modification not
398 supported, except for by ``Signal.eq``.
399 """
400
401 def __init__(self, n_inputs, output_width, n_parts, partition_points):
402 """Create an ``AddReduce``.
403
404 :param inputs: input ``Signal``s to be summed.
405 :param output_width: bit-width of ``output``.
406 :param partition_points: the input partition points.
407 """
408 self.n_inputs = n_inputs
409 self.n_parts = n_parts
410 self.output_width = output_width
411 self.i = AddReduceData(partition_points, n_inputs,
412 output_width, n_parts)
413 self.partition_points = PartitionPoints(partition_points)
414 if not self.partition_points.fits_in_width(output_width):
415 raise ValueError("partition_points doesn't fit in output_width")
416
417 self.groups = AddReduceSingle.full_adder_groups(n_inputs)
418 n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
419 self.o = AddReduceData(partition_points, n_terms, output_width, n_parts)
420
421 @staticmethod
422 def calc_n_inputs(n_inputs, groups):
423 retval = len(groups)*2
424 if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
425 retval += 1
426 elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
427 retval += 2
428 else:
429 assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
430 return retval
431
432 @staticmethod
433 def get_max_level(input_count):
434 """Get the maximum level.
435
436 All ``register_levels`` must be less than or equal to the maximum
437 level.
438 """
439 retval = 0
440 while True:
441 groups = AddReduceSingle.full_adder_groups(input_count)
442 if len(groups) == 0:
443 return retval
444 input_count %= FULL_ADDER_INPUT_COUNT
445 input_count += 2 * len(groups)
446 retval += 1
447
448 @staticmethod
449 def full_adder_groups(input_count):
450 """Get ``inputs`` indices for which a full adder should be built."""
451 return range(0,
452 input_count - FULL_ADDER_INPUT_COUNT + 1,
453 FULL_ADDER_INPUT_COUNT)
454
455 def create_next_terms(self):
456 """ create next intermediate terms, for linking up in elaborate, below
457 """
458 terms = []
459 adders = []
460
461 # create full adders for this recursive level.
462 # this shrinks N terms to 2 * (N // 3) plus the remainder
463 for i in self.groups:
464 adder_i = MaskedFullAdder(self.output_width)
465 adders.append((i, adder_i))
466 # add both the sum and the masked-carry to the next level.
467 # 3 inputs have now been reduced to 2...
468 terms.append(adder_i.sum)
469 terms.append(adder_i.mcarry)
470 # handle the remaining inputs.
471 if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
472 terms.append(self.i.terms[-1])
473 elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
474 # just pass the two remaining terms to the next layer: a half adder
475 # would not reduce the term count (it still produces 2 outputs),
476 # so passing them through unchanged saves gates.
477 terms.append(self.i.terms[-2])
478 terms.append(self.i.terms[-1])
479 else:
480 assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
481
482 return terms, adders
483
484 def elaborate(self, platform):
485 """Elaborate this module."""
486 m = Module()
487
488 terms, adders = self.create_next_terms()
489
490 # copy the intermediate terms to the output
491 for i, value in enumerate(terms):
492 m.d.comb += self.o.terms[i].eq(value)
493
494 # copy reg part points and part ops to output
495 m.d.comb += self.o.part_pts.eq(self.i.part_pts)
496 m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
497 for i in range(len(self.i.part_ops))]
498
499 # set up the partition mask (for the adders)
500 part_mask = Signal(self.output_width, reset_less=True)
501
502 # get partition points as a mask
503 mask = self.i.part_pts.as_mask(self.output_width, mul=2)
504 m.d.comb += part_mask.eq(mask)
505
506 # add and link the intermediate term modules
507 for i, (iidx, adder_i) in enumerate(adders):
508 setattr(m.submodules, f"adder_{i}", adder_i)
509
510 m.d.comb += adder_i.in0.eq(self.i.terms[iidx])
511 m.d.comb += adder_i.in1.eq(self.i.terms[iidx + 1])
512 m.d.comb += adder_i.in2.eq(self.i.terms[iidx + 2])
513 m.d.comb += adder_i.mask.eq(part_mask)
514
515 return m
516
517
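# illustrative use of the staticmethods above (plain asserts, no hardware):
# reducing 9 terms creates 3 full-adder groups, shrinking the term count
# from 9 to 6, and repeated 3:2 reduction of 9 terms down to the final
# 2-input add takes get_max_level(9) == 4 levels.
assert list(AddReduceSingle.full_adder_groups(9)) == [0, 3, 6]
assert AddReduceSingle.calc_n_inputs(9, AddReduceSingle.full_adder_groups(9)) == 6
assert AddReduceSingle.get_max_level(9) == 4
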
518 class AddReduceInternal:
519 """Recursively Add list of numbers together.
520
521 :attribute inputs: input ``Signal``s to be summed. Modification not
522 supported, except for by ``Signal.eq``.
523 :attribute levels: list of reduction modules (``AddReduceSingle``
524 instances followed by a ``FinalAdd``), created by ``create_levels``.
525 :attribute output_width: bit-width of the final sum.
526 :attribute partition_points: the input partition points. Modification not
527 supported, except for by ``Signal.eq``.
528 """
529
530 def __init__(self, inputs, output_width, partition_points,
531 part_ops):
532 """Create an ``AddReduce``.
533
534 :param inputs: input ``Signal``s to be summed.
535 :param output_width: bit-width of ``output``.
536 :param partition_points: the input partition points.
537 """
538 self.inputs = inputs
539 self.part_ops = part_ops
540 self.output_width = output_width
541 self.partition_points = partition_points
542
543 self.create_levels()
544
545 def create_levels(self):
546 """creates reduction levels"""
547
548 mods = []
549 partition_points = self.partition_points
550 part_ops = self.part_ops
551 n_parts = len(part_ops)
552 inputs = self.inputs
553 ilen = len(inputs)
554 while True:
555 groups = AddReduceSingle.full_adder_groups(len(inputs))
556 if len(groups) == 0:
557 break
558 next_level = AddReduceSingle(ilen, self.output_width, n_parts,
559 partition_points)
560 mods.append(next_level)
561 partition_points = next_level.i.part_pts
562 inputs = next_level.o.terms
563 ilen = len(inputs)
564 part_ops = next_level.i.part_ops
565
566 next_level = FinalAdd(ilen, self.output_width, n_parts,
567 partition_points)
568 mods.append(next_level)
569
570 self.levels = mods
571
572
573 class AddReduce(AddReduceInternal, Elaboratable):
574 """Recursively Add list of numbers together.
575
576 :attribute inputs: input ``Signal``s to be summed. Modification not
577 supported, except for by ``Signal.eq``.
578 :attribute register_levels: List of nesting levels that should have
579 pipeline registers.
580 :attribute output: output sum.
581 :attribute partition_points: the input partition points. Modification not
582 supported, except for by ``Signal.eq``.
583 """
584
585 def __init__(self, inputs, output_width, register_levels, partition_points,
586 part_ops):
587 """Create an ``AddReduce``.
588
589 :param inputs: input ``Signal``s to be summed.
590 :param output_width: bit-width of ``output``.
591 :param register_levels: List of nesting levels that should have
592 pipeline registers.
593 :param partition_points: the input partition points.
594 """
595 AddReduceInternal.__init__(self, inputs, output_width,
596 partition_points, part_ops)
597 n_parts = len(part_ops)
598 self.o = FinalReduceData(partition_points, output_width, n_parts)
599 self.register_levels = register_levels
600
601 @staticmethod
602 def get_max_level(input_count):
603 return AddReduceSingle.get_max_level(input_count)
604
605 @staticmethod
606 def next_register_levels(register_levels):
607 """``Iterable`` of ``register_levels`` for next recursive level."""
608 for level in register_levels:
609 if level > 0:
610 yield level - 1
611
612 def create_levels(self):
613 """creates reduction levels"""
614
615 mods = []
616 partition_points = self.partition_points
617 part_ops = self.part_ops
618 n_parts = len(part_ops)
619 inputs = self.inputs
620 ilen = len(inputs)
621 while True:
622 groups = AddReduceSingle.full_adder_groups(len(inputs))
623 if len(groups) == 0:
624 break
625 next_level = AddReduceSingle(ilen, self.output_width, n_parts,
626 partition_points)
627 mods.append(next_level)
628 partition_points = next_level.i.part_pts
629 inputs = next_level.o.terms
630 ilen = len(inputs)
631 part_ops = next_level.i.part_ops
632
633 next_level = FinalAdd(ilen, self.output_width, n_parts,
634 partition_points)
635 mods.append(next_level)
636
637 self.levels = mods
638
639 def elaborate(self, platform):
640 """Elaborate this module."""
641 m = Module()
642
643 for i, next_level in enumerate(self.levels):
644 setattr(m.submodules, "next_level%d" % i, next_level)
645
646 partition_points = self.partition_points
647 inputs = self.inputs
648 part_ops = self.part_ops
649 n_parts = len(part_ops)
650 n_inputs = len(inputs)
651 output_width = self.output_width
652 i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
653 m.d.comb += i.eq_from(partition_points, inputs, part_ops)
654 for idx in range(len(self.levels)):
655 mcur = self.levels[idx]
656 if idx in self.register_levels:
657 m.d.sync += mcur.i.eq(i)
658 else:
659 m.d.comb += mcur.i.eq(i)
660 i = mcur.o # for next loop
661
662 # output comes from last module
663 m.d.comb += self.o.eq(i)
664
665 return m
666
667
668 OP_MUL_LOW = 0
669 OP_MUL_SIGNED_HIGH = 1
670 OP_MUL_SIGNED_UNSIGNED_HIGH = 2 # a is signed, b is unsigned
671 OP_MUL_UNSIGNED_HIGH = 3
672
673
674 def get_term(value, shift=0, enabled=None):
675 if enabled is not None:
676 value = Mux(enabled, value, 0)
677 if shift > 0:
678 value = Cat(Repl(C(0, 1), shift), value)
679 else:
680 assert shift == 0
681 return value
682
683
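# scalar sketch (plain Python, helper name purely illustrative) of what
# ``get_term`` constructs: the value is zeroed when not enabled, then
# shifted left into its byte column.
def _get_term_sketch(value, shift=0, enabled=True):
    return (value if enabled else 0) << shift

assert _get_term_sketch(0x12, shift=8) == 0x1200
assert _get_term_sketch(0x12, shift=8, enabled=False) == 0
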
684 class ProductTerm(Elaboratable):
685 """ this class creates a single product term (a[..]*b[..]).
686 it has a design flaw in that it is the *output* that is selected,
687 while the multiplications themselves are combinatorially generated
688 all of the time (i.e. never switched off).
689 """
690
691 def __init__(self, width, twidth, pbwid, a_index, b_index):
692 self.a_index = a_index
693 self.b_index = b_index
694 shift = 8 * (self.a_index + self.b_index)
695 self.pwidth = width
696 self.twidth = twidth
697 self.width = width*2
698 self.shift = shift
699
700 self.ti = Signal(self.width, reset_less=True)
701 self.term = Signal(twidth, reset_less=True)
702 self.a = Signal(twidth//2, reset_less=True)
703 self.b = Signal(twidth//2, reset_less=True)
704 self.pb_en = Signal(pbwid, reset_less=True)
705
706 self.tl = tl = []
707 min_index = min(self.a_index, self.b_index)
708 max_index = max(self.a_index, self.b_index)
709 for i in range(min_index, max_index):
710 tl.append(self.pb_en[i])
711 name = "te_%d_%d" % (self.a_index, self.b_index)
712 if len(tl) > 0:
713 term_enabled = Signal(name=name, reset_less=True)
714 else:
715 term_enabled = None
716 self.enabled = term_enabled
717 self.term.name = "term_%d_%d" % (a_index, b_index) # rename
718
719 def elaborate(self, platform):
720
721 m = Module()
722 if self.enabled is not None:
723 m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
724
725 bsa = Signal(self.width, reset_less=True)
726 bsb = Signal(self.width, reset_less=True)
727 a_index, b_index = self.a_index, self.b_index
728 pwidth = self.pwidth
729 m.d.comb += bsa.eq(self.a.bit_select(a_index * pwidth, pwidth))
730 m.d.comb += bsb.eq(self.b.bit_select(b_index * pwidth, pwidth))
731 m.d.comb += self.ti.eq(bsa * bsb)
732 m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
733 """
734 #TODO: sort out width issues, get inputs a/b switched on/off.
735 #data going into Muxes is 1/2 the required width
736
737 pwidth = self.pwidth
738 width = self.width
739 bsa = Signal(self.twidth//2, reset_less=True)
740 bsb = Signal(self.twidth//2, reset_less=True)
741 asel = Signal(width, reset_less=True)
742 bsel = Signal(width, reset_less=True)
743 a_index, b_index = self.a_index, self.b_index
744 m.d.comb += asel.eq(self.a.bit_select(a_index * pwidth, pwidth))
745 m.d.comb += bsel.eq(self.b.bit_select(b_index * pwidth, pwidth))
746 m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
747 m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
748 m.d.comb += self.ti.eq(bsa * bsb)
749 m.d.comb += self.term.eq(self.ti)
750 """
751
752 return m
753
754
755 class ProductTerms(Elaboratable):
756 """ creates a bank of product terms. also performs the actual bit-selection
757 this class is to be wrapped with a for-loop on the "a" operand.
758 it creates a second-level for-loop on the "b" operand.
759 """
760 def __init__(self, width, twidth, pbwid, a_index, blen):
761 self.a_index = a_index
762 self.blen = blen
763 self.pwidth = width
764 self.twidth = twidth
765 self.pbwid = pbwid
766 self.a = Signal(twidth//2, reset_less=True)
767 self.b = Signal(twidth//2, reset_less=True)
768 self.pb_en = Signal(pbwid, reset_less=True)
769 self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
770 for i in range(blen)]
771
772 def elaborate(self, platform):
773
774 m = Module()
775
776 for b_index in range(self.blen):
777 t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
778 self.a_index, b_index)
779 setattr(m.submodules, "term_%d" % b_index, t)
780
781 m.d.comb += t.a.eq(self.a)
782 m.d.comb += t.b.eq(self.b)
783 m.d.comb += t.pb_en.eq(self.pb_en)
784
785 m.d.comb += self.terms[b_index].eq(t.term)
786
787 return m
788
789
790 class LSBNegTerm(Elaboratable):
791
792 def __init__(self, bit_width):
793 self.bit_width = bit_width
794 self.part = Signal(reset_less=True)
795 self.signed = Signal(reset_less=True)
796 self.op = Signal(bit_width, reset_less=True)
797 self.msb = Signal(reset_less=True)
798 self.nt = Signal(bit_width*2, reset_less=True)
799 self.nl = Signal(bit_width*2, reset_less=True)
800
801 def elaborate(self, platform):
802 m = Module()
803 comb = m.d.comb
804 bit_wid = self.bit_width
805 ext = Repl(0, bit_wid) # extend output to HI part
806
807 # determine sign of each incoming number *in this partition*
808 enabled = Signal(reset_less=True)
809 m.d.comb += enabled.eq(self.part & self.msb & self.signed)
810
811 # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
812 # negation operation is split into a bitwise not and a +1.
813 # likewise for 16, 32, and 64-bit values.
814
815 # width-extended 1s complement if a is signed, otherwise zero
816 comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
817
818 # add 1 if signed, otherwise add zero
819 comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
820
821 return m
822
823
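# scalar sketch (plain Python, not the gateware; names below are purely
# illustrative) of the correction terms built above, for an unsigned 8-bit
# ``a`` multiplied by a signed 8-bit ``b`` with its MSB set: the missing
# -a * 0x100 is supplied as a bitwise-not term plus a separate "+1" term
# in the same column, working modulo 2**16.
def _mul_u8_by_s8_sketch(a, b):
    product = (a * b) & 0xFFFF                # plain unsigned 8x8 product
    if b & 0x80:                              # b is negative when signed
        not_a_term = (~a & 0xFF) << 8         # ~a, placed in the HI byte
        neg_lsb_a_term = 1 << 8               # the "+1", same column
        product = (product + not_a_term + neg_lsb_a_term) & 0xFFFF
    return product

assert _mul_u8_by_s8_sketch(3, 0xFF) == 0xFFFD   # 3 * (-1) == -3
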
824 class Parts(Elaboratable):
825
826 def __init__(self, pbwid, part_pts, n_parts):
827 self.pbwid = pbwid
828 # inputs
829 self.part_pts = PartitionPoints.like(part_pts)
830 # outputs
831 self.parts = [Signal(name=f"part_{i}", reset_less=True)
832 for i in range(n_parts)]
833
834 def elaborate(self, platform):
835 m = Module()
836
837 part_pts, parts = self.part_pts, self.parts
838 # collect part-bytes (double factor because the input is extended)
839 pbs = Signal(self.pbwid, reset_less=True)
840 tl = []
841 for i in range(self.pbwid):
842 pb = Signal(name="pb%d" % i, reset_less=True)
843 m.d.comb += pb.eq(part_pts.part_byte(i))
844 tl.append(pb)
845 m.d.comb += pbs.eq(Cat(*tl))
846
847 # negated-temporary copy of partition bits
848 npbs = Signal.like(pbs, reset_less=True)
849 m.d.comb += npbs.eq(~pbs)
850 byte_count = 8 // len(parts)
851 for i in range(len(parts)):
852 pbl = []
853 pbl.append(npbs[i * byte_count - 1])
854 for j in range(i * byte_count, (i + 1) * byte_count - 1):
855 pbl.append(pbs[j])
856 pbl.append(npbs[(i + 1) * byte_count - 1])
857 value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
858 m.d.comb += value.eq(Cat(*pbl))
859 m.d.comb += parts[i].eq(~(value).bool())
860
861 return m
862
863
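# scalar sketch (plain Python, helper name purely illustrative) of the
# lane-detection rule above: a lane of ``byte_count`` bytes starting at
# byte i*byte_count is "active" at this width iff the partition bits at
# both of its ends are set and none of the bits strictly inside it are
# set.  indices -1 and 7 count as always-set, mirroring the constant 1
# that ``part_byte`` returns for them.
def _lane_active_sketch(pbs, i, byte_count):
    bit = lambda j: 1 if j in (-1, 7) else (pbs >> j) & 1
    lo, hi = i * byte_count - 1, (i + 1) * byte_count - 1
    return bit(lo) == 1 and bit(hi) == 1 and \
           all(bit(j) == 0 for j in range(lo + 1, hi))

# partition bit 3 set (a split at bit 32) gives a 32-bit lane in the
# bottom half, but no 16-bit lane there:
assert _lane_active_sketch(0b0001000, 0, byte_count=4)
assert not _lane_active_sketch(0b0001000, 0, byte_count=2)
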
864 class Part(Elaboratable):
865 """ a key class which, depending on the partitioning, will determine
866 what action to take when parts of the output are signed or unsigned.
867
868 this requires 2 pieces of data *per operand, per partition*:
869 whether the MSB is HI/LO (per partition!), and whether a signed
870 or unsigned operation has been *requested*.
871
872 once that is determined, signed multiplication is basically carried out
873 by splitting 2's complement into 1's complement plus one.
874 1's complement is just a bit-inversion.
875
876 the extra terms - as separate terms - are then thrown at the
877 AddReduce alongside the multiplication part-results.
878 """
879 def __init__(self, part_pts, width, n_parts, n_levels, pbwid):
880
881 self.pbwid = pbwid
882 self.part_pts = part_pts
883
884 # inputs
885 self.a = Signal(64, reset_less=True)
886 self.b = Signal(64, reset_less=True)
887 self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
888 for i in range(8)]
889 self.b_signed = [Signal(name=f"_b_signed_{i}", reset_less=True)
890 for i in range(8)]
891 self.pbs = Signal(pbwid, reset_less=True)
892
893 # outputs
894 self.parts = [Signal(name=f"part_{i}", reset_less=True)
895 for i in range(n_parts)]
896
897 self.not_a_term = Signal(width, reset_less=True)
898 self.neg_lsb_a_term = Signal(width, reset_less=True)
899 self.not_b_term = Signal(width, reset_less=True)
900 self.neg_lsb_b_term = Signal(width, reset_less=True)
901
902 def elaborate(self, platform):
903 m = Module()
904
905 pbs, parts = self.pbs, self.parts
906 part_pts = self.part_pts
907 m.submodules.p = p = Parts(self.pbwid, part_pts, len(parts))
908 m.d.comb += p.part_pts.eq(part_pts)
909 parts = p.parts
910
911 byte_count = 8 // len(parts)
912
913 not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
914 self.not_a_term, self.neg_lsb_a_term,
915 self.not_b_term, self.neg_lsb_b_term)
916
917 byte_width = 8 // len(parts) # byte width
918 bit_wid = 8 * byte_width # bit width
919 nat, nbt, nla, nlb = [], [], [], []
920 for i in range(len(parts)):
921 # work out bit-inverted and +1 term for a.
922 pa = LSBNegTerm(bit_wid)
923 setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
924 m.d.comb += pa.part.eq(parts[i])
925 m.d.comb += pa.op.eq(self.a.bit_select(bit_wid * i, bit_wid))
926 m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
927 m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
928 nat.append(pa.nt)
929 nla.append(pa.nl)
930
931 # work out bit-inverted and +1 term for b
932 pb = LSBNegTerm(bit_wid)
933 setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
934 m.d.comb += pb.part.eq(parts[i])
935 m.d.comb += pb.op.eq(self.b.bit_select(bit_wid * i, bit_wid))
936 m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
937 m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
938 nbt.append(pb.nt)
939 nlb.append(pb.nl)
940
941 # concatenate together and return all 4 results.
942 m.d.comb += [not_a_term.eq(Cat(*nat)),
943 not_b_term.eq(Cat(*nbt)),
944 neg_lsb_a_term.eq(Cat(*nla)),
945 neg_lsb_b_term.eq(Cat(*nlb)),
946 ]
947
948 return m
949
950
951 class IntermediateOut(Elaboratable):
952 """ selects the HI/LO part of the multiplication, for a given bit-width
953 the output is also reconstructed in its SIMD (partition) lanes.
954 """
955 def __init__(self, width, out_wid, n_parts):
956 self.width = width
957 self.n_parts = n_parts
958 self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
959 for i in range(8)]
960 self.intermed = Signal(out_wid, reset_less=True)
961 self.output = Signal(out_wid//2, reset_less=True)
962
963 def elaborate(self, platform):
964 m = Module()
965
966 ol = []
967 w = self.width
968 sel = w // 8
969 for i in range(self.n_parts):
970 op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
971 m.d.comb += op.eq(
972 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
973 self.intermed.bit_select(i * w*2, w),
974 self.intermed.bit_select(i * w*2 + w, w)))
975 ol.append(op)
976 m.d.comb += self.output.eq(Cat(*ol))
977
978 return m
979
980
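# scalar sketch (plain Python, helper name purely illustrative) of the
# per-lane HI/LO selection above, for one 8-bit partition whose
# intermediate product is 16 bits wide: OP_MUL_LOW takes the bottom half,
# all other ops take the top half.
def _select_lane8_sketch(intermed16, op):
    if op == OP_MUL_LOW:
        return intermed16 & 0xFF
    return (intermed16 >> 8) & 0xFF

assert _select_lane8_sketch(0xABCD, OP_MUL_LOW) == 0xCD
assert _select_lane8_sketch(0xABCD, OP_MUL_UNSIGNED_HIGH) == 0xAB
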
981 class FinalOut(Elaboratable):
982 """ selects the final output based on the partitioning.
983
984 each byte is selectable independently, i.e. it is possible
985 that some partitions requested 8-bit computation whilst others
986 requested 16 or 32 bit.
987 """
988 def __init__(self, output_width, n_parts, part_pts):
989 self.part_pts = part_pts
990 self.i = IntermediateData(part_pts, output_width, n_parts)
991 self.out_wid = output_width//2
992 # output
993 self.out = Signal(self.out_wid, reset_less=True)
994 self.intermediate_output = Signal(output_width, reset_less=True)
995
996 def elaborate(self, platform):
997 m = Module()
998
999 part_pts = self.part_pts
1000 m.submodules.p_8 = p_8 = Parts(8, part_pts, 8)
1001 m.submodules.p_16 = p_16 = Parts(8, part_pts, 4)
1002 m.submodules.p_32 = p_32 = Parts(8, part_pts, 2)
1003 m.submodules.p_64 = p_64 = Parts(8, part_pts, 1)
1004
1005 out_part_pts = self.i.part_pts
1006
1007 # temporaries
1008 d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
1009 d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
1010 d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
1011
1012 i8 = Signal(self.out_wid, reset_less=True)
1013 i16 = Signal(self.out_wid, reset_less=True)
1014 i32 = Signal(self.out_wid, reset_less=True)
1015 i64 = Signal(self.out_wid, reset_less=True)
1016
1017 m.d.comb += p_8.part_pts.eq(out_part_pts)
1018 m.d.comb += p_16.part_pts.eq(out_part_pts)
1019 m.d.comb += p_32.part_pts.eq(out_part_pts)
1020 m.d.comb += p_64.part_pts.eq(out_part_pts)
1021
1022 for i in range(len(p_8.parts)):
1023 m.d.comb += d8[i].eq(p_8.parts[i])
1024 for i in range(len(p_16.parts)):
1025 m.d.comb += d16[i].eq(p_16.parts[i])
1026 for i in range(len(p_32.parts)):
1027 m.d.comb += d32[i].eq(p_32.parts[i])
1028 m.d.comb += i8.eq(self.i.outputs[0])
1029 m.d.comb += i16.eq(self.i.outputs[1])
1030 m.d.comb += i32.eq(self.i.outputs[2])
1031 m.d.comb += i64.eq(self.i.outputs[3])
1032
1033 ol = []
1034 for i in range(8):
1035 # select one of the outputs: d8 selects i8, d16 selects i16
1036 # d32 selects i32, and the default is i64.
1037 # d8 and d16 are ORed together in the first Mux
1038 # then the 2nd selects either i8 or i16.
1039 # if neither d8 nor d16 are set, d32 selects either i32 or i64.
1040 op = Signal(8, reset_less=True, name="op_%d" % i)
1041 m.d.comb += op.eq(
1042 Mux(d8[i] | d16[i // 2],
1043 Mux(d8[i], i8.bit_select(i * 8, 8),
1044 i16.bit_select(i * 8, 8)),
1045 Mux(d32[i // 4], i32.bit_select(i * 8, 8),
1046 i64.bit_select(i * 8, 8))))
1047 ol.append(op)
1048 m.d.comb += self.out.eq(Cat(*ol))
1049 m.d.comb += self.intermediate_output.eq(self.i.intermediate_output)
1050 return m
1051
1052
1053 class OrMod(Elaboratable):
1054 """ ORs four values together in a hierarchical tree
1055 """
1056 def __init__(self, wid):
1057 self.wid = wid
1058 self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
1059 for i in range(4)]
1060 self.orout = Signal(wid, reset_less=True)
1061
1062 def elaborate(self, platform):
1063 m = Module()
1064 or1 = Signal(self.wid, reset_less=True)
1065 or2 = Signal(self.wid, reset_less=True)
1066 m.d.comb += or1.eq(self.orin[0] | self.orin[1])
1067 m.d.comb += or2.eq(self.orin[2] | self.orin[3])
1068 m.d.comb += self.orout.eq(or1 | or2)
1069
1070 return m
1071
1072
1073 class Signs(Elaboratable):
1074 """ determines whether a or b are signed numbers
1075 based on the required operation type (OP_MUL_*)
1076 """
1077
1078 def __init__(self):
1079 self.part_ops = Signal(2, reset_less=True)
1080 self.a_signed = Signal(reset_less=True)
1081 self.b_signed = Signal(reset_less=True)
1082
1083 def elaborate(self, platform):
1084
1085 m = Module()
1086
1087 asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
1088 bsig = (self.part_ops == OP_MUL_LOW) \
1089 | (self.part_ops == OP_MUL_SIGNED_HIGH)
1090 m.d.comb += self.a_signed.eq(asig)
1091 m.d.comb += self.b_signed.eq(bsig)
1092
1093 return m
1094
1095
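# scalar truth-table sketch (plain Python, helper name purely illustrative)
# of the Signs logic above: ``a`` is treated as signed for every op except
# OP_MUL_UNSIGNED_HIGH, and ``b`` only for OP_MUL_LOW and
# OP_MUL_SIGNED_HIGH.
def _signs_sketch(op):
    a_signed = op != OP_MUL_UNSIGNED_HIGH
    b_signed = op in (OP_MUL_LOW, OP_MUL_SIGNED_HIGH)
    return a_signed, b_signed

assert _signs_sketch(OP_MUL_SIGNED_UNSIGNED_HIGH) == (True, False)  # mulhsu
assert _signs_sketch(OP_MUL_UNSIGNED_HIGH) == (False, False)        # mulhu
assert _signs_sketch(OP_MUL_LOW) == (True, True)                    # mul
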
1096 class IntermediateData:
1097
1098 def __init__(self, part_pts, output_width, n_parts):
1099 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
1100 for i in range(n_parts)]
1101 self.part_pts = part_pts.like()
1102 self.outputs = [Signal(output_width, name="io%d" % i, reset_less=True)
1103 for i in range(4)]
1104 # intermediates (needed for unit tests)
1105 self.intermediate_output = Signal(output_width)
1106
1107 def eq_from(self, part_pts, outputs, intermediate_output,
1108 part_ops):
1109 return [self.part_pts.eq(part_pts)] + \
1110 [self.intermediate_output.eq(intermediate_output)] + \
1111 [self.outputs[i].eq(outputs[i])
1112 for i in range(4)] + \
1113 [self.part_ops[i].eq(part_ops[i])
1114 for i in range(len(self.part_ops))]
1115
1116 def eq(self, rhs):
1117 return self.eq_from(rhs.part_pts, rhs.outputs,
1118 rhs.intermediate_output, rhs.part_ops)
1119
1120
1121 class InputData:
1122
1123 def __init__(self):
1124 self.a = Signal(64)
1125 self.b = Signal(64)
1126 self.part_pts = PartitionPoints()
1127 for i in range(8, 64, 8):
1128 self.part_pts[i] = Signal(name=f"part_pts_{i}")
1129 self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1130
1131 def eq_from(self, part_pts, a, b, part_ops):
1132 return [self.part_pts.eq(part_pts)] + \
1133 [self.a.eq(a), self.b.eq(b)] + \
1134 [self.part_ops[i].eq(part_ops[i])
1135 for i in range(len(self.part_ops))]
1136
1137 def eq(self, rhs):
1138 return self.eq_from(rhs.part_pts, rhs.a, rhs.b, rhs.part_ops)
1139
1140
1141 class AllTerms(Elaboratable):
1142 """Set of terms to be added together
1143 """
1144
1145 def __init__(self, n_inputs, output_width, n_parts, register_levels):
1146 """Create an ``AddReduce``.
1147
1148 :param inputs: input ``Signal``s to be summed.
1149 :param output_width: bit-width of ``output``.
1150 :param register_levels: List of nesting levels that should have
1151 pipeline registers.
1152 :param partition_points: the input partition points.
1153 """
1154 self.i = InputData()
1155 self.register_levels = register_levels
1156 self.n_inputs = n_inputs
1157 self.n_parts = n_parts
1158 self.output_width = output_width
1159 self.o = AddReduceData(self.i.part_pts, n_inputs,
1160 output_width, n_parts)
1161
1162 def elaborate(self, platform):
1163 m = Module()
1164
1165 eps = self.i.part_pts
1166
1167 # collect part-bytes
1168 pbs = Signal(8, reset_less=True)
1169 tl = []
1170 for i in range(8):
1171 pb = Signal(name="pb%d" % i, reset_less=True)
1172 m.d.comb += pb.eq(eps.part_byte(i))
1173 tl.append(pb)
1174 m.d.comb += pbs.eq(Cat(*tl))
1175
1176 # local variables
1177 signs = []
1178 for i in range(8):
1179 s = Signs()
1180 signs.append(s)
1181 setattr(m.submodules, "signs%d" % i, s)
1182 m.d.comb += s.part_ops.eq(self.i.part_ops[i])
1183
1184 n_levels = len(self.register_levels)+1
1185 m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
1186 m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
1187 m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
1188 m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
1189 nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
1190 for mod in [part_8, part_16, part_32, part_64]:
1191 m.d.comb += mod.a.eq(self.i.a)
1192 m.d.comb += mod.b.eq(self.i.b)
1193 for i in range(len(signs)):
1194 m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
1195 m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
1196 m.d.comb += mod.pbs.eq(pbs)
1197 nat_l.append(mod.not_a_term)
1198 nbt_l.append(mod.not_b_term)
1199 nla_l.append(mod.neg_lsb_a_term)
1200 nlb_l.append(mod.neg_lsb_b_term)
1201
1202 terms = []
1203
1204 for a_index in range(8):
1205 t = ProductTerms(8, 128, 8, a_index, 8)
1206 setattr(m.submodules, "terms_%d" % a_index, t)
1207
1208 m.d.comb += t.a.eq(self.i.a)
1209 m.d.comb += t.b.eq(self.i.b)
1210 m.d.comb += t.pb_en.eq(pbs)
1211
1212 for term in t.terms:
1213 terms.append(term)
1214
1215 # it's fine to bitwise-or data together since they are never enabled
1216 # at the same time
1217 m.submodules.nat_or = nat_or = OrMod(128)
1218 m.submodules.nbt_or = nbt_or = OrMod(128)
1219 m.submodules.nla_or = nla_or = OrMod(128)
1220 m.submodules.nlb_or = nlb_or = OrMod(128)
1221 for l, mod in [(nat_l, nat_or),
1222 (nbt_l, nbt_or),
1223 (nla_l, nla_or),
1224 (nlb_l, nlb_or)]:
1225 for i in range(len(l)):
1226 m.d.comb += mod.orin[i].eq(l[i])
1227 terms.append(mod.orout)
1228
1229 # copy the intermediate terms to the output
1230 for i, value in enumerate(terms):
1231 m.d.comb += self.o.terms[i].eq(value)
1232
1233 # copy reg part points and part ops to output
1234 m.d.comb += self.o.part_pts.eq(eps)
1235 m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
1236 for i in range(len(self.i.part_ops))]
1237
1238 return m
1239
1240
1241 class Intermediates(Elaboratable):
1242 """ Intermediate output modules
1243 """
1244
1245 def __init__(self, output_width, n_parts, partition_points):
1246 self.i = FinalReduceData(partition_points, output_width, n_parts)
1247 self.o = IntermediateData(partition_points, output_width, n_parts)
1248
1249 def elaborate(self, platform):
1250 m = Module()
1251
1252 out_part_ops = self.i.part_ops
1253 out_part_pts = self.i.part_pts
1254
1255 # create _output_64
1256 m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
1257 m.d.comb += io64.intermed.eq(self.i.output)
1258 for i in range(8):
1259 m.d.comb += io64.part_ops[i].eq(out_part_ops[i])
1260 m.d.comb += self.o.outputs[3].eq(io64.output)
1261
1262 # create _output_32
1263 m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
1264 m.d.comb += io32.intermed.eq(self.i.output)
1265 for i in range(8):
1266 m.d.comb += io32.part_ops[i].eq(out_part_ops[i])
1267 m.d.comb += self.o.outputs[2].eq(io32.output)
1268
1269 # create _output_16
1270 m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
1271 m.d.comb += io16.intermed.eq(self.i.output)
1272 for i in range(8):
1273 m.d.comb += io16.part_ops[i].eq(out_part_ops[i])
1274 m.d.comb += self.o.outputs[1].eq(io16.output)
1275
1276 # create _output_8
1277 m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
1278 m.d.comb += io8.intermed.eq(self.i.output)
1279 for i in range(8):
1280 m.d.comb += io8.part_ops[i].eq(out_part_ops[i])
1281 m.d.comb += self.o.outputs[0].eq(io8.output)
1282
1283 for i in range(8):
1284 m.d.comb += self.o.part_ops[i].eq(out_part_ops[i])
1285 m.d.comb += self.o.part_pts.eq(out_part_pts)
1286 m.d.comb += self.o.intermediate_output.eq(self.i.output)
1287
1288 return m
1289
1290
1291 class Mul8_16_32_64(Elaboratable):
1292 """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
1293
1294 Supports partitioning into any combination of 8, 16, 32, and 64-bit
1295 partitions on naturally-aligned boundaries. Supports the operation being
1296 set for each partition independently.
1297
1298 :attribute part_pts: the input partition points. Has a partition point at
1299 multiples of 8 in 0 < i < 64. Each partition point's associated
1300 ``Value`` is a ``Signal``. Modification not supported, except for by
1301 ``Signal.eq``.
1302 :attribute part_ops: the operation for each byte. The operation for a
1303 particular partition is selected by assigning the selected operation
1304 code to each byte in the partition. The allowed operation codes are:
1305
1306 :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
1307 RISC-V's `mul` instruction.
1308 :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
1309 ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
1310 instruction.
1311 :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
1312 where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
1313 `mulhsu` instruction.
1314 :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
1315 ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
1316 instruction.
1317 """
1318
1319 def __init__(self, register_levels=()):
1320 """ register_levels: specifies the points in the cascade at which
1321 flip-flops are to be inserted.
1322 """
1323
1324 # parameter(s)
1325 self.register_levels = list(register_levels)
1326
1327 # inputs
1328 self.i = InputData()
1329 self.part_pts = self.i.part_pts
1330 self.part_ops = self.i.part_ops
1331 self.a = self.i.a
1332 self.b = self.i.b
1333
1334 # intermediates (needed for unit tests)
1335 self.intermediate_output = Signal(128)
1336
1337 # output
1338 self.output = Signal(64)
1339
1340 def elaborate(self, platform):
1341 m = Module()
1342
1343 part_pts = self.part_pts
1344
1345 n_inputs = 64 + 4
1346 n_parts = 8
1347 t = AllTerms(n_inputs, 128, n_parts, self.register_levels)
1348 m.submodules.allterms = t
1349 m.d.comb += t.i.eq(self.i)
1350
1351 terms = t.o.terms
1352
1353 add_reduce = AddReduce(terms,
1354 128,
1355 self.register_levels,
1356 t.o.part_pts,
1357 t.o.part_ops)
1358
1359 out_part_ops = add_reduce.o.part_ops
1360 out_part_pts = add_reduce.o.part_pts
1361
1362 m.submodules.add_reduce = add_reduce
1363
1364 interm = Intermediates(128, 8, part_pts)
1365 m.submodules.intermediates = interm
1366 m.d.comb += interm.i.eq(add_reduce.o)
1367
1368 # final output
1369 m.submodules.finalout = finalout = FinalOut(128, 8, part_pts)
1370 m.d.comb += finalout.i.eq(interm.o)
1371 m.d.comb += self.output.eq(finalout.out)
1372 m.d.comb += self.intermediate_output.eq(finalout.intermediate_output)
1373
1374 return m
1375
1376
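# usage sketch: a hypothetical wrapper module (not part of the design,
# name is illustrative) showing how a parent might configure two
# independent 32-bit OP_MUL_LOW lanes, by enabling only the partition
# point at bit 32 and setting all eight byte-ops to OP_MUL_LOW.
class _TwoLane32MulExample(Elaboratable):
    def __init__(self):
        self.a = Signal(64)
        self.b = Signal(64)
        self.output = Signal(64)

    def elaborate(self, platform):
        m = Module()
        m.submodules.mul = mul = Mul8_16_32_64()
        m.d.comb += [mul.part_pts[i].eq(1 if i == 32 else 0)
                     for i in range(8, 64, 8)]
        m.d.comb += [op.eq(OP_MUL_LOW) for op in mul.part_ops]
        m.d.comb += [mul.a.eq(self.a), mul.b.eq(self.b),
                     self.output.eq(mul.output)]
        return m
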
1377 if __name__ == "__main__":
1378 m = Mul8_16_32_64()
1379 main(m, ports=[m.a,
1380 m.b,
1381 m.intermediate_output,
1382 m.output,
1383 *m.part_ops,
1384 *m.part_pts.values()])