1 # SPDX-License-Identifier: LGPL-2.1-or-later
2 # See Notices.txt for copyright information
3 """Integer Multiplication."""
4
5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
6 from nmigen.hdl.ast import Assign
7 from abc import ABCMeta, abstractmethod
8 from nmigen.cli import main
9 from functools import reduce
10 from operator import or_
11
12
13 class PartitionPoints(dict):
14 """Partition points and corresponding ``Value``s.
15
16     The points at which an ALU is partitioned, along with ``Value``s that
17 specify if the corresponding partition points are enabled.
18
19 For example: ``{1: True, 5: True, 10: True}`` with
20 ``width == 16`` specifies that the ALU is split into 4 sections:
21 * bits 0 <= ``i`` < 1
22 * bits 1 <= ``i`` < 5
23 * bits 5 <= ``i`` < 10
24 * bits 10 <= ``i`` < 16
25
26 If the partition_points were instead ``{1: True, 5: a, 10: True}``
27 where ``a`` is a 1-bit ``Signal``:
28 * If ``a`` is asserted:
29 * bits 0 <= ``i`` < 1
30 * bits 1 <= ``i`` < 5
31 * bits 5 <= ``i`` < 10
32 * bits 10 <= ``i`` < 16
33 * Otherwise
34 * bits 0 <= ``i`` < 1
35 * bits 1 <= ``i`` < 10
36 * bits 10 <= ``i`` < 16
37 """
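    # Illustrative usage sketch (assumes nmigen Signals may be created in
    # this context): partitioning a 16-bit ALU at bits 4, 8 and 12, each
    # split controlled by its own 1-bit Signal:
    #
    #   switches = {i: Signal(name=f"pp_{i}") for i in (4, 8, 12)}
    #   pp = PartitionPoints(switches)
    #   mask = pp.as_mask(16)  # bit i is low when the split at bit i is enabled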
38
39 def __init__(self, partition_points=None):
40 """Create a new ``PartitionPoints``.
41
42 :param partition_points: the input partition points to values mapping.
43 """
44 super().__init__()
45 if partition_points is not None:
46 for point, enabled in partition_points.items():
47 if not isinstance(point, int):
48 raise TypeError("point must be a non-negative integer")
49 if point < 0:
50 raise ValueError("point must be a non-negative integer")
51 self[point] = Value.wrap(enabled)
52
53 def like(self, name=None, src_loc_at=0, mul=1):
54 """Create a new ``PartitionPoints`` with ``Signal``s for all values.
55
56 :param name: the base name for the new ``Signal``s.
57 :param mul: a multiplication factor on the indices
58 """
59 if name is None:
60 name = Signal(src_loc_at=1+src_loc_at).name # get variable name
61 retval = PartitionPoints()
62 for point, enabled in self.items():
63 point *= mul
64 retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
65 return retval
66
67 def eq(self, rhs):
68 """Assign ``PartitionPoints`` using ``Signal.eq``."""
69 if set(self.keys()) != set(rhs.keys()):
70 raise ValueError("incompatible point set")
71 for point, enabled in self.items():
72 yield enabled.eq(rhs[point])
73
74 def as_mask(self, width, mul=1):
75 """Create a bit-mask from `self`.
76
77 Each bit in the returned mask is clear only if the partition point at
78 the same bit-index is enabled.
79
80 :param width: the bit width of the resulting mask
81         :param mul: a "multiplier" which in-place expands the partition
82             points; typically set to 2 when used for multipliers
83 """
84 bits = []
85 for i in range(width):
86             p = i / mul
87             if p.is_integer() and int(p) in self:
88                 bits.append(~self[int(p)])
89 else:
90 bits.append(True)
91 return Cat(*bits)
92
93 def get_max_partition_count(self, width):
94 """Get the maximum number of partitions.
95
96 Gets the number of partitions when all partition points are enabled.
97 """
98 retval = 1
99 for point in self.keys():
100 if point < width:
101 retval += 1
102 return retval
103
104 def fits_in_width(self, width):
105 """Check if all partition points are smaller than `width`."""
106 for point in self.keys():
107 if point >= width:
108 return False
109 return True
110
111 def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
112 if index == -1 or index == 7:
113 return C(True, 1)
114 assert index >= 0 and index < 8
115 return self[(index * 8 + 8)*mfactor]
116
117
118 class FullAdder(Elaboratable):
119 """Full Adder.
120
121 :attribute in0: the first input
122 :attribute in1: the second input
123 :attribute in2: the third input
124 :attribute sum: the sum output
125 :attribute carry: the carry output
126
127     Rather than instantiate individual single-bit full adders (an array
128     of which would be very slow to simulate), this module takes a bit
129     width for its inputs and outputs: in effect it performs multiple
130     Full 3-2 Add operations "in parallel".
131 """
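    # Worked example (illustrative): bit-for-bit this is the carry-save
    # identity  in0 + in1 + in2 == sum + 2*carry.  For instance
    # in0=0b011, in1=0b001, in2=0b101 gives sum=0b111, carry=0b001
    # (3 + 1 + 5 == 7 + 2*1).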
132
133 def __init__(self, width):
134 """Create a ``FullAdder``.
135
136 :param width: the bit width of the input and output
137 """
138 self.in0 = Signal(width, reset_less=True)
139 self.in1 = Signal(width, reset_less=True)
140 self.in2 = Signal(width, reset_less=True)
141 self.sum = Signal(width, reset_less=True)
142 self.carry = Signal(width, reset_less=True)
143
144 def elaborate(self, platform):
145 """Elaborate this module."""
146 m = Module()
147 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
148 m.d.comb += self.carry.eq((self.in0 & self.in1)
149 | (self.in1 & self.in2)
150 | (self.in2 & self.in0))
151 return m
152
153
154 class MaskedFullAdder(Elaboratable):
155 """Masked Full Adder.
156
157 :attribute mask: the carry partition mask
158 :attribute in0: the first input
159 :attribute in1: the second input
160 :attribute in2: the third input
161 :attribute sum: the sum output
162 :attribute mcarry: the masked carry output
163
164 FullAdders are always used with a "mask" on the output. To keep
165 the graphviz "clean", this class performs the masking here rather
166 than inside a large for-loop.
167
168 See the following discussion as to why this is no longer derived
169 from FullAdder. Each carry is shifted here *before* being ANDed
170 with the mask, so that an AOI cell may be used (which is more
171 gate-efficient)
172 https://en.wikipedia.org/wiki/AND-OR-Invert
173 https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
174 """
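    # Equivalence note (illustrative): the result is the same as taking
    # FullAdder's carry, shifting it left by one (the top carry bit is
    # dropped) and ANDing it with ``mask``; the shift is applied to the
    # *inputs* (s1..s3 below) so each c-term is a plain 3-input AND.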
175
176 def __init__(self, width):
177 """Create a ``MaskedFullAdder``.
178
179 :param width: the bit width of the input and output
180 """
181 self.width = width
182 self.mask = Signal(width, reset_less=True)
183 self.mcarry = Signal(width, reset_less=True)
184 self.in0 = Signal(width, reset_less=True)
185 self.in1 = Signal(width, reset_less=True)
186 self.in2 = Signal(width, reset_less=True)
187 self.sum = Signal(width, reset_less=True)
188
189 def elaborate(self, platform):
190 """Elaborate this module."""
191 m = Module()
192 s1 = Signal(self.width, reset_less=True)
193 s2 = Signal(self.width, reset_less=True)
194 s3 = Signal(self.width, reset_less=True)
195 c1 = Signal(self.width, reset_less=True)
196 c2 = Signal(self.width, reset_less=True)
197 c3 = Signal(self.width, reset_less=True)
198 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
199 m.d.comb += s1.eq(Cat(0, self.in0))
200 m.d.comb += s2.eq(Cat(0, self.in1))
201 m.d.comb += s3.eq(Cat(0, self.in2))
202 m.d.comb += c1.eq(s1 & s2 & self.mask)
203 m.d.comb += c2.eq(s2 & s3 & self.mask)
204 m.d.comb += c3.eq(s3 & s1 & self.mask)
205 m.d.comb += self.mcarry.eq(c1 | c2 | c3)
206 return m
207
208
209 class PartitionedAdder(Elaboratable):
210 """Partitioned Adder.
211
212 Performs the final add. The partition points are included in the
213 actual add (in one of the operands only), which causes a carry over
214 to the next bit. Then the final output *removes* the extra bits from
215 the result.
216
217 partition: .... P... P... P... P... (32 bits)
218 a : .... .... .... .... .... (32 bits)
219 b : .... .... .... .... .... (32 bits)
220 exp-a : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
221 exp-b : ....0....0....0....0.... (32 bits plus 4 zeros)
222 exp-o : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
223 o : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
224
225 :attribute width: the bit width of the input and output. Read-only.
226 :attribute a: the first input to the adder
227 :attribute b: the second input to the adder
228 :attribute output: the sum output
229 :attribute partition_points: the input partition points. Modification not
230 supported, except for by ``Signal.eq``.
231 """
232
233 def __init__(self, width, partition_points, partition_step=1):
234 """Create a ``PartitionedAdder``.
235
236 :param width: the bit width of the input and output
237 :param partition_points: the input partition points
238         :param partition_step: a multiplier (typically 2) which in-place
239             "expands" the partition points
240 """
241 self.width = width
242 self.pmul = partition_step
243 self.a = Signal(width, reset_less=True)
244 self.b = Signal(width, reset_less=True)
245 self.output = Signal(width, reset_less=True)
246 self.partition_points = PartitionPoints(partition_points)
247 if not self.partition_points.fits_in_width(width):
248 raise ValueError("partition_points doesn't fit in width")
249 expanded_width = 0
250 for i in range(self.width):
251 if i in self.partition_points:
252 expanded_width += 1
253 expanded_width += 1
254 self._expanded_width = expanded_width
255
256 def elaborate(self, platform):
257 """Elaborate this module."""
258 m = Module()
259 expanded_a = Signal(self._expanded_width, reset_less=True)
260 expanded_b = Signal(self._expanded_width, reset_less=True)
261 expanded_o = Signal(self._expanded_width, reset_less=True)
262
263 expanded_index = 0
264 # store bits in a list, use Cat later. graphviz is much cleaner
265 al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
266
267         # partition points are "breaks" (extra 0 or 1 bits) inserted into
268         # what would otherwise be one massive long add.  when a "break"
269         # bit is 0 (partition enabled), an incoming carry stops there and
270         # is discarded.  when the "break" bit is 1 (partition disabled),
271         # an incoming carry rolls over into the *next* bit.  either way
272         # the "break" bit is dropped from the [intermediate] output, but
273         # by then the carry has (or has not) crossed the break point.
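        # worked example (illustrative): width=8 with one partition point
        # at bit 4 gives a 9-bit expanded add.  for a=0x0F, b=0x01:
        # * point enabled (two 4-bit lanes): the extra a-bit is 0, so
        #   exp_o = 0b0_0001_0000 and the output is 0x00 -- the carry out
        #   of the low lane is blocked.
        # * point disabled (one 8-bit add): the extra a-bit is 1, the
        #   carry rolls over, and the output is 0x10.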
274
275 for i in range(self.width):
276 pi = i/self.pmul # double the range of the partition point test
277 if pi.is_integer() and pi in self.partition_points:
278 # add extra bit set to 0 + 0 for enabled partition points
279 # and 1 + 0 for disabled partition points
280 ea.append(expanded_a[expanded_index])
281 al.append(~self.partition_points[pi]) # add extra bit in a
282 eb.append(expanded_b[expanded_index])
283 bl.append(C(0)) # yes, add a zero
284 expanded_index += 1 # skip the extra point. NOT in the output
285 ea.append(expanded_a[expanded_index])
286 eb.append(expanded_b[expanded_index])
287 eo.append(expanded_o[expanded_index])
288 al.append(self.a[i])
289 bl.append(self.b[i])
290 ol.append(self.output[i])
291 expanded_index += 1
292
293 # combine above using Cat
294 m.d.comb += Cat(*ea).eq(Cat(*al))
295 m.d.comb += Cat(*eb).eq(Cat(*bl))
296 m.d.comb += Cat(*ol).eq(Cat(*eo))
297
298 # use only one addition to take advantage of look-ahead carry and
299 # special hardware on FPGAs
300 m.d.comb += expanded_o.eq(expanded_a + expanded_b)
301 return m
302
303
304 FULL_ADDER_INPUT_COUNT = 3
305
306 class AddReduceData:
307
308 def __init__(self, part_pts, n_inputs, output_width, n_parts):
309 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
310 for i in range(n_parts)]
311 self.terms = [Signal(output_width, name=f"inputs_{i}",
312 reset_less=True)
313 for i in range(n_inputs)]
314 self.part_pts = part_pts.like()
315
316 def eq_from(self, part_pts, inputs, part_ops):
317 return [self.part_pts.eq(part_pts)] + \
318 [self.terms[i].eq(inputs[i])
319 for i in range(len(self.terms))] + \
320 [self.part_ops[i].eq(part_ops[i])
321 for i in range(len(self.part_ops))]
322
323 def eq(self, rhs):
324 return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
325
326
327 class FinalReduceData:
328
329 def __init__(self, part_pts, output_width, n_parts):
330 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
331 for i in range(n_parts)]
332 self.output = Signal(output_width, reset_less=True)
333 self.part_pts = part_pts.like()
334
335 def eq_from(self, part_pts, output, part_ops):
336 return [self.part_pts.eq(part_pts)] + \
337 [self.output.eq(output)] + \
338 [self.part_ops[i].eq(part_ops[i])
339 for i in range(len(self.part_ops))]
340
341 def eq(self, rhs):
342 return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
343
344
345 class FinalAdd(Elaboratable):
346 """ Final stage of add reduce
347 """
348
349 def __init__(self, n_inputs, output_width, n_parts, partition_points):
350 self.output_width = output_width
351 self.n_inputs = n_inputs
352 self.n_parts = n_parts
353 self.partition_points = PartitionPoints(partition_points)
354 if not self.partition_points.fits_in_width(output_width):
355 raise ValueError("partition_points doesn't fit in output_width")
356
357 self.i = self.ispec()
358 self.o = self.ospec()
359
360 def ispec(self):
361 return AddReduceData(self.partition_points, self.n_inputs,
362 self.output_width, self.n_parts)
363
364 def ospec(self):
365 return FinalReduceData(self.partition_points,
366 self.output_width, self.n_parts)
367
368 def elaborate(self, platform):
369 """Elaborate this module."""
370 m = Module()
371
372 output_width = self.output_width
373 output = Signal(output_width, reset_less=True)
374 if self.n_inputs == 0:
375 # use 0 as the default output value
376 m.d.comb += output.eq(0)
377 elif self.n_inputs == 1:
378 # handle single input
379 m.d.comb += output.eq(self.i.terms[0])
380 else:
381 # base case for adding 2 inputs
382 assert self.n_inputs == 2
383 adder = PartitionedAdder(output_width,
384 self.i.part_pts, 2)
385 m.submodules.final_adder = adder
386 m.d.comb += adder.a.eq(self.i.terms[0])
387 m.d.comb += adder.b.eq(self.i.terms[1])
388 m.d.comb += output.eq(adder.output)
389
390 # create output
391 m.d.comb += self.o.eq_from(self.i.part_pts, output,
392 self.i.part_ops)
393
394 return m
395
396
397 class AddReduceSingle(Elaboratable):
398     """Perform one level of add-reduction on a list of input terms.
399 
400     Groups the input terms into 3:2 carry-save adders (MaskedFullAdder),
401     producing a shorter list of terms for the next level.
402 
403     :attribute i: input ``AddReduceData`` (terms, partition points and
404         partition ops). Modification only supported by ``Signal.eq``.
405     :attribute o: output ``AddReduceData`` holding the reduced terms.
406     :attribute partition_points: the input partition points.
407     """
408
409 def __init__(self, n_inputs, output_width, n_parts, partition_points):
410         """Create an ``AddReduceSingle``.
411         :param n_inputs: number of input terms to sum.
412         :param output_width: bit-width of each term and of the output.
413         :param n_parts: number of partition-op lanes to pass through.
414         :param partition_points: the input partition points.
415         """
416 self.n_inputs = n_inputs
417 self.n_parts = n_parts
418 self.output_width = output_width
419 self.partition_points = PartitionPoints(partition_points)
420 if not self.partition_points.fits_in_width(output_width):
421 raise ValueError("partition_points doesn't fit in output_width")
422
423 self.groups = AddReduceSingle.full_adder_groups(n_inputs)
424 self.n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
425
426 self.i = self.ispec()
427 self.o = self.ospec()
428
429 def ispec(self):
430 return AddReduceData(self.partition_points, self.n_inputs,
431 self.output_width, self.n_parts)
432
433 def ospec(self):
434 return AddReduceData(self.partition_points, self.n_terms,
435 self.output_width, self.n_parts)
436
437 @staticmethod
438 def calc_n_inputs(n_inputs, groups):
439 retval = len(groups)*2
440 if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
441 retval += 1
442 elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
443 retval += 2
444 else:
445 assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
446 return retval
447
448 @staticmethod
449 def get_max_level(input_count):
450 """Get the maximum level.
451
452 All ``register_levels`` must be less than or equal to the maximum
453 level.
454 """
455 retval = 0
456 while True:
457 groups = AddReduceSingle.full_adder_groups(input_count)
458 if len(groups) == 0:
459 return retval
460 input_count %= FULL_ADDER_INPUT_COUNT
461 input_count += 2 * len(groups)
462 retval += 1
463
464 @staticmethod
465 def full_adder_groups(input_count):
466 """Get ``inputs`` indices for which a full adder should be built."""
467 return range(0,
468 input_count - FULL_ADDER_INPUT_COUNT + 1,
469 FULL_ADDER_INPUT_COUNT)
470
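    # reduction-rate note (illustrative): each level shrinks N terms to
    # 2*(N//3) + (N % 3).  the 68 terms produced by Mul8_16_32_64 reduce
    # as 68 -> 46 -> 31 -> 21 -> 14 -> 10 -> 7 -> 5 -> 4 -> 3 -> 2, after
    # which FinalAdd performs the final 2-input partitioned add.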
471 def create_next_terms(self):
472 """ create next intermediate terms, for linking up in elaborate, below
473 """
474 terms = []
475 adders = []
476
477 # create full adders for this recursive level.
478 # this shrinks N terms to 2 * (N // 3) plus the remainder
479 for i in self.groups:
480 adder_i = MaskedFullAdder(self.output_width)
481 adders.append((i, adder_i))
482 # add both the sum and the masked-carry to the next level.
483 # 3 inputs have now been reduced to 2...
484 terms.append(adder_i.sum)
485 terms.append(adder_i.mcarry)
486 # handle the remaining inputs.
487 if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
488 terms.append(self.i.terms[-1])
489 elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
490             # Just pass the two remaining terms to the next layer: a half
491             # adder would not reduce the term count (there would still be
492             # 2 terms), and passing them through unchanged saves gates.
493 terms.append(self.i.terms[-2])
494 terms.append(self.i.terms[-1])
495 else:
496 assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
497
498 return terms, adders
499
500 def elaborate(self, platform):
501 """Elaborate this module."""
502 m = Module()
503
504 terms, adders = self.create_next_terms()
505
506 # copy the intermediate terms to the output
507 for i, value in enumerate(terms):
508 m.d.comb += self.o.terms[i].eq(value)
509
510 # copy reg part points and part ops to output
511 m.d.comb += self.o.part_pts.eq(self.i.part_pts)
512 m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
513 for i in range(len(self.i.part_ops))]
514
515 # set up the partition mask (for the adders)
516 part_mask = Signal(self.output_width, reset_less=True)
517
518 # get partition points as a mask
519 mask = self.i.part_pts.as_mask(self.output_width, mul=2)
520 m.d.comb += part_mask.eq(mask)
521
522 # add and link the intermediate term modules
523 for i, (iidx, adder_i) in enumerate(adders):
524 setattr(m.submodules, f"adder_{i}", adder_i)
525
526 m.d.comb += adder_i.in0.eq(self.i.terms[iidx])
527 m.d.comb += adder_i.in1.eq(self.i.terms[iidx + 1])
528 m.d.comb += adder_i.in2.eq(self.i.terms[iidx + 2])
529 m.d.comb += adder_i.mask.eq(part_mask)
530
531 return m
532
533
534 class AddReduceInternal:
535     """Build the tree of modules that adds a list of terms together.
536 
537     :attribute inputs: input ``Signal``s (terms) to be summed. Modification
538         not supported, except for by ``Signal.eq``.
539     :attribute levels: the ``AddReduceSingle`` reduction levels, ending
540         with a ``FinalAdd``.
541     :attribute output_width: bit-width of the final sum.
542     :attribute partition_points: the input partition points. Modification not
543         supported, except for by ``Signal.eq``.
544     """
545
546 def __init__(self, i, output_width):
547         """Create an ``AddReduceInternal``.
548 
549         :param i: ``AddReduceData`` holding the input terms, partition
550             points and partition ops.
551         :param output_width: bit-width of the final sum.
552         """
553 self.i = i
554 self.inputs = i.terms
555 self.part_ops = i.part_ops
556 self.output_width = output_width
557 self.partition_points = i.part_pts
558
559 self.create_levels()
560
561 def create_levels(self):
562 """creates reduction levels"""
563
564 mods = []
565 partition_points = self.partition_points
566 part_ops = self.part_ops
567 n_parts = len(part_ops)
568 inputs = self.inputs
569 ilen = len(inputs)
570 while True:
571 groups = AddReduceSingle.full_adder_groups(len(inputs))
572 if len(groups) == 0:
573 break
574 next_level = AddReduceSingle(ilen, self.output_width, n_parts,
575 partition_points)
576 mods.append(next_level)
577 partition_points = next_level.i.part_pts
578 inputs = next_level.o.terms
579 ilen = len(inputs)
580 part_ops = next_level.i.part_ops
581
582 next_level = FinalAdd(ilen, self.output_width, n_parts,
583 partition_points)
584 mods.append(next_level)
585
586 self.levels = mods
587
588
589 class AddReduce(AddReduceInternal, Elaboratable):
590 """Recursively Add list of numbers together.
591
592 :attribute inputs: input ``Signal``s to be summed. Modification not
593 supported, except for by ``Signal.eq``.
594 :attribute register_levels: List of nesting levels that should have
595 pipeline registers.
596 :attribute output: output sum.
597 :attribute partition_points: the input partition points. Modification not
598 supported, except for by ``Signal.eq``.
599 """
600
601 def __init__(self, inputs, output_width, register_levels, part_pts,
602 part_ops):
603 """Create an ``AddReduce``.
604
605         :param inputs: input ``Signal``s to be summed.
606         :param output_width: bit-width of ``output``.
607         :param register_levels: levels at which to insert pipeline registers.
608         :param part_pts: the input partition points.
609         :param part_ops: the per-partition operation codes.
610 """
611 self._inputs = inputs
612 self._part_pts = part_pts
613 self._part_ops = part_ops
614 n_parts = len(part_ops)
615 self.i = AddReduceData(part_pts, len(inputs),
616 output_width, n_parts)
617 AddReduceInternal.__init__(self, self.i, output_width)
618 self.o = FinalReduceData(part_pts, output_width, n_parts)
619 self.register_levels = register_levels
620
621 @staticmethod
622 def get_max_level(input_count):
623 return AddReduceSingle.get_max_level(input_count)
624
625 @staticmethod
626 def next_register_levels(register_levels):
627 """``Iterable`` of ``register_levels`` for next recursive level."""
628 for level in register_levels:
629 if level > 0:
630 yield level - 1
631
632 def elaborate(self, platform):
633 """Elaborate this module."""
634 m = Module()
635
636 m.d.comb += self.i.eq_from(self._part_pts, self._inputs, self._part_ops)
637
638 for i, next_level in enumerate(self.levels):
639 setattr(m.submodules, "next_level%d" % i, next_level)
640
641 i = self.i
642 for idx in range(len(self.levels)):
643 mcur = self.levels[idx]
644 if idx in self.register_levels:
645 m.d.sync += mcur.i.eq(i)
646 else:
647 m.d.comb += mcur.i.eq(i)
648 i = mcur.o # for next loop
649
650 # output comes from last module
651 m.d.comb += self.o.eq(i)
652
653 return m
654
655
656 OP_MUL_LOW = 0
657 OP_MUL_SIGNED_HIGH = 1
658 OP_MUL_SIGNED_UNSIGNED_HIGH = 2 # a is signed, b is unsigned
659 OP_MUL_UNSIGNED_HIGH = 3
660
661
662 def get_term(value, shift=0, enabled=None):
663 if enabled is not None:
664 value = Mux(enabled, value, 0)
665 if shift > 0:
666 value = Cat(Repl(C(0, 1), shift), value)
667 else:
668 assert shift == 0
669 return value
670
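# Note (illustrative): get_term(v, 8) prepends eight zero low-order bits to
# v, i.e. the same bit-pattern as (v << 8); if ``enabled`` is supplied, the
# term is forced to zero whenever it is deasserted.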
671
672 class ProductTerm(Elaboratable):
673 """ this class creates a single product term (a[..]*b[..]).
674         it has a design flaw, in that it is the *output* that is selected,
675         while the multiplication(s) are combinatorially generated
676         all the time.
677 """
678
679 def __init__(self, width, twidth, pbwid, a_index, b_index):
680 self.a_index = a_index
681 self.b_index = b_index
682 shift = 8 * (self.a_index + self.b_index)
683 self.pwidth = width
684 self.twidth = twidth
685 self.width = width*2
686 self.shift = shift
687
688 self.ti = Signal(self.width, reset_less=True)
689 self.term = Signal(twidth, reset_less=True)
690 self.a = Signal(twidth//2, reset_less=True)
691 self.b = Signal(twidth//2, reset_less=True)
692 self.pb_en = Signal(pbwid, reset_less=True)
693
694 self.tl = tl = []
695 min_index = min(self.a_index, self.b_index)
696 max_index = max(self.a_index, self.b_index)
697 for i in range(min_index, max_index):
698 tl.append(self.pb_en[i])
699 name = "te_%d_%d" % (self.a_index, self.b_index)
700 if len(tl) > 0:
701 term_enabled = Signal(name=name, reset_less=True)
702 else:
703 term_enabled = None
704 self.enabled = term_enabled
705 self.term.name = "term_%d_%d" % (a_index, b_index) # rename
706
707 def elaborate(self, platform):
708
709 m = Module()
710 if self.enabled is not None:
711 m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
712
713 bsa = Signal(self.width, reset_less=True)
714 bsb = Signal(self.width, reset_less=True)
715 a_index, b_index = self.a_index, self.b_index
716 pwidth = self.pwidth
717 m.d.comb += bsa.eq(self.a.bit_select(a_index * pwidth, pwidth))
718 m.d.comb += bsb.eq(self.b.bit_select(b_index * pwidth, pwidth))
719 m.d.comb += self.ti.eq(bsa * bsb)
720 m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
721 """
722 #TODO: sort out width issues, get inputs a/b switched on/off.
723 #data going into Muxes is 1/2 the required width
724
725 pwidth = self.pwidth
726 width = self.width
727 bsa = Signal(self.twidth//2, reset_less=True)
728 bsb = Signal(self.twidth//2, reset_less=True)
729 asel = Signal(width, reset_less=True)
730 bsel = Signal(width, reset_less=True)
731 a_index, b_index = self.a_index, self.b_index
732 m.d.comb += asel.eq(self.a.bit_select(a_index * pwidth, pwidth))
733 m.d.comb += bsel.eq(self.b.bit_select(b_index * pwidth, pwidth))
734 m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
735 m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
736 m.d.comb += self.ti.eq(bsa * bsb)
737 m.d.comb += self.term.eq(self.ti)
738 """
739
740 return m
741
742
743 class ProductTerms(Elaboratable):
744     """ creates a bank of product terms. also performs the actual bit-selection.
745 this class is to be wrapped with a for-loop on the "a" operand.
746 it creates a second-level for-loop on the "b" operand.
747 """
748 def __init__(self, width, twidth, pbwid, a_index, blen):
749 self.a_index = a_index
750 self.blen = blen
751 self.pwidth = width
752 self.twidth = twidth
753 self.pbwid = pbwid
754 self.a = Signal(twidth//2, reset_less=True)
755 self.b = Signal(twidth//2, reset_less=True)
756 self.pb_en = Signal(pbwid, reset_less=True)
757 self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
758 for i in range(blen)]
759
760 def elaborate(self, platform):
761
762 m = Module()
763
764 for b_index in range(self.blen):
765 t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
766 self.a_index, b_index)
767 setattr(m.submodules, "term_%d" % b_index, t)
768
769 m.d.comb += t.a.eq(self.a)
770 m.d.comb += t.b.eq(self.b)
771 m.d.comb += t.pb_en.eq(self.pb_en)
772
773 m.d.comb += self.terms[b_index].eq(t.term)
774
775 return m
776
777
778 class LSBNegTerm(Elaboratable):
779
780 def __init__(self, bit_width):
781 self.bit_width = bit_width
782 self.part = Signal(reset_less=True)
783 self.signed = Signal(reset_less=True)
784 self.op = Signal(bit_width, reset_less=True)
785 self.msb = Signal(reset_less=True)
786 self.nt = Signal(bit_width*2, reset_less=True)
787 self.nl = Signal(bit_width*2, reset_less=True)
788
789 def elaborate(self, platform):
790 m = Module()
791 comb = m.d.comb
792 bit_wid = self.bit_width
793 ext = Repl(0, bit_wid) # extend output to HI part
794
795 # determine sign of each incoming number *in this partition*
796 enabled = Signal(reset_less=True)
797 m.d.comb += enabled.eq(self.part & self.msb & self.signed)
798
799 # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
800 # negation operation is split into a bitwise not and a +1.
801 # likewise for 16, 32, and 64-bit values.
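        # worked example (illustrative): -a * 0x100 (mod 2**16) equals
        # ((~a) << 8) + (1 << 8).  e.g. a = 0x03: ((~0x03) & 0xFF) << 8 is
        # 0xFC00, plus 0x0100 gives 0xFD00 == (-3 * 0x100) & 0xFFFF.
        # nt below supplies the ((~a) << 8) part and nl the (1 << 8) part.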
802
803 # width-extended 1s complement if a is signed, otherwise zero
804 comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
805
806 # add 1 if signed, otherwise add zero
807 comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
808
809 return m
810
811
812 class Parts(Elaboratable):
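    # Note (illustrative summary): for n_parts lanes of 8//n_parts bytes
    # each, parts[i] is asserted when lane i is a *complete* partition:
    # the partition points at both ends of the lane are enabled (or fall
    # at the edges of the register) and no point inside the lane is set.
    #
    # worked example: a 64-bit input split 16|16|32 (points enabled at
    # bits 16 and 32 only) gives parts == [1, 1, 0, 0] for n_parts=4
    # (16-bit lanes) and parts == [0, 1] for n_parts=2 (32-bit lanes).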
813
814 def __init__(self, pbwid, part_pts, n_parts):
815 self.pbwid = pbwid
816 # inputs
817 self.part_pts = PartitionPoints.like(part_pts)
818 # outputs
819 self.parts = [Signal(name=f"part_{i}", reset_less=True)
820 for i in range(n_parts)]
821
822 def elaborate(self, platform):
823 m = Module()
824
825 part_pts, parts = self.part_pts, self.parts
826 # collect part-bytes (double factor because the input is extended)
827 pbs = Signal(self.pbwid, reset_less=True)
828 tl = []
829 for i in range(self.pbwid):
830 pb = Signal(name="pb%d" % i, reset_less=True)
831 m.d.comb += pb.eq(part_pts.part_byte(i))
832 tl.append(pb)
833 m.d.comb += pbs.eq(Cat(*tl))
834
835 # negated-temporary copy of partition bits
836 npbs = Signal.like(pbs, reset_less=True)
837 m.d.comb += npbs.eq(~pbs)
838 byte_count = 8 // len(parts)
839 for i in range(len(parts)):
840 pbl = []
841 pbl.append(npbs[i * byte_count - 1])
842 for j in range(i * byte_count, (i + 1) * byte_count - 1):
843 pbl.append(pbs[j])
844 pbl.append(npbs[(i + 1) * byte_count - 1])
845 value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
846 m.d.comb += value.eq(Cat(*pbl))
847 m.d.comb += parts[i].eq(~(value).bool())
848
849 return m
850
851
852 class Part(Elaboratable):
853 """ a key class which, depending on the partitioning, will determine
854 what action to take when parts of the output are signed or unsigned.
855
856 this requires 2 pieces of data *per operand, per partition*:
857 whether the MSB is HI/LO (per partition!), and whether a signed
858 or unsigned operation has been *requested*.
859
860 once that is determined, signed is basically carried out
861 by splitting 2's complement into 1's complement plus one.
862 1's complement is just a bit-inversion.
863
864 the extra terms - as separate terms - are then thrown at the
865 AddReduce alongside the multiplication part-results.
866 """
867 def __init__(self, part_pts, width, n_parts, n_levels, pbwid):
868
869 self.pbwid = pbwid
870 self.part_pts = part_pts
871
872 # inputs
873 self.a = Signal(64, reset_less=True)
874 self.b = Signal(64, reset_less=True)
875 self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
876 for i in range(8)]
877         self.b_signed = [Signal(name=f"b_signed_{i}", reset_less=True)
878 for i in range(8)]
879 self.pbs = Signal(pbwid, reset_less=True)
880
881 # outputs
882 self.parts = [Signal(name=f"part_{i}", reset_less=True)
883 for i in range(n_parts)]
884
885 self.not_a_term = Signal(width, reset_less=True)
886 self.neg_lsb_a_term = Signal(width, reset_less=True)
887 self.not_b_term = Signal(width, reset_less=True)
888 self.neg_lsb_b_term = Signal(width, reset_less=True)
889
890 def elaborate(self, platform):
891 m = Module()
892
893 pbs, parts = self.pbs, self.parts
894 part_pts = self.part_pts
895 m.submodules.p = p = Parts(self.pbwid, part_pts, len(parts))
896 m.d.comb += p.part_pts.eq(part_pts)
897 parts = p.parts
898
899 byte_count = 8 // len(parts)
900
901 not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
902 self.not_a_term, self.neg_lsb_a_term,
903 self.not_b_term, self.neg_lsb_b_term)
904
905 byte_width = 8 // len(parts) # byte width
906 bit_wid = 8 * byte_width # bit width
907 nat, nbt, nla, nlb = [], [], [], []
908 for i in range(len(parts)):
909 # work out bit-inverted and +1 term for a.
910 pa = LSBNegTerm(bit_wid)
911 setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
912 m.d.comb += pa.part.eq(parts[i])
913 m.d.comb += pa.op.eq(self.a.bit_select(bit_wid * i, bit_wid))
914 m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
915 m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
916 nat.append(pa.nt)
917 nla.append(pa.nl)
918
919 # work out bit-inverted and +1 term for b
920 pb = LSBNegTerm(bit_wid)
921 setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
922 m.d.comb += pb.part.eq(parts[i])
923 m.d.comb += pb.op.eq(self.b.bit_select(bit_wid * i, bit_wid))
924 m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
925 m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
926 nbt.append(pb.nt)
927 nlb.append(pb.nl)
928
929 # concatenate together and return all 4 results.
930 m.d.comb += [not_a_term.eq(Cat(*nat)),
931 not_b_term.eq(Cat(*nbt)),
932 neg_lsb_a_term.eq(Cat(*nla)),
933 neg_lsb_b_term.eq(Cat(*nlb)),
934 ]
935
936 return m
937
938
939 class IntermediateOut(Elaboratable):
940     """ selects the HI/LO part of the multiplication for a given bit-width;
941 the output is also reconstructed in its SIMD (partition) lanes.
942 """
943 def __init__(self, width, out_wid, n_parts):
944 self.width = width
945 self.n_parts = n_parts
946 self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
947 for i in range(8)]
948 self.intermed = Signal(out_wid, reset_less=True)
949 self.output = Signal(out_wid//2, reset_less=True)
950
951 def elaborate(self, platform):
952 m = Module()
953
954 ol = []
955 w = self.width
956 sel = w // 8
957 for i in range(self.n_parts):
958 op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
959 m.d.comb += op.eq(
960 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
961 self.intermed.bit_select(i * w*2, w),
962 self.intermed.bit_select(i * w*2 + w, w)))
963 ol.append(op)
964 m.d.comb += self.output.eq(Cat(*ol))
965
966 return m
967
968
969 class FinalOut(Elaboratable):
970 """ selects the final output based on the partitioning.
971
972 each byte is selectable independently, i.e. it is possible
973 that some partitions requested 8-bit computation whilst others
974 requested 16 or 32 bit.
975 """
976 def __init__(self, output_width, n_parts, part_pts):
977 self.part_pts = part_pts
978 self.output_width = output_width
979 self.n_parts = n_parts
980 self.out_wid = output_width//2
981
982 self.i = self.ispec()
983 self.o = self.ospec()
984
985 def ispec(self):
986 return IntermediateData(self.part_pts, self.output_width, self.n_parts)
987
988 def ospec(self):
989 return OutputData()
990
991 def elaborate(self, platform):
992 m = Module()
993
994 part_pts = self.part_pts
995 m.submodules.p_8 = p_8 = Parts(8, part_pts, 8)
996 m.submodules.p_16 = p_16 = Parts(8, part_pts, 4)
997 m.submodules.p_32 = p_32 = Parts(8, part_pts, 2)
998 m.submodules.p_64 = p_64 = Parts(8, part_pts, 1)
999
1000 out_part_pts = self.i.part_pts
1001
1002 # temporaries
1003 d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
1004 d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
1005 d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
1006
1007 i8 = Signal(self.out_wid, reset_less=True)
1008 i16 = Signal(self.out_wid, reset_less=True)
1009 i32 = Signal(self.out_wid, reset_less=True)
1010 i64 = Signal(self.out_wid, reset_less=True)
1011
1012 m.d.comb += p_8.part_pts.eq(out_part_pts)
1013 m.d.comb += p_16.part_pts.eq(out_part_pts)
1014 m.d.comb += p_32.part_pts.eq(out_part_pts)
1015 m.d.comb += p_64.part_pts.eq(out_part_pts)
1016
1017 for i in range(len(p_8.parts)):
1018 m.d.comb += d8[i].eq(p_8.parts[i])
1019 for i in range(len(p_16.parts)):
1020 m.d.comb += d16[i].eq(p_16.parts[i])
1021 for i in range(len(p_32.parts)):
1022 m.d.comb += d32[i].eq(p_32.parts[i])
1023 m.d.comb += i8.eq(self.i.outputs[0])
1024 m.d.comb += i16.eq(self.i.outputs[1])
1025 m.d.comb += i32.eq(self.i.outputs[2])
1026 m.d.comb += i64.eq(self.i.outputs[3])
1027
1028 ol = []
1029 for i in range(8):
1030 # select one of the outputs: d8 selects i8, d16 selects i16
1031 # d32 selects i32, and the default is i64.
1032 # d8 and d16 are ORed together in the first Mux
1033 # then the 2nd selects either i8 or i16.
1034 # if neither d8 nor d16 are set, d32 selects either i32 or i64.
1035 op = Signal(8, reset_less=True, name="op_%d" % i)
1036 m.d.comb += op.eq(
1037 Mux(d8[i] | d16[i // 2],
1038 Mux(d8[i], i8.bit_select(i * 8, 8),
1039 i16.bit_select(i * 8, 8)),
1040 Mux(d32[i // 4], i32.bit_select(i * 8, 8),
1041 i64.bit_select(i * 8, 8))))
1042 ol.append(op)
1043
1044 # create outputs
1045 m.d.comb += self.o.output.eq(Cat(*ol))
1046 m.d.comb += self.o.intermediate_output.eq(self.i.intermediate_output)
1047
1048 return m
1049
1050
1051 class OrMod(Elaboratable):
1052 """ ORs four values together in a hierarchical tree
1053 """
1054 def __init__(self, wid):
1055 self.wid = wid
1056 self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
1057 for i in range(4)]
1058 self.orout = Signal(wid, reset_less=True)
1059
1060 def elaborate(self, platform):
1061 m = Module()
1062 or1 = Signal(self.wid, reset_less=True)
1063 or2 = Signal(self.wid, reset_less=True)
1064 m.d.comb += or1.eq(self.orin[0] | self.orin[1])
1065 m.d.comb += or2.eq(self.orin[2] | self.orin[3])
1066 m.d.comb += self.orout.eq(or1 | or2)
1067
1068 return m
1069
1070
1071 class Signs(Elaboratable):
1072 """ determines whether a or b are signed numbers
1073 based on the required operation type (OP_MUL_*)
1074 """
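    # truth table (illustrative), matching the OP_MUL_* codes above:
    #   OP_MUL_LOW                  -> a_signed=1, b_signed=1  (mul)
    #   OP_MUL_SIGNED_HIGH          -> a_signed=1, b_signed=1  (mulh)
    #   OP_MUL_SIGNED_UNSIGNED_HIGH -> a_signed=1, b_signed=0  (mulhsu)
    #   OP_MUL_UNSIGNED_HIGH        -> a_signed=0, b_signed=0  (mulhu)
    # (for OP_MUL_LOW the signedness does not affect the low half.)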
1075
1076 def __init__(self):
1077 self.part_ops = Signal(2, reset_less=True)
1078 self.a_signed = Signal(reset_less=True)
1079 self.b_signed = Signal(reset_less=True)
1080
1081 def elaborate(self, platform):
1082
1083 m = Module()
1084
1085 asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
1086 bsig = (self.part_ops == OP_MUL_LOW) \
1087 | (self.part_ops == OP_MUL_SIGNED_HIGH)
1088 m.d.comb += self.a_signed.eq(asig)
1089 m.d.comb += self.b_signed.eq(bsig)
1090
1091 return m
1092
1093
1094 class IntermediateData:
1095
1096 def __init__(self, part_pts, output_width, n_parts):
1097 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
1098 for i in range(n_parts)]
1099 self.part_pts = part_pts.like()
1100 self.outputs = [Signal(output_width, name="io%d" % i, reset_less=True)
1101 for i in range(4)]
1102 # intermediates (needed for unit tests)
1103 self.intermediate_output = Signal(output_width)
1104
1105 def eq_from(self, part_pts, outputs, intermediate_output,
1106 part_ops):
1107 return [self.part_pts.eq(part_pts)] + \
1108 [self.intermediate_output.eq(intermediate_output)] + \
1109 [self.outputs[i].eq(outputs[i])
1110 for i in range(4)] + \
1111 [self.part_ops[i].eq(part_ops[i])
1112 for i in range(len(self.part_ops))]
1113
1114 def eq(self, rhs):
1115 return self.eq_from(rhs.part_pts, rhs.outputs,
1116 rhs.intermediate_output, rhs.part_ops)
1117
1118
1119 class InputData:
1120
1121 def __init__(self):
1122 self.a = Signal(64)
1123 self.b = Signal(64)
1124 self.part_pts = PartitionPoints()
1125 for i in range(8, 64, 8):
1126 self.part_pts[i] = Signal(name=f"part_pts_{i}")
1127 self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1128
1129 def eq_from(self, part_pts, a, b, part_ops):
1130 return [self.part_pts.eq(part_pts)] + \
1131 [self.a.eq(a), self.b.eq(b)] + \
1132 [self.part_ops[i].eq(part_ops[i])
1133 for i in range(len(self.part_ops))]
1134
1135 def eq(self, rhs):
1136 return self.eq_from(rhs.part_pts, rhs.a, rhs.b, rhs.part_ops)
1137
1138
1139 class OutputData:
1140
1141 def __init__(self):
1142 self.intermediate_output = Signal(128) # needed for unit tests
1143 self.output = Signal(64)
1144
1145 def eq(self, rhs):
1146 return [self.intermediate_output.eq(rhs.intermediate_output),
1147 self.output.eq(rhs.output)]
1148
1149
1150 class AllTerms(Elaboratable):
1151 """Set of terms to be added together
1152 """
1153
1154 def __init__(self, n_inputs, output_width, n_parts, register_levels):
1155         """Create an ``AllTerms``.
1156 
1157         :param n_inputs: number of output terms to produce.
1158         :param output_width: bit-width of each term.
1159         :param n_parts: number of partition-op lanes.
1160         :param register_levels: List of nesting levels that should have
1161             pipeline registers.
1162         """
1163 self.register_levels = register_levels
1164 self.n_inputs = n_inputs
1165 self.n_parts = n_parts
1166 self.output_width = output_width
1167
1168 self.i = self.ispec()
1169 self.o = self.ospec()
1170
1171 def ispec(self):
1172 return InputData()
1173
1174 def ospec(self):
1175 return AddReduceData(self.i.part_pts, self.n_inputs,
1176 self.output_width, self.n_parts)
1177
1178 def elaborate(self, platform):
1179 m = Module()
1180
1181 eps = self.i.part_pts
1182
1183 # collect part-bytes
1184 pbs = Signal(8, reset_less=True)
1185 tl = []
1186 for i in range(8):
1187 pb = Signal(name="pb%d" % i, reset_less=True)
1188 m.d.comb += pb.eq(eps.part_byte(i))
1189 tl.append(pb)
1190 m.d.comb += pbs.eq(Cat(*tl))
1191
1192 # local variables
1193 signs = []
1194 for i in range(8):
1195 s = Signs()
1196 signs.append(s)
1197 setattr(m.submodules, "signs%d" % i, s)
1198 m.d.comb += s.part_ops.eq(self.i.part_ops[i])
1199
1200 n_levels = len(self.register_levels)+1
1201 m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
1202 m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
1203 m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
1204 m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
1205 nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
1206 for mod in [part_8, part_16, part_32, part_64]:
1207 m.d.comb += mod.a.eq(self.i.a)
1208 m.d.comb += mod.b.eq(self.i.b)
1209 for i in range(len(signs)):
1210 m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
1211 m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
1212 m.d.comb += mod.pbs.eq(pbs)
1213 nat_l.append(mod.not_a_term)
1214 nbt_l.append(mod.not_b_term)
1215 nla_l.append(mod.neg_lsb_a_term)
1216 nlb_l.append(mod.neg_lsb_b_term)
1217
1218 terms = []
1219
1220 for a_index in range(8):
1221 t = ProductTerms(8, 128, 8, a_index, 8)
1222 setattr(m.submodules, "terms_%d" % a_index, t)
1223
1224 m.d.comb += t.a.eq(self.i.a)
1225 m.d.comb += t.b.eq(self.i.b)
1226 m.d.comb += t.pb_en.eq(pbs)
1227
1228 for term in t.terms:
1229 terms.append(term)
1230
1231 # it's fine to bitwise-or data together since they are never enabled
1232 # at the same time
1233 m.submodules.nat_or = nat_or = OrMod(128)
1234 m.submodules.nbt_or = nbt_or = OrMod(128)
1235 m.submodules.nla_or = nla_or = OrMod(128)
1236 m.submodules.nlb_or = nlb_or = OrMod(128)
1237 for l, mod in [(nat_l, nat_or),
1238 (nbt_l, nbt_or),
1239 (nla_l, nla_or),
1240 (nlb_l, nlb_or)]:
1241 for i in range(len(l)):
1242 m.d.comb += mod.orin[i].eq(l[i])
1243 terms.append(mod.orout)
1244
1245 # copy the intermediate terms to the output
1246 for i, value in enumerate(terms):
1247 m.d.comb += self.o.terms[i].eq(value)
1248
1249 # copy reg part points and part ops to output
1250 m.d.comb += self.o.part_pts.eq(eps)
1251 m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
1252 for i in range(len(self.i.part_ops))]
1253
1254 return m
1255
1256
1257 class Intermediates(Elaboratable):
1258 """ Intermediate output modules
1259 """
1260
1261 def __init__(self, output_width, n_parts, part_pts):
1262 self.part_pts = part_pts
1263 self.output_width = output_width
1264 self.n_parts = n_parts
1265
1266 self.i = self.ispec()
1267 self.o = self.ospec()
1268
1269 def ispec(self):
1270 return FinalReduceData(self.part_pts, self.output_width, self.n_parts)
1271
1272 def ospec(self):
1273 return IntermediateData(self.part_pts, self.output_width, self.n_parts)
1274
1275 def elaborate(self, platform):
1276 m = Module()
1277
1278 out_part_ops = self.i.part_ops
1279 out_part_pts = self.i.part_pts
1280
1281 # create _output_64
1282 m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
1283 m.d.comb += io64.intermed.eq(self.i.output)
1284 for i in range(8):
1285 m.d.comb += io64.part_ops[i].eq(out_part_ops[i])
1286 m.d.comb += self.o.outputs[3].eq(io64.output)
1287
1288 # create _output_32
1289 m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
1290 m.d.comb += io32.intermed.eq(self.i.output)
1291 for i in range(8):
1292 m.d.comb += io32.part_ops[i].eq(out_part_ops[i])
1293 m.d.comb += self.o.outputs[2].eq(io32.output)
1294
1295 # create _output_16
1296 m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
1297 m.d.comb += io16.intermed.eq(self.i.output)
1298 for i in range(8):
1299 m.d.comb += io16.part_ops[i].eq(out_part_ops[i])
1300 m.d.comb += self.o.outputs[1].eq(io16.output)
1301
1302 # create _output_8
1303 m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
1304 m.d.comb += io8.intermed.eq(self.i.output)
1305 for i in range(8):
1306 m.d.comb += io8.part_ops[i].eq(out_part_ops[i])
1307 m.d.comb += self.o.outputs[0].eq(io8.output)
1308
1309 for i in range(8):
1310 m.d.comb += self.o.part_ops[i].eq(out_part_ops[i])
1311 m.d.comb += self.o.part_pts.eq(out_part_pts)
1312 m.d.comb += self.o.intermediate_output.eq(self.i.output)
1313
1314 return m
1315
1316
1317 class Mul8_16_32_64(Elaboratable):
1318 """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
1319
1320 Supports partitioning into any combination of 8, 16, 32, and 64-bit
1321 partitions on naturally-aligned boundaries. Supports the operation being
1322 set for each partition independently.
1323
1324 :attribute part_pts: the input partition points. Has a partition point at
1325 multiples of 8 in 0 < i < 64. Each partition point's associated
1326 ``Value`` is a ``Signal``. Modification not supported, except for by
1327 ``Signal.eq``.
1328 :attribute part_ops: the operation for each byte. The operation for a
1329 particular partition is selected by assigning the selected operation
1330 code to each byte in the partition. The allowed operation codes are:
1331
1332 :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
1333 RISC-V's `mul` instruction.
1334 :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
1335 ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
1336 instruction.
1337 :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
1338 where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
1339 `mulhsu` instruction.
1340 :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
1341 ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
1342 instruction.
1343 """
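    # Illustrative usage sketch (assumes a simulation/test harness; the
    # exact pysim import path and API differ between nmigen versions):
    #
    #   from nmigen.back.pysim import Simulator, Delay
    #
    #   m = Mul8_16_32_64()
    #   sim = Simulator(m)
    #
    #   def process():
    #       # two independent 32-bit lanes: enable only the split at bit 32
    #       for i in range(8, 64, 8):
    #           yield m.part_pts[i].eq(1 if i == 32 else 0)
    #       for i in range(8):
    #           yield m.part_ops[i].eq(OP_MUL_LOW)
    #       yield m.a.eq(0x0000_0003_0000_0005)
    #       yield m.b.eq(0x0000_0004_0000_0007)
    #       yield Delay(1e-6)
    #       result = yield m.output  # expected 0x0000_000C_0000_0023
    #
    #   sim.add_process(process)
    #   sim.run()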
1344
1345 def __init__(self, register_levels=()):
1346 """ register_levels: specifies the points in the cascade at which
1347 flip-flops are to be inserted.
1348 """
1349
1350 # parameter(s)
1351 self.register_levels = list(register_levels)
1352
1353 self.i = self.ispec()
1354 self.o = self.ospec()
1355
1356 # inputs
1357 self.part_pts = self.i.part_pts
1358 self.part_ops = self.i.part_ops
1359 self.a = self.i.a
1360 self.b = self.i.b
1361
1362 # output
1363 self.intermediate_output = self.o.intermediate_output
1364 self.output = self.o.output
1365
1366 def ispec(self):
1367 return InputData()
1368
1369 def ospec(self):
1370 return OutputData()
1371
1372 def elaborate(self, platform):
1373 m = Module()
1374
1375 part_pts = self.part_pts
1376
1377 n_inputs = 64 + 4
1378 n_parts = 8
1379 t = AllTerms(n_inputs, 128, n_parts, self.register_levels)
1380 m.submodules.allterms = t
1381 m.d.comb += t.i.eq(self.i)
1382
1383 terms = t.o.terms
1384
1385 add_reduce = AddReduce(terms,
1386 128,
1387 self.register_levels,
1388 t.o.part_pts,
1389 t.o.part_ops)
1390
1391 m.submodules.add_reduce = add_reduce
1392
1393 interm = Intermediates(128, 8, part_pts)
1394 m.submodules.intermediates = interm
1395 m.d.comb += interm.i.eq(add_reduce.o)
1396
1397 # final output
1398 m.submodules.finalout = finalout = FinalOut(128, 8, part_pts)
1399 m.d.comb += finalout.i.eq(interm.o)
1400 m.d.comb += self.o.eq(finalout.o)
1401
1402 return m
1403
1404
1405 if __name__ == "__main__":
1406 m = Mul8_16_32_64()
1407 main(m, ports=[m.a,
1408 m.b,
1409 m.intermediate_output,
1410 m.output,
1411 *m.part_ops,
1412 *m.part_pts.values()])