src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11
  12
  13 class PartitionPoints(dict):
  14     """Partition points and corresponding ``Value``s.
  15
  16     The points at where an ALU is partitioned along with ``Value``s that
  17     specify if the corresponding partition points are enabled.
  18
  19     For example: ``{1: True, 5: True, 10: True}`` with
  20     ``width == 16`` specifies that the ALU is split into 4 sections:
  21     * bits 0 <= ``i`` < 1
  22     * bits 1 <= ``i`` < 5
  23     * bits 5 <= ``i`` < 10
  24     * bits 10 <= ``i`` < 16
  25
  26     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  27     where ``a`` is a 1-bit ``Signal``:
  28     * If ``a`` is asserted:
  29         * bits 0 <= ``i`` < 1
  30         * bits 1 <= ``i`` < 5
  31         * bits 5 <= ``i`` < 10
  32         * bits 10 <= ``i`` < 16
  33     * Otherwise
  34         * bits 0 <= ``i`` < 1
  35         * bits 1 <= ``i`` < 10
  36         * bits 10 <= ``i`` < 16
  37     """
  38
  39     def __init__(self, partition_points=None):
  40         """Create a new ``PartitionPoints``.
  41
  42         :param partition_points: the input partition points to values mapping.
  43         """
  44         super().__init__()
  45         if partition_points is not None:
  46             for point, enabled in partition_points.items():
  47                 if not isinstance(point, int):
  48                     raise TypeError("point must be a non-negative integer")
  49                 if point < 0:
  50                     raise ValueError("point must be a non-negative integer")
  51                 self[point] = Value.wrap(enabled)
  52
  53     def like(self, name=None, src_loc_at=0, mul=1):
  54         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  55
  56         :param name: the base name for the new ``Signal``s.
  57         :param mul: a multiplication factor on the indices
  58         """
  59         if name is None:
  60             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  61         retval = PartitionPoints()
  62         for point, enabled in self.items():
  63             point *= mul
  64             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  65         return retval
  66
  67     def eq(self, rhs):
  68         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  69         if set(self.keys()) != set(rhs.keys()):
  70             raise ValueError("incompatible point set")
  71         for point, enabled in self.items():
  72             yield enabled.eq(rhs[point])
  73
  74     def as_mask(self, width, mul=1):
  75         """Create a bit-mask from `self`.
  76
  77         Each bit in the returned mask is clear only if the partition point at
  78         the same bit-index is enabled.
  79
  80         :param width: the bit width of the resulting mask
  81         :param mul: a "multiplier" which in-place expands the partition points
  82                     typically set to "2" when used for multipliers
  83         """
  84         bits = []
  85         for i in range(width):
  86             i /= mul
  87             if i.is_integer() and int(i) in self:
  88                 bits.append(~self[i])
  89             else:
  90                 bits.append(True)
  91         return Cat(*bits)
  92
  93     def get_max_partition_count(self, width):
  94         """Get the maximum number of partitions.
  95
  96         Gets the number of partitions when all partition points are enabled.
  97         """
  98         retval = 1
  99         for point in self.keys():
 100             if point < width:
 101                 retval += 1
 102         return retval
 103
 104     def fits_in_width(self, width):
 105         """Check if all partition points are smaller than `width`."""
 106         for point in self.keys():
 107             if point >= width:
 108                 return False
 109         return True
 110
 111     def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
 112         if index == -1 or index == 7:
 113             return C(True, 1)
 114         assert index >= 0 and index < 8
 115         return self[(index * 8 + 8)*mfactor]
 116
 117
 118 class FullAdder(Elaboratable):
 119     """Full Adder.
 120
 121     :attribute in0: the first input
 122     :attribute in1: the second input
 123     :attribute in2: the third input
 124     :attribute sum: the sum output
 125     :attribute carry: the carry output
 126
 127     Rather than do individual full adders (and have an array of them,
 128     which would be very slow to simulate), this module can specify the
 129     bit width of the inputs and outputs: in effect it performs multiple
 130     Full 3-2 Add operations "in parallel".
 131     """
 132
 133     def __init__(self, width):
 134         """Create a ``FullAdder``.
 135
 136         :param width: the bit width of the input and output
 137         """
 138         self.in0 = Signal(width, reset_less=True)
 139         self.in1 = Signal(width, reset_less=True)
 140         self.in2 = Signal(width, reset_less=True)
 141         self.sum = Signal(width, reset_less=True)
 142         self.carry = Signal(width, reset_less=True)
 143
 144     def elaborate(self, platform):
 145         """Elaborate this module."""
 146         m = Module()
 147         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 148         m.d.comb += self.carry.eq((self.in0 & self.in1)
 149                                   | (self.in1 & self.in2)
 150                                   | (self.in2 & self.in0))
 151         return m
 152
 153
 154 class MaskedFullAdder(Elaboratable):
 155     """Masked Full Adder.
 156
 157     :attribute mask: the carry partition mask
 158     :attribute in0: the first input
 159     :attribute in1: the second input
 160     :attribute in2: the third input
 161     :attribute sum: the sum output
 162     :attribute mcarry: the masked carry output
 163
 164     FullAdders are always used with a "mask" on the output.  To keep
 165     the graphviz "clean", this class performs the masking here rather
 166     than inside a large for-loop.
 167
 168     See the following discussion as to why this is no longer derived
 169     from FullAdder.  Each carry is shifted here *before* being ANDed
 170     with the mask, so that an AOI cell may be used (which is more
 171     gate-efficient)
 172     https://en.wikipedia.org/wiki/AND-OR-Invert
 173     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
 174     """
 175
 176     def __init__(self, width):
 177         """Create a ``MaskedFullAdder``.
 178
 179         :param width: the bit width of the input and output
 180         """
 181         self.width = width
 182         self.mask = Signal(width, reset_less=True)
 183         self.mcarry = Signal(width, reset_less=True)
 184         self.in0 = Signal(width, reset_less=True)
 185         self.in1 = Signal(width, reset_less=True)
 186         self.in2 = Signal(width, reset_less=True)
 187         self.sum = Signal(width, reset_less=True)
 188
 189     def elaborate(self, platform):
 190         """Elaborate this module."""
 191         m = Module()
 192         s1 = Signal(self.width, reset_less=True)
 193         s2 = Signal(self.width, reset_less=True)
 194         s3 = Signal(self.width, reset_less=True)
 195         c1 = Signal(self.width, reset_less=True)
 196         c2 = Signal(self.width, reset_less=True)
 197         c3 = Signal(self.width, reset_less=True)
 198         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 199         m.d.comb += s1.eq(Cat(0, self.in0))
 200         m.d.comb += s2.eq(Cat(0, self.in1))
 201         m.d.comb += s3.eq(Cat(0, self.in2))
 202         m.d.comb += c1.eq(s1 & s2 & self.mask)
 203         m.d.comb += c2.eq(s2 & s3 & self.mask)
 204         m.d.comb += c3.eq(s3 & s1 & self.mask)
 205         m.d.comb += self.mcarry.eq(c1 | c2 | c3)
 206         return m
 207
 208
 209 class PartitionedAdder(Elaboratable):
 210     """Partitioned Adder.
 211
 212     Performs the final add.  The partition points are included in the
 213     actual add (in one of the operands only), which causes a carry over
 214     to the next bit.  Then the final output *removes* the extra bits from
 215     the result.
 216
 217     partition: .... P... P... P... P... (32 bits)
 218     a        : .... .... .... .... .... (32 bits)
 219     b        : .... .... .... .... .... (32 bits)
 220     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 221     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 222     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 223     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 224
 225     :attribute width: the bit width of the input and output. Read-only.
 226     :attribute a: the first input to the adder
 227     :attribute b: the second input to the adder
 228     :attribute output: the sum output
 229     :attribute partition_points: the input partition points. Modification not
 230         supported, except for by ``Signal.eq``.
 231     """
 232
 233     def __init__(self, width, partition_points, partition_step=1):
 234         """Create a ``PartitionedAdder``.
 235
 236         :param width: the bit width of the input and output
 237         :param partition_points: the input partition points
 238         :param partition_step: a multiplier (typically double) step
 239                                which in-place "expands" the partition points
 240         """
 241         self.width = width
 242         self.pmul = partition_step
 243         self.a = Signal(width, reset_less=True)
 244         self.b = Signal(width, reset_less=True)
 245         self.output = Signal(width, reset_less=True)
 246         self.partition_points = PartitionPoints(partition_points)
 247         if not self.partition_points.fits_in_width(width):
 248             raise ValueError("partition_points doesn't fit in width")
 249         expanded_width = 0
 250         for i in range(self.width):
 251             if i in self.partition_points:
 252                 expanded_width += 1
 253             expanded_width += 1
 254         self._expanded_width = expanded_width
 255
 256     def elaborate(self, platform):
 257         """Elaborate this module."""
 258         m = Module()
 259         expanded_a = Signal(self._expanded_width, reset_less=True)
 260         expanded_b = Signal(self._expanded_width, reset_less=True)
 261         expanded_o = Signal(self._expanded_width, reset_less=True)
 262
 263         expanded_index = 0
 264         # store bits in a list, use Cat later.  graphviz is much cleaner
 265         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 266
 267         # partition points are "breaks" (extra zeros or 1s) in what would
 268         # otherwise be a massive long add.  when the "break" points are 0,
 269         # whatever is in it (in the output) is discarded.  however when
 270         # there is a "1", it causes a roll-over carry to the *next* bit.
 271         # we still ignore the "break" bit in the [intermediate] output,
 272         # however by that time we've got the effect that we wanted: the
 273         # carry has been carried *over* the break point.
 274
 275         for i in range(self.width):
 276             pi = i/self.pmul # double the range of the partition point test
 277             if pi.is_integer() and pi in self.partition_points:
 278                 # add extra bit set to 0 + 0 for enabled partition points
 279                 # and 1 + 0 for disabled partition points
 280                 ea.append(expanded_a[expanded_index])
 281                 al.append(~self.partition_points[pi]) # add extra bit in a
 282                 eb.append(expanded_b[expanded_index])
 283                 bl.append(C(0)) # yes, add a zero
 284                 expanded_index += 1 # skip the extra point.  NOT in the output
 285             ea.append(expanded_a[expanded_index])
 286             eb.append(expanded_b[expanded_index])
 287             eo.append(expanded_o[expanded_index])
 288             al.append(self.a[i])
 289             bl.append(self.b[i])
 290             ol.append(self.output[i])
 291             expanded_index += 1
 292
 293         # combine above using Cat
 294         m.d.comb += Cat(*ea).eq(Cat(*al))
 295         m.d.comb += Cat(*eb).eq(Cat(*bl))
 296         m.d.comb += Cat(*ol).eq(Cat(*eo))
 297
 298         # use only one addition to take advantage of look-ahead carry and
 299         # special hardware on FPGAs
 300         m.d.comb += expanded_o.eq(expanded_a + expanded_b)
 301         return m
 302
 303
# Number of terms consumed by one full adder; the reduction code below
# groups its inputs in runs of this size.
FULL_ADDER_INPUT_COUNT = 3
 305
 306 class AddReduceData:
 307
 308     def __init__(self, part_pts, n_inputs, output_width, n_parts):
 309         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 310                           for i in range(n_parts)]
 311         self.terms = [Signal(output_width, name=f"inputs_{i}",
 312                               reset_less=True)
 313                         for i in range(n_inputs)]
 314         self.part_pts = part_pts.like()
 315
 316     def eq_from(self, part_pts, inputs, part_ops):
 317         return [self.part_pts.eq(part_pts)] + \
 318                [self.terms[i].eq(inputs[i])
 319                                      for i in range(len(self.terms))] + \
 320                [self.part_ops[i].eq(part_ops[i])
 321                                      for i in range(len(self.part_ops))]
 322
 323     def eq(self, rhs):
 324         return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
 325
 326
 327 class FinalReduceData:
 328
 329     def __init__(self, part_pts, output_width, n_parts):
 330         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 331                           for i in range(n_parts)]
 332         self.output = Signal(output_width, reset_less=True)
 333         self.part_pts = part_pts.like()
 334
 335     def eq_from(self, part_pts, output, part_ops):
 336         return [self.part_pts.eq(part_pts)] + \
 337                [self.output.eq(output)] + \
 338                [self.part_ops[i].eq(part_ops[i])
 339                                      for i in range(len(self.part_ops))]
 340
 341     def eq(self, rhs):
 342         return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
 343
 344
 345 class FinalAdd(Elaboratable):
 346     """ Final stage of add reduce
 347     """
 348
 349     def __init__(self, n_inputs, output_width, n_parts, partition_points):
 350         self.output_width = output_width
 351         self.n_inputs = n_inputs
 352         self.n_parts = n_parts
 353         self.partition_points = PartitionPoints(partition_points)
 354         if not self.partition_points.fits_in_width(output_width):
 355             raise ValueError("partition_points doesn't fit in output_width")
 356
 357         self.i = self.ispec()
 358         self.o = self.ospec()
 359
 360     def ispec(self):
 361         return AddReduceData(self.partition_points, self.n_inputs,
 362                              self.output_width, self.n_parts)
 363
 364     def ospec(self):
 365         return FinalReduceData(self.partition_points,
 366                                  self.output_width, self.n_parts)
 367
 368     def elaborate(self, platform):
 369         """Elaborate this module."""
 370         m = Module()
 371
 372         output_width = self.output_width
 373         output = Signal(output_width, reset_less=True)
 374         if self.n_inputs == 0:
 375             # use 0 as the default output value
 376             m.d.comb += output.eq(0)
 377         elif self.n_inputs == 1:
 378             # handle single input
 379             m.d.comb += output.eq(self.i.terms[0])
 380         else:
 381             # base case for adding 2 inputs
 382             assert self.n_inputs == 2
 383             adder = PartitionedAdder(output_width,
 384                                      self.i.part_pts, 2)
 385             m.submodules.final_adder = adder
 386             m.d.comb += adder.a.eq(self.i.terms[0])
 387             m.d.comb += adder.b.eq(self.i.terms[1])
 388             m.d.comb += output.eq(adder.output)
 389
 390         # create output
 391         m.d.comb += self.o.eq_from(self.i.part_pts, output,
 392                                    self.i.part_ops)
 393
 394         return m
 395
 396
 397 class AddReduceSingle(Elaboratable):
 398     """Add list of numbers together.
 399
 400     :attribute inputs: input ``Signal``s to be summed. Modification not
 401         supported, except for by ``Signal.eq``.
 402     :attribute register_levels: List of nesting levels that should have
 403         pipeline registers.
 404     :attribute output: output sum.
 405     :attribute partition_points: the input partition points. Modification not
 406         supported, except for by ``Signal.eq``.
 407     """
 408
 409     def __init__(self, n_inputs, output_width, n_parts, partition_points):
 410         """Create an ``AddReduce``.
 411
 412         :param inputs: input ``Signal``s to be summed.
 413         :param output_width: bit-width of ``output``.
 414         :param partition_points: the input partition points.
 415         """
 416         self.n_inputs = n_inputs
 417         self.n_parts = n_parts
 418         self.output_width = output_width
 419         self.partition_points = PartitionPoints(partition_points)
 420         if not self.partition_points.fits_in_width(output_width):
 421             raise ValueError("partition_points doesn't fit in output_width")
 422
 423         self.groups = AddReduceSingle.full_adder_groups(n_inputs)
 424         self.n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
 425
 426         self.i = self.ispec()
 427         self.o = self.ospec()
 428
 429     def ispec(self):
 430         return AddReduceData(self.partition_points, self.n_inputs,
 431                              self.output_width, self.n_parts)
 432
 433     def ospec(self):
 434         return AddReduceData(self.partition_points, self.n_terms,
 435                              self.output_width, self.n_parts)
 436
 437     @staticmethod
 438     def calc_n_inputs(n_inputs, groups):
 439         retval = len(groups)*2
 440         if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 441             retval += 1
 442         elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 443             retval += 2
 444         else:
 445             assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
 446         return retval
 447
 448     @staticmethod
 449     def get_max_level(input_count):
 450         """Get the maximum level.
 451
 452         All ``register_levels`` must be less than or equal to the maximum
 453         level.
 454         """
 455         retval = 0
 456         while True:
 457             groups = AddReduceSingle.full_adder_groups(input_count)
 458             if len(groups) == 0:
 459                 return retval
 460             input_count %= FULL_ADDER_INPUT_COUNT
 461             input_count += 2 * len(groups)
 462             retval += 1
 463
 464     @staticmethod
 465     def full_adder_groups(input_count):
 466         """Get ``inputs`` indices for which a full adder should be built."""
 467         return range(0,
 468                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 469                      FULL_ADDER_INPUT_COUNT)
 470
 471     def create_next_terms(self):
 472         """ create next intermediate terms, for linking up in elaborate, below
 473         """
 474         terms = []
 475         adders = []
 476
 477         # create full adders for this recursive level.
 478         # this shrinks N terms to 2 * (N // 3) plus the remainder
 479         for i in self.groups:
 480             adder_i = MaskedFullAdder(self.output_width)
 481             adders.append((i, adder_i))
 482             # add both the sum and the masked-carry to the next level.
 483             # 3 inputs have now been reduced to 2...
 484             terms.append(adder_i.sum)
 485             terms.append(adder_i.mcarry)
 486         # handle the remaining inputs.
 487         if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 488             terms.append(self.i.terms[-1])
 489         elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 490             # Just pass the terms to the next layer, since we wouldn't gain
 491             # anything by using a half adder since there would still be 2 terms
 492             # and just passing the terms to the next layer saves gates.
 493             terms.append(self.i.terms[-2])
 494             terms.append(self.i.terms[-1])
 495         else:
 496             assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
 497
 498         return terms, adders
 499
 500     def elaborate(self, platform):
 501         """Elaborate this module."""
 502         m = Module()
 503
 504         terms, adders = self.create_next_terms()
 505
 506         # copy the intermediate terms to the output
 507         for i, value in enumerate(terms):
 508             m.d.comb += self.o.terms[i].eq(value)
 509
 510         # copy reg part points and part ops to output
 511         m.d.comb += self.o.part_pts.eq(self.i.part_pts)
 512         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
 513                                      for i in range(len(self.i.part_ops))]
 514
 515         # set up the partition mask (for the adders)
 516         part_mask = Signal(self.output_width, reset_less=True)
 517
 518         # get partition points as a mask
 519         mask = self.i.part_pts.as_mask(self.output_width, mul=2)
 520         m.d.comb += part_mask.eq(mask)
 521
 522         # add and link the intermediate term modules
 523         for i, (iidx, adder_i) in enumerate(adders):
 524             setattr(m.submodules, f"adder_{i}", adder_i)
 525
 526             m.d.comb += adder_i.in0.eq(self.i.terms[iidx])
 527             m.d.comb += adder_i.in1.eq(self.i.terms[iidx + 1])
 528             m.d.comb += adder_i.in2.eq(self.i.terms[iidx + 2])
 529             m.d.comb += adder_i.mask.eq(part_mask)
 530
 531         return m
 532
 533
 534 class AddReduceInternal:
 535     """Recursively Add list of numbers together.
 536
 537     :attribute inputs: input ``Signal``s to be summed. Modification not
 538         supported, except for by ``Signal.eq``.
 539     :attribute register_levels: List of nesting levels that should have
 540         pipeline registers.
 541     :attribute output: output sum.
 542     :attribute partition_points: the input partition points. Modification not
 543         supported, except for by ``Signal.eq``.
 544     """
 545
 546     def __init__(self, inputs, output_width, partition_points,
 547                        part_ops):
 548         """Create an ``AddReduce``.
 549
 550         :param inputs: input ``Signal``s to be summed.
 551         :param output_width: bit-width of ``output``.
 552         :param partition_points: the input partition points.
 553         """
 554         self.inputs = inputs
 555         self.part_ops = part_ops
 556         self.output_width = output_width
 557         self.partition_points = partition_points
 558
 559         self.create_levels()
 560
 561     def create_levels(self):
 562         """creates reduction levels"""
 563
 564         mods = []
 565         partition_points = self.partition_points
 566         part_ops = self.part_ops
 567         n_parts = len(part_ops)
 568         inputs = self.inputs
 569         ilen = len(inputs)
 570         while True:
 571             groups = AddReduceSingle.full_adder_groups(len(inputs))
 572             if len(groups) == 0:
 573                 break
 574             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 575                                          partition_points)
 576             mods.append(next_level)
 577             partition_points = next_level.i.part_pts
 578             inputs = next_level.o.terms
 579             ilen = len(inputs)
 580             part_ops = next_level.i.part_ops
 581
 582         next_level = FinalAdd(ilen, self.output_width, n_parts,
 583                               partition_points)
 584         mods.append(next_level)
 585
 586         self.levels = mods
 587
 588
 589 class AddReduce(AddReduceInternal, Elaboratable):
 590     """Recursively Add list of numbers together.
 591
 592     :attribute inputs: input ``Signal``s to be summed. Modification not
 593         supported, except for by ``Signal.eq``.
 594     :attribute register_levels: List of nesting levels that should have
 595         pipeline registers.
 596     :attribute output: output sum.
 597     :attribute partition_points: the input partition points. Modification not
 598         supported, except for by ``Signal.eq``.
 599     """
 600
 601     def __init__(self, inputs, output_width, register_levels, partition_points,
 602                        part_ops):
 603         """Create an ``AddReduce``.
 604
 605         :param inputs: input ``Signal``s to be summed.
 606         :param output_width: bit-width of ``output``.
 607         :param register_levels: List of nesting levels that should have
 608             pipeline registers.
 609         :param partition_points: the input partition points.
 610         """
 611         AddReduceInternal.__init__(self, inputs, output_width,
 612                                    partition_points, part_ops)
 613         n_parts = len(part_ops)
 614         self.o = FinalReduceData(partition_points, output_width, n_parts)
 615         self.register_levels = register_levels
 616
 617     @staticmethod
 618     def get_max_level(input_count):
 619         return AddReduceSingle.get_max_level(input_count)
 620
 621     @staticmethod
 622     def next_register_levels(register_levels):
 623         """``Iterable`` of ``register_levels`` for next recursive level."""
 624         for level in register_levels:
 625             if level > 0:
 626                 yield level - 1
 627
 628     def elaborate(self, platform):
 629         """Elaborate this module."""
 630         m = Module()
 631
 632         for i, next_level in enumerate(self.levels):
 633             setattr(m.submodules, "next_level%d" % i, next_level)
 634
 635         partition_points = self.partition_points
 636         inputs = self.inputs
 637         part_ops = self.part_ops
 638         n_parts = len(part_ops)
 639         n_inputs = len(inputs)
 640         output_width = self.output_width
 641         i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
 642         m.d.comb += i.eq_from(partition_points, inputs, part_ops)
 643         for idx in range(len(self.levels)):
 644             mcur = self.levels[idx]
 645             if idx in self.register_levels:
 646                 m.d.sync += mcur.i.eq(i)
 647             else:
 648                 m.d.comb += mcur.i.eq(i)
 649             i = mcur.o # for next loop
 650
 651         # output comes from last module
 652         m.d.comb += self.o.eq(i)
 653
 654         return m
 655
 656
# Multiply operation codes.
# NOTE(review): presumably these are the values carried by the 2-bit
# ``part_ops`` signals, selecting low/high product half and operand
# signedness; the users of these constants are not in view, so confirm.
OP_MUL_LOW = 0
OP_MUL_SIGNED_HIGH = 1
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3
 661
 662
 663 def get_term(value, shift=0, enabled=None):
 664     if enabled is not None:
 665         value = Mux(enabled, value, 0)
 666     if shift > 0:
 667         value = Cat(Repl(C(0, 1), shift), value)
 668     else:
 669         assert shift == 0
 670     return value
 671
 672
class ProductTerm(Elaboratable):
    """ this class creates a single product term (a[..]*b[..]).
        it has a design flaw in that it is the *output* that is selected,
        whereas the multiplication(s) are combinatorially generated
        all the time.

        one ``width``-bit slice of ``a`` (slice number ``a_index``) is
        multiplied by one slice of ``b`` (slice number ``b_index``); the
        product is shifted up by ``8 * (a_index + b_index)`` bits and, when
        the two indices differ, gated by ``enabled``.
    """

    def __init__(self, width, twidth, pbwid, a_index, b_index):
        """Create a ``ProductTerm``.

        :param width: bit-width of the slice taken from each operand.
        :param twidth: bit-width of ``term``; ``a`` and ``b`` are
            ``twidth//2`` bits wide.
        :param pbwid: bit-width of ``pb_en``.
        :param a_index: which slice of ``a`` to use.
        :param b_index: which slice of ``b`` to use.
        """
        self.a_index = a_index
        self.b_index = b_index
        # hard-coded 8 bits per index step, independent of ``width``
        shift = 8 * (self.a_index + self.b_index)
        self.pwidth = width
        self.twidth = twidth
        self.width = width*2  # a width x width product needs 2*width bits
        self.shift = shift

        self.ti = Signal(self.width, reset_less=True)  # unshifted product
        self.term = Signal(twidth, reset_less=True)    # final output term
        self.a = Signal(twidth//2, reset_less=True)
        self.b = Signal(twidth//2, reset_less=True)
        self.pb_en = Signal(pbwid, reset_less=True)

        # collect the pb_en bits lying between the two slice indices
        self.tl = tl = []
        min_index = min(self.a_index, self.b_index)
        max_index = max(self.a_index, self.b_index)
        for i in range(min_index, max_index):
            tl.append(self.pb_en[i])
        name = "te_%d_%d" % (self.a_index, self.b_index)
        if len(tl) > 0:
            term_enabled = Signal(name=name, reset_less=True)
        else:
            # a_index == b_index: nothing to check, the term is never gated
            term_enabled = None
        self.enabled = term_enabled
        self.term.name = "term_%d_%d" % (a_index, b_index) # rename

    def elaborate(self, platform):
        """Elaborate this module."""

        m = Module()
        if self.enabled is not None:
            # enabled only when none of the collected pb_en bits is set
            m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))

        bsa = Signal(self.width, reset_less=True)
        bsb = Signal(self.width, reset_less=True)
        a_index, b_index = self.a_index, self.b_index
        pwidth = self.pwidth
        # pick the pwidth-bit slices out of the operands
        m.d.comb += bsa.eq(self.a.bit_select(a_index * pwidth, pwidth))
        m.d.comb += bsb.eq(self.b.bit_select(b_index * pwidth, pwidth))
        m.d.comb += self.ti.eq(bsa * bsb)
        # gate (if applicable) and shift the product into position
        m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
        # the string literal below is never executed: it holds an
        # unfinished alternative that gates the inputs instead of the output
        """
        #TODO: sort out width issues, get inputs a/b switched on/off.
        #data going into Muxes is 1/2 the required width

        pwidth = self.pwidth
        width = self.width
        bsa = Signal(self.twidth//2, reset_less=True)
        bsb = Signal(self.twidth//2, reset_less=True)
        asel = Signal(width, reset_less=True)
        bsel = Signal(width, reset_less=True)
        a_index, b_index = self.a_index, self.b_index
        m.d.comb += asel.eq(self.a.bit_select(a_index * pwidth, pwidth))
        m.d.comb += bsel.eq(self.b.bit_select(b_index * pwidth, pwidth))
        m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
        m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
        m.d.comb += self.ti.eq(bsa * bsb)
        m.d.comb += self.term.eq(self.ti)
        """

        return m
 742
 743
 744 class ProductTerms(Elaboratable):
 745     """ creates a bank of product terms.  also performs the actual bit-selection
 746         this class is to be wrapped with a for-loop on the "a" operand.
 747         it creates a second-level for-loop on the "b" operand.
 748     """
 749     def __init__(self, width, twidth, pbwid, a_index, blen):
 750         self.a_index = a_index
 751         self.blen = blen
 752         self.pwidth = width
 753         self.twidth = twidth
 754         self.pbwid = pbwid
 755         self.a = Signal(twidth//2, reset_less=True)
 756         self.b = Signal(twidth//2, reset_less=True)
 757         self.pb_en = Signal(pbwid, reset_less=True)
 758         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 759                             for i in range(blen)]
 760
 761     def elaborate(self, platform):
 762
 763         m = Module()
 764
 765         for b_index in range(self.blen):
 766             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 767                             self.a_index, b_index)
 768             setattr(m.submodules, "term_%d" % b_index, t)
 769
 770             m.d.comb += t.a.eq(self.a)
 771             m.d.comb += t.b.eq(self.b)
 772             m.d.comb += t.pb_en.eq(self.pb_en)
 773
 774             m.d.comb += self.terms[b_index].eq(t.term)
 775
 776         return m
 777
 778
 779 class LSBNegTerm(Elaboratable):
 780
 781     def __init__(self, bit_width):
 782         self.bit_width = bit_width
 783         self.part = Signal(reset_less=True)
 784         self.signed = Signal(reset_less=True)
 785         self.op = Signal(bit_width, reset_less=True)
 786         self.msb = Signal(reset_less=True)
 787         self.nt = Signal(bit_width*2, reset_less=True)
 788         self.nl = Signal(bit_width*2, reset_less=True)
 789
 790     def elaborate(self, platform):
 791         m = Module()
 792         comb = m.d.comb
 793         bit_wid = self.bit_width
 794         ext = Repl(0, bit_wid) # extend output to HI part
 795
 796         # determine sign of each incoming number *in this partition*
 797         enabled = Signal(reset_less=True)
 798         m.d.comb += enabled.eq(self.part & self.msb & self.signed)
 799
 800         # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 801         # negation operation is split into a bitwise not and a +1.
 802         # likewise for 16, 32, and 64-bit values.
 803
 804         # width-extended 1s complement if a is signed, otherwise zero
 805         comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
 806
 807         # add 1 if signed, otherwise add zero
 808         comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
 809
 810         return m
 811
 812
 813 class Parts(Elaboratable):
 814
 815     def __init__(self, pbwid, part_pts, n_parts):
 816         self.pbwid = pbwid
 817         # inputs
 818         self.part_pts = PartitionPoints.like(part_pts)
 819         # outputs
 820         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 821                       for i in range(n_parts)]
 822
 823     def elaborate(self, platform):
 824         m = Module()
 825
 826         part_pts, parts = self.part_pts, self.parts
 827         # collect part-bytes (double factor because the input is extended)
 828         pbs = Signal(self.pbwid, reset_less=True)
 829         tl = []
 830         for i in range(self.pbwid):
 831             pb = Signal(name="pb%d" % i, reset_less=True)
 832             m.d.comb += pb.eq(part_pts.part_byte(i))
 833             tl.append(pb)
 834         m.d.comb += pbs.eq(Cat(*tl))
 835
 836         # negated-temporary copy of partition bits
 837         npbs = Signal.like(pbs, reset_less=True)
 838         m.d.comb += npbs.eq(~pbs)
 839         byte_count = 8 // len(parts)
 840         for i in range(len(parts)):
 841             pbl = []
 842             pbl.append(npbs[i * byte_count - 1])
 843             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 844                 pbl.append(pbs[j])
 845             pbl.append(npbs[(i + 1) * byte_count - 1])
 846             value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
 847             m.d.comb += value.eq(Cat(*pbl))
 848             m.d.comb += parts[i].eq(~(value).bool())
 849
 850         return m
 851
 852
 853 class Part(Elaboratable):
 854     """ a key class which, depending on the partitioning, will determine
 855         what action to take when parts of the output are signed or unsigned.
 856
 857         this requires 2 pieces of data *per operand, per partition*:
 858         whether the MSB is HI/LO (per partition!), and whether a signed
 859         or unsigned operation has been *requested*.
 860
 861         once that is determined, signed is basically carried out
 862         by splitting 2's complement into 1's complement plus one.
 863         1's complement is just a bit-inversion.
 864
 865         the extra terms - as separate terms - are then thrown at the
 866         AddReduce alongside the multiplication part-results.
 867     """
 868     def __init__(self, part_pts, width, n_parts, n_levels, pbwid):
 869
 870         self.pbwid = pbwid
 871         self.part_pts = part_pts
 872
 873         # inputs
 874         self.a = Signal(64, reset_less=True)
 875         self.b = Signal(64, reset_less=True)
 876         self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
 877                             for i in range(8)]
 878         self.b_signed = [Signal(name=f"_b_signed_{i}", reset_less=True)
 879                             for i in range(8)]
 880         self.pbs = Signal(pbwid, reset_less=True)
 881
 882         # outputs
 883         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 884                             for i in range(n_parts)]
 885
 886         self.not_a_term = Signal(width, reset_less=True)
 887         self.neg_lsb_a_term = Signal(width, reset_less=True)
 888         self.not_b_term = Signal(width, reset_less=True)
 889         self.neg_lsb_b_term = Signal(width, reset_less=True)
 890
 891     def elaborate(self, platform):
 892         m = Module()
 893
 894         pbs, parts = self.pbs, self.parts
 895         part_pts = self.part_pts
 896         m.submodules.p = p = Parts(self.pbwid, part_pts, len(parts))
 897         m.d.comb += p.part_pts.eq(part_pts)
 898         parts = p.parts
 899
 900         byte_count = 8 // len(parts)
 901
 902         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
 903                 self.not_a_term, self.neg_lsb_a_term,
 904                 self.not_b_term, self.neg_lsb_b_term)
 905
 906         byte_width = 8 // len(parts) # byte width
 907         bit_wid = 8 * byte_width     # bit width
 908         nat, nbt, nla, nlb = [], [], [], []
 909         for i in range(len(parts)):
 910             # work out bit-inverted and +1 term for a.
 911             pa = LSBNegTerm(bit_wid)
 912             setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
 913             m.d.comb += pa.part.eq(parts[i])
 914             m.d.comb += pa.op.eq(self.a.bit_select(bit_wid * i, bit_wid))
 915             m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
 916             m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
 917             nat.append(pa.nt)
 918             nla.append(pa.nl)
 919
 920             # work out bit-inverted and +1 term for b
 921             pb = LSBNegTerm(bit_wid)
 922             setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
 923             m.d.comb += pb.part.eq(parts[i])
 924             m.d.comb += pb.op.eq(self.b.bit_select(bit_wid * i, bit_wid))
 925             m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
 926             m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
 927             nbt.append(pb.nt)
 928             nlb.append(pb.nl)
 929
 930         # concatenate together and return all 4 results.
 931         m.d.comb += [not_a_term.eq(Cat(*nat)),
 932                      not_b_term.eq(Cat(*nbt)),
 933                      neg_lsb_a_term.eq(Cat(*nla)),
 934                      neg_lsb_b_term.eq(Cat(*nlb)),
 935                     ]
 936
 937         return m
 938
 939
 940 class IntermediateOut(Elaboratable):
 941     """ selects the HI/LO part of the multiplication, for a given bit-width
 942         the output is also reconstructed in its SIMD (partition) lanes.
 943     """
 944     def __init__(self, width, out_wid, n_parts):
 945         self.width = width
 946         self.n_parts = n_parts
 947         self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 948                                      for i in range(8)]
 949         self.intermed = Signal(out_wid, reset_less=True)
 950         self.output = Signal(out_wid//2, reset_less=True)
 951
 952     def elaborate(self, platform):
 953         m = Module()
 954
 955         ol = []
 956         w = self.width
 957         sel = w // 8
 958         for i in range(self.n_parts):
 959             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 960             m.d.comb += op.eq(
 961                 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
 962                     self.intermed.bit_select(i * w*2, w),
 963                     self.intermed.bit_select(i * w*2 + w, w)))
 964             ol.append(op)
 965         m.d.comb += self.output.eq(Cat(*ol))
 966
 967         return m
 968
 969
 970 class FinalOut(Elaboratable):
 971     """ selects the final output based on the partitioning.
 972
 973         each byte is selectable independently, i.e. it is possible
 974         that some partitions requested 8-bit computation whilst others
 975         requested 16 or 32 bit.
 976     """
 977     def __init__(self, output_width, n_parts, part_pts):
 978         self.part_pts = part_pts
 979         self.output_width = output_width
 980         self.n_parts = n_parts
 981         self.out_wid = output_width//2
 982
 983         self.i = self.ispec()
 984         self.o = self.ospec()
 985
 986     def ispec(self):
 987         return IntermediateData(self.part_pts, self.output_width, self.n_parts)
 988
 989     def ospec(self):
 990         return OutputData()
 991
 992     def elaborate(self, platform):
 993         m = Module()
 994
 995         part_pts = self.part_pts
 996         m.submodules.p_8 = p_8 = Parts(8, part_pts, 8)
 997         m.submodules.p_16 = p_16 = Parts(8, part_pts, 4)
 998         m.submodules.p_32 = p_32 = Parts(8, part_pts, 2)
 999         m.submodules.p_64 = p_64 = Parts(8, part_pts, 1)
1000
1001         out_part_pts = self.i.part_pts
1002
1003         # temporaries
1004         d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
1005         d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
1006         d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
1007
1008         i8 = Signal(self.out_wid, reset_less=True)
1009         i16 = Signal(self.out_wid, reset_less=True)
1010         i32 = Signal(self.out_wid, reset_less=True)
1011         i64 = Signal(self.out_wid, reset_less=True)
1012
1013         m.d.comb += p_8.part_pts.eq(out_part_pts)
1014         m.d.comb += p_16.part_pts.eq(out_part_pts)
1015         m.d.comb += p_32.part_pts.eq(out_part_pts)
1016         m.d.comb += p_64.part_pts.eq(out_part_pts)
1017
1018         for i in range(len(p_8.parts)):
1019             m.d.comb += d8[i].eq(p_8.parts[i])
1020         for i in range(len(p_16.parts)):
1021             m.d.comb += d16[i].eq(p_16.parts[i])
1022         for i in range(len(p_32.parts)):
1023             m.d.comb += d32[i].eq(p_32.parts[i])
1024         m.d.comb += i8.eq(self.i.outputs[0])
1025         m.d.comb += i16.eq(self.i.outputs[1])
1026         m.d.comb += i32.eq(self.i.outputs[2])
1027         m.d.comb += i64.eq(self.i.outputs[3])
1028
1029         ol = []
1030         for i in range(8):
1031             # select one of the outputs: d8 selects i8, d16 selects i16
1032             # d32 selects i32, and the default is i64.
1033             # d8 and d16 are ORed together in the first Mux
1034             # then the 2nd selects either i8 or i16.
1035             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
1036             op = Signal(8, reset_less=True, name="op_%d" % i)
1037             m.d.comb += op.eq(
1038                 Mux(d8[i] | d16[i // 2],
1039                     Mux(d8[i], i8.bit_select(i * 8, 8),
1040                                i16.bit_select(i * 8, 8)),
1041                     Mux(d32[i // 4], i32.bit_select(i * 8, 8),
1042                                       i64.bit_select(i * 8, 8))))
1043             ol.append(op)
1044
1045         # create outputs
1046         m.d.comb += self.o.output.eq(Cat(*ol))
1047         m.d.comb += self.o.intermediate_output.eq(self.i.intermediate_output)
1048
1049         return m
1050
1051
class OrMod(Elaboratable):
    """ bitwise-ORs four ``wid``-bit inputs (``orin``) into ``orout``,
        as a two-level tree: two pairs first, then the pair results.
    """
    def __init__(self, wid):
        self.wid = wid
        # inputs
        self.orin = []
        for i in range(4):
            self.orin.append(Signal(wid, name="orin%d" % i,
                                    reset_less=True))
        # outputs
        self.orout = Signal(wid, reset_less=True)

    def elaborate(self, platform):
        m = Module()
        in0, in1, in2, in3 = self.orin

        # first level: OR the inputs pairwise
        or1 = Signal(self.wid, reset_less=True)
        or2 = Signal(self.wid, reset_less=True)
        # second level: combine the two partial results
        m.d.comb += [or1.eq(in0 | in1),
                     or2.eq(in2 | in3),
                     self.orout.eq(or1 | or2)]

        return m
1070
1071
class Signs(Elaboratable):
    """ works out whether a and b are to be treated as signed numbers,
        from the requested operation type (OP_MUL_*):

        * ``a_signed`` is set for every operation except
          ``OP_MUL_UNSIGNED_HIGH``.
        * ``b_signed`` is set for ``OP_MUL_LOW`` and
          ``OP_MUL_SIGNED_HIGH`` only.
    """

    def __init__(self):
        # inputs
        self.part_ops = Signal(2, reset_less=True)
        # outputs
        self.a_signed = Signal(reset_less=True)
        self.b_signed = Signal(reset_less=True)

    def elaborate(self, platform):
        m = Module()

        op = self.part_ops
        a_is_signed = op != OP_MUL_UNSIGNED_HIGH
        b_is_signed = (op == OP_MUL_LOW) | (op == OP_MUL_SIGNED_HIGH)
        m.d.comb += [self.a_signed.eq(a_is_signed),
                     self.b_signed.eq(b_is_signed)]

        return m
1093
1094
class IntermediateData:
    """ record of signals holding the per-width intermediate results.

        holds ``n_parts`` 2-bit ``part_ops``, a ``like`` copy of the
        partition points, four ``output_width``-bit ``outputs`` and the
        ``intermediate_output``.
    """

    def __init__(self, part_pts, output_width, n_parts):
        self.part_ops = []
        for i in range(n_parts):
            self.part_ops.append(Signal(2, name=f"part_ops_{i}",
                                        reset_less=True))
        self.part_pts = part_pts.like()
        self.outputs = []
        for i in range(4):
            self.outputs.append(Signal(output_width, name="io%d" % i,
                                       reset_less=True))
        # intermediates (needed for unit tests)
        self.intermediate_output = Signal(output_width)

    def eq_from(self, part_pts, outputs, intermediate_output,
                      part_ops):
        """ returns the list of assignments that copy the given values
            into this record: partition points, intermediate output,
            the four outputs, then every part_op.
        """
        stmts = [self.part_pts.eq(part_pts),
                 self.intermediate_output.eq(intermediate_output)]
        stmts.extend(self.outputs[i].eq(outputs[i]) for i in range(4))
        stmts.extend(dst.eq(part_ops[i])
                     for i, dst in enumerate(self.part_ops))
        return stmts

    def eq(self, rhs):
        """ returns the assignments copying every field of ``rhs``.
        """
        return self.eq_from(part_pts=rhs.part_pts,
                            outputs=rhs.outputs,
                            intermediate_output=rhs.intermediate_output,
                            part_ops=rhs.part_ops)
1118
1119
class InputData:
    """ record of the multiplier's input signals: 64-bit ``a`` and ``b``,
        one partition-point signal at every multiple of 8 in 0 < i < 64,
        and eight 2-bit ``part_ops``.
    """

    def __init__(self):
        self.a = Signal(64)
        self.b = Signal(64)
        self.part_pts = PartitionPoints()
        for i in range(8, 64, 8):
            self.part_pts[i] = Signal(name=f"part_pts_{i}")
        self.part_ops = []
        for i in range(8):
            self.part_ops.append(Signal(2, name=f"part_ops_{i}"))

    def eq_from(self, part_pts, a, b, part_ops):
        """ returns the list of assignments that copy the given values
            into this record: partition points, a, b, then every part_op.
        """
        stmts = [self.part_pts.eq(part_pts),
                 self.a.eq(a),
                 self.b.eq(b)]
        stmts.extend(dst.eq(part_ops[i])
                     for i, dst in enumerate(self.part_ops))
        return stmts

    def eq(self, rhs):
        """ returns the assignments copying every field of ``rhs``.
        """
        return self.eq_from(part_pts=rhs.part_pts, a=rhs.a, b=rhs.b,
                            part_ops=rhs.part_ops)
1138
1139
class OutputData:
    """ record of the multiplier's output signals: the 64-bit ``output``
        and the 128-bit ``intermediate_output``.
    """

    def __init__(self):
        self.intermediate_output = Signal(128) # needed for unit tests
        self.output = Signal(64)

    def eq(self, rhs):
        """ returns the assignments copying both fields of ``rhs``.
        """
        stmts = []
        stmts.append(self.intermediate_output.eq(rhs.intermediate_output))
        stmts.append(self.output.eq(rhs.output))
        return stmts
1149
1150
class AllTerms(Elaboratable):
    """Generates the full set of terms that are to be added together.

    The terms are the 8 x 8 product terms (one ``ProductTerms`` bank per
    "a" index) followed by four sign-correction terms.  Each correction
    term is the OR of the matching output of the 8, 16, 32 and 64-bit
    ``Part`` modules.
    """

    def __init__(self, n_inputs, output_width, n_parts, register_levels):
        """Create an ``AllTerms``.

        :param n_inputs: number of terms, passed to ``AddReduceData``
            for the output record.
        :param output_width: bit-width passed to ``AddReduceData``.
        :param n_parts: number of parts passed to ``AddReduceData``.
        :param register_levels: list of nesting levels that should have
            pipeline registers; only its length is used here.
        """
        self.register_levels = register_levels
        self.n_inputs = n_inputs
        self.n_parts = n_parts
        self.output_width = output_width

        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        return InputData()

    def ospec(self):
        return AddReduceData(self.i.part_pts, self.n_inputs,
                             self.output_width, self.n_parts)

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        eps = self.i.part_pts

        # collect part-bytes
        pbs = Signal(8, reset_less=True)
        tl = []
        for i in range(8):
            pb = Signal(name="pb%d" % i, reset_less=True)
            comb += pb.eq(eps.part_byte(i))
            tl.append(pb)
        comb += pbs.eq(Cat(*tl))

        # one sign decoder per byte, driven by that byte's operation
        signs = []
        for i in range(8):
            s = Signs()
            signs.append(s)
            setattr(m.submodules, "signs%d" % i, s)
            comb += s.part_ops.eq(self.i.part_ops[i])

        # sign-correction term generators, one per partition width
        n_levels = len(self.register_levels)+1
        m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
        m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
        m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
        m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
        nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
        for mod in (part_8, part_16, part_32, part_64):
            comb += [mod.a.eq(self.i.a),
                     mod.b.eq(self.i.b)]
            for i, s in enumerate(signs):
                comb += [mod.a_signed[i].eq(s.a_signed),
                         mod.b_signed[i].eq(s.b_signed)]
            comb += mod.pbs.eq(pbs)
            nat_l.append(mod.not_a_term)
            nbt_l.append(mod.not_b_term)
            nla_l.append(mod.neg_lsb_a_term)
            nlb_l.append(mod.neg_lsb_b_term)

        # the product terms: one bank of 8 per "a" index
        terms = []
        for a_index in range(8):
            t = ProductTerms(8, 128, 8, a_index, 8)
            setattr(m.submodules, "terms_%d" % a_index, t)

            comb += [t.a.eq(self.i.a),
                     t.b.eq(self.i.b),
                     t.pb_en.eq(pbs)]

            terms.extend(t.terms)

        # it's fine to bitwise-or data together since they are never enabled
        # at the same time
        m.submodules.nat_or = nat_or = OrMod(128)
        m.submodules.nbt_or = nbt_or = OrMod(128)
        m.submodules.nla_or = nla_or = OrMod(128)
        m.submodules.nlb_or = nlb_or = OrMod(128)
        for l, mod in ((nat_l, nat_or),
                       (nbt_l, nbt_or),
                       (nla_l, nla_or),
                       (nlb_l, nlb_or)):
            for i, src in enumerate(l):
                comb += mod.orin[i].eq(src)
            terms.append(mod.orout)

        # copy the intermediate terms to the output
        comb += [self.o.terms[i].eq(value)
                 for i, value in enumerate(terms)]

        # copy reg part points and part ops to output
        comb += self.o.part_pts.eq(eps)
        comb += [self.o.part_ops[i].eq(op)
                 for i, op in enumerate(self.i.part_ops)]

        return m
1256
1257
class Intermediates(Elaboratable):
    """ Intermediate output modules.

        runs one ``IntermediateOut`` per partition width (64, 32, 16 and
        8 bit) on the same 128-bit input, storing the results in
        ``o.outputs[3]``, ``[2]``, ``[1]`` and ``[0]`` respectively.
        part_ops, partition points and the raw input are passed through
        to the output record unchanged.
    """

    def __init__(self, output_width, n_parts, part_pts):
        self.part_pts = part_pts
        self.output_width = output_width
        self.n_parts = n_parts

        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        return FinalReduceData(self.part_pts, self.output_width, self.n_parts)

    def ospec(self):
        return IntermediateData(self.part_pts, self.output_width, self.n_parts)

    def elaborate(self, platform):
        m = Module()

        out_part_ops = self.i.part_ops
        out_part_pts = self.i.part_pts

        def connect(io, slot):
            # feed one IntermediateOut from the input record and route
            # its result to outputs[slot]
            stmts = [io.intermed.eq(self.i.output)]
            stmts.extend(io.part_ops[i].eq(out_part_ops[i])
                         for i in range(8))
            stmts.append(self.o.outputs[slot].eq(io.output))
            m.d.comb += stmts

        # create _output_64
        m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
        connect(io64, 3)

        # create _output_32
        m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
        connect(io32, 2)

        # create _output_16
        m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
        connect(io16, 1)

        # create _output_8
        m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
        connect(io8, 0)

        # pass-through of everything the next stage still needs
        m.d.comb += [self.o.part_ops[i].eq(out_part_ops[i])
                     for i in range(8)]
        m.d.comb += [self.o.part_pts.eq(out_part_pts),
                     self.o.intermediate_output.eq(self.i.output)]

        return m
1316
1317
class Mul8_16_32_64(Elaboratable):
    """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.

    Any combination of 8, 16, 32, and 64-bit partitions on
    naturally-aligned boundaries is supported, and the operation can be
    set for each partition independently.

    :attribute part_pts: the input partition points. Has a partition point at
        multiples of 8 in 0 < i < 64. Each partition point's associated
        ``Value`` is a ``Signal``. Modification not supported, except for by
        ``Signal.eq``.
    :attribute part_ops: the operation for each byte. The operation for a
        particular partition is selected by assigning the selected operation
        code to each byte in the partition. The allowed operation codes are:

        :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
            RISC-V's `mul` instruction.
        :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
            instruction.
        :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
            where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
            `mulhsu` instruction.
        :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
            instruction.
    """

    def __init__(self, register_levels=()):
        """ register_levels: specifies the points in the cascade at which
            flip-flops are to be inserted.
        """

        # parameter(s): keep a private list copy of the caller's sequence
        self.register_levels = list(register_levels)

        # input and output records
        self.i = self.ispec()
        self.o = self.ospec()

        # inputs (aliases into the input record)
        self.part_pts = self.i.part_pts
        self.part_ops = self.i.part_ops
        self.a = self.i.a
        self.b = self.i.b

        # outputs (aliases into the output record)
        self.intermediate_output = self.o.intermediate_output
        self.output = self.o.output

    def ispec(self):
        return InputData()

    def ospec(self):
        return OutputData()

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        part_pts = self.part_pts

        # stage 1: generate every term that has to be summed
        # (64 product terms plus 4 sign-correction terms)
        n_inputs = 64 + 4
        n_parts = 8
        t = AllTerms(n_inputs, 128, n_parts, self.register_levels)
        m.submodules.allterms = t
        comb += t.i.eq(self.i)

        # stage 2: sum the terms.  AddReduce is handed AllTerms' output
        # signals at construction, so no separate input hookup is made.
        add_reduce = AddReduce(t.o.terms,
                               128,
                               self.register_levels,
                               t.o.part_pts,
                               t.o.part_ops)
        m.submodules.add_reduce = add_reduce

        # stage 3: per-width HI/LO selection
        interm = Intermediates(128, 8, part_pts)
        m.submodules.intermediates = interm
        comb += interm.i.eq(add_reduce.o)

        # stage 4: final output
        m.submodules.finalout = finalout = FinalOut(128, 8, part_pts)
        comb += finalout.i.eq(interm.o)
        comb += self.o.eq(finalout.o)

        return m
1404
1405
if __name__ == "__main__":
    # run the nmigen command-line front-end on the multiplier, exposing
    # all of its operands, results, operation selectors and partition
    # point signals as ports
    m = Mul8_16_32_64()
    ports = [m.a, m.b, m.intermediate_output, m.output]
    ports.extend(m.part_ops)
    ports.extend(m.part_pts.values())
    main(m, ports=ports)