src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11
  12
  13 class PartitionPoints(dict):
  14     """Partition points and corresponding ``Value``s.
  15
  16     The points at where an ALU is partitioned along with ``Value``s that
  17     specify if the corresponding partition points are enabled.
  18
  19     For example: ``{1: True, 5: True, 10: True}`` with
  20     ``width == 16`` specifies that the ALU is split into 4 sections:
  21     * bits 0 <= ``i`` < 1
  22     * bits 1 <= ``i`` < 5
  23     * bits 5 <= ``i`` < 10
  24     * bits 10 <= ``i`` < 16
  25
  26     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  27     where ``a`` is a 1-bit ``Signal``:
  28     * If ``a`` is asserted:
  29         * bits 0 <= ``i`` < 1
  30         * bits 1 <= ``i`` < 5
  31         * bits 5 <= ``i`` < 10
  32         * bits 10 <= ``i`` < 16
  33     * Otherwise
  34         * bits 0 <= ``i`` < 1
  35         * bits 1 <= ``i`` < 10
  36         * bits 10 <= ``i`` < 16
  37     """
  38
  39     def __init__(self, partition_points=None):
  40         """Create a new ``PartitionPoints``.
  41
  42         :param partition_points: the input partition points to values mapping.
  43         """
  44         super().__init__()
  45         if partition_points is not None:
  46             for point, enabled in partition_points.items():
  47                 if not isinstance(point, int):
  48                     raise TypeError("point must be a non-negative integer")
  49                 if point < 0:
  50                     raise ValueError("point must be a non-negative integer")
  51                 self[point] = Value.wrap(enabled)
  52
  53     def like(self, name=None, src_loc_at=0, mul=1):
  54         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  55
  56         :param name: the base name for the new ``Signal``s.
  57         :param mul: a multiplication factor on the indices
  58         """
  59         if name is None:
  60             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  61         retval = PartitionPoints()
  62         for point, enabled in self.items():
  63             point *= mul
  64             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  65         return retval
  66
  67     def eq(self, rhs):
  68         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  69         if set(self.keys()) != set(rhs.keys()):
  70             raise ValueError("incompatible point set")
  71         for point, enabled in self.items():
  72             yield enabled.eq(rhs[point])
  73
  74     def as_mask(self, width, mul=1):
  75         """Create a bit-mask from `self`.
  76
  77         Each bit in the returned mask is clear only if the partition point at
  78         the same bit-index is enabled.
  79
  80         :param width: the bit width of the resulting mask
  81         :param mul: a "multiplier" which in-place expands the partition points
  82                     typically set to "2" when used for multipliers
  83         """
  84         bits = []
  85         for i in range(width):
  86             i /= mul
  87             if i.is_integer() and int(i) in self:
  88                 bits.append(~self[i])
  89             else:
  90                 bits.append(True)
  91         return Cat(*bits)
  92
  93     def get_max_partition_count(self, width):
  94         """Get the maximum number of partitions.
  95
  96         Gets the number of partitions when all partition points are enabled.
  97         """
  98         retval = 1
  99         for point in self.keys():
 100             if point < width:
 101                 retval += 1
 102         return retval
 103
 104     def fits_in_width(self, width):
 105         """Check if all partition points are smaller than `width`."""
 106         for point in self.keys():
 107             if point >= width:
 108                 return False
 109         return True
 110
 111     def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
 112         if index == -1 or index == 7:
 113             return C(True, 1)
 114         assert index >= 0 and index < 8
 115         return self[(index * 8 + 8)*mfactor]
 116
 117
 118 class FullAdder(Elaboratable):
 119     """Full Adder.
 120
 121     :attribute in0: the first input
 122     :attribute in1: the second input
 123     :attribute in2: the third input
 124     :attribute sum: the sum output
 125     :attribute carry: the carry output
 126
 127     Rather than do individual full adders (and have an array of them,
 128     which would be very slow to simulate), this module can specify the
 129     bit width of the inputs and outputs: in effect it performs multiple
 130     Full 3-2 Add operations "in parallel".
 131     """
 132
 133     def __init__(self, width):
 134         """Create a ``FullAdder``.
 135
 136         :param width: the bit width of the input and output
 137         """
 138         self.in0 = Signal(width, reset_less=True)
 139         self.in1 = Signal(width, reset_less=True)
 140         self.in2 = Signal(width, reset_less=True)
 141         self.sum = Signal(width, reset_less=True)
 142         self.carry = Signal(width, reset_less=True)
 143
 144     def elaborate(self, platform):
 145         """Elaborate this module."""
 146         m = Module()
 147         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 148         m.d.comb += self.carry.eq((self.in0 & self.in1)
 149                                   | (self.in1 & self.in2)
 150                                   | (self.in2 & self.in0))
 151         return m
 152
 153
 154 class MaskedFullAdder(Elaboratable):
 155     """Masked Full Adder.
 156
 157     :attribute mask: the carry partition mask
 158     :attribute in0: the first input
 159     :attribute in1: the second input
 160     :attribute in2: the third input
 161     :attribute sum: the sum output
 162     :attribute mcarry: the masked carry output
 163
 164     FullAdders are always used with a "mask" on the output.  To keep
 165     the graphviz "clean", this class performs the masking here rather
 166     than inside a large for-loop.
 167
 168     See the following discussion as to why this is no longer derived
 169     from FullAdder.  Each carry is shifted here *before* being ANDed
 170     with the mask, so that an AOI cell may be used (which is more
 171     gate-efficient)
 172     https://en.wikipedia.org/wiki/AND-OR-Invert
 173     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
 174     """
 175
 176     def __init__(self, width):
 177         """Create a ``MaskedFullAdder``.
 178
 179         :param width: the bit width of the input and output
 180         """
 181         self.width = width
 182         self.mask = Signal(width, reset_less=True)
 183         self.mcarry = Signal(width, reset_less=True)
 184         self.in0 = Signal(width, reset_less=True)
 185         self.in1 = Signal(width, reset_less=True)
 186         self.in2 = Signal(width, reset_less=True)
 187         self.sum = Signal(width, reset_less=True)
 188
 189     def elaborate(self, platform):
 190         """Elaborate this module."""
 191         m = Module()
 192         s1 = Signal(self.width, reset_less=True)
 193         s2 = Signal(self.width, reset_less=True)
 194         s3 = Signal(self.width, reset_less=True)
 195         c1 = Signal(self.width, reset_less=True)
 196         c2 = Signal(self.width, reset_less=True)
 197         c3 = Signal(self.width, reset_less=True)
 198         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 199         m.d.comb += s1.eq(Cat(0, self.in0))
 200         m.d.comb += s2.eq(Cat(0, self.in1))
 201         m.d.comb += s3.eq(Cat(0, self.in2))
 202         m.d.comb += c1.eq(s1 & s2 & self.mask)
 203         m.d.comb += c2.eq(s2 & s3 & self.mask)
 204         m.d.comb += c3.eq(s3 & s1 & self.mask)
 205         m.d.comb += self.mcarry.eq(c1 | c2 | c3)
 206         return m
 207
 208
 209 class PartitionedAdder(Elaboratable):
 210     """Partitioned Adder.
 211
 212     Performs the final add.  The partition points are included in the
 213     actual add (in one of the operands only), which causes a carry over
 214     to the next bit.  Then the final output *removes* the extra bits from
 215     the result.
 216
 217     partition: .... P... P... P... P... (32 bits)
 218     a        : .... .... .... .... .... (32 bits)
 219     b        : .... .... .... .... .... (32 bits)
 220     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 221     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 222     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 223     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 224
 225     :attribute width: the bit width of the input and output. Read-only.
 226     :attribute a: the first input to the adder
 227     :attribute b: the second input to the adder
 228     :attribute output: the sum output
 229     :attribute partition_points: the input partition points. Modification not
 230         supported, except for by ``Signal.eq``.
 231     """
 232
 233     def __init__(self, width, partition_points, partition_step=1):
 234         """Create a ``PartitionedAdder``.
 235
 236         :param width: the bit width of the input and output
 237         :param partition_points: the input partition points
 238         :param partition_step: a multiplier (typically double) step
 239                                which in-place "expands" the partition points
 240         """
 241         self.width = width
 242         self.pmul = partition_step
 243         self.a = Signal(width, reset_less=True)
 244         self.b = Signal(width, reset_less=True)
 245         self.output = Signal(width, reset_less=True)
 246         self.partition_points = PartitionPoints(partition_points)
 247         if not self.partition_points.fits_in_width(width):
 248             raise ValueError("partition_points doesn't fit in width")
 249         expanded_width = 0
 250         for i in range(self.width):
 251             if i in self.partition_points:
 252                 expanded_width += 1
 253             expanded_width += 1
 254         self._expanded_width = expanded_width
 255
 256     def elaborate(self, platform):
 257         """Elaborate this module."""
 258         m = Module()
 259         expanded_a = Signal(self._expanded_width, reset_less=True)
 260         expanded_b = Signal(self._expanded_width, reset_less=True)
 261         expanded_o = Signal(self._expanded_width, reset_less=True)
 262
 263         expanded_index = 0
 264         # store bits in a list, use Cat later.  graphviz is much cleaner
 265         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 266
 267         # partition points are "breaks" (extra zeros or 1s) in what would
 268         # otherwise be a massive long add.  when the "break" points are 0,
 269         # whatever is in it (in the output) is discarded.  however when
 270         # there is a "1", it causes a roll-over carry to the *next* bit.
 271         # we still ignore the "break" bit in the [intermediate] output,
 272         # however by that time we've got the effect that we wanted: the
 273         # carry has been carried *over* the break point.
 274
 275         for i in range(self.width):
 276             pi = i/self.pmul # double the range of the partition point test
 277             if pi.is_integer() and pi in self.partition_points:
 278                 # add extra bit set to 0 + 0 for enabled partition points
 279                 # and 1 + 0 for disabled partition points
 280                 ea.append(expanded_a[expanded_index])
 281                 al.append(~self.partition_points[pi]) # add extra bit in a
 282                 eb.append(expanded_b[expanded_index])
 283                 bl.append(C(0)) # yes, add a zero
 284                 expanded_index += 1 # skip the extra point.  NOT in the output
 285             ea.append(expanded_a[expanded_index])
 286             eb.append(expanded_b[expanded_index])
 287             eo.append(expanded_o[expanded_index])
 288             al.append(self.a[i])
 289             bl.append(self.b[i])
 290             ol.append(self.output[i])
 291             expanded_index += 1
 292
 293         # combine above using Cat
 294         m.d.comb += Cat(*ea).eq(Cat(*al))
 295         m.d.comb += Cat(*eb).eq(Cat(*bl))
 296         m.d.comb += Cat(*ol).eq(Cat(*eo))
 297
 298         # use only one addition to take advantage of look-ahead carry and
 299         # special hardware on FPGAs
 300         m.d.comb += expanded_o.eq(expanded_a + expanded_b)
 301         return m
 302
 303
# Number of terms consumed by each full adder in an add-reduce level: every
# group of this many terms is replaced by one sum term and one carry term.
FULL_ADDER_INPUT_COUNT = 3
 305
 306 class AddReduceData:
 307
 308     def __init__(self, part_pts, n_inputs, output_width, n_parts):
 309         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 310                           for i in range(n_parts)]
 311         self.terms = [Signal(output_width, name=f"inputs_{i}",
 312                               reset_less=True)
 313                         for i in range(n_inputs)]
 314         self.part_pts = part_pts.like()
 315
 316     def eq_from(self, part_pts, inputs, part_ops):
 317         return [self.part_pts.eq(part_pts)] + \
 318                [self.terms[i].eq(inputs[i])
 319                                      for i in range(len(self.terms))] + \
 320                [self.part_ops[i].eq(part_ops[i])
 321                                      for i in range(len(self.part_ops))]
 322
 323     def eq(self, rhs):
 324         return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
 325
 326
 327 class FinalReduceData:
 328
 329     def __init__(self, part_pts, output_width, n_parts):
 330         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 331                           for i in range(n_parts)]
 332         self.output = Signal(output_width, reset_less=True)
 333         self.part_pts = part_pts.like()
 334
 335     def eq_from(self, part_pts, output, part_ops):
 336         return [self.part_pts.eq(part_pts)] + \
 337                [self.output.eq(output)] + \
 338                [self.part_ops[i].eq(part_ops[i])
 339                                      for i in range(len(self.part_ops))]
 340
 341     def eq(self, rhs):
 342         return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
 343
 344
 345 class FinalAdd(Elaboratable):
 346     """ Final stage of add reduce
 347     """
 348
 349     def __init__(self, n_inputs, output_width, n_parts, partition_points,
 350                        partition_step=1):
 351         self.partition_step = partition_step
 352         self.output_width = output_width
 353         self.n_inputs = n_inputs
 354         self.n_parts = n_parts
 355         self.partition_points = PartitionPoints(partition_points)
 356         if not self.partition_points.fits_in_width(output_width):
 357             raise ValueError("partition_points doesn't fit in output_width")
 358
 359         self.i = self.ispec()
 360         self.o = self.ospec()
 361
 362     def ispec(self):
 363         return AddReduceData(self.partition_points, self.n_inputs,
 364                              self.output_width, self.n_parts)
 365
 366     def ospec(self):
 367         return FinalReduceData(self.partition_points,
 368                                  self.output_width, self.n_parts)
 369
 370     def elaborate(self, platform):
 371         """Elaborate this module."""
 372         m = Module()
 373
 374         output_width = self.output_width
 375         output = Signal(output_width, reset_less=True)
 376         if self.n_inputs == 0:
 377             # use 0 as the default output value
 378             m.d.comb += output.eq(0)
 379         elif self.n_inputs == 1:
 380             # handle single input
 381             m.d.comb += output.eq(self.i.terms[0])
 382         else:
 383             # base case for adding 2 inputs
 384             assert self.n_inputs == 2
 385             adder = PartitionedAdder(output_width,
 386                                      self.i.part_pts, self.partition_step)
 387             m.submodules.final_adder = adder
 388             m.d.comb += adder.a.eq(self.i.terms[0])
 389             m.d.comb += adder.b.eq(self.i.terms[1])
 390             m.d.comb += output.eq(adder.output)
 391
 392         # create output
 393         m.d.comb += self.o.eq_from(self.i.part_pts, output,
 394                                    self.i.part_ops)
 395
 396         return m
 397
 398
 399 class AddReduceSingle(Elaboratable):
 400     """Add list of numbers together.
 401
 402     :attribute inputs: input ``Signal``s to be summed. Modification not
 403         supported, except for by ``Signal.eq``.
 404     :attribute register_levels: List of nesting levels that should have
 405         pipeline registers.
 406     :attribute output: output sum.
 407     :attribute partition_points: the input partition points. Modification not
 408         supported, except for by ``Signal.eq``.
 409     """
 410
 411     def __init__(self, n_inputs, output_width, n_parts, partition_points,
 412                        partition_step=1):
 413         """Create an ``AddReduce``.
 414
 415         :param inputs: input ``Signal``s to be summed.
 416         :param output_width: bit-width of ``output``.
 417         :param partition_points: the input partition points.
 418         """
 419         self.partition_step = partition_step
 420         self.n_inputs = n_inputs
 421         self.n_parts = n_parts
 422         self.output_width = output_width
 423         self.partition_points = PartitionPoints(partition_points)
 424         if not self.partition_points.fits_in_width(output_width):
 425             raise ValueError("partition_points doesn't fit in output_width")
 426
 427         self.groups = AddReduceSingle.full_adder_groups(n_inputs)
 428         self.n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
 429
 430         self.i = self.ispec()
 431         self.o = self.ospec()
 432
 433     def ispec(self):
 434         return AddReduceData(self.partition_points, self.n_inputs,
 435                              self.output_width, self.n_parts)
 436
 437     def ospec(self):
 438         return AddReduceData(self.partition_points, self.n_terms,
 439                              self.output_width, self.n_parts)
 440
 441     @staticmethod
 442     def calc_n_inputs(n_inputs, groups):
 443         retval = len(groups)*2
 444         if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 445             retval += 1
 446         elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 447             retval += 2
 448         else:
 449             assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
 450         return retval
 451
 452     @staticmethod
 453     def get_max_level(input_count):
 454         """Get the maximum level.
 455
 456         All ``register_levels`` must be less than or equal to the maximum
 457         level.
 458         """
 459         retval = 0
 460         while True:
 461             groups = AddReduceSingle.full_adder_groups(input_count)
 462             if len(groups) == 0:
 463                 return retval
 464             input_count %= FULL_ADDER_INPUT_COUNT
 465             input_count += 2 * len(groups)
 466             retval += 1
 467
 468     @staticmethod
 469     def full_adder_groups(input_count):
 470         """Get ``inputs`` indices for which a full adder should be built."""
 471         return range(0,
 472                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 473                      FULL_ADDER_INPUT_COUNT)
 474
 475     def create_next_terms(self):
 476         """ create next intermediate terms, for linking up in elaborate, below
 477         """
 478         terms = []
 479         adders = []
 480
 481         # create full adders for this recursive level.
 482         # this shrinks N terms to 2 * (N // 3) plus the remainder
 483         for i in self.groups:
 484             adder_i = MaskedFullAdder(self.output_width)
 485             adders.append((i, adder_i))
 486             # add both the sum and the masked-carry to the next level.
 487             # 3 inputs have now been reduced to 2...
 488             terms.append(adder_i.sum)
 489             terms.append(adder_i.mcarry)
 490         # handle the remaining inputs.
 491         if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 492             terms.append(self.i.terms[-1])
 493         elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 494             # Just pass the terms to the next layer, since we wouldn't gain
 495             # anything by using a half adder since there would still be 2 terms
 496             # and just passing the terms to the next layer saves gates.
 497             terms.append(self.i.terms[-2])
 498             terms.append(self.i.terms[-1])
 499         else:
 500             assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
 501
 502         return terms, adders
 503
 504     def elaborate(self, platform):
 505         """Elaborate this module."""
 506         m = Module()
 507
 508         terms, adders = self.create_next_terms()
 509
 510         # copy the intermediate terms to the output
 511         for i, value in enumerate(terms):
 512             m.d.comb += self.o.terms[i].eq(value)
 513
 514         # copy reg part points and part ops to output
 515         m.d.comb += self.o.part_pts.eq(self.i.part_pts)
 516         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
 517                                      for i in range(len(self.i.part_ops))]
 518
 519         # set up the partition mask (for the adders)
 520         part_mask = Signal(self.output_width, reset_less=True)
 521
 522         # get partition points as a mask
 523         mask = self.i.part_pts.as_mask(self.output_width,
 524                                        mul=self.partition_step)
 525         m.d.comb += part_mask.eq(mask)
 526
 527         # add and link the intermediate term modules
 528         for i, (iidx, adder_i) in enumerate(adders):
 529             setattr(m.submodules, f"adder_{i}", adder_i)
 530
 531             m.d.comb += adder_i.in0.eq(self.i.terms[iidx])
 532             m.d.comb += adder_i.in1.eq(self.i.terms[iidx + 1])
 533             m.d.comb += adder_i.in2.eq(self.i.terms[iidx + 2])
 534             m.d.comb += adder_i.mask.eq(part_mask)
 535
 536         return m
 537
 538
 539 class AddReduceInternal:
 540     """Recursively Add list of numbers together.
 541
 542     :attribute inputs: input ``Signal``s to be summed. Modification not
 543         supported, except for by ``Signal.eq``.
 544     :attribute register_levels: List of nesting levels that should have
 545         pipeline registers.
 546     :attribute output: output sum.
 547     :attribute partition_points: the input partition points. Modification not
 548         supported, except for by ``Signal.eq``.
 549     """
 550
 551     def __init__(self, i, output_width, partition_step=1):
 552         """Create an ``AddReduce``.
 553
 554         :param inputs: input ``Signal``s to be summed.
 555         :param output_width: bit-width of ``output``.
 556         :param partition_points: the input partition points.
 557         """
 558         self.i = i
 559         self.inputs = i.terms
 560         self.part_ops = i.part_ops
 561         self.output_width = output_width
 562         self.partition_points = i.part_pts
 563         self.partition_step = partition_step
 564
 565         self.create_levels()
 566
 567     def create_levels(self):
 568         """creates reduction levels"""
 569
 570         mods = []
 571         partition_points = self.partition_points
 572         part_ops = self.part_ops
 573         n_parts = len(part_ops)
 574         inputs = self.inputs
 575         ilen = len(inputs)
 576         while True:
 577             groups = AddReduceSingle.full_adder_groups(len(inputs))
 578             if len(groups) == 0:
 579                 break
 580             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 581                                          partition_points,
 582                                          self.partition_step)
 583             mods.append(next_level)
 584             partition_points = next_level.i.part_pts
 585             inputs = next_level.o.terms
 586             ilen = len(inputs)
 587             part_ops = next_level.i.part_ops
 588
 589         next_level = FinalAdd(ilen, self.output_width, n_parts,
 590                               partition_points, self.partition_step)
 591         mods.append(next_level)
 592
 593         self.levels = mods
 594
 595
 596 class AddReduce(AddReduceInternal, Elaboratable):
 597     """Recursively Add list of numbers together.
 598
 599     :attribute inputs: input ``Signal``s to be summed. Modification not
 600         supported, except for by ``Signal.eq``.
 601     :attribute register_levels: List of nesting levels that should have
 602         pipeline registers.
 603     :attribute output: output sum.
 604     :attribute partition_points: the input partition points. Modification not
 605         supported, except for by ``Signal.eq``.
 606     """
 607
 608     def __init__(self, inputs, output_width, register_levels, part_pts,
 609                        part_ops, partition_step=1):
 610         """Create an ``AddReduce``.
 611
 612         :param inputs: input ``Signal``s to be summed.
 613         :param output_width: bit-width of ``output``.
 614         :param register_levels: List of nesting levels that should have
 615             pipeline registers.
 616         :param partition_points: the input partition points.
 617         """
 618         self._inputs = inputs
 619         self._part_pts = part_pts
 620         self._part_ops = part_ops
 621         n_parts = len(part_ops)
 622         self.i = AddReduceData(part_pts, len(inputs),
 623                              output_width, n_parts)
 624         AddReduceInternal.__init__(self, self.i, output_width, partition_step)
 625         self.o = FinalReduceData(part_pts, output_width, n_parts)
 626         self.register_levels = register_levels
 627
 628     @staticmethod
 629     def get_max_level(input_count):
 630         return AddReduceSingle.get_max_level(input_count)
 631
 632     @staticmethod
 633     def next_register_levels(register_levels):
 634         """``Iterable`` of ``register_levels`` for next recursive level."""
 635         for level in register_levels:
 636             if level > 0:
 637                 yield level - 1
 638
 639     def elaborate(self, platform):
 640         """Elaborate this module."""
 641         m = Module()
 642
 643         m.d.comb += self.i.eq_from(self._part_pts, self._inputs, self._part_ops)
 644
 645         for i, next_level in enumerate(self.levels):
 646             setattr(m.submodules, "next_level%d" % i, next_level)
 647
 648         i = self.i
 649         for idx in range(len(self.levels)):
 650             mcur = self.levels[idx]
 651             if idx in self.register_levels:
 652                 m.d.sync += mcur.i.eq(i)
 653             else:
 654                 m.d.comb += mcur.i.eq(i)
 655             i = mcur.o # for next loop
 656
 657         # output comes from last module
 658         m.d.comb += self.o.eq(i)
 659
 660         return m
 661
 662
# Multiplier operation selectors; the four values fit in two bits.
# NOTE(review): presumably these are the codes carried by the 2-bit
# ``part_ops`` signals -- confirm against the code that consumes them.
OP_MUL_LOW = 0
OP_MUL_SIGNED_HIGH = 1
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3
 667
 668
 669 def get_term(value, shift=0, enabled=None):
 670     if enabled is not None:
 671         value = Mux(enabled, value, 0)
 672     if shift > 0:
 673         value = Cat(Repl(C(0, 1), shift), value)
 674     else:
 675         assert shift == 0
 676     return value
 677
 678
 679 class ProductTerm(Elaboratable):
 680     """ this class creates a single product term (a[..]*b[..]).
 681         it has a design flaw in that is the *output* that is selected,
 682         where the multiplication(s) are combinatorially generated
 683         all the time.
 684     """
 685
 686     def __init__(self, width, twidth, pbwid, a_index, b_index):
 687         self.a_index = a_index
 688         self.b_index = b_index
 689         shift = 8 * (self.a_index + self.b_index)
 690         self.pwidth = width
 691         self.twidth = twidth
 692         self.width = width*2
 693         self.shift = shift
 694
 695         self.ti = Signal(self.width, reset_less=True)
 696         self.term = Signal(twidth, reset_less=True)
 697         self.a = Signal(twidth//2, reset_less=True)
 698         self.b = Signal(twidth//2, reset_less=True)
 699         self.pb_en = Signal(pbwid, reset_less=True)
 700
 701         self.tl = tl = []
 702         min_index = min(self.a_index, self.b_index)
 703         max_index = max(self.a_index, self.b_index)
 704         for i in range(min_index, max_index):
 705             tl.append(self.pb_en[i])
 706         name = "te_%d_%d" % (self.a_index, self.b_index)
 707         if len(tl) > 0:
 708             term_enabled = Signal(name=name, reset_less=True)
 709         else:
 710             term_enabled = None
 711         self.enabled = term_enabled
 712         self.term.name = "term_%d_%d" % (a_index, b_index) # rename
 713
 714     def elaborate(self, platform):
 715
 716         m = Module()
 717         if self.enabled is not None:
 718             m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
 719
 720         bsa = Signal(self.width, reset_less=True)
 721         bsb = Signal(self.width, reset_less=True)
 722         a_index, b_index = self.a_index, self.b_index
 723         pwidth = self.pwidth
 724         m.d.comb += bsa.eq(self.a.bit_select(a_index * pwidth, pwidth))
 725         m.d.comb += bsb.eq(self.b.bit_select(b_index * pwidth, pwidth))
 726         m.d.comb += self.ti.eq(bsa * bsb)
 727         m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
 728         """
 729         #TODO: sort out width issues, get inputs a/b switched on/off.
 730         #data going into Muxes is 1/2 the required width
 731
 732         pwidth = self.pwidth
 733         width = self.width
 734         bsa = Signal(self.twidth//2, reset_less=True)
 735         bsb = Signal(self.twidth//2, reset_less=True)
 736         asel = Signal(width, reset_less=True)
 737         bsel = Signal(width, reset_less=True)
 738         a_index, b_index = self.a_index, self.b_index
 739         m.d.comb += asel.eq(self.a.bit_select(a_index * pwidth, pwidth))
 740         m.d.comb += bsel.eq(self.b.bit_select(b_index * pwidth, pwidth))
 741         m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
 742         m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
 743         m.d.comb += self.ti.eq(bsa * bsb)
 744         m.d.comb += self.term.eq(self.ti)
 745         """
 746
 747         return m
 748
 749
 750 class ProductTerms(Elaboratable):
 751     """ creates a bank of product terms.  also performs the actual bit-selection
 752         this class is to be wrapped with a for-loop on the "a" operand.
 753         it creates a second-level for-loop on the "b" operand.
 754     """
 755     def __init__(self, width, twidth, pbwid, a_index, blen):
 756         self.a_index = a_index
 757         self.blen = blen
 758         self.pwidth = width
 759         self.twidth = twidth
 760         self.pbwid = pbwid
 761         self.a = Signal(twidth//2, reset_less=True)
 762         self.b = Signal(twidth//2, reset_less=True)
 763         self.pb_en = Signal(pbwid, reset_less=True)
 764         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 765                             for i in range(blen)]
 766
 767     def elaborate(self, platform):
 768
 769         m = Module()
 770
 771         for b_index in range(self.blen):
 772             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 773                             self.a_index, b_index)
 774             setattr(m.submodules, "term_%d" % b_index, t)
 775
 776             m.d.comb += t.a.eq(self.a)
 777             m.d.comb += t.b.eq(self.b)
 778             m.d.comb += t.pb_en.eq(self.pb_en)
 779
 780             m.d.comb += self.terms[b_index].eq(t.term)
 781
 782         return m
 783
 784
 785 class LSBNegTerm(Elaboratable):
 786
 787     def __init__(self, bit_width):
 788         self.bit_width = bit_width
 789         self.part = Signal(reset_less=True)
 790         self.signed = Signal(reset_less=True)
 791         self.op = Signal(bit_width, reset_less=True)
 792         self.msb = Signal(reset_less=True)
 793         self.nt = Signal(bit_width*2, reset_less=True)
 794         self.nl = Signal(bit_width*2, reset_less=True)
 795
 796     def elaborate(self, platform):
 797         m = Module()
 798         comb = m.d.comb
 799         bit_wid = self.bit_width
 800         ext = Repl(0, bit_wid) # extend output to HI part
 801
 802         # determine sign of each incoming number *in this partition*
 803         enabled = Signal(reset_less=True)
 804         m.d.comb += enabled.eq(self.part & self.msb & self.signed)
 805
 806         # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 807         # negation operation is split into a bitwise not and a +1.
 808         # likewise for 16, 32, and 64-bit values.
 809
 810         # width-extended 1s complement if a is signed, otherwise zero
 811         comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
 812
 813         # add 1 if signed, otherwise add zero
 814         comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
 815
 816         return m
 817
 818
 819 class Parts(Elaboratable):
 820
 821     def __init__(self, pbwid, part_pts, n_parts):
 822         self.pbwid = pbwid
 823         # inputs
 824         self.part_pts = PartitionPoints.like(part_pts)
 825         # outputs
 826         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 827                       for i in range(n_parts)]
 828
 829     def elaborate(self, platform):
 830         m = Module()
 831
 832         part_pts, parts = self.part_pts, self.parts
 833         # collect part-bytes (double factor because the input is extended)
 834         pbs = Signal(self.pbwid, reset_less=True)
 835         tl = []
 836         for i in range(self.pbwid):
 837             pb = Signal(name="pb%d" % i, reset_less=True)
 838             m.d.comb += pb.eq(part_pts.part_byte(i))
 839             tl.append(pb)
 840         m.d.comb += pbs.eq(Cat(*tl))
 841
 842         # negated-temporary copy of partition bits
 843         npbs = Signal.like(pbs, reset_less=True)
 844         m.d.comb += npbs.eq(~pbs)
 845         byte_count = 8 // len(parts)
 846         for i in range(len(parts)):
 847             pbl = []
 848             pbl.append(npbs[i * byte_count - 1])
 849             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 850                 pbl.append(pbs[j])
 851             pbl.append(npbs[(i + 1) * byte_count - 1])
 852             value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
 853             m.d.comb += value.eq(Cat(*pbl))
 854             m.d.comb += parts[i].eq(~(value).bool())
 855
 856         return m
 857
 858
 859 class Part(Elaboratable):
 860     """ a key class which, depending on the partitioning, will determine
 861         what action to take when parts of the output are signed or unsigned.
 862
 863         this requires 2 pieces of data *per operand, per partition*:
 864         whether the MSB is HI/LO (per partition!), and whether a signed
 865         or unsigned operation has been *requested*.
 866
 867         once that is determined, signed is basically carried out
 868         by splitting 2's complement into 1's complement plus one.
 869         1's complement is just a bit-inversion.
 870
 871         the extra terms - as separate terms - are then thrown at the
 872         AddReduce alongside the multiplication part-results.
 873     """
 874     def __init__(self, part_pts, width, n_parts, n_levels, pbwid):
 875
 876         self.pbwid = pbwid
 877         self.part_pts = part_pts
 878
 879         # inputs
 880         self.a = Signal(64, reset_less=True)
 881         self.b = Signal(64, reset_less=True)
 882         self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
 883                             for i in range(8)]
 884         self.b_signed = [Signal(name=f"_b_signed_{i}", reset_less=True)
 885                             for i in range(8)]
 886         self.pbs = Signal(pbwid, reset_less=True)
 887
 888         # outputs
 889         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 890                             for i in range(n_parts)]
 891
 892         self.not_a_term = Signal(width, reset_less=True)
 893         self.neg_lsb_a_term = Signal(width, reset_less=True)
 894         self.not_b_term = Signal(width, reset_less=True)
 895         self.neg_lsb_b_term = Signal(width, reset_less=True)
 896
 897     def elaborate(self, platform):
 898         m = Module()
 899
 900         pbs, parts = self.pbs, self.parts
 901         part_pts = self.part_pts
 902         m.submodules.p = p = Parts(self.pbwid, part_pts, len(parts))
 903         m.d.comb += p.part_pts.eq(part_pts)
 904         parts = p.parts
 905
 906         byte_count = 8 // len(parts)
 907
 908         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
 909                 self.not_a_term, self.neg_lsb_a_term,
 910                 self.not_b_term, self.neg_lsb_b_term)
 911
 912         byte_width = 8 // len(parts) # byte width
 913         bit_wid = 8 * byte_width     # bit width
 914         nat, nbt, nla, nlb = [], [], [], []
 915         for i in range(len(parts)):
 916             # work out bit-inverted and +1 term for a.
 917             pa = LSBNegTerm(bit_wid)
 918             setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
 919             m.d.comb += pa.part.eq(parts[i])
 920             m.d.comb += pa.op.eq(self.a.bit_select(bit_wid * i, bit_wid))
 921             m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
 922             m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
 923             nat.append(pa.nt)
 924             nla.append(pa.nl)
 925
 926             # work out bit-inverted and +1 term for b
 927             pb = LSBNegTerm(bit_wid)
 928             setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
 929             m.d.comb += pb.part.eq(parts[i])
 930             m.d.comb += pb.op.eq(self.b.bit_select(bit_wid * i, bit_wid))
 931             m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
 932             m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
 933             nbt.append(pb.nt)
 934             nlb.append(pb.nl)
 935
 936         # concatenate together and return all 4 results.
 937         m.d.comb += [not_a_term.eq(Cat(*nat)),
 938                      not_b_term.eq(Cat(*nbt)),
 939                      neg_lsb_a_term.eq(Cat(*nla)),
 940                      neg_lsb_b_term.eq(Cat(*nlb)),
 941                     ]
 942
 943         return m
 944
 945
 946 class IntermediateOut(Elaboratable):
 947     """ selects the HI/LO part of the multiplication, for a given bit-width
 948         the output is also reconstructed in its SIMD (partition) lanes.
 949     """
 950     def __init__(self, width, out_wid, n_parts):
 951         self.width = width
 952         self.n_parts = n_parts
 953         self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 954                                      for i in range(8)]
 955         self.intermed = Signal(out_wid, reset_less=True)
 956         self.output = Signal(out_wid//2, reset_less=True)
 957
 958     def elaborate(self, platform):
 959         m = Module()
 960
 961         ol = []
 962         w = self.width
 963         sel = w // 8
 964         for i in range(self.n_parts):
 965             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 966             m.d.comb += op.eq(
 967                 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
 968                     self.intermed.bit_select(i * w*2, w),
 969                     self.intermed.bit_select(i * w*2 + w, w)))
 970             ol.append(op)
 971         m.d.comb += self.output.eq(Cat(*ol))
 972
 973         return m
 974
 975
 976 class FinalOut(Elaboratable):
 977     """ selects the final output based on the partitioning.
 978
 979         each byte is selectable independently, i.e. it is possible
 980         that some partitions requested 8-bit computation whilst others
 981         requested 16 or 32 bit.
 982     """
 983     def __init__(self, output_width, n_parts, part_pts):
 984         self.part_pts = part_pts
 985         self.output_width = output_width
 986         self.n_parts = n_parts
 987         self.out_wid = output_width//2
 988
 989         self.i = self.ispec()
 990         self.o = self.ospec()
 991
 992     def ispec(self):
 993         return IntermediateData(self.part_pts, self.output_width, self.n_parts)
 994
 995     def ospec(self):
 996         return OutputData()
 997
 998     def setup(self, m, i):
 999         m.submodules.finalout = self
1000         m.d.comb += self.i.eq(i)
1001
1002     def process(self, i):
1003         return self.o
1004
1005     def elaborate(self, platform):
1006         m = Module()
1007
1008         part_pts = self.part_pts
1009         m.submodules.p_8 = p_8 = Parts(8, part_pts, 8)
1010         m.submodules.p_16 = p_16 = Parts(8, part_pts, 4)
1011         m.submodules.p_32 = p_32 = Parts(8, part_pts, 2)
1012         m.submodules.p_64 = p_64 = Parts(8, part_pts, 1)
1013
1014         out_part_pts = self.i.part_pts
1015
1016         # temporaries
1017         d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
1018         d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
1019         d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
1020
1021         i8 = Signal(self.out_wid, reset_less=True)
1022         i16 = Signal(self.out_wid, reset_less=True)
1023         i32 = Signal(self.out_wid, reset_less=True)
1024         i64 = Signal(self.out_wid, reset_less=True)
1025
1026         m.d.comb += p_8.part_pts.eq(out_part_pts)
1027         m.d.comb += p_16.part_pts.eq(out_part_pts)
1028         m.d.comb += p_32.part_pts.eq(out_part_pts)
1029         m.d.comb += p_64.part_pts.eq(out_part_pts)
1030
1031         for i in range(len(p_8.parts)):
1032             m.d.comb += d8[i].eq(p_8.parts[i])
1033         for i in range(len(p_16.parts)):
1034             m.d.comb += d16[i].eq(p_16.parts[i])
1035         for i in range(len(p_32.parts)):
1036             m.d.comb += d32[i].eq(p_32.parts[i])
1037         m.d.comb += i8.eq(self.i.outputs[0])
1038         m.d.comb += i16.eq(self.i.outputs[1])
1039         m.d.comb += i32.eq(self.i.outputs[2])
1040         m.d.comb += i64.eq(self.i.outputs[3])
1041
1042         ol = []
1043         for i in range(8):
1044             # select one of the outputs: d8 selects i8, d16 selects i16
1045             # d32 selects i32, and the default is i64.
1046             # d8 and d16 are ORed together in the first Mux
1047             # then the 2nd selects either i8 or i16.
1048             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
1049             op = Signal(8, reset_less=True, name="op_%d" % i)
1050             m.d.comb += op.eq(
1051                 Mux(d8[i] | d16[i // 2],
1052                     Mux(d8[i], i8.bit_select(i * 8, 8),
1053                                i16.bit_select(i * 8, 8)),
1054                     Mux(d32[i // 4], i32.bit_select(i * 8, 8),
1055                                       i64.bit_select(i * 8, 8))))
1056             ol.append(op)
1057
1058         # create outputs
1059         m.d.comb += self.o.output.eq(Cat(*ol))
1060         m.d.comb += self.o.intermediate_output.eq(self.i.intermediate_output)
1061
1062         return m
1063
1064
class OrMod(Elaboratable):
    """ bitwise-ORs four ``wid``-bit inputs (``orin``) into ``orout``
        using a two-level tree: two pairs first, then the pair results.
    """
    def __init__(self, wid):
        self.wid = wid
        # inputs
        self.orin = []
        for idx in range(4):
            self.orin.append(
                Signal(wid, name="orin%d" % idx, reset_less=True))
        # output
        self.orout = Signal(wid, reset_less=True)

    def elaborate(self, platform):
        """Build the two-level OR tree."""
        m = Module()

        # first level: one temporary per input pair
        or1 = Signal(self.wid, reset_less=True)
        or2 = Signal(self.wid, reset_less=True)
        low_pair = self.orin[0] | self.orin[1]
        high_pair = self.orin[2] | self.orin[3]

        m.d.comb += [
            or1.eq(low_pair),
            or2.eq(high_pair),
            # second level: combine the two temporaries
            self.orout.eq(or1 | or2),
        ]

        return m
1083
1084
class Signs(Elaboratable):
    """ determines whether a or b are signed numbers
        based on the required operation type (OP_MUL_*)

        ``a_signed`` is set for every operation except
        ``OP_MUL_UNSIGNED_HIGH``; ``b_signed`` is set only for
        ``OP_MUL_LOW`` and ``OP_MUL_SIGNED_HIGH``.
    """

    def __init__(self):
        # input: 2-bit operation code
        self.part_ops = Signal(2, reset_less=True)
        # outputs
        self.a_signed = Signal(reset_less=True)
        self.b_signed = Signal(reset_less=True)

    def elaborate(self, platform):
        """Decode ``part_ops`` into the two signedness flags."""
        m = Module()

        op = self.part_ops
        a_is_signed = op != OP_MUL_UNSIGNED_HIGH
        is_low = op == OP_MUL_LOW
        is_signed_high = op == OP_MUL_SIGNED_HIGH

        m.d.comb += [
            self.a_signed.eq(a_is_signed),
            self.b_signed.eq(is_low | is_signed_high),
        ]

        return m
1106
1107
class IntermediateData:
    """Record of signals passed from ``Intermediates`` to ``FinalOut``.

    :param part_pts: partition points; a ``like()`` copy is stored.
    :param output_width: width of each entry of ``outputs`` and of
        ``intermediate_output``.
    :param n_parts: number of 2-bit ``part_ops`` entries.
    """

    def __init__(self, part_pts, output_width, n_parts):
        self.part_ops = []
        for idx in range(n_parts):
            self.part_ops.append(
                Signal(2, name=f"part_ops_{idx}", reset_less=True))
        self.part_pts = part_pts.like()
        self.outputs = []
        for idx in range(4):
            self.outputs.append(
                Signal(output_width, name="io%d" % idx, reset_less=True))
        # intermediates (needed for unit tests)
        self.intermediate_output = Signal(output_width)

    def eq_from(self, part_pts, outputs, intermediate_output,
                      part_ops):
        """Return the list of assignments loading this record.

        Exactly four ``outputs`` entries are used; ``part_ops`` is indexed
        once for every entry of ``self.part_ops``.
        """
        stmts = [self.part_pts.eq(part_pts)]
        stmts.append(self.intermediate_output.eq(intermediate_output))
        for idx in range(4):
            stmts.append(self.outputs[idx].eq(outputs[idx]))
        for idx, own_op in enumerate(self.part_ops):
            stmts.append(own_op.eq(part_ops[idx]))
        return stmts

    def eq(self, rhs):
        """Return the assignments copying every field of ``rhs`` here."""
        return self.eq_from(part_pts=rhs.part_pts,
                            outputs=rhs.outputs,
                            intermediate_output=rhs.intermediate_output,
                            part_ops=rhs.part_ops)
1131
1132
class InputData:
    """Record of the multiplier inputs.

    Holds two 64-bit operands, one partition-point signal at every
    multiple of 8 in ``8 <= i < 64`` and eight 2-bit operation codes.
    """

    def __init__(self):
        self.a = Signal(64)
        self.b = Signal(64)
        self.part_pts = PartitionPoints()
        for bit in range(8, 64, 8):
            self.part_pts[bit] = Signal(name=f"part_pts_{bit}")
        self.part_ops = []
        for idx in range(8):
            self.part_ops.append(Signal(2, name=f"part_ops_{idx}"))

    def eq_from(self, part_pts, a, b, part_ops):
        """Return the list of assignments loading this record.

        ``part_ops`` is indexed once for every entry of ``self.part_ops``.
        """
        stmts = [self.part_pts.eq(part_pts)]
        stmts.append(self.a.eq(a))
        stmts.append(self.b.eq(b))
        for idx, own_op in enumerate(self.part_ops):
            stmts.append(own_op.eq(part_ops[idx]))
        return stmts

    def eq(self, rhs):
        """Return the assignments copying every field of ``rhs`` here."""
        return self.eq_from(part_pts=rhs.part_pts, a=rhs.a, b=rhs.b,
                            part_ops=rhs.part_ops)
1151
1152
class OutputData:
    """Record of the multiplier outputs: the 64-bit result plus the
    128-bit intermediate value kept for unit tests.
    """

    def __init__(self):
        self.intermediate_output = Signal(128) # needed for unit tests
        self.output = Signal(64)

    def eq(self, rhs):
        """Return the assignments copying both fields of ``rhs`` here."""
        stmts = []
        for field in ("intermediate_output", "output"):
            stmts.append(getattr(self, field).eq(getattr(rhs, field)))
        return stmts
1162
1163
class AllTerms(Elaboratable):
    """Set of terms to be added together

    Produces the 8x8 byte-wise product terms of ``a`` and ``b`` followed
    by four OR-combined sign-correction terms, and copies them, together
    with the partition points and operation codes, to the output record.
    """

    def __init__(self, n_inputs, output_width, n_parts, register_levels):
        """Create an ``AllTerms``.

        :param n_inputs: forwarded to ``AddReduceData``; presumably the
            number of terms in the output record (``elaborate`` writes
            8*8 + 4 of them) -- confirm against ``AddReduceData``.
        :param output_width: forwarded to ``AddReduceData``.
        :param n_parts: forwarded to ``AddReduceData``.
        :param register_levels: List of nesting levels that should have
            pipeline registers.  Only its length is used here.
        """
        self.register_levels = register_levels
        self.n_inputs = n_inputs
        self.n_parts = n_parts
        self.output_width = output_width

        self.i = self.ispec()
        self.o = self.ospec()

    def setup(self, m, i):
        """Register this module in ``m`` and drive its input from ``i``."""
        m.submodules.allterms = self
        m.d.comb += self.i.eq(i)

    def process(self, i):
        """Return the output record; ``i`` is not used."""
        return self.o

    def ispec(self):
        """Return a fresh input record (``InputData``)."""
        return InputData()

    def ospec(self):
        """Return a fresh output record sharing the input partition points."""
        return AddReduceData(self.i.part_pts, self.n_inputs,
                             self.output_width, self.n_parts)

    def elaborate(self, platform):
        """Build every product term and sign-correction term."""
        m = Module()
        comb = m.d.comb

        eps = self.i.part_pts

        # collect part-bytes
        pbs = Signal(8, reset_less=True)
        collected = []
        for idx in range(8):
            pb = Signal(name="pb%d" % idx, reset_less=True)
            comb += pb.eq(eps.part_byte(idx))
            collected.append(pb)
        comb += pbs.eq(Cat(*collected))

        # one signedness decoder per byte
        signs = []
        for idx in range(8):
            s = Signs()
            signs.append(s)
            setattr(m.submodules, "signs%d" % idx, s)
            comb += s.part_ops.eq(self.i.part_ops[idx])

        # sign-correction generators, one per lane size
        n_levels = len(self.register_levels)+1
        nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
        for lane_bits, n_lanes in ((8, 8), (16, 4), (32, 2), (64, 1)):
            mod = Part(eps, 128, n_lanes, n_levels, 8)
            setattr(m.submodules, "part_%d" % lane_bits, mod)

            comb += mod.a.eq(self.i.a)
            comb += mod.b.eq(self.i.b)
            for idx, sign in enumerate(signs):
                comb += mod.a_signed[idx].eq(sign.a_signed)
                comb += mod.b_signed[idx].eq(sign.b_signed)
            comb += mod.pbs.eq(pbs)

            nat_l.append(mod.not_a_term)
            nbt_l.append(mod.not_b_term)
            nla_l.append(mod.neg_lsb_a_term)
            nlb_l.append(mod.neg_lsb_b_term)

        # product terms: one bank of 8 per byte of a
        terms = []
        for a_index in range(8):
            bank = ProductTerms(8, 128, 8, a_index, 8)
            setattr(m.submodules, "terms_%d" % a_index, bank)

            comb += bank.a.eq(self.i.a)
            comb += bank.b.eq(self.i.b)
            comb += bank.pb_en.eq(pbs)

            terms.extend(bank.terms)

        # it's fine to bitwise-or data together since they are never enabled
        # at the same time
        for name, sources in (("nat_or", nat_l),
                              ("nbt_or", nbt_l),
                              ("nla_or", nla_l),
                              ("nlb_or", nlb_l)):
            or_mod = OrMod(128)
            setattr(m.submodules, name, or_mod)
            for idx, source in enumerate(sources):
                comb += or_mod.orin[idx].eq(source)
            terms.append(or_mod.orout)

        # copy the intermediate terms to the output
        for idx, value in enumerate(terms):
            comb += self.o.terms[idx].eq(value)

        # copy reg part points and part ops to output
        comb += self.o.part_pts.eq(eps)
        for idx, op in enumerate(self.i.part_ops):
            comb += self.o.part_ops[idx].eq(op)

        return m
1276
1277
class Intermediates(Elaboratable):
    """ Intermediate output modules

        runs the reduced 128-bit result through one ``IntermediateOut``
        per lane size (64, 32, 16 and 8 bit) and stores the results in
        ``o.outputs[3]`` down to ``o.outputs[0]`` respectively.
    """

    def __init__(self, output_width, n_parts, part_pts):
        self.part_pts = part_pts
        self.output_width = output_width
        self.n_parts = n_parts

        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        """Return a fresh input record (``FinalReduceData``)."""
        return FinalReduceData(self.part_pts, self.output_width, self.n_parts)

    def ospec(self):
        """Return a fresh output record (``IntermediateData``)."""
        return IntermediateData(self.part_pts, self.output_width, self.n_parts)

    def setup(self, m, i):
        """Register this module in ``m`` and drive its input from ``i``."""
        m.submodules.intermediates = self
        m.d.comb += self.i.eq(i)

    def process(self, i):
        """Return the output record; ``i`` is not used."""
        return self.o

    def elaborate(self, platform):
        """Build the four HI/LO selectors and forward the remaining data."""
        m = Module()
        comb = m.d.comb

        out_part_ops = self.i.part_ops
        out_part_pts = self.i.part_pts

        # (lane width, number of lanes, index into o.outputs)
        for lane_bits, n_lanes, out_idx in ((64, 1, 3),
                                            (32, 2, 2),
                                            (16, 4, 1),
                                            (8, 8, 0)):
            sel = IntermediateOut(lane_bits, 128, n_lanes)
            setattr(m.submodules, "io%d" % lane_bits, sel)
            comb += sel.intermed.eq(self.i.output)
            for idx in range(8):
                comb += sel.part_ops[idx].eq(out_part_ops[idx])
            comb += self.o.outputs[out_idx].eq(sel.output)

        # pass the operation codes, partition points and raw result along
        for idx in range(8):
            comb += self.o.part_ops[idx].eq(out_part_ops[idx])
        comb += self.o.part_pts.eq(out_part_pts)
        comb += self.o.intermediate_output.eq(self.i.output)

        return m
1343
1344
class Mul8_16_32_64(Elaboratable):
    """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.

    Supports partitioning into any combination of 8, 16, 32, and 64-bit
    partitions on naturally-aligned boundaries. Supports the operation being
    set for each partition independently.

    :attribute part_pts: the input partition points. Has a partition point at
        multiples of 8 in 0 < i < 64. Each partition point's associated
        ``Value`` is a ``Signal``. Modification not supported, except for by
        ``Signal.eq``.
    :attribute part_ops: the operation for each byte. The operation for a
        particular partition is selected by assigning the selected operation
        code to each byte in the partition. The allowed operation codes are:

        :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
            RISC-V's `mul` instruction.
        :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
            instruction.
        :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
            where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
            `mulhsu` instruction.
        :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
            instruction.
    """

    def __init__(self, register_levels=()):
        """ register_levels: specifies the points in the cascade at which
            flip-flops are to be inserted.
        """

        # parameter(s)
        self.register_levels = list(register_levels)

        self.i = self.ispec()
        self.o = self.ospec()

        # inputs: aliases onto the input record
        self.part_pts = self.i.part_pts
        self.part_ops = self.i.part_ops
        self.a = self.i.a
        self.b = self.i.b

        # output: aliases onto the output record
        self.intermediate_output = self.o.intermediate_output
        self.output = self.o.output

    def ispec(self):
        """Return a fresh input record (``InputData``)."""
        return InputData()

    def ospec(self):
        """Return a fresh output record (``OutputData``)."""
        return OutputData()

    def elaborate(self, platform):
        """Chain term generation, add-reduction and output selection."""
        m = Module()

        part_pts = self.part_pts

        # stage 1: 8x8 product terms plus 4 sign-correction terms
        n_inputs = 64 + 4
        n_parts = 8
        all_terms = AllTerms(n_inputs, 128, n_parts, self.register_levels)
        all_terms.setup(m, self.i)

        # stage 2: add-reduce levels, registered where requested
        reducer = AddReduceInternal(all_terms.process(self.i), 128,
                                    partition_step=2)

        data = reducer.i
        for idx, level in enumerate(reducer.levels):
            setattr(m.submodules, "addreduce_%d" % idx, level)
            domain = m.d.sync if idx in self.register_levels else m.d.comb
            domain += level.i.eq(data)
            data = level.o # input of the next level

        # stage 3: HI/LO selection for each lane size
        interm = Intermediates(128, 8, part_pts)
        interm.setup(m, data)
        selected = interm.process(interm.i)

        # final output
        finalout = FinalOut(128, 8, part_pts)
        finalout.setup(m, selected)
        m.d.comb += self.o.eq(finalout.process(selected))

        return m
1434
1435
if __name__ == "__main__":
    # build an unpipelined multiplier and hand it to the nmigen CLI,
    # exposing the operands, both outputs and all partition controls
    m = Mul8_16_32_64()
    ports = [m.a, m.b, m.intermediate_output, m.output]
    ports.extend(m.part_ops)
    ports.extend(m.part_pts.values())
    main(m, ports=ports)