rename AllTermsData to InputData, use as input to base class Mul8_16_32_64
[ieee754fpu.git] / src / ieee754 / part_mul_add / multiply.py
1 # SPDX-License-Identifier: LGPL-2.1-or-later
2 # See Notices.txt for copyright information
3 """Integer Multiplication."""
4
5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
6 from nmigen.hdl.ast import Assign
7 from abc import ABCMeta, abstractmethod
8 from nmigen.cli import main
9 from functools import reduce
10 from operator import or_
11
12
13 class PartitionPoints(dict):
14 """Partition points and corresponding ``Value``s.
15
16 The points at where an ALU is partitioned along with ``Value``s that
17 specify if the corresponding partition points are enabled.
18
19 For example: ``{1: True, 5: True, 10: True}`` with
20 ``width == 16`` specifies that the ALU is split into 4 sections:
21 * bits 0 <= ``i`` < 1
22 * bits 1 <= ``i`` < 5
23 * bits 5 <= ``i`` < 10
24 * bits 10 <= ``i`` < 16
25
26 If the partition_points were instead ``{1: True, 5: a, 10: True}``
27 where ``a`` is a 1-bit ``Signal``:
28 * If ``a`` is asserted:
29 * bits 0 <= ``i`` < 1
30 * bits 1 <= ``i`` < 5
31 * bits 5 <= ``i`` < 10
32 * bits 10 <= ``i`` < 16
33 * Otherwise
34 * bits 0 <= ``i`` < 1
35 * bits 1 <= ``i`` < 10
36 * bits 10 <= ``i`` < 16
37 """
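    # A minimal usage sketch (illustrative only, not executed here): build a
    # set of partition points for a 16-bit ALU with a run-time switch at bit 8,
    # then derive a per-point Signal copy and an AND-mask from it.
    #
    #     switch = Signal()
    #     pp = PartitionPoints({4: True, 8: switch, 12: True})
    #     pp_regs = pp.like(name="pp_reg")   # Signals, assignable via .eq()
    #     mask = pp.as_mask(16)              # bits 4/8/12 clear when enabled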
38
39 def __init__(self, partition_points=None):
40 """Create a new ``PartitionPoints``.
41
42         :param partition_points: a mapping from partition points to enable values.
43 """
44 super().__init__()
45 if partition_points is not None:
46 for point, enabled in partition_points.items():
47 if not isinstance(point, int):
48 raise TypeError("point must be a non-negative integer")
49 if point < 0:
50 raise ValueError("point must be a non-negative integer")
51 self[point] = Value.wrap(enabled)
52
53 def like(self, name=None, src_loc_at=0, mul=1):
54 """Create a new ``PartitionPoints`` with ``Signal``s for all values.
55
56 :param name: the base name for the new ``Signal``s.
57 :param mul: a multiplication factor on the indices
58 """
59 if name is None:
60 name = Signal(src_loc_at=1+src_loc_at).name # get variable name
61 retval = PartitionPoints()
62 for point, enabled in self.items():
63 point *= mul
64 retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
65 return retval
66
67 def eq(self, rhs):
68 """Assign ``PartitionPoints`` using ``Signal.eq``."""
69 if set(self.keys()) != set(rhs.keys()):
70 raise ValueError("incompatible point set")
71 for point, enabled in self.items():
72 yield enabled.eq(rhs[point])
73
74 def as_mask(self, width, mul=1):
75 """Create a bit-mask from `self`.
76
77 Each bit in the returned mask is clear only if the partition point at
78 the same bit-index is enabled.
79
80 :param width: the bit width of the resulting mask
81         :param mul: a "multiplier" which in-place expands the partition points;
82             typically set to 2 when the mask is used for a multiplier
83 """
84 bits = []
85 for i in range(width):
86 i /= mul
87 if i.is_integer() and int(i) in self:
88                 bits.append(~self[int(i)])
89 else:
90 bits.append(True)
91 return Cat(*bits)
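    # Worked example for as_mask() (a sketch, assuming mul=1): with points
    # ``{4: C(1), 8: C(0)}`` and ``width=12`` the returned mask is
    # 0b1111_1110_1111 -- bit 4 is cleared because that point is enabled,
    # bit 8 stays set because that point is disabled, all other bits are 1.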
92
93 def get_max_partition_count(self, width):
94 """Get the maximum number of partitions.
95
96 Gets the number of partitions when all partition points are enabled.
97 """
98 retval = 1
99 for point in self.keys():
100 if point < width:
101 retval += 1
102 return retval
103
104 def fits_in_width(self, width):
105 """Check if all partition points are smaller than `width`."""
106 for point in self.keys():
107 if point >= width:
108 return False
109 return True
110
111 def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
112 if index == -1 or index == 7:
113 return C(True, 1)
114 assert index >= 0 and index < 8
115 return self[(index * 8 + 8)*mfactor]
116
117
118 class FullAdder(Elaboratable):
119 """Full Adder.
120
121 :attribute in0: the first input
122 :attribute in1: the second input
123 :attribute in2: the third input
124 :attribute sum: the sum output
125 :attribute carry: the carry output
126
127 Rather than do individual full adders (and have an array of them,
128 which would be very slow to simulate), this module can specify the
129 bit width of the inputs and outputs: in effect it performs multiple
130 Full 3-2 Add operations "in parallel".
131 """
132
133 def __init__(self, width):
134 """Create a ``FullAdder``.
135
136 :param width: the bit width of the input and output
137 """
138 self.in0 = Signal(width, reset_less=True)
139 self.in1 = Signal(width, reset_less=True)
140 self.in2 = Signal(width, reset_less=True)
141 self.sum = Signal(width, reset_less=True)
142 self.carry = Signal(width, reset_less=True)
143
144 def elaborate(self, platform):
145 """Elaborate this module."""
146 m = Module()
147 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
148 m.d.comb += self.carry.eq((self.in0 & self.in1)
149 | (self.in1 & self.in2)
150 | (self.in2 & self.in0))
151 return m
152
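# Worked example for FullAdder (bit-wise, width=4): with in0=0b1010,
# in1=0b0110 and in2=0b0011, sum = in0^in1^in2 = 0b1111 and
# carry = majority = 0b0010, so sum + (carry << 1) = 15 + 4 = 19 = 10 + 6 + 3,
# i.e. each bit position acts as an independent 3:2 compressor.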
153
154 class MaskedFullAdder(Elaboratable):
155 """Masked Full Adder.
156
157 :attribute mask: the carry partition mask
158 :attribute in0: the first input
159 :attribute in1: the second input
160 :attribute in2: the third input
161 :attribute sum: the sum output
162 :attribute mcarry: the masked carry output
163
164 FullAdders are always used with a "mask" on the output. To keep
165 the graphviz "clean", this class performs the masking here rather
166 than inside a large for-loop.
167
168 See the following discussion as to why this is no longer derived
169 from FullAdder. Each carry is shifted here *before* being ANDed
170 with the mask, so that an AOI cell may be used (which is more
171 gate-efficient)
172 https://en.wikipedia.org/wiki/AND-OR-Invert
173 https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
174 """
175
176 def __init__(self, width):
177 """Create a ``MaskedFullAdder``.
178
179 :param width: the bit width of the input and output
180 """
181 self.width = width
182 self.mask = Signal(width, reset_less=True)
183 self.mcarry = Signal(width, reset_less=True)
184 self.in0 = Signal(width, reset_less=True)
185 self.in1 = Signal(width, reset_less=True)
186 self.in2 = Signal(width, reset_less=True)
187 self.sum = Signal(width, reset_less=True)
188
189 def elaborate(self, platform):
190 """Elaborate this module."""
191 m = Module()
192 s1 = Signal(self.width, reset_less=True)
193 s2 = Signal(self.width, reset_less=True)
194 s3 = Signal(self.width, reset_less=True)
195 c1 = Signal(self.width, reset_less=True)
196 c2 = Signal(self.width, reset_less=True)
197 c3 = Signal(self.width, reset_less=True)
198 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
199 m.d.comb += s1.eq(Cat(0, self.in0))
200 m.d.comb += s2.eq(Cat(0, self.in1))
201 m.d.comb += s3.eq(Cat(0, self.in2))
202 m.d.comb += c1.eq(s1 & s2 & self.mask)
203 m.d.comb += c2.eq(s2 & s3 & self.mask)
204 m.d.comb += c3.eq(s3 & s1 & self.mask)
205 m.d.comb += self.mcarry.eq(c1 | c2 | c3)
206 return m
207
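# In other words, MaskedFullAdder computes
#     sum    = in0 ^ in1 ^ in2
#     mcarry = (((in0 & in1) | (in1 & in2) | (in2 & in0)) << 1) & mask
# so the carry is already aligned to the next bit position and is zeroed at
# every enabled partition boundary (the mask typically comes from
# PartitionPoints.as_mask with mul=2).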
208
209 class PartitionedAdder(Elaboratable):
210 """Partitioned Adder.
211
212 Performs the final add. The partition points are included in the
213 actual add (in one of the operands only), which causes a carry over
214 to the next bit. Then the final output *removes* the extra bits from
215 the result.
216
217 partition: .... P... P... P... P... (32 bits)
218 a : .... .... .... .... .... (32 bits)
219 b : .... .... .... .... .... (32 bits)
220 exp-a : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
221 exp-b : ....0....0....0....0.... (32 bits plus 4 zeros)
222 exp-o : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
223 o : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
224
225 :attribute width: the bit width of the input and output. Read-only.
226 :attribute a: the first input to the adder
227 :attribute b: the second input to the adder
228 :attribute output: the sum output
229 :attribute partition_points: the input partition points. Modification not
230 supported, except for by ``Signal.eq``.
231 """
232
233 def __init__(self, width, partition_points, partition_step=1):
234 """Create a ``PartitionedAdder``.
235
236 :param width: the bit width of the input and output
237 :param partition_points: the input partition points
238         :param partition_step: a multiplier (typically 2) which in-place
239                "expands" the partition points
240 """
241 self.width = width
242 self.pmul = partition_step
243 self.a = Signal(width, reset_less=True)
244 self.b = Signal(width, reset_less=True)
245 self.output = Signal(width, reset_less=True)
246 self.partition_points = PartitionPoints(partition_points)
247 if not self.partition_points.fits_in_width(width):
248 raise ValueError("partition_points doesn't fit in width")
249 expanded_width = 0
250 for i in range(self.width):
251 if i in self.partition_points:
252 expanded_width += 1
253 expanded_width += 1
254 self._expanded_width = expanded_width
255
256 def elaborate(self, platform):
257 """Elaborate this module."""
258 m = Module()
259 expanded_a = Signal(self._expanded_width, reset_less=True)
260 expanded_b = Signal(self._expanded_width, reset_less=True)
261 expanded_o = Signal(self._expanded_width, reset_less=True)
262
263 expanded_index = 0
264 # store bits in a list, use Cat later. graphviz is much cleaner
265 al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
266
267 # partition points are "breaks" (extra zeros or 1s) in what would
268 # otherwise be a massive long add. when the "break" points are 0,
269 # whatever is in it (in the output) is discarded. however when
270 # there is a "1", it causes a roll-over carry to the *next* bit.
271 # we still ignore the "break" bit in the [intermediate] output,
272 # however by that time we've got the effect that we wanted: the
273 # carry has been carried *over* the break point.
274
275 for i in range(self.width):
276 pi = i/self.pmul # double the range of the partition point test
277             if pi.is_integer() and int(pi) in self.partition_points:
278 # add extra bit set to 0 + 0 for enabled partition points
279 # and 1 + 0 for disabled partition points
280 ea.append(expanded_a[expanded_index])
281                 al.append(~self.partition_points[int(pi)]) # add extra bit in a
282 eb.append(expanded_b[expanded_index])
283 bl.append(C(0)) # yes, add a zero
284 expanded_index += 1 # skip the extra point. NOT in the output
285 ea.append(expanded_a[expanded_index])
286 eb.append(expanded_b[expanded_index])
287 eo.append(expanded_o[expanded_index])
288 al.append(self.a[i])
289 bl.append(self.b[i])
290 ol.append(self.output[i])
291 expanded_index += 1
292
293 # combine above using Cat
294 m.d.comb += Cat(*ea).eq(Cat(*al))
295 m.d.comb += Cat(*eb).eq(Cat(*bl))
296 m.d.comb += Cat(*ol).eq(Cat(*eo))
297
298 # use only one addition to take advantage of look-ahead carry and
299 # special hardware on FPGAs
300 m.d.comb += expanded_o.eq(expanded_a + expanded_b)
301 return m
302
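# Worked example for PartitionedAdder (width=8, partition point at 4,
# partition_step=1): adding a=0x0F and b=0x01 expands to a 9-bit add with an
# extra column inserted at bit 4.  With the partition *enabled* the extra
# column adds 0 + 0, the carry out of bit 3 dies there, and the result is
# 0x00 (two independent 4-bit lanes).  With the partition *disabled* the
# extra column adds 1 + 0, an incoming carry ripples across it, and the
# result is the plain 8-bit sum 0x10.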
303
304 FULL_ADDER_INPUT_COUNT = 3
305
306 class AddReduceData:
307
308 def __init__(self, part_pts, n_inputs, output_width, n_parts):
309 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
310 for i in range(n_parts)]
311 self.terms = [Signal(output_width, name=f"inputs_{i}",
312 reset_less=True)
313 for i in range(n_inputs)]
314 self.part_pts = part_pts.like()
315
316 def eq_from(self, part_pts, inputs, part_ops):
317 return [self.part_pts.eq(part_pts)] + \
318 [self.terms[i].eq(inputs[i])
319 for i in range(len(self.terms))] + \
320 [self.part_ops[i].eq(part_ops[i])
321 for i in range(len(self.part_ops))]
322
323 def eq(self, rhs):
324 return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
325
326
327 class FinalReduceData:
328
329 def __init__(self, part_pts, output_width, n_parts):
330 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
331 for i in range(n_parts)]
332 self.output = Signal(output_width, reset_less=True)
333 self.part_pts = part_pts.like()
334
335 def eq_from(self, part_pts, output, part_ops):
336 return [self.part_pts.eq(part_pts)] + \
337 [self.output.eq(output)] + \
338 [self.part_ops[i].eq(part_ops[i])
339 for i in range(len(self.part_ops))]
340
341 def eq(self, rhs):
342 return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
343
344
345 class FinalAdd(Elaboratable):
346 """ Final stage of add reduce
347 """
348
349 def __init__(self, n_inputs, output_width, n_parts, partition_points):
350 self.i = AddReduceData(partition_points, n_inputs,
351 output_width, n_parts)
352 self.o = FinalReduceData(partition_points, output_width, n_parts)
353 self.output_width = output_width
354 self.n_inputs = n_inputs
355 self.n_parts = n_parts
356 self.partition_points = PartitionPoints(partition_points)
357 if not self.partition_points.fits_in_width(output_width):
358 raise ValueError("partition_points doesn't fit in output_width")
359
360 def elaborate(self, platform):
361 """Elaborate this module."""
362 m = Module()
363
364 output_width = self.output_width
365 output = Signal(output_width, reset_less=True)
366 if self.n_inputs == 0:
367 # use 0 as the default output value
368 m.d.comb += output.eq(0)
369 elif self.n_inputs == 1:
370 # handle single input
371 m.d.comb += output.eq(self.i.terms[0])
372 else:
373 # base case for adding 2 inputs
374 assert self.n_inputs == 2
375 adder = PartitionedAdder(output_width,
376 self.i.part_pts, 2)
377 m.submodules.final_adder = adder
378 m.d.comb += adder.a.eq(self.i.terms[0])
379 m.d.comb += adder.b.eq(self.i.terms[1])
380 m.d.comb += output.eq(adder.output)
381
382 # create output
383 m.d.comb += self.o.eq_from(self.i.part_pts, output,
384 self.i.part_ops)
385
386 return m
387
388
389 class AddReduceSingle(Elaboratable):
390     """Add list of numbers together in a single 3:2 reduction layer.
391 
392     :attribute i: the ``AddReduceData`` input (terms, partition points and
393         part ops). Modification not supported, except for by ``Signal.eq``.
394     :attribute o: the ``AddReduceData`` output holding the reduced set of
395         terms passed on to the next layer.
396     :attribute output_width: bit-width of each term.
397 :attribute partition_points: the input partition points. Modification not
398 supported, except for by ``Signal.eq``.
399 """
400
401 def __init__(self, n_inputs, output_width, n_parts, partition_points):
402         """Create an ``AddReduceSingle``.
403
404         :param n_inputs: number of input terms to be summed.
405 :param output_width: bit-width of ``output``.
406 :param partition_points: the input partition points.
407 """
408 self.n_inputs = n_inputs
409 self.n_parts = n_parts
410 self.output_width = output_width
411 self.i = AddReduceData(partition_points, n_inputs,
412 output_width, n_parts)
413 self.partition_points = PartitionPoints(partition_points)
414 if not self.partition_points.fits_in_width(output_width):
415 raise ValueError("partition_points doesn't fit in output_width")
416
417 self.groups = AddReduceSingle.full_adder_groups(n_inputs)
418 n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
419 self.o = AddReduceData(partition_points, n_terms, output_width, n_parts)
420
421 @staticmethod
422 def calc_n_inputs(n_inputs, groups):
423 retval = len(groups)*2
424 if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
425 retval += 1
426 elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
427 retval += 2
428 else:
429 assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
430 return retval
431
432 @staticmethod
433 def get_max_level(input_count):
434 """Get the maximum level.
435
436 All ``register_levels`` must be less than or equal to the maximum
437 level.
438 """
439 retval = 0
440 while True:
441 groups = AddReduceSingle.full_adder_groups(input_count)
442 if len(groups) == 0:
443 return retval
444 input_count %= FULL_ADDER_INPUT_COUNT
445 input_count += 2 * len(groups)
446 retval += 1
447
448 @staticmethod
449 def full_adder_groups(input_count):
450 """Get ``inputs`` indices for which a full adder should be built."""
451 return range(0,
452 input_count - FULL_ADDER_INPUT_COUNT + 1,
453 FULL_ADDER_INPUT_COUNT)
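    # Illustrative reduction schedule (a sketch): each layer shrinks N terms
    # to 2*(N // 3) plus the remainder, so e.g. 7 input terms reduce as
    # 7 -> 5 -> 4 -> 3 -> 2, and the final 2 terms are handled by FinalAdd
    # with a single PartitionedAdder; get_max_level(7) == 4 accordingly.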
454
455 def create_next_terms(self):
456 """ create next intermediate terms, for linking up in elaborate, below
457 """
458 terms = []
459 adders = []
460
461 # create full adders for this recursive level.
462 # this shrinks N terms to 2 * (N // 3) plus the remainder
463 for i in self.groups:
464 adder_i = MaskedFullAdder(self.output_width)
465 adders.append((i, adder_i))
466 # add both the sum and the masked-carry to the next level.
467 # 3 inputs have now been reduced to 2...
468 terms.append(adder_i.sum)
469 terms.append(adder_i.mcarry)
470 # handle the remaining inputs.
471 if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
472 terms.append(self.i.terms[-1])
473 elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
474             # Just pass the two remaining terms to the next layer: a half
475             # adder wouldn't help, since there would still be 2 terms, and
476             # passing them straight through saves gates.
477 terms.append(self.i.terms[-2])
478 terms.append(self.i.terms[-1])
479 else:
480 assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
481
482 return terms, adders
483
484 def elaborate(self, platform):
485 """Elaborate this module."""
486 m = Module()
487
488 terms, adders = self.create_next_terms()
489
490 # copy the intermediate terms to the output
491 for i, value in enumerate(terms):
492 m.d.comb += self.o.terms[i].eq(value)
493
494 # copy reg part points and part ops to output
495 m.d.comb += self.o.part_pts.eq(self.i.part_pts)
496 m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
497 for i in range(len(self.i.part_ops))]
498
499 # set up the partition mask (for the adders)
500 part_mask = Signal(self.output_width, reset_less=True)
501
502 # get partition points as a mask
503 mask = self.i.part_pts.as_mask(self.output_width, mul=2)
504 m.d.comb += part_mask.eq(mask)
505
506 # add and link the intermediate term modules
507 for i, (iidx, adder_i) in enumerate(adders):
508 setattr(m.submodules, f"adder_{i}", adder_i)
509
510 m.d.comb += adder_i.in0.eq(self.i.terms[iidx])
511 m.d.comb += adder_i.in1.eq(self.i.terms[iidx + 1])
512 m.d.comb += adder_i.in2.eq(self.i.terms[iidx + 2])
513 m.d.comb += adder_i.mask.eq(part_mask)
514
515 return m
516
517
518 class AddReduce(Elaboratable):
519 """Recursively Add list of numbers together.
520
521 :attribute inputs: input ``Signal``s to be summed. Modification not
522 supported, except for by ``Signal.eq``.
523 :attribute register_levels: List of nesting levels that should have
524 pipeline registers.
525     :attribute o: the ``FinalReduceData`` output sum.
526 :attribute partition_points: the input partition points. Modification not
527 supported, except for by ``Signal.eq``.
528 """
529
530 def __init__(self, inputs, output_width, register_levels, partition_points,
531 part_ops):
532 """Create an ``AddReduce``.
533
534 :param inputs: input ``Signal``s to be summed.
535 :param output_width: bit-width of ``output``.
536 :param register_levels: List of nesting levels that should have
537 pipeline registers.
538 :param partition_points: the input partition points.
539 """
540 self.inputs = inputs
541 self.part_ops = part_ops
542 n_parts = len(part_ops)
543 self.o = FinalReduceData(partition_points, output_width, n_parts)
544 self.output_width = output_width
545 self.register_levels = register_levels
546 self.partition_points = partition_points
547
548 self.create_levels()
549
550 @staticmethod
551 def get_max_level(input_count):
552 return AddReduceSingle.get_max_level(input_count)
553
554 @staticmethod
555 def next_register_levels(register_levels):
556 """``Iterable`` of ``register_levels`` for next recursive level."""
557 for level in register_levels:
558 if level > 0:
559 yield level - 1
560
561 def create_levels(self):
562 """creates reduction levels"""
563
564 mods = []
565 partition_points = self.partition_points
566 part_ops = self.part_ops
567 n_parts = len(part_ops)
568 inputs = self.inputs
569 ilen = len(inputs)
570 while True:
571 groups = AddReduceSingle.full_adder_groups(len(inputs))
572 if len(groups) == 0:
573 break
574 next_level = AddReduceSingle(ilen, self.output_width, n_parts,
575 partition_points)
576 mods.append(next_level)
577 partition_points = next_level.i.part_pts
578 inputs = next_level.o.terms
579 ilen = len(inputs)
580 part_ops = next_level.i.part_ops
581
582 next_level = FinalAdd(ilen, self.output_width, n_parts,
583 partition_points)
584 mods.append(next_level)
585
586 self.levels = mods
587
588 def elaborate(self, platform):
589 """Elaborate this module."""
590 m = Module()
591
592 for i, next_level in enumerate(self.levels):
593 setattr(m.submodules, "next_level%d" % i, next_level)
594
595 partition_points = self.partition_points
596 inputs = self.inputs
597 part_ops = self.part_ops
598 n_parts = len(part_ops)
599 n_inputs = len(inputs)
600 output_width = self.output_width
601 i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
602 m.d.comb += i.eq_from(partition_points, inputs, part_ops)
603 for idx in range(len(self.levels)):
604 mcur = self.levels[idx]
605 if idx in self.register_levels:
606 m.d.sync += mcur.i.eq(i)
607 else:
608 m.d.comb += mcur.i.eq(i)
609 i = mcur.o # for next loop
610
611 # output comes from last module
612 m.d.comb += self.o.eq(i)
613
614 return m
615
616
617 OP_MUL_LOW = 0
618 OP_MUL_SIGNED_HIGH = 1
619 OP_MUL_SIGNED_UNSIGNED_HIGH = 2 # a is signed, b is unsigned
620 OP_MUL_UNSIGNED_HIGH = 3
621
622
623 def get_term(value, shift=0, enabled=None):
624 if enabled is not None:
625 value = Mux(enabled, value, 0)
626 if shift > 0:
627 value = Cat(Repl(C(0, 1), shift), value)
628 else:
629 assert shift == 0
630 return value
631
632
633 class ProductTerm(Elaboratable):
634 """ this class creates a single product term (a[..]*b[..]).
635         it has a design flaw in that it is the *output* that is selected:
636         the multiplication(s) are combinatorially generated all the time,
637         whether the term is enabled or not.
638 """
639
640 def __init__(self, width, twidth, pbwid, a_index, b_index):
641 self.a_index = a_index
642 self.b_index = b_index
643 shift = 8 * (self.a_index + self.b_index)
644 self.pwidth = width
645 self.twidth = twidth
646 self.width = width*2
647 self.shift = shift
648
649 self.ti = Signal(self.width, reset_less=True)
650 self.term = Signal(twidth, reset_less=True)
651 self.a = Signal(twidth//2, reset_less=True)
652 self.b = Signal(twidth//2, reset_less=True)
653 self.pb_en = Signal(pbwid, reset_less=True)
654
655 self.tl = tl = []
656 min_index = min(self.a_index, self.b_index)
657 max_index = max(self.a_index, self.b_index)
658 for i in range(min_index, max_index):
659 tl.append(self.pb_en[i])
660 name = "te_%d_%d" % (self.a_index, self.b_index)
661 if len(tl) > 0:
662 term_enabled = Signal(name=name, reset_less=True)
663 else:
664 term_enabled = None
665 self.enabled = term_enabled
666 self.term.name = "term_%d_%d" % (a_index, b_index) # rename
667
668 def elaborate(self, platform):
669
670 m = Module()
671 if self.enabled is not None:
672 m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
673
674 bsa = Signal(self.width, reset_less=True)
675 bsb = Signal(self.width, reset_less=True)
676 a_index, b_index = self.a_index, self.b_index
677 pwidth = self.pwidth
678 m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
679 m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
680 m.d.comb += self.ti.eq(bsa * bsb)
681 m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
682 """
683 #TODO: sort out width issues, get inputs a/b switched on/off.
684 #data going into Muxes is 1/2 the required width
685
686 pwidth = self.pwidth
687 width = self.width
688 bsa = Signal(self.twidth//2, reset_less=True)
689 bsb = Signal(self.twidth//2, reset_less=True)
690 asel = Signal(width, reset_less=True)
691 bsel = Signal(width, reset_less=True)
692 a_index, b_index = self.a_index, self.b_index
693 m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
694 m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
695 m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
696 m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
697 m.d.comb += self.ti.eq(bsa * bsb)
698 m.d.comb += self.term.eq(self.ti)
699 """
700
701 return m
702
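# Worked example for ProductTerm (a sketch): with pwidth=8, a_index=1 and
# b_index=2 the term is (a[8:16] * b[16:24]) << 24, and it is forced to zero
# whenever pb_en[1] is set, i.e. whenever a partition boundary at bit 16
# separates byte 1 of ``a`` from byte 2 of ``b``.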
703
704 class ProductTerms(Elaboratable):
705     """ creates a bank of product terms, and also performs the actual bit-selection.
706 this class is to be wrapped with a for-loop on the "a" operand.
707 it creates a second-level for-loop on the "b" operand.
708 """
709 def __init__(self, width, twidth, pbwid, a_index, blen):
710 self.a_index = a_index
711 self.blen = blen
712 self.pwidth = width
713 self.twidth = twidth
714 self.pbwid = pbwid
715 self.a = Signal(twidth//2, reset_less=True)
716 self.b = Signal(twidth//2, reset_less=True)
717 self.pb_en = Signal(pbwid, reset_less=True)
718 self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
719 for i in range(blen)]
720
721 def elaborate(self, platform):
722
723 m = Module()
724
725 for b_index in range(self.blen):
726 t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
727 self.a_index, b_index)
728 setattr(m.submodules, "term_%d" % b_index, t)
729
730 m.d.comb += t.a.eq(self.a)
731 m.d.comb += t.b.eq(self.b)
732 m.d.comb += t.pb_en.eq(self.pb_en)
733
734 m.d.comb += self.terms[b_index].eq(t.term)
735
736 return m
737
738
739 class LSBNegTerm(Elaboratable):
740
741 def __init__(self, bit_width):
742 self.bit_width = bit_width
743 self.part = Signal(reset_less=True)
744 self.signed = Signal(reset_less=True)
745 self.op = Signal(bit_width, reset_less=True)
746 self.msb = Signal(reset_less=True)
747 self.nt = Signal(bit_width*2, reset_less=True)
748 self.nl = Signal(bit_width*2, reset_less=True)
749
750 def elaborate(self, platform):
751 m = Module()
752 comb = m.d.comb
753 bit_wid = self.bit_width
754 ext = Repl(0, bit_wid) # extend output to HI part
755
756 # determine sign of each incoming number *in this partition*
757 enabled = Signal(reset_less=True)
758 m.d.comb += enabled.eq(self.part & self.msb & self.signed)
759
760 # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
761 # negation operation is split into a bitwise not and a +1.
762 # likewise for 16, 32, and 64-bit values.
763
764 # width-extended 1s complement if a is signed, otherwise zero
765 comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
766
767 # add 1 if signed, otherwise add zero
768 comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
769
770 return m
771
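# Worked example for the sign-correction terms (8-bit lane, a=3, b=-2):
# the unsigned byte product is 3 * 0xFE = 762.  LSBNegTerm contributes
# nt = (~3 & 0xFF) << 8 = 0xFC00 and nl = 1 << 8 = 0x0100, i.e. together
# they add (-3) << 8 modulo 2**16, and 762 + 0xFC00 + 0x0100 = 0xFFFA = -6
# in 16-bit two's complement, which is the correct signed product.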
772
773 class Parts(Elaboratable):
774
775 def __init__(self, pbwid, part_pts, n_parts):
776 self.pbwid = pbwid
777 # inputs
778 self.part_pts = PartitionPoints.like(part_pts)
779 # outputs
780 self.parts = [Signal(name=f"part_{i}", reset_less=True)
781 for i in range(n_parts)]
782
783 def elaborate(self, platform):
784 m = Module()
785
786 part_pts, parts = self.part_pts, self.parts
787 # collect part-bytes (double factor because the input is extended)
788 pbs = Signal(self.pbwid, reset_less=True)
789 tl = []
790 for i in range(self.pbwid):
791 pb = Signal(name="pb%d" % i, reset_less=True)
792 m.d.comb += pb.eq(part_pts.part_byte(i))
793 tl.append(pb)
794 m.d.comb += pbs.eq(Cat(*tl))
795
796 # negated-temporary copy of partition bits
797 npbs = Signal.like(pbs, reset_less=True)
798 m.d.comb += npbs.eq(~pbs)
799 byte_count = 8 // len(parts)
800 for i in range(len(parts)):
801 pbl = []
802 pbl.append(npbs[i * byte_count - 1])
803 for j in range(i * byte_count, (i + 1) * byte_count - 1):
804 pbl.append(pbs[j])
805 pbl.append(npbs[(i + 1) * byte_count - 1])
806 value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
807 m.d.comb += value.eq(Cat(*pbl))
808 m.d.comb += parts[i].eq(~(value).bool())
809
810 return m
811
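# Example of what Parts computes (a sketch): with only the partition point at
# bit 32 enabled, an instance with n_parts=2 (32-bit lanes) reports
# parts == [1, 1], while instances with n_parts=1, 4 or 8 report all zeros --
# i.e. parts[i] is set only when lane i exactly matches one partition.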
812
813 class Part(Elaboratable):
814 """ a key class which, depending on the partitioning, will determine
815 what action to take when parts of the output are signed or unsigned.
816
817 this requires 2 pieces of data *per operand, per partition*:
818 whether the MSB is HI/LO (per partition!), and whether a signed
819 or unsigned operation has been *requested*.
820
821         once that is determined, signed multiplication is basically carried out
822 by splitting 2's complement into 1's complement plus one.
823 1's complement is just a bit-inversion.
824
825 the extra terms - as separate terms - are then thrown at the
826 AddReduce alongside the multiplication part-results.
827 """
828 def __init__(self, part_pts, width, n_parts, n_levels, pbwid):
829
830 self.pbwid = pbwid
831 self.part_pts = part_pts
832
833 # inputs
834 self.a = Signal(64, reset_less=True)
835 self.b = Signal(64, reset_less=True)
836 self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
837 for i in range(8)]
838         self.b_signed = [Signal(name=f"b_signed_{i}", reset_less=True)
839 for i in range(8)]
840 self.pbs = Signal(pbwid, reset_less=True)
841
842 # outputs
843 self.parts = [Signal(name=f"part_{i}", reset_less=True)
844 for i in range(n_parts)]
845
846 self.not_a_term = Signal(width, reset_less=True)
847 self.neg_lsb_a_term = Signal(width, reset_less=True)
848 self.not_b_term = Signal(width, reset_less=True)
849 self.neg_lsb_b_term = Signal(width, reset_less=True)
850
851 def elaborate(self, platform):
852 m = Module()
853
854 pbs, parts = self.pbs, self.parts
855 part_pts = self.part_pts
856 m.submodules.p = p = Parts(self.pbwid, part_pts, len(parts))
857 m.d.comb += p.part_pts.eq(part_pts)
858 parts = p.parts
859
860 byte_count = 8 // len(parts)
861
862 not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
863 self.not_a_term, self.neg_lsb_a_term,
864 self.not_b_term, self.neg_lsb_b_term)
865
866 byte_width = 8 // len(parts) # byte width
867 bit_wid = 8 * byte_width # bit width
868 nat, nbt, nla, nlb = [], [], [], []
869 for i in range(len(parts)):
870 # work out bit-inverted and +1 term for a.
871 pa = LSBNegTerm(bit_wid)
872 setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
873 m.d.comb += pa.part.eq(parts[i])
874 m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
875 m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
876 m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
877 nat.append(pa.nt)
878 nla.append(pa.nl)
879
880 # work out bit-inverted and +1 term for b
881 pb = LSBNegTerm(bit_wid)
882 setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
883 m.d.comb += pb.part.eq(parts[i])
884 m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
885 m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
886 m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
887 nbt.append(pb.nt)
888 nlb.append(pb.nl)
889
890 # concatenate together and return all 4 results.
891 m.d.comb += [not_a_term.eq(Cat(*nat)),
892 not_b_term.eq(Cat(*nbt)),
893 neg_lsb_a_term.eq(Cat(*nla)),
894 neg_lsb_b_term.eq(Cat(*nlb)),
895 ]
896
897 return m
898
899
900 class IntermediateOut(Elaboratable):
901 """ selects the HI/LO part of the multiplication, for a given bit-width
902     """ selects the HI/LO part of the multiplication, for a given bit-width.
903 """
904 def __init__(self, width, out_wid, n_parts):
905 self.width = width
906 self.n_parts = n_parts
907 self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
908 for i in range(8)]
909 self.intermed = Signal(out_wid, reset_less=True)
910 self.output = Signal(out_wid//2, reset_less=True)
911
912 def elaborate(self, platform):
913 m = Module()
914
915 ol = []
916 w = self.width
917 sel = w // 8
918 for i in range(self.n_parts):
919 op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
920 m.d.comb += op.eq(
921 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
922 self.intermed.part(i * w*2, w),
923 self.intermed.part(i * w*2 + w, w)))
924 ol.append(op)
925 m.d.comb += self.output.eq(Cat(*ol))
926
927 return m
928
929
930 class FinalOut(Elaboratable):
931 """ selects the final output based on the partitioning.
932
933 each byte is selectable independently, i.e. it is possible
934 that some partitions requested 8-bit computation whilst others
935 requested 16 or 32 bit.
936 """
937 def __init__(self, output_width, n_parts, part_pts):
938 self.part_pts = part_pts
939 self.i = IntermediateData(part_pts, output_width, n_parts)
940 self.out_wid = output_width//2
941 # output
942 self.out = Signal(self.out_wid, reset_less=True)
943 self.intermediate_output = Signal(output_width, reset_less=True)
944
945 def elaborate(self, platform):
946 m = Module()
947
948 part_pts = self.part_pts
949 m.submodules.p_8 = p_8 = Parts(8, part_pts, 8)
950 m.submodules.p_16 = p_16 = Parts(8, part_pts, 4)
951 m.submodules.p_32 = p_32 = Parts(8, part_pts, 2)
952 m.submodules.p_64 = p_64 = Parts(8, part_pts, 1)
953
954 out_part_pts = self.i.part_pts
955
956 # temporaries
957 d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
958 d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
959 d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
960
961 i8 = Signal(self.out_wid, reset_less=True)
962 i16 = Signal(self.out_wid, reset_less=True)
963 i32 = Signal(self.out_wid, reset_less=True)
964 i64 = Signal(self.out_wid, reset_less=True)
965
966 m.d.comb += p_8.part_pts.eq(out_part_pts)
967 m.d.comb += p_16.part_pts.eq(out_part_pts)
968 m.d.comb += p_32.part_pts.eq(out_part_pts)
969 m.d.comb += p_64.part_pts.eq(out_part_pts)
970
971 for i in range(len(p_8.parts)):
972 m.d.comb += d8[i].eq(p_8.parts[i])
973 for i in range(len(p_16.parts)):
974 m.d.comb += d16[i].eq(p_16.parts[i])
975 for i in range(len(p_32.parts)):
976 m.d.comb += d32[i].eq(p_32.parts[i])
977 m.d.comb += i8.eq(self.i.outputs[0])
978 m.d.comb += i16.eq(self.i.outputs[1])
979 m.d.comb += i32.eq(self.i.outputs[2])
980 m.d.comb += i64.eq(self.i.outputs[3])
981
982 ol = []
983 for i in range(8):
984 # select one of the outputs: d8 selects i8, d16 selects i16
985 # d32 selects i32, and the default is i64.
986 # d8 and d16 are ORed together in the first Mux
987 # then the 2nd selects either i8 or i16.
988 # if neither d8 nor d16 are set, d32 selects either i32 or i64.
989 op = Signal(8, reset_less=True, name="op_%d" % i)
990 m.d.comb += op.eq(
991 Mux(d8[i] | d16[i // 2],
992 Mux(d8[i], i8.part(i * 8, 8), i16.part(i * 8, 8)),
993 Mux(d32[i // 4], i32.part(i * 8, 8), i64.part(i * 8, 8))))
994 ol.append(op)
995 m.d.comb += self.out.eq(Cat(*ol))
996 m.d.comb += self.intermediate_output.eq(self.i.intermediate_output)
997 return m
998
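# Byte-selection example for FinalOut (a sketch): if byte i lies in an 8-bit
# partition then d8[i] is set and the byte comes from i8; in a 16-bit
# partition d16[i//2] selects i16; in a 32-bit partition d32[i//4] selects
# i32; otherwise the byte falls through to the 64-bit result i64.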
999
1000 class OrMod(Elaboratable):
1001 """ ORs four values together in a hierarchical tree
1002 """
1003 def __init__(self, wid):
1004 self.wid = wid
1005 self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
1006 for i in range(4)]
1007 self.orout = Signal(wid, reset_less=True)
1008
1009 def elaborate(self, platform):
1010 m = Module()
1011 or1 = Signal(self.wid, reset_less=True)
1012 or2 = Signal(self.wid, reset_less=True)
1013 m.d.comb += or1.eq(self.orin[0] | self.orin[1])
1014 m.d.comb += or2.eq(self.orin[2] | self.orin[3])
1015 m.d.comb += self.orout.eq(or1 | or2)
1016
1017 return m
1018
1019
1020 class Signs(Elaboratable):
1021 """ determines whether a or b are signed numbers
1022 based on the required operation type (OP_MUL_*)
1023 """
1024
1025 def __init__(self):
1026 self.part_ops = Signal(2, reset_less=True)
1027 self.a_signed = Signal(reset_less=True)
1028 self.b_signed = Signal(reset_less=True)
1029
1030 def elaborate(self, platform):
1031
1032 m = Module()
1033
1034 asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
1035 bsig = (self.part_ops == OP_MUL_LOW) \
1036 | (self.part_ops == OP_MUL_SIGNED_HIGH)
1037 m.d.comb += self.a_signed.eq(asig)
1038 m.d.comb += self.b_signed.eq(bsig)
1039
1040 return m
1041
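# Truth table implemented by Signs (derived from the OP_MUL_* encoding):
#     OP_MUL_LOW                  -> a signed,   b signed
#     OP_MUL_SIGNED_HIGH          -> a signed,   b signed
#     OP_MUL_SIGNED_UNSIGNED_HIGH -> a signed,   b unsigned
#     OP_MUL_UNSIGNED_HIGH        -> a unsigned, b unsigned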
1042
1043 class IntermediateData:
1044
1045 def __init__(self, part_pts, output_width, n_parts):
1046 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
1047 for i in range(n_parts)]
1048 self.part_pts = part_pts.like()
1049 self.outputs = [Signal(output_width, name="io%d" % i, reset_less=True)
1050 for i in range(4)]
1051 # intermediates (needed for unit tests)
1052 self.intermediate_output = Signal(output_width)
1053
1054 def eq_from(self, part_pts, outputs, intermediate_output,
1055 part_ops):
1056 return [self.part_pts.eq(part_pts)] + \
1057 [self.intermediate_output.eq(intermediate_output)] + \
1058 [self.outputs[i].eq(outputs[i])
1059 for i in range(4)] + \
1060 [self.part_ops[i].eq(part_ops[i])
1061 for i in range(len(self.part_ops))]
1062
1063 def eq(self, rhs):
1064 return self.eq_from(rhs.part_pts, rhs.outputs,
1065 rhs.intermediate_output, rhs.part_ops)
1066
1067
1068 class InputData:
1069
1070 def __init__(self):
1071 self.a = Signal(64)
1072 self.b = Signal(64)
1073 self.part_pts = PartitionPoints()
1074 for i in range(8, 64, 8):
1075 self.part_pts[i] = Signal(name=f"part_pts_{i}")
1076 self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1077
1078     def eq_from(self, part_pts, a, b, part_ops):
1079 return [self.part_pts.eq(part_pts)] + \
1080 [self.a.eq(a), self.b.eq(b)] + \
1081 [self.part_ops[i].eq(part_ops[i])
1082 for i in range(len(self.part_ops))]
1083
1084 def eq(self, rhs):
1085 return self.eq_from(rhs.part_pts, rhs.a, rhs.b, rhs.part_ops)
1086
1087
1088 class AllTerms(Elaboratable):
1089 """Set of terms to be added together
1090 """
1091
1092 def __init__(self, n_inputs, output_width, n_parts, register_levels):
1093         """Create an ``AllTerms``.
1094 
1095         :param n_inputs: number of terms to be generated.
1096         :param output_width: bit-width of each output term.
1097         :param n_parts: number of byte-wide part_ops lanes.
1098         :param register_levels: List of nesting levels that should have
1099             pipeline registers.
1100 """
1101 self.i = InputData()
1102 self.register_levels = register_levels
1103 self.n_inputs = n_inputs
1104 self.n_parts = n_parts
1105 self.output_width = output_width
1106 self.o = AddReduceData(self.i.part_pts, n_inputs,
1107 output_width, n_parts)
1108
1109 def elaborate(self, platform):
1110 m = Module()
1111
1112 eps = self.i.part_pts
1113
1114 # collect part-bytes
1115 pbs = Signal(8, reset_less=True)
1116 tl = []
1117 for i in range(8):
1118 pb = Signal(name="pb%d" % i, reset_less=True)
1119 m.d.comb += pb.eq(eps.part_byte(i))
1120 tl.append(pb)
1121 m.d.comb += pbs.eq(Cat(*tl))
1122
1123 # local variables
1124 signs = []
1125 for i in range(8):
1126 s = Signs()
1127 signs.append(s)
1128 setattr(m.submodules, "signs%d" % i, s)
1129 m.d.comb += s.part_ops.eq(self.i.part_ops[i])
1130
1131 n_levels = len(self.register_levels)+1
1132 m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
1133 m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
1134 m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
1135 m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
1136 nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
1137 for mod in [part_8, part_16, part_32, part_64]:
1138 m.d.comb += mod.a.eq(self.i.a)
1139 m.d.comb += mod.b.eq(self.i.b)
1140 for i in range(len(signs)):
1141 m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
1142 m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
1143 m.d.comb += mod.pbs.eq(pbs)
1144 nat_l.append(mod.not_a_term)
1145 nbt_l.append(mod.not_b_term)
1146 nla_l.append(mod.neg_lsb_a_term)
1147 nlb_l.append(mod.neg_lsb_b_term)
1148
1149 terms = []
1150
1151 for a_index in range(8):
1152 t = ProductTerms(8, 128, 8, a_index, 8)
1153 setattr(m.submodules, "terms_%d" % a_index, t)
1154
1155 m.d.comb += t.a.eq(self.i.a)
1156 m.d.comb += t.b.eq(self.i.b)
1157 m.d.comb += t.pb_en.eq(pbs)
1158
1159 for term in t.terms:
1160 terms.append(term)
1161
1162 # it's fine to bitwise-or data together since they are never enabled
1163 # at the same time
1164 m.submodules.nat_or = nat_or = OrMod(128)
1165 m.submodules.nbt_or = nbt_or = OrMod(128)
1166 m.submodules.nla_or = nla_or = OrMod(128)
1167 m.submodules.nlb_or = nlb_or = OrMod(128)
1168 for l, mod in [(nat_l, nat_or),
1169 (nbt_l, nbt_or),
1170 (nla_l, nla_or),
1171 (nlb_l, nlb_or)]:
1172 for i in range(len(l)):
1173 m.d.comb += mod.orin[i].eq(l[i])
1174 terms.append(mod.orout)
1175
1176 # copy the intermediate terms to the output
1177 for i, value in enumerate(terms):
1178 m.d.comb += self.o.terms[i].eq(value)
1179
1180 # copy reg part points and part ops to output
1181 m.d.comb += self.o.part_pts.eq(eps)
1182 m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
1183 for i in range(len(self.i.part_ops))]
1184
1185 return m
1186
1187
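# Term-count sanity note: AllTerms produces 8 x 8 = 64 byte-wise partial
# products plus the 4 OR-combined sign-correction terms (not_a, not_b,
# neg_lsb_a, neg_lsb_b), which is why Mul8_16_32_64 constructs it with
# n_inputs = 64 + 4.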
1188 class Intermediates(Elaboratable):
1189 """ Intermediate output modules
1190 """
1191
1192 def __init__(self, output_width, n_parts, partition_points):
1193 self.i = FinalReduceData(partition_points, output_width, n_parts)
1194 self.o = IntermediateData(partition_points, output_width, n_parts)
1195
1196 def elaborate(self, platform):
1197 m = Module()
1198
1199 out_part_ops = self.i.part_ops
1200 out_part_pts = self.i.part_pts
1201
1202 # create _output_64
1203 m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
1204 m.d.comb += io64.intermed.eq(self.i.output)
1205 for i in range(8):
1206 m.d.comb += io64.part_ops[i].eq(out_part_ops[i])
1207 m.d.comb += self.o.outputs[3].eq(io64.output)
1208
1209 # create _output_32
1210 m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
1211 m.d.comb += io32.intermed.eq(self.i.output)
1212 for i in range(8):
1213 m.d.comb += io32.part_ops[i].eq(out_part_ops[i])
1214 m.d.comb += self.o.outputs[2].eq(io32.output)
1215
1216 # create _output_16
1217 m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
1218 m.d.comb += io16.intermed.eq(self.i.output)
1219 for i in range(8):
1220 m.d.comb += io16.part_ops[i].eq(out_part_ops[i])
1221 m.d.comb += self.o.outputs[1].eq(io16.output)
1222
1223 # create _output_8
1224 m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
1225 m.d.comb += io8.intermed.eq(self.i.output)
1226 for i in range(8):
1227 m.d.comb += io8.part_ops[i].eq(out_part_ops[i])
1228 m.d.comb += self.o.outputs[0].eq(io8.output)
1229
1230 for i in range(8):
1231 m.d.comb += self.o.part_ops[i].eq(out_part_ops[i])
1232 m.d.comb += self.o.part_pts.eq(out_part_pts)
1233 m.d.comb += self.o.intermediate_output.eq(self.i.output)
1234
1235 return m
1236
1237
1238 class Mul8_16_32_64(Elaboratable):
1239 """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
1240
1241 Supports partitioning into any combination of 8, 16, 32, and 64-bit
1242 partitions on naturally-aligned boundaries. Supports the operation being
1243 set for each partition independently.
1244
1245 :attribute part_pts: the input partition points. Has a partition point at
1246 multiples of 8 in 0 < i < 64. Each partition point's associated
1247 ``Value`` is a ``Signal``. Modification not supported, except for by
1248 ``Signal.eq``.
1249 :attribute part_ops: the operation for each byte. The operation for a
1250 particular partition is selected by assigning the selected operation
1251 code to each byte in the partition. The allowed operation codes are:
1252
1253 :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
1254 RISC-V's `mul` instruction.
1255 :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
1256 ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
1257 instruction.
1258 :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
1259 where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
1260 `mulhsu` instruction.
1261 :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
1262 ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
1263 instruction.
1264 """
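    # A minimal usage sketch (illustrative only; ``a``, ``b`` and the
    # surrounding ``Module`` are assumed to exist elsewhere): to request two
    # independent 32-bit signed multiplies returning the low halves, enable
    # only the partition point at bit 32 and set every byte's op-code to
    # OP_MUL_LOW:
    #
    #     m.submodules.mul = mul = Mul8_16_32_64()
    #     m.d.comb += [mul.part_pts[i].eq(int(i == 32)) for i in range(8, 64, 8)]
    #     m.d.comb += [op.eq(OP_MUL_LOW) for op in mul.part_ops]
    #     m.d.comb += [mul.a.eq(a), mul.b.eq(b)]   # a, b: 64-bit Signals (assumed)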
1265
1266 def __init__(self, register_levels=()):
1267 """ register_levels: specifies the points in the cascade at which
1268 flip-flops are to be inserted.
1269 """
1270
1271 # parameter(s)
1272 self.register_levels = list(register_levels)
1273
1274 # inputs
1275 self.i = InputData()
1276 self.part_pts = self.i.part_pts
1277 self.part_ops = self.i.part_ops
1278 self.a = self.i.a
1279 self.b = self.i.b
1280
1281 # intermediates (needed for unit tests)
1282 self.intermediate_output = Signal(128)
1283
1284 # output
1285 self.output = Signal(64)
1286
1287 def elaborate(self, platform):
1288 m = Module()
1289
1290 part_pts = self.part_pts
1291
1292 n_inputs = 64 + 4
1293 n_parts = 8 #len(self.part_pts)
1294 t = AllTerms(n_inputs, 128, n_parts, self.register_levels)
1295 m.submodules.allterms = t
1296 m.d.comb += t.i.a.eq(self.a)
1297 m.d.comb += t.i.b.eq(self.b)
1298 m.d.comb += t.i.part_pts.eq(part_pts)
1299 for i in range(8):
1300 m.d.comb += t.i.part_ops[i].eq(self.part_ops[i])
1301
1302 terms = t.o.terms
1303
1304 add_reduce = AddReduce(terms,
1305 128,
1306 self.register_levels,
1307 t.o.part_pts,
1308 t.o.part_ops)
1309
1310 out_part_ops = add_reduce.o.part_ops
1311 out_part_pts = add_reduce.o.part_pts
1312
1313 m.submodules.add_reduce = add_reduce
1314
1315 interm = Intermediates(128, 8, part_pts)
1316 m.submodules.intermediates = interm
1317 m.d.comb += interm.i.eq(add_reduce.o)
1318
1319 # final output
1320 m.submodules.finalout = finalout = FinalOut(128, 8, part_pts)
1321 m.d.comb += finalout.i.eq(interm.o)
1322 m.d.comb += self.output.eq(finalout.out)
1323 m.d.comb += self.intermediate_output.eq(finalout.intermediate_output)
1324
1325 return m
1326
1327
1328 if __name__ == "__main__":
1329 m = Mul8_16_32_64()
1330 main(m, ports=[m.a,
1331 m.b,
1332 m.intermediate_output,
1333 m.output,
1334 *m.part_ops,
1335 *m.part_pts.values()])