# in-place expansion of partition points
# ieee754fpu.git: src/ieee754/part_mul_add/multiply.py
1 # SPDX-License-Identifier: LGPL-2.1-or-later
2 # See Notices.txt for copyright information
3 """Integer Multiplication."""
4
5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
6 from nmigen.hdl.ast import Assign
7 from abc import ABCMeta, abstractmethod
8 from nmigen.cli import main
9 from functools import reduce
10 from operator import or_
11
12
class PartitionPoints(dict):
    """Partition points and corresponding ``Value``s.

    The points at where an ALU is partitioned along with ``Value``s that
    specify if the corresponding partition points are enabled.

    For example: ``{1: True, 5: True, 10: True}`` with
    ``width == 16`` specifies that the ALU is split into 4 sections:
    * bits 0 <= ``i`` < 1
    * bits 1 <= ``i`` < 5
    * bits 5 <= ``i`` < 10
    * bits 10 <= ``i`` < 16

    If the partition_points were instead ``{1: True, 5: a, 10: True}``
    where ``a`` is a 1-bit ``Signal``:
    * If ``a`` is asserted:
        * bits 0 <= ``i`` < 1
        * bits 1 <= ``i`` < 5
        * bits 5 <= ``i`` < 10
        * bits 10 <= ``i`` < 16
    * Otherwise
        * bits 0 <= ``i`` < 1
        * bits 1 <= ``i`` < 10
        * bits 10 <= ``i`` < 16
    """

    def __init__(self, partition_points=None):
        """Create a new ``PartitionPoints``.

        :param partition_points: the input partition points to values mapping.
        :raises TypeError: if a key is not an ``int``
        :raises ValueError: if a key is negative
        """
        super().__init__()
        if partition_points is not None:
            for point, enabled in partition_points.items():
                if not isinstance(point, int):
                    raise TypeError("point must be a non-negative integer")
                if point < 0:
                    raise ValueError("point must be a non-negative integer")
                self[point] = Value.wrap(enabled)

    def like(self, name=None, src_loc_at=0, mul=1):
        """Create a new ``PartitionPoints`` with ``Signal``s for all values.

        :param name: the base name for the new ``Signal``s.
        :param mul: a multiplication factor on the indices
        """
        if name is None:
            name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
        retval = PartitionPoints()
        for point, enabled in self.items():
            point *= mul  # in-place expansion of the partition point
            retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
        return retval

    def eq(self, rhs):
        """Assign ``PartitionPoints`` using ``Signal.eq``.

        :raises ValueError: if ``rhs`` does not have the same point set
        """
        if set(self.keys()) != set(rhs.keys()):
            raise ValueError("incompatible point set")
        for point, enabled in self.items():
            yield enabled.eq(rhs[point])

    def as_mask(self, width, mul=1):
        """Create a bit-mask from `self`.

        Each bit in the returned mask is clear only if the partition point at
        the same bit-index is enabled.

        :param width: the bit width of the resulting mask
        :param mul: a "multiplier" which in-place expands the partition points
                    typically set to "2" when used for multipliers
        """
        bits = []
        for i in range(width):
            pi = i / mul  # back-map mask bit index to partition point index
            # keys are ints: convert explicitly rather than relying on
            # float/int hash equality (previously indexed with e.g. 2.0)
            if pi.is_integer() and int(pi) in self:
                bits.append(~self[int(pi)])
            else:
                bits.append(True)
        return Cat(*bits)

    def get_max_partition_count(self, width):
        """Get the maximum number of partitions.

        Gets the number of partitions when all partition points are enabled.
        """
        retval = 1
        for point in self.keys():
            if point < width:
                retval += 1
        return retval

    def fits_in_width(self, width):
        """Check if all partition points are smaller than `width`."""
        for point in self.keys():
            if point >= width:
                return False
        return True

    def part_byte(self, index, mfactor=1):  # mfactor used for "expanding"
        """Get the enable ``Value`` for the partition point at byte *index*.

        Indices -1 and 7 are the outer boundaries of the (8-byte) register,
        which are always treated as enabled (constant 1).
        """
        if index == -1 or index == 7:
            return C(True, 1)
        assert index >= 0 and index < 8
        return self[(index * 8 + 8)*mfactor]
116
117
class FullAdder(Elaboratable):
    """Full Adder.

    :attribute in0: the first input
    :attribute in1: the second input
    :attribute in2: the third input
    :attribute sum: the sum output
    :attribute carry: the carry output

    Rather than do individual full adders (and have an array of them,
    which would be very slow to simulate), this module can specify the
    bit width of the inputs and outputs: in effect it performs multiple
    Full 3-2 Add operations "in parallel".
    """

    def __init__(self, width):
        """Create a ``FullAdder``.

        :param width: the bit width of the input and output
        """
        self.in0 = Signal(width, reset_less=True)
        self.in1 = Signal(width, reset_less=True)
        self.in2 = Signal(width, reset_less=True)
        self.sum = Signal(width, reset_less=True)
        self.carry = Signal(width, reset_less=True)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        comb = m.d.comb
        a, b, c = self.in0, self.in1, self.in2
        # classic 3:2 compressor: XOR for the sum bit,
        # majority function for the carry bit
        comb += self.sum.eq(a ^ b ^ c)
        comb += self.carry.eq((a & b) | (b & c) | (c & a))
        return m
152
153
class MaskedFullAdder(Elaboratable):
    """Masked Full Adder.

    :attribute mask: the carry partition mask
    :attribute in0: the first input
    :attribute in1: the second input
    :attribute in2: the third input
    :attribute sum: the sum output
    :attribute mcarry: the masked carry output

    FullAdders are always used with a "mask" on the output.  To keep
    the graphviz "clean", this class performs the masking here rather
    than inside a large for-loop.

    See the following discussion as to why this is no longer derived
    from FullAdder.  Each carry is shifted here *before* being ANDed
    with the mask, so that an AOI cell may be used (which is more
    gate-efficient)
    https://en.wikipedia.org/wiki/AND-OR-Invert
    https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
    """

    def __init__(self, width):
        """Create a ``MaskedFullAdder``.

        :param width: the bit width of the input and output
        """
        self.width = width
        self.mask = Signal(width, reset_less=True)
        self.mcarry = Signal(width, reset_less=True)
        self.in0 = Signal(width, reset_less=True)
        self.in1 = Signal(width, reset_less=True)
        self.in2 = Signal(width, reset_less=True)
        self.sum = Signal(width, reset_less=True)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        comb = m.d.comb
        inputs = (self.in0, self.in1, self.in2)

        # sum is the plain (unmasked) XOR of the three inputs
        comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)

        # shift each input up by one bit first: the carry lands on the
        # *next* bit, and shifting before the AND allows AOI cells
        shifted = [Signal(self.width, name="s%d" % (i + 1), reset_less=True)
                   for i in range(3)]
        for sig, inp in zip(shifted, inputs):
            comb += sig.eq(Cat(0, inp))

        # pairwise-AND the shifted inputs, gated by the partition mask
        carries = [Signal(self.width, name="c%d" % (i + 1), reset_less=True)
                   for i in range(3)]
        for csig, (x, y) in zip(carries, ((0, 1), (1, 2), (2, 0))):
            comb += csig.eq(shifted[x] & shifted[y] & self.mask)

        # majority-function carry, already masked
        comb += self.mcarry.eq(carries[0] | carries[1] | carries[2])
        return m
207
208
class PartitionedAdder(Elaboratable):
    """Partitioned Adder.

    Performs the final add. The partition points are included in the
    actual add (in one of the operands only), which causes a carry over
    to the next bit. Then the final output *removes* the extra bits from
    the result.

    partition: .... P... P... P... P... (32 bits)
    a        : .... .... .... .... .... (32 bits)
    b        : .... .... .... .... .... (32 bits)
    exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
    exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
    exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
    o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)

    :attribute width: the bit width of the input and output. Read-only.
    :attribute a: the first input to the adder
    :attribute b: the second input to the adder
    :attribute output: the sum output
    :attribute partition_points: the input partition points. Modification not
        supported, except for by ``Signal.eq``.
    """

    def __init__(self, width, partition_points, partition_step=1):
        """Create a ``PartitionedAdder``.

        :param width: the bit width of the input and output
        :param partition_points: the input partition points
        :param partition_step: a multiplier (typically double) step
            which in-place "expands" the partition points
        :raises ValueError: if partition_points do not fit in width
        """
        self.width = width
        self.pmul = partition_step
        self.a = Signal(width, reset_less=True)
        self.b = Signal(width, reset_less=True)
        self.output = Signal(width, reset_less=True)
        self.partition_points = PartitionPoints(partition_points)
        if not self.partition_points.fits_in_width(width):
            raise ValueError("partition_points doesn't fit in width")
        # the expanded add is one bit wider per partition point: each
        # point gets an extra "break" bit inserted into the operands
        expanded_width = 0
        for i in range(self.width):
            if i in self.partition_points:
                expanded_width += 1
            expanded_width += 1
        self._expanded_width = expanded_width

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        expanded_a = Signal(self._expanded_width, reset_less=True)
        expanded_b = Signal(self._expanded_width, reset_less=True)
        expanded_o = Signal(self._expanded_width, reset_less=True)

        expanded_index = 0
        # store bits in a list, use Cat later. graphviz is much cleaner
        al, bl, ol, ea, eb, eo = [],[],[],[],[],[]

        # partition points are "breaks" (extra zeros or 1s) in what would
        # otherwise be a massive long add. when the "break" points are 0,
        # whatever is in it (in the output) is discarded. however when
        # there is a "1", it causes a roll-over carry to the *next* bit.
        # we still ignore the "break" bit in the [intermediate] output,
        # however by that time we've got the effect that we wanted: the
        # carry has been carried *over* the break point.

        for i in range(self.width):
            pi = i/self.pmul # double the range of the partition point test
            # note: pi is a float here; the dict membership/lookup relies
            # on python's hash(2.0) == hash(2) numeric equality
            if pi.is_integer() and pi in self.partition_points:
                # add extra bit set to 0 + 0 for enabled partition points
                # and 1 + 0 for disabled partition points
                ea.append(expanded_a[expanded_index])
                al.append(~self.partition_points[pi]) # add extra bit in a
                eb.append(expanded_b[expanded_index])
                bl.append(C(0)) # yes, add a zero
                expanded_index += 1 # skip the extra point. NOT in the output
            ea.append(expanded_a[expanded_index])
            eb.append(expanded_b[expanded_index])
            eo.append(expanded_o[expanded_index])
            al.append(self.a[i])
            bl.append(self.b[i])
            ol.append(self.output[i])
            expanded_index += 1

        # combine above using Cat
        m.d.comb += Cat(*ea).eq(Cat(*al))
        m.d.comb += Cat(*eb).eq(Cat(*bl))
        m.d.comb += Cat(*ol).eq(Cat(*eo))

        # use only one addition to take advantage of look-ahead carry and
        # special hardware on FPGAs
        m.d.comb += expanded_o.eq(expanded_a + expanded_b)
        return m
302
303
# a full adder (3:2 compressor) consumes 3 inputs per group
FULL_ADDER_INPUT_COUNT = 3
305
class AddReduceData:
    """Holds the per-stage state of an add-reduction: the terms still to
    be summed, the partition points, and the per-partition op codes.
    """

    def __init__(self, ppoints, n_inputs, output_width, n_parts):
        # 2-bit operation code per partition
        self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
                         for i in range(n_parts)]
        # the terms to be summed at this stage
        self.inputs = [Signal(output_width, name=f"inputs_{i}",
                              reset_less=True)
                       for i in range(n_inputs)]
        # stage-local copy of the partition points
        self.reg_partition_points = ppoints.like()

    def eq_from(self, reg_partition_points, inputs, part_ops):
        """Return assignments copying the given values into this instance."""
        eqs = [self.reg_partition_points.eq(reg_partition_points)]
        eqs += [lhs.eq(inputs[i]) for i, lhs in enumerate(self.inputs)]
        eqs += [lhs.eq(part_ops[i]) for i, lhs in enumerate(self.part_ops)]
        return eqs

    def eq(self, rhs):
        """Return assignments copying another ``AddReduceData``."""
        return self.eq_from(rhs.reg_partition_points, rhs.inputs, rhs.part_ops)
325
326
class FinalReduceData:
    """Holds the output of the final reduction step: the completed sum,
    the partition points, and the per-partition op codes.
    """

    def __init__(self, ppoints, output_width, n_parts):
        # 2-bit operation code per partition
        self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
                         for i in range(n_parts)]
        # the single (reduced) sum
        self.output = Signal(output_width, reset_less=True)
        # stage-local copy of the partition points
        self.reg_partition_points = ppoints.like()

    def eq_from(self, reg_partition_points, output, part_ops):
        """Return assignments copying the given values into this instance."""
        eqs = [self.reg_partition_points.eq(reg_partition_points),
               self.output.eq(output)]
        eqs += [lhs.eq(part_ops[i]) for i, lhs in enumerate(self.part_ops)]
        return eqs

    def eq(self, rhs):
        """Return assignments copying another ``FinalReduceData``."""
        return self.eq_from(rhs.reg_partition_points, rhs.output, rhs.part_ops)
343
344
class FinalAdd(Elaboratable):
    """ Final stage of add reduce: 0, 1 or 2 terms remain.  Zero terms
        produce a constant 0, a single term passes straight through, and
        two terms go through a ``PartitionedAdder``.
    """

    def __init__(self, n_inputs, output_width, n_parts, register_levels,
                 partition_points):
        self.i = AddReduceData(partition_points, n_inputs,
                               output_width, n_parts)
        self.o = FinalReduceData(partition_points, output_width, n_parts)
        self.output_width = output_width
        self.n_inputs = n_inputs
        self.n_parts = n_parts
        self.register_levels = list(register_levels)
        self.partition_points = PartitionPoints(partition_points)
        if not self.partition_points.fits_in_width(output_width):
            raise ValueError("partition_points doesn't fit in output_width")

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        comb = m.d.comb

        out_wid = self.output_width
        output = Signal(out_wid, reset_less=True)
        if self.n_inputs == 0:
            # no inputs at all: default the output to zero
            comb += output.eq(0)
        elif self.n_inputs == 1:
            # one input: pass it straight through
            comb += output.eq(self.i.inputs[0])
        else:
            # base case: exactly two inputs go through a partitioned add
            assert self.n_inputs == 2
            adder = PartitionedAdder(out_wid, self.i.reg_partition_points)
            m.submodules.final_adder = adder
            comb += [adder.a.eq(self.i.inputs[0]),
                     adder.b.eq(self.i.inputs[1]),
                     output.eq(adder.output)]

        # forward sum, partition points and op codes to the output
        comb += self.o.eq_from(self.i.reg_partition_points, output,
                               self.i.part_ops)

        return m
388
389
class AddReduceSingle(Elaboratable):
    """Add list of numbers together.

    :attribute inputs: input ``Signal``s to be summed. Modification not
        supported, except for by ``Signal.eq``.
    :attribute register_levels: List of nesting levels that should have
        pipeline registers.
    :attribute output: output sum.
    :attribute partition_points: the input partition points. Modification not
        supported, except for by ``Signal.eq``.
    """

    def __init__(self, n_inputs, output_width, n_parts, register_levels,
                 partition_points):
        """Create an ``AddReduce``.

        :param inputs: input ``Signal``s to be summed.
        :param output_width: bit-width of ``output``.
        :param register_levels: List of nesting levels that should have
            pipeline registers.
        :param partition_points: the input partition points.
        :raises ValueError: if partition_points do not fit in output_width,
            or if register_levels exceed the available adder levels
        """
        self.n_inputs = n_inputs
        self.n_parts = n_parts
        self.output_width = output_width
        self.i = AddReduceData(partition_points, n_inputs,
                               output_width, n_parts)
        self.register_levels = list(register_levels)
        self.partition_points = PartitionPoints(partition_points)
        if not self.partition_points.fits_in_width(output_width):
            raise ValueError("partition_points doesn't fit in output_width")

        max_level = AddReduceSingle.get_max_level(n_inputs)
        for level in self.register_levels:
            if level > max_level:
                raise ValueError(
                    "not enough adder levels for specified register levels")

        # the groups of 3 inputs that each feed one full adder at this
        # level, and the number of terms handed to the *next* level
        self.groups = AddReduceSingle.full_adder_groups(n_inputs)
        n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
        self.o = AddReduceData(partition_points, n_terms, output_width, n_parts)

    @staticmethod
    def calc_n_inputs(n_inputs, groups):
        """Number of terms the next level receives: each group of 3
        produces 2 (sum and carry), plus 0-2 left-over inputs.
        """
        retval = len(groups)*2
        if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
            retval += 1
        elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
            retval += 2
        else:
            assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
        return retval

    @staticmethod
    def get_max_level(input_count):
        """Get the maximum level.

        All ``register_levels`` must be less than or equal to the maximum
        level.
        """
        retval = 0
        # simulate the reduction: each pass turns every group of 3 into
        # 2 terms, until no full group of 3 remains
        while True:
            groups = AddReduceSingle.full_adder_groups(input_count)
            if len(groups) == 0:
                return retval
            input_count %= FULL_ADDER_INPUT_COUNT
            input_count += 2 * len(groups)
            retval += 1

    @staticmethod
    def full_adder_groups(input_count):
        """Get ``inputs`` indices for which a full adder should be built."""
        return range(0,
                     input_count - FULL_ADDER_INPUT_COUNT + 1,
                     FULL_ADDER_INPUT_COUNT)

    def create_next_terms(self):
        """ create next intermediate terms, for linking up in elaborate, below
        """
        terms = []
        adders = []

        # create full adders for this recursive level.
        # this shrinks N terms to 2 * (N // 3) plus the remainder
        for i in self.groups:
            adder_i = MaskedFullAdder(self.output_width)
            adders.append((i, adder_i))
            # add both the sum and the masked-carry to the next level.
            # 3 inputs have now been reduced to 2...
            terms.append(adder_i.sum)
            terms.append(adder_i.mcarry)
        # handle the remaining inputs.
        if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
            terms.append(self.i.inputs[-1])
        elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
            # Just pass the terms to the next layer, since we wouldn't gain
            # anything by using a half adder since there would still be 2 terms
            # and just passing the terms to the next layer saves gates.
            terms.append(self.i.inputs[-2])
            terms.append(self.i.inputs[-1])
        else:
            assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0

        return terms, adders

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()

        terms, adders = self.create_next_terms()

        # copy the intermediate terms to the output
        for i, value in enumerate(terms):
            m.d.comb += self.o.inputs[i].eq(value)

        # copy reg part points and part ops to output
        m.d.comb += self.o.reg_partition_points.eq(self.i.reg_partition_points)
        m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
                     for i in range(len(self.i.part_ops))]

        # set up the partition mask (for the adders)
        part_mask = Signal(self.output_width, reset_less=True)

        # get partition points as a mask (mul=2: the partition points are
        # in-place expanded to the double-width intermediate terms)
        mask = self.i.reg_partition_points.as_mask(self.output_width, mul=2)
        m.d.comb += part_mask.eq(mask)

        # add and link the intermediate term modules
        for i, (iidx, adder_i) in enumerate(adders):
            setattr(m.submodules, f"adder_{i}", adder_i)

            m.d.comb += adder_i.in0.eq(self.i.inputs[iidx])
            m.d.comb += adder_i.in1.eq(self.i.inputs[iidx + 1])
            m.d.comb += adder_i.in2.eq(self.i.inputs[iidx + 2])
            m.d.comb += adder_i.mask.eq(part_mask)

        return m
527
528
class AddReduce(Elaboratable):
    """Recursively Add list of numbers together.

    :attribute inputs: input ``Signal``s to be summed. Modification not
        supported, except for by ``Signal.eq``.
    :attribute register_levels: List of nesting levels that should have
        pipeline registers.
    :attribute output: output sum.
    :attribute partition_points: the input partition points. Modification not
        supported, except for by ``Signal.eq``.
    """

    def __init__(self, inputs, output_width, register_levels, partition_points,
                 part_ops):
        """Create an ``AddReduce``.

        :param inputs: input ``Signal``s to be summed.
        :param output_width: bit-width of ``output``.
        :param register_levels: List of nesting levels that should have
            pipeline registers.
        :param partition_points: the input partition points.
        :param part_ops: the per-partition operation codes.
        """
        self.inputs = inputs
        self.part_ops = part_ops
        n_parts = len(part_ops)
        self.o = FinalReduceData(partition_points, output_width, n_parts)
        self.output_width = output_width
        self.register_levels = register_levels
        self.partition_points = partition_points

        self.create_levels()

    @staticmethod
    def get_max_level(input_count):
        """Delegate to ``AddReduceSingle.get_max_level``."""
        return AddReduceSingle.get_max_level(input_count)

    @staticmethod
    def next_register_levels(register_levels):
        """``Iterable`` of ``register_levels`` for next recursive level."""
        for level in register_levels:
            if level > 0:
                yield level - 1

    def create_levels(self):
        """creates reduction levels"""

        mods = []
        next_levels = self.register_levels
        partition_points = self.partition_points
        part_ops = self.part_ops
        n_parts = len(part_ops)
        inputs = self.inputs
        ilen = len(inputs)
        # keep stacking AddReduceSingle levels until the remaining input
        # count can no longer feed a full 3:2 group; FinalAdd takes the
        # last 0, 1 or 2 terms
        while True:
            groups = AddReduceSingle.full_adder_groups(len(inputs))
            if len(groups) == 0:
                break
            next_level = AddReduceSingle(ilen, self.output_width, n_parts,
                                         next_levels, partition_points)
            mods.append(next_level)
            next_levels = list(AddReduce.next_register_levels(next_levels))
            # each new level's inputs are the previous level's outputs
            partition_points = next_level.i.reg_partition_points
            inputs = next_level.o.inputs
            ilen = len(inputs)
            part_ops = next_level.i.part_ops

        next_level = FinalAdd(ilen, self.output_width, n_parts,
                              next_levels, partition_points)
        mods.append(next_level)

        self.levels = mods

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()

        for i, next_level in enumerate(self.levels):
            setattr(m.submodules, "next_level%d" % i, next_level)

        partition_points = self.partition_points
        inputs = self.inputs
        part_ops = self.part_ops
        n_parts = len(part_ops)
        n_inputs = len(inputs)
        output_width = self.output_width
        i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
        m.d.comb += i.eq_from(partition_points, inputs, part_ops)
        # chain the levels together: a pipeline register (sync) where
        # the level's register_levels contain 0, combinatorial otherwise
        for idx in range(len(self.levels)):
            mcur = self.levels[idx]
            if 0 in mcur.register_levels:
                m.d.sync += mcur.i.eq(i)
            else:
                m.d.comb += mcur.i.eq(i)
            i = mcur.o # for next loop

        # output comes from last module
        m.d.comb += self.o.eq(i)

        return m
628
629
# per-partition operation codes: which half of the double-width product
# is selected (see IntermediateOut), and the requested signedness
OP_MUL_LOW = 0                   # take the LOW half of the product
OP_MUL_SIGNED_HIGH = 1           # HIGH half, both operands signed
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # HIGH half: a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3         # HIGH half, both operands unsigned
634
635
def get_term(value, shift=0, enabled=None):
    """Optionally gate *value* with *enabled* and pad it with *shift* zeros.

    :param value: the term to process
    :param shift: number of zero bits to prepend (LSB side); must be >= 0
    :param enabled: optional gate; when given, the term is muxed to 0
        when *enabled* is deasserted
    """
    term = value if enabled is None else Mux(enabled, value, 0)
    if shift == 0:
        return term
    assert shift > 0
    return Cat(Repl(C(0, 1), shift), term)
644
645
class ProductTerm(Elaboratable):
    """ this class creates a single product term (a[..]*b[..]).
        it has a design flaw in that is the *output* that is selected,
        where the multiplication(s) are combinatorially generated
        all the time.
    """

    def __init__(self, width, twidth, pbwid, a_index, b_index):
        """Create a ``ProductTerm``.

        :param width: bit width of one byte-part of the operands
        :param twidth: bit width of the (full) term output
        :param pbwid: width of the partition-byte enable bits
        :param a_index: byte index into operand a
        :param b_index: byte index into operand b
        """
        self.a_index = a_index
        self.b_index = b_index
        # bit position of this partial product within the full result
        shift = 8 * (self.a_index + self.b_index)
        self.pwidth = width
        self.twidth = twidth
        self.width = width*2
        self.shift = shift

        self.ti = Signal(self.width, reset_less=True)   # raw product
        self.term = Signal(twidth, reset_less=True)     # shifted/gated term
        self.a = Signal(twidth//2, reset_less=True)
        self.b = Signal(twidth//2, reset_less=True)
        self.pb_en = Signal(pbwid, reset_less=True)     # partition enables

        # collect the partition-enable bits spanned by this term: if any
        # is set the term crosses a partition break and must be gated off
        self.tl = tl = []
        min_index = min(self.a_index, self.b_index)
        max_index = max(self.a_index, self.b_index)
        for i in range(min_index, max_index):
            tl.append(self.pb_en[i])
        name = "te_%d_%d" % (self.a_index, self.b_index)
        if len(tl) > 0:
            term_enabled = Signal(name=name, reset_less=True)
        else:
            # term never crosses a partition point: always enabled
            term_enabled = None
        self.enabled = term_enabled
        self.term.name = "term_%d_%d" % (a_index, b_index) # rename

    def elaborate(self, platform):

        m = Module()
        if self.enabled is not None:
            # enabled only when none of the spanned partition bits is set
            m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))

        bsa = Signal(self.width, reset_less=True)
        bsb = Signal(self.width, reset_less=True)
        a_index, b_index = self.a_index, self.b_index
        pwidth = self.pwidth
        # select the relevant part of each operand and multiply
        m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
        m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
        m.d.comb += self.ti.eq(bsa * bsb)
        # gate and shift the product into position in the full term
        m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
        """
        #TODO: sort out width issues, get inputs a/b switched on/off.
        #data going into Muxes is 1/2 the required width

        pwidth = self.pwidth
        width = self.width
        bsa = Signal(self.twidth//2, reset_less=True)
        bsb = Signal(self.twidth//2, reset_less=True)
        asel = Signal(width, reset_less=True)
        bsel = Signal(width, reset_less=True)
        a_index, b_index = self.a_index, self.b_index
        m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
        m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
        m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
        m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
        m.d.comb += self.ti.eq(bsa * bsb)
        m.d.comb += self.term.eq(self.ti)
        """

        return m
715
716
class ProductTerms(Elaboratable):
    """ creates a bank of product terms.  also performs the actual
        bit-selection.  this class is to be wrapped with a for-loop on
        the "a" operand; it creates a second-level for-loop on "b".
    """

    def __init__(self, width, twidth, pbwid, a_index, blen):
        self.a_index = a_index   # byte index into operand a
        self.blen = blen         # number of "b" terms to create
        self.pwidth = width      # bit width of one part
        self.twidth = twidth     # bit width of each term
        self.pbwid = pbwid       # width of the partition-enable bits
        self.a = Signal(twidth//2, reset_less=True)
        self.b = Signal(twidth//2, reset_less=True)
        self.pb_en = Signal(pbwid, reset_less=True)
        self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
                                    for i in range(blen)]

    def elaborate(self, platform):
        """Create one ``ProductTerm`` per "b" index and wire it up."""
        m = Module()
        comb = m.d.comb

        for b_index in range(self.blen):
            term = ProductTerm(self.pwidth, self.twidth, self.pbwid,
                               self.a_index, b_index)
            setattr(m.submodules, "term_%d" % b_index, term)
            comb += [term.a.eq(self.a),
                     term.b.eq(self.b),
                     term.pb_en.eq(self.pb_en),
                     self.terms[b_index].eq(term.term)]

        return m
750
751
class LSBNegTerm(Elaboratable):
    """Creates the two extra terms needed for a signed partial product:
    a width-extended bitwise-NOT term (``nt``) and a "+1 in the LSB"
    term (``nl``), splitting 2's-complement negation into 1's complement
    plus one.  Both are zero when the partition is unsigned or the MSB
    is clear.
    """

    def __init__(self, bit_width):
        self.bit_width = bit_width
        # inputs
        self.part = Signal(reset_less=True)            # partition enable
        self.signed = Signal(reset_less=True)          # signed-mode request
        self.op = Signal(bit_width, reset_less=True)   # operand to invert
        self.msb = Signal(reset_less=True)             # MSB (sign) bit
        # outputs (double width: results land in the HI half)
        self.nt = Signal(bit_width*2, reset_less=True) # bitwise-NOT term
        self.nl = Signal(bit_width*2, reset_less=True) # "+1" (LSB) term

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        bit_wid = self.bit_width
        ext = Repl(0, bit_wid) # extend output to HI part

        # determine sign of each incoming number *in this partition*
        enabled = Signal(reset_less=True)
        m.d.comb += enabled.eq(self.part & self.msb & self.signed)

        # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
        # negation operation is split into a bitwise not and a +1.
        # likewise for 16, 32, and 64-bit values.

        # width-extended 1s complement if a is signed, otherwise zero
        comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))

        # add 1 if signed, otherwise add zero
        comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))

        return m
784
785
class Parts(Elaboratable):
    """Decodes the expanded partition points into one enable bit per
    output partition.  ``parts[i]`` is asserted when lane ``i`` is
    exactly this byte-width: its outer boundary points are enabled and
    no partition point *inside* the lane is.
    """

    def __init__(self, pbwid, epps, n_parts):
        self.pbwid = pbwid
        # inputs
        self.epps = PartitionPoints.like(epps, name="epps") # expanded points
        # outputs
        self.parts = [Signal(name=f"part_{i}", reset_less=True)
                      for i in range(n_parts)]

    def elaborate(self, platform):
        m = Module()

        epps, parts = self.epps, self.parts
        # collect part-bytes (double factor because the input is extended)
        pbs = Signal(self.pbwid, reset_less=True)
        tl = []
        for i in range(self.pbwid):
            pb = Signal(name="pb%d" % i, reset_less=True)
            # note: part_byte(7) is a constant 1 (outer register boundary)
            m.d.comb += pb.eq(epps.part_byte(i))
            tl.append(pb)
        m.d.comb += pbs.eq(Cat(*tl))

        # negated-temporary copy of partition bits
        npbs = Signal.like(pbs, reset_less=True)
        m.d.comb += npbs.eq(~pbs)
        byte_count = 8 // len(parts)
        for i in range(len(parts)):
            pbl = []
            # for i == 0 this index is -1, i.e. the *top* bit of npbs,
            # which is constant 0 (see part_byte above): the register's
            # outer boundary is always treated as an enabled point
            pbl.append(npbs[i * byte_count - 1])
            for j in range(i * byte_count, (i + 1) * byte_count - 1):
                pbl.append(pbs[j])
            pbl.append(npbs[(i + 1) * byte_count - 1])
            value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
            m.d.comb += value.eq(Cat(*pbl))
            # lane active when all collected bits are zero: boundary
            # points enabled (npbs clear) and no inner point enabled
            m.d.comb += parts[i].eq(~(value).bool())

        return m
824
825
class Part(Elaboratable):
    """ a key class which, depending on the partitioning, will determine
        what action to take when parts of the output are signed or unsigned.

        this requires 2 pieces of data *per operand, per partition*:
        whether the MSB is HI/LO (per partition!), and whether a signed
        or unsigned operation has been *requested*.

        once that is determined, signed is basically carried out
        by splitting 2's complement into 1's complement plus one.
        1's complement is just a bit-inversion.

        the extra terms - as separate terms - are then thrown at the
        AddReduce alongside the multiplication part-results.
    """
    def __init__(self, epps, width, n_parts, n_levels, pbwid):
        # note: n_levels is currently unused in this class

        self.pbwid = pbwid
        self.epps = epps  # expanded partition points

        # inputs
        self.a = Signal(64, reset_less=True)
        self.b = Signal(64, reset_less=True)
        self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
                         for i in range(8)]
        self.b_signed = [Signal(name=f"_b_signed_{i}", reset_less=True)
                         for i in range(8)]
        self.pbs = Signal(pbwid, reset_less=True)

        # outputs
        self.parts = [Signal(name=f"part_{i}", reset_less=True)
                      for i in range(n_parts)]

        self.not_a_term = Signal(width, reset_less=True)
        self.neg_lsb_a_term = Signal(width, reset_less=True)
        self.not_b_term = Signal(width, reset_less=True)
        self.neg_lsb_b_term = Signal(width, reset_less=True)

    def elaborate(self, platform):
        m = Module()

        pbs, parts = self.pbs, self.parts
        epps = self.epps
        # decode the partition points into per-partition enables
        m.submodules.p = p = Parts(self.pbwid, epps, len(parts))
        m.d.comb += p.epps.eq(epps)
        parts = p.parts

        byte_count = 8 // len(parts)

        not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
                self.not_a_term, self.neg_lsb_a_term,
                self.not_b_term, self.neg_lsb_b_term)

        byte_width = 8 // len(parts) # byte width
        bit_wid = 8 * byte_width     # bit width
        nat, nbt, nla, nlb = [], [], [], []
        for i in range(len(parts)):
            # work out bit-inverted and +1 term for a.
            # note: the signedness/MSB come from the *other* operand (b),
            # because the correction term for a is driven by b's sign
            pa = LSBNegTerm(bit_wid)
            setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
            m.d.comb += pa.part.eq(parts[i])
            m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
            m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
            m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
            nat.append(pa.nt)
            nla.append(pa.nl)

            # work out bit-inverted and +1 term for b
            pb = LSBNegTerm(bit_wid)
            setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
            m.d.comb += pb.part.eq(parts[i])
            m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
            m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
            m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
            nbt.append(pb.nt)
            nlb.append(pb.nl)

        # concatenate together and return all 4 results.
        m.d.comb += [not_a_term.eq(Cat(*nat)),
                     not_b_term.eq(Cat(*nbt)),
                     neg_lsb_a_term.eq(Cat(*nla)),
                     neg_lsb_b_term.eq(Cat(*nlb)),
                    ]

        return m
911
912
class IntermediateOut(Elaboratable):
    """ selects the HI/LO part of the multiplication, for a given
        bit-width.  the output is also reconstructed in its SIMD
        (partition) lanes.
    """

    def __init__(self, width, out_wid, n_parts):
        self.width = width
        self.n_parts = n_parts
        self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
                         for i in range(8)]
        self.intermed = Signal(out_wid, reset_less=True)
        self.output = Signal(out_wid//2, reset_less=True)

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        w = self.width
        sel = w // 8
        lanes = []
        for i in range(self.n_parts):
            op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
            # each lane's double-width result: pick LOW half when the
            # partition's op code is OP_MUL_LOW, HIGH half otherwise
            lo = self.intermed.part(i * w*2, w)
            hi = self.intermed.part(i * w*2 + w, w)
            comb += op.eq(Mux(self.part_ops[sel * i] == OP_MUL_LOW, lo, hi))
            lanes.append(op)
        comb += self.output.eq(Cat(*lanes))

        return m
941
942
class FinalOut(Elaboratable):
    """ selects the final output based on the partitioning.

    each byte is selectable independently, i.e. it is possible
    that some partitions requested 8-bit computation whilst others
    requested 16 or 32 bit.
    """
    def __init__(self, output_width, n_parts, partition_points):
        """Create a ``FinalOut``.

        :param output_width: width of the full (double-width) product.
        :param n_parts: number of byte-wide partitions.
        :param partition_points: the expanded input partition points.
        """
        self.expanded_part_points = partition_points
        # input interface: the four per-width reconstructed results,
        # partition points and part_ops (see IntermediateData)
        self.i = IntermediateData(partition_points, output_width, n_parts)
        self.out_wid = output_width//2
        # output
        self.out = Signal(self.out_wid, reset_less=True)
        # raw intermediate product, passed through for unit tests
        self.intermediate_output = Signal(output_width, reset_less=True)

    def elaborate(self, platform):
        m = Module()

        # Parts decoders: for each width, which bytes belong to a lane
        # of exactly that width (8/16/32/64-bit respectively)
        eps = self.expanded_part_points
        m.submodules.p_8 = p_8 = Parts(8, eps, 8)
        m.submodules.p_16 = p_16 = Parts(8, eps, 4)
        m.submodules.p_32 = p_32 = Parts(8, eps, 2)
        m.submodules.p_64 = p_64 = Parts(8, eps, 1)

        out_part_pts = self.i.reg_partition_points

        # temporaries
        d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
        d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
        d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]

        # half-width copies of the four per-width reconstructed results
        i8 = Signal(self.out_wid, reset_less=True)
        i16 = Signal(self.out_wid, reset_less=True)
        i32 = Signal(self.out_wid, reset_less=True)
        i64 = Signal(self.out_wid, reset_less=True)

        # feed the (registered) partition points to all four decoders
        m.d.comb += p_8.epps.eq(out_part_pts)
        m.d.comb += p_16.epps.eq(out_part_pts)
        m.d.comb += p_32.epps.eq(out_part_pts)
        m.d.comb += p_64.epps.eq(out_part_pts)

        # name the decoder outputs (d64 is not needed: it is the default)
        for i in range(len(p_8.parts)):
            m.d.comb += d8[i].eq(p_8.parts[i])
        for i in range(len(p_16.parts)):
            m.d.comb += d16[i].eq(p_16.parts[i])
        for i in range(len(p_32.parts)):
            m.d.comb += d32[i].eq(p_32.parts[i])
        m.d.comb += i8.eq(self.i.outputs[0])
        m.d.comb += i16.eq(self.i.outputs[1])
        m.d.comb += i32.eq(self.i.outputs[2])
        m.d.comb += i64.eq(self.i.outputs[3])

        ol = []
        for i in range(8):
            # select one of the outputs: d8 selects i8, d16 selects i16
            # d32 selects i32, and the default is i64.
            # d8 and d16 are ORed together in the first Mux
            # then the 2nd selects either i8 or i16.
            # if neither d8 nor d16 are set, d32 selects either i32 or i64.
            op = Signal(8, reset_less=True, name="op_%d" % i)
            m.d.comb += op.eq(
                Mux(d8[i] | d16[i // 2],
                    Mux(d8[i], i8.part(i * 8, 8), i16.part(i * 8, 8)),
                    Mux(d32[i // 4], i32.part(i * 8, 8), i64.part(i * 8, 8))))
            ol.append(op)
        # concatenate the 8 selected bytes into the final result
        m.d.comb += self.out.eq(Cat(*ol))
        m.d.comb += self.intermediate_output.eq(self.i.intermediate_output)
        return m
1011
1012
class OrMod(Elaboratable):
    """ ORs four values together in a hierarchical tree
    """
    def __init__(self, wid):
        self.wid = wid
        # the four inputs to be ORed
        self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
                     for i in range(4)]
        # OR of all four inputs
        self.orout = Signal(wid, reset_less=True)

    def elaborate(self, platform):
        m = Module()
        # two-level tree: pairwise OR, then OR of the pair results
        left = Signal(self.wid, reset_less=True)
        right = Signal(self.wid, reset_less=True)
        in0, in1, in2, in3 = self.orin
        m.d.comb += [left.eq(in0 | in1),
                     right.eq(in2 | in3),
                     self.orout.eq(left | right)]

        return m
1031
1032
class Signs(Elaboratable):
    """ determines whether a or b are signed numbers
    based on the required operation type (OP_MUL_*)
    """

    def __init__(self):
        # operation code (OP_MUL_*) being decoded
        self.part_ops = Signal(2, reset_less=True)
        # decoded sign flags for operands a and b
        self.a_signed = Signal(reset_less=True)
        self.b_signed = Signal(reset_less=True)

    def elaborate(self, platform):

        m = Module()

        op = self.part_ops
        # a is signed for every op except mulhu;
        # b is signed only for mul (low) and mulh (signed-high)
        m.d.comb += self.a_signed.eq(op != OP_MUL_UNSIGNED_HIGH)
        m.d.comb += self.b_signed.eq((op == OP_MUL_LOW)
                                     | (op == OP_MUL_SIGNED_HIGH))

        return m
1054
1055
class IntermediateData:
    """Data interface carrying the four per-width reconstructed results,
    the partition points, the per-byte operation codes and the raw
    intermediate product (the latter needed for unit tests).
    """

    def __init__(self, ppoints, output_width, n_parts):
        self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
                         for i in range(n_parts)]
        self.reg_partition_points = ppoints.like()
        # one result per computation width: 8, 16, 32, 64-bit lanes
        self.outputs = [Signal(output_width, name="io%d" % i, reset_less=True)
                        for i in range(4)]
        # intermediates (needed for unit tests)
        self.intermediate_output = Signal(output_width)

    def eq_from(self, reg_partition_points, outputs, intermediate_output,
                part_ops):
        """Return assignments wiring the given fields to this instance."""
        eqs = [self.reg_partition_points.eq(reg_partition_points),
               self.intermediate_output.eq(intermediate_output)]
        eqs += [self.outputs[i].eq(outputs[i]) for i in range(4)]
        eqs += [po.eq(part_ops[i]) for i, po in enumerate(self.part_ops)]
        return eqs

    def eq(self, rhs):
        """Return assignments wiring every field of *rhs* to this instance."""
        return self.eq_from(rhs.reg_partition_points, rhs.outputs,
                            rhs.intermediate_output, rhs.part_ops)
1079
1080
class AllTermsData:
    """Input data interface for ``AllTerms``: the two 64-bit operands,
    the (expanded) partition points and the per-byte operation codes.
    """

    def __init__(self, partition_points):
        self.a = Signal(64)
        self.b = Signal(64)
        self.epps = partition_points.like()
        self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]

    def eq_from(self, epps, a, b, part_ops):
        """Return assignments wiring the given fields to this instance.

        BUGFIX: the parameter list previously read ``(epps, inputs,
        part_ops)`` while the body referenced undefined names ``a`` and
        ``b``, and ``eq`` passed four arguments -- so any call raised.
        The signature now matches both the body and the ``eq`` caller.
        """
        return [self.epps.eq(epps)] + \
               [self.a.eq(a), self.b.eq(b)] + \
               [self.part_ops[i].eq(part_ops[i])
                for i in range(len(self.part_ops))]

    def eq(self, rhs):
        """Return assignments wiring every field of *rhs* to this instance."""
        return self.eq_from(rhs.epps, rhs.a, rhs.b, rhs.part_ops)
1097
1098
class AllTerms(Elaboratable):
    """Set of terms to be added together
    """

    def __init__(self, n_inputs, output_width, n_parts, register_levels,
                 partition_points):
        """Create an ``AllTerms``.

        :param n_inputs: number of term inputs on the output interface.
        :param output_width: bit-width of each output term.
        :param n_parts: number of byte-wide partitions.
        :param register_levels: List of nesting levels that should have
        pipeline registers.
        :param partition_points: the input partition points.
        """
        self.i = AllTermsData(partition_points)
        self.register_levels = register_levels
        self.n_inputs = n_inputs
        self.n_parts = n_parts
        self.output_width = output_width
        self.o = AddReduceData(self.i.epps, n_inputs,
                               output_width, n_parts)

    def elaborate(self, platform):
        m = Module()

        eps = self.i.epps

        # collect part-bytes: one enable bit per byte boundary
        pbs = Signal(8, reset_less=True)
        tl = []
        for i in range(8):
            pb = Signal(name="pb%d" % i, reset_less=True)
            m.d.comb += pb.eq(eps.part_byte(i))
            tl.append(pb)
        m.d.comb += pbs.eq(Cat(*tl))

        # local variables: one sign-decoder per byte-wide op code
        signs = []
        for i in range(8):
            s = Signs()
            signs.append(s)
            setattr(m.submodules, "signs%d" % i, s)
            m.d.comb += s.part_ops.eq(self.i.part_ops[i])

        # per-width sign/negation term generators (8/16/32/64-bit)
        n_levels = len(self.register_levels)+1
        m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
        m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
        m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
        m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
        nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
        for mod in [part_8, part_16, part_32, part_64]:
            m.d.comb += mod.a.eq(self.i.a)
            m.d.comb += mod.b.eq(self.i.b)
            for i in range(len(signs)):
                m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
                m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
            m.d.comb += mod.pbs.eq(pbs)
            nat_l.append(mod.not_a_term)
            nbt_l.append(mod.not_b_term)
            nla_l.append(mod.neg_lsb_a_term)
            nlb_l.append(mod.neg_lsb_b_term)

        # 8x8 byte-by-byte partial products.  NOTE: the append order here
        # fixes each term's index in self.o.inputs -- do not reorder.
        terms = []

        for a_index in range(8):
            t = ProductTerms(8, 128, 8, a_index, 8)
            setattr(m.submodules, "terms_%d" % a_index, t)

            m.d.comb += t.a.eq(self.i.a)
            m.d.comb += t.b.eq(self.i.b)
            m.d.comb += t.pb_en.eq(pbs)

            for term in t.terms:
                terms.append(term)

        # it's fine to bitwise-or data together since they are never enabled
        # at the same time
        m.submodules.nat_or = nat_or = OrMod(128)
        m.submodules.nbt_or = nbt_or = OrMod(128)
        m.submodules.nla_or = nla_or = OrMod(128)
        m.submodules.nlb_or = nlb_or = OrMod(128)
        for l, mod in [(nat_l, nat_or),
                       (nbt_l, nbt_or),
                       (nla_l, nla_or),
                       (nlb_l, nlb_or)]:
            for i in range(len(l)):
                m.d.comb += mod.orin[i].eq(l[i])
            terms.append(mod.orout)

        # copy the intermediate terms to the output
        for i, value in enumerate(terms):
            m.d.comb += self.o.inputs[i].eq(value)

        # copy reg part points and part ops to output
        m.d.comb += self.o.reg_partition_points.eq(eps)
        m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
                     for i in range(len(self.i.part_ops))]

        return m
1198
1199
class Intermediates(Elaboratable):
    """ Intermediate output modules
    """

    def __init__(self, output_width, n_parts, partition_points):
        self.i = FinalReduceData(partition_points, output_width, n_parts)
        self.o = IntermediateData(partition_points, output_width, n_parts)

    def elaborate(self, platform):
        m = Module()

        out_part_ops = self.i.part_ops
        out_part_pts = self.i.reg_partition_points

        # one IntermediateOut per lane width; output slot 0..3 holds the
        # 8/16/32/64-bit reconstruction respectively.
        widths = [("io8", 8, 8, 0),
                  ("io16", 16, 4, 1),
                  ("io32", 32, 2, 2),
                  ("io64", 64, 1, 3)]
        for name, wid, n_parts, slot in widths:
            io = IntermediateOut(wid, 128, n_parts)
            setattr(m.submodules, name, io)
            m.d.comb += io.intermed.eq(self.i.output)
            for i in range(8):
                m.d.comb += io.part_ops[i].eq(out_part_ops[i])
            m.d.comb += self.o.outputs[slot].eq(io.output)

        # pass part_ops, partition points and the raw product through
        for i in range(8):
            m.d.comb += self.o.part_ops[i].eq(out_part_ops[i])
        m.d.comb += self.o.reg_partition_points.eq(out_part_pts)
        m.d.comb += self.o.intermediate_output.eq(self.i.output)

        return m
1248
1249
class Mul8_16_32_64(Elaboratable):
    """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.

    Supports partitioning into any combination of 8, 16, 32, and 64-bit
    partitions on naturally-aligned boundaries. Supports the operation being
    set for each partition independently.

    :attribute part_pts: the input partition points. Has a partition point at
        multiples of 8 in 0 < i < 64. Each partition point's associated
        ``Value`` is a ``Signal``. Modification not supported, except for by
        ``Signal.eq``.
    :attribute part_ops: the operation for each byte. The operation for a
        particular partition is selected by assigning the selected operation
        code to each byte in the partition. The allowed operation codes are:

        :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
            RISC-V's `mul` instruction.
        :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
            instruction.
        :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
            where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
            `mulhsu` instruction.
        :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
            instruction.
    """

    def __init__(self, register_levels=()):
        """ register_levels: specifies the points in the cascade at which
        flip-flops are to be inserted.
        """

        # parameter(s)
        self.register_levels = list(register_levels)

        # inputs: partition points at every byte boundary, op per byte
        self.part_pts = PartitionPoints()
        for i in range(8, 64, 8):
            self.part_pts[i] = Signal(name=f"part_pts_{i}")
        self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
        self.a = Signal(64)
        self.b = Signal(64)

        # intermediates (needed for unit tests)
        self.intermediate_output = Signal(128)

        # output
        self.output = Signal(64)

    def elaborate(self, platform):
        m = Module()

        # create (doubled) PartitionPoints (output is double input width)
        # NOTE(review): the new point is stored under the *original* key i;
        # only the Signal's name uses i*2.  Confirm whether the key should
        # also be doubled for the double-width output (PartitionPoints.like
        # has a ``mul`` parameter that suggests index scaling is available).
        expanded_part_pts = eps = PartitionPoints()
        for i, v in self.part_pts.items():
            ep = Signal(name=f"expanded_part_pts_{i*2}", reset_less=True)
            expanded_part_pts[i] = ep
            m.d.comb += ep.eq(v)

        # generate all partial-product and negation terms
        n_inputs = 64 + 4
        n_parts = 8 #len(self.part_pts)
        t = AllTerms(n_inputs, 128, n_parts, self.register_levels,
                     eps)
        m.submodules.allterms = t
        m.d.comb += t.i.a.eq(self.a)
        m.d.comb += t.i.b.eq(self.b)
        m.d.comb += t.i.epps.eq(eps)
        for i in range(8):
            m.d.comb += t.i.part_ops[i].eq(self.part_ops[i])

        terms = t.o.inputs

        # sum all terms (optionally pipelined at register_levels)
        add_reduce = AddReduce(terms,
                               128,
                               self.register_levels,
                               t.o.reg_partition_points,
                               t.o.part_ops)

        out_part_ops = add_reduce.o.part_ops
        out_part_pts = add_reduce.o.reg_partition_points

        m.submodules.add_reduce = add_reduce

        # reconstruct per-width HI/LO lane outputs
        interm = Intermediates(128, 8, expanded_part_pts)
        m.submodules.intermediates = interm
        m.d.comb += interm.i.eq(add_reduce.o)

        # final output
        m.submodules.finalout = finalout = FinalOut(128, 8, expanded_part_pts)
        m.d.comb += finalout.i.eq(interm.o)
        m.d.comb += self.output.eq(finalout.out)
        m.d.comb += self.intermediate_output.eq(finalout.intermediate_output)

        return m
1345
1346
if __name__ == "__main__":
    # generate Verilog (or simulate) via the nmigen command-line driver
    mul = Mul8_16_32_64()
    ports = [mul.a, mul.b, mul.intermediate_output, mul.output]
    ports += list(mul.part_ops)
    ports += list(mul.part_pts.values())
    main(mul, ports=ports)