src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11
  12
  13 class PartitionPoints(dict):
  14     """Partition points and corresponding ``Value``s.
  15
  16     The points at where an ALU is partitioned along with ``Value``s that
  17     specify if the corresponding partition points are enabled.
  18
  19     For example: ``{1: True, 5: True, 10: True}`` with
  20     ``width == 16`` specifies that the ALU is split into 4 sections:
  21     * bits 0 <= ``i`` < 1
  22     * bits 1 <= ``i`` < 5
  23     * bits 5 <= ``i`` < 10
  24     * bits 10 <= ``i`` < 16
  25
  26     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  27     where ``a`` is a 1-bit ``Signal``:
  28     * If ``a`` is asserted:
  29         * bits 0 <= ``i`` < 1
  30         * bits 1 <= ``i`` < 5
  31         * bits 5 <= ``i`` < 10
  32         * bits 10 <= ``i`` < 16
  33     * Otherwise
  34         * bits 0 <= ``i`` < 1
  35         * bits 1 <= ``i`` < 10
  36         * bits 10 <= ``i`` < 16
  37     """
  38
  39     def __init__(self, partition_points=None):
  40         """Create a new ``PartitionPoints``.
  41
  42         :param partition_points: the input partition points to values mapping.
  43         """
  44         super().__init__()
  45         if partition_points is not None:
  46             for point, enabled in partition_points.items():
  47                 if not isinstance(point, int):
  48                     raise TypeError("point must be a non-negative integer")
  49                 if point < 0:
  50                     raise ValueError("point must be a non-negative integer")
  51                 self[point] = Value.wrap(enabled)
  52
  53     def like(self, name=None, src_loc_at=0, mul=1):
  54         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  55
  56         :param name: the base name for the new ``Signal``s.
  57         :param mul: a multiplication factor on the indices
  58         """
  59         if name is None:
  60             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  61         retval = PartitionPoints()
  62         for point, enabled in self.items():
  63             point *= mul
  64             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  65         return retval
  66
  67     def eq(self, rhs):
  68         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  69         if set(self.keys()) != set(rhs.keys()):
  70             raise ValueError("incompatible point set")
  71         for point, enabled in self.items():
  72             yield enabled.eq(rhs[point])
  73
  74     def as_mask(self, width):
  75         """Create a bit-mask from `self`.
  76
  77         Each bit in the returned mask is clear only if the partition point at
  78         the same bit-index is enabled.
  79
  80         :param width: the bit width of the resulting mask
  81         """
  82         bits = []
  83         for i in range(width):
  84             if i in self:
  85                 bits.append(~self[i])
  86             else:
  87                 bits.append(True)
  88         return Cat(*bits)
  89
  90     def get_max_partition_count(self, width):
  91         """Get the maximum number of partitions.
  92
  93         Gets the number of partitions when all partition points are enabled.
  94         """
  95         retval = 1
  96         for point in self.keys():
  97             if point < width:
  98                 retval += 1
  99         return retval
 100
 101     def fits_in_width(self, width):
 102         """Check if all partition points are smaller than `width`."""
 103         for point in self.keys():
 104             if point >= width:
 105                 return False
 106         return True
 107
 108     def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
 109         if index == -1 or index == 7:
 110             return C(True, 1)
 111         assert index >= 0 and index < 8
 112         return self[(index * 8 + 8)*mfactor]
 113
 114
 115 class FullAdder(Elaboratable):
 116     """Full Adder.
 117
 118     :attribute in0: the first input
 119     :attribute in1: the second input
 120     :attribute in2: the third input
 121     :attribute sum: the sum output
 122     :attribute carry: the carry output
 123
 124     Rather than do individual full adders (and have an array of them,
 125     which would be very slow to simulate), this module can specify the
 126     bit width of the inputs and outputs: in effect it performs multiple
 127     Full 3-2 Add operations "in parallel".
 128     """
 129
 130     def __init__(self, width):
 131         """Create a ``FullAdder``.
 132
 133         :param width: the bit width of the input and output
 134         """
 135         self.in0 = Signal(width, reset_less=True)
 136         self.in1 = Signal(width, reset_less=True)
 137         self.in2 = Signal(width, reset_less=True)
 138         self.sum = Signal(width, reset_less=True)
 139         self.carry = Signal(width, reset_less=True)
 140
 141     def elaborate(self, platform):
 142         """Elaborate this module."""
 143         m = Module()
 144         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 145         m.d.comb += self.carry.eq((self.in0 & self.in1)
 146                                   | (self.in1 & self.in2)
 147                                   | (self.in2 & self.in0))
 148         return m
 149
 150
 151 class MaskedFullAdder(Elaboratable):
 152     """Masked Full Adder.
 153
 154     :attribute mask: the carry partition mask
 155     :attribute in0: the first input
 156     :attribute in1: the second input
 157     :attribute in2: the third input
 158     :attribute sum: the sum output
 159     :attribute mcarry: the masked carry output
 160
 161     FullAdders are always used with a "mask" on the output.  To keep
 162     the graphviz "clean", this class performs the masking here rather
 163     than inside a large for-loop.
 164
 165     See the following discussion as to why this is no longer derived
 166     from FullAdder.  Each carry is shifted here *before* being ANDed
 167     with the mask, so that an AOI cell may be used (which is more
 168     gate-efficient)
 169     https://en.wikipedia.org/wiki/AND-OR-Invert
 170     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
 171     """
 172
 173     def __init__(self, width):
 174         """Create a ``MaskedFullAdder``.
 175
 176         :param width: the bit width of the input and output
 177         """
 178         self.width = width
 179         self.mask = Signal(width, reset_less=True)
 180         self.mcarry = Signal(width, reset_less=True)
 181         self.in0 = Signal(width, reset_less=True)
 182         self.in1 = Signal(width, reset_less=True)
 183         self.in2 = Signal(width, reset_less=True)
 184         self.sum = Signal(width, reset_less=True)
 185
 186     def elaborate(self, platform):
 187         """Elaborate this module."""
 188         m = Module()
 189         s1 = Signal(self.width, reset_less=True)
 190         s2 = Signal(self.width, reset_less=True)
 191         s3 = Signal(self.width, reset_less=True)
 192         c1 = Signal(self.width, reset_less=True)
 193         c2 = Signal(self.width, reset_less=True)
 194         c3 = Signal(self.width, reset_less=True)
 195         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 196         m.d.comb += s1.eq(Cat(0, self.in0))
 197         m.d.comb += s2.eq(Cat(0, self.in1))
 198         m.d.comb += s3.eq(Cat(0, self.in2))
 199         m.d.comb += c1.eq(s1 & s2 & self.mask)
 200         m.d.comb += c2.eq(s2 & s3 & self.mask)
 201         m.d.comb += c3.eq(s3 & s1 & self.mask)
 202         m.d.comb += self.mcarry.eq(c1 | c2 | c3)
 203         return m
 204
 205
 206 class PartitionedAdder(Elaboratable):
 207     """Partitioned Adder.
 208
 209     Performs the final add.  The partition points are included in the
 210     actual add (in one of the operands only), which causes a carry over
 211     to the next bit.  Then the final output *removes* the extra bits from
 212     the result.
 213
 214     partition: .... P... P... P... P... (32 bits)
 215     a        : .... .... .... .... .... (32 bits)
 216     b        : .... .... .... .... .... (32 bits)
 217     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 218     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 219     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 220     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 221
 222     :attribute width: the bit width of the input and output. Read-only.
 223     :attribute a: the first input to the adder
 224     :attribute b: the second input to the adder
 225     :attribute output: the sum output
 226     :attribute partition_points: the input partition points. Modification not
 227         supported, except for by ``Signal.eq``.
 228     """
 229
 230     def __init__(self, width, partition_points):
 231         """Create a ``PartitionedAdder``.
 232
 233         :param width: the bit width of the input and output
 234         :param partition_points: the input partition points
 235         """
 236         self.width = width
 237         self.a = Signal(width, reset_less=True)
 238         self.b = Signal(width, reset_less=True)
 239         self.output = Signal(width, reset_less=True)
 240         self.partition_points = PartitionPoints(partition_points)
 241         if not self.partition_points.fits_in_width(width):
 242             raise ValueError("partition_points doesn't fit in width")
 243         expanded_width = 0
 244         for i in range(self.width):
 245             if i in self.partition_points:
 246                 expanded_width += 1
 247             expanded_width += 1
 248         self._expanded_width = expanded_width
 249
 250     def elaborate(self, platform):
 251         """Elaborate this module."""
 252         m = Module()
 253         expanded_a = Signal(self._expanded_width, reset_less=True)
 254         expanded_b = Signal(self._expanded_width, reset_less=True)
 255         expanded_o = Signal(self._expanded_width, reset_less=True)
 256
 257         expanded_index = 0
 258         # store bits in a list, use Cat later.  graphviz is much cleaner
 259         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 260
 261         # partition points are "breaks" (extra zeros or 1s) in what would
 262         # otherwise be a massive long add.  when the "break" points are 0,
 263         # whatever is in it (in the output) is discarded.  however when
 264         # there is a "1", it causes a roll-over carry to the *next* bit.
 265         # we still ignore the "break" bit in the [intermediate] output,
 266         # however by that time we've got the effect that we wanted: the
 267         # carry has been carried *over* the break point.
 268
 269         for i in range(self.width):
 270             if i in self.partition_points:
 271                 # add extra bit set to 0 + 0 for enabled partition points
 272                 # and 1 + 0 for disabled partition points
 273                 ea.append(expanded_a[expanded_index])
 274                 al.append(~self.partition_points[i]) # add extra bit in a
 275                 eb.append(expanded_b[expanded_index])
 276                 bl.append(C(0)) # yes, add a zero
 277                 expanded_index += 1 # skip the extra point.  NOT in the output
 278             ea.append(expanded_a[expanded_index])
 279             eb.append(expanded_b[expanded_index])
 280             eo.append(expanded_o[expanded_index])
 281             al.append(self.a[i])
 282             bl.append(self.b[i])
 283             ol.append(self.output[i])
 284             expanded_index += 1
 285
 286         # combine above using Cat
 287         m.d.comb += Cat(*ea).eq(Cat(*al))
 288         m.d.comb += Cat(*eb).eq(Cat(*bl))
 289         m.d.comb += Cat(*ol).eq(Cat(*eo))
 290
 291         # use only one addition to take advantage of look-ahead carry and
 292         # special hardware on FPGAs
 293         m.d.comb += expanded_o.eq(expanded_a + expanded_b)
 294         return m
 295
 296
 297 FULL_ADDER_INPUT_COUNT = 3
 298
 299 class AddReduceData:
 300
 301     def __init__(self, ppoints, n_inputs, output_width, n_parts):
 302         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 303                           for i in range(n_parts)]
 304         self.inputs = [Signal(output_width, name=f"inputs_{i}",
 305                               reset_less=True)
 306                         for i in range(n_inputs)]
 307         self.reg_partition_points = ppoints.like()
 308
 309     def eq_from(self, reg_partition_points, inputs, part_ops):
 310         return [self.reg_partition_points.eq(reg_partition_points)] + \
 311                [self.inputs[i].eq(inputs[i])
 312                                      for i in range(len(self.inputs))] + \
 313                [self.part_ops[i].eq(part_ops[i])
 314                                      for i in range(len(self.part_ops))]
 315
 316     def eq(self, rhs):
 317         return self.eq_from(rhs.reg_partition_points, rhs.inputs, rhs.part_ops)
 318
 319
 320 class FinalReduceData:
 321
 322     def __init__(self, ppoints, output_width, n_parts):
 323         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 324                           for i in range(n_parts)]
 325         self.output = Signal(output_width, reset_less=True)
 326         self.reg_partition_points = ppoints.like()
 327
 328     def eq_from(self, reg_partition_points, output, part_ops):
 329         return [self.reg_partition_points.eq(reg_partition_points)] + \
 330                [self.output.eq(output)] + \
 331                [self.part_ops[i].eq(part_ops[i])
 332                                      for i in range(len(self.part_ops))]
 333
 334     def eq(self, rhs):
 335         return self.eq_from(rhs.reg_partition_points, rhs.output, rhs.part_ops)
 336
 337
 338 class FinalAdd(Elaboratable):
 339     """ Final stage of add reduce
 340     """
 341
 342     def __init__(self, n_inputs, output_width, n_parts, register_levels,
 343                        partition_points):
 344         self.i = AddReduceData(partition_points, n_inputs,
 345                                output_width, n_parts)
 346         self.o = FinalReduceData(partition_points, output_width, n_parts)
 347         self.output_width = output_width
 348         self.n_inputs = n_inputs
 349         self.n_parts = n_parts
 350         self.register_levels = list(register_levels)
 351         self.partition_points = PartitionPoints(partition_points)
 352         if not self.partition_points.fits_in_width(output_width):
 353             raise ValueError("partition_points doesn't fit in output_width")
 354
 355     def elaborate(self, platform):
 356         """Elaborate this module."""
 357         m = Module()
 358
 359         output_width = self.output_width
 360         output = Signal(output_width, reset_less=True)
 361         if self.n_inputs == 0:
 362             # use 0 as the default output value
 363             m.d.comb += output.eq(0)
 364         elif self.n_inputs == 1:
 365             # handle single input
 366             m.d.comb += output.eq(self.i.inputs[0])
 367         else:
 368             # base case for adding 2 inputs
 369             assert self.n_inputs == 2
 370             adder = PartitionedAdder(output_width, self.i.reg_partition_points)
 371             m.submodules.final_adder = adder
 372             m.d.comb += adder.a.eq(self.i.inputs[0])
 373             m.d.comb += adder.b.eq(self.i.inputs[1])
 374             m.d.comb += output.eq(adder.output)
 375
 376         # create output
 377         m.d.comb += self.o.eq_from(self.i.reg_partition_points, output,
 378                                    self.i.part_ops)
 379
 380         return m
 381
 382
 383 class AddReduceSingle(Elaboratable):
 384     """Add list of numbers together.
 385
 386     :attribute inputs: input ``Signal``s to be summed. Modification not
 387         supported, except for by ``Signal.eq``.
 388     :attribute register_levels: List of nesting levels that should have
 389         pipeline registers.
 390     :attribute output: output sum.
 391     :attribute partition_points: the input partition points. Modification not
 392         supported, except for by ``Signal.eq``.
 393     """
 394
 395     def __init__(self, n_inputs, output_width, n_parts, register_levels,
 396                        partition_points):
 397         """Create an ``AddReduce``.
 398
 399         :param inputs: input ``Signal``s to be summed.
 400         :param output_width: bit-width of ``output``.
 401         :param register_levels: List of nesting levels that should have
 402             pipeline registers.
 403         :param partition_points: the input partition points.
 404         """
 405         self.n_inputs = n_inputs
 406         self.n_parts = n_parts
 407         self.output_width = output_width
 408         self.i = AddReduceData(partition_points, n_inputs,
 409                                output_width, n_parts)
 410         self.register_levels = list(register_levels)
 411         self.partition_points = PartitionPoints(partition_points)
 412         if not self.partition_points.fits_in_width(output_width):
 413             raise ValueError("partition_points doesn't fit in output_width")
 414
 415         max_level = AddReduceSingle.get_max_level(n_inputs)
 416         for level in self.register_levels:
 417             if level > max_level:
 418                 raise ValueError(
 419                     "not enough adder levels for specified register levels")
 420
 421         self.groups = AddReduceSingle.full_adder_groups(n_inputs)
 422         n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
 423         self.o = AddReduceData(partition_points, n_terms, output_width, n_parts)
 424
 425     @staticmethod
 426     def calc_n_inputs(n_inputs, groups):
 427         retval = len(groups)*2
 428         if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 429             retval += 1
 430         elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 431             retval += 2
 432         else:
 433             assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
 434         return retval
 435
 436     @staticmethod
 437     def get_max_level(input_count):
 438         """Get the maximum level.
 439
 440         All ``register_levels`` must be less than or equal to the maximum
 441         level.
 442         """
 443         retval = 0
 444         while True:
 445             groups = AddReduceSingle.full_adder_groups(input_count)
 446             if len(groups) == 0:
 447                 return retval
 448             input_count %= FULL_ADDER_INPUT_COUNT
 449             input_count += 2 * len(groups)
 450             retval += 1
 451
 452     @staticmethod
 453     def full_adder_groups(input_count):
 454         """Get ``inputs`` indices for which a full adder should be built."""
 455         return range(0,
 456                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 457                      FULL_ADDER_INPUT_COUNT)
 458
 459     def create_next_terms(self):
 460         """ create next intermediate terms, for linking up in elaborate, below
 461         """
 462         terms = []
 463         adders = []
 464
 465         # create full adders for this recursive level.
 466         # this shrinks N terms to 2 * (N // 3) plus the remainder
 467         for i in self.groups:
 468             adder_i = MaskedFullAdder(self.output_width)
 469             adders.append((i, adder_i))
 470             # add both the sum and the masked-carry to the next level.
 471             # 3 inputs have now been reduced to 2...
 472             terms.append(adder_i.sum)
 473             terms.append(adder_i.mcarry)
 474         # handle the remaining inputs.
 475         if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 476             terms.append(self.i.inputs[-1])
 477         elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 478             # Just pass the terms to the next layer, since we wouldn't gain
 479             # anything by using a half adder since there would still be 2 terms
 480             # and just passing the terms to the next layer saves gates.
 481             terms.append(self.i.inputs[-2])
 482             terms.append(self.i.inputs[-1])
 483         else:
 484             assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
 485
 486         return terms, adders
 487
 488     def elaborate(self, platform):
 489         """Elaborate this module."""
 490         m = Module()
 491
 492         terms, adders = self.create_next_terms()
 493
 494         # copy the intermediate terms to the output
 495         for i, value in enumerate(terms):
 496             m.d.comb += self.o.inputs[i].eq(value)
 497
 498         # copy reg part points and part ops to output
 499         m.d.comb += self.o.reg_partition_points.eq(self.i.reg_partition_points)
 500         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
 501                                      for i in range(len(self.i.part_ops))]
 502
 503         # set up the partition mask (for the adders)
 504         part_mask = Signal(self.output_width, reset_less=True)
 505
 506         mask = self.i.reg_partition_points.as_mask(self.output_width)
 507         m.d.comb += part_mask.eq(mask)
 508
 509         # add and link the intermediate term modules
 510         for i, (iidx, adder_i) in enumerate(adders):
 511             setattr(m.submodules, f"adder_{i}", adder_i)
 512
 513             m.d.comb += adder_i.in0.eq(self.i.inputs[iidx])
 514             m.d.comb += adder_i.in1.eq(self.i.inputs[iidx + 1])
 515             m.d.comb += adder_i.in2.eq(self.i.inputs[iidx + 2])
 516             m.d.comb += adder_i.mask.eq(part_mask)
 517
 518         return m
 519
 520
 521 class AddReduce(Elaboratable):
 522     """Recursively Add list of numbers together.
 523
 524     :attribute inputs: input ``Signal``s to be summed. Modification not
 525         supported, except for by ``Signal.eq``.
 526     :attribute register_levels: List of nesting levels that should have
 527         pipeline registers.
 528     :attribute output: output sum.
 529     :attribute partition_points: the input partition points. Modification not
 530         supported, except for by ``Signal.eq``.
 531     """
 532
 533     def __init__(self, inputs, output_width, register_levels, partition_points,
 534                        part_ops):
 535         """Create an ``AddReduce``.
 536
 537         :param inputs: input ``Signal``s to be summed.
 538         :param output_width: bit-width of ``output``.
 539         :param register_levels: List of nesting levels that should have
 540             pipeline registers.
 541         :param partition_points: the input partition points.
 542         """
 543         self.inputs = inputs
 544         self.part_ops = part_ops
 545         n_parts = len(part_ops)
 546         self.o = FinalReduceData(partition_points, output_width, n_parts)
 547         self.output_width = output_width
 548         self.register_levels = register_levels
 549         self.partition_points = partition_points
 550
 551         self.create_levels()
 552
 553     @staticmethod
 554     def get_max_level(input_count):
 555         return AddReduceSingle.get_max_level(input_count)
 556
 557     @staticmethod
 558     def next_register_levels(register_levels):
 559         """``Iterable`` of ``register_levels`` for next recursive level."""
 560         for level in register_levels:
 561             if level > 0:
 562                 yield level - 1
 563
 564     def create_levels(self):
 565         """creates reduction levels"""
 566
 567         mods = []
 568         next_levels = self.register_levels
 569         partition_points = self.partition_points
 570         part_ops = self.part_ops
 571         n_parts = len(part_ops)
 572         inputs = self.inputs
 573         ilen = len(inputs)
 574         while True:
 575             groups = AddReduceSingle.full_adder_groups(len(inputs))
 576             if len(groups) == 0:
 577                 break
 578             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 579                                          next_levels, partition_points)
 580             mods.append(next_level)
 581             next_levels = list(AddReduce.next_register_levels(next_levels))
 582             partition_points = next_level.i.reg_partition_points
 583             inputs = next_level.o.inputs
 584             ilen = len(inputs)
 585             part_ops = next_level.i.part_ops
 586
 587         next_level = FinalAdd(ilen, self.output_width, n_parts,
 588                               next_levels, partition_points)
 589         mods.append(next_level)
 590
 591         self.levels = mods
 592
 593     def elaborate(self, platform):
 594         """Elaborate this module."""
 595         m = Module()
 596
 597         for i, next_level in enumerate(self.levels):
 598             setattr(m.submodules, "next_level%d" % i, next_level)
 599
 600         partition_points = self.partition_points
 601         inputs = self.inputs
 602         part_ops = self.part_ops
 603         n_parts = len(part_ops)
 604         n_inputs = len(inputs)
 605         output_width = self.output_width
 606         i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
 607         m.d.comb += i.eq_from(partition_points, inputs, part_ops)
 608         for idx in range(len(self.levels)):
 609             mcur = self.levels[idx]
 610             if 0 in mcur.register_levels:
 611                 m.d.sync += mcur.i.eq(i)
 612             else:
 613                 m.d.comb += mcur.i.eq(i)
 614             i = mcur.o # for next loop
 615
 616         # output comes from last module
 617         m.d.comb += self.o.eq(i)
 618
 619         return m
 620
 621
 622 OP_MUL_LOW = 0
 623 OP_MUL_SIGNED_HIGH = 1
 624 OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
 625 OP_MUL_UNSIGNED_HIGH = 3
 626
 627
 628 def get_term(value, shift=0, enabled=None):
 629     if enabled is not None:
 630         value = Mux(enabled, value, 0)
 631     if shift > 0:
 632         value = Cat(Repl(C(0, 1), shift), value)
 633     else:
 634         assert shift == 0
 635     return value
 636
 637
 638 class ProductTerm(Elaboratable):
 639     """ this class creates a single product term (a[..]*b[..]).
 640         it has a design flaw in that is the *output* that is selected,
 641         where the multiplication(s) are combinatorially generated
 642         all the time.
 643     """
 644
 645     def __init__(self, width, twidth, pbwid, a_index, b_index):
 646         self.a_index = a_index
 647         self.b_index = b_index
 648         shift = 8 * (self.a_index + self.b_index)
 649         self.pwidth = width
 650         self.twidth = twidth
 651         self.width = width*2
 652         self.shift = shift
 653
 654         self.ti = Signal(self.width, reset_less=True)
 655         self.term = Signal(twidth, reset_less=True)
 656         self.a = Signal(twidth//2, reset_less=True)
 657         self.b = Signal(twidth//2, reset_less=True)
 658         self.pb_en = Signal(pbwid, reset_less=True)
 659
 660         self.tl = tl = []
 661         min_index = min(self.a_index, self.b_index)
 662         max_index = max(self.a_index, self.b_index)
 663         for i in range(min_index, max_index):
 664             tl.append(self.pb_en[i])
 665         name = "te_%d_%d" % (self.a_index, self.b_index)
 666         if len(tl) > 0:
 667             term_enabled = Signal(name=name, reset_less=True)
 668         else:
 669             term_enabled = None
 670         self.enabled = term_enabled
 671         self.term.name = "term_%d_%d" % (a_index, b_index) # rename
 672
 673     def elaborate(self, platform):
 674
 675         m = Module()
 676         if self.enabled is not None:
 677             m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
 678
 679         bsa = Signal(self.width, reset_less=True)
 680         bsb = Signal(self.width, reset_less=True)
 681         a_index, b_index = self.a_index, self.b_index
 682         pwidth = self.pwidth
 683         m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
 684         m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
 685         m.d.comb += self.ti.eq(bsa * bsb)
 686         m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
 687         """
 688         #TODO: sort out width issues, get inputs a/b switched on/off.
 689         #data going into Muxes is 1/2 the required width
 690
 691         pwidth = self.pwidth
 692         width = self.width
 693         bsa = Signal(self.twidth//2, reset_less=True)
 694         bsb = Signal(self.twidth//2, reset_less=True)
 695         asel = Signal(width, reset_less=True)
 696         bsel = Signal(width, reset_less=True)
 697         a_index, b_index = self.a_index, self.b_index
 698         m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
 699         m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
 700         m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
 701         m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
 702         m.d.comb += self.ti.eq(bsa * bsb)
 703         m.d.comb += self.term.eq(self.ti)
 704         """
 705
 706         return m
 707
 708
 709 class ProductTerms(Elaboratable):
 710     """ creates a bank of product terms.  also performs the actual bit-selection
 711         this class is to be wrapped with a for-loop on the "a" operand.
 712         it creates a second-level for-loop on the "b" operand.
 713     """
 714     def __init__(self, width, twidth, pbwid, a_index, blen):
 715         self.a_index = a_index
 716         self.blen = blen
 717         self.pwidth = width
 718         self.twidth = twidth
 719         self.pbwid = pbwid
 720         self.a = Signal(twidth//2, reset_less=True)
 721         self.b = Signal(twidth//2, reset_less=True)
 722         self.pb_en = Signal(pbwid, reset_less=True)
 723         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 724                             for i in range(blen)]
 725
 726     def elaborate(self, platform):
 727
 728         m = Module()
 729
 730         for b_index in range(self.blen):
 731             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 732                             self.a_index, b_index)
 733             setattr(m.submodules, "term_%d" % b_index, t)
 734
 735             m.d.comb += t.a.eq(self.a)
 736             m.d.comb += t.b.eq(self.b)
 737             m.d.comb += t.pb_en.eq(self.pb_en)
 738
 739             m.d.comb += self.terms[b_index].eq(t.term)
 740
 741         return m
 742
 743
 744 class LSBNegTerm(Elaboratable):
 745
 746     def __init__(self, bit_width):
 747         self.bit_width = bit_width
 748         self.part = Signal(reset_less=True)
 749         self.signed = Signal(reset_less=True)
 750         self.op = Signal(bit_width, reset_less=True)
 751         self.msb = Signal(reset_less=True)
 752         self.nt = Signal(bit_width*2, reset_less=True)
 753         self.nl = Signal(bit_width*2, reset_less=True)
 754
 755     def elaborate(self, platform):
 756         m = Module()
 757         comb = m.d.comb
 758         bit_wid = self.bit_width
 759         ext = Repl(0, bit_wid) # extend output to HI part
 760
 761         # determine sign of each incoming number *in this partition*
 762         enabled = Signal(reset_less=True)
 763         m.d.comb += enabled.eq(self.part & self.msb & self.signed)
 764
 765         # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 766         # negation operation is split into a bitwise not and a +1.
 767         # likewise for 16, 32, and 64-bit values.
 768
 769         # width-extended 1s complement if a is signed, otherwise zero
 770         comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
 771
 772         # add 1 if signed, otherwise add zero
 773         comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
 774
 775         return m
 776
 777
 778 class Parts(Elaboratable):
 779
 780     def __init__(self, pbwid, epps, n_parts):
 781         self.pbwid = pbwid
 782         # inputs
 783         self.epps = PartitionPoints.like(epps, name="epps") # expanded points
 784         # outputs
 785         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 786                       for i in range(n_parts)]
 787
 788     def elaborate(self, platform):
 789         m = Module()
 790
 791         epps, parts = self.epps, self.parts
 792         # collect part-bytes (double factor because the input is extended)
 793         pbs = Signal(self.pbwid, reset_less=True)
 794         tl = []
 795         for i in range(self.pbwid):
 796             pb = Signal(name="pb%d" % i, reset_less=True)
 797             m.d.comb += pb.eq(epps.part_byte(i, mfactor=2)) # double
 798             tl.append(pb)
 799         m.d.comb += pbs.eq(Cat(*tl))
 800
 801         # negated-temporary copy of partition bits
 802         npbs = Signal.like(pbs, reset_less=True)
 803         m.d.comb += npbs.eq(~pbs)
 804         byte_count = 8 // len(parts)
 805         for i in range(len(parts)):
 806             pbl = []
 807             pbl.append(npbs[i * byte_count - 1])
 808             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 809                 pbl.append(pbs[j])
 810             pbl.append(npbs[(i + 1) * byte_count - 1])
 811             value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
 812             m.d.comb += value.eq(Cat(*pbl))
 813             m.d.comb += parts[i].eq(~(value).bool())
 814
 815         return m
 816
 817
 818 class Part(Elaboratable):
 819     """ a key class which, depending on the partitioning, will determine
 820         what action to take when parts of the output are signed or unsigned.
 821
 822         this requires 2 pieces of data *per operand, per partition*:
 823         whether the MSB is HI/LO (per partition!), and whether a signed
 824         or unsigned operation has been *requested*.
 825
 826         once that is determined, signed is basically carried out
 827         by splitting 2's complement into 1's complement plus one.
 828         1's complement is just a bit-inversion.
 829
 830         the extra terms - as separate terms - are then thrown at the
 831         AddReduce alongside the multiplication part-results.
 832     """
 833     def __init__(self, epps, width, n_parts, n_levels, pbwid):
 834
 835         self.pbwid = pbwid
 836         self.epps = epps
 837
 838         # inputs
 839         self.a = Signal(64, reset_less=True)
 840         self.b = Signal(64, reset_less=True)
 841         self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
 842                             for i in range(8)]
 843         self.b_signed = [Signal(name=f"_b_signed_{i}", reset_less=True)
 844                             for i in range(8)]
 845         self.pbs = Signal(pbwid, reset_less=True)
 846
 847         # outputs
 848         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 849                             for i in range(n_parts)]
 850
 851         self.not_a_term = Signal(width, reset_less=True)
 852         self.neg_lsb_a_term = Signal(width, reset_less=True)
 853         self.not_b_term = Signal(width, reset_less=True)
 854         self.neg_lsb_b_term = Signal(width, reset_less=True)
 855
 856     def elaborate(self, platform):
 857         m = Module()
 858
 859         pbs, parts = self.pbs, self.parts
 860         epps = self.epps
 861         m.submodules.p = p = Parts(self.pbwid, epps, len(parts))
 862         m.d.comb += p.epps.eq(epps)
 863         parts = p.parts
 864
 865         byte_count = 8 // len(parts)
 866
 867         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
 868                 self.not_a_term, self.neg_lsb_a_term,
 869                 self.not_b_term, self.neg_lsb_b_term)
 870
 871         byte_width = 8 // len(parts) # byte width
 872         bit_wid = 8 * byte_width     # bit width
 873         nat, nbt, nla, nlb = [], [], [], []
 874         for i in range(len(parts)):
 875             # work out bit-inverted and +1 term for a.
 876             pa = LSBNegTerm(bit_wid)
 877             setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
 878             m.d.comb += pa.part.eq(parts[i])
 879             m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
 880             m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
 881             m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
 882             nat.append(pa.nt)
 883             nla.append(pa.nl)
 884
 885             # work out bit-inverted and +1 term for b
 886             pb = LSBNegTerm(bit_wid)
 887             setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
 888             m.d.comb += pb.part.eq(parts[i])
 889             m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
 890             m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
 891             m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
 892             nbt.append(pb.nt)
 893             nlb.append(pb.nl)
 894
 895         # concatenate together and return all 4 results.
 896         m.d.comb += [not_a_term.eq(Cat(*nat)),
 897                      not_b_term.eq(Cat(*nbt)),
 898                      neg_lsb_a_term.eq(Cat(*nla)),
 899                      neg_lsb_b_term.eq(Cat(*nlb)),
 900                     ]
 901
 902         return m
 903
 904
 905 class IntermediateOut(Elaboratable):
 906     """ selects the HI/LO part of the multiplication, for a given bit-width
 907         the output is also reconstructed in its SIMD (partition) lanes.
 908     """
 909     def __init__(self, width, out_wid, n_parts):
 910         self.width = width
 911         self.n_parts = n_parts
 912         self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 913                                      for i in range(8)]
 914         self.intermed = Signal(out_wid, reset_less=True)
 915         self.output = Signal(out_wid//2, reset_less=True)
 916
 917     def elaborate(self, platform):
 918         m = Module()
 919
 920         ol = []
 921         w = self.width
 922         sel = w // 8
 923         for i in range(self.n_parts):
 924             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 925             m.d.comb += op.eq(
 926                 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
 927                     self.intermed.part(i * w*2, w),
 928                     self.intermed.part(i * w*2 + w, w)))
 929             ol.append(op)
 930         m.d.comb += self.output.eq(Cat(*ol))
 931
 932         return m
 933
 934
 935 class FinalOut(Elaboratable):
 936     """ selects the final output based on the partitioning.
 937
 938         each byte is selectable independently, i.e. it is possible
 939         that some partitions requested 8-bit computation whilst others
 940         requested 16 or 32 bit.
 941     """
 942     def __init__(self, output_width, n_parts, partition_points):
 943         self.expanded_part_points = partition_points
 944         self.i = IntermediateData(partition_points, output_width, n_parts)
 945         self.out_wid = output_width//2
 946         # output
 947         self.out = Signal(self.out_wid, reset_less=True)
 948         self.intermediate_output = Signal(output_width, reset_less=True)
 949
 950     def elaborate(self, platform):
 951         m = Module()
 952
 953         eps = self.expanded_part_points
 954         m.submodules.p_8 = p_8 = Parts(8, eps, 8)
 955         m.submodules.p_16 = p_16 = Parts(8, eps, 4)
 956         m.submodules.p_32 = p_32 = Parts(8, eps, 2)
 957         m.submodules.p_64 = p_64 = Parts(8, eps, 1)
 958
 959         out_part_pts = self.i.reg_partition_points
 960
 961         # temporaries
 962         d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
 963         d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
 964         d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
 965
 966         i8 = Signal(self.out_wid, reset_less=True)
 967         i16 = Signal(self.out_wid, reset_less=True)
 968         i32 = Signal(self.out_wid, reset_less=True)
 969         i64 = Signal(self.out_wid, reset_less=True)
 970
 971         m.d.comb += p_8.epps.eq(out_part_pts)
 972         m.d.comb += p_16.epps.eq(out_part_pts)
 973         m.d.comb += p_32.epps.eq(out_part_pts)
 974         m.d.comb += p_64.epps.eq(out_part_pts)
 975
 976         for i in range(len(p_8.parts)):
 977             m.d.comb += d8[i].eq(p_8.parts[i])
 978         for i in range(len(p_16.parts)):
 979             m.d.comb += d16[i].eq(p_16.parts[i])
 980         for i in range(len(p_32.parts)):
 981             m.d.comb += d32[i].eq(p_32.parts[i])
 982         m.d.comb += i8.eq(self.i.outputs[0])
 983         m.d.comb += i16.eq(self.i.outputs[1])
 984         m.d.comb += i32.eq(self.i.outputs[2])
 985         m.d.comb += i64.eq(self.i.outputs[3])
 986
 987         ol = []
 988         for i in range(8):
 989             # select one of the outputs: d8 selects i8, d16 selects i16
 990             # d32 selects i32, and the default is i64.
 991             # d8 and d16 are ORed together in the first Mux
 992             # then the 2nd selects either i8 or i16.
 993             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
 994             op = Signal(8, reset_less=True, name="op_%d" % i)
 995             m.d.comb += op.eq(
 996                 Mux(d8[i] | d16[i // 2],
 997                     Mux(d8[i], i8.part(i * 8, 8), i16.part(i * 8, 8)),
 998                     Mux(d32[i // 4], i32.part(i * 8, 8), i64.part(i * 8, 8))))
 999             ol.append(op)
1000         m.d.comb += self.out.eq(Cat(*ol))
1001         m.d.comb += self.intermediate_output.eq(self.i.intermediate_output)
1002         return m
1003
1004
1005 class OrMod(Elaboratable):
1006     """ ORs four values together in a hierarchical tree
1007     """
1008     def __init__(self, wid):
1009         self.wid = wid
1010         self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
1011                      for i in range(4)]
1012         self.orout = Signal(wid, reset_less=True)
1013
1014     def elaborate(self, platform):
1015         m = Module()
1016         or1 = Signal(self.wid, reset_less=True)
1017         or2 = Signal(self.wid, reset_less=True)
1018         m.d.comb += or1.eq(self.orin[0] | self.orin[1])
1019         m.d.comb += or2.eq(self.orin[2] | self.orin[3])
1020         m.d.comb += self.orout.eq(or1 | or2)
1021
1022         return m
1023
1024
1025 class Signs(Elaboratable):
1026     """ determines whether a or b are signed numbers
1027         based on the required operation type (OP_MUL_*)
1028     """
1029
1030     def __init__(self):
1031         self.part_ops = Signal(2, reset_less=True)
1032         self.a_signed = Signal(reset_less=True)
1033         self.b_signed = Signal(reset_less=True)
1034
1035     def elaborate(self, platform):
1036
1037         m = Module()
1038
1039         asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
1040         bsig = (self.part_ops == OP_MUL_LOW) \
1041                     | (self.part_ops == OP_MUL_SIGNED_HIGH)
1042         m.d.comb += self.a_signed.eq(asig)
1043         m.d.comb += self.b_signed.eq(bsig)
1044
1045         return m
1046
1047
1048 class IntermediateData:
1049
1050     def __init__(self, ppoints, output_width, n_parts):
1051         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
1052                           for i in range(n_parts)]
1053         self.reg_partition_points = ppoints.like()
1054         self.outputs = [Signal(output_width, name="io%d" % i, reset_less=True)
1055                           for i in range(4)]
1056         # intermediates (needed for unit tests)
1057         self.intermediate_output = Signal(output_width)
1058
1059     def eq_from(self, reg_partition_points, outputs, intermediate_output,
1060                       part_ops):
1061         return [self.reg_partition_points.eq(reg_partition_points)] + \
1062                [self.intermediate_output.eq(intermediate_output)] + \
1063                [self.outputs[i].eq(outputs[i])
1064                                      for i in range(4)] + \
1065                [self.part_ops[i].eq(part_ops[i])
1066                                      for i in range(len(self.part_ops))]
1067
1068     def eq(self, rhs):
1069         return self.eq_from(rhs.reg_partition_points, rhs.outputs,
1070                             rhs.intermediate_output, rhs.part_ops)
1071
1072
1073 class AllTermsData:
1074
1075     def __init__(self, partition_points):
1076         self.a = Signal(64)
1077         self.b = Signal(64)
1078         self.epps = partition_points.like()
1079         self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1080
1081     def eq_from(self, epps, inputs, part_ops):
1082         return [self.epps.eq(epps)] + \
1083                [self.a.eq(a), self.b.eq(b)] + \
1084                [self.part_ops[i].eq(part_ops[i])
1085                                      for i in range(len(self.part_ops))]
1086
1087     def eq(self, rhs):
1088         return self.eq_from(rhs.epps, rhs.a, rhs.b, rhs.part_ops)
1089
1090
1091 class AllTerms(Elaboratable):
1092     """Set of terms to be added together
1093     """
1094
1095     def __init__(self, n_inputs, output_width, n_parts, register_levels,
1096                        partition_points):
1097         """Create an ``AddReduce``.
1098
1099         :param inputs: input ``Signal``s to be summed.
1100         :param output_width: bit-width of ``output``.
1101         :param register_levels: List of nesting levels that should have
1102             pipeline registers.
1103         :param partition_points: the input partition points.
1104         """
1105         self.i = AllTermsData(partition_points)
1106         self.register_levels = register_levels
1107         self.n_inputs = n_inputs
1108         self.n_parts = n_parts
1109         self.output_width = output_width
1110         self.o = AddReduceData(self.i.epps, n_inputs,
1111                                output_width, n_parts)
1112
1113     def elaborate(self, platform):
1114         m = Module()
1115
1116         eps = self.i.epps
1117
1118         # collect part-bytes
1119         pbs = Signal(8, reset_less=True)
1120         tl = []
1121         for i in range(8):
1122             pb = Signal(name="pb%d" % i, reset_less=True)
1123             m.d.comb += pb.eq(eps.part_byte(i, mfactor=2))
1124             tl.append(pb)
1125         m.d.comb += pbs.eq(Cat(*tl))
1126
1127         # local variables
1128         signs = []
1129         for i in range(8):
1130             s = Signs()
1131             signs.append(s)
1132             setattr(m.submodules, "signs%d" % i, s)
1133             m.d.comb += s.part_ops.eq(self.i.part_ops[i])
1134
1135         n_levels = len(self.register_levels)+1
1136         m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
1137         m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
1138         m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
1139         m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
1140         nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
1141         for mod in [part_8, part_16, part_32, part_64]:
1142             m.d.comb += mod.a.eq(self.i.a)
1143             m.d.comb += mod.b.eq(self.i.b)
1144             for i in range(len(signs)):
1145                 m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
1146                 m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
1147             m.d.comb += mod.pbs.eq(pbs)
1148             nat_l.append(mod.not_a_term)
1149             nbt_l.append(mod.not_b_term)
1150             nla_l.append(mod.neg_lsb_a_term)
1151             nlb_l.append(mod.neg_lsb_b_term)
1152
1153         terms = []
1154
1155         for a_index in range(8):
1156             t = ProductTerms(8, 128, 8, a_index, 8)
1157             setattr(m.submodules, "terms_%d" % a_index, t)
1158
1159             m.d.comb += t.a.eq(self.i.a)
1160             m.d.comb += t.b.eq(self.i.b)
1161             m.d.comb += t.pb_en.eq(pbs)
1162
1163             for term in t.terms:
1164                 terms.append(term)
1165
1166         # it's fine to bitwise-or data together since they are never enabled
1167         # at the same time
1168         m.submodules.nat_or = nat_or = OrMod(128)
1169         m.submodules.nbt_or = nbt_or = OrMod(128)
1170         m.submodules.nla_or = nla_or = OrMod(128)
1171         m.submodules.nlb_or = nlb_or = OrMod(128)
1172         for l, mod in [(nat_l, nat_or),
1173                              (nbt_l, nbt_or),
1174                              (nla_l, nla_or),
1175                              (nlb_l, nlb_or)]:
1176             for i in range(len(l)):
1177                 m.d.comb += mod.orin[i].eq(l[i])
1178             terms.append(mod.orout)
1179
1180         # copy the intermediate terms to the output
1181         for i, value in enumerate(terms):
1182             m.d.comb += self.o.inputs[i].eq(value)
1183
1184         # copy reg part points and part ops to output
1185         m.d.comb += self.o.reg_partition_points.eq(eps)
1186         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
1187                                      for i in range(len(self.i.part_ops))]
1188
1189         return m
1190
1191
1192 class Intermediates(Elaboratable):
1193     """ Intermediate output modules
1194     """
1195
1196     def __init__(self, output_width, n_parts, partition_points):
1197         self.i = FinalReduceData(partition_points, output_width, n_parts)
1198         self.o = IntermediateData(partition_points, output_width, n_parts)
1199
1200     def elaborate(self, platform):
1201         m = Module()
1202
1203         out_part_ops = self.i.part_ops
1204         out_part_pts = self.i.reg_partition_points
1205
1206         # create _output_64
1207         m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
1208         m.d.comb += io64.intermed.eq(self.i.output)
1209         for i in range(8):
1210             m.d.comb += io64.part_ops[i].eq(out_part_ops[i])
1211         m.d.comb += self.o.outputs[3].eq(io64.output)
1212
1213         # create _output_32
1214         m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
1215         m.d.comb += io32.intermed.eq(self.i.output)
1216         for i in range(8):
1217             m.d.comb += io32.part_ops[i].eq(out_part_ops[i])
1218         m.d.comb += self.o.outputs[2].eq(io32.output)
1219
1220         # create _output_16
1221         m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
1222         m.d.comb += io16.intermed.eq(self.i.output)
1223         for i in range(8):
1224             m.d.comb += io16.part_ops[i].eq(out_part_ops[i])
1225         m.d.comb += self.o.outputs[1].eq(io16.output)
1226
1227         # create _output_8
1228         m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
1229         m.d.comb += io8.intermed.eq(self.i.output)
1230         for i in range(8):
1231             m.d.comb += io8.part_ops[i].eq(out_part_ops[i])
1232         m.d.comb += self.o.outputs[0].eq(io8.output)
1233
1234         for i in range(8):
1235             m.d.comb += self.o.part_ops[i].eq(out_part_ops[i])
1236         m.d.comb += self.o.reg_partition_points.eq(out_part_pts)
1237         m.d.comb += self.o.intermediate_output.eq(self.i.output)
1238
1239         return m
1240
1241
1242 class Mul8_16_32_64(Elaboratable):
1243     """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
1244
1245     Supports partitioning into any combination of 8, 16, 32, and 64-bit
1246     partitions on naturally-aligned boundaries. Supports the operation being
1247     set for each partition independently.
1248
1249     :attribute part_pts: the input partition points. Has a partition point at
1250         multiples of 8 in 0 < i < 64. Each partition point's associated
1251         ``Value`` is a ``Signal``. Modification not supported, except for by
1252         ``Signal.eq``.
1253     :attribute part_ops: the operation for each byte. The operation for a
1254         particular partition is selected by assigning the selected operation
1255         code to each byte in the partition. The allowed operation codes are:
1256
1257         :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
1258             RISC-V's `mul` instruction.
1259         :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
1260             ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
1261             instruction.
1262         :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
1263             where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
1264             `mulhsu` instruction.
1265         :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
1266             ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
1267             instruction.
1268     """
1269
1270     def __init__(self, register_levels=()):
1271         """ register_levels: specifies the points in the cascade at which
1272             flip-flops are to be inserted.
1273         """
1274
1275         # parameter(s)
1276         self.register_levels = list(register_levels)
1277
1278         # inputs
1279         self.part_pts = PartitionPoints()
1280         for i in range(8, 64, 8):
1281             self.part_pts[i] = Signal(name=f"part_pts_{i}")
1282         self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1283         self.a = Signal(64)
1284         self.b = Signal(64)
1285
1286         # intermediates (needed for unit tests)
1287         self.intermediate_output = Signal(128)
1288
1289         # output
1290         self.output = Signal(64)
1291
1292     def elaborate(self, platform):
1293         m = Module()
1294
1295         # create (doubled) PartitionPoints (output is double input width)
1296         expanded_part_pts = eps = PartitionPoints()
1297         for i, v in self.part_pts.items():
1298             ep = Signal(name=f"expanded_part_pts_{i*2}", reset_less=True)
1299             expanded_part_pts[i * 2] = ep
1300             m.d.comb += ep.eq(v)
1301
1302         n_inputs = 64 + 4
1303         n_parts = 8 #len(self.part_pts)
1304         t = AllTerms(n_inputs, 128, n_parts, self.register_levels,
1305                        eps)
1306         m.submodules.allterms = t
1307         m.d.comb += t.i.a.eq(self.a)
1308         m.d.comb += t.i.b.eq(self.b)
1309         m.d.comb += t.i.epps.eq(eps)
1310         for i in range(8):
1311             m.d.comb += t.i.part_ops[i].eq(self.part_ops[i])
1312
1313         terms = t.o.inputs
1314
1315         add_reduce = AddReduce(terms,
1316                                128,
1317                                self.register_levels,
1318                                t.o.reg_partition_points,
1319                                t.o.part_ops)
1320
1321         out_part_ops = add_reduce.o.part_ops
1322         out_part_pts = add_reduce.o.reg_partition_points
1323
1324         m.submodules.add_reduce = add_reduce
1325
1326         interm = Intermediates(128, 8, expanded_part_pts)
1327         m.submodules.intermediates = interm
1328         m.d.comb += interm.i.eq(add_reduce.o)
1329
1330         # final output
1331         m.submodules.finalout = finalout = FinalOut(128, 8, expanded_part_pts)
1332         m.d.comb += finalout.i.eq(interm.o)
1333         m.d.comb += self.output.eq(finalout.out)
1334         m.d.comb += self.intermediate_output.eq(finalout.intermediate_output)
1335
1336         return m
1337
1338
1339 if __name__ == "__main__":
1340     m = Mul8_16_32_64()
1341     main(m, ports=[m.a,
1342                    m.b,
1343                    m.intermediate_output,
1344                    m.output,
1345                    *m.part_ops,
1346                    *m.part_pts.values()])