src/ieee754/part_mul_add/multiply.py (ieee754fpu.git)
1 # SPDX-License-Identifier: LGPL-2.1-or-later
2 # See Notices.txt for copyright information
3 """Integer Multiplication."""
4
5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
6 from nmigen.hdl.ast import Assign
7 from abc import ABCMeta, abstractmethod
8 from nmigen.cli import main
9 from functools import reduce
10 from operator import or_
11
12
13 class PartitionPoints(dict):
14 """Partition points and corresponding ``Value``s.
15
16     The points at which an ALU is partitioned, along with ``Value``s that
17 specify if the corresponding partition points are enabled.
18
19 For example: ``{1: True, 5: True, 10: True}`` with
20 ``width == 16`` specifies that the ALU is split into 4 sections:
21 * bits 0 <= ``i`` < 1
22 * bits 1 <= ``i`` < 5
23 * bits 5 <= ``i`` < 10
24 * bits 10 <= ``i`` < 16
25
26 If the partition_points were instead ``{1: True, 5: a, 10: True}``
27 where ``a`` is a 1-bit ``Signal``:
28 * If ``a`` is asserted:
29 * bits 0 <= ``i`` < 1
30 * bits 1 <= ``i`` < 5
31 * bits 5 <= ``i`` < 10
32 * bits 10 <= ``i`` < 16
33 * Otherwise
34 * bits 0 <= ``i`` < 1
35 * bits 1 <= ``i`` < 10
36 * bits 10 <= ``i`` < 16
37 """
38
39 def __init__(self, partition_points=None):
40 """Create a new ``PartitionPoints``.
41
42         :param partition_points: a mapping from partition points to enable values.
43 """
44 super().__init__()
45 if partition_points is not None:
46 for point, enabled in partition_points.items():
47 if not isinstance(point, int):
48 raise TypeError("point must be a non-negative integer")
49 if point < 0:
50 raise ValueError("point must be a non-negative integer")
51 self[point] = Value.wrap(enabled)
52
53 def like(self, name=None, src_loc_at=0, mul=1):
54 """Create a new ``PartitionPoints`` with ``Signal``s for all values.
55
56 :param name: the base name for the new ``Signal``s.
57 :param mul: a multiplication factor on the indices
58 """
59 if name is None:
60 name = Signal(src_loc_at=1+src_loc_at).name # get variable name
61 retval = PartitionPoints()
62 for point, enabled in self.items():
63 point *= mul
64 retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
65 return retval
66
67 def eq(self, rhs):
68 """Assign ``PartitionPoints`` using ``Signal.eq``."""
69 if set(self.keys()) != set(rhs.keys()):
70 raise ValueError("incompatible point set")
71 for point, enabled in self.items():
72 yield enabled.eq(rhs[point])
73
74 def as_mask(self, width):
75 """Create a bit-mask from `self`.
76
77 Each bit in the returned mask is clear only if the partition point at
78 the same bit-index is enabled.
79
80 :param width: the bit width of the resulting mask
81 """
82 bits = []
83 for i in range(width):
84 if i in self:
85 bits.append(~self[i])
86 else:
87 bits.append(True)
88 return Cat(*bits)
89
90 def get_max_partition_count(self, width):
91 """Get the maximum number of partitions.
92
93 Gets the number of partitions when all partition points are enabled.
94 """
95 retval = 1
96 for point in self.keys():
97 if point < width:
98 retval += 1
99 return retval
100
101 def fits_in_width(self, width):
102 """Check if all partition points are smaller than `width`."""
103 for point in self.keys():
104 if point >= width:
105 return False
106 return True
107
108 def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
109 if index == -1 or index == 7:
110 return C(True, 1)
111 assert index >= 0 and index < 8
112 return self[(index * 8 + 8)*mfactor]
113
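# Example sketch (illustrative only; not used elsewhere in this module).
# It shows the intended use of PartitionPoints: a 16-bit set of points with
# a fixed break at bit 8 and a dynamically-controlled break at bit 4.  The
# signal name "half" and the helper name are arbitrary.
def _example_partition_points():
    half = Signal(name="half")                # controls the break at bit 4
    pp = PartitionPoints({4: half, 8: True})  # bit 8 is always a break
    mask = pp.as_mask(16)                     # mask bits 4/8 clear when enabled
    count = pp.get_max_partition_count(16)    # == 3 when all points enabled
    return pp, mask, count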
114
115 class FullAdder(Elaboratable):
116 """Full Adder.
117
118 :attribute in0: the first input
119 :attribute in1: the second input
120 :attribute in2: the third input
121 :attribute sum: the sum output
122 :attribute carry: the carry output
123
124 Rather than do individual full adders (and have an array of them,
125 which would be very slow to simulate), this module can specify the
126 bit width of the inputs and outputs: in effect it performs multiple
127 Full 3-2 Add operations "in parallel".
128 """
129
130 def __init__(self, width):
131 """Create a ``FullAdder``.
132
133 :param width: the bit width of the input and output
134 """
135 self.in0 = Signal(width)
136 self.in1 = Signal(width)
137 self.in2 = Signal(width)
138 self.sum = Signal(width)
139 self.carry = Signal(width)
140
141 def elaborate(self, platform):
142 """Elaborate this module."""
143 m = Module()
144 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
145 m.d.comb += self.carry.eq((self.in0 & self.in1)
146 | (self.in1 & self.in2)
147 | (self.in2 & self.in0))
148 return m
149
150
151 class MaskedFullAdder(Elaboratable):
152 """Masked Full Adder.
153
154 :attribute mask: the carry partition mask
155 :attribute in0: the first input
156 :attribute in1: the second input
157 :attribute in2: the third input
158 :attribute sum: the sum output
159 :attribute mcarry: the masked carry output
160
161 FullAdders are always used with a "mask" on the output. To keep
162 the graphviz "clean", this class performs the masking here rather
163 than inside a large for-loop.
164
165 See the following discussion as to why this is no longer derived
166 from FullAdder. Each carry is shifted here *before* being ANDed
167 with the mask, so that an AOI cell may be used (which is more
168     gate-efficient):
169 https://en.wikipedia.org/wiki/AND-OR-Invert
170 https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
171 """
172
173 def __init__(self, width):
174 """Create a ``MaskedFullAdder``.
175
176 :param width: the bit width of the input and output
177 """
178 self.width = width
179 self.mask = Signal(width, reset_less=True)
180 self.mcarry = Signal(width, reset_less=True)
181 self.in0 = Signal(width, reset_less=True)
182 self.in1 = Signal(width, reset_less=True)
183 self.in2 = Signal(width, reset_less=True)
184 self.sum = Signal(width, reset_less=True)
185
186 def elaborate(self, platform):
187 """Elaborate this module."""
188 m = Module()
189 s1 = Signal(self.width, reset_less=True)
190 s2 = Signal(self.width, reset_less=True)
191 s3 = Signal(self.width, reset_less=True)
192 c1 = Signal(self.width, reset_less=True)
193 c2 = Signal(self.width, reset_less=True)
194 c3 = Signal(self.width, reset_less=True)
195 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
196 m.d.comb += s1.eq(Cat(0, self.in0))
197 m.d.comb += s2.eq(Cat(0, self.in1))
198 m.d.comb += s3.eq(Cat(0, self.in2))
199 m.d.comb += c1.eq(s1 & s2 & self.mask)
200 m.d.comb += c2.eq(s2 & s3 & self.mask)
201 m.d.comb += c3.eq(s3 & s1 & self.mask)
202 m.d.comb += self.mcarry.eq(c1 | c2 | c3)
203 return m
204
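# Note (illustrative only): per bit position, the module above computes the
# standard 3:2 compression with the carry pre-shifted and masked, i.e.
#     sum    = in0 ^ in1 ^ in2
#     mcarry = (((in0 & in1) | (in1 & in2) | (in2 & in0)) << 1) & mask
# so  in0 + in1 + in2 == sum + mcarry  (per lane, modulo the lane width),
# which is how AddReduceSingle reduces three terms to two further below.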
205
206 class PartitionedAdder(Elaboratable):
207 """Partitioned Adder.
208
209 Performs the final add. The partition points are included in the
210 actual add (in one of the operands only), which causes a carry over
211 to the next bit. Then the final output *removes* the extra bits from
212 the result.
213
214 partition: .... P... P... P... P... (32 bits)
215 a : .... .... .... .... .... (32 bits)
216 b : .... .... .... .... .... (32 bits)
217 exp-a : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
218 exp-b : ....0....0....0....0.... (32 bits plus 4 zeros)
219 exp-o : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
220 o : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
221
222 :attribute width: the bit width of the input and output. Read-only.
223 :attribute a: the first input to the adder
224 :attribute b: the second input to the adder
225 :attribute output: the sum output
226 :attribute partition_points: the input partition points. Modification not
227         supported, except via ``Signal.eq``.
228 """
229
230 def __init__(self, width, partition_points):
231 """Create a ``PartitionedAdder``.
232
233 :param width: the bit width of the input and output
234 :param partition_points: the input partition points
235 """
236 self.width = width
237 self.a = Signal(width)
238 self.b = Signal(width)
239 self.output = Signal(width)
240 self.partition_points = PartitionPoints(partition_points)
241 if not self.partition_points.fits_in_width(width):
242 raise ValueError("partition_points doesn't fit in width")
243 expanded_width = 0
244 for i in range(self.width):
245 if i in self.partition_points:
246 expanded_width += 1
247 expanded_width += 1
248 self._expanded_width = expanded_width
249 # XXX these have to remain here due to some horrible nmigen
250 # simulation bugs involving sync. it is *not* necessary to
251 # have them here, they should (under normal circumstances)
252 # be moved into elaborate, as they are entirely local
253 self._expanded_a = Signal(expanded_width) # includes extra part-points
254 self._expanded_b = Signal(expanded_width) # likewise.
255 self._expanded_o = Signal(expanded_width) # likewise.
256
257 def elaborate(self, platform):
258 """Elaborate this module."""
259 m = Module()
260 expanded_index = 0
261 # store bits in a list, use Cat later. graphviz is much cleaner
262 al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
263
264         # partition points are "breaks" (extra zeros or 1s) in what would
265         # otherwise be one massive long add.  when a "break" bit is 0,
266         # whatever lands in it (in the intermediate output) is discarded.
267         # when it is a "1", however, it causes a roll-over carry into the
268         # *next* bit.  the "break" bit is still ignored in the
269         # [intermediate] output, but by that time it has had the desired
270         # effect: the carry has been carried *over* the break point.
271
272 for i in range(self.width):
273 if i in self.partition_points:
274 # add extra bit set to 0 + 0 for enabled partition points
275 # and 1 + 0 for disabled partition points
276 ea.append(self._expanded_a[expanded_index])
277 al.append(~self.partition_points[i]) # add extra bit in a
278 eb.append(self._expanded_b[expanded_index])
279 bl.append(C(0)) # yes, add a zero
280 expanded_index += 1 # skip the extra point. NOT in the output
281 ea.append(self._expanded_a[expanded_index])
282 eb.append(self._expanded_b[expanded_index])
283 eo.append(self._expanded_o[expanded_index])
284 al.append(self.a[i])
285 bl.append(self.b[i])
286 ol.append(self.output[i])
287 expanded_index += 1
288
289 # combine above using Cat
290 m.d.comb += Cat(*ea).eq(Cat(*al))
291 m.d.comb += Cat(*eb).eq(Cat(*bl))
292 m.d.comb += Cat(*ol).eq(Cat(*eo))
293
294 # use only one addition to take advantage of look-ahead carry and
295 # special hardware on FPGAs
296 m.d.comb += self._expanded_o.eq(
297 self._expanded_a + self._expanded_b)
298 return m
299
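# Example sketch (illustrative only; not used elsewhere in this module):
# an 8-bit PartitionedAdder with a single break at bit 4.  When "split" is
# asserted the two nibbles add independently; when it is clear, the carry
# out of bit 3 propagates into bit 4 exactly as in a plain 8-bit add.
def _example_partitioned_adder():
    split = Signal(name="split")
    return PartitionedAdder(8, {4: split})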
300
301 FULL_ADDER_INPUT_COUNT = 3
302
303
304 class AddReduceSingle(Elaboratable):
305 """Add list of numbers together.
306
307 :attribute inputs: input ``Signal``s to be summed. Modification not
308         supported, except via ``Signal.eq``.
309 :attribute register_levels: List of nesting levels that should have
310 pipeline registers.
311 :attribute output: output sum.
312 :attribute partition_points: the input partition points. Modification not
313         supported, except via ``Signal.eq``.
314 """
315
316 def __init__(self, inputs, output_width, register_levels, partition_points,
317 part_ops):
318         """Create an ``AddReduceSingle``.
319
320 :param inputs: input ``Signal``s to be summed.
321 :param output_width: bit-width of ``output``.
322 :param register_levels: List of nesting levels that should have
323 pipeline registers.
324 :param partition_points: the input partition points.
325 """
326 self.part_ops = part_ops
327 self.out_part_ops = [Signal(2, name=f"out_part_ops_{i}")
328 for i in range(len(part_ops))]
329 self.inputs = list(inputs)
330 self._resized_inputs = [
331 Signal(output_width, name=f"resized_inputs[{i}]")
332 for i in range(len(self.inputs))]
333 self.register_levels = list(register_levels)
334 self.output = Signal(output_width)
335 self.partition_points = PartitionPoints(partition_points)
336 if not self.partition_points.fits_in_width(output_width):
337 raise ValueError("partition_points doesn't fit in output_width")
338 self._reg_partition_points = self.partition_points.like()
339
340 max_level = AddReduceSingle.get_max_level(len(self.inputs))
341 for level in self.register_levels:
342 if level > max_level:
343 raise ValueError(
344 "not enough adder levels for specified register levels")
345
346 # this is annoying. we have to create the modules (and terms)
347 # because we need to know what they are (in order to set up the
348 # interconnects back in AddReduce), but cannot do the m.d.comb +=
349         # etc. because this is not inside elaborate().
350 self.groups = AddReduceSingle.full_adder_groups(len(self.inputs))
351 self._intermediate_terms = []
352 if len(self.groups) != 0:
353 self.create_next_terms()
354
355 @staticmethod
356 def get_max_level(input_count):
357 """Get the maximum level.
358
359 All ``register_levels`` must be less than or equal to the maximum
360 level.
361 """
362 retval = 0
363 while True:
364 groups = AddReduceSingle.full_adder_groups(input_count)
365 if len(groups) == 0:
366 return retval
367 input_count %= FULL_ADDER_INPUT_COUNT
368 input_count += 2 * len(groups)
369 retval += 1
370
371 @staticmethod
372 def full_adder_groups(input_count):
373 """Get ``inputs`` indices for which a full adder should be built."""
374 return range(0,
375 input_count - FULL_ADDER_INPUT_COUNT + 1,
376 FULL_ADDER_INPUT_COUNT)
377
378 def elaborate(self, platform):
379 """Elaborate this module."""
380 m = Module()
381
382 # resize inputs to correct bit-width and optionally add in
383 # pipeline registers
384 resized_input_assignments = [self._resized_inputs[i].eq(self.inputs[i])
385 for i in range(len(self.inputs))]
386 copy_part_ops = [self.out_part_ops[i].eq(self.part_ops[i])
387 for i in range(len(self.part_ops))]
388 if 0 in self.register_levels:
389 m.d.sync += copy_part_ops
390 m.d.sync += resized_input_assignments
391 m.d.sync += self._reg_partition_points.eq(self.partition_points)
392 else:
393 m.d.comb += copy_part_ops
394 m.d.comb += resized_input_assignments
395 m.d.comb += self._reg_partition_points.eq(self.partition_points)
396
397 for (value, term) in self._intermediate_terms:
398 m.d.comb += term.eq(value)
399
400 # if there are no full adders to create, then we handle the base cases
401 # and return, otherwise we go on to the recursive case
402 if len(self.groups) == 0:
403 if len(self.inputs) == 0:
404 # use 0 as the default output value
405 m.d.comb += self.output.eq(0)
406 elif len(self.inputs) == 1:
407 # handle single input
408 m.d.comb += self.output.eq(self._resized_inputs[0])
409 else:
410 # base case for adding 2 inputs
411 assert len(self.inputs) == 2
412 adder = PartitionedAdder(len(self.output),
413 self._reg_partition_points)
414 m.submodules.final_adder = adder
415 m.d.comb += adder.a.eq(self._resized_inputs[0])
416 m.d.comb += adder.b.eq(self._resized_inputs[1])
417 m.d.comb += self.output.eq(adder.output)
418 return m
419
420 mask = self._reg_partition_points.as_mask(len(self.output))
421 m.d.comb += self.part_mask.eq(mask)
422
423 # add and link the intermediate term modules
424 for i, (iidx, adder_i) in enumerate(self.adders):
425 setattr(m.submodules, f"adder_{i}", adder_i)
426
427 m.d.comb += adder_i.in0.eq(self._resized_inputs[iidx])
428 m.d.comb += adder_i.in1.eq(self._resized_inputs[iidx + 1])
429 m.d.comb += adder_i.in2.eq(self._resized_inputs[iidx + 2])
430 m.d.comb += adder_i.mask.eq(self.part_mask)
431
432 return m
433
434 def create_next_terms(self):
435
436 # go on to prepare recursive case
437 intermediate_terms = []
438 _intermediate_terms = []
439
440 def add_intermediate_term(value):
441 intermediate_term = Signal(
442 len(self.output),
443 name=f"intermediate_terms[{len(intermediate_terms)}]")
444 _intermediate_terms.append((value, intermediate_term))
445 intermediate_terms.append(intermediate_term)
446
447 # store mask in intermediary (simplifies graph)
448 self.part_mask = Signal(len(self.output), reset_less=True)
449
450 # create full adders for this recursive level.
451 # this shrinks N terms to 2 * (N // 3) plus the remainder
452 self.adders = []
453 for i in self.groups:
454 adder_i = MaskedFullAdder(len(self.output))
455 self.adders.append((i, adder_i))
456 # add both the sum and the masked-carry to the next level.
457 # 3 inputs have now been reduced to 2...
458 add_intermediate_term(adder_i.sum)
459 add_intermediate_term(adder_i.mcarry)
460 # handle the remaining inputs.
461 if len(self.inputs) % FULL_ADDER_INPUT_COUNT == 1:
462 add_intermediate_term(self._resized_inputs[-1])
463 elif len(self.inputs) % FULL_ADDER_INPUT_COUNT == 2:
464             # Just pass the two remaining terms to the next layer: a half
465             # adder would not reduce the term count (there would still be
466             # 2 terms), so passing them through directly saves gates.
467 add_intermediate_term(self._resized_inputs[-2])
468 add_intermediate_term(self._resized_inputs[-1])
469 else:
470 assert len(self.inputs) % FULL_ADDER_INPUT_COUNT == 0
471
472 self.intermediate_terms = intermediate_terms
473 self._intermediate_terms = _intermediate_terms
474
475
476 class AddReduce(Elaboratable):
477 """Recursively Add list of numbers together.
478
479 :attribute inputs: input ``Signal``s to be summed. Modification not
480         supported, except via ``Signal.eq``.
481 :attribute register_levels: List of nesting levels that should have
482 pipeline registers.
483 :attribute output: output sum.
484 :attribute partition_points: the input partition points. Modification not
485         supported, except via ``Signal.eq``.
486 """
487
488 def __init__(self, inputs, output_width, register_levels, partition_points,
489 part_ops):
490 """Create an ``AddReduce``.
491
492 :param inputs: input ``Signal``s to be summed.
493 :param output_width: bit-width of ``output``.
494 :param register_levels: List of nesting levels that should have
495 pipeline registers.
496 :param partition_points: the input partition points.
497 """
498 self.inputs = inputs
499 self.part_ops = part_ops
500 self.out_part_ops = [Signal(2, name=f"out_part_ops_{i}")
501 for i in range(len(part_ops))]
502 self.output = Signal(output_width)
503 self.output_width = output_width
504 self.register_levels = register_levels
505 self.partition_points = partition_points
506
507 self.create_levels()
508
509 @staticmethod
510 def get_max_level(input_count):
511 return AddReduceSingle.get_max_level(input_count)
512
513 @staticmethod
514 def next_register_levels(register_levels):
515 """``Iterable`` of ``register_levels`` for next recursive level."""
516 for level in register_levels:
517 if level > 0:
518 yield level - 1
519
520 def create_levels(self):
521 """creates reduction levels"""
522
523 mods = []
524 next_levels = self.register_levels
525 partition_points = self.partition_points
526 inputs = self.inputs
527 part_ops = self.part_ops
528 while True:
529 next_level = AddReduceSingle(inputs, self.output_width, next_levels,
530 partition_points, part_ops)
531 mods.append(next_level)
532 if len(next_level.groups) == 0:
533 break
534 next_levels = list(AddReduce.next_register_levels(next_levels))
535 partition_points = next_level._reg_partition_points
536 inputs = next_level.intermediate_terms
537 part_ops = next_level.out_part_ops
538
539 self.levels = mods
540
541 def elaborate(self, platform):
542 """Elaborate this module."""
543 m = Module()
544
545 for i, next_level in enumerate(self.levels):
546 setattr(m.submodules, "next_level%d" % i, next_level)
547
548 # output comes from last module
549 m.d.comb += self.output.eq(next_level.output)
550 copy_part_ops = [self.out_part_ops[i].eq(next_level.out_part_ops[i])
551 for i in range(len(self.part_ops))]
552 m.d.comb += copy_part_ops
553
554 return m
555
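# Note (illustrative only): an example of the 3:2 reduction schedule above.
# With 16 input terms the level sizes are 16 -> 11 -> 8 -> 6 -> 4 -> 3 -> 2:
# each level turns every group of three terms into a sum plus a shifted
# carry and passes any remainder (0, 1 or 2 terms) straight through, and
# the final two terms are summed by a single PartitionedAdder.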
556
557 OP_MUL_LOW = 0
558 OP_MUL_SIGNED_HIGH = 1
559 OP_MUL_SIGNED_UNSIGNED_HIGH = 2 # a is signed, b is unsigned
560 OP_MUL_UNSIGNED_HIGH = 3
561
562
563 def get_term(value, shift=0, enabled=None):
564 if enabled is not None:
565 value = Mux(enabled, value, 0)
566 if shift > 0:
567 value = Cat(Repl(C(0, 1), shift), value)
568 else:
569 assert shift == 0
570 return value
571
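# Note (illustrative only): get_term(v, 16, en) is conceptually
# Mux(en, v, 0) shifted left by 16, i.e. the value is gated by "enabled"
# and padded with "shift" zero LSBs, which positions a partial product at
# its byte offset within the wide result.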
572
573 class ProductTerm(Elaboratable):
574 """ this class creates a single product term (a[..]*b[..]).
575         it has a design flaw in that it is the *output* that is selected,
576 where the multiplication(s) are combinatorially generated
577 all the time.
578 """
579
580 def __init__(self, width, twidth, pbwid, a_index, b_index):
581 self.a_index = a_index
582 self.b_index = b_index
583 shift = 8 * (self.a_index + self.b_index)
584 self.pwidth = width
585 self.twidth = twidth
586 self.width = width*2
587 self.shift = shift
588
589 self.ti = Signal(self.width, reset_less=True)
590 self.term = Signal(twidth, reset_less=True)
591 self.a = Signal(twidth//2, reset_less=True)
592 self.b = Signal(twidth//2, reset_less=True)
593 self.pb_en = Signal(pbwid, reset_less=True)
594
595 self.tl = tl = []
596 min_index = min(self.a_index, self.b_index)
597 max_index = max(self.a_index, self.b_index)
598 for i in range(min_index, max_index):
599 tl.append(self.pb_en[i])
600 name = "te_%d_%d" % (self.a_index, self.b_index)
601 if len(tl) > 0:
602 term_enabled = Signal(name=name, reset_less=True)
603 else:
604 term_enabled = None
605 self.enabled = term_enabled
606 self.term.name = "term_%d_%d" % (a_index, b_index) # rename
607
608 def elaborate(self, platform):
609
610 m = Module()
611 if self.enabled is not None:
612 m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
613
614 bsa = Signal(self.width, reset_less=True)
615 bsb = Signal(self.width, reset_less=True)
616 a_index, b_index = self.a_index, self.b_index
617 pwidth = self.pwidth
618 m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
619 m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
620 m.d.comb += self.ti.eq(bsa * bsb)
621 m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
622 """
623 #TODO: sort out width issues, get inputs a/b switched on/off.
624 #data going into Muxes is 1/2 the required width
625
626 pwidth = self.pwidth
627 width = self.width
628 bsa = Signal(self.twidth//2, reset_less=True)
629 bsb = Signal(self.twidth//2, reset_less=True)
630 asel = Signal(width, reset_less=True)
631 bsel = Signal(width, reset_less=True)
632 a_index, b_index = self.a_index, self.b_index
633 m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
634 m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
635 m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
636 m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
637 m.d.comb += self.ti.eq(bsa * bsb)
638 m.d.comb += self.term.eq(self.ti)
639 """
640
641 return m
642
643
644 class ProductTerms(Elaboratable):
645     """ creates a bank of product terms. also performs the actual bit-selection.
646 this class is to be wrapped with a for-loop on the "a" operand.
647 it creates a second-level for-loop on the "b" operand.
648 """
649 def __init__(self, width, twidth, pbwid, a_index, blen):
650 self.a_index = a_index
651 self.blen = blen
652 self.pwidth = width
653 self.twidth = twidth
654 self.pbwid = pbwid
655 self.a = Signal(twidth//2, reset_less=True)
656 self.b = Signal(twidth//2, reset_less=True)
657 self.pb_en = Signal(pbwid, reset_less=True)
658 self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
659 for i in range(blen)]
660
661 def elaborate(self, platform):
662
663 m = Module()
664
665 for b_index in range(self.blen):
666 t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
667 self.a_index, b_index)
668 setattr(m.submodules, "term_%d" % b_index, t)
669
670 m.d.comb += t.a.eq(self.a)
671 m.d.comb += t.b.eq(self.b)
672 m.d.comb += t.pb_en.eq(self.pb_en)
673
674 m.d.comb += self.terms[b_index].eq(t.term)
675
676 return m
677
678
679 class LSBNegTerm(Elaboratable):
680
681 def __init__(self, bit_width):
682 self.bit_width = bit_width
683 self.part = Signal(reset_less=True)
684 self.signed = Signal(reset_less=True)
685 self.op = Signal(bit_width, reset_less=True)
686 self.msb = Signal(reset_less=True)
687 self.nt = Signal(bit_width*2, reset_less=True)
688 self.nl = Signal(bit_width*2, reset_less=True)
689
690 def elaborate(self, platform):
691 m = Module()
692 comb = m.d.comb
693 bit_wid = self.bit_width
694 ext = Repl(0, bit_wid) # extend output to HI part
695
696 # determine sign of each incoming number *in this partition*
697 enabled = Signal(reset_less=True)
698 m.d.comb += enabled.eq(self.part & self.msb & self.signed)
699
700 # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
701 # negation operation is split into a bitwise not and a +1.
702 # likewise for 16, 32, and 64-bit values.
703
704 # width-extended 1s complement if a is signed, otherwise zero
705 comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
706
707 # add 1 if signed, otherwise add zero
708 comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
709
710 return m
711
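# Note (illustrative only): worked 8-bit example of the split above.
# The sign-correction term a * 0xFF00 is congruent to -a * 0x100 (mod 2**16),
# and -a == ~a + 1, so it is emitted as two separate terms,
#     nt = (~a) << 8    and    nl = 1 << 8
# which AddReduce later sums along with the ordinary partial products.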
712
713 class Parts(Elaboratable):
714
715 def __init__(self, pbwid, epps, n_parts):
716 self.pbwid = pbwid
717 # inputs
718 self.epps = PartitionPoints.like(epps, name="epps") # expanded points
719 # outputs
720 self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
721
722 def elaborate(self, platform):
723 m = Module()
724
725 epps, parts = self.epps, self.parts
726 # collect part-bytes (double factor because the input is extended)
727 pbs = Signal(self.pbwid, reset_less=True)
728 tl = []
729 for i in range(self.pbwid):
730 pb = Signal(name="pb%d" % i, reset_less=True)
731 m.d.comb += pb.eq(epps.part_byte(i, mfactor=2)) # double
732 tl.append(pb)
733 m.d.comb += pbs.eq(Cat(*tl))
734
735 # negated-temporary copy of partition bits
736 npbs = Signal.like(pbs, reset_less=True)
737 m.d.comb += npbs.eq(~pbs)
738 byte_count = 8 // len(parts)
739 for i in range(len(parts)):
740 pbl = []
741 pbl.append(npbs[i * byte_count - 1])
742 for j in range(i * byte_count, (i + 1) * byte_count - 1):
743 pbl.append(pbs[j])
744 pbl.append(npbs[(i + 1) * byte_count - 1])
745 value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
746 m.d.comb += value.eq(Cat(*pbl))
747 m.d.comb += parts[i].eq(~(value).bool())
748
749 return m
750
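# Note (illustrative only, reading of the logic above): for lane i, "value"
# gathers the inverted partition bits at the lane's two outer boundaries and
# the plain partition bits at its interior byte boundaries, so parts[i] is 1
# exactly when the lane is closed at both ends and has no internal break,
# i.e. when a partition of exactly this lane width is actually requested.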
751
752 class Part(Elaboratable):
753 """ a key class which, depending on the partitioning, will determine
754 what action to take when parts of the output are signed or unsigned.
755
756 this requires 2 pieces of data *per operand, per partition*:
757 whether the MSB is HI/LO (per partition!), and whether a signed
758 or unsigned operation has been *requested*.
759
760         once that is determined, signed multiplication is basically carried
761         out by splitting 2's complement negation into 1's complement plus one.
762 1's complement is just a bit-inversion.
763
764 the extra terms - as separate terms - are then thrown at the
765 AddReduce alongside the multiplication part-results.
766 """
767 def __init__(self, epps, width, n_parts, n_levels, pbwid):
768
769 self.pbwid = pbwid
770 self.epps = epps
771
772 # inputs
773 self.a = Signal(64)
774 self.b = Signal(64)
775 self.a_signed = [Signal(name=f"a_signed_{i}") for i in range(8)]
776         self.b_signed = [Signal(name=f"b_signed_{i}") for i in range(8)]
777 self.pbs = Signal(pbwid, reset_less=True)
778
779 # outputs
780 self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
781 self.delayed_parts = [
782 [Signal(name=f"delayed_part_{delay}_{i}")
783 for i in range(n_parts)]
784 for delay in range(n_levels)]
785 # XXX REALLY WEIRD BUG - have to take a copy of the last delayed_parts
786 self.dplast = [Signal(name=f"dplast_{i}")
787 for i in range(n_parts)]
788
789 self.not_a_term = Signal(width)
790 self.neg_lsb_a_term = Signal(width)
791 self.not_b_term = Signal(width)
792 self.neg_lsb_b_term = Signal(width)
793
794 def elaborate(self, platform):
795 m = Module()
796
797 pbs, parts, delayed_parts = self.pbs, self.parts, self.delayed_parts
798 epps = self.epps
799 m.submodules.p = p = Parts(self.pbwid, epps, len(parts))
800 m.d.comb += p.epps.eq(epps)
801 parts = p.parts
802
803 npbs = Signal.like(pbs, reset_less=True)
804 byte_count = 8 // len(parts)
805 for i in range(len(parts)):
806 m.d.comb += delayed_parts[0][i].eq(parts[i])
807 m.d.sync += [delayed_parts[j + 1][i].eq(delayed_parts[j][i])
808 for j in range(len(delayed_parts)-1)]
809 m.d.comb += self.dplast[i].eq(delayed_parts[-1][i])
810
811 not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = \
812 self.not_a_term, self.neg_lsb_a_term, \
813 self.not_b_term, self.neg_lsb_b_term
814
815 byte_width = 8 // len(parts) # byte width
816 bit_wid = 8 * byte_width # bit width
817 nat, nbt, nla, nlb = [], [], [], []
818 for i in range(len(parts)):
819 # work out bit-inverted and +1 term for a.
820 pa = LSBNegTerm(bit_wid)
821 setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
822 m.d.comb += pa.part.eq(parts[i])
823 m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
824 m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
825 m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
826 nat.append(pa.nt)
827 nla.append(pa.nl)
828
829 # work out bit-inverted and +1 term for b
830 pb = LSBNegTerm(bit_wid)
831 setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
832 m.d.comb += pb.part.eq(parts[i])
833 m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
834 m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
835 m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
836 nbt.append(pb.nt)
837 nlb.append(pb.nl)
838
839 # concatenate together and return all 4 results.
840 m.d.comb += [not_a_term.eq(Cat(*nat)),
841 not_b_term.eq(Cat(*nbt)),
842 neg_lsb_a_term.eq(Cat(*nla)),
843 neg_lsb_b_term.eq(Cat(*nlb)),
844 ]
845
846 return m
847
848
849 class IntermediateOut(Elaboratable):
850     """ selects the HI/LO part of the multiplication; for a given bit-width,
851 the output is also reconstructed in its SIMD (partition) lanes.
852 """
853 def __init__(self, width, out_wid, n_parts):
854 self.width = width
855 self.n_parts = n_parts
856 self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
857 for i in range(8)]
858 self.intermed = Signal(out_wid, reset_less=True)
859 self.output = Signal(out_wid//2, reset_less=True)
860
861 def elaborate(self, platform):
862 m = Module()
863
864 ol = []
865 w = self.width
866 sel = w // 8
867 for i in range(self.n_parts):
868 op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
869 m.d.comb += op.eq(
870 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
871 self.intermed.part(i * w*2, w),
872 self.intermed.part(i * w*2 + w, w)))
873 ol.append(op)
874 m.d.comb += self.output.eq(Cat(*ol))
875
876 return m
877
878
879 class FinalOut(Elaboratable):
880 """ selects the final output based on the partitioning.
881
882 each byte is selectable independently, i.e. it is possible
883 that some partitions requested 8-bit computation whilst others
884 requested 16 or 32 bit.
885 """
886 def __init__(self, out_wid):
887 # inputs
888 self.d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
889 self.d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
890 self.d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
891
892 self.i8 = Signal(out_wid, reset_less=True)
893 self.i16 = Signal(out_wid, reset_less=True)
894 self.i32 = Signal(out_wid, reset_less=True)
895 self.i64 = Signal(out_wid, reset_less=True)
896
897 # output
898 self.out = Signal(out_wid, reset_less=True)
899
900 def elaborate(self, platform):
901 m = Module()
902 ol = []
903 for i in range(8):
904             # select one of the outputs: d8 selects i8, d16 selects i16,
905             # d32 selects i32, and the default is i64.
906             # d8 and d16 are ORed together in the first Mux,
907             # then the 2nd selects either i8 or i16.
908             # if neither d8 nor d16 is set, d32 selects either i32 or i64.
909 op = Signal(8, reset_less=True, name="op_%d" % i)
910 m.d.comb += op.eq(
911 Mux(self.d8[i] | self.d16[i // 2],
912 Mux(self.d8[i], self.i8.part(i * 8, 8),
913 self.i16.part(i * 8, 8)),
914 Mux(self.d32[i // 4], self.i32.part(i * 8, 8),
915 self.i64.part(i * 8, 8))))
916 ol.append(op)
917 m.d.comb += self.out.eq(Cat(*ol))
918 return m
919
920
921 class OrMod(Elaboratable):
922 """ ORs four values together in a hierarchical tree
923 """
924 def __init__(self, wid):
925 self.wid = wid
926 self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
927 for i in range(4)]
928 self.orout = Signal(wid, reset_less=True)
929
930 def elaborate(self, platform):
931 m = Module()
932 or1 = Signal(self.wid, reset_less=True)
933 or2 = Signal(self.wid, reset_less=True)
934 m.d.comb += or1.eq(self.orin[0] | self.orin[1])
935 m.d.comb += or2.eq(self.orin[2] | self.orin[3])
936 m.d.comb += self.orout.eq(or1 | or2)
937
938 return m
939
940
941 class Signs(Elaboratable):
942     """ determines whether a and b are to be treated as signed numbers,
943         based on the requested operation type (OP_MUL_*)
944 """
945
946 def __init__(self):
947 self.part_ops = Signal(2, reset_less=True)
948 self.a_signed = Signal(reset_less=True)
949 self.b_signed = Signal(reset_less=True)
950
951 def elaborate(self, platform):
952
953 m = Module()
954
955 asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
956 bsig = (self.part_ops == OP_MUL_LOW) \
957 | (self.part_ops == OP_MUL_SIGNED_HIGH)
958 m.d.comb += self.a_signed.eq(asig)
959 m.d.comb += self.b_signed.eq(bsig)
960
961 return m
962
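# Note (illustrative only): signedness per operation, as encoded above.
#     OP_MUL_LOW                  -> a signed,   b signed
#     OP_MUL_SIGNED_HIGH          -> a signed,   b signed
#     OP_MUL_SIGNED_UNSIGNED_HIGH -> a signed,   b unsigned
#     OP_MUL_UNSIGNED_HIGH        -> a unsigned, b unsigned
# (for OP_MUL_LOW the low half of the product is the same either way, so
# treating both operands as signed is harmless.)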
963
964 class Mul8_16_32_64(Elaboratable):
965 """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
966
967 Supports partitioning into any combination of 8, 16, 32, and 64-bit
968 partitions on naturally-aligned boundaries. Supports the operation being
969 set for each partition independently.
970
971 :attribute part_pts: the input partition points. Has a partition point at
972 multiples of 8 in 0 < i < 64. Each partition point's associated
973         ``Value`` is a ``Signal``. Modification not supported, except via
974 ``Signal.eq``.
975 :attribute part_ops: the operation for each byte. The operation for a
976 particular partition is selected by assigning the selected operation
977 code to each byte in the partition. The allowed operation codes are:
978
979 :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
980 RISC-V's `mul` instruction.
981 :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
982 ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
983 instruction.
984 :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
985 where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
986 `mulhsu` instruction.
987 :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
988 ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
989 instruction.
990 """
991
992 def __init__(self, register_levels=()):
993 """ register_levels: specifies the points in the cascade at which
994 flip-flops are to be inserted.
995 """
996
997 # parameter(s)
998 self.register_levels = list(register_levels)
999
1000 # inputs
1001 self.part_pts = PartitionPoints()
1002 for i in range(8, 64, 8):
1003 self.part_pts[i] = Signal(name=f"part_pts_{i}")
1004 self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1005 self.a = Signal(64)
1006 self.b = Signal(64)
1007
1008 # intermediates (needed for unit tests)
1009 self._intermediate_output = Signal(128)
1010
1011 # output
1012 self.output = Signal(64)
1013
1014 def elaborate(self, platform):
1015 m = Module()
1016
1017 # collect part-bytes
1018 pbs = Signal(8, reset_less=True)
1019 tl = []
1020 for i in range(8):
1021 pb = Signal(name="pb%d" % i, reset_less=True)
1022 m.d.comb += pb.eq(self.part_pts.part_byte(i))
1023 tl.append(pb)
1024 m.d.comb += pbs.eq(Cat(*tl))
1025
1026 # create (doubled) PartitionPoints (output is double input width)
1027 expanded_part_pts = eps = PartitionPoints()
1028 for i, v in self.part_pts.items():
1029 ep = Signal(name=f"expanded_part_pts_{i*2}", reset_less=True)
1030 expanded_part_pts[i * 2] = ep
1031 m.d.comb += ep.eq(v)
1032
1033 # local variables
1034 signs = []
1035 for i in range(8):
1036 s = Signs()
1037 signs.append(s)
1038 setattr(m.submodules, "signs%d" % i, s)
1039 m.d.comb += s.part_ops.eq(self.part_ops[i])
1040
1041 n_levels = len(self.register_levels)+1
1042 m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
1043 m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
1044 m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
1045 m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
1046 nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
1047 for mod in [part_8, part_16, part_32, part_64]:
1048 m.d.comb += mod.a.eq(self.a)
1049 m.d.comb += mod.b.eq(self.b)
1050 for i in range(len(signs)):
1051 m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
1052 m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
1053 m.d.comb += mod.pbs.eq(pbs)
1054 nat_l.append(mod.not_a_term)
1055 nbt_l.append(mod.not_b_term)
1056 nla_l.append(mod.neg_lsb_a_term)
1057 nlb_l.append(mod.neg_lsb_b_term)
1058
1059 terms = []
1060
1061 for a_index in range(8):
1062 t = ProductTerms(8, 128, 8, a_index, 8)
1063 setattr(m.submodules, "terms_%d" % a_index, t)
1064
1065 m.d.comb += t.a.eq(self.a)
1066 m.d.comb += t.b.eq(self.b)
1067 m.d.comb += t.pb_en.eq(pbs)
1068
1069 for term in t.terms:
1070 terms.append(term)
1071
1072 # it's fine to bitwise-or data together since they are never enabled
1073 # at the same time
1074 m.submodules.nat_or = nat_or = OrMod(128)
1075 m.submodules.nbt_or = nbt_or = OrMod(128)
1076 m.submodules.nla_or = nla_or = OrMod(128)
1077 m.submodules.nlb_or = nlb_or = OrMod(128)
1078 for l, mod in [(nat_l, nat_or),
1079 (nbt_l, nbt_or),
1080 (nla_l, nla_or),
1081 (nlb_l, nlb_or)]:
1082 for i in range(len(l)):
1083 m.d.comb += mod.orin[i].eq(l[i])
1084 terms.append(mod.orout)
1085
1086 add_reduce = AddReduce(terms,
1087 128,
1088 self.register_levels,
1089 expanded_part_pts,
1090 self.part_ops)
1091
1092 out_part_ops = add_reduce.levels[-1].out_part_ops
1093 out_part_pts = add_reduce.levels[-1]._reg_partition_points
1094
1095 m.submodules.add_reduce = add_reduce
1096 m.d.comb += self._intermediate_output.eq(add_reduce.output)
1097 # create _output_64
1098 m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
1099 m.d.comb += io64.intermed.eq(self._intermediate_output)
1100 for i in range(8):
1101 m.d.comb += io64.part_ops[i].eq(out_part_ops[i])
1102
1103 # create _output_32
1104 m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
1105 m.d.comb += io32.intermed.eq(self._intermediate_output)
1106 for i in range(8):
1107 m.d.comb += io32.part_ops[i].eq(out_part_ops[i])
1108
1109 # create _output_16
1110 m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
1111 m.d.comb += io16.intermed.eq(self._intermediate_output)
1112 for i in range(8):
1113 m.d.comb += io16.part_ops[i].eq(out_part_ops[i])
1114
1115 # create _output_8
1116 m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
1117 m.d.comb += io8.intermed.eq(self._intermediate_output)
1118 for i in range(8):
1119 m.d.comb += io8.part_ops[i].eq(out_part_ops[i])
1120
1121 m.submodules.p_8 = p_8 = Parts(8, eps, len(part_8.parts))
1122 m.submodules.p_16 = p_16 = Parts(8, eps, len(part_16.parts))
1123 m.submodules.p_32 = p_32 = Parts(8, eps, len(part_32.parts))
1124 m.submodules.p_64 = p_64 = Parts(8, eps, len(part_64.parts))
1125
1126 m.d.comb += p_8.epps.eq(out_part_pts)
1127 m.d.comb += p_16.epps.eq(out_part_pts)
1128 m.d.comb += p_32.epps.eq(out_part_pts)
1129 m.d.comb += p_64.epps.eq(out_part_pts)
1130
1131 # final output
1132 m.submodules.finalout = finalout = FinalOut(64)
1133 for i in range(len(part_8.parts)):
1134 m.d.comb += finalout.d8[i].eq(p_8.parts[i])
1135 for i in range(len(part_16.parts)):
1136 m.d.comb += finalout.d16[i].eq(p_16.parts[i])
1137 for i in range(len(part_32.parts)):
1138 m.d.comb += finalout.d32[i].eq(p_32.parts[i])
1139 m.d.comb += finalout.i8.eq(io8.output)
1140 m.d.comb += finalout.i16.eq(io16.output)
1141 m.d.comb += finalout.i32.eq(io32.output)
1142 m.d.comb += finalout.i64.eq(io64.output)
1143 m.d.comb += self.output.eq(finalout.out)
1144
1145 return m
1146
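# Example sketch (illustrative only; not used elsewhere in this module):
# configure the multiplier for two independent 32-bit multiplies, keeping
# the low 32 bits of each product.  The helper name and the enclosing
# Module are arbitrary; drive mul.a / mul.b and read mul.output.
def _example_mul_2x32():
    m = Module()
    m.submodules.mul = mul = Mul8_16_32_64()
    # enable only the partition point at bit 32: two 32-bit lanes
    m.d.comb += [pt.eq(1 if point == 32 else 0)
                 for point, pt in mul.part_pts.items()]
    # every byte requests the "return the LSB half" operation
    m.d.comb += [op.eq(OP_MUL_LOW) for op in mul.part_ops]
    return m, mul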
1147
1148 if __name__ == "__main__":
1149 m = Mul8_16_32_64()
1150 main(m, ports=[m.a,
1151 m.b,
1152 m._intermediate_output,
1153 m.output,
1154 *m.part_ops,
1155 *m.part_pts.values()])