# src/ieee754/part_mul_add/multiply.py
# SPDX-License-Identifier: LGPL-2.1-or-later
# See Notices.txt for copyright information
"""Integer Multiplication."""

from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
from nmigen.hdl.ast import Assign
from abc import ABCMeta, abstractmethod
from nmigen.cli import main
from functools import reduce
from operator import or_


class PartitionPoints(dict):
    """Partition points and corresponding ``Value``s.

    The points at which an ALU is partitioned, along with ``Value``s that
    specify whether the corresponding partition points are enabled.

    For example: ``{1: True, 5: True, 10: True}`` with
    ``width == 16`` specifies that the ALU is split into 4 sections:
    * bits 0 <= ``i`` < 1
    * bits 1 <= ``i`` < 5
    * bits 5 <= ``i`` < 10
    * bits 10 <= ``i`` < 16

    If the partition_points were instead ``{1: True, 5: a, 10: True}``
    where ``a`` is a 1-bit ``Signal``:
    * If ``a`` is asserted:
        * bits 0 <= ``i`` < 1
        * bits 1 <= ``i`` < 5
        * bits 5 <= ``i`` < 10
        * bits 10 <= ``i`` < 16
    * Otherwise
        * bits 0 <= ``i`` < 1
        * bits 1 <= ``i`` < 10
        * bits 10 <= ``i`` < 16
    """

    def __init__(self, partition_points=None):
        """Create a new ``PartitionPoints``.

        :param partition_points: the input partition points to values mapping.
        """
        super().__init__()
        if partition_points is not None:
            for point, enabled in partition_points.items():
                if not isinstance(point, int):
                    raise TypeError("point must be a non-negative integer")
                if point < 0:
                    raise ValueError("point must be a non-negative integer")
                self[point] = Value.wrap(enabled)

    def like(self, name=None, src_loc_at=0, mul=1):
        """Create a new ``PartitionPoints`` with ``Signal``s for all values.

        :param name: the base name for the new ``Signal``s.
        :param mul: a multiplication factor on the indices
        """
        if name is None:
            name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
        retval = PartitionPoints()
        for point, enabled in self.items():
            point *= mul
            retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
        return retval

    def eq(self, rhs):
        """Assign ``PartitionPoints`` using ``Signal.eq``."""
        if set(self.keys()) != set(rhs.keys()):
            raise ValueError("incompatible point set")
        for point, enabled in self.items():
            yield enabled.eq(rhs[point])

    def as_mask(self, width):
        """Create a bit-mask from `self`.

        Each bit in the returned mask is clear only if the partition point at
        the same bit-index is enabled.

        :param width: the bit width of the resulting mask
        """
        bits = []
        for i in range(width):
            if i in self:
                bits.append(~self[i])
            else:
                bits.append(True)
        return Cat(*bits)

    def get_max_partition_count(self, width):
        """Get the maximum number of partitions.

        Gets the number of partitions when all partition points are enabled.
        """
        retval = 1
        for point in self.keys():
            if point < width:
                retval += 1
        return retval

    def fits_in_width(self, width):
        """Check if all partition points are smaller than `width`."""
        for point in self.keys():
            if point >= width:
                return False
        return True

    def part_byte(self, index, mfactor=1):  # mfactor used for "expanding"
        if index == -1 or index == 7:
            return C(True, 1)
        assert index >= 0 and index < 8
        return self[(index * 8 + 8)*mfactor]


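# --------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a plain-Python
# model of how a PartitionPoints mapping splits an ALU of a given width
# into sections, mirroring the ``{1: True, 5: a, 10: True}`` example in
# the class docstring above.  Python bools stand in for ``Signal``s.
def _partition_sections_sketch(points, width):
    """Return [(start, end), ...] for the enabled partition points."""
    starts = [0] + sorted(p for p, enabled in points.items() if enabled)
    ends = starts[1:] + [width]
    return list(zip(starts, ends))

# _partition_sections_sketch({1: True, 5: True, 10: True}, 16)
#   -> [(0, 1), (1, 5), (5, 10), (10, 16)]
# _partition_sections_sketch({1: True, 5: False, 10: True}, 16)
#   -> [(0, 1), (1, 10), (10, 16)]

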
class FullAdder(Elaboratable):
    """Full Adder.

    :attribute in0: the first input
    :attribute in1: the second input
    :attribute in2: the third input
    :attribute sum: the sum output
    :attribute carry: the carry output

    Rather than do individual full adders (and have an array of them,
    which would be very slow to simulate), this module can specify the
    bit width of the inputs and outputs: in effect it performs multiple
    Full 3-2 Add operations "in parallel".
    """

    def __init__(self, width):
        """Create a ``FullAdder``.

        :param width: the bit width of the input and output
        """
        self.in0 = Signal(width)
        self.in1 = Signal(width)
        self.in2 = Signal(width)
        self.sum = Signal(width)
        self.carry = Signal(width)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
        m.d.comb += self.carry.eq((self.in0 & self.in1)
                                  | (self.in1 & self.in2)
                                  | (self.in2 & self.in0))
        return m


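# --------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a plain-Python
# check of the 3:2 compressor identity that FullAdder applies to every
# bit lane in parallel: in0 + in1 + in2 == sum + 2*carry per bit, where
# sum = in0 ^ in1 ^ in2 and carry is the bitwise majority function.
def _full_adder_identity_sketch(in0, in1, in2, width=8):
    mask = (1 << width) - 1
    s = (in0 ^ in1 ^ in2) & mask
    c = ((in0 & in1) | (in1 & in2) | (in2 & in0)) & mask
    # per bit lane: the three input bits equal sum-bit plus 2*carry-bit
    for i in range(width):
        bits_in = ((in0 >> i) & 1) + ((in1 >> i) & 1) + ((in2 >> i) & 1)
        assert bits_in == ((s >> i) & 1) + 2 * ((c >> i) & 1)
    return s, c

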
class MaskedFullAdder(Elaboratable):
    """Masked Full Adder.

    :attribute mask: the carry partition mask
    :attribute in0: the first input
    :attribute in1: the second input
    :attribute in2: the third input
    :attribute sum: the sum output
    :attribute mcarry: the masked carry output

    FullAdders are always used with a "mask" on the output. To keep
    the graphviz "clean", this class performs the masking here rather
    than inside a large for-loop.

    See the following discussion as to why this is no longer derived
    from FullAdder. Each carry is shifted here *before* being ANDed
    with the mask, so that an AOI cell may be used (which is more
    gate-efficient)
    https://en.wikipedia.org/wiki/AND-OR-Invert
    https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
    """

    def __init__(self, width):
        """Create a ``MaskedFullAdder``.

        :param width: the bit width of the input and output
        """
        self.width = width
        self.mask = Signal(width, reset_less=True)
        self.mcarry = Signal(width, reset_less=True)
        self.in0 = Signal(width, reset_less=True)
        self.in1 = Signal(width, reset_less=True)
        self.in2 = Signal(width, reset_less=True)
        self.sum = Signal(width, reset_less=True)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        s1 = Signal(self.width, reset_less=True)
        s2 = Signal(self.width, reset_less=True)
        s3 = Signal(self.width, reset_less=True)
        c1 = Signal(self.width, reset_less=True)
        c2 = Signal(self.width, reset_less=True)
        c3 = Signal(self.width, reset_less=True)
        m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
        m.d.comb += s1.eq(Cat(0, self.in0))
        m.d.comb += s2.eq(Cat(0, self.in1))
        m.d.comb += s3.eq(Cat(0, self.in2))
        m.d.comb += c1.eq(s1 & s2 & self.mask)
        m.d.comb += c2.eq(s2 & s3 & self.mask)
        m.d.comb += c3.eq(s3 & s1 & self.mask)
        m.d.comb += self.mcarry.eq(c1 | c2 | c3)
        return m


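# --------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a plain-Python
# model of what MaskedFullAdder computes.  The carry is shifted up by
# one bit (it feeds the *next* bit position) and then ANDed with the
# partition mask, so a clear mask bit stops carries from crossing a
# partition boundary.
def _masked_full_adder_sketch(in0, in1, in2, mask, width=8):
    lim = (1 << width) - 1
    s = (in0 ^ in1 ^ in2) & lim
    carry = ((in0 & in1) | (in1 & in2) | (in2 & in0)) & lim
    mcarry = ((carry << 1) & mask) & lim   # shift first, then mask
    return s, mcarry

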
class PartitionedAdder(Elaboratable):
    """Partitioned Adder.

    Performs the final add. The partition points are included in the
    actual add (in one of the operands only), which causes a carry over
    to the next bit. Then the final output *removes* the extra bits from
    the result.

    partition: .... P... P... P... P... (32 bits)
    a        : .... .... .... .... .... (32 bits)
    b        : .... .... .... .... .... (32 bits)
    exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
    exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
    exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
    o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)

    :attribute width: the bit width of the input and output. Read-only.
    :attribute a: the first input to the adder
    :attribute b: the second input to the adder
    :attribute output: the sum output
    :attribute partition_points: the input partition points. Modification not
        supported, except for by ``Signal.eq``.
    """

    def __init__(self, width, partition_points):
        """Create a ``PartitionedAdder``.

        :param width: the bit width of the input and output
        :param partition_points: the input partition points
        """
        self.width = width
        self.a = Signal(width)
        self.b = Signal(width)
        self.output = Signal(width)
        self.partition_points = PartitionPoints(partition_points)
        if not self.partition_points.fits_in_width(width):
            raise ValueError("partition_points doesn't fit in width")
        expanded_width = 0
        for i in range(self.width):
            if i in self.partition_points:
                expanded_width += 1
            expanded_width += 1
        self._expanded_width = expanded_width
        # XXX these have to remain here due to some horrible nmigen
        # simulation bugs involving sync. it is *not* necessary to
        # have them here, they should (under normal circumstances)
        # be moved into elaborate, as they are entirely local
        self._expanded_a = Signal(expanded_width)  # includes extra part-points
        self._expanded_b = Signal(expanded_width)  # likewise.
        self._expanded_o = Signal(expanded_width)  # likewise.

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()
        expanded_index = 0
        # store bits in a list, use Cat later. graphviz is much cleaner
        al, bl, ol, ea, eb, eo = [], [], [], [], [], []

        # partition points are "breaks" (extra zeros or 1s) in what would
        # otherwise be a massive long add. when the "break" points are 0,
        # whatever is in it (in the output) is discarded. however when
        # there is a "1", it causes a roll-over carry to the *next* bit.
        # we still ignore the "break" bit in the [intermediate] output,
        # however by that time we've got the effect that we wanted: the
        # carry has been carried *over* the break point.

        for i in range(self.width):
            if i in self.partition_points:
                # add extra bit set to 0 + 0 for enabled partition points
                # and 1 + 0 for disabled partition points
                ea.append(self._expanded_a[expanded_index])
                al.append(~self.partition_points[i])  # add extra bit in a
                eb.append(self._expanded_b[expanded_index])
                bl.append(C(0))  # yes, add a zero
                expanded_index += 1  # skip the extra point. NOT in the output
            ea.append(self._expanded_a[expanded_index])
            eb.append(self._expanded_b[expanded_index])
            eo.append(self._expanded_o[expanded_index])
            al.append(self.a[i])
            bl.append(self.b[i])
            ol.append(self.output[i])
            expanded_index += 1

        # combine above using Cat
        m.d.comb += Cat(*ea).eq(Cat(*al))
        m.d.comb += Cat(*eb).eq(Cat(*bl))
        m.d.comb += Cat(*ol).eq(Cat(*eo))

        # use only one addition to take advantage of look-ahead carry and
        # special hardware on FPGAs
        m.d.comb += self._expanded_o.eq(
            self._expanded_a + self._expanded_b)
        return m


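# --------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a plain-Python
# model of the "expanded add" trick used by PartitionedAdder, for an
# 8-bit adder with a single partition point at bit 4.  A spare bit is
# inserted at the partition: 1+0 when the partition is *disabled* (so a
# carry out of bit 3 rolls over it into the upper half), 0+0 when it is
# *enabled* (the carry is absorbed and the halves stay independent).
def _partitioned_add_sketch(a, b, partition_enabled):
    """a and b are assumed to be 8-bit unsigned values."""
    break_bit = 0 if partition_enabled else 1
    # expanded operands: [a_lo][break][a_hi] and [b_lo][0][b_hi]
    exp_a = (a & 0xF) | (break_bit << 4) | ((a >> 4) << 5)
    exp_b = (b & 0xF) | ((b >> 4) << 5)
    exp_o = exp_a + exp_b
    # drop the spare bit (bit 4 of the expanded result)
    return (exp_o & 0xF) | (((exp_o >> 5) & 0xF) << 4)

# _partitioned_add_sketch(0x18, 0x09, partition_enabled=True)  == 0x11
#   (two independent 4-bit adds: 8+9 -> 1 with carry discarded, 1+0 -> 1)
# _partitioned_add_sketch(0x18, 0x09, partition_enabled=False) == 0x21
#   (one 8-bit add: 0x18 + 0x09 = 0x21, carry crosses bit 4)

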
FULL_ADDER_INPUT_COUNT = 3


class AddReduceData:
    """Holds the input data for one level of the add-reduction: the partial
    terms to be summed, the per-byte operation codes, and the partition
    points."""

    def __init__(self, ppoints, n_inputs, output_width, n_parts):
        self.part_ops = [Signal(2, name=f"part_ops_{i}")
                         for i in range(n_parts)]
        self.inputs = [Signal(output_width, name=f"inputs[{i}]")
                       for i in range(n_inputs)]
        self.reg_partition_points = ppoints.like()

    def eq(self, rhs):
        return [self.reg_partition_points.eq(rhs.reg_partition_points)] + \
               [self.inputs[i].eq(rhs.inputs[i])
                for i in range(len(self.inputs))] + \
               [self.part_ops[i].eq(rhs.part_ops[i])
                for i in range(len(self.part_ops))]


class FinalAdd(Elaboratable):
    """ Final stage of add reduce
    """

    def __init__(self, n_inputs, output_width, n_parts, register_levels,
                 partition_points):
        self.i = AddReduceData(partition_points, n_inputs,
                               output_width, n_parts)
        self.n_inputs = n_inputs
        self.n_parts = n_parts
        self.register_levels = list(register_levels)
        self.output = Signal(output_width)
        self.partition_points = PartitionPoints(partition_points)
        if not self.partition_points.fits_in_width(output_width):
            raise ValueError("partition_points doesn't fit in output_width")
        self.intermediate_terms = []

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()

        if self.n_inputs == 0:
            # use 0 as the default output value
            m.d.comb += self.output.eq(0)
        elif self.n_inputs == 1:
            # handle single input
            m.d.comb += self.output.eq(self.i.inputs[0])
        else:
            # base case for adding 2 inputs
            assert self.n_inputs == 2
            adder = PartitionedAdder(len(self.output),
                                     self.i.reg_partition_points)
            m.submodules.final_adder = adder
            m.d.comb += adder.a.eq(self.i.inputs[0])
            m.d.comb += adder.b.eq(self.i.inputs[1])
            m.d.comb += self.output.eq(adder.output)
        return m


class AddReduceSingle(Elaboratable):
    """Add list of numbers together.

    :attribute inputs: input ``Signal``s to be summed. Modification not
        supported, except for by ``Signal.eq``.
    :attribute register_levels: List of nesting levels that should have
        pipeline registers.
    :attribute output: output sum.
    :attribute partition_points: the input partition points. Modification not
        supported, except for by ``Signal.eq``.
    """

    def __init__(self, n_inputs, output_width, n_parts, register_levels,
                 partition_points):
        """Create an ``AddReduceSingle``.

        :param n_inputs: number of input ``Signal``s to be summed.
        :param output_width: bit-width of ``output``.
        :param n_parts: number of ``part_ops`` lanes carried alongside the sum.
        :param register_levels: List of nesting levels that should have
            pipeline registers.
        :param partition_points: the input partition points.
        """
        self.n_inputs = n_inputs
        self.n_parts = n_parts
        self.output_width = output_width
        self.i = AddReduceData(partition_points, n_inputs,
                               output_width, n_parts)
        self.register_levels = list(register_levels)
        self.partition_points = PartitionPoints(partition_points)
        if not self.partition_points.fits_in_width(output_width):
            raise ValueError("partition_points doesn't fit in output_width")

        max_level = AddReduceSingle.get_max_level(n_inputs)
        for level in self.register_levels:
            if level > max_level:
                raise ValueError(
                    "not enough adder levels for specified register levels")

        # this is annoying. we have to create the modules (and terms)
        # because we need to know what they are (in order to set up the
        # interconnects back in AddReduce), but cannot do the m.d.comb +=
        # etc because this is not in elaborate().
        self.groups = AddReduceSingle.full_adder_groups(n_inputs)
        self._intermediate_terms = []
        if len(self.groups) != 0:
            self.create_next_terms()

    @staticmethod
    def get_max_level(input_count):
        """Get the maximum level.

        All ``register_levels`` must be less than or equal to the maximum
        level.
        """
        retval = 0
        while True:
            groups = AddReduceSingle.full_adder_groups(input_count)
            if len(groups) == 0:
                return retval
            input_count %= FULL_ADDER_INPUT_COUNT
            input_count += 2 * len(groups)
            retval += 1

    @staticmethod
    def full_adder_groups(input_count):
        """Get ``inputs`` indices for which a full adder should be built."""
        return range(0,
                     input_count - FULL_ADDER_INPUT_COUNT + 1,
                     FULL_ADDER_INPUT_COUNT)

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()

        for (value, term) in self._intermediate_terms:
            m.d.comb += term.eq(value)

        mask = self.i.reg_partition_points.as_mask(self.output_width)
        m.d.comb += self.part_mask.eq(mask)

        # add and link the intermediate term modules
        for i, (iidx, adder_i) in enumerate(self.adders):
            setattr(m.submodules, f"adder_{i}", adder_i)

            m.d.comb += adder_i.in0.eq(self.i.inputs[iidx])
            m.d.comb += adder_i.in1.eq(self.i.inputs[iidx + 1])
            m.d.comb += adder_i.in2.eq(self.i.inputs[iidx + 2])
            m.d.comb += adder_i.mask.eq(self.part_mask)

        return m

    def create_next_terms(self):

        # go on to prepare recursive case
        intermediate_terms = []
        _intermediate_terms = []

        def add_intermediate_term(value):
            intermediate_term = Signal(
                self.output_width,
                name=f"intermediate_terms[{len(intermediate_terms)}]")
            _intermediate_terms.append((value, intermediate_term))
            intermediate_terms.append(intermediate_term)

        # store mask in intermediary (simplifies graph)
        self.part_mask = Signal(self.output_width, reset_less=True)

        # create full adders for this recursive level.
        # this shrinks N terms to 2 * (N // 3) plus the remainder
        self.adders = []
        for i in self.groups:
            adder_i = MaskedFullAdder(self.output_width)
            self.adders.append((i, adder_i))
            # add both the sum and the masked-carry to the next level.
            # 3 inputs have now been reduced to 2...
            add_intermediate_term(adder_i.sum)
            add_intermediate_term(adder_i.mcarry)
        # handle the remaining inputs.
        if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
            add_intermediate_term(self.i.inputs[-1])
        elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
            # Just pass the terms to the next layer, since we wouldn't gain
            # anything by using a half adder: there would still be 2 terms,
            # and just passing the terms to the next layer saves gates.
            add_intermediate_term(self.i.inputs[-2])
            add_intermediate_term(self.i.inputs[-1])
        else:
            assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0

        self.intermediate_terms = intermediate_terms
        self._intermediate_terms = _intermediate_terms


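# --------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a plain-Python
# trace of how one AddReduceSingle level shrinks the term count.  Each
# group of 3 terms becomes 2 (sum + masked carry) and 1 or 2 leftover
# terms are passed through, so N terms become 2*(N // 3) + (N % 3).
# This mirrors the count that get_max_level() performs.
def _reduction_trace_sketch(n_terms):
    counts = [n_terms]
    while n_terms > 2:
        n_terms = 2 * (n_terms // 3) + (n_terms % 3)
        counts.append(n_terms)
    return counts

# _reduction_trace_sketch(20) -> [20, 14, 10, 7, 5, 4, 3, 2]
# the final 2 terms are summed by FinalAdd using a PartitionedAdder.

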
class AddReduce(Elaboratable):
    """Recursively add a list of numbers together.

    :attribute inputs: input ``Signal``s to be summed. Modification not
        supported, except for by ``Signal.eq``.
    :attribute register_levels: List of nesting levels that should have
        pipeline registers.
    :attribute output: output sum.
    :attribute partition_points: the input partition points. Modification not
        supported, except for by ``Signal.eq``.
    """

    def __init__(self, inputs, output_width, register_levels, partition_points,
                 part_ops):
        """Create an ``AddReduce``.

        :param inputs: input ``Signal``s to be summed.
        :param output_width: bit-width of ``output``.
        :param register_levels: List of nesting levels that should have
            pipeline registers.
        :param partition_points: the input partition points.
        """
        self.inputs = inputs
        self.part_ops = part_ops
        self.out_part_ops = [Signal(2, name=f"out_part_ops_{i}")
                             for i in range(len(part_ops))]
        self.output = Signal(output_width)
        self.output_width = output_width
        self.register_levels = register_levels
        self.partition_points = partition_points

        self.create_levels()

    @staticmethod
    def get_max_level(input_count):
        return AddReduceSingle.get_max_level(input_count)

    @staticmethod
    def next_register_levels(register_levels):
        """``Iterable`` of ``register_levels`` for next recursive level."""
        for level in register_levels:
            if level > 0:
                yield level - 1

    def create_levels(self):
        """creates reduction levels"""

        mods = []
        next_levels = self.register_levels
        partition_points = self.partition_points
        inputs = self.inputs
        part_ops = self.part_ops
        n_parts = len(part_ops)
        while True:
            ilen = len(inputs)
            next_level = AddReduceSingle(ilen, self.output_width, n_parts,
                                         next_levels, partition_points)
            mods.append(next_level)
            next_levels = list(AddReduce.next_register_levels(next_levels))
            partition_points = next_level.i.reg_partition_points
            inputs = next_level.intermediate_terms
            ilen = len(inputs)
            part_ops = next_level.i.part_ops
            groups = AddReduceSingle.full_adder_groups(len(inputs))
            if len(groups) == 0:
                break

        if ilen != 0:
            next_level = FinalAdd(ilen, self.output_width, n_parts,
                                  next_levels, partition_points)
            mods.append(next_level)

        self.levels = mods

    def elaborate(self, platform):
        """Elaborate this module."""
        m = Module()

        for i, next_level in enumerate(self.levels):
            setattr(m.submodules, "next_level%d" % i, next_level)

        partition_points = self.partition_points
        inputs = self.inputs
        part_ops = self.part_ops
        for i in range(len(self.levels)):
            mcur = self.levels[i]
            inassign = [mcur.i.inputs[i].eq(inputs[i])
                        for i in range(len(inputs))]
            copy_part_ops = [mcur.i.part_ops[i].eq(part_ops[i])
                             for i in range(len(part_ops))]
            if 0 in mcur.register_levels:
                m.d.sync += copy_part_ops
                m.d.sync += inassign
                m.d.sync += mcur.i.reg_partition_points.eq(partition_points)
            else:
                m.d.comb += copy_part_ops
                m.d.comb += inassign
                m.d.comb += mcur.i.reg_partition_points.eq(partition_points)
            partition_points = mcur.i.reg_partition_points
            inputs = mcur.intermediate_terms
            part_ops = mcur.i.part_ops

        # output comes from last module
        m.d.comb += self.output.eq(next_level.output)
        copy_part_ops = [self.out_part_ops[i].eq(next_level.i.part_ops[i])
                         for i in range(len(self.part_ops))]
        m.d.comb += copy_part_ops

        return m


OP_MUL_LOW = 0
OP_MUL_SIGNED_HIGH = 1
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3


def get_term(value, shift=0, enabled=None):
    if enabled is not None:
        value = Mux(enabled, value, 0)
    if shift > 0:
        value = Cat(Repl(C(0, 1), shift), value)
    else:
        assert shift == 0
    return value


class ProductTerm(Elaboratable):
    """ this class creates a single product term (a[..]*b[..]).
        it has a design flaw in that it is the *output* that is selected,
        while the multiplication(s) are combinatorially generated
        all the time.
    """

    def __init__(self, width, twidth, pbwid, a_index, b_index):
        self.a_index = a_index
        self.b_index = b_index
        shift = 8 * (self.a_index + self.b_index)
        self.pwidth = width
        self.twidth = twidth
        self.width = width*2
        self.shift = shift

        self.ti = Signal(self.width, reset_less=True)
        self.term = Signal(twidth, reset_less=True)
        self.a = Signal(twidth//2, reset_less=True)
        self.b = Signal(twidth//2, reset_less=True)
        self.pb_en = Signal(pbwid, reset_less=True)

        self.tl = tl = []
        min_index = min(self.a_index, self.b_index)
        max_index = max(self.a_index, self.b_index)
        for i in range(min_index, max_index):
            tl.append(self.pb_en[i])
        name = "te_%d_%d" % (self.a_index, self.b_index)
        if len(tl) > 0:
            term_enabled = Signal(name=name, reset_less=True)
        else:
            term_enabled = None
        self.enabled = term_enabled
        self.term.name = "term_%d_%d" % (a_index, b_index)  # rename

    def elaborate(self, platform):

        m = Module()
        if self.enabled is not None:
            m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))

        bsa = Signal(self.width, reset_less=True)
        bsb = Signal(self.width, reset_less=True)
        a_index, b_index = self.a_index, self.b_index
        pwidth = self.pwidth
        m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
        m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
        m.d.comb += self.ti.eq(bsa * bsb)
        m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
        """
        #TODO: sort out width issues, get inputs a/b switched on/off.
        #data going into Muxes is 1/2 the required width

        pwidth = self.pwidth
        width = self.width
        bsa = Signal(self.twidth//2, reset_less=True)
        bsb = Signal(self.twidth//2, reset_less=True)
        asel = Signal(width, reset_less=True)
        bsel = Signal(width, reset_less=True)
        a_index, b_index = self.a_index, self.b_index
        m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
        m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
        m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
        m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
        m.d.comb += self.ti.eq(bsa * bsb)
        m.d.comb += self.term.eq(self.ti)
        """

        return m


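# --------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a plain-Python
# model of what the ProductTerm/ProductTerms grid computes when no
# partition point falls between the byte columns: the full product is
# the sum of all byte-by-byte partial products a[i]*b[j], each shifted
# by 8*(i+j), which is exactly ProductTerm's ``shift``.
def _byte_partial_products_sketch(a, b, nbytes=2):
    total = 0
    for ai in range(nbytes):
        for bi in range(nbytes):
            pa = (a >> (8 * ai)) & 0xFF
            pb = (b >> (8 * bi)) & 0xFF
            total += (pa * pb) << (8 * (ai + bi))
    return total

# _byte_partial_products_sketch(0x1234, 0xABCD) == 0x1234 * 0xABCD

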
class ProductTerms(Elaboratable):
    """ creates a bank of product terms. also performs the actual bit-selection.
        this class is to be wrapped with a for-loop on the "a" operand.
        it creates a second-level for-loop on the "b" operand.
    """
    def __init__(self, width, twidth, pbwid, a_index, blen):
        self.a_index = a_index
        self.blen = blen
        self.pwidth = width
        self.twidth = twidth
        self.pbwid = pbwid
        self.a = Signal(twidth//2, reset_less=True)
        self.b = Signal(twidth//2, reset_less=True)
        self.pb_en = Signal(pbwid, reset_less=True)
        self.terms = [Signal(twidth, name="term%d" % i, reset_less=True)
                      for i in range(blen)]

    def elaborate(self, platform):

        m = Module()

        for b_index in range(self.blen):
            t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
                            self.a_index, b_index)
            setattr(m.submodules, "term_%d" % b_index, t)

            m.d.comb += t.a.eq(self.a)
            m.d.comb += t.b.eq(self.b)
            m.d.comb += t.pb_en.eq(self.pb_en)

            m.d.comb += self.terms[b_index].eq(t.term)

        return m


class LSBNegTerm(Elaboratable):

    def __init__(self, bit_width):
        self.bit_width = bit_width
        self.part = Signal(reset_less=True)
        self.signed = Signal(reset_less=True)
        self.op = Signal(bit_width, reset_less=True)
        self.msb = Signal(reset_less=True)
        self.nt = Signal(bit_width*2, reset_less=True)
        self.nl = Signal(bit_width*2, reset_less=True)

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        bit_wid = self.bit_width
        ext = Repl(0, bit_wid)  # extend output to HI part

        # determine sign of each incoming number *in this partition*
        enabled = Signal(reset_less=True)
        m.d.comb += enabled.eq(self.part & self.msb & self.signed)

        # for 8-bit values: form a * 0xFF00 by using -a * 0x100; the
        # negation operation is split into a bitwise not and a +1.
        # likewise for 16, 32, and 64-bit values.

        # width-extended 1s complement if a is signed, otherwise zero
        comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))

        # add 1 if signed, otherwise add zero
        comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))

        return m


class Parts(Elaboratable):

    def __init__(self, pbwid, epps, n_parts):
        self.pbwid = pbwid
        # inputs
        self.epps = PartitionPoints.like(epps, name="epps")  # expanded points
        # outputs
        self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]

    def elaborate(self, platform):
        m = Module()

        epps, parts = self.epps, self.parts
        # collect part-bytes (double factor because the input is extended)
        pbs = Signal(self.pbwid, reset_less=True)
        tl = []
        for i in range(self.pbwid):
            pb = Signal(name="pb%d" % i, reset_less=True)
            m.d.comb += pb.eq(epps.part_byte(i, mfactor=2))  # double
            tl.append(pb)
        m.d.comb += pbs.eq(Cat(*tl))

        # negated-temporary copy of partition bits
        npbs = Signal.like(pbs, reset_less=True)
        m.d.comb += npbs.eq(~pbs)
        byte_count = 8 // len(parts)
        for i in range(len(parts)):
            pbl = []
            pbl.append(npbs[i * byte_count - 1])
            for j in range(i * byte_count, (i + 1) * byte_count - 1):
                pbl.append(pbs[j])
            pbl.append(npbs[(i + 1) * byte_count - 1])
            value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
            m.d.comb += value.eq(Cat(*pbl))
            m.d.comb += parts[i].eq(~(value).bool())

        return m


class Part(Elaboratable):
    """ a key class which, depending on the partitioning, will determine
        what action to take when parts of the output are signed or unsigned.

        this requires 2 pieces of data *per operand, per partition*:
        whether the MSB is HI/LO (per partition!), and whether a signed
        or unsigned operation has been *requested*.

        once that is determined, signed is basically carried out
        by splitting 2's complement into 1's complement plus one.
        1's complement is just a bit-inversion.

        the extra terms - as separate terms - are then thrown at the
        AddReduce alongside the multiplication part-results.
    """
    def __init__(self, epps, width, n_parts, n_levels, pbwid):

        self.pbwid = pbwid
        self.epps = epps

        # inputs
        self.a = Signal(64)
        self.b = Signal(64)
        self.a_signed = [Signal(name=f"a_signed_{i}") for i in range(8)]
        self.b_signed = [Signal(name=f"b_signed_{i}") for i in range(8)]
        self.pbs = Signal(pbwid, reset_less=True)

        # outputs
        self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]

        self.not_a_term = Signal(width)
        self.neg_lsb_a_term = Signal(width)
        self.not_b_term = Signal(width)
        self.neg_lsb_b_term = Signal(width)

    def elaborate(self, platform):
        m = Module()

        pbs, parts = self.pbs, self.parts
        epps = self.epps
        m.submodules.p = p = Parts(self.pbwid, epps, len(parts))
        m.d.comb += p.epps.eq(epps)
        parts = p.parts

        byte_count = 8 // len(parts)

        not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
            self.not_a_term, self.neg_lsb_a_term,
            self.not_b_term, self.neg_lsb_b_term)

        byte_width = 8 // len(parts)  # byte width
        bit_wid = 8 * byte_width  # bit width
        nat, nbt, nla, nlb = [], [], [], []
        for i in range(len(parts)):
            # work out bit-inverted and +1 term for a.
            pa = LSBNegTerm(bit_wid)
            setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
            m.d.comb += pa.part.eq(parts[i])
            m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
            m.d.comb += pa.signed.eq(self.b_signed[i * byte_width])  # yes b
            m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1])  # really, b
            nat.append(pa.nt)
            nla.append(pa.nl)

            # work out bit-inverted and +1 term for b
            pb = LSBNegTerm(bit_wid)
            setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
            m.d.comb += pb.part.eq(parts[i])
            m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
            m.d.comb += pb.signed.eq(self.a_signed[i * byte_width])  # yes a
            m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1])  # really, a
            nbt.append(pb.nt)
            nlb.append(pb.nl)

        # concatenate together and return all 4 results.
        m.d.comb += [not_a_term.eq(Cat(*nat)),
                     not_b_term.eq(Cat(*nbt)),
                     neg_lsb_a_term.eq(Cat(*nla)),
                     neg_lsb_b_term.eq(Cat(*nlb)),
                     ]

        return m


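# --------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a plain-Python
# check of the signed-correction scheme that Part/LSBNegTerm implement
# for one 8-bit x 8-bit partition.  When an operand is signed and its
# MSB is set, its sign-extension contributes "other * 0xFF00", which is
# added as a bit-inversion term (~other << 8) plus a +1 term (1 << 8),
# matching the ``nt`` and ``nl`` outputs of LSBNegTerm.
def _signed_correction_sketch(a, b):
    """a and b are 8-bit two's-complement values given as 0..255."""
    unsigned_product = a * b
    correction = 0
    if b & 0x80:  # b negative: add (-a) * 0x100 == (~a << 8) + (1 << 8)
        correction += ((~a & 0xFF) << 8) + (1 << 8)
    if a & 0x80:  # a negative: add (-b) * 0x100 == (~b << 8) + (1 << 8)
        correction += ((~b & 0xFF) << 8) + (1 << 8)
    signed_a = a - 256 if a & 0x80 else a
    signed_b = b - 256 if b & 0x80 else b
    assert (unsigned_product + correction) & 0xFFFF == \
           (signed_a * signed_b) & 0xFFFF
    return (unsigned_product + correction) & 0xFFFF

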
class IntermediateOut(Elaboratable):
    """ selects the HI/LO part of the multiplication, for a given bit-width
        the output is also reconstructed in its SIMD (partition) lanes.
    """
    def __init__(self, width, out_wid, n_parts):
        self.width = width
        self.n_parts = n_parts
        self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
                         for i in range(8)]
        self.intermed = Signal(out_wid, reset_less=True)
        self.output = Signal(out_wid//2, reset_less=True)

    def elaborate(self, platform):
        m = Module()

        ol = []
        w = self.width
        sel = w // 8
        for i in range(self.n_parts):
            op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
            m.d.comb += op.eq(
                Mux(self.part_ops[sel * i] == OP_MUL_LOW,
                    self.intermed.part(i * w*2, w),
                    self.intermed.part(i * w*2 + w, w)))
            ol.append(op)
        m.d.comb += self.output.eq(Cat(*ol))

        return m


class FinalOut(Elaboratable):
    """ selects the final output based on the partitioning.

        each byte is selectable independently, i.e. it is possible
        that some partitions requested 8-bit computation whilst others
        requested 16 or 32 bit.
    """
    def __init__(self, out_wid):
        # inputs
        self.d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
        self.d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
        self.d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]

        self.i8 = Signal(out_wid, reset_less=True)
        self.i16 = Signal(out_wid, reset_less=True)
        self.i32 = Signal(out_wid, reset_less=True)
        self.i64 = Signal(out_wid, reset_less=True)

        # output
        self.out = Signal(out_wid, reset_less=True)

    def elaborate(self, platform):
        m = Module()
        ol = []
        for i in range(8):
            # select one of the outputs: d8 selects i8, d16 selects i16
            # d32 selects i32, and the default is i64.
            # d8 and d16 are ORed together in the first Mux
            # then the 2nd selects either i8 or i16.
            # if neither d8 nor d16 are set, d32 selects either i32 or i64.
            op = Signal(8, reset_less=True, name="op_%d" % i)
            m.d.comb += op.eq(
                Mux(self.d8[i] | self.d16[i // 2],
                    Mux(self.d8[i], self.i8.part(i * 8, 8),
                        self.i16.part(i * 8, 8)),
                    Mux(self.d32[i // 4], self.i32.part(i * 8, 8),
                        self.i64.part(i * 8, 8))))
            ol.append(op)
        m.d.comb += self.out.eq(Cat(*ol))
        return m


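# --------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a plain-Python
# model of FinalOut's per-byte selection priority.  For output byte i,
# d8 wins over d16, which wins over d32, with the 64-bit result as the
# default, matching the nested Mux structure above.
def _final_out_select_sketch(i, d8, d16, d32):
    """d8: 8 flags, d16: 4 flags, d32: 2 flags; returns the source name."""
    if d8[i]:
        return "i8"
    if d16[i // 2]:
        return "i16"
    if d32[i // 4]:
        return "i32"
    return "i64"

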
class OrMod(Elaboratable):
    """ ORs four values together in a hierarchical tree
    """
    def __init__(self, wid):
        self.wid = wid
        self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
                     for i in range(4)]
        self.orout = Signal(wid, reset_less=True)

    def elaborate(self, platform):
        m = Module()
        or1 = Signal(self.wid, reset_less=True)
        or2 = Signal(self.wid, reset_less=True)
        m.d.comb += or1.eq(self.orin[0] | self.orin[1])
        m.d.comb += or2.eq(self.orin[2] | self.orin[3])
        m.d.comb += self.orout.eq(or1 | or2)

        return m


class Signs(Elaboratable):
    """ determines whether a or b are signed numbers
        based on the required operation type (OP_MUL_*)
    """

    def __init__(self):
        self.part_ops = Signal(2, reset_less=True)
        self.a_signed = Signal(reset_less=True)
        self.b_signed = Signal(reset_less=True)

    def elaborate(self, platform):

        m = Module()

        asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
        bsig = (self.part_ops == OP_MUL_LOW) \
            | (self.part_ops == OP_MUL_SIGNED_HIGH)
        m.d.comb += self.a_signed.eq(asig)
        m.d.comb += self.b_signed.eq(bsig)

        return m


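# --------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the op-code to
# signedness mapping that Signs computes, written out as a plain-Python
# table.  (For OP_MUL_LOW the low half of the product is the same
# whether the operands are treated as signed or unsigned.)
_SIGNS_TABLE_SKETCH = {
    # op code                     (a_signed, b_signed)
    OP_MUL_LOW:                   (True, True),
    OP_MUL_SIGNED_HIGH:           (True, True),     # RISC-V mulh
    OP_MUL_SIGNED_UNSIGNED_HIGH:  (True, False),    # RISC-V mulhsu
    OP_MUL_UNSIGNED_HIGH:         (False, False),   # RISC-V mulhu
}

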
class Mul8_16_32_64(Elaboratable):
    """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.

    Supports partitioning into any combination of 8, 16, 32, and 64-bit
    partitions on naturally-aligned boundaries. Supports the operation being
    set for each partition independently.

    :attribute part_pts: the input partition points. Has a partition point at
        multiples of 8 in 0 < i < 64. Each partition point's associated
        ``Value`` is a ``Signal``. Modification not supported, except for by
        ``Signal.eq``.
    :attribute part_ops: the operation for each byte. The operation for a
        particular partition is selected by assigning the selected operation
        code to each byte in the partition. The allowed operation codes are:

        :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
            RISC-V's `mul` instruction.
        :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
            instruction.
        :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
            where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
            `mulhsu` instruction.
        :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
            instruction.
    """

    def __init__(self, register_levels=()):
        """ register_levels: specifies the points in the cascade at which
            flip-flops are to be inserted.
        """

        # parameter(s)
        self.register_levels = list(register_levels)

        # inputs
        self.part_pts = PartitionPoints()
        for i in range(8, 64, 8):
            self.part_pts[i] = Signal(name=f"part_pts_{i}")
        self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
        self.a = Signal(64)
        self.b = Signal(64)

        # intermediates (needed for unit tests)
        self._intermediate_output = Signal(128)

        # output
        self.output = Signal(64)

    def elaborate(self, platform):
        m = Module()

        # collect part-bytes
        pbs = Signal(8, reset_less=True)
        tl = []
        for i in range(8):
            pb = Signal(name="pb%d" % i, reset_less=True)
            m.d.comb += pb.eq(self.part_pts.part_byte(i))
            tl.append(pb)
        m.d.comb += pbs.eq(Cat(*tl))

        # create (doubled) PartitionPoints (output is double input width)
        expanded_part_pts = eps = PartitionPoints()
        for i, v in self.part_pts.items():
            ep = Signal(name=f"expanded_part_pts_{i*2}", reset_less=True)
            expanded_part_pts[i * 2] = ep
            m.d.comb += ep.eq(v)

        # local variables
        signs = []
        for i in range(8):
            s = Signs()
            signs.append(s)
            setattr(m.submodules, "signs%d" % i, s)
            m.d.comb += s.part_ops.eq(self.part_ops[i])

        n_levels = len(self.register_levels)+1
        m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
        m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
        m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
        m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
        nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
        for mod in [part_8, part_16, part_32, part_64]:
            m.d.comb += mod.a.eq(self.a)
            m.d.comb += mod.b.eq(self.b)
            for i in range(len(signs)):
                m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
                m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
            m.d.comb += mod.pbs.eq(pbs)
            nat_l.append(mod.not_a_term)
            nbt_l.append(mod.not_b_term)
            nla_l.append(mod.neg_lsb_a_term)
            nlb_l.append(mod.neg_lsb_b_term)

        terms = []

        for a_index in range(8):
            t = ProductTerms(8, 128, 8, a_index, 8)
            setattr(m.submodules, "terms_%d" % a_index, t)

            m.d.comb += t.a.eq(self.a)
            m.d.comb += t.b.eq(self.b)
            m.d.comb += t.pb_en.eq(pbs)

            for term in t.terms:
                terms.append(term)

        # it's fine to bitwise-or data together since they are never enabled
        # at the same time
        m.submodules.nat_or = nat_or = OrMod(128)
        m.submodules.nbt_or = nbt_or = OrMod(128)
        m.submodules.nla_or = nla_or = OrMod(128)
        m.submodules.nlb_or = nlb_or = OrMod(128)
        for l, mod in [(nat_l, nat_or),
                       (nbt_l, nbt_or),
                       (nla_l, nla_or),
                       (nlb_l, nlb_or)]:
            for i in range(len(l)):
                m.d.comb += mod.orin[i].eq(l[i])
            terms.append(mod.orout)

        add_reduce = AddReduce(terms,
                               128,
                               self.register_levels,
                               expanded_part_pts,
                               self.part_ops)

        out_part_ops = add_reduce.levels[-1].i.part_ops
        out_part_pts = add_reduce.levels[-1].i.reg_partition_points

        m.submodules.add_reduce = add_reduce
        m.d.comb += self._intermediate_output.eq(add_reduce.output)
        # create _output_64
        m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
        m.d.comb += io64.intermed.eq(self._intermediate_output)
        for i in range(8):
            m.d.comb += io64.part_ops[i].eq(out_part_ops[i])

        # create _output_32
        m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
        m.d.comb += io32.intermed.eq(self._intermediate_output)
        for i in range(8):
            m.d.comb += io32.part_ops[i].eq(out_part_ops[i])

        # create _output_16
        m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
        m.d.comb += io16.intermed.eq(self._intermediate_output)
        for i in range(8):
            m.d.comb += io16.part_ops[i].eq(out_part_ops[i])

        # create _output_8
        m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
        m.d.comb += io8.intermed.eq(self._intermediate_output)
        for i in range(8):
            m.d.comb += io8.part_ops[i].eq(out_part_ops[i])

        m.submodules.p_8 = p_8 = Parts(8, eps, len(part_8.parts))
        m.submodules.p_16 = p_16 = Parts(8, eps, len(part_16.parts))
        m.submodules.p_32 = p_32 = Parts(8, eps, len(part_32.parts))
        m.submodules.p_64 = p_64 = Parts(8, eps, len(part_64.parts))

        m.d.comb += p_8.epps.eq(out_part_pts)
        m.d.comb += p_16.epps.eq(out_part_pts)
        m.d.comb += p_32.epps.eq(out_part_pts)
        m.d.comb += p_64.epps.eq(out_part_pts)

        # final output
        m.submodules.finalout = finalout = FinalOut(64)
        for i in range(len(part_8.parts)):
            m.d.comb += finalout.d8[i].eq(p_8.parts[i])
        for i in range(len(part_16.parts)):
            m.d.comb += finalout.d16[i].eq(p_16.parts[i])
        for i in range(len(part_32.parts)):
            m.d.comb += finalout.d32[i].eq(p_32.parts[i])
        m.d.comb += finalout.i8.eq(io8.output)
        m.d.comb += finalout.i16.eq(io16.output)
        m.d.comb += finalout.i32.eq(io32.output)
        m.d.comb += finalout.i64.eq(io64.output)
        m.d.comb += self.output.eq(finalout.out)

        return m


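# --------------------------------------------------------------------
# Illustrative sketch (not part of the original module): how a parent
# Module might configure this multiplier for two independent 32-bit
# signed "mulh"-style operations.  ``m`` is assumed to be the parent's
# Module and ``mul`` an instance of Mul8_16_32_64 already added as a
# submodule; only the partition point at bit 32 is enabled, and every
# byte lane is given the same op-code.
def _configure_2x32_mulh_sketch(m, mul):
    for point, sig in mul.part_pts.items():
        m.d.comb += sig.eq(1 if point == 32 else 0)
    for op in mul.part_ops:
        m.d.comb += op.eq(OP_MUL_SIGNED_HIGH)

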
if __name__ == "__main__":
    m = Mul8_16_32_64()
    main(m, ports=[m.a,
                   m.b,
                   m._intermediate_output,
                   m.output,
                   *m.part_ops,
                   *m.part_pts.values()])