src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11
  12
  13 class PartitionPoints(dict):
  14     """Partition points and corresponding ``Value``s.
  15
  16     The points at where an ALU is partitioned along with ``Value``s that
  17     specify if the corresponding partition points are enabled.
  18
  19     For example: ``{1: True, 5: True, 10: True}`` with
  20     ``width == 16`` specifies that the ALU is split into 4 sections:
  21     * bits 0 <= ``i`` < 1
  22     * bits 1 <= ``i`` < 5
  23     * bits 5 <= ``i`` < 10
  24     * bits 10 <= ``i`` < 16
  25
  26     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  27     where ``a`` is a 1-bit ``Signal``:
  28     * If ``a`` is asserted:
  29         * bits 0 <= ``i`` < 1
  30         * bits 1 <= ``i`` < 5
  31         * bits 5 <= ``i`` < 10
  32         * bits 10 <= ``i`` < 16
  33     * Otherwise
  34         * bits 0 <= ``i`` < 1
  35         * bits 1 <= ``i`` < 10
  36         * bits 10 <= ``i`` < 16
  37     """
  38
  39     def __init__(self, partition_points=None):
  40         """Create a new ``PartitionPoints``.
  41
  42         :param partition_points: the input partition points to values mapping.
  43         """
  44         super().__init__()
  45         if partition_points is not None:
  46             for point, enabled in partition_points.items():
  47                 if not isinstance(point, int):
  48                     raise TypeError("point must be a non-negative integer")
  49                 if point < 0:
  50                     raise ValueError("point must be a non-negative integer")
  51                 self[point] = Value.wrap(enabled)
  52
  53     def like(self, name=None, src_loc_at=0):
  54         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  55
  56         :param name: the base name for the new ``Signal``s.
  57         """
  58         if name is None:
  59             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  60         retval = PartitionPoints()
  61         for point, enabled in self.items():
  62             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  63         return retval
  64
  65     def eq(self, rhs):
  66         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  67         if set(self.keys()) != set(rhs.keys()):
  68             raise ValueError("incompatible point set")
  69         for point, enabled in self.items():
  70             yield enabled.eq(rhs[point])
  71
  72     def as_mask(self, width):
  73         """Create a bit-mask from `self`.
  74
  75         Each bit in the returned mask is clear only if the partition point at
  76         the same bit-index is enabled.
  77
  78         :param width: the bit width of the resulting mask
  79         """
  80         bits = []
  81         for i in range(width):
  82             if i in self:
  83                 bits.append(~self[i])
  84             else:
  85                 bits.append(True)
  86         return Cat(*bits)
  87
  88     def get_max_partition_count(self, width):
  89         """Get the maximum number of partitions.
  90
  91         Gets the number of partitions when all partition points are enabled.
  92         """
  93         retval = 1
  94         for point in self.keys():
  95             if point < width:
  96                 retval += 1
  97         return retval
  98
  99     def fits_in_width(self, width):
 100         """Check if all partition points are smaller than `width`."""
 101         for point in self.keys():
 102             if point >= width:
 103                 return False
 104         return True
 105
 106
 107 class FullAdder(Elaboratable):
 108     """Full Adder.
 109
 110     :attribute in0: the first input
 111     :attribute in1: the second input
 112     :attribute in2: the third input
 113     :attribute sum: the sum output
 114     :attribute carry: the carry output
 115
 116     Rather than do individual full adders (and have an array of them,
 117     which would be very slow to simulate), this module can specify the
 118     bit width of the inputs and outputs: in effect it performs multiple
 119     Full 3-2 Add operations "in parallel".
 120     """
 121
 122     def __init__(self, width):
 123         """Create a ``FullAdder``.
 124
 125         :param width: the bit width of the input and output
 126         """
 127         self.in0 = Signal(width)
 128         self.in1 = Signal(width)
 129         self.in2 = Signal(width)
 130         self.sum = Signal(width)
 131         self.carry = Signal(width)
 132
 133     def elaborate(self, platform):
 134         """Elaborate this module."""
 135         m = Module()
 136         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 137         m.d.comb += self.carry.eq((self.in0 & self.in1)
 138                                   | (self.in1 & self.in2)
 139                                   | (self.in2 & self.in0))
 140         return m
 141
 142
 143 class PartitionedAdder(Elaboratable):
 144     """Partitioned Adder.
 145
 146     :attribute width: the bit width of the input and output. Read-only.
 147     :attribute a: the first input to the adder
 148     :attribute b: the second input to the adder
 149     :attribute output: the sum output
 150     :attribute partition_points: the input partition points. Modification not
 151         supported, except for by ``Signal.eq``.
 152     """
 153
 154     def __init__(self, width, partition_points):
 155         """Create a ``PartitionedAdder``.
 156
 157         :param width: the bit width of the input and output
 158         :param partition_points: the input partition points
 159         """
 160         self.width = width
 161         self.a = Signal(width)
 162         self.b = Signal(width)
 163         self.output = Signal(width)
 164         self.partition_points = PartitionPoints(partition_points)
 165         if not self.partition_points.fits_in_width(width):
 166             raise ValueError("partition_points doesn't fit in width")
 167         expanded_width = 0
 168         for i in range(self.width):
 169             if i in self.partition_points:
 170                 expanded_width += 1
 171             expanded_width += 1
 172         self._expanded_width = expanded_width
 173         # XXX these have to remain here due to some horrible nmigen
 174         # simulation bugs involving sync.  it is *not* necessary to
 175         # have them here, they should (under normal circumstances)
 176         # be moved into elaborate, as they are entirely local
 177         self._expanded_a = Signal(expanded_width)
 178         self._expanded_b = Signal(expanded_width)
 179         self._expanded_output = Signal(expanded_width)
 180
 181     def elaborate(self, platform):
 182         """Elaborate this module."""
 183         m = Module()
 184         expanded_index = 0
 185         # store bits in a list, use Cat later.  graphviz is much cleaner
 186         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 187
 188         # partition points are "breaks" (extra zeros) in what would otherwise
 189         # be a massive long add.
 190         for i in range(self.width):
 191             if i in self.partition_points:
 192                 # add extra bit set to 0 + 0 for enabled partition points
 193                 # and 1 + 0 for disabled partition points
 194                 ea.append(self._expanded_a[expanded_index])
 195                 al.append(~self.partition_points[i])
 196                 eb.append(self._expanded_b[expanded_index])
 197                 bl.append(C(0))
 198                 expanded_index += 1
 199             ea.append(self._expanded_a[expanded_index])
 200             al.append(self.a[i])
 201             eb.append(self._expanded_b[expanded_index])
 202             bl.append(self.b[i])
 203             eo.append(self._expanded_output[expanded_index])
 204             ol.append(self.output[i])
 205             expanded_index += 1
 206         # combine above using Cat
 207         m.d.comb += Cat(*ea).eq(Cat(*al))
 208         m.d.comb += Cat(*eb).eq(Cat(*bl))
 209         m.d.comb += Cat(*ol).eq(Cat(*eo))
 210         # use only one addition to take advantage of look-ahead carry and
 211         # special hardware on FPGAs
 212         m.d.comb += self._expanded_output.eq(
 213             self._expanded_a + self._expanded_b)
 214         return m
 215
 216
# Number of inputs taken by one full adder; ``AddReduce`` groups its inputs
# in runs of this size when building each reduction level.
FULL_ADDER_INPUT_COUNT = 3
 218
 219
 220 class AddReduce(Elaboratable):
 221     """Add list of numbers together.
 222
 223     :attribute inputs: input ``Signal``s to be summed. Modification not
 224         supported, except for by ``Signal.eq``.
 225     :attribute register_levels: List of nesting levels that should have
 226         pipeline registers.
 227     :attribute output: output sum.
 228     :attribute partition_points: the input partition points. Modification not
 229         supported, except for by ``Signal.eq``.
 230     """
 231
 232     def __init__(self, inputs, output_width, register_levels, partition_points):
 233         """Create an ``AddReduce``.
 234
 235         :param inputs: input ``Signal``s to be summed.
 236         :param output_width: bit-width of ``output``.
 237         :param register_levels: List of nesting levels that should have
 238             pipeline registers.
 239         :param partition_points: the input partition points.
 240         """
 241         self.inputs = list(inputs)
 242         self._resized_inputs = [
 243             Signal(output_width, name=f"resized_inputs[{i}]")
 244             for i in range(len(self.inputs))]
 245         self.register_levels = list(register_levels)
 246         self.output = Signal(output_width)
 247         self.partition_points = PartitionPoints(partition_points)
 248         if not self.partition_points.fits_in_width(output_width):
 249             raise ValueError("partition_points doesn't fit in output_width")
 250         self._reg_partition_points = self.partition_points.like()
 251         max_level = AddReduce.get_max_level(len(self.inputs))
 252         for level in self.register_levels:
 253             if level > max_level:
 254                 raise ValueError(
 255                     "not enough adder levels for specified register levels")
 256
 257     @staticmethod
 258     def get_max_level(input_count):
 259         """Get the maximum level.
 260
 261         All ``register_levels`` must be less than or equal to the maximum
 262         level.
 263         """
 264         retval = 0
 265         while True:
 266             groups = AddReduce.full_adder_groups(input_count)
 267             if len(groups) == 0:
 268                 return retval
 269             input_count %= FULL_ADDER_INPUT_COUNT
 270             input_count += 2 * len(groups)
 271             retval += 1
 272
 273     def next_register_levels(self):
 274         """``Iterable`` of ``register_levels`` for next recursive level."""
 275         for level in self.register_levels:
 276             if level > 0:
 277                 yield level - 1
 278
 279     @staticmethod
 280     def full_adder_groups(input_count):
 281         """Get ``inputs`` indices for which a full adder should be built."""
 282         return range(0,
 283                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 284                      FULL_ADDER_INPUT_COUNT)
 285
 286     def elaborate(self, platform):
 287         """Elaborate this module."""
 288         m = Module()
 289
 290         # resize inputs to correct bit-width and optionally add in
 291         # pipeline registers
 292         resized_input_assignments = [self._resized_inputs[i].eq(self.inputs[i])
 293                                      for i in range(len(self.inputs))]
 294         if 0 in self.register_levels:
 295             m.d.sync += resized_input_assignments
 296             m.d.sync += self._reg_partition_points.eq(self.partition_points)
 297         else:
 298             m.d.comb += resized_input_assignments
 299             m.d.comb += self._reg_partition_points.eq(self.partition_points)
 300
 301         groups = AddReduce.full_adder_groups(len(self.inputs))
 302         # if there are no full adders to create, then we handle the base cases
 303         # and return, otherwise we go on to the recursive case
 304         if len(groups) == 0:
 305             if len(self.inputs) == 0:
 306                 # use 0 as the default output value
 307                 m.d.comb += self.output.eq(0)
 308             elif len(self.inputs) == 1:
 309                 # handle single input
 310                 m.d.comb += self.output.eq(self._resized_inputs[0])
 311             else:
 312                 # base case for adding 2 or more inputs, which get recursively
 313                 # reduced to 2 inputs
 314                 assert len(self.inputs) == 2
 315                 adder = PartitionedAdder(len(self.output),
 316                                          self._reg_partition_points)
 317                 m.submodules.final_adder = adder
 318                 m.d.comb += adder.a.eq(self._resized_inputs[0])
 319                 m.d.comb += adder.b.eq(self._resized_inputs[1])
 320                 m.d.comb += self.output.eq(adder.output)
 321             return m
 322         # go on to handle recursive case
 323         intermediate_terms = []
 324
 325         def add_intermediate_term(value):
 326             intermediate_term = Signal(
 327                 len(self.output),
 328                 name=f"intermediate_terms[{len(intermediate_terms)}]")
 329             intermediate_terms.append(intermediate_term)
 330             m.d.comb += intermediate_term.eq(value)
 331
 332         # store mask in intermediary (simplifies graph)
 333         part_mask = Signal(len(self.output), reset_less=True)
 334         mask = self._reg_partition_points.as_mask(len(self.output))
 335         m.d.comb += part_mask.eq(mask)
 336
 337         # create full adders for this recursive level.
 338         # this shrinks N terms to 2 * (N // 3) plus the remainder
 339         for i in groups:
 340             adder_i = FullAdder(len(self.output))
 341             setattr(m.submodules, f"adder_{i}", adder_i)
 342             m.d.comb += adder_i.in0.eq(self._resized_inputs[i])
 343             m.d.comb += adder_i.in1.eq(self._resized_inputs[i + 1])
 344             m.d.comb += adder_i.in2.eq(self._resized_inputs[i + 2])
 345             add_intermediate_term(adder_i.sum)
 346             shifted_carry = adder_i.carry << 1
 347             # mask out carry bits to prevent carries between partitions
 348             add_intermediate_term((adder_i.carry << 1) & part_mask)
 349         # handle the remaining inputs.
 350         if len(self.inputs) % FULL_ADDER_INPUT_COUNT == 1:
 351             add_intermediate_term(self._resized_inputs[-1])
 352         elif len(self.inputs) % FULL_ADDER_INPUT_COUNT == 2:
 353             # Just pass the terms to the next layer, since we wouldn't gain
 354             # anything by using a half adder since there would still be 2 terms
 355             # and just passing the terms to the next layer saves gates.
 356             add_intermediate_term(self._resized_inputs[-2])
 357             add_intermediate_term(self._resized_inputs[-1])
 358         else:
 359             assert len(self.inputs) % FULL_ADDER_INPUT_COUNT == 0
 360         # recursive invocation of ``AddReduce``
 361         next_level = AddReduce(intermediate_terms,
 362                                len(self.output),
 363                                self.next_register_levels(),
 364                                self._reg_partition_points)
 365         m.submodules.next_level = next_level
 366         m.d.comb += self.output.eq(next_level.output)
 367         return m
 368
 369
# Multiplier operation codes (compared against the 2-bit ``part_ops``
# signals elsewhere in this file).
OP_MUL_LOW = 0
OP_MUL_SIGNED_HIGH = 1
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3
 374
 375
 376 def get_term(value, shift=0, enabled=None):
 377     if enabled is not None:
 378         value = Mux(enabled, value, 0)
 379     if shift > 0:
 380         value = Cat(Repl(C(0, 1), shift), value)
 381     else:
 382         assert shift == 0
 383     return value
 384
 385
 386 class ProductTerm(Elaboratable):
 387     """ this class creates a single product term (a[..]*b[..]).
 388         it has a design flaw in that is the *output* that is selected,
 389         where the multiplication(s) are combinatorially generated
 390         all the time.
 391     """
 392
 393     def __init__(self, width, twidth, pbwid, a_index, b_index):
 394         self.a_index = a_index
 395         self.b_index = b_index
 396         shift = 8 * (self.a_index + self.b_index)
 397         self.pwidth = width
 398         self.twidth = twidth
 399         self.width = width*2
 400         self.shift = shift
 401
 402         self.ti = Signal(self.width, reset_less=True)
 403         self.term = Signal(twidth, reset_less=True)
 404         self.a = Signal(twidth//2, reset_less=True)
 405         self.b = Signal(twidth//2, reset_less=True)
 406         self.pb_en = Signal(pbwid, reset_less=True)
 407
 408         self.tl = tl = []
 409         min_index = min(self.a_index, self.b_index)
 410         max_index = max(self.a_index, self.b_index)
 411         for i in range(min_index, max_index):
 412             tl.append(self.pb_en[i])
 413         name = "te_%d_%d" % (self.a_index, self.b_index)
 414         if len(tl) > 0:
 415             term_enabled = Signal(name=name, reset_less=True)
 416         else:
 417             term_enabled = None
 418         self.enabled = term_enabled
 419         self.term.name = "term_%d_%d" % (a_index, b_index) # rename
 420
 421     def elaborate(self, platform):
 422
 423         m = Module()
 424         if self.enabled is not None:
 425             m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
 426
 427         bsa = Signal(self.width, reset_less=True)
 428         bsb = Signal(self.width, reset_less=True)
 429         a_index, b_index = self.a_index, self.b_index
 430         pwidth = self.pwidth
 431         m.d.comb += bsa.eq(self.a.bit_select(a_index * pwidth, pwidth))
 432         m.d.comb += bsb.eq(self.b.bit_select(b_index * pwidth, pwidth))
 433         m.d.comb += self.ti.eq(bsa * bsb)
 434         m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
 435         """
 436         #TODO: sort out width issues, get inputs a/b switched on/off.
 437         #data going into Muxes is 1/2 the required width
 438
 439         pwidth = self.pwidth
 440         width = self.width
 441         bsa = Signal(self.twidth//2, reset_less=True)
 442         bsb = Signal(self.twidth//2, reset_less=True)
 443         asel = Signal(width, reset_less=True)
 444         bsel = Signal(width, reset_less=True)
 445         a_index, b_index = self.a_index, self.b_index
 446         m.d.comb += asel.eq(self.a.bit_select(a_index * pwidth, pwidth))
 447         m.d.comb += bsel.eq(self.b.bit_select(b_index * pwidth, pwidth))
 448         m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
 449         m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
 450         m.d.comb += self.ti.eq(bsa * bsb)
 451         m.d.comb += self.term.eq(self.ti)
 452         """
 453
 454         return m
 455
 456
 457 class ProductTerms(Elaboratable):
 458     """ creates a bank of product terms.  also performs the actual bit-selection
 459         this class is to be wrapped with a for-loop on the "a" operand.
 460         it creates a second-level for-loop on the "b" operand.
 461     """
 462     def __init__(self, width, twidth, pbwid, a_index, blen):
 463         self.a_index = a_index
 464         self.blen = blen
 465         self.pwidth = width
 466         self.twidth = twidth
 467         self.pbwid = pbwid
 468         self.a = Signal(twidth//2, reset_less=True)
 469         self.b = Signal(twidth//2, reset_less=True)
 470         self.pb_en = Signal(pbwid, reset_less=True)
 471         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 472                             for i in range(blen)]
 473
 474     def elaborate(self, platform):
 475
 476         m = Module()
 477
 478         for b_index in range(self.blen):
 479             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 480                             self.a_index, b_index)
 481             setattr(m.submodules, "term_%d" % b_index, t)
 482
 483             m.d.comb += t.a.eq(self.a)
 484             m.d.comb += t.b.eq(self.b)
 485             m.d.comb += t.pb_en.eq(self.pb_en)
 486
 487             m.d.comb += self.terms[b_index].eq(t.term)
 488
 489         return m
 490
 491
 492 class Part(Elaboratable):
 493     """ a key class which, depending on the partitioning, will determine
 494         what action to take when parts of the output are signed or unsigned.
 495
 496         this requires 2 pieces of data *per operand, per partition*:
 497         whether the MSB is HI/LO (per partition!), and whether a signed
 498         or unsigned operation has been *requested*.
 499
 500         once that is determined, signed is basically carried out
 501         by splitting 2's complement into 1's complement plus one.
 502         1's complement is just a bit-inversion.
 503
 504         the extra terms - as separate terms - are then thrown at the
 505         AddReduce alongside the multiplication part-results.
 506     """
 507     def __init__(self, width, n_parts, n_levels, pbwid):
 508
 509         # inputs
 510         self.a = Signal(64)
 511         self.b = Signal(64)
 512         self.a_signed = [Signal(name=f"a_signed_{i}") for i in range(8)]
 513         self.b_signed = [Signal(name=f"_b_signed_{i}") for i in range(8)]
 514         self.pbs = Signal(pbwid, reset_less=True)
 515
 516         # outputs
 517         self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
 518         self.delayed_parts = [
 519             [Signal(name=f"delayed_part_{delay}_{i}")
 520              for i in range(n_parts)]
 521                 for delay in range(n_levels)]
 522         # XXX REALLY WEIRD BUG - have to take a copy of the last delayed_parts
 523         self.dplast = [Signal(name=f"dplast_{i}")
 524                          for i in range(n_parts)]
 525
 526         self.not_a_term = Signal(width)
 527         self.neg_lsb_a_term = Signal(width)
 528         self.not_b_term = Signal(width)
 529         self.neg_lsb_b_term = Signal(width)
 530
 531     def elaborate(self, platform):
 532         m = Module()
 533
 534         pbs, parts, delayed_parts = self.pbs, self.parts, self.delayed_parts
 535         byte_count = 8 // len(parts)
 536         for i in range(len(parts)):
 537             pbl = []
 538             pbl.append(~pbs[i * byte_count - 1])
 539             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 540                 pbl.append(pbs[j])
 541             pbl.append(~pbs[(i + 1) * byte_count - 1])
 542             value = Signal(len(pbl), reset_less=True)
 543             m.d.comb += value.eq(Cat(*pbl))
 544             m.d.comb += parts[i].eq(~(value).bool())
 545             m.d.comb += delayed_parts[0][i].eq(parts[i])
 546             m.d.sync += [delayed_parts[j + 1][i].eq(delayed_parts[j][i])
 547                          for j in range(len(delayed_parts)-1)]
 548             m.d.comb += self.dplast[i].eq(delayed_parts[-1][i])
 549
 550         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = \
 551                 self.not_a_term, self.neg_lsb_a_term, \
 552                 self.not_b_term, self.neg_lsb_b_term
 553
 554         byte_width = 8 // len(parts) # byte width
 555         bit_wid = 8 * byte_width     # bit width
 556         ext = Repl(0, bit_wid)       # extend output to HI part
 557         nat, nbt, nla, nlb = [], [], [], []
 558         for i in range(len(parts)):
 559             # determine sign of each incoming number *in this partition*
 560             be = (parts[i] & self.a[(i + 1) * bit_wid - 1]  # MSB
 561                 & self.a_signed[i * byte_width])            # a op is signed?
 562             ae = (parts[i] & self.b[(i + 1) * bit_wid - 1]  # MSB
 563                 & self.b_signed[i * byte_width])            # b op is signed?
 564             a_enabled = Signal(name="a_en_%d" % i, reset_less=True)
 565             b_enabled = Signal(name="b_en_%d" % i, reset_less=True)
 566             m.d.comb += a_enabled.eq(ae)
 567             m.d.comb += b_enabled.eq(be)
 568
 569             # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 570             # negation operation is split into a bitwise not and a +1.
 571             # likewise for 16, 32, and 64-bit values.
 572
 573             # a: width-extended 1s complement if a is signed, otherwise zero
 574             nat.append(Mux(a_enabled,
 575                            Cat(ext, ~self.a.bit_select(bit_wid * i, bit_wid)),
 576                            0))
 577
 578             # a: add 1 if a signed, otherwise add zero
 579             nla.append(Cat(ext, a_enabled, Repl(0, bit_wid-1)))
 580
 581             # b: width-extended 1s complement if a is signed, otherwise zero
 582             nbt.append(Mux(b_enabled,
 583                            Cat(ext, ~self.b.bit_select(bit_wid * i, bit_wid)),
 584                            0))
 585
 586             # b: add 1 if b signed, otherwise add zero
 587             nlb.append(Cat(ext, b_enabled, Repl(0, bit_wid-1)))
 588
 589         # concatenate together and return all 4 results.
 590         m.d.comb += [not_a_term.eq(Cat(*nat)),
 591                      not_b_term.eq(Cat(*nbt)),
 592                      neg_lsb_a_term.eq(Cat(*nla)),
 593                      neg_lsb_b_term.eq(Cat(*nlb)),
 594                     ]
 595
 596         return m
 597
 598
 599 class IntermediateOut(Elaboratable):
 600     """ selects the HI/LO part of the multiplication, for a given bit-width
 601         the output is also reconstructed in its SIMD (partition) lanes.
 602     """
 603     def __init__(self, width, out_wid, n_parts):
 604         self.width = width
 605         self.n_parts = n_parts
 606         self.delayed_part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 607                                      for i in range(8)]
 608         self.intermed = Signal(out_wid, reset_less=True)
 609         self.output = Signal(out_wid//2, reset_less=True)
 610
 611     def elaborate(self, platform):
 612         m = Module()
 613
 614         ol = []
 615         w = self.width
 616         sel = w // 8
 617         for i in range(self.n_parts):
 618             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 619             m.d.comb += op.eq(
 620                 Mux(self.delayed_part_ops[sel * i] == OP_MUL_LOW,
 621                     self.intermed.bit_select(i * w*2, w),
 622                     self.intermed.bit_select(i * w*2 + w, w)))
 623             ol.append(op)
 624         m.d.comb += self.output.eq(Cat(*ol))
 625
 626         return m
 627
 628
 629 class FinalOut(Elaboratable):
 630     """ selects the final output based on the partitioning.
 631
 632         each byte is selectable independently, i.e. it is possible
 633         that some partitions requested 8-bit computation whilst others
 634         requested 16 or 32 bit.
 635     """
 636     def __init__(self, out_wid):
 637         # inputs
 638         self.d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
 639         self.d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
 640         self.d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
 641
 642         self.i8 = Signal(out_wid, reset_less=True)
 643         self.i16 = Signal(out_wid, reset_less=True)
 644         self.i32 = Signal(out_wid, reset_less=True)
 645         self.i64 = Signal(out_wid, reset_less=True)
 646
 647         # output
 648         self.out = Signal(out_wid, reset_less=True)
 649
 650     def elaborate(self, platform):
 651         m = Module()
 652         ol = []
 653         for i in range(8):
 654             # select one of the outputs: d8 selects i8, d16 selects i16
 655             # d32 selects i32, and the default is i64.
 656             # d8 and d16 are ORed together in the first Mux
 657             # then the 2nd selects either i8 or i16.
 658             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
 659             op = Signal(8, reset_less=True, name="op_%d" % i)
 660             m.d.comb += op.eq(
 661                 Mux(self.d8[i] | self.d16[i // 2],
 662                     Mux(self.d8[i], self.i8.bit_select(i * 8, 8),
 663                                      self.i16.bit_select(i * 8, 8)),
 664                     Mux(self.d32[i // 4], self.i32.bit_select(i * 8, 8),
 665                                           self.i64.bit_select(i * 8, 8))))
 666             ol.append(op)
 667         m.d.comb += self.out.eq(Cat(*ol))
 668         return m
 669
 670
 671 class OrMod(Elaboratable):
 672     """ ORs four values together in a hierarchical tree
 673     """
 674     def __init__(self, wid):
 675         self.wid = wid
 676         self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
 677                      for i in range(4)]
 678         self.orout = Signal(wid, reset_less=True)
 679
 680     def elaborate(self, platform):
 681         m = Module()
 682         or1 = Signal(self.wid, reset_less=True)
 683         or2 = Signal(self.wid, reset_less=True)
 684         m.d.comb += or1.eq(self.orin[0] | self.orin[1])
 685         m.d.comb += or2.eq(self.orin[2] | self.orin[3])
 686         m.d.comb += self.orout.eq(or1 | or2)
 687
 688         return m
 689
 690
 691 class Signs(Elaboratable):
 692     """ determines whether a or b are signed numbers
 693         based on the required operation type (OP_MUL_*)
 694     """
 695
 696     def __init__(self):
 697         self.part_ops = Signal(2, reset_less=True)
 698         self.a_signed = Signal(reset_less=True)
 699         self.b_signed = Signal(reset_less=True)
 700
 701     def elaborate(self, platform):
 702
 703         m = Module()
 704
 705         asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
 706         bsig = (self.part_ops == OP_MUL_LOW) \
 707                     | (self.part_ops == OP_MUL_SIGNED_HIGH)
 708         m.d.comb += self.a_signed.eq(asig)
 709         m.d.comb += self.b_signed.eq(bsig)
 710
 711         return m
 712
 713
 714 class Mul8_16_32_64(Elaboratable):
 715     """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
 716
 717     Supports partitioning into any combination of 8, 16, 32, and 64-bit
 718     partitions on naturally-aligned boundaries. Supports the operation being
 719     set for each partition independently.
 720
 721     :attribute part_pts: the input partition points. Has a partition point at
 722         multiples of 8 in 0 < i < 64. Each partition point's associated
 723         ``Value`` is a ``Signal``. Modification not supported, except for by
 724         ``Signal.eq``.
 725     :attribute part_ops: the operation for each byte. The operation for a
 726         particular partition is selected by assigning the selected operation
 727         code to each byte in the partition. The allowed operation codes are:
 728
 729         :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
 730             RISC-V's `mul` instruction.
 731         :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
 732             ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
 733             instruction.
 734         :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
 735             where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
 736             `mulhsu` instruction.
 737         :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
 738             ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
 739             instruction.
 740     """
 741
 742     def __init__(self, register_levels=()):
 743         """ register_levels: specifies the points in the cascade at which
 744             flip-flops are to be inserted.
 745         """
 746
 747         # parameter(s)
 748         self.register_levels = list(register_levels)
 749
 750         # inputs
 751         self.part_pts = PartitionPoints()
 752         for i in range(8, 64, 8):
 753             self.part_pts[i] = Signal(name=f"part_pts_{i}")
 754         self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
 755         self.a = Signal(64)
 756         self.b = Signal(64)
 757
 758         # intermediates (needed for unit tests)
 759         self._intermediate_output = Signal(128)
 760
 761         # output
 762         self.output = Signal(64)
 763
 764     def _part_byte(self, index):
 765         if index == -1 or index == 7:
 766             return C(True, 1)
 767         assert index >= 0 and index < 8
 768         return self.part_pts[index * 8 + 8]
 769
 770     def elaborate(self, platform):
 771         m = Module()
 772
 773         # collect part-bytes
 774         pbs = Signal(8, reset_less=True)
 775         tl = []
 776         for i in range(8):
 777             pb = Signal(name="pb%d" % i, reset_less=True)
 778             m.d.comb += pb.eq(self._part_byte(i))
 779             tl.append(pb)
 780         m.d.comb += pbs.eq(Cat(*tl))
 781
 782         # local variables
 783         signs = []
 784         for i in range(8):
 785             s = Signs()
 786             signs.append(s)
 787             setattr(m.submodules, "signs%d" % i, s)
 788             m.d.comb += s.part_ops.eq(self.part_ops[i])
 789
 790         delayed_part_ops = [
 791             [Signal(2, name=f"_delayed_part_ops_{delay}_{i}")
 792              for i in range(8)]
 793             for delay in range(1 + len(self.register_levels))]
 794         for i in range(len(self.part_ops)):
 795             m.d.comb += delayed_part_ops[0][i].eq(self.part_ops[i])
 796             m.d.sync += [delayed_part_ops[j + 1][i].eq(delayed_part_ops[j][i])
 797                          for j in range(len(self.register_levels))]
 798
 799         n_levels = len(self.register_levels)+1
 800         m.submodules.part_8 = part_8 = Part(128, 8, n_levels, 8)
 801         m.submodules.part_16 = part_16 = Part(128, 4, n_levels, 8)
 802         m.submodules.part_32 = part_32 = Part(128, 2, n_levels, 8)
 803         m.submodules.part_64 = part_64 = Part(128, 1, n_levels, 8)
 804         nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
 805         for mod in [part_8, part_16, part_32, part_64]:
 806             m.d.comb += mod.a.eq(self.a)
 807             m.d.comb += mod.b.eq(self.b)
 808             for i in range(len(signs)):
 809                 m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
 810                 m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
 811             m.d.comb += mod.pbs.eq(pbs)
 812             nat_l.append(mod.not_a_term)
 813             nbt_l.append(mod.not_b_term)
 814             nla_l.append(mod.neg_lsb_a_term)
 815             nlb_l.append(mod.neg_lsb_b_term)
 816
 817         terms = []
 818
 819         for a_index in range(8):
 820             t = ProductTerms(8, 128, 8, a_index, 8)
 821             setattr(m.submodules, "terms_%d" % a_index, t)
 822
 823             m.d.comb += t.a.eq(self.a)
 824             m.d.comb += t.b.eq(self.b)
 825             m.d.comb += t.pb_en.eq(pbs)
 826
 827             for term in t.terms:
 828                 terms.append(term)
 829
 830         # it's fine to bitwise-or data together since they are never enabled
 831         # at the same time
 832         m.submodules.nat_or = nat_or = OrMod(128)
 833         m.submodules.nbt_or = nbt_or = OrMod(128)
 834         m.submodules.nla_or = nla_or = OrMod(128)
 835         m.submodules.nlb_or = nlb_or = OrMod(128)
 836         for l, mod in [(nat_l, nat_or),
 837                              (nbt_l, nbt_or),
 838                              (nla_l, nla_or),
 839                              (nlb_l, nlb_or)]:
 840             for i in range(len(l)):
 841                 m.d.comb += mod.orin[i].eq(l[i])
 842             terms.append(mod.orout)
 843
 844         expanded_part_pts = PartitionPoints()
 845         for i, v in self.part_pts.items():
 846             signal = Signal(name=f"expanded_part_pts_{i*2}", reset_less=True)
 847             expanded_part_pts[i * 2] = signal
 848             m.d.comb += signal.eq(v)
 849
 850         add_reduce = AddReduce(terms,
 851                                128,
 852                                self.register_levels,
 853                                expanded_part_pts)
 854         m.submodules.add_reduce = add_reduce
 855         m.d.comb += self._intermediate_output.eq(add_reduce.output)
 856         # create _output_64
 857         m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
 858         m.d.comb += io64.intermed.eq(self._intermediate_output)
 859         for i in range(8):
 860             m.d.comb += io64.delayed_part_ops[i].eq(delayed_part_ops[-1][i])
 861
 862         # create _output_32
 863         m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
 864         m.d.comb += io32.intermed.eq(self._intermediate_output)
 865         for i in range(8):
 866             m.d.comb += io32.delayed_part_ops[i].eq(delayed_part_ops[-1][i])
 867
 868         # create _output_16
 869         m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
 870         m.d.comb += io16.intermed.eq(self._intermediate_output)
 871         for i in range(8):
 872             m.d.comb += io16.delayed_part_ops[i].eq(delayed_part_ops[-1][i])
 873
 874         # create _output_8
 875         m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
 876         m.d.comb += io8.intermed.eq(self._intermediate_output)
 877         for i in range(8):
 878             m.d.comb += io8.delayed_part_ops[i].eq(delayed_part_ops[-1][i])
 879
 880         # final output
 881         m.submodules.finalout = finalout = FinalOut(64)
 882         for i in range(len(part_8.delayed_parts[-1])):
 883             m.d.comb += finalout.d8[i].eq(part_8.dplast[i])
 884         for i in range(len(part_16.delayed_parts[-1])):
 885             m.d.comb += finalout.d16[i].eq(part_16.dplast[i])
 886         for i in range(len(part_32.delayed_parts[-1])):
 887             m.d.comb += finalout.d32[i].eq(part_32.dplast[i])
 888         m.d.comb += finalout.i8.eq(io8.output)
 889         m.d.comb += finalout.i16.eq(io16.output)
 890         m.d.comb += finalout.i32.eq(io32.output)
 891         m.d.comb += finalout.i64.eq(io64.output)
 892         m.d.comb += self.output.eq(finalout.out)
 893
 894         return m
 895
 896
 897 if __name__ == "__main__":
 898     m = Mul8_16_32_64()
 899     main(m, ports=[m.a,
 900                    m.b,
 901                    m._intermediate_output,
 902                    m.output,
 903                    *m.part_ops,
 904                    *m.part_pts.values()])