src/ieee754/part_mul_add/adder.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Partitioned Integer Addition.
   4
   5 See:
   6 * https://libre-riscv.org/3d_gpu/architecture/dynamic_simd/add/
   7 """
   8
   9 from nmigen import Signal, Module, Elaboratable, Cat
  10
  11 from ieee754.part_mul_add.partpoints import PartitionPoints
  12 from ieee754.part_cmp.ripple import MoveMSBDown
  13
  14
  15 class FullAdder(Elaboratable):
  16     """Full Adder.
  17
  18     :attribute in0: the first input
  19     :attribute in1: the second input
  20     :attribute in2: the third input
  21     :attribute sum: the sum output
  22     :attribute carry: the carry output
  23
  24     Rather than do individual full adders (and have an array of them,
  25     which would be very slow to simulate), this module can specify the
  26     bit width of the inputs and outputs: in effect it performs multiple
  27     Full 3-2 Add operations "in parallel".
  28     """
  29
  30     def __init__(self, width):
  31         """Create a ``FullAdder``.
  32
  33         :param width: the bit width of the input and output
  34         """
  35         self.in0 = Signal(width, reset_less=True)
  36         self.in1 = Signal(width, reset_less=True)
  37         self.in2 = Signal(width, reset_less=True)
  38         self.sum = Signal(width, reset_less=True)
  39         self.carry = Signal(width, reset_less=True)
  40
  41     def elaborate(self, platform):
  42         """Elaborate this module."""
  43         m = Module()
  44         comb = m.d.comb
  45         comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
  46         comb += self.carry.eq((self.in0 & self.in1)
  47                               | (self.in1 & self.in2)
  48                               | (self.in2 & self.in0))
  49         return m
  50
  51
  52 class MaskedFullAdder(Elaboratable):
  53     """Masked Full Adder.
  54
  55     :attribute mask: the carry partition mask
  56     :attribute in0: the first input
  57     :attribute in1: the second input
  58     :attribute in2: the third input
  59     :attribute sum: the sum output
  60     :attribute mcarry: the masked carry output
  61
  62     FullAdders are always used with a "mask" on the output.  To keep
  63     the graphviz "clean", this class performs the masking here rather
  64     than inside a large for-loop.
  65
  66     See the following discussion as to why this is no longer derived
  67     from FullAdder.  Each carry is shifted here *before* being ANDed
  68     with the mask, so that an AOI cell may be used (which is more
  69     gate-efficient)
  70     https://en.wikipedia.org/wiki/AND-OR-Invert
  71     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
  72     """
  73
  74     def __init__(self, width):
  75         """Create a ``MaskedFullAdder``.
  76
  77         :param width: the bit width of the input and output
  78         """
  79         self.width = width
  80         self.mask = Signal(width, reset_less=True)
  81         self.mcarry = Signal(width, reset_less=True)
  82         self.in0 = Signal(width, reset_less=True)
  83         self.in1 = Signal(width, reset_less=True)
  84         self.in2 = Signal(width, reset_less=True)
  85         self.sum = Signal(width, reset_less=True)
  86
  87     def elaborate(self, platform):
  88         """Elaborate this module."""
  89         m = Module()
  90         comb = m.d.comb
  91         s1 = Signal(self.width, reset_less=True)
  92         s2 = Signal(self.width, reset_less=True)
  93         s3 = Signal(self.width, reset_less=True)
  94         c1 = Signal(self.width, reset_less=True)
  95         c2 = Signal(self.width, reset_less=True)
  96         c3 = Signal(self.width, reset_less=True)
  97         comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
  98         comb += s1.eq(Cat(0, self.in0))
  99         comb += s2.eq(Cat(0, self.in1))
 100         comb += s3.eq(Cat(0, self.in2))
 101         comb += c1.eq(s1 & s2 & self.mask)
 102         comb += c2.eq(s2 & s3 & self.mask)
 103         comb += c3.eq(s3 & s1 & self.mask)
 104         comb += self.mcarry.eq(c1 | c2 | c3)
 105         return m
 106
 107
 108 class PartitionedAdder(Elaboratable):
 109     """Partitioned Adder.
 110
 111     Performs the final add.  The partition points are included in the
 112     actual add (in one of the operands only), which causes a carry over
 113     to the next bit.  Then the final output *removes* the extra bits from
 114     the result.
 115
 116     In the case of no carry:
 117     partition: .... P... P... P... P... (32 bits)
 118     a        : .... .... .... .... .... (32 bits)
 119     b        : .... .... .... .... .... (32 bits)
 120     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 121     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 122     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 123     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 124
 125     However, with carry the behavior is a little different:
 126     partition:      p    p    p    p      (4 bits)
 127     carry-in :      c    c    c    c    c (5 bits)
 128     C = c & P:      C    C    C    C    c (5 bits)
 129     I = P=>c :      I    I    I    I    c (5 bits)
 130     a        :  AAAA AAAA AAAA AAAA AAAA  (32 bits)
 131     b        :  BBBB BBBB BBBB BBBB BBBB  (32 bits)
 132     exp-a    : 0AAAACAAAACAAAACAAAACAAAAc (32+4+2 bits, P=1 if no partition)
 133     exp-b    : 0BBBBIBBBBIBBBBIBBBBIBBBBc (32+2 bits plus 4 zeros)
 134     exp-o    : o....oN...oN...oN...oN...x (32+4+2 bits - x to be discarded)
 135     o        :  .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 136     carry-out: o    o    o    o    o      (5 bits)
 137
 138     A couple of differences should be noted:
 139      - The expanded a/b/o have 2 extra bits added to them. These bits
 140        allow the carry-in for the least significant partition to be
 141        injected, and the carry out for the most significant partition
 142        to be extracted.
 143      - The partition bits P and 0 in the first example have been
 144        replaced with bits C and I. Bits C and I are set to 1 when
 145        there is a partition and a carry-in at that position. This has
 146        the effect of creating a carry at that position in the expanded
 147        adder, while preventing carries from the previous partition
 148        from propogating through to the next. These bits are also used
 149        to extract the carry-out information for each partition, as
 150        when there is a carry out in a partition, the next most
 151        significant partition bit will be set to 1
 152
 153     Additionally, the carry-out bits must be rearranged before being
 154     output to move the most significant carry bit for each partition
 155     into the least significant bit for that partition, as well as to
 156     ignore the other carry bits in that partition. This is
 157     accomplished by the MoveMSBDown module
 158
 159     :attribute width: the bit width of the input and output. Read-only.
 160     :attribute a: the first input to the adder
 161     :attribute b: the second input to the adder
 162     :attribute output: the sum output
 163     :attribute part_pts: the input partition points. Modification not
 164         supported, except for by ``Signal.eq``.
 165     """
 166
 167     def __init__(self, width, part_pts, partition_step=1):
 168         """Create a ``PartitionedAdder``.
 169
 170         :param width: the bit width of the input and output
 171         :param part_pts: the input partition points
 172         :param partition_step: a multiplier (typically double) step
 173                                which in-place "expands" the partition points
 174         """
 175         self.width = width
 176         self.pmul = partition_step
 177         self.part_pts = PartitionPoints(part_pts)
 178         self.a = Signal(width, reset_less=True)
 179         self.b = Signal(width, reset_less=True)
 180         self.carry_in = Signal(self.part_pts.get_max_partition_count(width))
 181         self.carry_out = Signal(self.part_pts.get_max_partition_count(width))
 182         self.output = Signal(width, reset_less=True)
 183         if not self.part_pts.fits_in_width(width):
 184             raise ValueError("partition_points doesn't fit in width")
 185         expanded_width = 2
 186         for i in range(self.width):
 187             if i in self.part_pts:
 188                 expanded_width += 1
 189             expanded_width += 1
 190         self._expanded_width = expanded_width
 191
 192     def elaborate(self, platform):
 193         """Elaborate this module."""
 194         m = Module()
 195         comb = m.d.comb
 196
 197         carry_tmp = Signal(self.carry_out.width)
 198         m.submodules.ripple = ripple = MoveMSBDown(self.carry_out.width)
 199
 200         expanded_a = Signal(self._expanded_width, reset_less=True)
 201         expanded_b = Signal(self._expanded_width, reset_less=True)
 202         expanded_o = Signal(self._expanded_width, reset_less=True)
 203
 204         expanded_index = 0
 205         # store bits in a list, use Cat later.  graphviz is much cleaner
 206         al, bl, ol, cl, ea, eb, eo, co = [], [], [], [], [], [], [], []
 207
 208         # partition points are "breaks" (extra zeros or 1s) in what would
 209         # otherwise be a massive long add.  when the "break" points are 0,
 210         # whatever is in it (in the output) is discarded.  however when
 211         # there is a "1", it causes a roll-over carry to the *next* bit.
 212         # we still ignore the "break" bit in the [intermediate] output,
 213         # however by that time we've got the effect that we wanted: the
 214         # carry has been carried *over* the break point.
 215
 216         carry_bit = 0
 217         al.append(self.carry_in[carry_bit])
 218         bl.append(self.carry_in[carry_bit])
 219         ea.append(expanded_a[expanded_index])
 220         eb.append(expanded_b[expanded_index])
 221         carry_bit += 1
 222         expanded_index += 1
 223
 224         for i in range(self.width):
 225             pi = i/self.pmul  # double the range of the partition point test
 226             if pi.is_integer() and pi in self.part_pts:
 227                 # add extra bit set to carry + carry for enabled
 228                 # partition points
 229                 a_bit = Signal(name="a_bit_%d" % i, reset_less=True)
 230                 carry_in = self.carry_in[carry_bit]  # convenience
 231                 m.d.comb += a_bit.eq(self.part_pts[pi].implies(carry_in))
 232
 233                 # and 1 + 0 for disabled partition points
 234                 ea.append(expanded_a[expanded_index])
 235                 al.append(a_bit)  # add extra bit in a
 236                 eb.append(expanded_b[expanded_index])
 237                 bl.append(carry_in & self.part_pts[pi])  # carry bit
 238                 co.append(expanded_o[expanded_index])
 239                 cl.append(carry_tmp[carry_bit-1])
 240                 expanded_index += 1  # skip the extra point.  NOT in the output
 241                 carry_bit += 1
 242             ea.append(expanded_a[expanded_index])
 243             eb.append(expanded_b[expanded_index])
 244             eo.append(expanded_o[expanded_index])
 245             al.append(self.a[i])
 246             bl.append(self.b[i])
 247             ol.append(self.output[i])
 248             expanded_index += 1
 249         al.append(0)
 250         bl.append(0)
 251         co.append(expanded_o[-1])
 252         cl.append(carry_tmp[carry_bit-1])
 253
 254         # combine above using Cat
 255         comb += Cat(*ea).eq(Cat(*al))
 256         comb += Cat(*eb).eq(Cat(*bl))
 257         comb += Cat(*ol).eq(Cat(*eo))
 258         comb += Cat(*cl).eq(Cat(*co))
 259
 260         # use only one addition to take advantage of look-ahead carry and
 261         # special hardware on FPGAs
 262         comb += expanded_o.eq(expanded_a + expanded_b)
 263
 264         # ok now we have the carry-out, however because it's the MSB it's
 265         # in the wrong position in the output as far as putting it into
 266         # a chain of adds (or other operations).  therefore we need to
 267         # "ripple" carry-out down to the same position that carry-in is
 268         # in [the LSB of each partition].
 269         comb += ripple.results_in.eq(carry_tmp)
 270         comb += ripple.gates.eq(self.part_pts.as_sig())
 271         m.d.sync += self.carry_out.eq(ripple.output)
 272
 273         return m