run tests in parallel
[ieee754fpu.git] / src / ieee754 / part_mul_add / adder.py
1 # SPDX-License-Identifier: LGPL-2.1-or-later
2 # See Notices.txt for copyright information
3 """Partitioned Integer Addition.
4
5 See:
6 * https://libre-riscv.org/3d_gpu/architecture/dynamic_simd/add/
7 """
8
9 from nmigen import Signal, Module, Elaboratable, Cat
10
11 from ieee754.part_mul_add.partpoints import PartitionPoints
12 from ieee754.part_cmp.ripple import MoveMSBDown
13
14
15 class FullAdder(Elaboratable):
16 """Full Adder.
17
18 :attribute in0: the first input
19 :attribute in1: the second input
20 :attribute in2: the third input
21 :attribute sum: the sum output
22 :attribute carry: the carry output
23
24 Rather than do individual full adders (and have an array of them,
25 which would be very slow to simulate), this module can specify the
26 bit width of the inputs and outputs: in effect it performs multiple
27 Full 3-2 Add operations "in parallel".
28 """
29
30 def __init__(self, width):
31 """Create a ``FullAdder``.
32
33 :param width: the bit width of the input and output
34 """
35 self.in0 = Signal(width, reset_less=True)
36 self.in1 = Signal(width, reset_less=True)
37 self.in2 = Signal(width, reset_less=True)
38 self.sum = Signal(width, reset_less=True)
39 self.carry = Signal(width, reset_less=True)
40
41 def elaborate(self, platform):
42 """Elaborate this module."""
43 m = Module()
44 comb = m.d.comb
45 comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
46 comb += self.carry.eq((self.in0 & self.in1)
47 | (self.in1 & self.in2)
48 | (self.in2 & self.in0))
49 return m
50
51
52 class MaskedFullAdder(Elaboratable):
53 """Masked Full Adder.
54
55 :attribute mask: the carry partition mask
56 :attribute in0: the first input
57 :attribute in1: the second input
58 :attribute in2: the third input
59 :attribute sum: the sum output
60 :attribute mcarry: the masked carry output
61
62 FullAdders are always used with a "mask" on the output. To keep
63 the graphviz "clean", this class performs the masking here rather
64 than inside a large for-loop.
65
66 See the following discussion as to why this is no longer derived
67 from FullAdder. Each carry is shifted here *before* being ANDed
68 with the mask, so that an AOI cell may be used (which is more
69 gate-efficient)
70 https://en.wikipedia.org/wiki/AND-OR-Invert
71 https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
72 """
73
74 def __init__(self, width):
75 """Create a ``MaskedFullAdder``.
76
77 :param width: the bit width of the input and output
78 """
79 self.width = width
80 self.mask = Signal(width, reset_less=True)
81 self.mcarry = Signal(width, reset_less=True)
82 self.in0 = Signal(width, reset_less=True)
83 self.in1 = Signal(width, reset_less=True)
84 self.in2 = Signal(width, reset_less=True)
85 self.sum = Signal(width, reset_less=True)
86
87 def elaborate(self, platform):
88 """Elaborate this module."""
89 m = Module()
90 comb = m.d.comb
91 s1 = Signal(self.width, reset_less=True)
92 s2 = Signal(self.width, reset_less=True)
93 s3 = Signal(self.width, reset_less=True)
94 c1 = Signal(self.width, reset_less=True)
95 c2 = Signal(self.width, reset_less=True)
96 c3 = Signal(self.width, reset_less=True)
97 comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
98 comb += s1.eq(Cat(0, self.in0))
99 comb += s2.eq(Cat(0, self.in1))
100 comb += s3.eq(Cat(0, self.in2))
101 comb += c1.eq(s1 & s2 & self.mask)
102 comb += c2.eq(s2 & s3 & self.mask)
103 comb += c3.eq(s3 & s1 & self.mask)
104 comb += self.mcarry.eq(c1 | c2 | c3)
105 return m
106
107
108 class PartitionedAdder(Elaboratable):
109 """Partitioned Adder.
110
111 Performs the final add. The partition points are included in the
112 actual add (in one of the operands only), which causes a carry over
113 to the next bit. Then the final output *removes* the extra bits from
114 the result.
115
116 In the case of no carry:
117 partition: .... P... P... P... P... (32 bits)
118 a : .... .... .... .... .... (32 bits)
119 b : .... .... .... .... .... (32 bits)
120 exp-a : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
121 exp-b : ....0....0....0....0.... (32 bits plus 4 zeros)
122 exp-o : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
123 o : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
124
125 However, with carry the behavior is a little different:
126 partition: p p p p (4 bits)
127 carry-in : c c c c c (5 bits)
128 C = c & P: C C C C c (5 bits)
129 I = P=>c : I I I I c (5 bits)
130 a : AAAA AAAA AAAA AAAA AAAA (32 bits)
131 b : BBBB BBBB BBBB BBBB BBBB (32 bits)
132 exp-a : 0AAAACAAAACAAAACAAAACAAAAc (32+4+2 bits, P=1 if no partition)
133 exp-b : 0BBBBIBBBBIBBBBIBBBBIBBBBc (32+2 bits plus 4 zeros)
134 exp-o : o....oN...oN...oN...oN...x (32+4+2 bits - x to be discarded)
135 o : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
136 carry-out: o o o o o (5 bits)
137
138 A couple of differences should be noted:
139 - The expanded a/b/o have 2 extra bits added to them. These bits
140 allow the carry-in for the least significant partition to be
141 injected, and the carry out for the most significant partition
142 to be extracted.
143 - The partition bits P and 0 in the first example have been
144 replaced with bits C and I. Bits C and I are set to 1 when
145 there is a partition and a carry-in at that position. This has
146 the effect of creating a carry at that position in the expanded
147 adder, while preventing carries from the previous partition
148 from propogating through to the next. These bits are also used
149 to extract the carry-out information for each partition, as
150 when there is a carry out in a partition, the next most
151 significant partition bit will be set to 1
152
153 Additionally, the carry-out bits must be rearranged before being
154 output to move the most significant carry bit for each partition
155 into the least significant bit for that partition, as well as to
156 ignore the other carry bits in that partition. This is
157 accomplished by the MoveMSBDown module
158
159 :attribute width: the bit width of the input and output. Read-only.
160 :attribute a: the first input to the adder
161 :attribute b: the second input to the adder
162 :attribute output: the sum output
163 :attribute part_pts: the input partition points. Modification not
164 supported, except for by ``Signal.eq``.
165 """
166
167 def __init__(self, width, part_pts, partition_step=1):
168 """Create a ``PartitionedAdder``.
169
170 :param width: the bit width of the input and output
171 :param part_pts: the input partition points
172 :param partition_step: a multiplier (typically double) step
173 which in-place "expands" the partition points
174 """
175 self.width = width
176 self.pmul = partition_step
177 self.part_pts = PartitionPoints(part_pts)
178 self.a = Signal(width, reset_less=True)
179 self.b = Signal(width, reset_less=True)
180 self.carry_in = Signal(self.part_pts.get_max_partition_count(width))
181 self.carry_out = Signal(self.part_pts.get_max_partition_count(width))
182 self.output = Signal(width, reset_less=True)
183 if not self.part_pts.fits_in_width(width):
184 raise ValueError("partition_points doesn't fit in width")
185 expanded_width = 2
186 for i in range(self.width):
187 if i in self.part_pts:
188 expanded_width += 1
189 expanded_width += 1
190 self._expanded_width = expanded_width
191
192 def elaborate(self, platform):
193 """Elaborate this module."""
194 m = Module()
195 comb = m.d.comb
196
197 carry_tmp = Signal(self.carry_out.width)
198 m.submodules.ripple = ripple = MoveMSBDown(self.carry_out.width)
199
200 expanded_a = Signal(self._expanded_width, reset_less=True)
201 expanded_b = Signal(self._expanded_width, reset_less=True)
202 expanded_o = Signal(self._expanded_width, reset_less=True)
203
204 expanded_index = 0
205 # store bits in a list, use Cat later. graphviz is much cleaner
206 al, bl, ol, cl, ea, eb, eo, co = [], [], [], [], [], [], [], []
207
208 # partition points are "breaks" (extra zeros or 1s) in what would
209 # otherwise be a massive long add. when the "break" points are 0,
210 # whatever is in it (in the output) is discarded. however when
211 # there is a "1", it causes a roll-over carry to the *next* bit.
212 # we still ignore the "break" bit in the [intermediate] output,
213 # however by that time we've got the effect that we wanted: the
214 # carry has been carried *over* the break point.
215
216 carry_bit = 0
217 al.append(self.carry_in[carry_bit])
218 bl.append(self.carry_in[carry_bit])
219 ea.append(expanded_a[expanded_index])
220 eb.append(expanded_b[expanded_index])
221 carry_bit += 1
222 expanded_index += 1
223
224 for i in range(self.width):
225 pi = i/self.pmul # double the range of the partition point test
226 if pi.is_integer() and pi in self.part_pts:
227 # add extra bit set to carry + carry for enabled
228 # partition points
229 a_bit = Signal(name="a_bit_%d" % i, reset_less=True)
230 carry_in = self.carry_in[carry_bit] # convenience
231 m.d.comb += a_bit.eq(self.part_pts[pi].implies(carry_in))
232
233 # and 1 + 0 for disabled partition points
234 ea.append(expanded_a[expanded_index])
235 al.append(a_bit) # add extra bit in a
236 eb.append(expanded_b[expanded_index])
237 bl.append(carry_in & self.part_pts[pi]) # carry bit
238 co.append(expanded_o[expanded_index])
239 cl.append(carry_tmp[carry_bit-1])
240 expanded_index += 1 # skip the extra point. NOT in the output
241 carry_bit += 1
242 ea.append(expanded_a[expanded_index])
243 eb.append(expanded_b[expanded_index])
244 eo.append(expanded_o[expanded_index])
245 al.append(self.a[i])
246 bl.append(self.b[i])
247 ol.append(self.output[i])
248 expanded_index += 1
249 al.append(0)
250 bl.append(0)
251 co.append(expanded_o[-1])
252 cl.append(carry_tmp[carry_bit-1])
253
254 # combine above using Cat
255 comb += Cat(*ea).eq(Cat(*al))
256 comb += Cat(*eb).eq(Cat(*bl))
257 comb += Cat(*ol).eq(Cat(*eo))
258 comb += Cat(*cl).eq(Cat(*co))
259
260 # use only one addition to take advantage of look-ahead carry and
261 # special hardware on FPGAs
262 comb += expanded_o.eq(expanded_a + expanded_b)
263
264 # ok now we have the carry-out, however because it's the MSB it's
265 # in the wrong position in the output as far as putting it into
266 # a chain of adds (or other operations). therefore we need to
267 # "ripple" carry-out down to the same position that carry-in is
268 # in [the LSB of each partition].
269 comb += ripple.results_in.eq(carry_tmp)
270 comb += ripple.gates.eq(self.part_pts.as_sig())
271 m.d.sync += self.carry_out.eq(ripple.output)
272
273 return m