From: Luke Kenneth Casson Leighton Date: Thu, 2 May 2019 12:42:57 +0000 (+0100) Subject: move add to ieee754 directory X-Git-Tag: ls180-24jan2020~1090 X-Git-Url: https://git.libre-soc.org/?p=ieee754fpu.git;a=commitdiff_plain;h=58e455d3bd9b43d076468bf2b7b1f0784e5c4fd2;hp=6bff1a997f3846872cf489c24b5c01426c4dc97c move add to ieee754 directory --- diff --git a/src/add/concurrentunit.py b/src/add/concurrentunit.py deleted file mode 100644 index c0053c8b..00000000 --- a/src/add/concurrentunit.py +++ /dev/null @@ -1,74 +0,0 @@ -# IEEE Floating Point Adder (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from math import log -from nmigen import Module -from nmigen.cli import main, verilog - -from singlepipe import PassThroughStage -from multipipe import CombMuxOutPipe -from multipipe import PriorityCombMuxInPipe - -from fpcommon.getop import FPADDBaseData -from fpcommon.denorm import FPSCData -from fpcommon.pack import FPPackData -from fpcommon.normtopack import FPNormToPack -from fpadd.specialcases import FPAddSpecialCasesDeNorm -from fpadd.addstages import FPAddAlignSingleAdd - - -def num_bits(n): - return int(log(n) / log(2)) - -class FPADDInMuxPipe(PriorityCombMuxInPipe): - def __init__(self, num_rows, iospecfn): - self.num_rows = num_rows - stage = PassThroughStage(iospecfn) - PriorityCombMuxInPipe.__init__(self, stage, p_len=self.num_rows) - - -class FPADDMuxOutPipe(CombMuxOutPipe): - def __init__(self, num_rows, iospecfn): - self.num_rows = num_rows - stage = PassThroughStage(iospecfn) - CombMuxOutPipe.__init__(self, stage, n_len=self.num_rows) - - -class ReservationStations: - """ Reservation-Station pipeline - - Input: num_rows - number of input and output Reservation Stations - - Requires: the addition of an "alu" object, an i_specfn and an o_specfn - - * fan-in on inputs (an array of FPADDBaseData: a,b,mid) - * ALU pipeline - * fan-out on outputs (an array of FPPackData: z,mid) - - Fan-in and Fan-out are combinatorial. 
- """ - def __init__(self, num_rows): - self.num_rows = num_rows - self.inpipe = FPADDInMuxPipe(num_rows, self.i_specfn) # fan-in - self.outpipe = FPADDMuxOutPipe(num_rows, self.o_specfn) # fan-out - - self.p = self.inpipe.p # kinda annoying, - self.n = self.outpipe.n # use pipe in/out as this class in/out - self._ports = self.inpipe.ports() + self.outpipe.ports() - - def elaborate(self, platform): - m = Module() - m.submodules.inpipe = self.inpipe - m.submodules.alu = self.alu - m.submodules.outpipe = self.outpipe - - m.d.comb += self.inpipe.n.connect_to_next(self.alu.p) - m.d.comb += self.alu.connect_to_next(self.outpipe) - - return m - - def ports(self): - return self._ports - - diff --git a/src/add/dual_add_experiment.py b/src/add/dual_add_experiment.py deleted file mode 100644 index 7ec479f5..00000000 --- a/src/add/dual_add_experiment.py +++ /dev/null @@ -1,72 +0,0 @@ -from nmigen import * -from nmigen.cli import main - -from nmigen_add_experiment import FPADD -from fpbase import FPOp - - -class Adder: - def __init__(self, width): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) - - def elaborate(self, platform): - m = Module() - m.d.comb += self.o.eq(self.a + self.b) - return m - - -class Subtractor: - def __init__(self, width): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) - - def elaborate(self, platform): - m = Module() - m.d.comb += self.o.eq(self.a - self.b) - return m - - -class ALU: - def __init__(self, width): - #self.op = Signal() - self.a = FPOp(width) - self.b = FPOp(width) - self.c = FPOp(width) - self.z = FPOp(width) - self.int_stb = Signal() - - self.add1 = FPADD(width) - self.add2 = FPADD(width) - - def elaborate(self, platform): - m = Module() - m.submodules.add1 = self.add1 - m.submodules.add2 = self.add2 - # join add1 a to a: add1.in_a = a - m.d.comb += self.add1.in_a.chain_from(self.a) - # join add1 b to b: add1.in_b = b - m.d.comb += self.add1.in_b.chain_from(self.b) - # join add2 a to 
c: add2.in_a = c - m.d.comb += self.add2.in_a.chain_from(self.c) - # join add2 b to add1 z: add2.in_b = add1.out_z - m.d.comb += self.add2.in_b.chain_inv(self.add1.out_z) - # join output from add2 to z: z = add2.out_z - m.d.comb += self.z.chain_from(self.add2.out_z) - # get at add1's stb signal - m.d.comb += self.int_stb.eq(self.add1.out_z.stb) - #with m.If(self.op): - # m.d.comb += self.o.eq(self.sub.o) - #with m.Else(): - # m.d.comb += self.o.eq(self.add.o) - return m - - -if __name__ == "__main__": - alu = ALU(width=16) - main(alu, ports=alu.a.ports() + \ - alu.b.ports() + \ - alu.c.ports() + \ - alu.z.ports()) diff --git a/src/add/example_buf_pipe.py b/src/add/example_buf_pipe.py deleted file mode 100644 index 4bb7cdf1..00000000 --- a/src/add/example_buf_pipe.py +++ /dev/null @@ -1,103 +0,0 @@ -""" Pipeline and BufferedHandshake examples -""" - -from nmoperator import eq -from iocontrol import (PrevControl, NextControl) -from singlepipe import (PrevControl, NextControl, ControlBase, - StageCls, Stage, StageChain, - BufferedHandshake, UnbufferedPipeline) - -from nmigen import Signal, Module -from nmigen.cli import verilog, rtlil - - -class ExampleAddStage(StageCls): - """ an example of how to use the buffered pipeline, as a class instance - """ - - def ispec(self): - """ returns a tuple of input signals which will be the incoming data - """ - return (Signal(16), Signal(16)) - - def ospec(self): - """ returns an output signal which will happen to contain the sum - of the two inputs - """ - return Signal(16) - - def process(self, i): - """ process the input data (sums the values in the tuple) and returns it - """ - return i[0] + i[1] - - -class ExampleBufPipeAdd(BufferedHandshake): - """ an example of how to use the buffered pipeline, using a class instance - """ - - def __init__(self): - addstage = ExampleAddStage() - BufferedHandshake.__init__(self, addstage) - - -class ExampleStage(Stage): - """ an example of how to use the buffered pipeline, in a static class 
- fashion - """ - - def ispec(): - return Signal(16, name="example_input_signal") - - def ospec(): - return Signal(16, name="example_output_signal") - - def process(i): - """ process the input data and returns it (adds 1) - """ - return i + 1 - - -class ExampleStageCls(StageCls): - """ an example of how to use the buffered pipeline, in a static class - fashion - """ - - def ispec(self): - return Signal(16, name="example_input_signal") - - def ospec(self): - return Signal(16, name="example_output_signal") - - def process(self, i): - """ process the input data and returns it (adds 1) - """ - return i + 1 - - -class ExampleBufPipe(BufferedHandshake): - """ an example of how to use the buffered pipeline. - """ - - def __init__(self): - BufferedHandshake.__init__(self, ExampleStage) - - -class ExamplePipeline(UnbufferedPipeline): - """ an example of how to use the unbuffered pipeline. - """ - - def __init__(self): - UnbufferedPipeline.__init__(self, ExampleStage) - - -if __name__ == '__main__': - dut = ExampleBufPipe() - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_bufpipe.il", "w") as f: - f.write(vl) - - dut = ExamplePipeline() - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_combpipe.il", "w") as f: - f.write(vl) diff --git a/src/add/fadd_state.py b/src/add/fadd_state.py deleted file mode 100644 index 7ad88786..00000000 --- a/src/add/fadd_state.py +++ /dev/null @@ -1,282 +0,0 @@ -# IEEE Floating Point Adder (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from nmigen import Module, Signal, Cat -from nmigen.cli import main, verilog - -from fpbase import FPNumIn, FPNumOut, FPOp, Overflow, FPBase - -from singlepipe import eq - - -class FPADD(FPBase): - - def __init__(self, width, single_cycle=False): - FPBase.__init__(self) - self.width = width - self.single_cycle = single_cycle - - self.in_a = FPOp(width) - self.in_b = FPOp(width) - self.out_z = FPOp(width) - - def elaborate(self, platform=None): - """ creates 
the HDL code-fragment for FPAdd - """ - m = Module() - - # Latches - a = FPNumIn(self.in_a, self.width) - b = FPNumIn(self.in_b, self.width) - z = FPNumOut(self.width, False) - - m.submodules.fpnum_a = a - m.submodules.fpnum_b = b - m.submodules.fpnum_z = z - - m.d.comb += a.v.eq(self.in_a.v) - m.d.comb += b.v.eq(self.in_b.v) - - w = z.m_width + 4 - tot = Signal(w, reset_less=True) # sticky/round/guard, {mantissa} result, 1 overflow - - of = Overflow() - - m.submodules.overflow = of - - with m.FSM() as fsm: - - # ****** - # gets operand a - - with m.State("get_a"): - res = self.get_op(m, self.in_a, a, "get_b") - m.d.sync += eq([a, self.in_a.ack], res) - - # ****** - # gets operand b - - with m.State("get_b"): - res = self.get_op(m, self.in_b, b, "special_cases") - m.d.sync += eq([b, self.in_b.ack], res) - - # ****** - # special cases: NaNs, infs, zeros, denormalised - # NOTE: some of these are unique to add. see "Special Operations" - # https://steve.hollasch.net/cgindex/coding/ieeefloat.html - - with m.State("special_cases"): - - s_nomatch = Signal() - m.d.comb += s_nomatch.eq(a.s != b.s) - - m_match = Signal() - m.d.comb += m_match.eq(a.m == b.m) - - # if a is NaN or b is NaN return NaN - with m.If(a.is_nan | b.is_nan): - m.next = "put_z" - m.d.sync += z.nan(1) - - # XXX WEIRDNESS for FP16 non-canonical NaN handling - # under review - - ## if a is zero and b is NaN return -b - #with m.If(a.is_zero & (a.s==0) & b.is_nan): - # m.next = "put_z" - # m.d.sync += z.create(b.s, b.e, Cat(b.m[3:-2], ~b.m[0])) - - ## if b is zero and a is NaN return -a - #with m.Elif(b.is_zero & (b.s==0) & a.is_nan): - # m.next = "put_z" - # m.d.sync += z.create(a.s, a.e, Cat(a.m[3:-2], ~a.m[0])) - - ## if a is -zero and b is NaN return -b - #with m.Elif(a.is_zero & (a.s==1) & b.is_nan): - # m.next = "put_z" - # m.d.sync += z.create(a.s & b.s, b.e, Cat(b.m[3:-2], 1)) - - ## if b is -zero and a is NaN return -a - #with m.Elif(b.is_zero & (b.s==1) & a.is_nan): - # m.next = "put_z" - # 
m.d.sync += z.create(a.s & b.s, a.e, Cat(a.m[3:-2], 1)) - - # if a is inf return inf (or NaN) - with m.Elif(a.is_inf): - m.next = "put_z" - m.d.sync += z.inf(a.s) - # if a is inf and signs don't match return NaN - with m.If(b.exp_128 & s_nomatch): - m.d.sync += z.nan(1) - - # if b is inf return inf - with m.Elif(b.is_inf): - m.next = "put_z" - m.d.sync += z.inf(b.s) - - # if a is zero and b zero return signed-a/b - with m.Elif(a.is_zero & b.is_zero): - m.next = "put_z" - m.d.sync += z.create(a.s & b.s, b.e, b.m[3:-1]) - - # if a is zero return b - with m.Elif(a.is_zero): - m.next = "put_z" - m.d.sync += z.create(b.s, b.e, b.m[3:-1]) - - # if b is zero return a - with m.Elif(b.is_zero): - m.next = "put_z" - m.d.sync += z.create(a.s, a.e, a.m[3:-1]) - - # if a equal to -b return zero (+ve zero) - with m.Elif(s_nomatch & m_match & (a.e == b.e)): - m.next = "put_z" - m.d.sync += z.zero(0) - - # Denormalised Number checks - with m.Else(): - m.next = "align" - self.denormalise(m, a) - self.denormalise(m, b) - - # ****** - # align. - - with m.State("align"): - if not self.single_cycle: - # NOTE: this does *not* do single-cycle multi-shifting, - # it *STAYS* in the align state until exponents match - - # exponent of a greater than b: shift b down - with m.If(a.e > b.e): - m.d.sync += b.shift_down() - # exponent of b greater than a: shift a down - with m.Elif(a.e < b.e): - m.d.sync += a.shift_down() - # exponents equal: move to next stage. - with m.Else(): - m.next = "add_0" - else: - # This one however (single-cycle) will do the shift - # in one go. 
- - # XXX TODO: the shifter used here is quite expensive - # having only one would be better - - ediff = Signal((len(a.e), True), reset_less=True) - ediffr = Signal((len(a.e), True), reset_less=True) - m.d.comb += ediff.eq(a.e - b.e) - m.d.comb += ediffr.eq(b.e - a.e) - with m.If(ediff > 0): - m.d.sync += b.shift_down_multi(ediff) - # exponent of b greater than a: shift a down - with m.Elif(ediff < 0): - m.d.sync += a.shift_down_multi(ediffr) - - m.next = "add_0" - - # ****** - # First stage of add. covers same-sign (add) and subtract - # special-casing when mantissas are greater or equal, to - # give greatest accuracy. - - with m.State("add_0"): - m.next = "add_1" - m.d.sync += z.e.eq(a.e) - # same-sign (both negative or both positive) add mantissas - with m.If(a.s == b.s): - m.d.sync += [ - tot.eq(Cat(a.m, 0) + Cat(b.m, 0)), - z.s.eq(a.s) - ] - # a mantissa greater than b, use a - with m.Elif(a.m >= b.m): - m.d.sync += [ - tot.eq(Cat(a.m, 0) - Cat(b.m, 0)), - z.s.eq(a.s) - ] - # b mantissa greater than a, use b - with m.Else(): - m.d.sync += [ - tot.eq(Cat(b.m, 0) - Cat(a.m, 0)), - z.s.eq(b.s) - ] - - # ****** - # Second stage of add: preparation for normalisation. - # detects when tot sum is too big (tot[27] is kinda a carry bit) - - with m.State("add_1"): - m.next = "normalise_1" - # tot[27] gets set when the sum overflows. shift result down - with m.If(tot[-1]): - m.d.sync += [ - z.m.eq(tot[4:]), - of.m0.eq(tot[4]), - of.guard.eq(tot[3]), - of.round_bit.eq(tot[2]), - of.sticky.eq(tot[1] | tot[0]), - z.e.eq(z.e + 1) - ] - # tot[27] zero case - with m.Else(): - m.d.sync += [ - z.m.eq(tot[3:]), - of.m0.eq(tot[3]), - of.guard.eq(tot[2]), - of.round_bit.eq(tot[1]), - of.sticky.eq(tot[0]) - ] - - # ****** - # First stage of normalisation. - - with m.State("normalise_1"): - self.normalise_1(m, z, of, "normalise_2") - - # ****** - # Second stage of normalisation. 
- - with m.State("normalise_2"): - self.normalise_2(m, z, of, "round") - - # ****** - # rounding stage - - with m.State("round"): - self.roundz(m, z, of.roundz) - m.next = "corrections" - - # ****** - # correction stage - - with m.State("corrections"): - self.corrections(m, z, "pack") - - # ****** - # pack stage - - with m.State("pack"): - self.pack(m, z, "put_z") - - # ****** - # put_z stage - - with m.State("put_z"): - self.put_z(m, z, self.out_z, "get_a") - - return m - - -if __name__ == "__main__": - alu = FPADD(width=32) - main(alu, ports=alu.in_a.ports() + alu.in_b.ports() + alu.out_z.ports()) - - - # works... but don't use, just do "python fname.py convert -t v" - #print (verilog.convert(alu, ports=[ - # ports=alu.in_a.ports() + \ - # alu.in_b.ports() + \ - # alu.out_z.ports()) diff --git a/src/add/fmul.py b/src/add/fmul.py deleted file mode 100644 index a2ba41e7..00000000 --- a/src/add/fmul.py +++ /dev/null @@ -1,172 +0,0 @@ -from nmigen import Module, Signal, Cat, Mux, Array, Const -from nmigen.cli import main, verilog - -from fpbase import FPNumIn, FPNumOut, FPOp, Overflow, FPBase, FPState -from fpcommon.getop import FPGetOp -from singlepipe import eq - - -class FPMUL(FPBase): - - def __init__(self, width): - FPBase.__init__(self) - self.width = width - - self.in_a = FPOp(width) - self.in_b = FPOp(width) - self.out_z = FPOp(width) - - self.states = [] - - def add_state(self, state): - self.states.append(state) - return state - - def elaborate(self, platform=None): - """ creates the HDL code-fragment for FPMUL - """ - m = Module() - - # Latches - a = FPNumIn(None, self.width, False) - b = FPNumIn(None, self.width, False) - z = FPNumOut(self.width, False) - - mw = (z.m_width)*2 - 1 + 3 # sticky/round/guard bits + (2*mant) - 1 - product = Signal(mw) - - of = Overflow() - m.submodules.of = of - m.submodules.a = a - m.submodules.b = b - m.submodules.z = z - - m.d.comb += a.v.eq(self.in_a.v) - m.d.comb += b.v.eq(self.in_b.v) - - with m.FSM() as fsm: - - # 
****** - # gets operand a - - with m.State("get_a"): - res = self.get_op(m, self.in_a, a, "get_b") - m.d.sync += eq([a, self.in_a.ack], res) - - # ****** - # gets operand b - - with m.State("get_b"): - res = self.get_op(m, self.in_b, b, "special_cases") - m.d.sync += eq([b, self.in_b.ack], res) - - # ****** - # special cases - - with m.State("special_cases"): - #if a or b is NaN return NaN - with m.If(a.is_nan | b.is_nan): - m.next = "put_z" - m.d.sync += z.nan(1) - #if a is inf return inf - with m.Elif(a.is_inf): - m.next = "put_z" - m.d.sync += z.inf(a.s ^ b.s) - #if b is zero return NaN - with m.If(b.is_zero): - m.d.sync += z.nan(1) - #if b is inf return inf - with m.Elif(b.is_inf): - m.next = "put_z" - m.d.sync += z.inf(a.s ^ b.s) - #if a is zero return NaN - with m.If(a.is_zero): - m.next = "put_z" - m.d.sync += z.nan(1) - #if a is zero return zero - with m.Elif(a.is_zero): - m.next = "put_z" - m.d.sync += z.zero(a.s ^ b.s) - #if b is zero return zero - with m.Elif(b.is_zero): - m.next = "put_z" - m.d.sync += z.zero(a.s ^ b.s) - # Denormalised Number checks - with m.Else(): - m.next = "normalise_a" - self.denormalise(m, a) - self.denormalise(m, b) - - # ****** - # normalise_a - - with m.State("normalise_a"): - self.op_normalise(m, a, "normalise_b") - - # ****** - # normalise_b - - with m.State("normalise_b"): - self.op_normalise(m, b, "multiply_0") - - #multiply_0 - with m.State("multiply_0"): - m.next = "multiply_1" - m.d.sync += [ - z.s.eq(a.s ^ b.s), - z.e.eq(a.e + b.e + 1), - product.eq(a.m * b.m * 4) - ] - - #multiply_1 - with m.State("multiply_1"): - mw = z.m_width - m.next = "normalise_1" - m.d.sync += [ - z.m.eq(product[mw+2:]), - of.guard.eq(product[mw+1]), - of.round_bit.eq(product[mw]), - of.sticky.eq(product[0:mw] != 0) - ] - - # ****** - # First stage of normalisation. - with m.State("normalise_1"): - self.normalise_1(m, z, of, "normalise_2") - - # ****** - # Second stage of normalisation. 
- - with m.State("normalise_2"): - self.normalise_2(m, z, of, "round") - - # ****** - # rounding stage - - with m.State("round"): - self.roundz(m, z, of.roundz) - m.next = "corrections" - - # ****** - # correction stage - - with m.State("corrections"): - self.corrections(m, z, "pack") - - # ****** - # pack stage - with m.State("pack"): - self.pack(m, z, "put_z") - - # ****** - # put_z stage - - with m.State("put_z"): - self.put_z(m, z, self.out_z, "get_a") - - return m - - -if __name__ == "__main__": - alu = FPMUL(width=32) - main(alu, ports=alu.in_a.ports() + alu.in_b.ports() + alu.out_z.ports()) diff --git a/src/add/fpadd/__init__.py b/src/add/fpadd/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/add/fpadd/add0.py b/src/add/fpadd/add0.py deleted file mode 100644 index 76790fe2..00000000 --- a/src/add/fpadd/add0.py +++ /dev/null @@ -1,113 +0,0 @@ -# IEEE Floating Point Adder (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from nmigen import Module, Signal, Cat, Elaboratable -from nmigen.cli import main, verilog - -from fpbase import FPNumBase -from fpbase import FPState -from fpcommon.denorm import FPSCData - - -class FPAddStage0Data: - - def __init__(self, width, id_wid): - self.z = FPNumBase(width, False) - self.out_do_z = Signal(reset_less=True) - self.oz = Signal(width, reset_less=True) - self.tot = Signal(self.z.m_width + 4, reset_less=True) - self.mid = Signal(id_wid, reset_less=True) - - def eq(self, i): - return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz), - self.tot.eq(i.tot), self.mid.eq(i.mid)] - - -class FPAddStage0Mod(Elaboratable): - - def __init__(self, width, id_wid): - self.width = width - self.id_wid = id_wid - self.i = self.ispec() - self.o = self.ospec() - - def ispec(self): - return FPSCData(self.width, self.id_wid) - - def ospec(self): - return FPAddStage0Data(self.width, self.id_wid) - - def process(self, i): - return self.o - - def setup(self, m, i): - """ links 
module to inputs and outputs - """ - m.submodules.add0 = self - m.d.comb += self.i.eq(i) - - def elaborate(self, platform): - m = Module() - m.submodules.add0_in_a = self.i.a - m.submodules.add0_in_b = self.i.b - m.submodules.add0_out_z = self.o.z - - # store intermediate tests (and zero-extended mantissas) - seq = Signal(reset_less=True) - mge = Signal(reset_less=True) - am0 = Signal(len(self.i.a.m)+1, reset_less=True) - bm0 = Signal(len(self.i.b.m)+1, reset_less=True) - m.d.comb += [seq.eq(self.i.a.s == self.i.b.s), - mge.eq(self.i.a.m >= self.i.b.m), - am0.eq(Cat(self.i.a.m, 0)), - bm0.eq(Cat(self.i.b.m, 0)) - ] - # same-sign (both negative or both positive) add mantissas - with m.If(~self.i.out_do_z): - m.d.comb += self.o.z.e.eq(self.i.a.e) - with m.If(seq): - m.d.comb += [ - self.o.tot.eq(am0 + bm0), - self.o.z.s.eq(self.i.a.s) - ] - # a mantissa greater than b, use a - with m.Elif(mge): - m.d.comb += [ - self.o.tot.eq(am0 - bm0), - self.o.z.s.eq(self.i.a.s) - ] - # b mantissa greater than a, use b - with m.Else(): - m.d.comb += [ - self.o.tot.eq(bm0 - am0), - self.o.z.s.eq(self.i.b.s) - ] - - m.d.comb += self.o.oz.eq(self.i.oz) - m.d.comb += self.o.out_do_z.eq(self.i.out_do_z) - m.d.comb += self.o.mid.eq(self.i.mid) - return m - - -class FPAddStage0(FPState): - """ First stage of add. covers same-sign (add) and subtract - special-casing when mantissas are greater or equal, to - give greatest accuracy. 
- """ - - def __init__(self, width, id_wid): - FPState.__init__(self, "add_0") - self.mod = FPAddStage0Mod(width) - self.o = self.mod.ospec() - - def setup(self, m, i): - """ links module to inputs and outputs - """ - self.mod.setup(m, i) - - # NOTE: these could be done as combinatorial (merge add0+add1) - m.d.sync += self.o.eq(self.mod.o) - - def action(self, m): - m.next = "add_1" diff --git a/src/add/fpadd/add1.py b/src/add/fpadd/add1.py deleted file mode 100644 index 679f5176..00000000 --- a/src/add/fpadd/add1.py +++ /dev/null @@ -1,95 +0,0 @@ -# IEEE Floating Point Adder (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from nmigen import Module, Signal, Elaboratable -from nmigen.cli import main, verilog -from math import log - -from fpbase import FPState -from fpcommon.postcalc import FPAddStage1Data -from fpadd.add0 import FPAddStage0Data - - -class FPAddStage1Mod(FPState, Elaboratable): - """ Second stage of add: preparation for normalisation. - detects when tot sum is too big (tot[27] is kinda a carry bit) - """ - - def __init__(self, width, id_wid): - self.width = width - self.id_wid = id_wid - self.i = self.ispec() - self.o = self.ospec() - - def ispec(self): - return FPAddStage0Data(self.width, self.id_wid) - - def ospec(self): - return FPAddStage1Data(self.width, self.id_wid) - - def process(self, i): - return self.o - - def setup(self, m, i): - """ links module to inputs and outputs - """ - m.submodules.add1 = self - m.submodules.add1_out_overflow = self.o.of - - m.d.comb += self.i.eq(i) - - def elaborate(self, platform): - m = Module() - m.d.comb += self.o.z.eq(self.i.z) - # tot[-1] (MSB) gets set when the sum overflows. 
shift result down - with m.If(~self.i.out_do_z): - with m.If(self.i.tot[-1]): - m.d.comb += [ - self.o.z.m.eq(self.i.tot[4:]), - self.o.of.m0.eq(self.i.tot[4]), - self.o.of.guard.eq(self.i.tot[3]), - self.o.of.round_bit.eq(self.i.tot[2]), - self.o.of.sticky.eq(self.i.tot[1] | self.i.tot[0]), - self.o.z.e.eq(self.i.z.e + 1) - ] - # tot[-1] (MSB) zero case - with m.Else(): - m.d.comb += [ - self.o.z.m.eq(self.i.tot[3:]), - self.o.of.m0.eq(self.i.tot[3]), - self.o.of.guard.eq(self.i.tot[2]), - self.o.of.round_bit.eq(self.i.tot[1]), - self.o.of.sticky.eq(self.i.tot[0]) - ] - - m.d.comb += self.o.out_do_z.eq(self.i.out_do_z) - m.d.comb += self.o.oz.eq(self.i.oz) - m.d.comb += self.o.mid.eq(self.i.mid) - - return m - - -class FPAddStage1(FPState): - - def __init__(self, width, id_wid): - FPState.__init__(self, "add_1") - self.mod = FPAddStage1Mod(width) - self.out_z = FPNumBase(width, False) - self.out_of = Overflow() - self.norm_stb = Signal() - - def setup(self, m, i): - """ links module to inputs and outputs - """ - self.mod.setup(m, i) - - m.d.sync += self.norm_stb.eq(0) # sets to zero when not in add1 state - - m.d.sync += self.out_of.eq(self.mod.out_of) - m.d.sync += self.out_z.eq(self.mod.out_z) - m.d.sync += self.norm_stb.eq(1) - - def action(self, m): - m.next = "normalise_1" - diff --git a/src/add/fpadd/addstages.py b/src/add/fpadd/addstages.py deleted file mode 100644 index f5703aec..00000000 --- a/src/add/fpadd/addstages.py +++ /dev/null @@ -1,55 +0,0 @@ -# IEEE Floating Point Adder (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from nmigen import Module -from nmigen.cli import main, verilog - -from singlepipe import (StageChain, SimpleHandshake, - PassThroughStage) - -from fpbase import FPState -from fpcommon.denorm import FPSCData -from fpcommon.postcalc import FPAddStage1Data -from fpadd.align import FPAddAlignSingleMod -from fpadd.add0 import FPAddStage0Mod -from fpadd.add1 import FPAddStage1Mod - - -class 
FPAddAlignSingleAdd(FPState, SimpleHandshake): - - def __init__(self, width, id_wid): - FPState.__init__(self, "align") - self.width = width - self.id_wid = id_wid - SimpleHandshake.__init__(self, self) # pipeline is its own stage - self.a1o = self.ospec() - - def ispec(self): - return FPSCData(self.width, self.id_wid) - - def ospec(self): - return FPAddStage1Data(self.width, self.id_wid) # AddStage1 ospec - - def setup(self, m, i): - """ links module to inputs and outputs - """ - - # chain AddAlignSingle, AddStage0 and AddStage1 - mod = FPAddAlignSingleMod(self.width, self.id_wid) - a0mod = FPAddStage0Mod(self.width, self.id_wid) - a1mod = FPAddStage1Mod(self.width, self.id_wid) - - chain = StageChain([mod, a0mod, a1mod]) - chain.setup(m, i) - - self.o = a1mod.o - - def process(self, i): - return self.o - - def action(self, m): - m.d.sync += self.a1o.eq(self.process(None)) - m.next = "normalise_1" - - diff --git a/src/add/fpadd/align.py b/src/add/fpadd/align.py deleted file mode 100644 index 9837a0b8..00000000 --- a/src/add/fpadd/align.py +++ /dev/null @@ -1,211 +0,0 @@ -# IEEE Floating Point Adder (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from nmigen import Module, Signal -from nmigen.cli import main, verilog - -from fpbase import FPNumOut, FPNumIn, FPNumBase -from fpbase import MultiShiftRMerge -from fpbase import FPState -from fpcommon.denorm import FPSCData - - -class FPNumIn2Ops: - - def __init__(self, width, id_wid): - self.a = FPNumIn(None, width) - self.b = FPNumIn(None, width) - self.z = FPNumOut(width, False) - self.out_do_z = Signal(reset_less=True) - self.oz = Signal(width, reset_less=True) - self.mid = Signal(id_wid, reset_less=True) - - def eq(self, i): - return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz), - self.a.eq(i.a), self.b.eq(i.b), self.mid.eq(i.mid)] - - - -class FPAddAlignMultiMod(FPState): - - def __init__(self, width): - self.in_a = FPNumBase(width) - self.in_b = FPNumBase(width) - 
self.out_a = FPNumIn(None, width) - self.out_b = FPNumIn(None, width) - self.exp_eq = Signal(reset_less=True) - - def elaborate(self, platform): - # This one however (single-cycle) will do the shift - # in one go. - - m = Module() - - m.submodules.align_in_a = self.in_a - m.submodules.align_in_b = self.in_b - m.submodules.align_out_a = self.out_a - m.submodules.align_out_b = self.out_b - - # NOTE: this does *not* do single-cycle multi-shifting, - # it *STAYS* in the align state until exponents match - - # exponent of a greater than b: shift b down - m.d.comb += self.exp_eq.eq(0) - m.d.comb += self.out_a.eq(self.in_a) - m.d.comb += self.out_b.eq(self.in_b) - agtb = Signal(reset_less=True) - altb = Signal(reset_less=True) - m.d.comb += agtb.eq(self.in_a.e > self.in_b.e) - m.d.comb += altb.eq(self.in_a.e < self.in_b.e) - with m.If(agtb): - m.d.comb += self.out_b.shift_down(self.in_b) - # exponent of b greater than a: shift a down - with m.Elif(altb): - m.d.comb += self.out_a.shift_down(self.in_a) - # exponents equal: move to next stage. 
- with m.Else(): - m.d.comb += self.exp_eq.eq(1) - return m - - -class FPAddAlignMulti(FPState): - - def __init__(self, width, id_wid): - FPState.__init__(self, "align") - self.mod = FPAddAlignMultiMod(width) - self.out_a = FPNumIn(None, width) - self.out_b = FPNumIn(None, width) - self.exp_eq = Signal(reset_less=True) - - def setup(self, m, in_a, in_b): - """ links module to inputs and outputs - """ - m.submodules.align = self.mod - m.d.comb += self.mod.in_a.eq(in_a) - m.d.comb += self.mod.in_b.eq(in_b) - m.d.comb += self.exp_eq.eq(self.mod.exp_eq) - m.d.sync += self.out_a.eq(self.mod.out_a) - m.d.sync += self.out_b.eq(self.mod.out_b) - - def action(self, m): - with m.If(self.exp_eq): - m.next = "add_0" - - -class FPAddAlignSingleMod: - - def __init__(self, width, id_wid): - self.width = width - self.id_wid = id_wid - self.i = self.ispec() - self.o = self.ospec() - - def ispec(self): - return FPSCData(self.width, self.id_wid) - - def ospec(self): - return FPNumIn2Ops(self.width, self.id_wid) - - def process(self, i): - return self.o - - def setup(self, m, i): - """ links module to inputs and outputs - """ - m.submodules.align = self - m.d.comb += self.i.eq(i) - - def elaborate(self, platform): - """ Aligns A against B or B against A, depending on which has the - greater exponent. This is done in a *single* cycle using - variable-width bit-shift - - the shifter used here is quite expensive in terms of gates. 
- Mux A or B in (and out) into temporaries, as only one of them - needs to be aligned against the other - """ - m = Module() - - m.submodules.align_in_a = self.i.a - m.submodules.align_in_b = self.i.b - m.submodules.align_out_a = self.o.a - m.submodules.align_out_b = self.o.b - - # temporary (muxed) input and output to be shifted - t_inp = FPNumBase(self.width) - t_out = FPNumIn(None, self.width) - espec = (len(self.i.a.e), True) - msr = MultiShiftRMerge(self.i.a.m_width, espec) - m.submodules.align_t_in = t_inp - m.submodules.align_t_out = t_out - m.submodules.multishift_r = msr - - ediff = Signal(espec, reset_less=True) - ediffr = Signal(espec, reset_less=True) - tdiff = Signal(espec, reset_less=True) - elz = Signal(reset_less=True) - egz = Signal(reset_less=True) - - # connect multi-shifter to t_inp/out mantissa (and tdiff) - m.d.comb += msr.inp.eq(t_inp.m) - m.d.comb += msr.diff.eq(tdiff) - m.d.comb += t_out.m.eq(msr.m) - m.d.comb += t_out.e.eq(t_inp.e + tdiff) - m.d.comb += t_out.s.eq(t_inp.s) - - m.d.comb += ediff.eq(self.i.a.e - self.i.b.e) - m.d.comb += ediffr.eq(self.i.b.e - self.i.a.e) - m.d.comb += elz.eq(self.i.a.e < self.i.b.e) - m.d.comb += egz.eq(self.i.a.e > self.i.b.e) - - # default: A-exp == B-exp, A and B untouched (fall through) - m.d.comb += self.o.a.eq(self.i.a) - m.d.comb += self.o.b.eq(self.i.b) - # only one shifter (muxed) - #m.d.comb += t_out.shift_down_multi(tdiff, t_inp) - # exponent of a greater than b: shift b down - with m.If(~self.i.out_do_z): - with m.If(egz): - m.d.comb += [t_inp.eq(self.i.b), - tdiff.eq(ediff), - self.o.b.eq(t_out), - self.o.b.s.eq(self.i.b.s), # whoops forgot sign - ] - # exponent of b greater than a: shift a down - with m.Elif(elz): - m.d.comb += [t_inp.eq(self.i.a), - tdiff.eq(ediffr), - self.o.a.eq(t_out), - self.o.a.s.eq(self.i.a.s), # whoops forgot sign - ] - - m.d.comb += self.o.mid.eq(self.i.mid) - m.d.comb += self.o.z.eq(self.i.z) - m.d.comb += self.o.out_do_z.eq(self.i.out_do_z) - m.d.comb += 
self.o.oz.eq(self.i.oz) - - return m - - -class FPAddAlignSingle(FPState): - - def __init__(self, width, id_wid): - FPState.__init__(self, "align") - self.mod = FPAddAlignSingleMod(width, id_wid) - self.out_a = FPNumIn(None, width) - self.out_b = FPNumIn(None, width) - - def setup(self, m, i): - """ links module to inputs and outputs - """ - self.mod.setup(m, i) - - # NOTE: could be done as comb - m.d.sync += self.out_a.eq(self.mod.out_a) - m.d.sync += self.out_b.eq(self.mod.out_b) - - def action(self, m): - m.next = "add_0" - - diff --git a/src/add/fpadd/pipeline.py b/src/add/fpadd/pipeline.py deleted file mode 100644 index e244ee60..00000000 --- a/src/add/fpadd/pipeline.py +++ /dev/null @@ -1,59 +0,0 @@ -# IEEE Floating Point Adder (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from nmigen import Module -from nmigen.cli import main, verilog - -from singlepipe import (ControlBase, SimpleHandshake, PassThroughStage) -from multipipe import CombMuxOutPipe -from multipipe import PriorityCombMuxInPipe - -from fpcommon.getop import FPADDBaseData -from fpcommon.denorm import FPSCData -from fpcommon.pack import FPPackData -from fpcommon.normtopack import FPNormToPack -from fpadd.specialcases import FPAddSpecialCasesDeNorm -from fpadd.addstages import FPAddAlignSingleAdd - -from concurrentunit import ReservationStations, num_bits - - -class FPADDBasePipe(ControlBase): - def __init__(self, width, id_wid): - ControlBase.__init__(self) - self.pipe1 = FPAddSpecialCasesDeNorm(width, id_wid) - self.pipe2 = FPAddAlignSingleAdd(width, id_wid) - self.pipe3 = FPNormToPack(width, id_wid) - - self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3]) - - def elaborate(self, platform): - m = ControlBase.elaborate(self, platform) - m.submodules.scnorm = self.pipe1 - m.submodules.addalign = self.pipe2 - m.submodules.normpack = self.pipe3 - m.d.comb += self._eqs - return m - - -class FPADDMuxInOut(ReservationStations): - """ Reservation-Station version of 
FPADD pipeline. - - * fan-in on inputs (an array of FPADDBaseData: a,b,mid) - * 3-stage adder pipeline - * fan-out on outputs (an array of FPPackData: z,mid) - - Fan-in and Fan-out are combinatorial. - """ - def __init__(self, width, num_rows): - self.width = width - self.id_wid = num_bits(width) - self.alu = FPADDBasePipe(width, self.id_wid) - ReservationStations.__init__(self, num_rows) - - def i_specfn(self): - return FPADDBaseData(self.width, self.id_wid) - - def o_specfn(self): - return FPPackData(self.width, self.id_wid) diff --git a/src/add/fpadd/specialcases.py b/src/add/fpadd/specialcases.py deleted file mode 100644 index 6f9d1a08..00000000 --- a/src/add/fpadd/specialcases.py +++ /dev/null @@ -1,223 +0,0 @@ -# IEEE Floating Point Adder (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from nmigen import Module, Signal, Cat, Const -from nmigen.cli import main, verilog -from math import log - -from fpbase import FPNumDecode -from singlepipe import SimpleHandshake, StageChain - -from fpbase import FPState, FPID -from fpcommon.getop import FPADDBaseData -from fpcommon.denorm import (FPSCData, FPAddDeNormMod) - - -class FPAddSpecialCasesMod: - """ special cases: NaNs, infs, zeros, denormalised - NOTE: some of these are unique to add. 
see "Special Operations" - https://steve.hollasch.net/cgindex/coding/ieeefloat.html - """ - - def __init__(self, width, id_wid): - self.width = width - self.id_wid = id_wid - self.i = self.ispec() - self.o = self.ospec() - - def ispec(self): - return FPADDBaseData(self.width, self.id_wid) - - def ospec(self): - return FPSCData(self.width, self.id_wid) - - def setup(self, m, i): - """ links module to inputs and outputs - """ - m.submodules.specialcases = self - m.d.comb += self.i.eq(i) - - def process(self, i): - return self.o - - def elaborate(self, platform): - m = Module() - - m.submodules.sc_out_z = self.o.z - - # decode: XXX really should move to separate stage - a1 = FPNumDecode(None, self.width) - b1 = FPNumDecode(None, self.width) - m.submodules.sc_decode_a = a1 - m.submodules.sc_decode_b = b1 - m.d.comb += [a1.v.eq(self.i.a), - b1.v.eq(self.i.b), - self.o.a.eq(a1), - self.o.b.eq(b1) - ] - - s_nomatch = Signal(reset_less=True) - m.d.comb += s_nomatch.eq(a1.s != b1.s) - - m_match = Signal(reset_less=True) - m.d.comb += m_match.eq(a1.m == b1.m) - - e_match = Signal(reset_less=True) - m.d.comb += e_match.eq(a1.e == b1.e) - - aeqmb = Signal(reset_less=True) - m.d.comb += aeqmb.eq(s_nomatch & m_match & e_match) - - abz = Signal(reset_less=True) - m.d.comb += abz.eq(a1.is_zero & b1.is_zero) - - abnan = Signal(reset_less=True) - m.d.comb += abnan.eq(a1.is_nan | b1.is_nan) - - bexp128s = Signal(reset_less=True) - m.d.comb += bexp128s.eq(b1.exp_128 & s_nomatch) - - # if a is NaN or b is NaN return NaN - with m.If(abnan): - m.d.comb += self.o.out_do_z.eq(1) - m.d.comb += self.o.z.nan(0) - - # XXX WEIRDNESS for FP16 non-canonical NaN handling - # under review - - ## if a is zero and b is NaN return -b - #with m.If(a.is_zero & (a.s==0) & b.is_nan): - # m.d.comb += self.o.out_do_z.eq(1) - # m.d.comb += z.create(b.s, b.e, Cat(b.m[3:-2], ~b.m[0])) - - ## if b is zero and a is NaN return -a - #with m.Elif(b.is_zero & (b.s==0) & a.is_nan): - # m.d.comb += 
self.o.out_do_z.eq(1) - # m.d.comb += z.create(a.s, a.e, Cat(a.m[3:-2], ~a.m[0])) - - ## if a is -zero and b is NaN return -b - #with m.Elif(a.is_zero & (a.s==1) & b.is_nan): - # m.d.comb += self.o.out_do_z.eq(1) - # m.d.comb += z.create(a.s & b.s, b.e, Cat(b.m[3:-2], 1)) - - ## if b is -zero and a is NaN return -a - #with m.Elif(b.is_zero & (b.s==1) & a.is_nan): - # m.d.comb += self.o.out_do_z.eq(1) - # m.d.comb += z.create(a.s & b.s, a.e, Cat(a.m[3:-2], 1)) - - # if a is inf return inf (or NaN) - with m.Elif(a1.is_inf): - m.d.comb += self.o.out_do_z.eq(1) - m.d.comb += self.o.z.inf(a1.s) - # if a is inf and signs don't match return NaN - with m.If(bexp128s): - m.d.comb += self.o.z.nan(0) - - # if b is inf return inf - with m.Elif(b1.is_inf): - m.d.comb += self.o.out_do_z.eq(1) - m.d.comb += self.o.z.inf(b1.s) - - # if a is zero and b zero return signed-a/b - with m.Elif(abz): - m.d.comb += self.o.out_do_z.eq(1) - m.d.comb += self.o.z.create(a1.s & b1.s, b1.e, b1.m[3:-1]) - - # if a is zero return b - with m.Elif(a1.is_zero): - m.d.comb += self.o.out_do_z.eq(1) - m.d.comb += self.o.z.create(b1.s, b1.e, b1.m[3:-1]) - - # if b is zero return a - with m.Elif(b1.is_zero): - m.d.comb += self.o.out_do_z.eq(1) - m.d.comb += self.o.z.create(a1.s, a1.e, a1.m[3:-1]) - - # if a equal to -b return zero (+ve zero) - with m.Elif(aeqmb): - m.d.comb += self.o.out_do_z.eq(1) - m.d.comb += self.o.z.zero(0) - - # Denormalised Number checks next, so pass a/b data through - with m.Else(): - m.d.comb += self.o.out_do_z.eq(0) - - m.d.comb += self.o.oz.eq(self.o.z.v) - m.d.comb += self.o.mid.eq(self.i.mid) - - return m - - -class FPAddSpecialCases(FPState): - """ special cases: NaNs, infs, zeros, denormalised - NOTE: some of these are unique to add. 
see "Special Operations" - https://steve.hollasch.net/cgindex/coding/ieeefloat.html - """ - - def __init__(self, width, id_wid): - FPState.__init__(self, "special_cases") - self.mod = FPAddSpecialCasesMod(width) - self.out_z = self.mod.ospec() - self.out_do_z = Signal(reset_less=True) - - def setup(self, m, i): - """ links module to inputs and outputs - """ - self.mod.setup(m, i, self.out_do_z) - m.d.sync += self.out_z.v.eq(self.mod.out_z.v) # only take the output - m.d.sync += self.out_z.mid.eq(self.mod.o.mid) # (and mid) - - def action(self, m): - self.idsync(m) - with m.If(self.out_do_z): - m.next = "put_z" - with m.Else(): - m.next = "denormalise" - - -class FPAddSpecialCasesDeNorm(FPState, SimpleHandshake): - """ special cases: NaNs, infs, zeros, denormalised - NOTE: some of these are unique to add. see "Special Operations" - https://steve.hollasch.net/cgindex/coding/ieeefloat.html - """ - - def __init__(self, width, id_wid): - FPState.__init__(self, "special_cases") - self.width = width - self.id_wid = id_wid - SimpleHandshake.__init__(self, self) # pipe is its own stage - self.out = self.ospec() - - def ispec(self): - return FPADDBaseData(self.width, self.id_wid) # SpecialCases ispec - - def ospec(self): - return FPSCData(self.width, self.id_wid) # DeNorm ospec - - def setup(self, m, i): - """ links module to inputs and outputs - """ - smod = FPAddSpecialCasesMod(self.width, self.id_wid) - dmod = FPAddDeNormMod(self.width, self.id_wid) - - chain = StageChain([smod, dmod]) - chain.setup(m, i) - - # only needed for break-out (early-out) - # self.out_do_z = smod.o.out_do_z - - self.o = dmod.o - - def process(self, i): - return self.o - - def action(self, m): - # for break-out (early-out) - #with m.If(self.out_do_z): - # m.next = "put_z" - #with m.Else(): - m.d.sync += self.out.eq(self.process(None)) - m.next = "align" - - diff --git a/src/add/fpadd/statemachine.py b/src/add/fpadd/statemachine.py deleted file mode 100644 index 4418b3fa..00000000 --- 
a/src/add/fpadd/statemachine.py +++ /dev/null @@ -1,376 +0,0 @@ -# IEEE Floating Point Adder (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from nmigen import Module, Signal, Cat, Mux, Array, Const -from nmigen.cli import main, verilog -from math import log - -from fpbase import FPOpIn, FPOpOut -from fpbase import Trigger -from singlepipe import (StageChain, SimpleHandshake) - -from fpbase import FPState, FPID -from fpcommon.getop import (FPGetOp, FPADDBaseData, FPGet2Op) -from fpcommon.denorm import (FPSCData, FPAddDeNorm) -from fpcommon.postcalc import FPAddStage1Data -from fpcommon.postnormalise import (FPNorm1Data, - FPNorm1Single, FPNorm1Multi) -from fpcommon.roundz import (FPRoundData, FPRound) -from fpcommon.corrections import FPCorrections -from fpcommon.pack import (FPPackData, FPPackMod, FPPack) -from fpcommon.normtopack import FPNormToPack -from fpcommon.putz import (FPPutZ, FPPutZIdx) - -from fpadd.specialcases import (FPAddSpecialCases, FPAddSpecialCasesDeNorm) -from fpadd.align import (FPAddAlignMulti, FPAddAlignSingle) -from fpadd.add0 import (FPAddStage0Data, FPAddStage0) -from fpadd.add1 import (FPAddStage1Mod, FPAddStage1) -from fpadd.addstages import FPAddAlignSingleAdd - - -class FPOpData: - def __init__(self, width, id_wid): - self.z = FPOpOut(width) - self.z.data_o = Signal(width) - self.mid = Signal(id_wid, reset_less=True) - - def __iter__(self): - yield self.z - yield self.mid - - def eq(self, i): - return [self.z.eq(i.z), self.mid.eq(i.mid)] - - def ports(self): - return list(self) - - -class FPADDBaseMod: - - def __init__(self, width, id_wid=None, single_cycle=False, compact=True): - """ IEEE754 FP Add - - * width: bit-width of IEEE754. 
supported: 16, 32, 64 - * id_wid: an identifier that is sync-connected to the input - * single_cycle: True indicates each stage to complete in 1 clock - * compact: True indicates a reduced number of stages - """ - self.width = width - self.id_wid = id_wid - self.single_cycle = single_cycle - self.compact = compact - - self.in_t = Trigger() - self.i = self.ispec() - self.o = self.ospec() - - self.states = [] - - def ispec(self): - return FPADDBaseData(self.width, self.id_wid) - - def ospec(self): - return FPOpData(self.width, self.id_wid) - - def add_state(self, state): - self.states.append(state) - return state - - def elaborate(self, platform=None): - """ creates the HDL code-fragment for FPAdd - """ - m = Module() - m.submodules.out_z = self.o.z - m.submodules.in_t = self.in_t - if self.compact: - self.get_compact_fragment(m, platform) - else: - self.get_longer_fragment(m, platform) - - with m.FSM() as fsm: - - for state in self.states: - with m.State(state.state_from): - state.action(m) - - return m - - def get_longer_fragment(self, m, platform=None): - - get = self.add_state(FPGet2Op("get_ops", "special_cases", - self.width)) - get.setup(m, self.i) - a = get.out_op1 - b = get.out_op2 - get.trigger_setup(m, self.in_t.stb, self.in_t.ack) - - sc = self.add_state(FPAddSpecialCases(self.width, self.id_wid)) - sc.setup(m, a, b, self.in_mid) - - dn = self.add_state(FPAddDeNorm(self.width, self.id_wid)) - dn.setup(m, a, b, sc.in_mid) - - if self.single_cycle: - alm = self.add_state(FPAddAlignSingle(self.width, self.id_wid)) - alm.setup(m, dn.out_a, dn.out_b, dn.in_mid) - else: - alm = self.add_state(FPAddAlignMulti(self.width, self.id_wid)) - alm.setup(m, dn.out_a, dn.out_b, dn.in_mid) - - add0 = self.add_state(FPAddStage0(self.width, self.id_wid)) - add0.setup(m, alm.out_a, alm.out_b, alm.in_mid) - - add1 = self.add_state(FPAddStage1(self.width, self.id_wid)) - add1.setup(m, add0.out_tot, add0.out_z, add0.in_mid) - - if self.single_cycle: - n1 = 
self.add_state(FPNorm1Single(self.width, self.id_wid)) - n1.setup(m, add1.out_z, add1.out_of, add0.in_mid) - else: - n1 = self.add_state(FPNorm1Multi(self.width, self.id_wid)) - n1.setup(m, add1.out_z, add1.out_of, add1.norm_stb, add0.in_mid) - - rn = self.add_state(FPRound(self.width, self.id_wid)) - rn.setup(m, n1.out_z, n1.out_roundz, n1.in_mid) - - cor = self.add_state(FPCorrections(self.width, self.id_wid)) - cor.setup(m, rn.out_z, rn.in_mid) - - pa = self.add_state(FPPack(self.width, self.id_wid)) - pa.setup(m, cor.out_z, rn.in_mid) - - ppz = self.add_state(FPPutZ("pack_put_z", pa.out_z, self.out_z, - pa.in_mid, self.out_mid)) - - pz = self.add_state(FPPutZ("put_z", sc.out_z, self.out_z, - pa.in_mid, self.out_mid)) - - def get_compact_fragment(self, m, platform=None): - - get = FPGet2Op("get_ops", "special_cases", self.width, self.id_wid) - sc = FPAddSpecialCasesDeNorm(self.width, self.id_wid) - alm = FPAddAlignSingleAdd(self.width, self.id_wid) - n1 = FPNormToPack(self.width, self.id_wid) - - get.trigger_setup(m, self.in_t.stb, self.in_t.ack) - - chainlist = [get, sc, alm, n1] - chain = StageChain(chainlist, specallocate=True) - chain.setup(m, self.i) - - for mod in chainlist: - sc = self.add_state(mod) - - ppz = self.add_state(FPPutZ("pack_put_z", n1.out_z.z, self.o, - n1.out_z.mid, self.o.mid)) - - #pz = self.add_state(FPPutZ("put_z", sc.out_z.z, self.o, - # sc.o.mid, self.o.mid)) - - -class FPADDBase(FPState): - - def __init__(self, width, id_wid=None, single_cycle=False): - """ IEEE754 FP Add - - * width: bit-width of IEEE754. 
supported: 16, 32, 64 - * id_wid: an identifier that is sync-connected to the input - * single_cycle: True indicates each stage to complete in 1 clock - """ - FPState.__init__(self, "fpadd") - self.width = width - self.single_cycle = single_cycle - self.mod = FPADDBaseMod(width, id_wid, single_cycle) - self.o = self.ospec() - - self.in_t = Trigger() - self.i = self.ispec() - - self.z_done = Signal(reset_less=True) # connects to out_z Strobe - self.in_accept = Signal(reset_less=True) - self.add_stb = Signal(reset_less=True) - self.add_ack = Signal(reset=0, reset_less=True) - - def ispec(self): - return self.mod.ispec() - - def ospec(self): - return self.mod.ospec() - - def setup(self, m, i, add_stb, in_mid): - m.d.comb += [self.i.eq(i), - self.mod.i.eq(self.i), - self.z_done.eq(self.mod.o.z.trigger), - #self.add_stb.eq(add_stb), - self.mod.in_t.stb.eq(self.in_t.stb), - self.in_t.ack.eq(self.mod.in_t.ack), - self.o.mid.eq(self.mod.o.mid), - self.o.z.v.eq(self.mod.o.z.v), - self.o.z.valid_o.eq(self.mod.o.z.valid_o), - self.mod.o.z.ready_i.eq(self.o.z.ready_i_test), - ] - - m.d.sync += self.add_stb.eq(add_stb) - m.d.sync += self.add_ack.eq(0) # sets to zero when not in active state - m.d.sync += self.o.z.ready_i.eq(0) # likewise - #m.d.sync += self.in_t.stb.eq(0) - - m.submodules.fpadd = self.mod - - def action(self, m): - - # in_accept is set on incoming strobe HIGH and ack LOW. - m.d.comb += self.in_accept.eq((~self.add_ack) & (self.add_stb)) - - #with m.If(self.in_t.ack): - # m.d.sync += self.in_t.stb.eq(0) - with m.If(~self.z_done): - # not done: test for accepting an incoming operand pair - with m.If(self.in_accept): - m.d.sync += [ - self.add_ack.eq(1), # acknowledge receipt... 
- self.in_t.stb.eq(1), # initiate add - ] - with m.Else(): - m.d.sync += [self.add_ack.eq(0), - self.in_t.stb.eq(0), - self.o.z.ready_i.eq(1), - ] - with m.Else(): - # done: acknowledge, and write out id and value - m.d.sync += [self.add_ack.eq(1), - self.in_t.stb.eq(0) - ] - m.next = "put_z" - - return - - if self.in_mid is not None: - m.d.sync += self.out_mid.eq(self.mod.out_mid) - - m.d.sync += [ - self.out_z.v.eq(self.mod.out_z.v) - ] - # move to output state on detecting z ack - with m.If(self.out_z.trigger): - m.d.sync += self.out_z.stb.eq(0) - m.next = "put_z" - with m.Else(): - m.d.sync += self.out_z.stb.eq(1) - - -class FPADD(FPID): - """ FPADD: stages as follows: - - FPGetOp (a) - | - FPGetOp (b) - | - FPAddBase---> FPAddBaseMod - | | - PutZ GetOps->Specials->Align->Add1/2->Norm->Round/Pack->PutZ - - FPAddBase is tricky: it is both a stage and *has* stages. - Connection to FPAddBaseMod therefore requires an in stb/ack - and an out stb/ack. Just as with Add1-Norm1 interaction, FPGetOp - needs to be the thing that raises the incoming stb. - """ - - def __init__(self, width, id_wid=None, single_cycle=False, rs_sz=2): - """ IEEE754 FP Add - - * width: bit-width of IEEE754. 
supported: 16, 32, 64 - * id_wid: an identifier that is sync-connected to the input - * single_cycle: True indicates each stage to complete in 1 clock - """ - self.width = width - self.id_wid = id_wid - self.single_cycle = single_cycle - - #self.out_z = FPOp(width) - self.ids = FPID(id_wid) - - rs = [] - for i in range(rs_sz): - in_a = FPOpIn(width) - in_b = FPOpIn(width) - in_a.data_i = Signal(width) - in_b.data_i = Signal(width) - in_a.name = "in_a_%d" % i - in_b.name = "in_b_%d" % i - rs.append((in_a, in_b)) - self.rs = Array(rs) - - res = [] - for i in range(rs_sz): - out_z = FPOpOut(width) - out_z.data_o = Signal(width) - out_z.name = "out_z_%d" % i - res.append(out_z) - self.res = Array(res) - - self.states = [] - - def add_state(self, state): - self.states.append(state) - return state - - def elaborate(self, platform=None): - """ creates the HDL code-fragment for FPAdd - """ - m = Module() - #m.submodules += self.rs - - in_a = self.rs[0][0] - in_b = self.rs[0][1] - - geta = self.add_state(FPGetOp("get_a", "get_b", - in_a, self.width)) - geta.setup(m, in_a) - a = geta.out_op - - getb = self.add_state(FPGetOp("get_b", "fpadd", - in_b, self.width)) - getb.setup(m, in_b) - b = getb.out_op - - ab = FPADDBase(self.width, self.id_wid, self.single_cycle) - ab = self.add_state(ab) - abd = ab.ispec() # create an input spec object for FPADDBase - m.d.sync += [abd.a.eq(a), abd.b.eq(b), abd.mid.eq(self.ids.in_mid)] - ab.setup(m, abd, getb.out_decode, self.ids.in_mid) - o = ab.o - - pz = self.add_state(FPPutZIdx("put_z", o.z, self.res, - o.mid, "get_a")) - - with m.FSM() as fsm: - - for state in self.states: - with m.State(state.state_from): - state.action(m) - - return m - - -if __name__ == "__main__": - if True: - alu = FPADD(width=32, id_wid=5, single_cycle=True) - main(alu, ports=alu.rs[0][0].ports() + \ - alu.rs[0][1].ports() + \ - alu.res[0].ports() + \ - [alu.ids.in_mid, alu.ids.out_mid]) - else: - alu = FPADDBase(width=32, id_wid=5, single_cycle=True) - main(alu, 
ports=[alu.in_a, alu.in_b] + \ - alu.in_t.ports() + \ - alu.out_z.ports() + \ - [alu.in_mid, alu.out_mid]) - - - # works... but don't use, just do "python fname.py convert -t v" - #print (verilog.convert(alu, ports=[ - # ports=alu.in_a.ports() + \ - # alu.in_b.ports() + \ - # alu.out_z.ports()) diff --git a/src/add/fpbase.py b/src/add/fpbase.py deleted file mode 100644 index f4908592..00000000 --- a/src/add/fpbase.py +++ /dev/null @@ -1,733 +0,0 @@ -# IEEE Floating Point Adder (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from nmigen import Signal, Cat, Const, Mux, Module, Elaboratable -from math import log -from operator import or_ -from functools import reduce - -from singlepipe import PrevControl, NextControl -from pipeline import ObjectProxy - - -class MultiShiftR: - - def __init__(self, width): - self.width = width - self.smax = int(log(width) / log(2)) - self.i = Signal(width, reset_less=True) - self.s = Signal(self.smax, reset_less=True) - self.o = Signal(width, reset_less=True) - - def elaborate(self, platform): - m = Module() - m.d.comb += self.o.eq(self.i >> self.s) - return m - - -class MultiShift: - """ Generates variable-length single-cycle shifter from a series - of conditional tests on each bit of the left/right shift operand. - Each bit tested produces output shifted by that number of bits, - in a binary fashion: bit 1 if set shifts by 1 bit, bit 2 if set - shifts by 2 bits, each partial result cascading to the next Mux. - - Could be adapted to do arithmetic shift by taking copies of the - MSB instead of zeros. 
- """ - - def __init__(self, width): - self.width = width - self.smax = int(log(width) / log(2)) - - def lshift(self, op, s): - res = op << s - return res[:len(op)] - res = op - for i in range(self.smax): - zeros = [0] * (1<> s - return res[:len(op)] - res = op - for i in range(self.smax): - zeros = [0] * (1< 0) - m.d.comb += self.exp_lt_n126.eq(self.exp_sub_n126 < 0) - m.d.comb += self.exp_gt127.eq(self.e > self.P127) - m.d.comb += self.exp_n127.eq(self.e == self.N127) - m.d.comb += self.exp_n126.eq(self.e == self.N126) - m.d.comb += self.m_zero.eq(self.m == self.mzero) - m.d.comb += self.m_msbzero.eq(self.m[self.e_start] == 0) - - return m - - def _is_nan(self): - return (self.exp_128) & (~self.m_zero) - - def _is_inf(self): - return (self.exp_128) & (self.m_zero) - - def _is_zero(self): - return (self.exp_n127) & (self.m_zero) - - def _is_overflowed(self): - return self.exp_gt127 - - def _is_denormalised(self): - return (self.exp_n126) & (self.m_msbzero) - - def __iter__(self): - yield self.s - yield self.e - yield self.m - - def eq(self, inp): - return [self.s.eq(inp.s), self.e.eq(inp.e), self.m.eq(inp.m)] - - -class FPNumOut(FPNumBase): - """ Floating-point Number Class - - Contains signals for an incoming copy of the value, decoded into - sign / exponent / mantissa. - Also contains encoding functions, creation and recognition of - zero, NaN and inf (all signed) - - Four extra bits are included in the mantissa: the top bit - (m[-1]) is effectively a carry-overflow. 
The other three are - guard (m[2]), round (m[1]), and sticky (m[0]) - """ - def __init__(self, width, m_extra=True): - FPNumBase.__init__(self, width, m_extra) - - def elaborate(self, platform): - m = FPNumBase.elaborate(self, platform) - - return m - - def create(self, s, e, m): - """ creates a value from sign / exponent / mantissa - - bias is added here, to the exponent - """ - return [ - self.v[-1].eq(s), # sign - self.v[self.e_start:self.e_end].eq(e + self.P127), # exp (add on bias) - self.v[0:self.e_start].eq(m) # mantissa - ] - - def nan(self, s): - return self.create(s, self.P128, 1<<(self.e_start-1)) - - def inf(self, s): - return self.create(s, self.P128, 0) - - def zero(self, s): - return self.create(s, self.N127, 0) - - def create2(self, s, e, m): - """ creates a value from sign / exponent / mantissa - - bias is added here, to the exponent - """ - e = e + self.P127 # exp (add on bias) - return Cat(m[0:self.e_start], - e[0:self.e_end-self.e_start], - s) - - def nan2(self, s): - return self.create2(s, self.P128, self.msb1) - - def inf2(self, s): - return self.create2(s, self.P128, self.mzero) - - def zero2(self, s): - return self.create2(s, self.N127, self.mzero) - - -class MultiShiftRMerge(Elaboratable): - """ shifts down (right) and merges lower bits into m[0]. 
- m[0] is the "sticky" bit, basically - """ - def __init__(self, width, s_max=None): - if s_max is None: - s_max = int(log(width) / log(2)) - self.smax = s_max - self.m = Signal(width, reset_less=True) - self.inp = Signal(width, reset_less=True) - self.diff = Signal(s_max, reset_less=True) - self.width = width - - def elaborate(self, platform): - m = Module() - - rs = Signal(self.width, reset_less=True) - m_mask = Signal(self.width, reset_less=True) - smask = Signal(self.width, reset_less=True) - stickybit = Signal(reset_less=True) - maxslen = Signal(self.smax, reset_less=True) - maxsleni = Signal(self.smax, reset_less=True) - - sm = MultiShift(self.width-1) - m0s = Const(0, self.width-1) - mw = Const(self.width-1, len(self.diff)) - m.d.comb += [maxslen.eq(Mux(self.diff > mw, mw, self.diff)), - maxsleni.eq(Mux(self.diff > mw, 0, mw-self.diff)), - ] - - m.d.comb += [ - # shift mantissa by maxslen, mask by inverse - rs.eq(sm.rshift(self.inp[1:], maxslen)), - m_mask.eq(sm.rshift(~m0s, maxsleni)), - smask.eq(self.inp[1:] & m_mask), - # sticky bit combines all mask (and mantissa low bit) - stickybit.eq(smask.bool() | self.inp[0]), - # mantissa result contains m[0] already. - self.m.eq(Cat(stickybit, rs)) - ] - return m - - -class FPNumShift(FPNumBase, Elaboratable): - """ Floating-point Number Class for shifting - """ - def __init__(self, mainm, op, inv, width, m_extra=True): - FPNumBase.__init__(self, width, m_extra) - self.latch_in = Signal() - self.mainm = mainm - self.inv = inv - self.op = op - - def elaborate(self, platform): - m = FPNumBase.elaborate(self, platform) - - m.d.comb += self.s.eq(op.s) - m.d.comb += self.e.eq(op.e) - m.d.comb += self.m.eq(op.m) - - with self.mainm.State("align"): - with m.If(self.e < self.inv.e): - m.d.sync += self.shift_down() - - return m - - def shift_down(self, inp): - """ shifts a mantissa down by one. 
exponent is increased to compensate - - accuracy is lost as a result in the mantissa however there are 3 - guard bits (the latter of which is the "sticky" bit) - """ - return [self.e.eq(inp.e + 1), - self.m.eq(Cat(inp.m[0] | inp.m[1], inp.m[2:], 0)) - ] - - def shift_down_multi(self, diff): - """ shifts a mantissa down. exponent is increased to compensate - - accuracy is lost as a result in the mantissa however there are 3 - guard bits (the latter of which is the "sticky" bit) - - this code works by variable-shifting the mantissa by up to - its maximum bit-length: no point doing more (it'll still be - zero). - - the sticky bit is computed by shifting a batch of 1s by - the same amount, which will introduce zeros. it's then - inverted and used as a mask to get the LSBs of the mantissa. - those are then |'d into the sticky bit. - """ - sm = MultiShift(self.width) - mw = Const(self.m_width-1, len(diff)) - maxslen = Mux(diff > mw, mw, diff) - rs = sm.rshift(self.m[1:], maxslen) - maxsleni = mw - maxslen - m_mask = sm.rshift(self.m1s[1:], maxsleni) # shift and invert - - stickybits = reduce(or_, self.m[1:] & m_mask) | self.m[0] - return [self.e.eq(self.e + diff), - self.m.eq(Cat(stickybits, rs)) - ] - - def shift_up_multi(self, diff): - """ shifts a mantissa up. exponent is decreased to compensate - """ - sm = MultiShift(self.width) - mw = Const(self.m_width, len(diff)) - maxslen = Mux(diff > mw, mw, diff) - - return [self.e.eq(self.e - diff), - self.m.eq(sm.lshift(self.m, maxslen)) - ] - - -class FPNumDecode(FPNumBase): - """ Floating-point Number Class - - Contains signals for an incoming copy of the value, decoded into - sign / exponent / mantissa. - Also contains encoding functions, creation and recognition of - zero, NaN and inf (all signed) - - Four extra bits are included in the mantissa: the top bit - (m[-1]) is effectively a carry-overflow. 
The other three are - guard (m[2]), round (m[1]), and sticky (m[0]) - """ - def __init__(self, op, width, m_extra=True): - FPNumBase.__init__(self, width, m_extra) - self.op = op - - def elaborate(self, platform): - m = FPNumBase.elaborate(self, platform) - - m.d.comb += self.decode(self.v) - - return m - - def decode(self, v): - """ decodes a latched value into sign / exponent / mantissa - - bias is subtracted here, from the exponent. exponent - is extended to 10 bits so that subtract 127 is done on - a 10-bit number - """ - args = [0] * self.m_extra + [v[0:self.e_start]] # pad with extra zeros - #print ("decode", self.e_end) - return [self.m.eq(Cat(*args)), # mantissa - self.e.eq(v[self.e_start:self.e_end] - self.P127), # exp - self.s.eq(v[-1]), # sign - ] - -class FPNumIn(FPNumBase): - """ Floating-point Number Class - - Contains signals for an incoming copy of the value, decoded into - sign / exponent / mantissa. - Also contains encoding functions, creation and recognition of - zero, NaN and inf (all signed) - - Four extra bits are included in the mantissa: the top bit - (m[-1]) is effectively a carry-overflow. The other three are - guard (m[2]), round (m[1]), and sticky (m[0]) - """ - def __init__(self, op, width, m_extra=True): - FPNumBase.__init__(self, width, m_extra) - self.latch_in = Signal() - self.op = op - - def decode2(self, m): - """ decodes a latched value into sign / exponent / mantissa - - bias is subtracted here, from the exponent. exponent - is extended to 10 bits so that subtract 127 is done on - a 10-bit number - """ - v = self.v - args = [0] * self.m_extra + [v[0:self.e_start]] # pad with extra zeros - #print ("decode", self.e_end) - res = ObjectProxy(m, pipemode=False) - res.m = Cat(*args) # mantissa - res.e = v[self.e_start:self.e_end] - self.P127 # exp - res.s = v[-1] # sign - return res - - def decode(self, v): - """ decodes a latched value into sign / exponent / mantissa - - bias is subtracted here, from the exponent. 
exponent - is extended to 10 bits so that subtract 127 is done on - a 10-bit number - """ - args = [0] * self.m_extra + [v[0:self.e_start]] # pad with extra zeros - #print ("decode", self.e_end) - return [self.m.eq(Cat(*args)), # mantissa - self.e.eq(v[self.e_start:self.e_end] - self.P127), # exp - self.s.eq(v[-1]), # sign - ] - - def shift_down(self, inp): - """ shifts a mantissa down by one. exponent is increased to compensate - - accuracy is lost as a result in the mantissa however there are 3 - guard bits (the latter of which is the "sticky" bit) - """ - return [self.e.eq(inp.e + 1), - self.m.eq(Cat(inp.m[0] | inp.m[1], inp.m[2:], 0)) - ] - - def shift_down_multi(self, diff, inp=None): - """ shifts a mantissa down. exponent is increased to compensate - - accuracy is lost as a result in the mantissa however there are 3 - guard bits (the latter of which is the "sticky" bit) - - this code works by variable-shifting the mantissa by up to - its maximum bit-length: no point doing more (it'll still be - zero). - - the sticky bit is computed by shifting a batch of 1s by - the same amount, which will introduce zeros. it's then - inverted and used as a mask to get the LSBs of the mantissa. - those are then |'d into the sticky bit. - """ - if inp is None: - inp = self - sm = MultiShift(self.width) - mw = Const(self.m_width-1, len(diff)) - maxslen = Mux(diff > mw, mw, diff) - rs = sm.rshift(inp.m[1:], maxslen) - maxsleni = mw - maxslen - m_mask = sm.rshift(self.m1s[1:], maxsleni) # shift and invert - - #stickybit = reduce(or_, inp.m[1:] & m_mask) | inp.m[0] - stickybit = (inp.m[1:] & m_mask).bool() | inp.m[0] - return [self.e.eq(inp.e + diff), - self.m.eq(Cat(stickybit, rs)) - ] - - def shift_up_multi(self, diff): - """ shifts a mantissa up. 
class Trigger(Elaboratable):
    """Combines a stb/ack handshake pair into a single combinatorial pulse.

    trigger is high only while both stb and ack are high.
    """

    def __init__(self):
        self.stb = Signal(reset=0)
        self.ack = Signal()
        self.trigger = Signal(reset_less=True)

    def elaborate(self, platform):
        m = Module()
        m.d.comb += self.trigger.eq(self.stb & self.ack)
        return m

    def eq(self, inp):
        """copy stb/ack from another Trigger-like object"""
        return [self.stb.eq(inp.stb),
                self.ack.eq(inp.ack)
               ]

    def ports(self):
        return [self.stb, self.ack]


class FPOpIn(PrevControl):
    """Input-side FP operand port: PrevControl with a value alias (v).

    chain_from / chain_inv connect this port to an upstream FPOp-style
    object (v / stb / ack); they differ only in whether the returned
    ACK is inverted, so the common body is factored into _chain.
    """

    def __init__(self, width):
        PrevControl.__init__(self)
        self.width = width

    @property
    def v(self):
        # alias: the operand value is the PrevControl input data
        return self.data_i

    def _chain(self, in_op, extra, inv):
        """receive value and STB from in_op, send (optionally inverted) ACK.

        extra, when not None, is ANDed into the received STB.
        """
        stb = in_op.stb
        if extra is not None:
            stb = stb & extra
        ack = ~self.ack if inv else self.ack
        return [self.v.eq(in_op.v),   # receive value
                self.stb.eq(stb),     # receive STB
                in_op.ack.eq(ack),    # send ACK
               ]

    def chain_inv(self, in_op, extra=None):
        return self._chain(in_op, extra, inv=True)

    def chain_from(self, in_op, extra=None):
        return self._chain(in_op, extra, inv=False)


class FPOpOut(NextControl):
    """Output-side FP operand port: NextControl with a value alias (v).

    Mirror of FPOpIn; same chain_from/chain_inv API (deduplicated
    through _chain in the same way).
    """

    def __init__(self, width):
        NextControl.__init__(self)
        self.width = width

    @property
    def v(self):
        # alias: the operand value is the NextControl output data
        return self.data_o

    def _chain(self, in_op, extra, inv):
        """receive value and STB from in_op, send (optionally inverted) ACK"""
        stb = in_op.stb
        if extra is not None:
            stb = stb & extra
        ack = ~self.ack if inv else self.ack
        return [self.v.eq(in_op.v),   # receive value
                self.stb.eq(stb),     # receive STB
                in_op.ack.eq(ack),    # send ACK
               ]

    def chain_inv(self, in_op, extra=None):
        return self._chain(in_op, extra, inv=True)

    def chain_from(self, in_op, extra=None):
        return self._chain(in_op, extra, inv=False)


class Overflow:  # (Elaboratable):
    """Rounding-decision bits carried alongside a mantissa.

    guard/round_bit/sticky come from the three bits shifted out of the
    bottom of an extended mantissa (tot[2]/tot[1]/tot[0]); m0 is a copy
    of the mantissa LSB.  roundz (combinatorial, see elaborate) is the
    round-to-nearest-even decision.
    """

    def __init__(self):
        self.guard = Signal(reset_less=True)      # tot[2]
        self.round_bit = Signal(reset_less=True)  # tot[1]
        self.sticky = Signal(reset_less=True)     # tot[0]
        self.m0 = Signal(reset_less=True)         # mantissa zero bit

        self.roundz = Signal(reset_less=True)

    def __iter__(self):
        yield self.guard
        yield self.round_bit
        yield self.sticky
        yield self.m0

    def eq(self, inp):
        return [self.guard.eq(inp.guard),
                self.round_bit.eq(inp.round_bit),
                self.sticky.eq(inp.sticky),
                self.m0.eq(inp.m0)]

    def elaborate(self, platform):
        m = Module()
        # round up when guard set and any of round/sticky/LSB set
        m.d.comb += self.roundz.eq(self.guard &
                                   (self.round_bit | self.sticky | self.m0))
        return m


class FPBase:
    """ IEEE754 Floating Point Base Class

        contains common functions for FP manipulation, such as
        extracting and packing operands, normalisation, denormalisation,
        rounding etc.
    """

    def get_op(self, m, op, v, next_state):
        """ this function moves to the next state and copies the operand
            when both ready and valid are 1.
            acknowledgement is sent by setting ack to ZERO.
        """
        res = v.decode2(m)
        ack = Signal()
        with m.If((op.ready_o) & (op.valid_i_test)):
            m.next = next_state
            # op is latched in from FPNumIn class on same ack/stb
            m.d.comb += ack.eq(0)
        with m.Else():
            m.d.comb += ack.eq(1)
        return [res, ack]

    def denormalise(self, m, a):
        """ denormalises a number.  this is probably the wrong name for
            this function.  for normalised numbers (exponent != minimum)
            one *extra* bit (the implicit 1) is added *back in*.
            for denormalised numbers, the mantissa is left alone
            and the exponent increased by 1.

            both cases *effectively multiply the number stored by 2*,
            which has to be taken into account when extracting the result.
        """
        with m.If(a.exp_n127):
            m.d.sync += a.e.eq(a.N126)   # limit a exponent
        with m.Else():
            m.d.sync += a.m[-1].eq(1)    # set top mantissa bit

    def op_normalise(self, m, op, next_state):
        """ operand normalisation
            NOTE: just like "align", this one keeps going round every clock
            until the result's exponent is within acceptable "range"
        """
        with m.If((op.m[-1] == 0)):  # check last bit of mantissa
            m.d.sync += [
                op.e.eq(op.e - 1),   # DECREASE exponent
                op.m.eq(op.m << 1),  # shift mantissa UP
            ]
        with m.Else():
            m.next = next_state

    def normalise_1(self, m, z, of, next_state):
        """ first stage normalisation

            NOTE: just like "align", this one keeps going round every clock
            until the result's exponent is within acceptable "range"
            NOTE: the weirdness of reassigning guard and round is due to
            the extra mantissa bits coming from tot[0..2]
        """
        with m.If((z.m[-1] == 0) & (z.e > z.N126)):
            m.d.sync += [
                z.e.eq(z.e - 1),             # DECREASE exponent
                z.m.eq(z.m << 1),            # shift mantissa UP
                z.m[0].eq(of.guard),         # steal guard bit (was tot[2])
                of.guard.eq(of.round_bit),   # steal round_bit (was tot[1])
                of.round_bit.eq(0),          # reset round bit
                of.m0.eq(of.guard),
            ]
        with m.Else():
            m.next = next_state

    def normalise_2(self, m, z, of, next_state):
        """ second stage normalisation

            NOTE: just like "align", this one keeps going round every clock
            until the result's exponent is within acceptable "range"
            NOTE: the weirdness of reassigning guard and round is due to
            the extra mantissa bits coming from tot[0..2]
        """
        with m.If(z.e < z.N126):
            m.d.sync += [
                z.e.eq(z.e + 1),     # INCREASE exponent
                z.m.eq(z.m >> 1),    # shift mantissa DOWN
                of.guard.eq(z.m[0]),
                of.m0.eq(z.m[1]),
                of.round_bit.eq(of.guard),
                of.sticky.eq(of.sticky | of.round_bit)
            ]
        with m.Else():
            m.next = next_state

    def roundz(self, m, z, roundz):
        """ performs rounding on the output.
            TODO: different kinds of rounding
        """
        with m.If(roundz):
            m.d.sync += z.m.eq(z.m + 1)      # mantissa rounds up
            with m.If(z.m == z.m1s):         # all 1s
                m.d.sync += z.e.eq(z.e + 1)  # exponent rounds up

    def corrections(self, m, z, next_state):
        """ denormalisation and sign-bug corrections
        """
        m.next = next_state
        # denormalised, correct exponent to zero
        with m.If(z.is_denormalised):
            m.d.sync += z.e.eq(z.N127)

    def pack(self, m, z, next_state):
        """ packs the result into the output (detects overflow->Inf)
        """
        m.next = next_state
        # if overflow occurs, return inf
        with m.If(z.is_overflowed):
            m.d.sync += z.inf(z.s)
        with m.Else():
            m.d.sync += z.create(z.s, z.e, z.m)

    def put_z(self, m, z, out_z, next_state):
        """ put_z: stores the result in the output.  raises valid_o and
            waits for the downstream ready to be seen before moving to the
            next state, resetting valid_o back to zero on acknowledgement.
        """
        m.d.sync += [
            out_z.v.eq(z.v)
        ]
        with m.If(out_z.valid_o & out_z.ready_i_test):
            m.d.sync += out_z.valid_o.eq(0)
            m.next = next_state
        with m.Else():
            m.d.sync += out_z.valid_o.eq(1)


class FPState(FPBase):
    """A named FSM state; inputs/outputs are attached as attributes."""

    def __init__(self, state_from):
        self.state_from = state_from

    def set_inputs(self, inputs):
        self.inputs = inputs
        for k, v in inputs.items():
            setattr(self, k, v)

    def set_outputs(self, outputs):
        self.outputs = outputs
        for k, v in outputs.items():
            setattr(self, k, v)


class FPID:
    """Carries a muxid ("mid") through a pipeline stage.

    When id_wid is falsy (None or 0) no id signals are created and
    idsync is a no-op.
    """

    def __init__(self, id_wid):
        self.id_wid = id_wid
        if self.id_wid:
            self.in_mid = Signal(id_wid, reset_less=True)
            self.out_mid = Signal(id_wid, reset_less=True)
        else:
            self.in_mid = None
            self.out_mid = None

    def idsync(self, m):
        # BUG FIX: previously tested "self.id_wid is not None", which is
        # True for id_wid == 0 even though __init__ (truthiness test)
        # created no signals in that case -- eq(None) would then crash.
        # Test the signals themselves so both checks always agree.
        if self.out_mid is not None:
            m.d.sync += self.out_mid.eq(self.in_mid)
class FPCorrectionsMod(Elaboratable):
    """Combinatorial post-round correction stage.

    Copies the input FPRoundData through and, when the result is a
    denormalised number (and no early-out result is pending), forces
    the exponent to the denormal encoding (N127).
    """

    def __init__(self, width, id_wid):
        self.width = width
        self.id_wid = id_wid
        self.i = self.ispec()
        self.out_z = self.ospec()

    def ispec(self):
        return FPRoundData(self.width, self.id_wid)

    def ospec(self):
        return FPRoundData(self.width, self.id_wid)

    def process(self, i):
        return self.out_z

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        m.submodules.corrections = self
        m.d.comb += self.i.eq(i)

    def elaborate(self, platform):
        m = Module()
        m.submodules.corr_in_z = self.i.z
        m.submodules.corr_out_z = self.out_z.z
        m.d.comb += self.out_z.eq(self.i)  # copies mid, z, out_do_z
        with m.If(~self.i.out_do_z):       # skip when early-out is active
            with m.If(self.i.z.is_denormalised):
                m.d.comb += self.out_z.z.e.eq(self.i.z.N127)
        return m


class FPCorrections(FPState):
    """FSM wrapper around FPCorrectionsMod (state "corrections")."""

    def __init__(self, width, id_wid):
        FPState.__init__(self, "corrections")
        # BUG FIX: FPCorrectionsMod requires id_wid; it was not passed,
        # raising TypeError on construction.
        self.mod = FPCorrectionsMod(width, id_wid)
        self.out_z = self.ospec()

    def ispec(self):
        return self.mod.ispec()

    def ospec(self):
        return self.mod.ospec()

    def setup(self, m, in_z):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, in_z)

        m.d.sync += self.out_z.eq(self.mod.out_z)
        # BUG FIX: the module exposes .out_z, not .o
        m.d.sync += self.out_z.mid.eq(self.mod.out_z.mid)

    def action(self, m):
        m.next = "pack"
class FPSCData:
    """Special-cases/denorm inter-stage data: operands a/b, early-out
    result (z/oz/out_do_z) and the muxid (mid).
    """

    def __init__(self, width, id_wid):
        self.a = FPNumBase(width, True)
        self.b = FPNumBase(width, True)
        self.z = FPNumOut(width, False)
        self.oz = Signal(width, reset_less=True)
        self.out_do_z = Signal(reset_less=True)
        self.mid = Signal(id_wid, reset_less=True)

    def __iter__(self):
        yield from self.a
        yield from self.b
        yield from self.z
        yield self.oz
        yield self.out_do_z
        yield self.mid

    def eq(self, i):
        return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz),
                self.a.eq(i.a), self.b.eq(i.b), self.mid.eq(i.mid)]


class FPAddDeNormMod(FPState):
    """Combinatorial denormalisation of both operands.

    For each operand: if the exponent is at minimum (exp_n127) clamp it
    to N126, otherwise set the implicit top mantissa bit.  The early-out
    path (z/oz/out_do_z) and mid are passed straight through.
    """

    def __init__(self, width, id_wid):
        self.width = width
        self.id_wid = id_wid
        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        return FPSCData(self.width, self.id_wid)

    def ospec(self):
        return FPSCData(self.width, self.id_wid)

    def process(self, i):
        return self.o

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        m.submodules.denormalise = self
        m.d.comb += self.i.eq(i)

    def _denorm_one(self, m, op_o, op_i):
        """denormalise a single operand (factored: was duplicated for a/b,
        flagged in the original with an XXX about repeated code)"""
        m.d.comb += op_o.eq(op_i)
        with m.If(op_i.exp_n127):
            m.d.comb += op_o.e.eq(op_i.N126)   # limit exponent
        with m.Else():
            m.d.comb += op_o.m[-1].eq(1)       # set top mantissa bit

    def elaborate(self, platform):
        m = Module()
        m.submodules.denorm_in_a = self.i.a
        m.submodules.denorm_in_b = self.i.b
        m.submodules.denorm_out_a = self.o.a
        m.submodules.denorm_out_b = self.o.b

        with m.If(~self.i.out_do_z):
            self._denorm_one(m, self.o.a, self.i.a)
            self._denorm_one(m, self.o.b, self.i.b)

        m.d.comb += self.o.mid.eq(self.i.mid)
        m.d.comb += self.o.z.eq(self.i.z)
        m.d.comb += self.o.out_do_z.eq(self.i.out_do_z)
        m.d.comb += self.o.oz.eq(self.i.oz)

        return m


class FPAddDeNorm(FPState):
    """FSM wrapper around FPAddDeNormMod (state "denormalise")."""

    def __init__(self, width, id_wid):
        FPState.__init__(self, "denormalise")
        # BUG FIX: FPAddDeNormMod requires id_wid; it was not passed,
        # raising TypeError on construction.
        self.mod = FPAddDeNormMod(width, id_wid)
        self.out_a = FPNumBase(width)
        self.out_b = FPNumBase(width)

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, i)

        # BUG FIX: the module exposes its outputs as .o (an FPSCData
        # with .a/.b), not as out_a/out_b attributes.
        m.d.sync += self.out_a.eq(self.mod.o.a)
        m.d.sync += self.out_b.eq(self.mod.o.b)

    def action(self, m):
        # Denormalised Number checks
        m.next = "align"
class FPNumBase2Ops:
    """Pair of FPNumBase operands plus a muxid, used between stages."""

    def __init__(self, width, id_wid, m_extra=True):
        self.a = FPNumBase(width, m_extra)
        self.b = FPNumBase(width, m_extra)
        self.mid = Signal(id_wid, reset_less=True)

    def eq(self, i):
        # copy every field across from a like-shaped record
        pairs = ((self.a, i.a), (self.b, i.b), (self.mid, i.mid))
        return [dst.eq(src) for dst, src in pairs]

    def ports(self):
        return [self.a, self.b, self.mid]


class FPADDBaseData:
    """Raw adder input record: two width-bit operands and a muxid."""

    def __init__(self, width, id_wid):
        self.width = width
        self.id_wid = id_wid
        self.a = Signal(width)
        self.b = Signal(width)
        self.mid = Signal(id_wid, reset_less=True)

    def eq(self, i):
        # copy every field across from a like-shaped record
        pairs = ((self.a, i.a), (self.b, i.b), (self.mid, i.mid))
        return [dst.eq(src) for dst, src in pairs]

    def ports(self):
        return [self.a, self.b, self.mid]


class FPGet2OpMod(PrevControl):
    """PrevControl that latches an FPADDBaseData pair combinatorially
    into .o whenever the handshake trigger fires.
    """

    def __init__(self, width, id_wid):
        PrevControl.__init__(self)
        self.width = width
        self.id_wid = id_wid
        self.data_i = self.ispec()
        self.i = self.data_i          # alias for stage-style access
        self.o = self.ospec()

    def ispec(self):
        return FPADDBaseData(self.width, self.id_wid)

    def ospec(self):
        return FPADDBaseData(self.width, self.id_wid)

    def process(self, i):
        return self.o

    def elaborate(self, platform):
        m = PrevControl.elaborate(self, platform)
        with m.If(self.trigger):
            # pass the operands through on a successful handshake
            m.d.comb += self.o.eq(self.data_i)
        return m
class FPGet2Op(FPState):
    """ gets operands: FSM wrapper around FPGet2OpMod.  Latches both
        operands on the handshake trigger and advances to out_state.
    """

    def __init__(self, in_state, out_state, width, id_wid):
        FPState.__init__(self, in_state)
        self.out_state = out_state
        self.mod = FPGet2OpMod(width, id_wid)
        self.o = self.ospec()
        self.in_stb = Signal(reset_less=True)
        self.out_ack = Signal(reset_less=True)
        self.out_decode = Signal(reset_less=True)

    def ispec(self):
        return self.mod.ispec()

    def ospec(self):
        return self.mod.ospec()

    def trigger_setup(self, m, in_stb, in_ack):
        """ links stb/ack
        """
        m.d.comb += self.mod.valid_i.eq(in_stb)
        m.d.comb += in_ack.eq(self.mod.ready_o)

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        m.submodules.get_ops = self.mod
        m.d.comb += self.mod.i.eq(i)
        m.d.comb += self.out_ack.eq(self.mod.ready_o)
        m.d.comb += self.out_decode.eq(self.mod.trigger)

    def process(self, i):
        return self.o

    def action(self, m):
        with m.If(self.out_decode):
            m.next = self.out_state
            m.d.sync += [
                self.mod.ready_o.eq(0),   # acknowledge: drop ready
                self.o.eq(self.mod.o),    # latch operands
            ]
        with m.Else():
            m.d.sync += self.mod.ready_o.eq(1)


class FPNormToPack(FPState, SimpleHandshake):
    """Single-handshake pipeline stage chaining normalise-1, rounding,
    corrections and packing combinatorially (StageChain).
    """

    def __init__(self, width, id_wid):
        FPState.__init__(self, "normalise_1")
        self.id_wid = id_wid
        self.width = width
        SimpleHandshake.__init__(self, self)  # pipeline is its own stage

    def ispec(self):
        return FPAddStage1Data(self.width, self.id_wid)  # Norm1ModSingle ispec

    def ospec(self):
        return FPPackData(self.width, self.id_wid)  # FPPackMod ospec

    def setup(self, m, i):
        """ links module to inputs and outputs
        """

        # Normalisation, Rounding Corrections, Pack - in a chain
        nmod = FPNorm1ModSingle(self.width, self.id_wid)
        rmod = FPRoundMod(self.width, self.id_wid)
        cmod = FPCorrectionsMod(self.width, self.id_wid)
        pmod = FPPackMod(self.width, self.id_wid)
        stages = [nmod, rmod, cmod, pmod]
        chain = StageChain(stages)
        chain.setup(m, i)
        self.out_z = pmod.ospec()

        self.o = pmod.o

    def process(self, i):
        return self.o

    def action(self, m):
        m.d.sync += self.out_z.eq(self.process(None))
        m.next = "pack_put_z"


class FPPackData(Object):
    """Final packed result: the IEEE754 bit-pattern z plus the muxid."""

    def __init__(self, width, id_wid):
        Object.__init__(self)
        self.z = Signal(width, reset_less=True)
        self.mid = Signal(id_wid, reset_less=True)


class FPPackMod(Elaboratable):
    """Combinatorial packing stage: converts s/e/m into the final
    bit-pattern, detecting overflow (-> Inf), or passes through the
    early-out value oz when out_do_z is set.
    """

    def __init__(self, width, id_wid):
        self.width = width
        self.id_wid = id_wid
        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        return FPRoundData(self.width, self.id_wid)

    def ospec(self):
        return FPPackData(self.width, self.id_wid)

    def process(self, i):
        return self.o

    def setup(self, m, in_z):
        """ links module to inputs and outputs
        """
        m.submodules.pack = self
        m.d.comb += self.i.eq(in_z)

    def elaborate(self, platform):
        m = Module()
        z = FPNumOut(self.width, False)
        m.submodules.pack_in_z = self.i.z
        m.submodules.pack_out_z = z
        m.d.comb += self.o.mid.eq(self.i.mid)
        with m.If(~self.i.out_do_z):
            with m.If(self.i.z.is_overflowed):
                m.d.comb += z.inf(self.i.z.s)     # overflow -> Inf
            with m.Else():
                m.d.comb += z.create(self.i.z.s, self.i.z.e, self.i.z.m)
        with m.Else():
            m.d.comb += z.v.eq(self.i.oz)         # early-out result
        m.d.comb += self.o.z.eq(z.v)
        return m


class FPPack(FPState):
    """FSM wrapper around FPPackMod (state "pack")."""

    def __init__(self, width, id_wid):
        FPState.__init__(self, "pack")
        # BUG FIX: FPPackMod requires id_wid; it was not passed,
        # raising TypeError on construction.
        self.mod = FPPackMod(width, id_wid)
        self.out_z = self.ospec()

    def ispec(self):
        return self.mod.ispec()

    def ospec(self):
        return self.mod.ospec()

    def setup(self, m, in_z):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, in_z)

        # BUG FIX: FPPackData has .z (not .v), and the module exposes
        # its output as .o (not .out_z).
        m.d.sync += self.out_z.z.eq(self.mod.o.z)
        m.d.sync += self.out_z.mid.eq(self.mod.o.mid)

    def action(self, m):
        m.next = "pack_put_z"


class FPAddStage1Data:
    """Post-add data: result z, early-out (oz/out_do_z), rounding bits
    (of: Overflow) and muxid.
    """

    def __init__(self, width, id_wid):
        self.z = FPNumBase(width, False)
        self.out_do_z = Signal(reset_less=True)
        self.oz = Signal(width, reset_less=True)
        self.of = Overflow()
        self.mid = Signal(id_wid, reset_less=True)

    def __iter__(self):
        yield from self.z
        yield self.out_do_z
        yield self.oz
        yield from self.of
        yield self.mid

    def eq(self, i):
        return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz),
                self.of.eq(i.of), self.mid.eq(i.mid)]
class FPNorm1Data:
    """Post-normalisation data: result z, the round decision (roundz),
    early-out (oz/out_do_z) and muxid.
    """

    def __init__(self, width, id_wid):
        self.roundz = Signal(reset_less=True)
        self.z = FPNumBase(width, False)
        self.out_do_z = Signal(reset_less=True)
        self.oz = Signal(width, reset_less=True)
        self.mid = Signal(id_wid, reset_less=True)

    def eq(self, i):
        return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz),
                self.roundz.eq(i.roundz), self.mid.eq(i.mid)]


class FPNorm1ModSingle(Elaboratable):
    """Single-cycle (combinatorial) first-stage normalisation.

    Decrease path: counts leading zeros with a PriorityEncoder (on the
    bit-reversed mantissa) and shifts the mantissa up / exponent down in
    one step, clamped so the exponent never drops below the minimum
    non-Inf/NaN value.  Increase path: shifts the mantissa down through
    MultiShiftRMerge (which merges shifted-out bits into the sticky
    position) and raises the exponent to the minimum.
    """

    def __init__(self, width, id_wid):
        self.width = width
        self.id_wid = id_wid
        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        return FPAddStage1Data(self.width, self.id_wid)

    def ospec(self):
        return FPNorm1Data(self.width, self.id_wid)

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        m.submodules.normalise_1 = self
        m.d.comb += self.i.eq(i)

    def process(self, i):
        return self.o

    def elaborate(self, platform):
        m = Module()

        mwid = self.o.z.m_width+2
        pe = PriorityEncoder(mwid)
        m.submodules.norm_pe = pe

        of = Overflow()
        m.d.comb += self.o.roundz.eq(of.roundz)

        m.submodules.norm1_out_z = self.o.z
        m.submodules.norm1_out_overflow = of
        m.submodules.norm1_in_z = self.i.z
        m.submodules.norm1_in_overflow = self.i.of

        i = self.ispec()
        m.submodules.norm1_insel_z = i.z
        m.submodules.norm1_insel_overflow = i.of

        espec = (len(i.z.e), True)
        ediff_n126 = Signal(espec, reset_less=True)
        msr = MultiShiftRMerge(mwid, espec)
        m.submodules.multishift_r = msr

        m.d.comb += i.eq(self.i)
        # initialise out from in (overridden below)
        m.d.comb += self.o.z.eq(i.z)
        m.d.comb += of.eq(i.of)
        # normalisation increase/decrease conditions
        decrease = Signal(reset_less=True)
        increase = Signal(reset_less=True)
        m.d.comb += decrease.eq(i.z.m_msbzero & i.z.exp_gt_n126)
        m.d.comb += increase.eq(i.z.exp_lt_n126)
        # decrease exponent
        with m.If(~self.i.out_do_z):
            with m.If(decrease):
                # *sigh* not entirely obvious: count leading zeros (clz)
                # with a PriorityEncoder: to find from the MSB
                # we reverse the order of the bits.
                temp_m = Signal(mwid, reset_less=True)
                temp_s = Signal(mwid+1, reset_less=True)
                clz = Signal((len(i.z.e), True), reset_less=True)
                # make sure that the amount to decrease by does NOT
                # go below the minimum non-INF/NaN exponent
                limclz = Mux(i.z.exp_sub_n126 > pe.o, pe.o,
                             i.z.exp_sub_n126)
                m.d.comb += [
                    # cat round and guard bits back into the mantissa
                    temp_m.eq(Cat(i.of.round_bit, i.of.guard, i.z.m)),
                    pe.i.eq(temp_m[::-1]),       # inverted
                    clz.eq(limclz),              # count zeros from MSB down
                    temp_s.eq(temp_m << clz),    # shift mantissa UP
                    self.o.z.e.eq(i.z.e - clz),  # DECREASE exponent
                    self.o.z.m.eq(temp_s[2:]),   # exclude bits 0&1
                    of.m0.eq(temp_s[2]),         # copy of mantissa[0]
                    # overflow in bits 0..1: got shifted too (leave sticky)
                    of.guard.eq(temp_s[1]),      # guard
                    of.round_bit.eq(temp_s[0]),  # round
                ]
            # increase exponent
            with m.Elif(increase):
                temp_m = Signal(mwid+1, reset_less=True)
                m.d.comb += [
                    temp_m.eq(Cat(i.of.sticky, i.of.round_bit, i.of.guard,
                                  i.z.m)),
                    ediff_n126.eq(i.z.N126 - i.z.e),
                    # connect multi-shifter to inp/out mantissa (and ediff)
                    msr.inp.eq(temp_m),
                    msr.diff.eq(ediff_n126),
                    self.o.z.m.eq(msr.m[3:]),
                    # BUG FIX: these four previously read temp_s, a signal
                    # that is only driven inside the (mutually-exclusive)
                    # decrease branch and therefore reads as zero here.
                    # The shifted-out bits live in the merge-shifter
                    # output msr.m, mirroring the temp_s[2..0] pattern
                    # of the decrease branch (offset by the sticky bit).
                    of.m0.eq(msr.m[3]),          # copy of mantissa[0]
                    # overflow in bits 0..2: got shifted too (merge sticky)
                    of.guard.eq(msr.m[2]),       # guard
                    of.round_bit.eq(msr.m[1]),   # round
                    of.sticky.eq(msr.m[0]),      # sticky
                    self.o.z.e.eq(i.z.e + ediff_n126),
                ]

        m.d.comb += self.o.mid.eq(self.i.mid)
        m.d.comb += self.o.out_do_z.eq(self.i.out_do_z)
        m.d.comb += self.o.oz.eq(self.i.oz)

        return m
class FPNorm1ModMulti:
    """Multi-cycle first-stage normalisation: performs ONE shift per
    invocation; out_norm signals the FSM to keep looping.  in_select
    chooses between the fresh input (in_z/in_of) and the looped-back
    temporary (temp_z/temp_of).
    """

    def __init__(self, width, single_cycle=True):
        self.width = width
        self.in_select = Signal(reset_less=True)
        self.in_z = FPNumBase(width, False)
        self.in_of = Overflow()
        self.temp_z = FPNumBase(width, False)
        self.temp_of = Overflow()
        self.out_z = FPNumBase(width, False)
        self.out_of = Overflow()
        # BUG FIX: out_norm is driven in elaborate() and read by
        # FPNorm1Multi, but was never created here (AttributeError).
        self.out_norm = Signal(reset_less=True)

    def elaborate(self, platform):
        m = Module()

        m.submodules.norm1_out_z = self.out_z
        m.submodules.norm1_out_overflow = self.out_of
        m.submodules.norm1_temp_z = self.temp_z
        m.submodules.norm1_temp_of = self.temp_of
        m.submodules.norm1_in_z = self.in_z
        m.submodules.norm1_in_overflow = self.in_of

        in_z = FPNumBase(self.width, False)
        in_of = Overflow()
        m.submodules.norm1_insel_z = in_z
        m.submodules.norm1_insel_overflow = in_of

        # select which of temp or in z/of to use
        with m.If(self.in_select):
            m.d.comb += in_z.eq(self.in_z)
            m.d.comb += in_of.eq(self.in_of)
        with m.Else():
            m.d.comb += in_z.eq(self.temp_z)
            m.d.comb += in_of.eq(self.temp_of)
        # initialise out from in (overridden below)
        m.d.comb += self.out_z.eq(in_z)
        m.d.comb += self.out_of.eq(in_of)
        # normalisation increase/decrease conditions
        decrease = Signal(reset_less=True)
        increase = Signal(reset_less=True)
        m.d.comb += decrease.eq(in_z.m_msbzero & in_z.exp_gt_n126)
        m.d.comb += increase.eq(in_z.exp_lt_n126)
        m.d.comb += self.out_norm.eq(decrease | increase)  # loop-end
        # decrease exponent
        with m.If(decrease):
            m.d.comb += [
                self.out_z.e.eq(in_z.e - 1),   # DECREASE exponent
                self.out_z.m.eq(in_z.m << 1),  # shift mantissa UP
                self.out_z.m[0].eq(in_of.guard),  # steal guard (was tot[2])
                self.out_of.guard.eq(in_of.round_bit),  # round (was tot[1])
                self.out_of.round_bit.eq(0),   # reset round bit
                self.out_of.m0.eq(in_of.guard),
            ]
        # increase exponent
        with m.Elif(increase):
            m.d.comb += [
                self.out_z.e.eq(in_z.e + 1),   # INCREASE exponent
                self.out_z.m.eq(in_z.m >> 1),  # shift mantissa DOWN
                self.out_of.guard.eq(in_z.m[0]),
                self.out_of.m0.eq(in_z.m[1]),
                self.out_of.round_bit.eq(in_of.guard),
                self.out_of.sticky.eq(in_of.sticky | in_of.round_bit)
            ]

        return m


class FPNorm1Single(FPState):
    """FSM wrapper around FPNorm1ModSingle (state "normalise_1")."""

    def __init__(self, width, id_wid, single_cycle=True):
        FPState.__init__(self, "normalise_1")
        # BUG FIX: FPNorm1ModSingle requires id_wid; it was not passed,
        # raising TypeError on construction.
        self.mod = FPNorm1ModSingle(width, id_wid)
        self.o = self.ospec()
        self.out_z = FPNumBase(width, False)
        self.out_roundz = Signal(reset_less=True)

    def ispec(self):
        return self.mod.ispec()

    def ospec(self):
        return self.mod.ospec()

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, i)

    def action(self, m):
        m.next = "round"


class FPNorm1Multi(FPState):
    """FSM wrapper around FPNorm1ModMulti: loops in "normalise_1" until
    out_norm deasserts, then latches the round decision and moves on.
    """

    def __init__(self, width, id_wid):
        FPState.__init__(self, "normalise_1")
        self.mod = FPNorm1ModMulti(width)
        self.stb = Signal(reset_less=True)
        self.ack = Signal(reset=0, reset_less=True)
        self.out_norm = Signal(reset_less=True)
        self.in_accept = Signal(reset_less=True)
        self.temp_z = FPNumBase(width)
        self.temp_of = Overflow()
        self.out_z = FPNumBase(width)
        self.out_roundz = Signal(reset_less=True)

    def setup(self, m, in_z, in_of, norm_stb):
        """ links module to inputs and outputs
        """
        # NOTE(review): FPNorm1ModMulti (above) defines no setup() method
        # as shown here, so this call would raise AttributeError -- the
        # wiring it implies (in/temp/out z+of, out_norm) needs to be
        # implemented on the module; confirm against the original repo.
        self.mod.setup(m, in_z, in_of, norm_stb,
                       self.in_accept, self.temp_z, self.temp_of,
                       self.out_z, self.out_norm)

        m.d.comb += self.stb.eq(norm_stb)
        m.d.sync += self.ack.eq(0)  # sets to zero when not in normalise_1 state

    def action(self, m):
        m.d.comb += self.in_accept.eq((~self.ack) & (self.stb))
        m.d.sync += self.temp_of.eq(self.mod.out_of)
        m.d.sync += self.temp_z.eq(self.out_z)
        with m.If(self.out_norm):
            with m.If(self.in_accept):
                m.d.sync += [
                    self.ack.eq(1),
                ]
            with m.Else():
                m.d.sync += self.ack.eq(0)
        with m.Else():
            # normalisation not required (or done).
            m.next = "round"
            m.d.sync += self.ack.eq(1)
            m.d.sync += self.out_roundz.eq(self.mod.out_of.roundz)


class FPNormaliseModSingle:
    """Single-cycle pre-normalisation (decrease-only): counts leading
    zeros with a PriorityEncoder and shifts mantissa up / exponent down
    in one step.
    """

    def __init__(self, width):
        self.width = width
        self.in_z = self.ispec()
        self.out_z = self.ospec()
        # BUG FIX: in_of/out_of are read/driven in elaborate() but were
        # never created, raising AttributeError.
        self.in_of = Overflow()
        self.out_of = Overflow()

    def ispec(self):
        return FPNumBase(self.width, False)

    def ospec(self):
        return FPNumBase(self.width, False)

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        m.submodules.normalise = self
        # BUG FIX: this class has no .i attribute; the input is in_z.
        m.d.comb += self.in_z.eq(i)

    def elaborate(self, platform):
        m = Module()

        mwid = self.out_z.m_width+2
        pe = PriorityEncoder(mwid)
        m.submodules.norm_pe = pe

        m.submodules.norm1_out_z = self.out_z
        m.submodules.norm1_in_z = self.in_z

        in_z = FPNumBase(self.width, False)
        in_of = Overflow()
        m.submodules.norm1_insel_z = in_z
        m.submodules.norm1_insel_overflow = in_of

        espec = (len(in_z.e), True)
        ediff_n126 = Signal(espec, reset_less=True)
        msr = MultiShiftRMerge(mwid, espec)
        m.submodules.multishift_r = msr

        m.d.comb += in_z.eq(self.in_z)
        m.d.comb += in_of.eq(self.in_of)
        # initialise out from in (overridden below)
        m.d.comb += self.out_z.eq(in_z)
        m.d.comb += self.out_of.eq(in_of)
        # normalisation decrease condition
        decrease = Signal(reset_less=True)
        m.d.comb += decrease.eq(in_z.m_msbzero)
        # decrease exponent
        with m.If(decrease):
            # *sigh* not entirely obvious: count leading zeros (clz)
            # with a PriorityEncoder: to find from the MSB
            # we reverse the order of the bits.
            temp_m = Signal(mwid, reset_less=True)
            temp_s = Signal(mwid+1, reset_less=True)
            clz = Signal((len(in_z.e), True), reset_less=True)
            m.d.comb += [
                # cat round and guard bits back into the mantissa
                temp_m.eq(Cat(in_of.round_bit, in_of.guard, in_z.m)),
                pe.i.eq(temp_m[::-1]),        # inverted
                clz.eq(pe.o),                 # count zeros from MSB down
                temp_s.eq(temp_m << clz),     # shift mantissa UP
                self.out_z.e.eq(in_z.e - clz),  # DECREASE exponent
                self.out_z.m.eq(temp_s[2:]),  # exclude bits 0&1
            ]

        return m


class FPPutZ(FPState):
    """Stores the result in out_z, raising valid_o until the downstream
    ready is seen, then drops valid_o and moves to to_state.
    """

    def __init__(self, state, in_z, out_z, in_mid, out_mid, to_state=None):
        FPState.__init__(self, state)
        if to_state is None:
            to_state = "get_ops"
        self.to_state = to_state
        self.in_z = in_z
        self.out_z = out_z
        self.in_mid = in_mid
        self.out_mid = out_mid

    def action(self, m):
        if self.in_mid is not None:
            m.d.sync += self.out_mid.eq(self.in_mid)
        m.d.sync += [
            self.out_z.z.v.eq(self.in_z)
        ]
        with m.If(self.out_z.z.valid_o & self.out_z.z.ready_i_test):
            m.d.sync += self.out_z.z.valid_o.eq(0)
            m.next = self.to_state
        with m.Else():
            m.d.sync += self.out_z.z.valid_o.eq(1)


class FPPutZIdx(FPState):
    """Like FPPutZ but selects the output port from an array (out_zs)
    indexed by the muxid.
    """

    def __init__(self, state, in_z, out_zs, in_mid, to_state=None):
        FPState.__init__(self, state)
        if to_state is None:
            to_state = "get_ops"
        self.to_state = to_state
        self.in_z = in_z
        self.out_zs = out_zs
        self.in_mid = in_mid

    def action(self, m):
        outz_stb = Signal(reset_less=True)
        outz_ack = Signal(reset_less=True)
        m.d.comb += [outz_stb.eq(self.out_zs[self.in_mid].valid_o),
                     outz_ack.eq(self.out_zs[self.in_mid].ready_i_test),
                    ]
        m.d.sync += [
            self.out_zs[self.in_mid].v.eq(self.in_z.v)
        ]
        with m.If(outz_stb & outz_ack):
            m.d.sync += self.out_zs[self.in_mid].valid_o.eq(0)
            m.next = self.to_state
        with m.Else():
            m.d.sync += self.out_zs[self.in_mid].valid_o.eq(1)
class FPRoundData:
    """Post-round data: result z, early-out (oz/out_do_z) and muxid."""

    def __init__(self, width, id_wid):
        self.z = FPNumBase(width, False)
        self.out_do_z = Signal(reset_less=True)
        self.oz = Signal(width, reset_less=True)
        self.mid = Signal(id_wid, reset_less=True)

    def eq(self, i):
        return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz),
                self.mid.eq(i.mid)]


class FPRoundMod(Elaboratable):
    """Combinatorial rounding: when roundz is set, increments the
    mantissa, bumping the exponent if the mantissa was all-ones.
    Skipped entirely when the early-out out_do_z is active.
    """

    def __init__(self, width, id_wid):
        self.width = width
        self.id_wid = id_wid
        self.i = self.ispec()
        self.out_z = self.ospec()

    def ispec(self):
        return FPNorm1Data(self.width, self.id_wid)

    def ospec(self):
        return FPRoundData(self.width, self.id_wid)

    def process(self, i):
        return self.out_z

    def setup(self, m, i):
        m.submodules.roundz = self
        m.d.comb += self.i.eq(i)

    def elaborate(self, platform):
        m = Module()
        m.d.comb += self.out_z.eq(self.i)  # copies mid, z, out_do_z
        with m.If(~self.i.out_do_z):
            with m.If(self.i.roundz):
                m.d.comb += self.out_z.z.m.eq(self.i.z.m + 1)  # mantissa up
                with m.If(self.i.z.m == self.i.z.m1s):  # all 1s
                    m.d.comb += self.out_z.z.e.eq(self.i.z.e + 1)  # exp up

        return m


class FPRound(FPState):
    """FSM wrapper around FPRoundMod (state "round")."""

    def __init__(self, width, id_wid):
        FPState.__init__(self, "round")
        # BUG FIX: FPRoundMod requires id_wid; it was not passed,
        # raising TypeError on construction.
        self.mod = FPRoundMod(width, id_wid)
        self.out_z = self.ospec()

    def ispec(self):
        return self.mod.ispec()

    def ospec(self):
        return self.mod.ospec()

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, i)

        # BUG FIX: dropped the call to self.idsync(m) -- idsync is
        # defined on FPID, not on FPState/FPRound, so the call raised
        # AttributeError.  The mid is carried in out_z below instead.
        m.d.sync += self.out_z.eq(self.mod.out_z)
        # BUG FIX: the module exposes .out_z, not .o
        m.d.sync += self.out_z.mid.eq(self.mod.out_z.mid)

    def action(self, m):
        m.next = "corrections"


def sqrtsimple(num):
    """Bit-by-bit (base-4) integer square root: returns floor(sqrt(num)).

    BUG FIX: the original seeded with `while bit < num: bit <<= 2`,
    which the file's own comment flagged as failing (e.g. num=65536):
    when num is itself a power of four the loop stops one place short,
    and otherwise it can overshoot the largest power of four <= num.
    Seed with `bit <= num` and step back one base-4 place instead.
    """
    res = 0
    bit = 1

    # find the largest power of four <= num
    while bit <= num:
        bit <<= 2
    bit >>= 2

    while bit != 0:
        if num >= res + bit:
            num -= res + bit
            res = (res >> 1) + bit
        else:
            res >>= 1
        bit >>= 2

    return res
# --- continuation of src/add/fsqrt.py (deleted by this commit); the
# --- sqrt() header appears at the end of the previous mashed diff line
# --- and is restored here so the unit is complete.

def sqrt(num):
    """Non-restoring integer square root.

    Returns (root, remainder) such that root*root + remainder == num.
    Iterates from bit 64 downwards, so inputs up to 130 bits are
    supported; self-checked against int(x**0.5) in the demo below.
    """
    D = num     # D is input (from num)
    Q = 0       # quotient (the root, built one bit per iteration)
    R = 0       # remainder
    for i in range(64, -1, -1):     # negative ranges are weird...

        # bring down the next two bits of D
        R = (R << 2) | ((D >> (i + i)) & 3)

        if R >= 0:
            R -= ((Q << 2) | 1)     # -Q01
        else:
            R += ((Q << 2) | 3)     # +Q11

        Q <<= 1
        if R >= 0:
            Q |= 1                  # new Q

    # final non-restoring remainder adjustment
    if R < 0:
        R = R + ((Q << 1) | 1)

    return Q, R


# grabbed these from unit_test_single (convenience, this is just experimenting)

def get_mantissa(x):
    """bottom 23 bits of an FP32 word"""
    return 0x7fffff & x

def get_exponent(x):
    """unbiased exponent of an FP32 word (bias 127)"""
    return ((x & 0x7f800000) >> 23) - 127

def set_exponent(x, e):
    """replace the exponent field of FP32 word x with (rebias of) e"""
    return (x & ~0x7f800000) | ((e + 127) << 23)

def get_sign(x):
    """sign bit of an FP32 word"""
    return ((x & 0x80000000) >> 31)

# convert s/e/m to FP32  (BUGFIX: this comment and the one on decode_fp32
# were swapped in the original)
def create_fp32(s, e, m):
    """ receive sign, exponent, mantissa, return FP32 """
    # BUGFIX: the original called set_exponent() with a single argument,
    # which is a TypeError -- the exponent was silently dropped.
    return set_exponent((s << 31) | get_mantissa(m), e)

# convert FP32 to s/e/m
def decode_fp32(x):
    """ receive FP32, return sign, exponent, mantissa """
    return get_sign(x), get_exponent(x), get_mantissa(x)


# main function, takes mantissa and exponent as separate arguments
# returns a tuple: sqrt'd mantissa, remainder, sqrt'd exponent

def main(mantissa, exponent):
    if exponent & 1 != 0:
        # odd exponent: shift mantissa up, subtract 1 from exp to compensate
        mantissa <<= 1
        exponent -= 1
    m, r = sqrt(mantissa)
    return m, r, exponent >> 1


# normalisation function: crude round-to-nearest on the guard bits
def normalise(s, m, e, lowbits):
    if (lowbits >= 2):
        m += 1
    # NOTE(review): get_mantissa() masks to 23 bits (max 0x7fffff), so it
    # can never equal (1<<24)-1 -- this exponent bump is dead code as
    # written; confirm the intended overflow condition before "fixing".
    if get_mantissa(m) == ((1 << 24) - 1):
        e += 1
    return s, m, e


def fsqrt_test(x):
    """compare our integer sqrt against sfpy Float32.sqrt (x: Float32)"""

    xbits = x.bits
    print ("x", x, type(x))
    sq_test = x.sqrt()
    print ("sqrt", sq_test)

    print (xbits, type(xbits))
    s, e, m = decode_fp32(xbits)
    print("x decode", s, e, m, hex(m))

    m |= 1 << 23    # set top bit (the missing "1" from mantissa)
    m <<= 27

    sm, sr, se = main(m, e)
    lowbits = sm & 0x3
    sm >>= 2
    sm = get_mantissa(sm)
    #sm += 2

    s, sm, se = normalise(s, sm, se, lowbits)

    print("our sqrt", s, se, sm, hex(sm), bin(sm), "lowbits", lowbits,
          "rem", hex(sr))
    if lowbits >= 2:
        print ("probably needs rounding (+1 on mantissa)")

    sq_xbits = sq_test.bits
    s, e, m = decode_fp32(sq_xbits)
    print ("sf32 sqrt", s, e, m, hex(m), bin(m))
    print ()


if __name__ == '__main__':

    # sfpy is only needed for the interactive demo: import it lazily so
    # the pure-python helpers above are usable without the dependency.
    from sfpy import Float32

    # quick test up to 1e4 of the two integer sqrt functions
    # (sqrtsimple is defined earlier in this file)
    for Q in range(1, int(1e4)):
        print(Q, sqrt(Q), sqrtsimple(Q), int(Q**0.5))
        assert int(Q**0.5) == sqrtsimple(Q), "Q sqrtsimpl fail %d" % Q
        assert int(Q**0.5) == sqrt(Q)[0], "Q sqrt fail %d" % Q

    # quick mantissa/exponent demo
    for e in range(26):
        for m in range(26):
            ms, mr, es = main(m, e)
            print("m:%d e:%d sqrt: m:%d-%d e:%d" % (m, e, ms, mr, es))

    for v in (1234.123456789, 32.1, 16.0, 8.0, 8.5,
              3.14159265358979323, 12.99392923123123, 0.123456):
        fsqrt_test(Float32(v))


"""
Notes:
https://pdfs.semanticscholar.org/5060/4e9aff0e37089c4ab9a376c3f35761ffe28b.pdf

//This is the main code of integer sqrt function found here:
//http://verilogcodes.blogspot.com/2017/11/a-verilog-function-for-finding-square-root.html

module testbench;

reg [15:0] sqr;

//Verilog function to find square root of a 32 bit number.
//The output is 16 bit.
function [15:0] sqrt;
    input [31:0] num;  //declare input
    //intermediate signals.
    reg [31:0] a;
    reg [15:0] q;
    reg [17:0] left,right,r;
    integer i;
begin
    //initialize all the variables.
    a = num;
    q = 0;
    i = 0;
    left = 0;   //input to adder/sub
    right = 0;  //input to adder/sub
    r = 0;      //remainder
    //run the calculations for 16 iterations.
    for(i=0;i<16;i=i+1) begin
        right = {q,r[17],1'b1};
        left = {r[15:0],a[31:30]};
        a = {a[29:0],2'b00};    //left shift by 2 bits.
        if (r[17] == 1)         //add if r is negative
            r = left + right;
        else                    //subtract if r is positive
            r = left - right;
        q = {q[14:0],!r[17]};
    end
    sqrt = q;   //final assignment of output.
"""
# (the remainder of the original Notes block -- Verilog end, C and
#  Wikipedia reference versions -- continues on the next diff line)
-end -endfunction //end of Function - - -c version (from paper linked from URL) - -unsigned squart(D, r) /*Non-Restoring sqrt*/ - unsigned D; /*D:32-bit unsigned integer to be square rooted */ - int *r; -{ - unsigned Q = 0; /*Q:16-bit unsigned integer (root)*/ - int R = 0; /*R:17-bit integer (remainder)*/ - int i; - for (i = 15;i>=0;i--) /*for each root bit*/ - { - if (R>=0) - { /*new remainder:*/ - R = R<<2)|((D>>(i+i))&3); - R = R-((Q<<2)|1); /*-Q01*/ - } - else - { /*new remainder:*/ - R = R<<2)|((D>>(i+i))&3); - R = R+((Q<<2)|3); /*+Q11*/ - } - if (R>=0) Q = Q<<1)|1; /*new Q:*/ - else Q = Q<<1)|0; /*new Q:*/ - } - - /*remainder adjusting*/ - if (R<0) R = R+((Q<<1)|1); - *r = R; /*return remainder*/ - return(Q); /*return root*/ -} - -From wikipedia page: - -short isqrt(short num) { - short res = 0; - short bit = 1 << 14; // The second-to-top bit is set: 1 << 30 for 32 bits - - // "bit" starts at the highest power of four <= the argument. - while (bit > num) - bit >>= 2; - - while (bit != 0) { - if (num >= res + bit) { - num -= res + bit; - res = (res >> 1) + bit; - } - else - res >>= 1; - bit >>= 2; - } - return res; -} - -""" diff --git a/src/add/function_unit.py b/src/add/function_unit.py deleted file mode 100644 index 108c84f3..00000000 --- a/src/add/function_unit.py +++ /dev/null @@ -1,44 +0,0 @@ -from nmigen import Signal, Cat, Const, Mux, Module, Array -from nmigen.cli import main, verilog - -from nmigen_add_experiment import FPADD -from rstation_row import ReservationStationRow - -from math import log - -class FunctionUnit: - - def __init__(self, width, num_units): - """ Function Unit - - * width: bit-width of IEEE754. 
class FunctionUnit:
    # NOTE(review): reconstruction of deleted src/add/function_unit.py;
    # this mashed diff line starts mid-docstring, so the class header
    # (which sits at the end of the previous line) is restored here.

    def __init__(self, width, num_units):
        """ Function Unit

            * width: bit-width of IEEE754.  supported: 16, 32, 64
            * num_units: number of Reservation Stations
        """
        self.width = width

        fus = []
        bsz = int(log(width) / log(2))  # bits needed to number each RS
        for i in range(num_units):
            mid = Const(i, bsz)         # reservation-station (mux) id
            rs = ReservationStationRow(width, mid)
            rs.name = "RS%d" % i
            fus.append(rs)
        self.fus = Array(fus)

    def elaborate(self, platform=None):
        """ creates the HDL code-fragment for FunctionUnit
            (original docstring said ReservationStationRow: copy-paste slip)
        """
        m = Module()

        return m


if __name__ == "__main__":
    # BUGFIX: both calls below were missing their closing parenthesis
    # (SyntaxError), and main() was passed the undefined name "alu";
    # the ReservationStationRow instance is what was constructed.
    rs = ReservationStationRow(width=32, id_wid=Const(1, 4))
    main(rs, ports=[rs.in_a, rs.in_b, rs.out_z])

    # works... but don't use, just do "python fname.py convert -t v"
    #print (verilog.convert(alu, ports=[
    #        ports=alu.in_a.ports() + \
    #              alu.in_b.ports() + \
    #              alu.out_z.ports())

# --- src/add/inputgroup.py (also deleted by this commit) ---------------

from nmigen import Module, Signal, Cat, Array, Const
from nmigen.lib.coding import PriorityEncoder
from math import log

from fpbase import Trigger


class FPGetSyncOpsMod:
    """ synchroniser for a group of operands: the outputs follow the
        inputs (combinatorially) only while ack is asserted and every
        stb bit is set.
    """
    def __init__(self, width, num_ops=2):
        self.width = width
        self.num_ops = num_ops
        inops = []
        outops = []
        for i in range(num_ops):
            inops.append(Signal(width, reset_less=True))
            outops.append(Signal(width, reset_less=True))
        self.in_op = inops      # one input Signal per operand
        self.out_op = outops    # one output Signal per operand
        self.stb = Signal(num_ops)
        self.ack = Signal()
        self.ready = Signal(reset_less=True)
        self.out_decode = Signal(reset_less=True)

    def elaborate(self, platform):
        m = Module()
        # ready only when every stb bit is set (all-ones compare)
        m.d.comb += self.ready.eq(self.stb == Const(-1, (self.num_ops, False)))
        m.d.comb += self.out_decode.eq(self.ack & self.ready)
        with m.If(self.out_decode):
            for i in range(self.num_ops):
                m.d.comb += [
                    self.out_op[i].eq(self.in_op[i]),
                ]
        return m

    def ports(self):
        return self.in_op + self.out_op + [self.stb, self.ack]


class FPOps(Trigger):
    def __init__(self, width, num_ops):
        Trigger.__init__(self)
self.width = width - self.num_ops = num_ops - - res = [] - for i in range(num_ops): - res.append(Signal(width)) - self.v = Array(res) - - def ports(self): - res = [] - for i in range(self.num_ops): - res.append(self.v[i]) - res.append(self.ack) - res.append(self.stb) - return res - - -class InputGroup: - def __init__(self, width, num_ops=2, num_rows=4): - self.width = width - self.num_ops = num_ops - self.num_rows = num_rows - self.mmax = int(log(self.num_rows) / log(2)) - self.rs = [] - self.mid = Signal(self.mmax, reset_less=True) # multiplex id - for i in range(num_rows): - self.rs.append(FPGetSyncOpsMod(width, num_ops)) - self.rs = Array(self.rs) - - self.out_op = FPOps(width, num_ops) - - def elaborate(self, platform): - m = Module() - - pe = PriorityEncoder(self.num_rows) - m.submodules.selector = pe - m.submodules.out_op = self.out_op - m.submodules += self.rs - - # connect priority encoder - in_ready = [] - for i in range(self.num_rows): - in_ready.append(self.rs[i].ready) - m.d.comb += pe.i.eq(Cat(*in_ready)) - - active = Signal(reset_less=True) - out_en = Signal(reset_less=True) - m.d.comb += active.eq(~pe.n) # encoder active - m.d.comb += out_en.eq(active & self.out_op.trigger) - - # encoder active: ack relevant input, record MID, pass output - with m.If(out_en): - rs = self.rs[pe.o] - m.d.sync += self.mid.eq(pe.o) - m.d.sync += rs.ack.eq(0) - m.d.sync += self.out_op.stb.eq(0) - for j in range(self.num_ops): - m.d.sync += self.out_op.v[j].eq(rs.out_op[j]) - with m.Else(): - m.d.sync += self.out_op.stb.eq(1) - # acks all default to zero - for i in range(self.num_rows): - m.d.sync += self.rs[i].ack.eq(1) - - return m - - def ports(self): - res = [] - for i in range(self.num_rows): - inop = self.rs[i] - res += inop.in_op + [inop.stb] - return self.out_op.ports() + res + [self.mid] - - diff --git a/src/add/iocontrol.py b/src/add/iocontrol.py deleted file mode 100644 index 3d823c9b..00000000 --- a/src/add/iocontrol.py +++ /dev/null @@ -1,306 +0,0 @@ -""" IO 
Control API - - Associated development bugs: - * http://bugs.libre-riscv.org/show_bug.cgi?id=64 - * http://bugs.libre-riscv.org/show_bug.cgi?id=57 - - Stage API: - --------- - - stage requires compliance with a strict API that may be - implemented in several means, including as a static class. - - Stages do not HOLD data, and they definitely do not contain - signalling (ready/valid). They do however specify the FORMAT - of the incoming and outgoing data, and they provide a means to - PROCESS that data (from incoming format to outgoing format). - - Stage Blocks really must be combinatorial blocks. It would be ok - to have input come in from sync'd sources (clock-driven) however by - doing so they would no longer be deterministic, and chaining such - blocks with such side-effects together could result in unexpected, - unpredictable, unreproduceable behaviour. - So generally to be avoided, then unless you know what you are doing. - - the methods of a stage instance must be as follows: - - * ispec() - Input data format specification. Takes a bit of explaining. - The requirements are: something that eventually derives from - nmigen Value must be returned *OR* an iterator or iterable - or sequence (list, tuple etc.) or generator must *yield* - thing(s) that (eventually) derive from the nmigen Value class. - - Complex to state, very simple in practice: - see test_buf_pipe.py for over 25 worked examples. - - * ospec() - Output data format specification. - format requirements identical to ispec. - - * process(m, i) - Optional function for processing ispec-formatted data. - returns a combinatorial block of a result that - may be assigned to the output, by way of the "nmoperator.eq" - function. Note that what is returned here can be - extremely flexible. Even a dictionary can be returned - as long as it has fields that match precisely with the - Record into which its values is intended to be assigned. - Again: see example unit tests for details. 
- - * setup(m, i) - Optional function for setting up submodules. - may be used for more complex stages, to link - the input (i) to submodules. must take responsibility - for adding those submodules to the module (m). - the submodules must be combinatorial blocks and - must have their inputs and output linked combinatorially. - - Both StageCls (for use with non-static classes) and Stage (for use - by static classes) are abstract classes from which, for convenience - and as a courtesy to other developers, anything conforming to the - Stage API may *choose* to derive. See Liskov Substitution Principle: - https://en.wikipedia.org/wiki/Liskov_substitution_principle - - StageChain: - ---------- - - A useful combinatorial wrapper around stages that chains them together - and then presents a Stage-API-conformant interface. By presenting - the same API as the stages it wraps, it can clearly be used recursively. - - ControlBase: - ----------- - - The base class for pipelines. Contains previous and next ready/valid/data. - Also has an extremely useful "connect" function that can be used to - connect a chain of pipelines and present the exact same prev/next - ready/valid/data API. - - Note: pipelines basically do not become pipelines as such until - handed to a derivative of ControlBase. ControlBase itself is *not* - strictly considered a pipeline class. Wishbone and AXI4 (master or - slave) could be derived from ControlBase, for example. 
-""" - -from nmigen import Signal, Cat, Const, Module, Value, Elaboratable -from nmigen.cli import verilog, rtlil -from nmigen.hdl.rec import Record - -from collections.abc import Sequence, Iterable -from collections import OrderedDict - -import nmoperator - - -class Object: - def __init__(self): - self.fields = OrderedDict() - - def __setattr__(self, k, v): - print ("kv", k, v) - if (k.startswith('_') or k in ["fields", "name", "src_loc"] or - k in dir(Object) or "fields" not in self.__dict__): - return object.__setattr__(self, k, v) - self.fields[k] = v - - def __getattr__(self, k): - if k in self.__dict__: - return object.__getattr__(self, k) - try: - return self.fields[k] - except KeyError as e: - raise AttributeError(e) - - def __iter__(self): - for x in self.fields.values(): # OrderedDict so order is preserved - if isinstance(x, Iterable): - yield from x - else: - yield x - - def eq(self, inp): - res = [] - for (k, o) in self.fields.items(): - i = getattr(inp, k) - print ("eq", o, i) - rres = o.eq(i) - if isinstance(rres, Sequence): - res += rres - else: - res.append(rres) - print (res) - return res - - def ports(self): # being called "keys" would be much better - return list(self) - - -class RecordObject(Record): - def __init__(self, layout=None, name=None): - Record.__init__(self, layout=layout or [], name=None) - - def __setattr__(self, k, v): - #print (dir(Record)) - if (k.startswith('_') or k in ["fields", "name", "src_loc"] or - k in dir(Record) or "fields" not in self.__dict__): - return object.__setattr__(self, k, v) - self.fields[k] = v - #print ("RecordObject setattr", k, v) - if isinstance(v, Record): - newlayout = {k: (k, v.layout)} - elif isinstance(v, Value): - newlayout = {k: (k, v.shape())} - else: - newlayout = {k: (k, nmoperator.shape(v))} - self.layout.fields.update(newlayout) - - def __iter__(self): - for x in self.fields.values(): # remember: fields is an OrderedDict - if isinstance(x, Iterable): - yield from x # a bit like flatten 
(nmigen.tools) - else: - yield x - - def ports(self): # would be better being called "keys" - return list(self) - - -class PrevControl(Elaboratable): - """ contains signals that come *from* the previous stage (both in and out) - * valid_i: previous stage indicating all incoming data is valid. - may be a multi-bit signal, where all bits are required - to be asserted to indicate "valid". - * ready_o: output to next stage indicating readiness to accept data - * data_i : an input - MUST be added by the USER of this class - """ - - def __init__(self, i_width=1, stage_ctl=False): - self.stage_ctl = stage_ctl - self.valid_i = Signal(i_width, name="p_valid_i") # prev >>in self - self._ready_o = Signal(name="p_ready_o") # prev < 1: - # multi-bit case: valid only when valid_i is all 1s - all1s = Const(-1, (len(self.valid_i), False)) - valid_i = (self.valid_i == all1s) - else: - # single-bit valid_i case - valid_i = self.valid_i - - # when stage indicates not ready, incoming data - # must "appear" to be not ready too - if self.stage_ctl: - valid_i = valid_i & self.s_ready_o - - return valid_i - - def elaborate(self, platform): - m = Module() - m.d.comb += self.trigger.eq(self.valid_i_test & self.ready_o) - return m - - def eq(self, i): - return [nmoperator.eq(self.data_i, i.data_i), - self.ready_o.eq(i.ready_o), - self.valid_i.eq(i.valid_i)] - - def __iter__(self): - yield self.valid_i - yield self.ready_o - if hasattr(self.data_i, "ports"): - yield from self.data_i.ports() - elif isinstance(self.data_i, Sequence): - yield from self.data_i - else: - yield self.data_i - - def ports(self): - return list(self) - - -class NextControl(Elaboratable): - """ contains the signals that go *to* the next stage (both in and out) - * valid_o: output indicating to next stage that data is valid - * ready_i: input from next stage indicating that it can accept data - * data_o : an output - MUST be added by the USER of this class - """ - def __init__(self, stage_ctl=False): - self.stage_ctl = 
stage_ctl - self.valid_o = Signal(name="n_valid_o") # self out>> next - self.ready_i = Signal(name="n_ready_i") # self < 1: - r_data = Array(r_data) - p_valid_i = Array(p_valid_i) - n_ready_in = Array(n_ready_in) - data_valid = Array(data_valid) - - nirn = Signal(reset_less=True) - m.d.comb += nirn.eq(~self.n.ready_i) - mid = self.p_mux.m_id - for i in range(p_len): - m.d.comb += data_valid[i].eq(0) - m.d.comb += n_ready_in[i].eq(1) - m.d.comb += p_valid_i[i].eq(0) - m.d.comb += self.p[i].ready_o.eq(0) - m.d.comb += p_valid_i[mid].eq(self.p_mux.active) - m.d.comb += self.p[mid].ready_o.eq(~data_valid[mid] | self.n.ready_i) - m.d.comb += n_ready_in[mid].eq(nirn & data_valid[mid]) - anyvalid = Signal(i, reset_less=True) - av = [] - for i in range(p_len): - av.append(data_valid[i]) - anyvalid = Cat(*av) - m.d.comb += self.n.valid_o.eq(anyvalid.bool()) - m.d.comb += data_valid[mid].eq(p_valid_i[mid] | \ - (n_ready_in[mid] & data_valid[mid])) - - for i in range(p_len): - vr = Signal(reset_less=True) - m.d.comb += vr.eq(self.p[i].valid_i & self.p[i].ready_o) - with m.If(vr): - m.d.comb += eq(r_data[i], self.p[i].data_i) - - m.d.comb += eq(self.n.data_o, self.process(r_data[mid])) - - return m - - -class CombMuxOutPipe(CombMultiOutPipeline): - def __init__(self, stage, n_len): - # HACK: stage is also the n-way multiplexer - CombMultiOutPipeline.__init__(self, stage, n_len=n_len, n_mux=stage) - - # HACK: n-mux is also the stage... 
so set the muxid equal to input mid - stage.m_id = self.p.data_i.mid - - - -class InputPriorityArbiter(Elaboratable): - """ arbitration module for Input-Mux pipe, baed on PriorityEncoder - """ - def __init__(self, pipe, num_rows): - self.pipe = pipe - self.num_rows = num_rows - self.mmax = int(log(self.num_rows) / log(2)) - self.m_id = Signal(self.mmax, reset_less=True) # multiplex id - self.active = Signal(reset_less=True) - - def elaborate(self, platform): - m = Module() - - assert len(self.pipe.p) == self.num_rows, \ - "must declare input to be same size" - pe = PriorityEncoder(self.num_rows) - m.submodules.selector = pe - - # connect priority encoder - in_ready = [] - for i in range(self.num_rows): - p_valid_i = Signal(reset_less=True) - m.d.comb += p_valid_i.eq(self.pipe.p[i].valid_i_test) - in_ready.append(p_valid_i) - m.d.comb += pe.i.eq(Cat(*in_ready)) # array of input "valids" - m.d.comb += self.active.eq(~pe.n) # encoder active (one input valid) - m.d.comb += self.m_id.eq(pe.o) # output one active input - - return m - - def ports(self): - return [self.m_id, self.active] - - - -class PriorityCombMuxInPipe(CombMultiInPipeline): - """ an example of how to use the combinatorial pipeline. 
- """ - - def __init__(self, stage, p_len=2): - p_mux = InputPriorityArbiter(self, p_len) - CombMultiInPipeline.__init__(self, stage, p_len, p_mux) - - -if __name__ == '__main__': - - dut = PriorityCombMuxInPipe(ExampleStage) - vl = rtlil.convert(dut, ports=dut.ports()) - with open("test_combpipe.il", "w") as f: - f.write(vl) diff --git a/src/add/nmigen_add_experiment.py b/src/add/nmigen_add_experiment.py deleted file mode 100644 index ecb1d35b..00000000 --- a/src/add/nmigen_add_experiment.py +++ /dev/null @@ -1,28 +0,0 @@ -# IEEE Floating Point Adder (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from nmigen.cli import main, verilog -from fpadd.statemachine import FPADDBase, FPADD -from fpadd.pipeline import FPADDMuxInOut - -if __name__ == "__main__": - if True: - alu = FPADD(width=32, id_wid=5, single_cycle=True) - main(alu, ports=alu.rs[0][0].ports() + \ - alu.rs[0][1].ports() + \ - alu.res[0].ports() + \ - [alu.ids.in_mid, alu.ids.out_mid]) - else: - alu = FPADDBase(width=32, id_wid=5, single_cycle=True) - main(alu, ports=[alu.in_a, alu.in_b] + \ - alu.in_t.ports() + \ - alu.out_z.ports() + \ - [alu.in_mid, alu.out_mid]) - - - # works... 
but don't use, just do "python fname.py convert -t v" - #print (verilog.convert(alu, ports=[ - # ports=alu.in_a.ports() + \ - # alu.in_b.ports() + \ - # alu.out_z.ports()) diff --git a/src/add/nmigen_div_experiment.py b/src/add/nmigen_div_experiment.py deleted file mode 100644 index a7e215cb..00000000 --- a/src/add/nmigen_div_experiment.py +++ /dev/null @@ -1,246 +0,0 @@ -# IEEE Floating Point Divider (Single Precision) -# Copyright (C) Jonathan P Dawson 2013 -# 2013-12-12 - -from nmigen import Module, Signal, Const, Cat -from nmigen.cli import main, verilog - -from fpbase import FPNumIn, FPNumOut, FPOpIn, FPOpOut, Overflow, FPBase, FPState -from singlepipe import eq - -class Div: - def __init__(self, width): - self.width = width - self.quot = Signal(width) # quotient - self.dor = Signal(width) # divisor - self.dend = Signal(width) # dividend - self.rem = Signal(width) # remainder - self.count = Signal(7) # loop count - - self.czero = Const(0, width) - - def reset(self, m): - m.d.sync += [ - self.quot.eq(self.czero), - self.rem.eq(self.czero), - self.count.eq(Const(0, 7)) - ] - - -class FPDIV(FPBase): - - def __init__(self, width): - FPBase.__init__(self) - self.width = width - - self.in_a = FPOpIn(width) - self.in_b = FPOpIn(width) - self.out_z = FPOpOut(width) - - self.states = [] - - def add_state(self, state): - self.states.append(state) - return state - - def elaborate(self, platform=None): - """ creates the HDL code-fragment for FPDiv - """ - m = Module() - - # Latches - a = FPNumIn(None, self.width, False) - b = FPNumIn(None, self.width, False) - z = FPNumOut(self.width, False) - - div = Div(a.m_width*2 + 3) # double the mantissa width plus g/r/sticky - - of = Overflow() - m.submodules.in_a = a - m.submodules.in_b = b - m.submodules.z = z - m.submodules.of = of - - m.d.comb += a.v.eq(self.in_a.v) - m.d.comb += b.v.eq(self.in_b.v) - - with m.FSM() as fsm: - - # ****** - # gets operand a - - with m.State("get_a"): - res = self.get_op(m, self.in_a, a, "get_b") 
- m.d.sync += eq([a, self.in_a.ready_o], res) - - # ****** - # gets operand b - - with m.State("get_b"): - res = self.get_op(m, self.in_b, b, "special_cases") - m.d.sync += eq([b, self.in_b.ready_o], res) - - # ****** - # special cases: NaNs, infs, zeros, denormalised - # NOTE: some of these are unique to div. see "Special Operations" - # https://steve.hollasch.net/cgindex/coding/ieeefloat.html - - with m.State("special_cases"): - - # if a is NaN or b is NaN return NaN - with m.If(a.is_nan | b.is_nan): - m.next = "put_z" - m.d.sync += z.nan(1) - - # if a is Inf and b is Inf return NaN - with m.Elif(a.is_inf & b.is_inf): - m.next = "put_z" - m.d.sync += z.nan(1) - - # if a is inf return inf (or NaN if b is zero) - with m.Elif(a.is_inf): - m.next = "put_z" - m.d.sync += z.inf(a.s ^ b.s) - - # if b is inf return zero - with m.Elif(b.is_inf): - m.next = "put_z" - m.d.sync += z.zero(a.s ^ b.s) - - # if a is zero return zero (or NaN if b is zero) - with m.Elif(a.is_zero): - m.next = "put_z" - # if b is zero return NaN - with m.If(b.is_zero): - m.d.sync += z.nan(1) - with m.Else(): - m.d.sync += z.zero(a.s ^ b.s) - - # if b is zero return Inf - with m.Elif(b.is_zero): - m.next = "put_z" - m.d.sync += z.inf(a.s ^ b.s) - - # Denormalised Number checks - with m.Else(): - m.next = "normalise_a" - self.denormalise(m, a) - self.denormalise(m, b) - - # ****** - # normalise_a - - with m.State("normalise_a"): - self.op_normalise(m, a, "normalise_b") - - # ****** - # normalise_b - - with m.State("normalise_b"): - self.op_normalise(m, b, "divide_0") - - # ****** - # First stage of divide. initialise state - - with m.State("divide_0"): - m.next = "divide_1" - m.d.sync += [ - z.s.eq(a.s ^ b.s), # sign - z.e.eq(a.e - b.e), # exponent - div.dend.eq(a.m<<(a.m_width+3)), # 3 bits for g/r/sticky - div.dor.eq(b.m), - ] - div.reset(m) - - # ****** - # Second stage of divide. 
- - with m.State("divide_1"): - m.next = "divide_2" - m.d.sync += [ - div.quot.eq(div.quot << 1), - div.rem.eq(Cat(div.dend[-1], div.rem[0:])), - div.dend.eq(div.dend << 1), - ] - - # ****** - # Third stage of divide. - # This stage ends by jumping out to divide_3 - # However it defaults to jumping to divide_1 (which comes back here) - - with m.State("divide_2"): - with m.If(div.rem >= div.dor): - m.d.sync += [ - div.quot[0].eq(1), - div.rem.eq(div.rem - div.dor), - ] - with m.If(div.count == div.width-2): - m.next = "divide_3" - with m.Else(): - m.next = "divide_1" - m.d.sync += [ - div.count.eq(div.count + 1), - ] - - # ****** - # Fourth stage of divide. - - with m.State("divide_3"): - m.next = "normalise_1" - m.d.sync += [ - z.m.eq(div.quot[3:]), - of.guard.eq(div.quot[2]), - of.round_bit.eq(div.quot[1]), - of.sticky.eq(div.quot[0] | (div.rem != 0)) - ] - - # ****** - # First stage of normalisation. - - with m.State("normalise_1"): - self.normalise_1(m, z, of, "normalise_2") - - # ****** - # Second stage of normalisation. - - with m.State("normalise_2"): - self.normalise_2(m, z, of, "round") - - # ****** - # rounding stage - - with m.State("round"): - self.roundz(m, z, of.roundz) - m.next = "corrections" - - # ****** - # correction stage - - with m.State("corrections"): - self.corrections(m, z, "pack") - - # ****** - # pack stage - - with m.State("pack"): - self.pack(m, z, "put_z") - - # ****** - # put_z stage - - with m.State("put_z"): - self.put_z(m, z, self.out_z, "get_a") - - return m - - -if __name__ == "__main__": - alu = FPDIV(width=32) - main(alu, ports=alu.in_a.ports() + alu.in_b.ports() + alu.out_z.ports()) - - - # works... 
but don't use, just do "python fname.py convert -t v" - #print (verilog.convert(alu, ports=[ - # ports=alu.in_a.ports() + \ - # alu.in_b.ports() + \ - # alu.out_z.ports()) diff --git a/src/add/nmoperator.py b/src/add/nmoperator.py deleted file mode 100644 index bd5e5544..00000000 --- a/src/add/nmoperator.py +++ /dev/null @@ -1,171 +0,0 @@ -""" nmigen operator functions / utils - - eq: - -- - - a strategically very important function that is identical in function - to nmigen's Signal.eq function, except it may take objects, or a list - of objects, or a tuple of objects, and where objects may also be - Records. -""" - -from nmigen import Signal, Cat, Const, Mux, Module, Value, Elaboratable -from nmigen.cli import verilog, rtlil -from nmigen.lib.fifo import SyncFIFO, SyncFIFOBuffered -from nmigen.hdl.ast import ArrayProxy -from nmigen.hdl.rec import Record, Layout - -from abc import ABCMeta, abstractmethod -from collections.abc import Sequence, Iterable -from collections import OrderedDict -from queue import Queue -import inspect - - -class Visitor2: - """ a helper class for iterating twin-argument compound data structures. - - Record is a special (unusual, recursive) case, where the input may be - specified as a dictionary (which may contain further dictionaries, - recursively), where the field names of the dictionary must match - the Record's field spec. Alternatively, an object with the same - member names as the Record may be assigned: it does not have to - *be* a Record. - - ArrayProxy is also special-cased, it's a bit messy: whilst ArrayProxy - has an eq function, the object being assigned to it (e.g. a python - object) might not. despite the *input* having an eq function, - that doesn't help us, because it's the *ArrayProxy* that's being - assigned to. so.... we cheat. use the ports() function of the - python object, enumerate them, find out the list of Signals that way, - and assign them. 
- """ - def iterator2(self, o, i): - if isinstance(o, dict): - yield from self.dict_iter2(o, i) - - if not isinstance(o, Sequence): - o, i = [o], [i] - for (ao, ai) in zip(o, i): - #print ("visit", fn, ao, ai) - if isinstance(ao, Record): - yield from self.record_iter2(ao, ai) - elif isinstance(ao, ArrayProxy) and not isinstance(ai, Value): - yield from self.arrayproxy_iter2(ao, ai) - else: - yield (ao, ai) - - def dict_iter2(self, o, i): - for (k, v) in o.items(): - print ("d-iter", v, i[k]) - yield (v, i[k]) - return res - - def _not_quite_working_with_all_unit_tests_record_iter2(self, ao, ai): - print ("record_iter2", ao, ai, type(ao), type(ai)) - if isinstance(ai, Value): - if isinstance(ao, Sequence): - ao, ai = [ao], [ai] - for o, i in zip(ao, ai): - yield (o, i) - return - for idx, (field_name, field_shape, _) in enumerate(ao.layout): - if isinstance(field_shape, Layout): - val = ai.fields - else: - val = ai - if hasattr(val, field_name): # check for attribute - val = getattr(val, field_name) - else: - val = val[field_name] # dictionary-style specification - yield from self.iterator2(ao.fields[field_name], val) - - def record_iter2(self, ao, ai): - for idx, (field_name, field_shape, _) in enumerate(ao.layout): - if isinstance(field_shape, Layout): - val = ai.fields - else: - val = ai - if hasattr(val, field_name): # check for attribute - val = getattr(val, field_name) - else: - val = val[field_name] # dictionary-style specification - yield from self.iterator2(ao.fields[field_name], val) - - def arrayproxy_iter2(self, ao, ai): - for p in ai.ports(): - op = getattr(ao, p.name) - print ("arrayproxy - p", p, p.name) - yield from self.iterator2(op, p) - - -class Visitor: - """ a helper class for iterating single-argument compound data structures. - similar to Visitor2. 
- """ - def iterate(self, i): - """ iterate a compound structure recursively using yield - """ - if not isinstance(i, Sequence): - i = [i] - for ai in i: - #print ("iterate", ai) - if isinstance(ai, Record): - #print ("record", list(ai.layout)) - yield from self.record_iter(ai) - elif isinstance(ai, ArrayProxy) and not isinstance(ai, Value): - yield from self.array_iter(ai) - else: - yield ai - - def record_iter(self, ai): - for idx, (field_name, field_shape, _) in enumerate(ai.layout): - if isinstance(field_shape, Layout): - val = ai.fields - else: - val = ai - if hasattr(val, field_name): # check for attribute - val = getattr(val, field_name) - else: - val = val[field_name] # dictionary-style specification - #print ("recidx", idx, field_name, field_shape, val) - yield from self.iterate(val) - - def array_iter(self, ai): - for p in ai.ports(): - yield from self.iterate(p) - - -def eq(o, i): - """ makes signals equal: a helper routine which identifies if it is being - passed a list (or tuple) of objects, or signals, or Records, and calls - the objects' eq function. - """ - res = [] - for (ao, ai) in Visitor2().iterator2(o, i): - rres = ao.eq(ai) - if not isinstance(rres, Sequence): - rres = [rres] - res += rres - return res - - -def shape(i): - #print ("shape", i) - r = 0 - for part in list(i): - #print ("shape?", part) - s, _ = part.shape() - r += s - return r, False - - -def cat(i): - """ flattens a compound structure recursively using Cat - """ - from nmigen.tools import flatten - #res = list(flatten(i)) # works (as of nmigen commit f22106e5) HOWEVER... - res = list(Visitor().iterate(i)) # needed because input may be a sequence - return Cat(*res) - - diff --git a/src/add/pipeline.py b/src/add/pipeline.py deleted file mode 100644 index afcee743..00000000 --- a/src/add/pipeline.py +++ /dev/null @@ -1,394 +0,0 @@ -""" Example 5: Making use of PyRTL and Introspection. 
""" - -from collections.abc import Sequence - -from nmigen import Signal -from nmigen.hdl.rec import Record -from nmigen import tracer -from nmigen.compat.fhdl.bitcontainer import value_bits_sign -from contextlib import contextmanager - -from nmoperator import eq -from singlepipe import StageCls, ControlBase, BufferedHandshake -from singlepipe import UnbufferedPipeline - - -# The following example shows how pyrtl can be used to make some interesting -# hardware structures using python introspection. In particular, this example -# makes a N-stage pipeline structure. Any specific pipeline is then a derived -# class of SimplePipeline where methods with names starting with "stage" are -# stages, and new members with names not starting with "_" are to be registered -# for the next stage. - -def like(value, rname, pipe, pipemode=False): - if isinstance(value, ObjectProxy): - return ObjectProxy.like(pipe, value, pipemode=pipemode, - name=rname, reset_less=True) - else: - return Signal(value_bits_sign(value), name=rname, - reset_less=True) - return Signal.like(value, name=rname, reset_less=True) - -def get_assigns(_assigns): - assigns = [] - for e in _assigns: - if isinstance(e, ObjectProxy): - assigns += get_assigns(e._assigns) - else: - assigns.append(e) - return assigns - - -def get_eqs(_eqs): - eqs = [] - for e in _eqs: - if isinstance(e, ObjectProxy): - eqs += get_eqs(e._eqs) - else: - eqs.append(e) - return eqs - - -class ObjectProxy: - def __init__(self, m, name=None, pipemode=False, syncmode=True): - self._m = m - if name is None: - name = tracer.get_var_name(default=None) - self.name = name - self._pipemode = pipemode - self._syncmode = syncmode - self._eqs = {} - self._assigns = [] - self._preg_map = {} - - @classmethod - def like(cls, m, value, pipemode=False, name=None, src_loc_at=0, **kwargs): - name = name or tracer.get_var_name(depth=2 + src_loc_at, - default="$like") - - src_loc_at_1 = 1 + src_loc_at - r = ObjectProxy(m, value.name, pipemode) - #for a, 
aname in value._preg_map.items(): - # r._preg_map[aname] = like(a, aname, m, pipemode) - for a in value.ports(): - aname = a.name - r._preg_map[aname] = like(a, aname, m, pipemode) - return r - - def __repr__(self): - subobjs = [] - for a in self.ports(): - aname = a.name - ai = self._preg_map[aname] - subobjs.append(repr(ai)) - return "" % subobjs - - def get_specs(self, liked=False): - res = [] - for k, v in self._preg_map.items(): - #v = like(v, k, stage._m) - res.append(v) - if isinstance(v, ObjectProxy): - res += v.get_specs() - return res - - def eq(self, i): - print ("ObjectProxy eq", self, i) - res = [] - for a in self.ports(): - aname = a.name - ai = i._preg_map[aname] - res.append(a.eq(ai)) - return res - - def ports(self): - res = [] - for aname, a in self._preg_map.items(): - if isinstance(a, Signal) or isinstance(a, ObjectProxy) or \ - isinstance(a, Record): - res.append(a) - #print ("ObjectPorts", res) - return res - - def __getattr__(self, name): - try: - v = self._preg_map[name] - return v - #return like(v, name, self._m) - except KeyError: - raise AttributeError( - 'error, no pipeline register "%s" defined for OP %s' - % (name, self.name)) - - def __setattr__(self, name, value): - if name.startswith('_') or name in ['name', 'ports', 'eq', 'like']: - # do not do anything tricky with variables starting with '_' - object.__setattr__(self, name, value) - return - #rname = "%s_%s" % (self.name, name) - rname = name - new_pipereg = like(value, rname, self._m, self._pipemode) - self._preg_map[name] = new_pipereg - #object.__setattr__(self, name, new_pipereg) - if self._pipemode: - #print ("OP pipemode", self._syncmode, new_pipereg, value) - assign = eq(new_pipereg, value) - if self._syncmode: - self._m.d.sync += assign - else: - self._m.d.comb += assign - elif self._m: - #print ("OP !pipemode assign", new_pipereg, value, type(value)) - self._m.d.comb += eq(new_pipereg, value) - else: - #print ("OP !pipemode !m", new_pipereg, value, type(value)) - 
self._assigns += eq(new_pipereg, value) - if isinstance(value, ObjectProxy): - #print ("OP, defer assigns:", value._assigns) - self._assigns += value._assigns - self._eqs.append(value._eqs) - - -class PipelineStage: - """ Pipeline builder stage with auto generation of pipeline registers. - """ - - def __init__(self, name, m, prev=None, pipemode=False, ispec=None): - self._m = m - self._stagename = name - self._preg_map = {'__nextstage__': {}} - self._prev_stage = prev - self._ispec = ispec - if ispec: - self._preg_map[self._stagename] = ispec - if prev: - print ("prev", prev._stagename, prev._preg_map) - #if prev._stagename in prev._preg_map: - # m = prev._preg_map[prev._stagename] - # self._preg_map[prev._stagename] = m - if '__nextstage__' in prev._preg_map: - m = prev._preg_map['__nextstage__'] - m = likedict(m) - self._preg_map[self._stagename] = m - #for k, v in m.items(): - #m[k] = like(v, k, self._m) - print ("make current", self._stagename, m) - self._pipemode = pipemode - self._eqs = {} - self._assigns = [] - - def __getattribute__(self, name): - if name.startswith('_'): - return object.__getattribute__(self, name) - #if name in self._preg_map['__nextstage__']: - # return self._preg_map['__nextstage__'][name] - try: - print ("getattr", name, object.__getattribute__(self, '_preg_map')) - v = self._preg_map[self._stagename][name] - return v - #return like(v, name, self._m) - except KeyError: - raise AttributeError( - 'error, no pipeline register "%s" defined for stage %s' - % (name, self._stagename)) - - def __setattr__(self, name, value): - if name.startswith('_'): - # do not do anything tricky with variables starting with '_' - object.__setattr__(self, name, value) - return - pipereg_id = self._stagename - rname = 'pipereg_' + pipereg_id + '_' + name - new_pipereg = like(value, rname, self._m, self._pipemode) - next_stage = '__nextstage__' - if next_stage not in self._preg_map: - self._preg_map[next_stage] = {} - self._preg_map[next_stage][name] = 
new_pipereg - print ("setattr", name, value, self._preg_map) - if self._pipemode: - self._eqs[name] = new_pipereg - assign = eq(new_pipereg, value) - print ("pipemode: append", new_pipereg, value, assign) - if isinstance(value, ObjectProxy): - print ("OP, assigns:", value._assigns) - self._assigns += value._assigns - self._eqs[name]._eqs = value._eqs - #self._m.d.comb += assign - self._assigns += assign - elif self._m: - print ("!pipemode: assign", new_pipereg, value) - assign = eq(new_pipereg, value) - self._m.d.sync += assign - else: - print ("!pipemode !m: defer assign", new_pipereg, value) - assign = eq(new_pipereg, value) - self._eqs[name] = new_pipereg - self._assigns += assign - if isinstance(value, ObjectProxy): - print ("OP, defer assigns:", value._assigns) - self._assigns += value._assigns - self._eqs[name]._eqs = value._eqs - -def likelist(specs): - res = [] - for v in specs: - res.append(like(v, v.name, None, pipemode=True)) - return res - -def likedict(specs): - if not isinstance(specs, dict): - return like(specs, specs.name, None, pipemode=True) - res = {} - for k, v in specs.items(): - res[k] = likedict(v) - return res - - -class AutoStage(StageCls): - def __init__(self, inspecs, outspecs, eqs, assigns): - self.inspecs, self.outspecs = inspecs, outspecs - self.eqs, self.assigns = eqs, assigns - #self.o = self.ospec() - def ispec(self): return likedict(self.inspecs) - def ospec(self): return likedict(self.outspecs) - - def process(self, i): - print ("stage process", i) - return self.eqs - - def setup(self, m, i): - print ("stage setup i", i, m) - print ("stage setup inspecs", self.inspecs) - print ("stage setup outspecs", self.outspecs) - print ("stage setup eqs", self.eqs) - #self.o = self.ospec() - m.d.comb += eq(self.inspecs, i) - #m.d.comb += eq(self.outspecs, self.eqs) - #m.d.comb += eq(self.o, i) - - -class AutoPipe(UnbufferedPipeline): - def __init__(self, stage, assigns): - UnbufferedPipeline.__init__(self, stage) - self.assigns = assigns - - 
def elaborate(self, platform): - m = UnbufferedPipeline.elaborate(self, platform) - m.d.comb += self.assigns - print ("assigns", self.assigns, m) - return m - - -class PipeManager: - def __init__(self, m, pipemode=False, pipetype=None): - self.m = m - self.pipemode = pipemode - self.pipetype = pipetype - - @contextmanager - def Stage(self, name, prev=None, ispec=None): - if ispec: - ispec = likedict(ispec) - print ("start stage", name, ispec) - stage = PipelineStage(name, None, prev, self.pipemode, ispec=ispec) - try: - yield stage, self.m #stage._m - finally: - pass - if self.pipemode: - if stage._ispec: - print ("use ispec", stage._ispec) - inspecs = stage._ispec - else: - inspecs = self.get_specs(stage, name) - #inspecs = likedict(inspecs) - outspecs = self.get_specs(stage, '__nextstage__', liked=True) - print ("stage inspecs", name, inspecs) - print ("stage outspecs", name, outspecs) - eqs = stage._eqs # get_eqs(stage._eqs) - assigns = get_assigns(stage._assigns) - print ("stage eqs", name, eqs) - print ("stage assigns", name, assigns) - s = AutoStage(inspecs, outspecs, eqs, assigns) - self.stages.append(s) - print ("end stage", name, self.pipemode, "\n") - - def get_specs(self, stage, name, liked=False): - return stage._preg_map[name] - if name in stage._preg_map: - res = [] - for k, v in stage._preg_map[name].items(): - #v = like(v, k, stage._m) - res.append(v) - #if isinstance(v, ObjectProxy): - # res += v.get_specs() - return res - return {} - - def __enter__(self): - self.stages = [] - return self - - def __exit__(self, *args): - print ("exit stage", args) - pipes = [] - cb = ControlBase() - for s in self.stages: - print ("stage specs", s, s.inspecs, s.outspecs) - if self.pipetype == 'buffered': - p = BufferedHandshake(s) - else: - p = AutoPipe(s, s.assigns) - pipes.append(p) - self.m.submodules += p - - self.m.d.comb += cb.connect(pipes) - - -class SimplePipeline: - """ Pipeline builder with auto generation of pipeline registers. 
- """ - - def __init__(self, m): - self._m = m - self._pipeline_register_map = {} - self._current_stage_num = 0 - - def _setup(self): - stage_list = [] - for method in dir(self): - if method.startswith('stage'): - stage_list.append(method) - for stage in sorted(stage_list): - stage_method = getattr(self, stage) - stage_method() - self._current_stage_num += 1 - - def __getattr__(self, name): - try: - return self._pipeline_register_map[self._current_stage_num][name] - except KeyError: - raise AttributeError( - 'error, no pipeline register "%s" defined for stage %d' - % (name, self._current_stage_num)) - - def __setattr__(self, name, value): - if name.startswith('_'): - # do not do anything tricky with variables starting with '_' - object.__setattr__(self, name, value) - return - next_stage = self._current_stage_num + 1 - pipereg_id = str(self._current_stage_num) + 'to' + str(next_stage) - rname = 'pipereg_' + pipereg_id + '_' + name - #new_pipereg = Signal(value_bits_sign(value), name=rname, - # reset_less=True) - if isinstance(value, ObjectProxy): - new_pipereg = ObjectProxy.like(self._m, value, - name=rname, reset_less = True) - else: - new_pipereg = Signal.like(value, name=rname, reset_less = True) - if next_stage not in self._pipeline_register_map: - self._pipeline_register_map[next_stage] = {} - self._pipeline_register_map[next_stage][name] = new_pipereg - self._m.d.sync += eq(new_pipereg, value) - diff --git a/src/add/pipeline_example.py b/src/add/pipeline_example.py deleted file mode 100644 index 799caf6d..00000000 --- a/src/add/pipeline_example.py +++ /dev/null @@ -1,204 +0,0 @@ -""" Example 5: Making use of PyRTL and Introspection. """ - -from nmigen import Module, Signal, Const -from nmigen.cli import main, verilog, rtlil - - -from pipeline import SimplePipeline, ObjectProxy, PipeManager - - -class SimplePipelineExample(SimplePipeline): - """ A very simple pipeline to show how registers are inferred. 
""" - - def __init__(self, pipe): - SimplePipeline.__init__(self, pipe) - self._loopback = Signal(4) - self._setup() - - def stage0(self): - self.n = ~self._loopback - - def stage1(self): - self.n = self.n + 2 - - def stage2(self): - localv = Signal(4) - self._pipe.comb += localv.eq(2) - self.n = self.n << localv - - def stage3(self): - self.n = ~self.n - - def stage4(self): - self._pipe.sync += self._loopback.eq(self.n + 3) - - -class ObjectBasedPipelineExample(SimplePipeline): - """ A very simple pipeline to show how registers are inferred. """ - - def __init__(self, m): - SimplePipeline.__init__(self, m) - self._loopback = Signal(4) - o = ObjectProxy(m) - o.a = Signal(4) - o.b = Signal(4) - self._obj = o - self._setup() - - def stage0(self): - self.n = ~self._loopback - self.o = self._obj - - def stage1(self): - self.n = self.n + self.o.a - o = ObjectProxy(self._m) - o.c = self.n - o.d = self.o.b + self.n + Const(5) - self.o = o - - def stage2(self): - localv = Signal(4) - self._m.d.comb += localv.eq(2) - self.n = self.n << localv - o = ObjectProxy(self._m) - o.e = self.n + self.o.c + self.o.d - self.o = o - - def stage3(self): - self.n = ~self.n - self.o = self.o - self.o.e = self.o.e + self.n - - def stage4(self): - self._m.d.sync += self._loopback.eq(self.n + 3 + self.o.e) - - -class PipeModule: - - def __init__(self): - self.m = Module() - self.p = ObjectBasedPipelineExample(self.m) - - def elaborate(self, platform=None): - return self.m - - -class PipelineStageExample: - - def __init__(self): - self._loopback = Signal(4, name="loopback") - - def elaborate(self, platform=None): - - m = Module() - - with PipeManager(m, pipemode=True) as pipe: - - ispec={'loopback': self._loopback} - with pipe.Stage("first", ispec=ispec) as (p, m): - p.n = ~p.loopback - with pipe.Stage("second", p) as (p, m): - #p.n = ~self._loopback + 2 - p.n = p.n + Const(2) - with pipe.Stage("third", p) as (p, m): - #p.n = ~self._loopback + 5 - localv = Signal(4) - m.d.comb += localv.eq(2) 
- p.n = p.n << localv + Const(1) - #p.m = p.n + 2 - - print (pipe.stages) - - return m - -class PipelineStageObjectExample: - - def __init__(self): - self.loopback = Signal(4) - - def elaborate(self, platform=None): - - m = Module() - - o = ObjectProxy(None, pipemode=False) - o.a = Signal(4) - o.b = Signal(4) - self.obj = o - - localv2 = Signal(4) - m.d.sync += localv2.eq(localv2 + 3) - - #m.d.comb += self.obj.a.eq(localv2 + 1) - #m.d.sync += self._loopback.eq(localv2) - - ispec= {'loopback': self.loopback, 'obj': self.obj} - with PipeManager(m, pipemode=True) as pipe: - - with pipe.Stage("first", ispec=ispec) as (p, m): - p.n = ~p.loopback - p.o = p.obj - with pipe.Stage("second", p) as (p, m): - #p.n = ~self.loopback + 2 - localn = Signal(4) - m.d.comb += localn.eq(p.n) - o = ObjectProxy(None, pipemode=False) - o.c = localn - o.d = p.o.b + localn + Const(5) - p.n = localn - p.o = o - with pipe.Stage("third", p) as (p, m): - #p.n = ~self._loopback + 5 - localv = Signal(4) - m.d.comb += localv.eq(2) - p.n = p.n << localv - o = ObjectProxy(None, pipemode=False) - o.e = p.n + p.o.c + p.o.d - p.o = o - - print ("stages", pipe.stages) - - return m - - -class PipelineStageObjectExample2: - - def __init__(self): - self._loopback = Signal(4) - - def elaborate(self, platform=None): - - m = Module() - - ispec= [self._loopback] - with PipeManager(m, pipemode=True) as pipe: - - with pipe.Stage("first", - ispec=ispec) as (p, m): - p.n = ~self._loopback - o = ObjectProxy(None, pipemode=False) - o.b = ~self._loopback + Const(5) - p.o = o - - print ("stages", pipe.stages) - - return m - - - -if __name__ == "__main__": - example = PipeModule() - with open("pipe_module.il", "w") as f: - f.write(rtlil.convert(example, ports=[ - example.p._loopback, - ])) - example = PipelineStageExample() - with open("pipe_stage_module.il", "w") as f: - f.write(rtlil.convert(example, ports=[ - example._loopback, - ])) - #exit(0) - example = PipelineStageObjectExample() - with 
open("pipe_stage_object_module.il", "w") as f: - f.write(rtlil.convert(example, ports=[ - example.loopback, - ])) diff --git a/src/add/queue.py b/src/add/queue.py deleted file mode 100644 index 0038953d..00000000 --- a/src/add/queue.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) 2014 - 2019 The Regents of the University of -# California (Regents). All Rights Reserved. Redistribution and use in -# source and binary forms, with or without modification, are permitted -# provided that the following conditions are met: -# * Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# two paragraphs of disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# two paragraphs of disclaimer in the documentation and/or other materials -# provided with the distribution. -# * Neither the name of the Regents nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, -# SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, -# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF -# REGENTS HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF -# ANY, PROVIDED HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION -# TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR -# MODIFICATIONS. 
- -from nmigen import Module, Signal, Memory, Mux, Elaboratable -from nmigen.tools import bits_for -from nmigen.cli import main -from nmigen.lib.fifo import FIFOInterface - -# translated from https://github.com/freechipsproject/chisel3/blob/a4a29e29c3f1eed18f851dcf10bdc845571dfcb6/src/main/scala/chisel3/util/Decoupled.scala#L185 # noqa - - -class Queue(FIFOInterface, Elaboratable): - def __init__(self, width, depth, fwft=True, pipe=False): - """ Queue (FIFO) with pipe mode and first-write fall-through capability - - * :width: width of Queue data in/out - * :depth: queue depth. NOTE: may be set to 0 (this is ok) - * :fwft : first-write, fall-through mode (Chisel Queue "flow" mode) - * :pipe : pipe mode. NOTE: this mode can cause unanticipated - problems. when read is enabled, so is writeable. - therefore if read is enabled, the data ABSOLUTELY MUST - be read. - - fwft mode = True basically means that the data may be transferred - combinatorially from input to output. - - Attributes: - * level: available free space (number of unread entries) - - din = enq_data, writable = enq_ready, we = enq_valid - dout = deq_data, re = deq_ready, readable = deq_valid - """ - FIFOInterface.__init__(self, width, depth, fwft) - self.pipe = pipe - self.depth = depth - self.level = Signal(bits_for(depth)) - - def elaborate(self, platform): - m = Module() - - # set up an SRAM. 
XXX bug in Memory: cannot create SRAM of depth 1 - ram = Memory(self.width, self.depth if self.depth > 1 else 2) - m.submodules.ram_read = ram_read = ram.read_port(synchronous=False) - m.submodules.ram_write = ram_write = ram.write_port() - - # convenience names - p_ready_o = self.writable - p_valid_i = self.we - enq_data = self.din - - n_valid_o = self.readable - n_ready_i = self.re - deq_data = self.dout - - # intermediaries - ptr_width = bits_for(self.depth - 1) if self.depth > 1 else 0 - enq_ptr = Signal(ptr_width) # cyclic pointer to "insert" point (wrport) - deq_ptr = Signal(ptr_width) # cyclic pointer to "remove" point (rdport) - maybe_full = Signal() # not reset_less (set by sync) - - # temporaries - do_enq = Signal(reset_less=True) - do_deq = Signal(reset_less=True) - ptr_diff = Signal(ptr_width) - ptr_match = Signal(reset_less=True) - empty = Signal(reset_less=True) - full = Signal(reset_less=True) - enq_max = Signal(reset_less=True) - deq_max = Signal(reset_less=True) - - m.d.comb += [ptr_match.eq(enq_ptr == deq_ptr), # read-ptr = write-ptr - ptr_diff.eq(enq_ptr - deq_ptr), - enq_max.eq(enq_ptr == self.depth - 1), - deq_max.eq(deq_ptr == self.depth - 1), - empty.eq(ptr_match & ~maybe_full), - full.eq(ptr_match & maybe_full), - do_enq.eq(p_ready_o & p_valid_i), # write conditions ok - do_deq.eq(n_ready_i & n_valid_o), # read conditions ok - - # set readable and writable (NOTE: see pipe mode below) - n_valid_o.eq(~empty), # cannot read if empty! - p_ready_o.eq(~full), # cannot write if full! 
- - # set up memory and connect to input and output - ram_write.addr.eq(enq_ptr), - ram_write.data.eq(enq_data), - ram_write.en.eq(do_enq), - ram_read.addr.eq(deq_ptr), - deq_data.eq(ram_read.data) # NOTE: overridden in fwft mode - ] - - # under write conditions, SRAM write-pointer moves on next clock - with m.If(do_enq): - m.d.sync += enq_ptr.eq(Mux(enq_max, 0, enq_ptr+1)) - - # under read conditions, SRAM read-pointer moves on next clock - with m.If(do_deq): - m.d.sync += deq_ptr.eq(Mux(deq_max, 0, deq_ptr+1)) - - # if read-but-not-write or write-but-not-read, maybe_full set - with m.If(do_enq != do_deq): - m.d.sync += maybe_full.eq(do_enq) - - # first-word fall-through: same as "flow" parameter in Chisel3 Queue - # basically instead of relying on the Memory characteristics (which - # in FPGAs do not have write-through), then when the queue is empty - # take the output directly from the input, i.e. *bypass* the SRAM. - # this done combinatorially to give the exact same characteristics - # as Memory "write-through"... without relying on a changing API - if self.fwft: - with m.If(p_valid_i): - m.d.comb += n_valid_o.eq(1) - with m.If(empty): - m.d.comb += deq_data.eq(enq_data) - m.d.comb += do_deq.eq(0) - with m.If(n_ready_i): - m.d.comb += do_enq.eq(0) - - # pipe mode: if next stage says it's ready (readable), we - # *must* declare the input ready (writeable). 
- if self.pipe: - with m.If(n_ready_i): - m.d.comb += p_ready_o.eq(1) - - # set the count (available free space), optimise on power-of-two - if self.depth == 1 << ptr_width: # is depth a power of 2 - m.d.comb += self.level.eq( - Mux(maybe_full & ptr_match, self.depth, 0) | ptr_diff) - else: - m.d.comb += self.level.eq(Mux(ptr_match, - Mux(maybe_full, self.depth, 0), - Mux(deq_ptr > enq_ptr, - self.depth + ptr_diff, - ptr_diff))) - - return m - - -if __name__ == "__main__": - reg_stage = Queue(1, 1, pipe=True) - break_ready_chain_stage = Queue(1, 1, pipe=True, fwft=True) - m = Module() - ports = [] - - def queue_ports(queue, name_prefix): - retval = [] - for name in ["level", - "dout", - "readable", - "writable"]: - port = getattr(queue, name) - signal = Signal(port.shape(), name=name_prefix+name) - m.d.comb += signal.eq(port) - retval.append(signal) - for name in ["re", - "din", - "we"]: - port = getattr(queue, name) - signal = Signal(port.shape(), name=name_prefix+name) - m.d.comb += port.eq(signal) - retval.append(signal) - return retval - - m.submodules.reg_stage = reg_stage - ports += queue_ports(reg_stage, "reg_stage_") - m.submodules.break_ready_chain_stage = break_ready_chain_stage - ports += queue_ports(break_ready_chain_stage, "break_ready_chain_stage_") - main(m, ports=ports) diff --git a/src/add/record_experiment.py b/src/add/record_experiment.py deleted file mode 100644 index 1789c3bd..00000000 --- a/src/add/record_experiment.py +++ /dev/null @@ -1,106 +0,0 @@ -from nmigen import Module, Signal, Mux, Const, Elaboratable -from nmigen.hdl.rec import Record, Layout, DIR_NONE -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil -from nmigen.compat.fhdl.bitcontainer import value_bits_sign -from singlepipe import cat, RecordObject - - -class RecordTest: - - def __init__(self): - self.r1 = RecordObject() - self.r1.sig1 = Signal(16) - self.r1.r2 = RecordObject() - self.r1.r2.sig2 = Signal(16) - self.r1.r3 = RecordObject() - 
self.r1.r3.sig3 = Signal(16) - self.sig123 = Signal(48) - - def elaborate(self, platform): - m = Module() - - sig1 = Signal(16) - m.d.comb += sig1.eq(self.r1.sig1) - sig2 = Signal(16) - m.d.comb += sig2.eq(self.r1.r2.sig2) - - print (self.r1.fields) - print (self.r1.shape()) - print ("width", len(self.r1)) - m.d.comb += self.sig123.eq(cat(self.r1)) - - return m - - -def testbench(dut): - yield dut.r1.sig1.eq(5) - yield dut.r1.r2.sig2.eq(10) - yield dut.r1.r3.sig3.eq(1) - - sig1 = yield dut.r1.sig1 - assert sig1 == 5 - sig2 = yield dut.r1.r2.sig2 - assert sig2 == 10 - - yield - - sig123 = yield dut.sig123 - print ("sig123", hex(sig123)) - assert sig123 == 0x1000a0005 - - - -class RecordTest2(Elaboratable): - - def __init__(self): - self.r1 = RecordObject() - self.r1.sig1 = Signal(16) - self.r1.r2 = RecordObject() - self.r1.r2.sig2 = Signal(16) - self.r1.r3 = RecordObject() - self.r1.r3.sig3 = Signal(16) - self.sig123 = Signal(48) - - def elaborate(self, platform): - m = Module() - - m.d.comb += cat(self.r1).eq(self.sig123) - - return m - - -def testbench2(dut): - - sig123 = yield dut.sig123.eq(0x1000a0005) - - yield - - sig1 = yield dut.r1.sig1 - assert sig1 == 5 - sig2 = yield dut.r1.r2.sig2 - assert sig2 == 10 - sig3 = yield dut.r1.r3.sig3 - assert sig3 == 1 - - - -###################################################################### -# Unit Tests -###################################################################### - -if __name__ == '__main__': - print ("test 1") - dut = RecordTest() - run_simulation(dut, testbench(dut), vcd_name="test_record1.vcd") - vl = rtlil.convert(dut, ports=[dut.sig123, dut.r1.sig1, dut.r1.r2.sig2]) - with open("test_record1.il", "w") as f: - f.write(vl) - - print ("test 2") - dut = RecordTest2() - run_simulation(dut, testbench2(dut), vcd_name="test_record2.vcd") - vl = rtlil.convert(dut, ports=[dut.sig123, dut.r1.sig1, dut.r1.r2.sig2]) - with open("test_record2.il", "w") as f: - f.write(vl) - diff --git a/src/add/rstation_row.py 
b/src/add/rstation_row.py deleted file mode 100644 index aeb58732..00000000 --- a/src/add/rstation_row.py +++ /dev/null @@ -1,39 +0,0 @@ -from nmigen import Signal, Cat, Const, Mux, Module - -from nmigen.cli import main, verilog - -from fpbase import FPNumIn, FPNumOut, FPOp, Overflow, FPBase, FPNumBase -from fpbase import MultiShiftRMerge - -class ReservationStationRow: - - def __init__(self, width, id_wid): - """ Reservation Station row - - * width: bit-width of IEEE754. supported: 16, 32, 64 - * id_wid: an identifier to be passed through to the FunctionUnit - """ - self.width = width - - self.in_a = Signal(width) - self.in_b = Signal(width) - self.id_wid = id_wid - self.out_z = Signal(width) - - def elaborate(self, platform=None): - """ creates the HDL code-fragment for ReservationStationRow - """ - m = Module() - - return m - - -if __name__ == "__main__": - rs = ReservationStationRow(width=32, id_wid=Const(1,4)) - main(alu, ports=[rs.in_a, rs.in_b, rs.out_z] - - # works... but don't use, just do "python fname.py convert -t v" - #print (verilog.convert(alu, ports=[ - # ports=alu.in_a.ports() + \ - # alu.in_b.ports() + \ - # alu.out_z.ports()) diff --git a/src/add/singlepipe.py b/src/add/singlepipe.py deleted file mode 100644 index 68b62e43..00000000 --- a/src/add/singlepipe.py +++ /dev/null @@ -1,829 +0,0 @@ -""" Pipeline API. For multi-input and multi-output variants, see multipipe. - - Associated development bugs: - * http://bugs.libre-riscv.org/show_bug.cgi?id=64 - * http://bugs.libre-riscv.org/show_bug.cgi?id=57 - - Important: see Stage API (stageapi.py) in combination with below - - RecordBasedStage: - ---------------- - - A convenience class that takes an input shape, output shape, a - "processing" function and an optional "setup" function. Honestly - though, there's not much more effort to just... create a class - that returns a couple of Records (see ExampleAddRecordStage in - examples). 
- - PassThroughStage: - ---------------- - - A convenience class that takes a single function as a parameter, - that is chain-called to create the exact same input and output spec. - It has a process() function that simply returns its input. - - Instances of this class are completely redundant if handed to - StageChain, however when passed to UnbufferedPipeline they - can be used to introduce a single clock delay. - - ControlBase: - ----------- - - The base class for pipelines. Contains previous and next ready/valid/data. - Also has an extremely useful "connect" function that can be used to - connect a chain of pipelines and present the exact same prev/next - ready/valid/data API. - - Note: pipelines basically do not become pipelines as such until - handed to a derivative of ControlBase. ControlBase itself is *not* - strictly considered a pipeline class. Wishbone and AXI4 (master or - slave) could be derived from ControlBase, for example. - UnbufferedPipeline: - ------------------ - - A simple stalling clock-synchronised pipeline that has no buffering - (unlike BufferedHandshake). Data flows on *every* clock cycle when - the conditions are right (this is nominally when the input is valid - and the output is ready). - - A stall anywhere along the line will result in a stall back-propagating - down the entire chain. The BufferedHandshake by contrast will buffer - incoming data, allowing previous stages one clock cycle's grace before - also having to stall. - - An advantage of the UnbufferedPipeline over the Buffered one is - that the amount of logic needed (number of gates) is greatly - reduced (no second set of buffers basically) - - The disadvantage of the UnbufferedPipeline is that the valid/ready - logic, if chained together, is *combinatorial*, resulting in - progressively larger gate delay. - - PassThroughHandshake: - ------------------ - - A Control class that introduces a single clock delay, passing its - data through unaltered. 
Unlike RegisterPipeline (which relies - on UnbufferedPipeline and PassThroughStage) it handles ready/valid - itself. - - RegisterPipeline: - ---------------- - - A convenience class that, because UnbufferedPipeline introduces a single - clock delay, when its stage is a PassThroughStage, it results in a Pipeline - stage that, duh, delays its (unmodified) input by one clock cycle. - - BufferedHandshake: - ---------------- - - nmigen implementation of buffered pipeline stage, based on zipcpu: - https://zipcpu.com/blog/2017/08/14/strategies-for-pipelining.html - - this module requires quite a bit of thought to understand how it works - (and why it is needed in the first place). reading the above is - *strongly* recommended. - - unlike john dawson's IEEE754 FPU STB/ACK signalling, which requires - the STB / ACK signals to raise and lower (on separate clocks) before - data may proceeed (thus only allowing one piece of data to proceed - on *ALTERNATE* cycles), the signalling here is a true pipeline - where data will flow on *every* clock when the conditions are right. - - input acceptance conditions are when: - * incoming previous-stage strobe (p.valid_i) is HIGH - * outgoing previous-stage ready (p.ready_o) is LOW - - output transmission conditions are when: - * outgoing next-stage strobe (n.valid_o) is HIGH - * outgoing next-stage ready (n.ready_i) is LOW - - the tricky bit is when the input has valid data and the output is not - ready to accept it. if it wasn't for the clock synchronisation, it - would be possible to tell the input "hey don't send that data, we're - not ready". unfortunately, it's not possible to "change the past": - the previous stage *has no choice* but to pass on its data. - - therefore, the incoming data *must* be accepted - and stored: that - is the responsibility / contract that this stage *must* accept. - on the same clock, it's possible to tell the input that it must - not send any more data. this is the "stall" condition. 
- - we now effectively have *two* possible pieces of data to "choose" from: - the buffered data, and the incoming data. the decision as to which - to process and output is based on whether we are in "stall" or not. - i.e. when the next stage is no longer ready, the output comes from - the buffer if a stall had previously occurred, otherwise it comes - direct from processing the input. - - this allows us to respect a synchronous "travelling STB" with what - dan calls a "buffered handshake". - - it's quite a complex state machine! - - SimpleHandshake - --------------- - - Synchronised pipeline, Based on: - https://github.com/ZipCPU/dbgbus/blob/master/hexbus/rtl/hbdeword.v -""" - -from nmigen import Signal, Mux, Module, Elaboratable -from nmigen.cli import verilog, rtlil -from nmigen.hdl.rec import Record - -from queue import Queue -import inspect - -from iocontrol import (PrevControl, NextControl, Object, RecordObject) -from stageapi import (_spec, StageCls, Stage, StageChain, StageHelper) -import nmoperator - - -class RecordBasedStage(Stage): - """ convenience class which provides a Records-based layout. - honestly it's a lot easier just to create a direct Records-based - class (see ExampleAddRecordStage) - """ - def __init__(self, in_shape, out_shape, processfn, setupfn=None): - self.in_shape = in_shape - self.out_shape = out_shape - self.__process = processfn - self.__setup = setupfn - def ispec(self): return Record(self.in_shape) - def ospec(self): return Record(self.out_shape) - def process(seif, i): return self.__process(i) - def setup(seif, m, i): return self.__setup(m, i) - - -class PassThroughStage(StageCls): - """ a pass-through stage with its input data spec identical to its output, - and "passes through" its data from input to output (does nothing). - - use this basically to explicitly make any data spec Stage-compliant. - (many APIs would potentially use a static "wrap" method in e.g. 
- StageCls to achieve a similar effect) - """ - def __init__(self, iospecfn): self.iospecfn = iospecfn - def ispec(self): return self.iospecfn() - def ospec(self): return self.iospecfn() - - -class ControlBase(StageHelper, Elaboratable): - """ Common functions for Pipeline API. Note: a "pipeline stage" only - exists (conceptually) when a ControlBase derivative is handed - a Stage (combinatorial block) - - NOTE: ControlBase derives from StageHelper, making it accidentally - compliant with the Stage API. Using those functions directly - *BYPASSES* a ControlBase instance ready/valid signalling, which - clearly should not be done without a really, really good reason. - """ - def __init__(self, stage=None, in_multi=None, stage_ctl=False): - """ Base class containing ready/valid/data to previous and next stages - - * p: contains ready/valid to the previous stage - * n: contains ready/valid to the next stage - - Except when calling Controlbase.connect(), user must also: - * add data_i member to PrevControl (p) and - * add data_o member to NextControl (n) - Calling ControlBase._new_data is a good way to do that. - """ - StageHelper.__init__(self, stage) - - # set up input and output IO ACK (prev/next ready/valid) - self.p = PrevControl(in_multi, stage_ctl) - self.n = NextControl(stage_ctl) - - # set up the input and output data - if stage is not None: - self._new_data("data") - - def _new_data(self, name): - """ allocates new data_i and data_o - """ - self.p.data_i, self.n.data_o = self.new_specs(name) - - @property - def data_r(self): - return self.process(self.p.data_i) - - def connect_to_next(self, nxt): - """ helper function to connect to the next stage data/valid/ready. - """ - return self.n.connect_to_next(nxt.p) - - def _connect_in(self, prev): - """ internal helper function to connect stage to an input source. - do not use to connect stage-to-stage! 
- """ - return self.p._connect_in(prev.p) - - def _connect_out(self, nxt): - """ internal helper function to connect stage to an output source. - do not use to connect stage-to-stage! - """ - return self.n._connect_out(nxt.n) - - def connect(self, pipechain): - """ connects a chain (list) of Pipeline instances together and - links them to this ControlBase instance: - - in <----> self <---> out - | ^ - v | - [pipe1, pipe2, pipe3, pipe4] - | ^ | ^ | ^ - v | v | v | - out---in out--in out---in - - Also takes care of allocating data_i/data_o, by looking up - the data spec for each end of the pipechain. i.e It is NOT - necessary to allocate self.p.data_i or self.n.data_o manually: - this is handled AUTOMATICALLY, here. - - Basically this function is the direct equivalent of StageChain, - except that unlike StageChain, the Pipeline logic is followed. - - Just as StageChain presents an object that conforms to the - Stage API from a list of objects that also conform to the - Stage API, an object that calls this Pipeline connect function - has the exact same pipeline API as the list of pipline objects - it is called with. - - Thus it becomes possible to build up larger chains recursively. - More complex chains (multi-input, multi-output) will have to be - done manually. 
- - Argument: - - * :pipechain: - a sequence of ControlBase-derived classes - (must be one or more in length) - - Returns: - - * a list of eq assignments that will need to be added in - an elaborate() to m.d.comb - """ - assert len(pipechain) > 0, "pipechain must be non-zero length" - assert self.stage is None, "do not use connect with a stage" - eqs = [] # collated list of assignment statements - - # connect inter-chain - for i in range(len(pipechain)-1): - pipe1 = pipechain[i] # earlier - pipe2 = pipechain[i+1] # later (by 1) - eqs += pipe1.connect_to_next(pipe2) # earlier n to later p - - # connect front and back of chain to ourselves - front = pipechain[0] # first in chain - end = pipechain[-1] # last in chain - self.set_specs(front, end) # sets up ispec/ospec functions - self._new_data("chain") # NOTE: REPLACES existing data - eqs += front._connect_in(self) # front p to our p - eqs += end._connect_out(self) # end n to our n - - return eqs - - def set_input(self, i): - """ helper function to set the input data (used in unit tests) - """ - return nmoperator.eq(self.p.data_i, i) - - def __iter__(self): - yield from self.p # yields ready/valid/data (data also gets yielded) - yield from self.n # ditto - - def ports(self): - return list(self) - - def elaborate(self, platform): - """ handles case where stage has dynamic ready/valid functions - """ - m = Module() - m.submodules.p = self.p - m.submodules.n = self.n - - self.setup(m, self.p.data_i) - - if not self.p.stage_ctl: - return m - - # intercept the previous (outgoing) "ready", combine with stage ready - m.d.comb += self.p.s_ready_o.eq(self.p._ready_o & self.stage.d_ready) - - # intercept the next (incoming) "ready" and combine it with data valid - sdv = self.stage.d_valid(self.n.ready_i) - m.d.comb += self.n.d_valid.eq(self.n.ready_i & sdv) - - return m - - -class BufferedHandshake(ControlBase): - """ buffered pipeline stage. data and strobe signals travel in sync. 
- if ever the input is ready and the output is not, processed data - is shunted in a temporary register. - - Argument: stage. see Stage API above - - stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 - stage-1 p.ready_o <>in stage n.data_o out>> stage+1 - | | - process --->----^ - | | - +-- r_data ->-+ - - input data p.data_i is read (only), is processed and goes into an - intermediate result store [process()]. this is updated combinatorially. - - in a non-stall condition, the intermediate result will go into the - output (update_output). however if ever there is a stall, it goes - into r_data instead [update_buffer()]. - - when the non-stall condition is released, r_data is the first - to be transferred to the output [flush_buffer()], and the stall - condition cleared. - - on the next cycle (as long as stall is not raised again) the - input may begin to be processed and transferred directly to output. - """ - - def elaborate(self, platform): - self.m = ControlBase.elaborate(self, platform) - - result = _spec(self.stage.ospec, "r_tmp") - r_data = _spec(self.stage.ospec, "r_data") - - # establish some combinatorial temporaries - o_n_validn = Signal(reset_less=True) - n_ready_i = Signal(reset_less=True, name="n_i_rdy_data") - nir_por = Signal(reset_less=True) - nir_por_n = Signal(reset_less=True) - p_valid_i = Signal(reset_less=True) - nir_novn = Signal(reset_less=True) - nirn_novn = Signal(reset_less=True) - por_pivn = Signal(reset_less=True) - npnn = Signal(reset_less=True) - self.m.d.comb += [p_valid_i.eq(self.p.valid_i_test), - o_n_validn.eq(~self.n.valid_o), - n_ready_i.eq(self.n.ready_i_test), - nir_por.eq(n_ready_i & self.p._ready_o), - nir_por_n.eq(n_ready_i & ~self.p._ready_o), - nir_novn.eq(n_ready_i | o_n_validn), - nirn_novn.eq(~n_ready_i & o_n_validn), - npnn.eq(nir_por | nirn_novn), - por_pivn.eq(self.p._ready_o & ~p_valid_i) - ] - - # store result of processing in combinatorial temporary - self.m.d.comb += nmoperator.eq(result, self.data_r) - - # if 
not in stall condition, update the temporary register - with self.m.If(self.p.ready_o): # not stalled - self.m.d.sync += nmoperator.eq(r_data, result) # update buffer - - # data pass-through conditions - with self.m.If(npnn): - data_o = self._postprocess(result) # XXX TBD, does nothing right now - self.m.d.sync += [self.n.valid_o.eq(p_valid_i), # valid if p_valid - nmoperator.eq(self.n.data_o, data_o), # update out - ] - # buffer flush conditions (NOTE: can override data passthru conditions) - with self.m.If(nir_por_n): # not stalled - # Flush the [already processed] buffer to the output port. - data_o = self._postprocess(r_data) # XXX TBD, does nothing right now - self.m.d.sync += [self.n.valid_o.eq(1), # reg empty - nmoperator.eq(self.n.data_o, data_o), # flush - ] - # output ready conditions - self.m.d.sync += self.p._ready_o.eq(nir_novn | por_pivn) - - return self.m - - -class SimpleHandshake(ControlBase): - """ simple handshake control. data and strobe signals travel in sync. - implements the protocol used by Wishbone and AXI4. - - Argument: stage. 
see Stage API above - - stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 - stage-1 p.ready_o <>in stage n.data_o out>> stage+1 - | | - +--process->--^ - Truth Table - - Inputs Temporary Output Data - ------- ---------- ----- ---- - P P N N PiV& ~NiR& N P - i o i o PoR NoV o o - V R R V V R - - ------- - - - - - 0 0 0 0 0 0 >0 0 reg - 0 0 0 1 0 1 >1 0 reg - 0 0 1 0 0 0 0 1 process(data_i) - 0 0 1 1 0 0 0 1 process(data_i) - ------- - - - - - 0 1 0 0 0 0 >0 0 reg - 0 1 0 1 0 1 >1 0 reg - 0 1 1 0 0 0 0 1 process(data_i) - 0 1 1 1 0 0 0 1 process(data_i) - ------- - - - - - 1 0 0 0 0 0 >0 0 reg - 1 0 0 1 0 1 >1 0 reg - 1 0 1 0 0 0 0 1 process(data_i) - 1 0 1 1 0 0 0 1 process(data_i) - ------- - - - - - 1 1 0 0 1 0 1 0 process(data_i) - 1 1 0 1 1 1 1 0 process(data_i) - 1 1 1 0 1 0 1 1 process(data_i) - 1 1 1 1 1 0 1 1 process(data_i) - ------- - - - - - """ - - def elaborate(self, platform): - self.m = m = ControlBase.elaborate(self, platform) - - r_busy = Signal() - result = _spec(self.stage.ospec, "r_tmp") - - # establish some combinatorial temporaries - n_ready_i = Signal(reset_less=True, name="n_i_rdy_data") - p_valid_i_p_ready_o = Signal(reset_less=True) - p_valid_i = Signal(reset_less=True) - m.d.comb += [p_valid_i.eq(self.p.valid_i_test), - n_ready_i.eq(self.n.ready_i_test), - p_valid_i_p_ready_o.eq(p_valid_i & self.p.ready_o), - ] - - # store result of processing in combinatorial temporary - m.d.comb += nmoperator.eq(result, self.data_r) - - # previous valid and ready - with m.If(p_valid_i_p_ready_o): - data_o = self._postprocess(result) # XXX TBD, does nothing right now - m.d.sync += [r_busy.eq(1), # output valid - nmoperator.eq(self.n.data_o, data_o), # update output - ] - # previous invalid or not ready, however next is accepting - with m.Elif(n_ready_i): - data_o = self._postprocess(result) # XXX TBD, does nothing right now - m.d.sync += [nmoperator.eq(self.n.data_o, data_o)] - # TODO: could still send data here (if there was any) - #m.d.sync += 
self.n.valid_o.eq(0) # ...so set output invalid - m.d.sync += r_busy.eq(0) # ...so set output invalid - - m.d.comb += self.n.valid_o.eq(r_busy) - # if next is ready, so is previous - m.d.comb += self.p._ready_o.eq(n_ready_i) - - return self.m - - -class UnbufferedPipeline(ControlBase): - """ A simple pipeline stage with single-clock synchronisation - and two-way valid/ready synchronised signalling. - - Note that a stall in one stage will result in the entire pipeline - chain stalling. - - Also that unlike BufferedHandshake, the valid/ready signalling does NOT - travel synchronously with the data: the valid/ready signalling - combines in a *combinatorial* fashion. Therefore, a long pipeline - chain will lengthen propagation delays. - - Argument: stage. see Stage API, above - - stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 - stage-1 p.ready_o <>in stage n.data_o out>> stage+1 - | | - r_data result - | | - +--process ->-+ - - Attributes: - ----------- - p.data_i : StageInput, shaped according to ispec - The pipeline input - p.data_o : StageOutput, shaped according to ospec - The pipeline output - r_data : input_shape according to ispec - A temporary (buffered) copy of a prior (valid) input. - This is HELD if the output is not ready. It is updated - SYNCHRONOUSLY. - result: output_shape according to ospec - The output of the combinatorial logic. it is updated - COMBINATORIALLY (no clock dependence). 
- - Truth Table - - Inputs Temp Output Data - ------- - ----- ---- - P P N N ~NiR& N P - i o i o NoV o o - V R R V V R - - ------- - - - - 0 0 0 0 0 0 1 reg - 0 0 0 1 1 1 0 reg - 0 0 1 0 0 0 1 reg - 0 0 1 1 0 0 1 reg - ------- - - - - 0 1 0 0 0 0 1 reg - 0 1 0 1 1 1 0 reg - 0 1 1 0 0 0 1 reg - 0 1 1 1 0 0 1 reg - ------- - - - - 1 0 0 0 0 1 1 reg - 1 0 0 1 1 1 0 reg - 1 0 1 0 0 1 1 reg - 1 0 1 1 0 1 1 reg - ------- - - - - 1 1 0 0 0 1 1 process(data_i) - 1 1 0 1 1 1 0 process(data_i) - 1 1 1 0 0 1 1 process(data_i) - 1 1 1 1 0 1 1 process(data_i) - ------- - - - - - Note: PoR is *NOT* involved in the above decision-making. - """ - - def elaborate(self, platform): - self.m = m = ControlBase.elaborate(self, platform) - - data_valid = Signal() # is data valid or not - r_data = _spec(self.stage.ospec, "r_tmp") # output type - - # some temporaries - p_valid_i = Signal(reset_less=True) - pv = Signal(reset_less=True) - buf_full = Signal(reset_less=True) - m.d.comb += p_valid_i.eq(self.p.valid_i_test) - m.d.comb += pv.eq(self.p.valid_i & self.p.ready_o) - m.d.comb += buf_full.eq(~self.n.ready_i_test & data_valid) - - m.d.comb += self.n.valid_o.eq(data_valid) - m.d.comb += self.p._ready_o.eq(~data_valid | self.n.ready_i_test) - m.d.sync += data_valid.eq(p_valid_i | buf_full) - - with m.If(pv): - m.d.sync += nmoperator.eq(r_data, self.data_r) - data_o = self._postprocess(r_data) # XXX TBD, does nothing right now - m.d.comb += nmoperator.eq(self.n.data_o, data_o) - - return self.m - -class UnbufferedPipeline2(ControlBase): - """ A simple pipeline stage with single-clock synchronisation - and two-way valid/ready synchronised signalling. - - Note that a stall in one stage will result in the entire pipeline - chain stalling. - - Also that unlike BufferedHandshake, the valid/ready signalling does NOT - travel synchronously with the data: the valid/ready signalling - combines in a *combinatorial* fashion. Therefore, a long pipeline - chain will lengthen propagation delays. 
- - Argument: stage. see Stage API, above - - stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 - stage-1 p.ready_o <>in stage n.data_o out>> stage+1 - | | | - +- process-> buf <-+ - Attributes: - ----------- - p.data_i : StageInput, shaped according to ispec - The pipeline input - p.data_o : StageOutput, shaped according to ospec - The pipeline output - buf : output_shape according to ospec - A temporary (buffered) copy of a valid output - This is HELD if the output is not ready. It is updated - SYNCHRONOUSLY. - - Inputs Temp Output Data - ------- - ----- - P P N N ~NiR& N P (buf_full) - i o i o NoV o o - V R R V V R - - ------- - - - - 0 0 0 0 0 0 1 process(data_i) - 0 0 0 1 1 1 0 reg (odata, unchanged) - 0 0 1 0 0 0 1 process(data_i) - 0 0 1 1 0 0 1 process(data_i) - ------- - - - - 0 1 0 0 0 0 1 process(data_i) - 0 1 0 1 1 1 0 reg (odata, unchanged) - 0 1 1 0 0 0 1 process(data_i) - 0 1 1 1 0 0 1 process(data_i) - ------- - - - - 1 0 0 0 0 1 1 process(data_i) - 1 0 0 1 1 1 0 reg (odata, unchanged) - 1 0 1 0 0 1 1 process(data_i) - 1 0 1 1 0 1 1 process(data_i) - ------- - - - - 1 1 0 0 0 1 1 process(data_i) - 1 1 0 1 1 1 0 reg (odata, unchanged) - 1 1 1 0 0 1 1 process(data_i) - 1 1 1 1 0 1 1 process(data_i) - ------- - - - - - Note: PoR is *NOT* involved in the above decision-making. 
- """ - - def elaborate(self, platform): - self.m = m = ControlBase.elaborate(self, platform) - - buf_full = Signal() # is data valid or not - buf = _spec(self.stage.ospec, "r_tmp") # output type - - # some temporaries - p_valid_i = Signal(reset_less=True) - m.d.comb += p_valid_i.eq(self.p.valid_i_test) - - m.d.comb += self.n.valid_o.eq(buf_full | p_valid_i) - m.d.comb += self.p._ready_o.eq(~buf_full) - m.d.sync += buf_full.eq(~self.n.ready_i_test & self.n.valid_o) - - data_o = Mux(buf_full, buf, self.data_r) - data_o = self._postprocess(data_o) # XXX TBD, does nothing right now - m.d.comb += nmoperator.eq(self.n.data_o, data_o) - m.d.sync += nmoperator.eq(buf, self.n.data_o) - - return self.m - - -class PassThroughHandshake(ControlBase): - """ A control block that delays by one clock cycle. - - Inputs Temporary Output Data - ------- ------------------ ----- ---- - P P N N PiV& PiV| NiR| pvr N P (pvr) - i o i o PoR ~PoR ~NoV o o - V R R V V R - - ------- - - - - - - - 0 0 0 0 0 1 1 0 1 1 odata (unchanged) - 0 0 0 1 0 1 0 0 1 0 odata (unchanged) - 0 0 1 0 0 1 1 0 1 1 odata (unchanged) - 0 0 1 1 0 1 1 0 1 1 odata (unchanged) - ------- - - - - - - - 0 1 0 0 0 0 1 0 0 1 odata (unchanged) - 0 1 0 1 0 0 0 0 0 0 odata (unchanged) - 0 1 1 0 0 0 1 0 0 1 odata (unchanged) - 0 1 1 1 0 0 1 0 0 1 odata (unchanged) - ------- - - - - - - - 1 0 0 0 0 1 1 1 1 1 process(in) - 1 0 0 1 0 1 0 0 1 0 odata (unchanged) - 1 0 1 0 0 1 1 1 1 1 process(in) - 1 0 1 1 0 1 1 1 1 1 process(in) - ------- - - - - - - - 1 1 0 0 1 1 1 1 1 1 process(in) - 1 1 0 1 1 1 0 0 1 0 odata (unchanged) - 1 1 1 0 1 1 1 1 1 1 process(in) - 1 1 1 1 1 1 1 1 1 1 process(in) - ------- - - - - - - - - """ - - def elaborate(self, platform): - self.m = m = ControlBase.elaborate(self, platform) - - r_data = _spec(self.stage.ospec, "r_tmp") # output type - - # temporaries - p_valid_i = Signal(reset_less=True) - pvr = Signal(reset_less=True) - m.d.comb += p_valid_i.eq(self.p.valid_i_test) - m.d.comb += pvr.eq(p_valid_i & 
self.p.ready_o) - - m.d.comb += self.p.ready_o.eq(~self.n.valid_o | self.n.ready_i_test) - m.d.sync += self.n.valid_o.eq(p_valid_i | ~self.p.ready_o) - - odata = Mux(pvr, self.data_r, r_data) - m.d.sync += nmoperator.eq(r_data, odata) - r_data = self._postprocess(r_data) # XXX TBD, does nothing right now - m.d.comb += nmoperator.eq(self.n.data_o, r_data) - - return m - - -class RegisterPipeline(UnbufferedPipeline): - """ A pipeline stage that delays by one clock cycle, creating a - sync'd latch out of data_o and valid_o as an indirect byproduct - of using PassThroughStage - """ - def __init__(self, iospecfn): - UnbufferedPipeline.__init__(self, PassThroughStage(iospecfn)) - - -class FIFOControl(ControlBase): - """ FIFO Control. Uses Queue to store data, coincidentally - happens to have same valid/ready signalling as Stage API. - - data_i -> fifo.din -> FIFO -> fifo.dout -> data_o - """ - def __init__(self, depth, stage, in_multi=None, stage_ctl=False, - fwft=True, pipe=False): - """ FIFO Control - - * :depth: number of entries in the FIFO - * :stage: data processing block - * :fwft: first word fall-thru mode (non-fwft introduces delay) - * :pipe: specifies pipe mode. - - when fwft = True it indicates that transfers may occur - combinatorially through stage processing in the same clock cycle. - This requires that the Stage be a Moore FSM: - https://en.wikipedia.org/wiki/Moore_machine - - when fwft = False it indicates that all output signals are - produced only from internal registers or memory, i.e. that the - Stage is a Mealy FSM: - https://en.wikipedia.org/wiki/Mealy_machine - - data is processed (and located) as follows: - - self.p self.stage temp fn temp fn temp fp self.n - data_i->process()->result->cat->din.FIFO.dout->cat(data_o) - - yes, really: cat produces a Cat() which can be assigned to. 
- this is how the FIFO gets de-catted without needing a de-cat - function - """ - self.fwft = fwft - self.pipe = pipe - self.fdepth = depth - ControlBase.__init__(self, stage, in_multi, stage_ctl) - - def elaborate(self, platform): - self.m = m = ControlBase.elaborate(self, platform) - - # make a FIFO with a signal of equal width to the data_o. - (fwidth, _) = nmoperator.shape(self.n.data_o) - fifo = Queue(fwidth, self.fdepth, fwft=self.fwft, pipe=self.pipe) - m.submodules.fifo = fifo - - def processfn(data_i): - # store result of processing in combinatorial temporary - result = _spec(self.stage.ospec, "r_temp") - m.d.comb += nmoperator.eq(result, self.process(data_i)) - return nmoperator.cat(result) - - ## prev: make the FIFO (Queue object) "look" like a PrevControl... - m.submodules.fp = fp = PrevControl() - fp.valid_i, fp._ready_o, fp.data_i = fifo.we, fifo.writable, fifo.din - m.d.comb += fp._connect_in(self.p, fn=processfn) - - # next: make the FIFO (Queue object) "look" like a NextControl... - m.submodules.fn = fn = NextControl() - fn.valid_o, fn.ready_i, fn.data_o = fifo.readable, fifo.re, fifo.dout - connections = fn._connect_out(self.n, fn=nmoperator.cat) - - # ok ok so we can't just do the ready/valid eqs straight: - # first 2 from connections are the ready/valid, 3rd is data. - if self.fwft: - m.d.comb += connections[:2] # combinatorial on next ready/valid - else: - m.d.sync += connections[:2] # non-fwft mode needs sync - data_o = connections[2] # get the data - data_o = self._postprocess(data_o) # XXX TBD, does nothing right now - m.d.comb += data_o - - return m - - -# aka "RegStage". 
-class UnbufferedPipeline(FIFOControl): - def __init__(self, stage, in_multi=None, stage_ctl=False): - FIFOControl.__init__(self, 1, stage, in_multi, stage_ctl, - fwft=True, pipe=False) - -# aka "BreakReadyStage" XXX had to set fwft=True to get it to work -class PassThroughHandshake(FIFOControl): - def __init__(self, stage, in_multi=None, stage_ctl=False): - FIFOControl.__init__(self, 1, stage, in_multi, stage_ctl, - fwft=True, pipe=True) - -# this is *probably* BufferedHandshake, although test #997 now succeeds. -class BufferedHandshake(FIFOControl): - def __init__(self, stage, in_multi=None, stage_ctl=False): - FIFOControl.__init__(self, 2, stage, in_multi, stage_ctl, - fwft=True, pipe=False) - - -""" -# this is *probably* SimpleHandshake (note: memory cell size=0) -class SimpleHandshake(FIFOControl): - def __init__(self, stage, in_multi=None, stage_ctl=False): - FIFOControl.__init__(self, 0, stage, in_multi, stage_ctl, - fwft=True, pipe=False) -""" diff --git a/src/add/stageapi.py b/src/add/stageapi.py deleted file mode 100644 index 9651bf79..00000000 --- a/src/add/stageapi.py +++ /dev/null @@ -1,271 +0,0 @@ -""" Stage API - - Associated development bugs: - * http://bugs.libre-riscv.org/show_bug.cgi?id=64 - * http://bugs.libre-riscv.org/show_bug.cgi?id=57 - - Stage API: - --------- - - stage requires compliance with a strict API that may be - implemented in several means, including as a static class. - - Stages do not HOLD data, and they definitely do not contain - signalling (ready/valid). They do however specify the FORMAT - of the incoming and outgoing data, and they provide a means to - PROCESS that data (from incoming format to outgoing format). - - Stage Blocks really should be combinatorial blocks (Moore FSMs). 
- It would be ok to have input come in from sync'd sources - (clock-driven, Mealy FSMs) however by doing so they would no longer - be deterministic, and chaining such blocks with such side-effects - together could result in unexpected, unpredictable, unreproduceable - behaviour. - - So generally to be avoided, then unless you know what you are doing. - https://en.wikipedia.org/wiki/Moore_machine - https://en.wikipedia.org/wiki/Mealy_machine - - the methods of a stage instance must be as follows: - - * ispec() - Input data format specification. Takes a bit of explaining. - The requirements are: something that eventually derives from - nmigen Value must be returned *OR* an iterator or iterable - or sequence (list, tuple etc.) or generator must *yield* - thing(s) that (eventually) derive from the nmigen Value class. - - Complex to state, very simple in practice: - see test_buf_pipe.py for over 25 worked examples. - - * ospec() - Output data format specification. - format requirements identical to ispec. - - * process(m, i) - Optional function for processing ispec-formatted data. - returns a combinatorial block of a result that - may be assigned to the output, by way of the "nmoperator.eq" - function. Note that what is returned here can be - extremely flexible. Even a dictionary can be returned - as long as it has fields that match precisely with the - Record into which its values is intended to be assigned. - Again: see example unit tests for details. - - * setup(m, i) - Optional function for setting up submodules. - may be used for more complex stages, to link - the input (i) to submodules. must take responsibility - for adding those submodules to the module (m). - the submodules must be combinatorial blocks and - must have their inputs and output linked combinatorially. 
- - Both StageCls (for use with non-static classes) and Stage (for use - by static classes) are abstract classes from which, for convenience - and as a courtesy to other developers, anything conforming to the - Stage API may *choose* to derive. See Liskov Substitution Principle: - https://en.wikipedia.org/wiki/Liskov_substitution_principle - - StageChain: - ---------- - - A useful combinatorial wrapper around stages that chains them together - and then presents a Stage-API-conformant interface. By presenting - the same API as the stages it wraps, it can clearly be used recursively. - - StageHelper: - ---------- - - A convenience wrapper around a Stage-API-compliant "thing" which - complies with the Stage API and provides mandatory versions of - all the optional bits. -""" - -from abc import ABCMeta, abstractmethod -import inspect - -import nmoperator - - -def _spec(fn, name=None): - """ useful function that determines if "fn" has an argument "name". - if so, fn(name) is called otherwise fn() is called. - - means that ispec and ospec can be declared with *or without* - a name argument. normally it would be necessary to have - "ispec(name=None)" to achieve the same effect. - """ - if name is None: - return fn() - varnames = dict(inspect.getmembers(fn.__code__))['co_varnames'] - if 'name' in varnames: - return fn(name=name) - return fn() - - -class StageCls(metaclass=ABCMeta): - """ Class-based "Stage" API. requires instantiation (after derivation) - - see "Stage API" above.. Note: python does *not* require derivation - from this class. All that is required is that the pipelines *have* - the functions listed in this class. Derivation from this class - is therefore merely a "courtesy" to maintainers. 
- """ - @abstractmethod - def ispec(self): pass # REQUIRED - @abstractmethod - def ospec(self): pass # REQUIRED - #@abstractmethod - #def setup(self, m, i): pass # OPTIONAL - #@abstractmethod - #def process(self, i): pass # OPTIONAL - - -class Stage(metaclass=ABCMeta): - """ Static "Stage" API. does not require instantiation (after derivation) - - see "Stage API" above. Note: python does *not* require derivation - from this class. All that is required is that the pipelines *have* - the functions listed in this class. Derivation from this class - is therefore merely a "courtesy" to maintainers. - """ - @staticmethod - @abstractmethod - def ispec(): pass - - @staticmethod - @abstractmethod - def ospec(): pass - - #@staticmethod - #@abstractmethod - #def setup(m, i): pass - - #@staticmethod - #@abstractmethod - #def process(i): pass - - -class StageHelper(Stage): - """ a convenience wrapper around something that is Stage-API-compliant. - (that "something" may be a static class, for example). 
- - StageHelper happens to also be compliant with the Stage API, - it differs from the stage that it wraps in that all the "optional" - functions are provided (hence the designation "convenience wrapper") - """ - def __init__(self, stage): - self.stage = stage - self._ispecfn = None - self._ospecfn = None - if stage is not None: - self.set_specs(self, self) - - def ospec(self, name): - assert self._ospecfn is not None - return _spec(self._ospecfn, name) - - def ispec(self, name): - assert self._ispecfn is not None - return _spec(self._ispecfn, name) - - def set_specs(self, p, n): - """ sets up the ispecfn and ospecfn for getting input and output data - """ - if hasattr(p, "stage"): - p = p.stage - if hasattr(n, "stage"): - n = n.stage - self._ispecfn = p.ispec - self._ospecfn = n.ospec - - def new_specs(self, name): - """ allocates new ispec and ospec pair - """ - return (_spec(self.ispec, "%s_i" % name), - _spec(self.ospec, "%s_o" % name)) - - def process(self, i): - if self.stage and hasattr(self.stage, "process"): - return self.stage.process(i) - return i - - def setup(self, m, i): - if self.stage is not None and hasattr(self.stage, "setup"): - self.stage.setup(m, i) - - def _postprocess(self, i): # XXX DISABLED - return i # RETURNS INPUT - if hasattr(self.stage, "postprocess"): - return self.stage.postprocess(i) - return i - - -class StageChain(StageHelper): - """ pass in a list of stages, and they will automatically be - chained together via their input and output specs into a - combinatorial chain, to create one giant combinatorial block. - - the end result basically conforms to the exact same Stage API. - - * input to this class will be the input of the first stage - * output of first stage goes into input of second - * output of second goes into input into third - * ... (etc. etc.) 
- * the output of this class will be the output of the last stage - - NOTE: whilst this is very similar to ControlBase.connect(), it is - *really* important to appreciate that StageChain is pure - combinatorial and bypasses (does not involve, at all, ready/valid - signalling of any kind). - - ControlBase.connect on the other hand respects, connects, and uses - ready/valid signalling. - - Arguments: - - * :chain: a chain of combinatorial blocks conforming to the Stage API - NOTE: StageChain.ispec and ospect have to have something - to return (beginning and end specs of the chain), - therefore the chain argument must be non-zero length - - * :specallocate: if set, new input and output data will be allocated - and connected (eq'd) to each chained Stage. - in some cases if this is not done, the nmigen warning - "driving from two sources, module is being flattened" - will be issued. - - NOTE: do NOT use StageChain with combinatorial blocks that have - side-effects (state-based / clock-based input) or conditional - (inter-chain) dependencies, unless you really know what you are doing. 
- """ - def __init__(self, chain, specallocate=False): - assert len(chain) > 0, "stage chain must be non-zero length" - self.chain = chain - StageHelper.__init__(self, None) - self.setup = self._sa_setup if specallocate else self._na_setup - self.set_specs(self.chain[0], self.chain[-1]) - - def _sa_setup(self, m, i): - for (idx, c) in enumerate(self.chain): - if hasattr(c, "setup"): - c.setup(m, i) # stage may have some module stuff - ofn = self.chain[idx].ospec # last assignment survives - o = _spec(ofn, 'chainin%d' % idx) - m.d.comb += nmoperator.eq(o, c.process(i)) # process input into "o" - if idx == len(self.chain)-1: - break - ifn = self.chain[idx+1].ispec # new input on next loop - i = _spec(ifn, 'chainin%d' % (idx+1)) - m.d.comb += nmoperator.eq(i, o) # assign to next input - self.o = o - return self.o # last loop is the output - - def _na_setup(self, m, i): - for (idx, c) in enumerate(self.chain): - if hasattr(c, "setup"): - c.setup(m, i) # stage may have some module stuff - i = o = c.process(i) # store input into "o" - self.o = o - return self.o # last loop is the output - - def process(self, i): - return self.o # conform to Stage API: return last-loop output - - diff --git a/src/add/test_add.py b/src/add/test_add.py deleted file mode 100644 index 989cf482..00000000 --- a/src/add/test_add.py +++ /dev/null @@ -1,78 +0,0 @@ -from operator import add - -from nmigen import Module, Signal -from nmigen.compat.sim import run_simulation - -from nmigen_add_experiment import FPADD - -from unit_test_single import (get_mantissa, get_exponent, get_sign, is_nan, - is_inf, is_pos_inf, is_neg_inf, - match, get_rs_case, check_rs_case, run_test, - run_edge_cases, run_corner_cases) - -def testbench(dut): - yield from check_rs_case(dut, 0x36093399, 0x7f6a12f1, 0x7f6a12f1) - yield from check_rs_case(dut, 0x006CE3EE, 0x806CE3EC, 0x00000002) - yield from check_rs_case(dut, 0x00000047, 0x80000048, 0x80000001) - yield from check_rs_case(dut, 0x000116C2, 0x8001170A, 0x80000048) - 
yield from check_rs_case(dut, 0x7ed01f25, 0xff559e2c, 0xfedb1d33) - yield from check_rs_case(dut, 0, 0, 0) - yield from check_rs_case(dut, 0xFFFFFFFF, 0xC63B800A, 0x7FC00000) - yield from check_rs_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) - #yield from check_rs_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) - yield from check_rs_case(dut, 0x7F800000, 0xFF800000, 0x7FC00000) - yield from check_rs_case(dut, 0x42540000, 0xC2540000, 0x00000000) - yield from check_rs_case(dut, 0xC2540000, 0x42540000, 0x00000000) - yield from check_rs_case(dut, 0xfe34f995, 0xff5d59ad, 0xff800000) - yield from check_rs_case(dut, 0x82471f51, 0x243985f, 0x801c3790) - yield from check_rs_case(dut, 0x40000000, 0xc0000000, 0x00000000) - yield from check_rs_case(dut, 0x3F800000, 0x40000000, 0x40400000) - yield from check_rs_case(dut, 0x40000000, 0x3F800000, 0x40400000) - yield from check_rs_case(dut, 0x447A0000, 0x4488B000, 0x4502D800) - yield from check_rs_case(dut, 0x463B800A, 0x42BA8A3D, 0x463CF51E) - yield from check_rs_case(dut, 0x42BA8A3D, 0x463B800A, 0x463CF51E) - yield from check_rs_case(dut, 0x463B800A, 0xC2BA8A3D, 0x463A0AF6) - yield from check_rs_case(dut, 0xC2BA8A3D, 0x463B800A, 0x463A0AF6) - yield from check_rs_case(dut, 0xC63B800A, 0x42BA8A3D, 0xC63A0AF6) - yield from check_rs_case(dut, 0x42BA8A3D, 0xC63B800A, 0xC63A0AF6) - yield from check_rs_case(dut, 0x7F800000, 0x00000000, 0x7F800000) - yield from check_rs_case(dut, 0x00000000, 0x7F800000, 0x7F800000) - yield from check_rs_case(dut, 0xFF800000, 0x00000000, 0xFF800000) - yield from check_rs_case(dut, 0x00000000, 0xFF800000, 0xFF800000) - yield from check_rs_case(dut, 0x7F800000, 0x7F800000, 0x7F800000) - yield from check_rs_case(dut, 0xFF800000, 0xFF800000, 0xFF800000) - yield from check_rs_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) - yield from check_rs_case(dut, 0x00018643, 0x00FA72A4, 0x00FBF8E7) - yield from check_rs_case(dut, 0x001A2239, 0x00FA72A4, 0x010A4A6E) - yield from check_rs_case(dut, 0x3F7FFFFE, 0x3F7FFFFE, 
0x3FFFFFFE) - yield from check_rs_case(dut, 0x7EFFFFEE, 0x7EFFFFEE, 0x7F7FFFEE) - yield from check_rs_case(dut, 0x7F7FFFEE, 0xFEFFFFEE, 0x7EFFFFEE) - yield from check_rs_case(dut, 0x7F7FFFEE, 0x756CA884, 0x7F7FFFFD) - yield from check_rs_case(dut, 0x7F7FFFEE, 0x758A0CF8, 0x7F7FFFFF) - yield from check_rs_case(dut, 0x42500000, 0x51A7A358, 0x51A7A358) - yield from check_rs_case(dut, 0x51A7A358, 0x42500000, 0x51A7A358) - yield from check_rs_case(dut, 0x4E5693A4, 0x42500000, 0x4E5693A5) - yield from check_rs_case(dut, 0x42500000, 0x4E5693A4, 0x4E5693A5) - #yield from check_rs_case(dut, 1, 0, 1) - #yield from check_rs_case(dut, 1, 1, 1) - - count = 0 - - #regression tests - stimulus_a = [0x80000000, 0x22cb525a, 0x40000000, 0x83e73d5c, - 0xbf9b1e94, 0x34082401, - 0x5e8ef81, 0x5c75da81, 0x2b017] - stimulus_b = [0xff800001, 0xadd79efa, 0xC0000000, 0x1c800000, - 0xc038ed3a, 0xb328cd45, - 0x114f3db, 0x2f642a39, 0xff3807ab] - yield from run_test(dut, stimulus_a, stimulus_b, add, get_rs_case) - count += len(stimulus_a) - print (count, "vectors passed") - - yield from run_corner_cases(dut, count, add, get_rs_case) - yield from run_edge_cases(dut, count, add, get_rs_case) - -if __name__ == '__main__': - dut = FPADD(width=32, id_wid=5, single_cycle=True) - run_simulation(dut, testbench(dut), vcd_name="test_add.vcd") - diff --git a/src/add/test_add16.py b/src/add/test_add16.py deleted file mode 100644 index f39ae8ae..00000000 --- a/src/add/test_add16.py +++ /dev/null @@ -1,44 +0,0 @@ -from operator import add - -from nmigen import Module, Signal -from nmigen.compat.sim import run_simulation - -from nmigen_add_experiment import FPADD - -from unit_test_half import (get_mantissa, get_exponent, get_sign, is_nan, - is_inf, is_pos_inf, is_neg_inf, - match, get_case, check_case, run_test, - run_edge_cases, run_corner_cases) - -def testbench(dut): - #yield from check_case(dut, 0x7800, 0xff6f, 0xff6f) - #yield from check_case(dut, 0x0000, 0x7c32, 0x7e32) - #yield from check_case(dut, 
0x0000, 0x7da9, 0x7fa9) - #yield from check_case(dut, 0x0000, 0x7ea0, 0x7ea0) - #yield from check_case(dut, 0x7c9a, 0x8000, 0x7e9a) - #yield from check_case(dut, 0x7d5e, 0x0000, 0x7f5e) - #yield from check_case(dut, 0x8000, 0x7c8c, 0x7e8c) - #yield from check_case(dut, 0x8000, 0xfc55, 0xfe55) - #yield from check_case(dut, 0x8000, 0x7e1a, 0x7e1a) - - #yield from check_case(dut, 0x8000, 0xfc01, 0x7e00) - yield from check_case(dut, 0xfc00, 0x7c00, 0x7e00) - yield from check_case(dut, 0x8000, 0, 0) - yield from check_case(dut, 0, 0, 0) - - count = 0 - - #regression tests - stimulus_a = [ 0x8000, 0x8000 ] - stimulus_b = [ 0x0000, 0xfc01 ] - yield from run_test(dut, stimulus_a, stimulus_b, add) - count += len(stimulus_a) - print (count, "vectors passed") - - yield from run_corner_cases(dut, count, add) - yield from run_edge_cases(dut, count, add) - -if __name__ == '__main__': - dut = FPADD(width=16, single_cycle=True) - run_simulation(dut, testbench(dut), vcd_name="test_add16.vcd") - diff --git a/src/add/test_add64.py b/src/add/test_add64.py deleted file mode 100644 index dcca12c6..00000000 --- a/src/add/test_add64.py +++ /dev/null @@ -1,45 +0,0 @@ -from nmigen import Module, Signal -from nmigen.compat.sim import run_simulation -from operator import add - -from nmigen_add_experiment import FPADD - -import sys -import atexit -from random import randint -from random import seed - -from unit_test_double import (get_mantissa, get_exponent, get_sign, is_nan, - is_inf, is_pos_inf, is_neg_inf, - match, get_case, check_case, run_test, - run_edge_cases, run_corner_cases) - - -def testbench(dut): - yield from check_case(dut, 0, 0, 0) - yield from check_case(dut, 0x3FF0000000000000, 0x4000000000000000, - 0x4008000000000000) - yield from check_case(dut, 0x4000000000000000, 0x3FF0000000000000, - 0x4008000000000000) - yield from check_case(dut, 0x4056C00000000000, 0x4042800000000000, - 0x4060000000000000) - yield from check_case(dut, 0x4056C00000000000, 0x4042EA3D70A3D70A, - 
0x40601A8F5C28F5C2) - - count = 0 - - #regression tests - stimulus_a = [0x3ff00000000000c5, 0xff80000000000000] - stimulus_b = [0xbd28a404211fb72b, 0x7f80000000000000] - yield from run_test(dut, stimulus_a, stimulus_b, add) - count += len(stimulus_a) - print (count, "vectors passed") - - yield from run_corner_cases(dut, count, add) - yield from run_edge_cases(dut, count, add) - - -if __name__ == '__main__': - dut = FPADD(width=64, single_cycle=False) - run_simulation(dut, testbench(dut), vcd_name="test_add64.vcd") - diff --git a/src/add/test_add_base.py b/src/add/test_add_base.py deleted file mode 100644 index 248f719a..00000000 --- a/src/add/test_add_base.py +++ /dev/null @@ -1,94 +0,0 @@ -from random import randint -from operator import add - -from nmigen import Module, Signal -from nmigen.compat.sim import run_simulation - -from nmigen_add_experiment import FPADDBase, FPADDBaseMod - -def get_case(dut, a, b, mid): - yield dut.in_mid.eq(mid) - yield dut.in_a.eq(a) - yield dut.in_b.eq(b) - yield dut.in_t.stb.eq(1) - yield - yield - yield - yield - ack = (yield dut.in_t.ack) - assert ack == 0 - - yield dut.in_t.stb.eq(0) - - yield dut.out_z.ack.eq(1) - - while True: - out_z_stb = (yield dut.out_z.stb) - if not out_z_stb: - yield - continue - out_z = yield dut.out_z.v - out_mid = yield dut.out_mid - yield dut.out_z.ack.eq(0) - yield - break - - return out_z, out_mid - -def check_case(dut, a, b, z, mid=None): - if mid is None: - mid = randint(0, 6) - out_z, out_mid = yield from get_case(dut, a, b, mid) - assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) - assert out_mid == mid, "Output mid 0x%x != expected 0x%x" % (out_mid, mid) - - - -def testbench(dut): - yield from check_case(dut, 0x36093399, 0x7f6a12f1, 0x7f6a12f1) - yield from check_case(dut, 0x006CE3EE, 0x806CE3EC, 0x00000002) - yield from check_case(dut, 0x00000047, 0x80000048, 0x80000001) - yield from check_case(dut, 0x000116C2, 0x8001170A, 0x80000048) - yield from check_case(dut, 
0x7ed01f25, 0xff559e2c, 0xfedb1d33) - yield from check_case(dut, 0, 0, 0) - yield from check_case(dut, 0xFFFFFFFF, 0xC63B800A, 0x7FC00000) - yield from check_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) - #yield from check_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) - yield from check_case(dut, 0x7F800000, 0xFF800000, 0x7FC00000) - yield from check_case(dut, 0x42540000, 0xC2540000, 0x00000000) - yield from check_case(dut, 0xC2540000, 0x42540000, 0x00000000) - yield from check_case(dut, 0xfe34f995, 0xff5d59ad, 0xff800000) - yield from check_case(dut, 0x82471f51, 0x243985f, 0x801c3790) - yield from check_case(dut, 0x40000000, 0xc0000000, 0x00000000) - yield from check_case(dut, 0x3F800000, 0x40000000, 0x40400000) - yield from check_case(dut, 0x40000000, 0x3F800000, 0x40400000) - yield from check_case(dut, 0x447A0000, 0x4488B000, 0x4502D800) - yield from check_case(dut, 0x463B800A, 0x42BA8A3D, 0x463CF51E) - yield from check_case(dut, 0x42BA8A3D, 0x463B800A, 0x463CF51E) - yield from check_case(dut, 0x463B800A, 0xC2BA8A3D, 0x463A0AF6) - yield from check_case(dut, 0xC2BA8A3D, 0x463B800A, 0x463A0AF6) - yield from check_case(dut, 0xC63B800A, 0x42BA8A3D, 0xC63A0AF6) - yield from check_case(dut, 0x42BA8A3D, 0xC63B800A, 0xC63A0AF6) - yield from check_case(dut, 0x7F800000, 0x00000000, 0x7F800000) - yield from check_case(dut, 0x00000000, 0x7F800000, 0x7F800000) - yield from check_case(dut, 0xFF800000, 0x00000000, 0xFF800000) - yield from check_case(dut, 0x00000000, 0xFF800000, 0xFF800000) - yield from check_case(dut, 0x7F800000, 0x7F800000, 0x7F800000) - yield from check_case(dut, 0xFF800000, 0xFF800000, 0xFF800000) - yield from check_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) - yield from check_case(dut, 0x00018643, 0x00FA72A4, 0x00FBF8E7) - yield from check_case(dut, 0x001A2239, 0x00FA72A4, 0x010A4A6E) - yield from check_case(dut, 0x3F7FFFFE, 0x3F7FFFFE, 0x3FFFFFFE) - yield from check_case(dut, 0x7EFFFFEE, 0x7EFFFFEE, 0x7F7FFFEE) - yield from check_case(dut, 0x7F7FFFEE, 
0xFEFFFFEE, 0x7EFFFFEE) - yield from check_case(dut, 0x7F7FFFEE, 0x756CA884, 0x7F7FFFFD) - yield from check_case(dut, 0x7F7FFFEE, 0x758A0CF8, 0x7F7FFFFF) - yield from check_case(dut, 0x42500000, 0x51A7A358, 0x51A7A358) - yield from check_case(dut, 0x51A7A358, 0x42500000, 0x51A7A358) - yield from check_case(dut, 0x4E5693A4, 0x42500000, 0x4E5693A5) - yield from check_case(dut, 0x42500000, 0x4E5693A4, 0x4E5693A5) - -if __name__ == '__main__': - dut = FPADDBaseMod(width=32, id_wid=5, single_cycle=True) - run_simulation(dut, testbench(dut), vcd_name="test_add.vcd") - diff --git a/src/add/test_buf_pipe.py b/src/add/test_buf_pipe.py deleted file mode 100644 index 37f2b31f..00000000 --- a/src/add/test_buf_pipe.py +++ /dev/null @@ -1,1308 +0,0 @@ -""" Unit tests for Buffered and Unbuffered pipelines - - contains useful worked examples of how to use the Pipeline API, - including: - - * Combinatorial Stage "Chaining" - * class-based data stages - * nmigen module-based data stages - * special nmigen module-based data stage, where the stage *is* the module - * Record-based data stages - * static-class data stages - * multi-stage pipelines (and how to connect them) - * how to *use* the pipelines (see Test5) - how to get data in and out - -""" - -from nmigen import Module, Signal, Mux, Const, Elaboratable -from nmigen.hdl.rec import Record -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil - -from example_buf_pipe import ExampleBufPipe, ExampleBufPipeAdd -from example_buf_pipe import ExamplePipeline, UnbufferedPipeline -from example_buf_pipe import ExampleStageCls -from example_buf_pipe import PrevControl, NextControl, BufferedHandshake -from example_buf_pipe import StageChain, ControlBase, StageCls -from singlepipe import UnbufferedPipeline2 -from singlepipe import SimpleHandshake -from singlepipe import PassThroughHandshake -from singlepipe import PassThroughStage -from singlepipe import FIFOControl -from singlepipe import RecordObject - -from 
random import randint, seed - -#seed(4) - - -def check_o_n_valid(dut, val): - o_n_valid = yield dut.n.valid_o - assert o_n_valid == val - -def check_o_n_valid2(dut, val): - o_n_valid = yield dut.n.valid_o - assert o_n_valid == val - - -def tbench(dut): - #yield dut.i_p_rst.eq(1) - yield dut.n.ready_i.eq(0) - #yield dut.p.ready_o.eq(0) - yield - yield - #yield dut.i_p_rst.eq(0) - yield dut.n.ready_i.eq(1) - yield dut.p.data_i.eq(5) - yield dut.p.valid_i.eq(1) - yield - - yield dut.p.data_i.eq(7) - yield from check_o_n_valid(dut, 0) # effects of i_p_valid delayed - yield - yield from check_o_n_valid(dut, 1) # ok *now* i_p_valid effect is felt - - yield dut.p.data_i.eq(2) - yield - yield dut.n.ready_i.eq(0) # begin going into "stall" (next stage says ready) - yield dut.p.data_i.eq(9) - yield - yield dut.p.valid_i.eq(0) - yield dut.p.data_i.eq(12) - yield - yield dut.p.data_i.eq(32) - yield dut.n.ready_i.eq(1) - yield - yield from check_o_n_valid(dut, 1) # buffer still needs to output - yield - yield from check_o_n_valid(dut, 1) # buffer still needs to output - yield - yield from check_o_n_valid(dut, 0) # buffer outputted, *now* we're done. 
- yield - - -def tbench2(dut): - #yield dut.p.i_rst.eq(1) - yield dut.n.ready_i.eq(0) - #yield dut.p.ready_o.eq(0) - yield - yield - #yield dut.p.i_rst.eq(0) - yield dut.n.ready_i.eq(1) - yield dut.p.data_i.eq(5) - yield dut.p.valid_i.eq(1) - yield - - yield dut.p.data_i.eq(7) - yield from check_o_n_valid2(dut, 0) # effects of i_p_valid delayed 2 clocks - yield - yield from check_o_n_valid2(dut, 0) # effects of i_p_valid delayed 2 clocks - - yield dut.p.data_i.eq(2) - yield - yield from check_o_n_valid2(dut, 1) # ok *now* i_p_valid effect is felt - yield dut.n.ready_i.eq(0) # begin going into "stall" (next stage says ready) - yield dut.p.data_i.eq(9) - yield - yield dut.p.valid_i.eq(0) - yield dut.p.data_i.eq(12) - yield - yield dut.p.data_i.eq(32) - yield dut.n.ready_i.eq(1) - yield - yield from check_o_n_valid2(dut, 1) # buffer still needs to output - yield - yield from check_o_n_valid2(dut, 1) # buffer still needs to output - yield - yield from check_o_n_valid2(dut, 1) # buffer still needs to output - yield - yield from check_o_n_valid2(dut, 0) # buffer outputted, *now* we're done. 
- yield - yield - yield - - -class Test3: - def __init__(self, dut, resultfn): - self.dut = dut - self.resultfn = resultfn - self.data = [] - for i in range(num_tests): - #data.append(randint(0, 1<<16-1)) - self.data.append(i+1) - self.i = 0 - self.o = 0 - - def send(self): - while self.o != len(self.data): - send_range = randint(0, 3) - for j in range(randint(1,10)): - if send_range == 0: - send = True - else: - send = randint(0, send_range) != 0 - o_p_ready = yield self.dut.p.ready_o - if not o_p_ready: - yield - continue - if send and self.i != len(self.data): - yield self.dut.p.valid_i.eq(1) - yield self.dut.p.data_i.eq(self.data[self.i]) - self.i += 1 - else: - yield self.dut.p.valid_i.eq(0) - yield - - def rcv(self): - while self.o != len(self.data): - stall_range = randint(0, 3) - for j in range(randint(1,10)): - stall = randint(0, stall_range) != 0 - yield self.dut.n.ready_i.eq(stall) - yield - o_n_valid = yield self.dut.n.valid_o - i_n_ready = yield self.dut.n.ready_i_test - if not o_n_valid or not i_n_ready: - continue - data_o = yield self.dut.n.data_o - self.resultfn(data_o, self.data[self.o], self.i, self.o) - self.o += 1 - if self.o == len(self.data): - break - -def resultfn_3(data_o, expected, i, o): - assert data_o == expected + 1, \ - "%d-%d data %x not match %x\n" \ - % (i, o, data_o, expected) - -def data_placeholder(): - data = [] - for i in range(num_tests): - d = PlaceHolder() - d.src1 = randint(0, 1<<16-1) - d.src2 = randint(0, 1<<16-1) - data.append(d) - return data - -def data_dict(): - data = [] - for i in range(num_tests): - data.append({'src1': randint(0, 1<<16-1), - 'src2': randint(0, 1<<16-1)}) - return data - - -class Test5: - def __init__(self, dut, resultfn, data=None, stage_ctl=False): - self.dut = dut - self.resultfn = resultfn - self.stage_ctl = stage_ctl - if data: - self.data = data - else: - self.data = [] - for i in range(num_tests): - self.data.append((randint(0, 1<<16-1), randint(0, 1<<16-1))) - self.i = 0 - self.o = 0 - - 
def send(self): - while self.o != len(self.data): - send_range = randint(0, 3) - for j in range(randint(1,10)): - if send_range == 0: - send = True - else: - send = randint(0, send_range) != 0 - #send = True - o_p_ready = yield self.dut.p.ready_o - if not o_p_ready: - yield - continue - if send and self.i != len(self.data): - yield self.dut.p.valid_i.eq(1) - for v in self.dut.set_input(self.data[self.i]): - yield v - self.i += 1 - else: - yield self.dut.p.valid_i.eq(0) - yield - - def rcv(self): - while self.o != len(self.data): - stall_range = randint(0, 3) - for j in range(randint(1,10)): - ready = randint(0, stall_range) != 0 - #ready = True - yield self.dut.n.ready_i.eq(ready) - yield - o_n_valid = yield self.dut.n.valid_o - i_n_ready = yield self.dut.n.ready_i_test - if not o_n_valid or not i_n_ready: - continue - if isinstance(self.dut.n.data_o, Record): - data_o = {} - dod = self.dut.n.data_o - for k, v in dod.fields.items(): - data_o[k] = yield v - else: - data_o = yield self.dut.n.data_o - self.resultfn(data_o, self.data[self.o], self.i, self.o) - self.o += 1 - if self.o == len(self.data): - break - -def resultfn_5(data_o, expected, i, o): - res = expected[0] + expected[1] - assert data_o == res, \ - "%d-%d data %x not match %s\n" \ - % (i, o, data_o, repr(expected)) - -def tbench4(dut): - data = [] - for i in range(num_tests): - #data.append(randint(0, 1<<16-1)) - data.append(i+1) - i = 0 - o = 0 - while True: - stall = randint(0, 3) != 0 - send = randint(0, 5) != 0 - yield dut.n.ready_i.eq(stall) - o_p_ready = yield dut.p.ready_o - if o_p_ready: - if send and i != len(data): - yield dut.p.valid_i.eq(1) - yield dut.p.data_i.eq(data[i]) - i += 1 - else: - yield dut.p.valid_i.eq(0) - yield - o_n_valid = yield dut.n.valid_o - i_n_ready = yield dut.n.ready_i_test - if o_n_valid and i_n_ready: - data_o = yield dut.n.data_o - assert data_o == data[o] + 2, "%d-%d data %x not match %x\n" \ - % (i, o, data_o, data[o]) - o += 1 - if o == len(data): - break - 
-###################################################################### -# Test 2 and 4 -###################################################################### - -class ExampleBufPipe2(ControlBase): - """ Example of how to do chained pipeline stages. - """ - - def elaborate(self, platform): - m = ControlBase.elaborate(self, platform) - - pipe1 = ExampleBufPipe() - pipe2 = ExampleBufPipe() - - m.submodules.pipe1 = pipe1 - m.submodules.pipe2 = pipe2 - - m.d.comb += self.connect([pipe1, pipe2]) - - return m - - -###################################################################### -# Test 9 -###################################################################### - -class ExampleBufPipeChain2(BufferedHandshake): - """ connects two stages together as a *single* combinatorial stage. - """ - def __init__(self): - stage1 = ExampleStageCls() - stage2 = ExampleStageCls() - combined = StageChain([stage1, stage2]) - BufferedHandshake.__init__(self, combined) - - -def data_chain2(): - data = [] - for i in range(num_tests): - data.append(randint(0, 1<<16-2)) - return data - - -def resultfn_9(data_o, expected, i, o): - res = expected + 2 - assert data_o == res, \ - "%d-%d received data %x not match expected %x\n" \ - % (i, o, data_o, res) - - -###################################################################### -# Test 6 and 10 -###################################################################### - -class SetLessThan(Elaboratable): - def __init__(self, width, signed): - self.m = Module() - self.src1 = Signal((width, signed), name="src1") - self.src2 = Signal((width, signed), name="src2") - self.output = Signal(width, name="out") - - def elaborate(self, platform): - self.m.d.comb += self.output.eq(Mux(self.src1 < self.src2, 1, 0)) - return self.m - - -class LTStage(StageCls): - """ module-based stage example - """ - def __init__(self): - self.slt = SetLessThan(16, True) - - def ispec(self, name): - return (Signal(16, name="%s_sig1" % name), - Signal(16, name="%s_sig2" % name)) 
- - def ospec(self, name): - return Signal(16, "%s_out" % name) - - def setup(self, m, i): - self.o = Signal(16) - m.submodules.slt = self.slt - m.d.comb += self.slt.src1.eq(i[0]) - m.d.comb += self.slt.src2.eq(i[1]) - m.d.comb += self.o.eq(self.slt.output) - - def process(self, i): - return self.o - - -class LTStageDerived(SetLessThan, StageCls): - """ special version of a nmigen module where the module is also a stage - - shows that you don't actually need to combinatorially connect - to the outputs, or add the module as a submodule: just return - the module output parameter(s) from the Stage.process() function - """ - - def __init__(self): - SetLessThan.__init__(self, 16, True) - - def ispec(self): - return (Signal(16), Signal(16)) - - def ospec(self): - return Signal(16) - - def setup(self, m, i): - m.submodules.slt = self - m.d.comb += self.src1.eq(i[0]) - m.d.comb += self.src2.eq(i[1]) - - def process(self, i): - return self.output - - -class ExampleLTPipeline(UnbufferedPipeline): - """ an example of how to use the unbuffered pipeline. - """ - - def __init__(self): - stage = LTStage() - UnbufferedPipeline.__init__(self, stage) - - -class ExampleLTBufferedPipeDerived(BufferedHandshake): - """ an example of how to use the buffered pipeline. 
- """ - - def __init__(self): - stage = LTStageDerived() - BufferedHandshake.__init__(self, stage) - - -def resultfn_6(data_o, expected, i, o): - res = 1 if expected[0] < expected[1] else 0 - assert data_o == res, \ - "%d-%d data %x not match %s\n" \ - % (i, o, data_o, repr(expected)) - - -###################################################################### -# Test 7 -###################################################################### - -class ExampleAddRecordStage(StageCls): - """ example use of a Record - """ - - record_spec = [('src1', 16), ('src2', 16)] - def ispec(self): - """ returns a Record using the specification - """ - return Record(self.record_spec) - - def ospec(self): - return Record(self.record_spec) - - def process(self, i): - """ process the input data, returning a dictionary with key names - that exactly match the Record's attributes. - """ - return {'src1': i.src1 + 1, - 'src2': i.src2 + 1} - -###################################################################### -# Test 11 -###################################################################### - -class ExampleAddRecordPlaceHolderStage(StageCls): - """ example use of a Record, with a placeholder as the processing result - """ - - record_spec = [('src1', 16), ('src2', 16)] - def ispec(self): - """ returns a Record using the specification - """ - return Record(self.record_spec) - - def ospec(self): - return Record(self.record_spec) - - def process(self, i): - """ process the input data, returning a PlaceHolder class instance - with attributes that exactly match those of the Record. - """ - o = PlaceHolder() - o.src1 = i.src1 + 1 - o.src2 = i.src2 + 1 - return o - - -# a dummy class that may have stuff assigned to instances once created -class PlaceHolder: pass - - -class ExampleAddRecordPipe(UnbufferedPipeline): - """ an example of how to use the combinatorial pipeline. 
- """ - - def __init__(self): - stage = ExampleAddRecordStage() - UnbufferedPipeline.__init__(self, stage) - - -def resultfn_7(data_o, expected, i, o): - res = (expected['src1'] + 1, expected['src2'] + 1) - assert data_o['src1'] == res[0] and data_o['src2'] == res[1], \ - "%d-%d data %s not match %s\n" \ - % (i, o, repr(data_o), repr(expected)) - - -class ExampleAddRecordPlaceHolderPipe(UnbufferedPipeline): - """ an example of how to use the combinatorial pipeline. - """ - - def __init__(self): - stage = ExampleAddRecordPlaceHolderStage() - UnbufferedPipeline.__init__(self, stage) - - -def resultfn_test11(data_o, expected, i, o): - res1 = expected.src1 + 1 - res2 = expected.src2 + 1 - assert data_o['src1'] == res1 and data_o['src2'] == res2, \ - "%d-%d data %s not match %s\n" \ - % (i, o, repr(data_o), repr(expected)) - - -###################################################################### -# Test 8 -###################################################################### - - -class Example2OpClass: - """ an example of a class used to store 2 operands. - requires an eq function, to conform with the pipeline stage API - """ - - def __init__(self): - self.op1 = Signal(16) - self.op2 = Signal(16) - - def eq(self, i): - return [self.op1.eq(i.op1), self.op2.eq(i.op2)] - - -class ExampleAddClassStage(StageCls): - """ an example of how to use the buffered pipeline, as a class instance - """ - - def ispec(self): - """ returns an instance of an Example2OpClass. 
- """ - return Example2OpClass() - - def ospec(self): - """ returns an output signal which will happen to contain the sum - of the two inputs - """ - return Signal(16, name="add2_out") - - def process(self, i): - """ process the input data (sums the values in the tuple) and returns it - """ - return i.op1 + i.op2 - - -class ExampleBufPipeAddClass(BufferedHandshake): - """ an example of how to use the buffered pipeline, using a class instance - """ - - def __init__(self): - addstage = ExampleAddClassStage() - BufferedHandshake.__init__(self, addstage) - - -class TestInputAdd: - """ the eq function, called by set_input, needs an incoming object - that conforms to the Example2OpClass.eq function requirements - easiest way to do that is to create a class that has the exact - same member layout (self.op1, self.op2) as Example2OpClass - """ - def __init__(self, op1, op2): - self.op1 = op1 - self.op2 = op2 - - -def resultfn_8(data_o, expected, i, o): - res = expected.op1 + expected.op2 # these are a TestInputAdd instance - assert data_o == res, \ - "%d-%d data %s res %x not match %s\n" \ - % (i, o, repr(data_o), res, repr(expected)) - -def data_2op(): - data = [] - for i in range(num_tests): - data.append(TestInputAdd(randint(0, 1<<16-1), randint(0, 1<<16-1))) - return data - - -###################################################################### -# Test 12 -###################################################################### - -class ExampleStageDelayCls(StageCls, Elaboratable): - """ an example of how to use the buffered pipeline, in a static class - fashion - """ - - def __init__(self, valid_trigger=2): - self.count = Signal(2) - self.valid_trigger = valid_trigger - - def ispec(self): - return Signal(16, name="example_input_signal") - - def ospec(self): - return Signal(16, name="example_output_signal") - - @property - def d_ready(self): - """ data is ready to be accepted when this is true - """ - return (self.count == 1)# | (self.count == 3) - return Const(1) - - 
def d_valid(self, ready_i): - """ data is valid at output when this is true - """ - return self.count == self.valid_trigger - return Const(1) - - def process(self, i): - """ process the input data and returns it (adds 1) - """ - return i + 1 - - def elaborate(self, platform): - m = Module() - m.d.sync += self.count.eq(self.count + 1) - return m - - -class ExampleBufDelayedPipe(BufferedHandshake): - - def __init__(self): - stage = ExampleStageDelayCls(valid_trigger=2) - BufferedHandshake.__init__(self, stage, stage_ctl=True) - - def elaborate(self, platform): - m = BufferedHandshake.elaborate(self, platform) - m.submodules.stage = self.stage - return m - - -def data_chain1(): - data = [] - for i in range(num_tests): - data.append(1<<((i*3)%15)) - #data.append(randint(0, 1<<16-2)) - #print (hex(data[-1])) - return data - - -def resultfn_12(data_o, expected, i, o): - res = expected + 1 - assert data_o == res, \ - "%d-%d data %x not match %x\n" \ - % (i, o, data_o, res) - - -###################################################################### -# Test 13 -###################################################################### - -class ExampleUnBufDelayedPipe(BufferedHandshake): - - def __init__(self): - stage = ExampleStageDelayCls(valid_trigger=3) - BufferedHandshake.__init__(self, stage, stage_ctl=True) - - def elaborate(self, platform): - m = BufferedHandshake.elaborate(self, platform) - m.submodules.stage = self.stage - return m - -###################################################################### -# Test 15 -###################################################################### - -class ExampleBufModeAdd1Pipe(SimpleHandshake): - - def __init__(self): - stage = ExampleStageCls() - SimpleHandshake.__init__(self, stage) - - -###################################################################### -# Test 16 -###################################################################### - -class ExampleBufModeUnBufPipe(ControlBase): - - def elaborate(self, platform): - m = 
ControlBase.elaborate(self, platform) - - pipe1 = ExampleBufModeAdd1Pipe() - pipe2 = ExampleBufAdd1Pipe() - - m.submodules.pipe1 = pipe1 - m.submodules.pipe2 = pipe2 - - m.d.comb += self.connect([pipe1, pipe2]) - - return m - -###################################################################### -# Test 17 -###################################################################### - -class ExampleUnBufAdd1Pipe2(UnbufferedPipeline2): - - def __init__(self): - stage = ExampleStageCls() - UnbufferedPipeline2.__init__(self, stage) - - -###################################################################### -# Test 18 -###################################################################### - -class PassThroughTest(PassThroughHandshake): - - def iospecfn(self): - return Signal(16, "out") - - def __init__(self): - stage = PassThroughStage(self.iospecfn) - PassThroughHandshake.__init__(self, stage) - -def resultfn_identical(data_o, expected, i, o): - res = expected - assert data_o == res, \ - "%d-%d data %x not match %x\n" \ - % (i, o, data_o, res) - - -###################################################################### -# Test 19 -###################################################################### - -class ExamplePassAdd1Pipe(PassThroughHandshake): - - def __init__(self): - stage = ExampleStageCls() - PassThroughHandshake.__init__(self, stage) - - -class ExampleBufPassThruPipe(ControlBase): - - def elaborate(self, platform): - m = ControlBase.elaborate(self, platform) - - # XXX currently fails: any other permutation works fine. 
- # p1=u,p2=b ok p1=u,p2=u ok p1=b,p2=b ok - # also fails using UnbufferedPipeline as well - pipe1 = ExampleBufModeAdd1Pipe() - pipe2 = ExamplePassAdd1Pipe() - - m.submodules.pipe1 = pipe1 - m.submodules.pipe2 = pipe2 - - m.d.comb += self.connect([pipe1, pipe2]) - - return m - - -###################################################################### -# Test 20 -###################################################################### - -def iospecfn(): - return Signal(16, name="d_in") - -class FIFOTest16(FIFOControl): - - def __init__(self): - stage = PassThroughStage(iospecfn) - FIFOControl.__init__(self, 2, stage) - - -###################################################################### -# Test 21 -###################################################################### - -class ExampleFIFOPassThruPipe1(ControlBase): - - def elaborate(self, platform): - m = ControlBase.elaborate(self, platform) - - pipe1 = FIFOTest16() - pipe2 = FIFOTest16() - pipe3 = ExamplePassAdd1Pipe() - - m.submodules.pipe1 = pipe1 - m.submodules.pipe2 = pipe2 - m.submodules.pipe3 = pipe3 - - m.d.comb += self.connect([pipe1, pipe2, pipe3]) - - return m - - -###################################################################### -# Test 22 -###################################################################### - -class Example2OpRecord(RecordObject): - def __init__(self): - RecordObject.__init__(self) - self.op1 = Signal(16) - self.op2 = Signal(16) - - -class ExampleAddRecordObjectStage(StageCls): - - def ispec(self): - """ returns an instance of an Example2OpRecord. 
- """ - return Example2OpRecord() - - def ospec(self): - """ returns an output signal which will happen to contain the sum - of the two inputs - """ - return Signal(16) - - def process(self, i): - """ process the input data (sums the values in the tuple) and returns it - """ - return i.op1 + i.op2 - - -class ExampleRecordHandshakeAddClass(SimpleHandshake): - - def __init__(self): - addstage = ExampleAddRecordObjectStage() - SimpleHandshake.__init__(self, stage=addstage) - - -###################################################################### -# Test 23 -###################################################################### - -def iospecfnrecord(): - return Example2OpRecord() - -class FIFOTestRecordControl(FIFOControl): - - def __init__(self): - stage = PassThroughStage(iospecfnrecord) - FIFOControl.__init__(self, 2, stage) - - -class ExampleFIFORecordObjectPipe(ControlBase): - - def elaborate(self, platform): - m = ControlBase.elaborate(self, platform) - - pipe1 = FIFOTestRecordControl() - pipe2 = ExampleRecordHandshakeAddClass() - - m.submodules.pipe1 = pipe1 - m.submodules.pipe2 = pipe2 - - m.d.comb += self.connect([pipe1, pipe2]) - - return m - - -###################################################################### -# Test 24 -###################################################################### - -class FIFOTestRecordAddStageControl(FIFOControl): - - def __init__(self): - stage = ExampleAddRecordObjectStage() - FIFOControl.__init__(self, 2, stage) - - - -###################################################################### -# Test 25 -###################################################################### - -class FIFOTestAdd16(FIFOControl): - - def __init__(self): - stage = ExampleStageCls() - FIFOControl.__init__(self, 2, stage) - - -class ExampleFIFOAdd2Pipe(ControlBase): - - def elaborate(self, platform): - m = ControlBase.elaborate(self, platform) - - pipe1 = FIFOTestAdd16() - pipe2 = FIFOTestAdd16() - - m.submodules.pipe1 = pipe1 - 
m.submodules.pipe2 = pipe2 - - m.d.comb += self.connect([pipe1, pipe2]) - - return m - - -###################################################################### -# Test 26 -###################################################################### - -def iospecfn24(): - return (Signal(16, name="src1"), Signal(16, name="src2")) - -class FIFOTest2x16(FIFOControl): - - def __init__(self): - stage = PassThroughStage(iospecfn2) - FIFOControl.__init__(self, 2, stage) - - -###################################################################### -# Test 997 -###################################################################### - -class ExampleBufPassThruPipe2(ControlBase): - - def elaborate(self, platform): - m = ControlBase.elaborate(self, platform) - - # XXX currently fails: any other permutation works fine. - # p1=u,p2=b ok p1=u,p2=u ok p1=b,p2=b ok - # also fails using UnbufferedPipeline as well - #pipe1 = ExampleUnBufAdd1Pipe() - #pipe2 = ExampleBufAdd1Pipe() - pipe1 = ExampleBufAdd1Pipe() - pipe2 = ExamplePassAdd1Pipe() - - m.submodules.pipe1 = pipe1 - m.submodules.pipe2 = pipe2 - - m.d.comb += self.connect([pipe1, pipe2]) - - return m - - -###################################################################### -# Test 998 -###################################################################### - -class ExampleBufPipe3(ControlBase): - """ Example of how to do delayed pipeline, where the stage signals - whether it is ready. 
- """ - - def elaborate(self, platform): - m = ControlBase.elaborate(self, platform) - - pipe1 = ExampleBufDelayedPipe() - pipe2 = ExampleBufPipe() - - m.submodules.pipe1 = pipe1 - m.submodules.pipe2 = pipe2 - - m.d.comb += self.connect([pipe1, pipe2]) - - return m - -###################################################################### -# Test 999 - XXX FAILS -# http://bugs.libre-riscv.org/show_bug.cgi?id=57 -###################################################################### - -class ExampleBufAdd1Pipe(BufferedHandshake): - - def __init__(self): - stage = ExampleStageCls() - BufferedHandshake.__init__(self, stage) - - -class ExampleUnBufAdd1Pipe(UnbufferedPipeline): - - def __init__(self): - stage = ExampleStageCls() - UnbufferedPipeline.__init__(self, stage) - - -class ExampleBufUnBufPipe(ControlBase): - - def elaborate(self, platform): - m = ControlBase.elaborate(self, platform) - - # XXX currently fails: any other permutation works fine. - # p1=u,p2=b ok p1=u,p2=u ok p1=b,p2=b ok - # also fails using UnbufferedPipeline as well - #pipe1 = ExampleUnBufAdd1Pipe() - #pipe2 = ExampleBufAdd1Pipe() - pipe1 = ExampleBufAdd1Pipe() - pipe2 = ExampleUnBufAdd1Pipe() - - m.submodules.pipe1 = pipe1 - m.submodules.pipe2 = pipe2 - - m.d.comb += self.connect([pipe1, pipe2]) - - return m - - -###################################################################### -# Unit Tests -###################################################################### - -num_tests = 10 - -if __name__ == '__main__': - if False: - print ("test 1") - dut = ExampleBufPipe() - run_simulation(dut, tbench(dut), vcd_name="test_bufpipe.vcd") - - print ("test 2") - dut = ExampleBufPipe2() - run_simulation(dut, tbench2(dut), vcd_name="test_bufpipe2.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_bufpipe2.il", "w") as f: - f.write(vl) - - - print ("test 3") - dut = ExampleBufPipe() - 
test = Test3(dut, resultfn_3) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpipe3.vcd") - - print ("test 3.5") - dut = ExamplePipeline() - test = Test3(dut, resultfn_3) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_combpipe3.vcd") - - print ("test 4") - dut = ExampleBufPipe2() - run_simulation(dut, tbench4(dut), vcd_name="test_bufpipe4.vcd") - - print ("test 5") - dut = ExampleBufPipeAdd() - test = Test5(dut, resultfn_5, stage_ctl=True) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpipe5.vcd") - - print ("test 6") - dut = ExampleLTPipeline() - test = Test5(dut, resultfn_6) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_ltcomb6.vcd") - - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - list(dut.p.data_i) + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_ltcomb_pipe.il", "w") as f: - f.write(vl) - - print ("test 7") - dut = ExampleAddRecordPipe() - data=data_dict() - test = Test5(dut, resultfn_7, data=data) - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o, - dut.p.data_i.src1, dut.p.data_i.src2, - dut.n.data_o.src1, dut.n.data_o.src2] - vl = rtlil.convert(dut, ports=ports) - with open("test_recordcomb_pipe.il", "w") as f: - f.write(vl) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_addrecord.vcd") - - print ("test 8") - dut = ExampleBufPipeAddClass() - data=data_2op() - test = Test5(dut, resultfn_8, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpipe8.vcd") - - print ("test 9") - dut = ExampleBufPipeChain2() - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_bufpipechain2.il", "w") as f: - f.write(vl) - - data = data_chain2() - test = Test5(dut, resultfn_9, data=data) - run_simulation(dut, [test.send, test.rcv], - vcd_name="test_bufpipechain2.vcd") - - print ("test 10") - dut = 
ExampleLTBufferedPipeDerived() - test = Test5(dut, resultfn_6) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_ltbufpipe10.vcd") - vl = rtlil.convert(dut, ports=ports) - with open("test_ltbufpipe10.il", "w") as f: - f.write(vl) - - print ("test 11") - dut = ExampleAddRecordPlaceHolderPipe() - data=data_placeholder() - test = Test5(dut, resultfn_test11, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_addrecord.vcd") - - - print ("test 12") - dut = ExampleBufDelayedPipe() - data = data_chain1() - test = Test5(dut, resultfn_12, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpipe12.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_bufpipe12.il", "w") as f: - f.write(vl) - - print ("test 13") - dut = ExampleUnBufDelayedPipe() - data = data_chain1() - test = Test5(dut, resultfn_12, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_unbufpipe13.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_unbufpipe13.il", "w") as f: - f.write(vl) - - print ("test 15") - dut = ExampleBufModeAdd1Pipe() - data = data_chain1() - test = Test5(dut, resultfn_12, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufunbuf15.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_bufunbuf15.il", "w") as f: - f.write(vl) - - print ("test 16") - dut = ExampleBufModeUnBufPipe() - data = data_chain1() - test = Test5(dut, resultfn_9, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufunbuf16.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = 
rtlil.convert(dut, ports=ports) - with open("test_bufunbuf16.il", "w") as f: - f.write(vl) - - print ("test 17") - dut = ExampleUnBufAdd1Pipe2() - data = data_chain1() - test = Test5(dut, resultfn_12, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_unbufpipe17.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_unbufpipe17.il", "w") as f: - f.write(vl) - - print ("test 18") - dut = PassThroughTest() - data = data_chain1() - test = Test5(dut, resultfn_identical, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_passthru18.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_passthru18.il", "w") as f: - f.write(vl) - - print ("test 19") - dut = ExampleBufPassThruPipe() - data = data_chain1() - test = Test5(dut, resultfn_9, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpass19.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_bufpass19.il", "w") as f: - f.write(vl) - - print ("test 20") - dut = FIFOTest16() - data = data_chain1() - test = Test5(dut, resultfn_identical, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_fifo20.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_fifo20.il", "w") as f: - f.write(vl) - - print ("test 21") - dut = ExampleFIFOPassThruPipe1() - data = data_chain1() - test = Test5(dut, resultfn_12, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_fifopass21.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - 
[dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_fifopass21.il", "w") as f: - f.write(vl) - - print ("test 22") - dut = ExampleRecordHandshakeAddClass() - data=data_2op() - test = Test5(dut, resultfn_8, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_addrecord22.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i.op1, dut.p.data_i.op2] + \ - [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_addrecord22.il", "w") as f: - f.write(vl) - - print ("test 23") - dut = ExampleFIFORecordObjectPipe() - data=data_2op() - test = Test5(dut, resultfn_8, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_addrecord23.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i.op1, dut.p.data_i.op2] + \ - [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_addrecord23.il", "w") as f: - f.write(vl) - - print ("test 24") - dut = FIFOTestRecordAddStageControl() - data=data_2op() - test = Test5(dut, resultfn_8, data=data) - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i.op1, dut.p.data_i.op2] + \ - [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_addrecord24.il", "w") as f: - f.write(vl) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_addrecord24.vcd") - - print ("test 25") - dut = ExampleFIFOAdd2Pipe() - data = data_chain1() - test = Test5(dut, resultfn_9, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_add2pipe25.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_add2pipe25.il", "w") as f: - f.write(vl) - - print ("test 997") - dut = ExampleBufPassThruPipe2() - data = data_chain1() - test = Test5(dut, resultfn_9, data=data) - run_simulation(dut, 
[test.send, test.rcv], vcd_name="test_bufpass997.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_bufpass997.il", "w") as f: - f.write(vl) - - print ("test 998 (fails, bug)") - dut = ExampleBufPipe3() - data = data_chain1() - test = Test5(dut, resultfn_9, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpipe14.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_bufpipe14.il", "w") as f: - f.write(vl) - - print ("test 999 (expected to fail, which is a bug)") - dut = ExampleBufUnBufPipe() - data = data_chain1() - test = Test5(dut, resultfn_9, data=data) - run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufunbuf999.vcd") - ports = [dut.p.valid_i, dut.n.ready_i, - dut.n.valid_o, dut.p.ready_o] + \ - [dut.p.data_i] + [dut.n.data_o] - vl = rtlil.convert(dut, ports=ports) - with open("test_bufunbuf999.il", "w") as f: - f.write(vl) - diff --git a/src/add/test_div.py b/src/add/test_div.py deleted file mode 100644 index 3f192338..00000000 --- a/src/add/test_div.py +++ /dev/null @@ -1,47 +0,0 @@ -import sys -from random import randint -from random import seed -from operator import truediv - -from nmigen import Module, Signal -from nmigen.compat.sim import run_simulation - -from nmigen_div_experiment import FPDIV - -from unit_test_single import (get_mantissa, get_exponent, get_sign, is_nan, - is_inf, is_pos_inf, is_neg_inf, - match, get_case, check_case, run_test, - run_edge_cases, run_corner_cases) - - -def testbench(dut): - yield from check_case(dut, 0x80000000, 0x00000000, 0xffc00000) - yield from check_case(dut, 0x00000000, 0x80000000, 0xffc00000) - yield from check_case(dut, 0x0002b017, 0xff3807ab, 0x80000000) - yield from check_case(dut, 0x40000000, 0x3F800000, 0x40000000) - yield from 
check_case(dut, 0x3F800000, 0x40000000, 0x3F000000) - yield from check_case(dut, 0x3F800000, 0x40400000, 0x3EAAAAAB) - yield from check_case(dut, 0x40400000, 0x41F80000, 0x3DC6318C) - yield from check_case(dut, 0x41F9EB4D, 0x429A4C70, 0x3ECF52B2) - yield from check_case(dut, 0x7F7FFFFE, 0x70033181, 0x4EF9C4C8) - yield from check_case(dut, 0x7F7FFFFE, 0x70000001, 0x4EFFFFFC) - yield from check_case(dut, 0x7F7FFCFF, 0x70200201, 0x4ECCC7D5) - yield from check_case(dut, 0x70200201, 0x7F7FFCFF, 0x302003E2) - - count = 0 - - #regression tests - stimulus_a = [0xbf9b1e94, 0x34082401, 0x5e8ef81, 0x5c75da81, 0x2b017] - stimulus_b = [0xc038ed3a, 0xb328cd45, 0x114f3db, 0x2f642a39, 0xff3807ab] - yield from run_test(dut, stimulus_a, stimulus_b, truediv, get_case) - count += len(stimulus_a) - print (count, "vectors passed") - - yield from run_corner_cases(dut, count, truediv, get_case) - yield from run_edge_cases(dut, count, truediv, get_case) - - -if __name__ == '__main__': - dut = FPDIV(width=32) - run_simulation(dut, testbench(dut), vcd_name="test_div.vcd") - diff --git a/src/add/test_div64.py b/src/add/test_div64.py deleted file mode 100644 index 5a9daf23..00000000 --- a/src/add/test_div64.py +++ /dev/null @@ -1,67 +0,0 @@ -from nmigen import Module, Signal -from nmigen.compat.sim import run_simulation - -from nmigen_div_experiment import FPDIV - -class ORGate: - def __init__(self): - self.a = Signal() - self.b = Signal() - self.x = Signal() - - def elaborate(self, platform=None): - - m = Module() - m.d.comb += self.x.eq(self.a | self.b) - - return m - -def check_case(dut, a, b, z): - yield dut.in_a.v.eq(a) - yield dut.in_a.stb.eq(1) - yield - yield - a_ack = (yield dut.in_a.ack) - assert a_ack == 0 - yield dut.in_b.v.eq(b) - yield dut.in_b.stb.eq(1) - b_ack = (yield dut.in_b.ack) - assert b_ack == 0 - - while True: - yield - out_z_stb = (yield dut.out_z.stb) - if not out_z_stb: - continue - yield dut.in_a.stb.eq(0) - yield dut.in_b.stb.eq(0) - yield dut.out_z.ack.eq(1) - 
yield - yield dut.out_z.ack.eq(0) - yield - yield - break - - out_z = yield dut.out_z.v - assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) - -def testbench(dut): - yield from check_case(dut, 0x4008000000000000, 0x3FF0000000000000, - 0x4008000000000000) - yield from check_case(dut, 0x3FF0000000000000, 0x4008000000000000, - 0x3FD5555555555555) - - if False: - yield from check_case(dut, 0x3F800000, 0x40000000, 0x3F000000) - yield from check_case(dut, 0x3F800000, 0x40400000, 0x3EAAAAAB) - yield from check_case(dut, 0x40400000, 0x41F80000, 0x3DC6318C) - yield from check_case(dut, 0x41F9EB4D, 0x429A4C70, 0x3ECF52B2) - yield from check_case(dut, 0x7F7FFFFE, 0x70033181, 0x4EF9C4C8) - yield from check_case(dut, 0x7F7FFFFE, 0x70000001, 0x4EFFFFFC) - yield from check_case(dut, 0x7F7FFCFF, 0x70200201, 0x4ECCC7D5) - yield from check_case(dut, 0x70200201, 0x7F7FFCFF, 0x302003E2) - -if __name__ == '__main__': - dut = FPDIV(width=64) - run_simulation(dut, testbench(dut), vcd_name="test_div64.vcd") - diff --git a/src/add/test_dual.py b/src/add/test_dual.py deleted file mode 100644 index 15f5c762..00000000 --- a/src/add/test_dual.py +++ /dev/null @@ -1,60 +0,0 @@ -from sfpy import Float32 -from nmigen.compat.sim import run_simulation -from dual_add_experiment import ALU - - -def get_case(dut, a, b, c): - yield dut.a.v.eq(a) - yield dut.a.stb.eq(1) - yield - yield - a_ack = (yield dut.a.ack) - assert a_ack == 0 - - yield dut.a.stb.eq(0) - - yield dut.b.v.eq(b) - yield dut.b.stb.eq(1) - yield - yield - b_ack = (yield dut.b.ack) - assert b_ack == 0 - - yield dut.b.stb.eq(0) - - yield dut.c.v.eq(c) - yield dut.c.stb.eq(1) - yield - yield - c_ack = (yield dut.c.ack) - assert c_ack == 0 - - yield dut.c.stb.eq(0) - - yield dut.z.ack.eq(1) - - while True: - out_z_stb = (yield dut.z.stb) - if not out_z_stb: - yield - continue - - out_z = yield dut.z.v - - yield dut.z.ack.eq(0) - break - - return out_z - -def check_case(dut, a, b, c, z): - out_z = yield from 
get_case(dut, a, b, c) - assert out_z == z, "Output z 0x%x != 0x%x" % (out_z, z) - -def testbench(dut): - yield from check_case(dut, 0, 0, 0, 0) - yield from check_case(dut, 0x3F800000, 0x40000000, 0xc0000000, 0x3F800000) - -if __name__ == '__main__': - dut = ALU(width=32) - run_simulation(dut, testbench(dut), vcd_name="test_dual_add.vcd") - diff --git a/src/add/test_fpadd_pipe.py b/src/add/test_fpadd_pipe.py deleted file mode 100644 index df25e55f..00000000 --- a/src/add/test_fpadd_pipe.py +++ /dev/null @@ -1,126 +0,0 @@ -""" key strategic example showing how to do multi-input fan-in into a - multi-stage pipeline, then multi-output fanout. - - the multiplex ID from the fan-in is passed in to the pipeline, preserved, - and used as a routing ID on the fanout. -""" - -from random import randint -from math import log -from nmigen import Module, Signal, Cat, Value -from nmigen.compat.sim import run_simulation -from nmigen.cli import verilog, rtlil - -from nmigen_add_experiment import (FPADDMuxInOut,) - -from sfpy import Float32 - -class InputTest: - def __init__(self, dut): - self.dut = dut - self.di = {} - self.do = {} - self.tlen = 10 - self.width = 32 - for mid in range(dut.num_rows): - self.di[mid] = {} - self.do[mid] = [] - for i in range(self.tlen): - op1 = randint(0, (1<> (i+1)) << 1) | (m & 1) - for l in range(i): - if m & (1<<(l+1)): - calc_m |= 1 - - assert out_e == calc_e, "Output e 0x%x != expected 0x%x" % (out_e, calc_e) - assert out_m == calc_m, "Output m 0x%x != expected 0x%x" % (out_m, calc_m) - -def testbench(dut): - m_width = dut.a.m_width - e_width = dut.a.e_width - e_max = dut.a.e_max - for j in range(200): - m = randint(0, (1<> b) & ((1<> b) & ((1<> 52) - 1023 - -def get_sign(x): - return ((x & 0x8000000000000000) >> 63) - -def is_nan(x): - return get_exponent(x) == 1024 and get_mantissa(x) != 0 - -def is_inf(x): - return get_exponent(x) == 1024 and get_mantissa(x) == 0 - -def is_pos_inf(x): - return is_inf(x) and not get_sign(x) - -def 
is_neg_inf(x): - return is_inf(x) and get_sign(x) - -def match(x, y): - return ( - (is_pos_inf(x) and is_pos_inf(y)) or - (is_neg_inf(x) and is_neg_inf(y)) or - (is_nan(x) and is_nan(y)) or - (x == y) - ) - -def get_case(dut, a, b): - yield dut.in_a.v.eq(a) - yield dut.in_a.stb.eq(1) - yield - yield - a_ack = (yield dut.in_a.ack) - assert a_ack == 0 - yield dut.in_b.v.eq(b) - yield dut.in_b.stb.eq(1) - b_ack = (yield dut.in_b.ack) - assert b_ack == 0 - - while True: - yield - out_z_stb = (yield dut.out_z.stb) - if not out_z_stb: - continue - yield dut.in_a.stb.eq(0) - yield dut.in_b.stb.eq(0) - yield dut.out_z.ack.eq(1) - yield - yield dut.out_z.ack.eq(0) - yield - yield - break - - out_z = yield dut.out_z.v - return out_z - -def check_case(dut, a, b, z): - out_z = yield from get_case(dut, a, b) - assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) - - -def run_test(dut, stimulus_a, stimulus_b, op): - - expected_responses = [] - actual_responses = [] - for a, b in zip(stimulus_a, stimulus_b): - af = Float64.from_bits(a) - bf = Float64.from_bits(b) - z = op(af, bf) - expected_responses.append(z.get_bits()) - #print (af, bf, z) - actual = yield from get_case(dut, a, b) - actual_responses.append(actual) - - if len(actual_responses) < len(expected_responses): - print ("Fail ... not enough results") - exit(0) - - for exp, act, a, b in zip(expected_responses, actual_responses, - stimulus_a, stimulus_b): - passed = match(exp, act) - - if not passed: - - print ("Fail ... 
expected:", hex(exp), "actual:", hex(act)) - - print (hex(a)) - print ("a mantissa:", a & 0x000fffffffffffff) - print ("a exponent:", ((a & 0x7ff0000000000000) >> 52)\ - - 1023) - print ("a sign:", ((a & 0x8000000000000000) >> 63)) - - print (hex(b)) - print ("b mantissa:", b & 0x000fffffffffffff) - print ("b exponent:", ((b & 0x7ff0000000000000) >> 52)\ - - 1023) - print ("b sign:", ((b & 0x8000000000000000) >> 63)) - - print (hex(exp)) - print ("expected mantissa:", exp & 0x000fffffffffffff) - print ("expected exponent:", ((exp & 0x7ff0000000000000) >> 52)\ - - 1023) - print ("expected sign:", ((exp & 0x8000000000000000) >> 63)) - - print (hex(act)) - print ("actual mantissa:", act & 0x000fffffffffffff) - print ("actual exponent:", ((act & 0x7ff0000000000000) >> 52)\ - - 1023) - print ("actual sign:", ((act & 0x8000000000000000) >> 63)) - - sys.exit(0) - - -def run_corner_cases(dut, count, op): - #corner cases - from itertools import permutations - stimulus_a = [i[0] for i in permutations([ - 0x8000000000000000, - 0x0000000000000000, - 0x7ff8000000000000, - 0xfff8000000000000, - 0x7ff0000000000000, - 0xfff0000000000000 - ], 2)] - stimulus_b = [i[1] for i in permutations([ - 0x8000000000000000, - 0x0000000000000000, - 0x7ff8000000000000, - 0xfff8000000000000, - 0x7ff0000000000000, - 0xfff0000000000000 - ], 2)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - -def run_edge_cases(dut, count, op): - #edge cases - stimulus_a = [0x8000000000000000 for i in range(1000)] - stimulus_b = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_a = [0x0000000000000000 for i in range(1000)] - stimulus_b = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = 
[0x8000000000000000 for i in range(1000)] - stimulus_a = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = [0x0000000000000000 for i in range(1000)] - stimulus_a = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_a = [0x7FF8000000000000 for i in range(1000)] - stimulus_b = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_a = [0xFFF8000000000000 for i in range(1000)] - stimulus_b = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = [0x7FF8000000000000 for i in range(1000)] - stimulus_a = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = [0xFFF8000000000000 for i in range(1000)] - stimulus_a = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_a = [0x7FF0000000000000 for i in range(1000)] - stimulus_b = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_a = [0xFFF0000000000000 for i in range(1000)] - stimulus_b = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = [0x7FF0000000000000 for i in range(1000)] - stimulus_a = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, 
op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = [0xFFF0000000000000 for i in range(1000)] - stimulus_a = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - #seed(0) - for i in range(100000): - stimulus_a = [randint(0, 1<<64) for i in range(1000)] - stimulus_b = [randint(0, 1<<64) for i in range(1000)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += 1000 - print (count, "random vectors passed") - diff --git a/src/add/unit_test_half.py b/src/add/unit_test_half.py deleted file mode 100644 index 73c9b653..00000000 --- a/src/add/unit_test_half.py +++ /dev/null @@ -1,211 +0,0 @@ -from random import randint -from random import seed - -import sys -from sfpy import Float16 - -def get_mantissa(x): - return 0x3ff & x - -def get_exponent(x): - return ((x & 0xf800) >> 11) - 15 - -def get_sign(x): - return ((x & 0x8000) >> 15) - -def is_nan(x): - return get_exponent(x) == 16 and get_mantissa(x) != 0 - -def is_inf(x): - return get_exponent(x) == 16 and get_mantissa(x) == 0 - -def is_pos_inf(x): - return is_inf(x) and not get_sign(x) - -def is_neg_inf(x): - return is_inf(x) and get_sign(x) - -def match(x, y): - return ( - (is_pos_inf(x) and is_pos_inf(y)) or - (is_neg_inf(x) and is_neg_inf(y)) or - (is_nan(x) and is_nan(y)) or - (x == y) - ) - -def get_case(dut, a, b): - yield dut.in_a.v.eq(a) - yield dut.in_a.stb.eq(1) - yield - yield - a_ack = (yield dut.in_a.ack) - assert a_ack == 0 - yield dut.in_b.v.eq(b) - yield dut.in_b.stb.eq(1) - b_ack = (yield dut.in_b.ack) - assert b_ack == 0 - - while True: - yield - out_z_stb = (yield dut.out_z.stb) - if not out_z_stb: - continue - yield dut.in_a.stb.eq(0) - yield dut.in_b.stb.eq(0) - yield dut.out_z.ack.eq(1) - yield - yield dut.out_z.ack.eq(0) - yield - yield - break - - out_z = yield dut.out_z.v - return out_z - -def check_case(dut, a, b, z): - out_z = yield from 
get_case(dut, a, b) - assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) - - -def run_test(dut, stimulus_a, stimulus_b, op): - - expected_responses = [] - actual_responses = [] - for a, b in zip(stimulus_a, stimulus_b): - af = Float16.from_bits(a) - bf = Float16.from_bits(b) - z = op(af, bf) - expected_responses.append(z.get_bits()) - #print (af, bf, z) - actual = yield from get_case(dut, a, b) - actual_responses.append(actual) - - if len(actual_responses) < len(expected_responses): - print ("Fail ... not enough results") - exit(0) - - for expected, actual, a, b in zip(expected_responses, actual_responses, - stimulus_a, stimulus_b): - passed = match(expected, actual) - - if not passed: - - print ("Fail ... expected:", hex(expected), "actual:", hex(actual)) - - print (hex(a)) - print ("a mantissa:", get_mantissa(a)) - print ("a exponent:", get_exponent(a)) - print ("a sign:", get_sign(a)) - - print (hex(b)) - print ("b mantissa:", get_mantissa(b)) - print ("b exponent:", get_exponent(b)) - print ("b sign:", get_sign(b)) - - print (hex(expected)) - print ("expected mantissa:", get_mantissa(expected)) - print ("expected exponent:", get_exponent(expected)) - print ("expected sign:", get_sign(expected)) - - print (hex(actual)) - print ("actual mantissa:", get_mantissa(actual)) - print ("actual exponent:", get_exponent(actual)) - print ("actual sign:", get_sign(actual)) - - sys.exit(0) - -def run_corner_cases(dut, count, op): - #corner cases - corners = [0x8000, 0x0000, 0x7800, 0xf800, 0x7c00, 0xfc00] - from itertools import permutations - stimulus_a = [i[0] for i in permutations(corners, 2)] - stimulus_b = [i[1] for i in permutations(corners, 2)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - -def run_edge_cases(dut, count, op): - maxint16 = 1<<16 - maxcount = 10 - #edge cases - stimulus_a = [0x8000 for i in range(maxcount)] - stimulus_b = [randint(0, maxint16-1) for i in 
range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_a = [0x0000 for i in range(maxcount)] - stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = [0x8000 for i in range(maxcount)] - stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = [0x0000 for i in range(maxcount)] - stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_a = [0x7800 for i in range(maxcount)] - stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_a = [0xF800 for i in range(maxcount)] - stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = [0x7800 for i in range(maxcount)] - stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = [0xF800 for i in range(maxcount)] - stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_a = [0x7C00 for i in range(maxcount)] - stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - 
stimulus_a = [0xFC00 for i in range(maxcount)] - stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = [0x7C00 for i in range(maxcount)] - stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - stimulus_b = [0xFC00 for i in range(maxcount)] - stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += len(stimulus_a) - print (count, "vectors passed") - - #seed(0) - for i in range(100000): - stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] - stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] - yield from run_test(dut, stimulus_a, stimulus_b, op) - count += maxcount - print (count, "random vectors passed") - diff --git a/src/add/unit_test_single.py b/src/add/unit_test_single.py deleted file mode 100644 index 2b0d9e56..00000000 --- a/src/add/unit_test_single.py +++ /dev/null @@ -1,255 +0,0 @@ -from random import randint -from random import seed - -import sys -from sfpy import Float32 - -def get_mantissa(x): - return 0x7fffff & x - -def get_exponent(x): - return ((x & 0x7f800000) >> 23) - 127 - -def set_exponent(x, e): - return (x & ~0x7f800000) | ((e+127) << 23) - -def get_sign(x): - return ((x & 0x80000000) >> 31) - -def is_nan(x): - return get_exponent(x) == 128 and get_mantissa(x) != 0 - -def is_inf(x): - return get_exponent(x) == 128 and get_mantissa(x) == 0 - -def is_pos_inf(x): - return is_inf(x) and not get_sign(x) - -def is_neg_inf(x): - return is_inf(x) and get_sign(x) - -def match(x, y): - return ( - (is_pos_inf(x) and is_pos_inf(y)) or - (is_neg_inf(x) and is_neg_inf(y)) or - (is_nan(x) and is_nan(y)) or - (x == y) - ) - -def get_rs_case(dut, a, b, mid): - in_a, in_b = dut.rs[0] - out_z = dut.res[0] 
- yield dut.ids.in_mid.eq(mid) - yield in_a.v.eq(a) - yield in_a.valid_i.eq(1) - yield - yield - yield - yield - a_ack = (yield in_a.ready_o) - assert a_ack == 0 - - yield in_a.valid_i.eq(0) - - yield in_b.v.eq(b) - yield in_b.valid_i.eq(1) - yield - yield - b_ack = (yield in_b.ready_o) - assert b_ack == 0 - - yield in_b.valid_i.eq(0) - - yield out_z.ready_i.eq(1) - - while True: - out_z_stb = (yield out_z.valid_o) - if not out_z_stb: - yield - continue - vout_z = yield out_z.v - #out_mid = yield dut.ids.out_mid - yield out_z.ready_i.eq(0) - yield - break - - return vout_z, mid - -def check_rs_case(dut, a, b, z, mid=None): - if mid is None: - mid = randint(0, 6) - mid = 0 - out_z, out_mid = yield from get_rs_case(dut, a, b, mid) - assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) - assert out_mid == mid, "Output mid 0x%x != expected 0x%x" % (out_mid, mid) - - -def get_case(dut, a, b, mid): - #yield dut.in_mid.eq(mid) - yield dut.in_a.v.eq(a) - yield dut.in_a.valid_i_test.eq(1) - yield - yield - yield - yield - a_ack = (yield dut.in_a.ready_o) - assert a_ack == 0 - - yield dut.in_a.valid_i.eq(0) - - yield dut.in_b.v.eq(b) - yield dut.in_b.valid_i.eq(1) - yield - yield - b_ack = (yield dut.in_b.ready_o) - assert b_ack == 0 - - yield dut.in_b.valid_i.eq(0) - - yield dut.out_z.ready_i.eq(1) - - while True: - out_z_stb = (yield dut.out_z.valid_o) - if not out_z_stb: - yield - continue - out_z = yield dut.out_z.v - #out_mid = yield dut.out_mid - yield dut.out_z.ready_i.eq(0) - yield - break - - return out_z, mid # TODO: mid - -def check_case(dut, a, b, z, mid=None): - if mid is None: - mid = randint(0, 6) - mid = 0 - out_z, out_mid = yield from get_case(dut, a, b, mid) - assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) - assert out_mid == mid, "Output mid 0x%x != expected 0x%x" % (out_mid, mid) - - -def run_test(dut, stimulus_a, stimulus_b, op, get_case_fn): - - expected_responses = [] - actual_responses = [] - for a, b 
in zip(stimulus_a, stimulus_b): - mid = randint(0, 6) - mid = 0 - af = Float32.from_bits(a) - bf = Float32.from_bits(b) - z = op(af, bf) - expected_responses.append((z.get_bits(), mid)) - actual = yield from get_case_fn(dut, a, b, mid) - actual_responses.append(actual) - - if len(actual_responses) < len(expected_responses): - print ("Fail ... not enough results") - exit(0) - - for expected, actual, a, b in zip(expected_responses, actual_responses, - stimulus_a, stimulus_b): - passed = match(expected[0], actual[0]) - if expected[1] != actual[1]: # check mid - print ("MID failed", expected[1], actual[1]) - sys.exit(0) - - if not passed: - - expected = expected[0] - actual = actual[0] - print ("Fail ... expected:", hex(expected), "actual:", hex(actual)) - - print (hex(a)) - print ("a mantissa:", a & 0x7fffff) - print ("a exponent:", ((a & 0x7f800000) >> 23) - 127) - print ("a sign:", ((a & 0x80000000) >> 31)) - - print (hex(b)) - print ("b mantissa:", b & 0x7fffff) - print ("b exponent:", ((b & 0x7f800000) >> 23) - 127) - print ("b sign:", ((b & 0x80000000) >> 31)) - - print (hex(expected)) - print ("expected mantissa:", expected & 0x7fffff) - print ("expected exponent:", ((expected & 0x7f800000) >> 23) - 127) - print ("expected sign:", ((expected & 0x80000000) >> 31)) - - print (hex(actual)) - print ("actual mantissa:", actual & 0x7fffff) - print ("actual exponent:", ((actual & 0x7f800000) >> 23) - 127) - print ("actual sign:", ((actual & 0x80000000) >> 31)) - - sys.exit(0) - -corner_cases = [0x80000000, 0x00000000, 0x7f800000, 0xff800000, - 0x7fc00000, 0xffc00000] - -def run_corner_cases(dut, count, op, get_case_fn): - #corner cases - from itertools import permutations - stimulus_a = [i[0] for i in permutations(corner_cases, 2)] - stimulus_b = [i[1] for i in permutations(corner_cases, 2)] - yield from run_test(dut, stimulus_a, stimulus_b, op, get_case_fn) - count += len(stimulus_a) - print (count, "vectors passed") - -def run_test_2(dut, stimulus_a, stimulus_b, op, 
get_case_fn): - yield from run_test(dut, stimulus_a, stimulus_b, op, get_case_fn) - yield from run_test(dut, stimulus_b, stimulus_a, op, get_case_fn) - -def run_cases(dut, count, op, fixed_num, num_entries, get_case_fn): - if isinstance(fixed_num, int): - stimulus_a = [fixed_num for i in range(num_entries)] - report = hex(fixed_num) - else: - stimulus_a = fixed_num - report = "random" - - stimulus_b = [randint(0, 1<<32) for i in range(num_entries)] - yield from run_test_2(dut, stimulus_a, stimulus_b, op, get_case_fn) - count += len(stimulus_a) - print (count, "vectors passed 2^32", report) - - # non-canonical NaNs. - stimulus_b = [set_exponent(randint(0, 1<<32), 128) \ - for i in range(num_entries)] - yield from run_test_2(dut, stimulus_a, stimulus_b, op, get_case_fn) - count += len(stimulus_a) - print (count, "vectors passed Non-Canonical NaN", report) - - # -127 - stimulus_b = [set_exponent(randint(0, 1<<32), -127) \ - for i in range(num_entries)] - yield from run_test_2(dut, stimulus_a, stimulus_b, op, get_case_fn) - count += len(stimulus_a) - print (count, "vectors passed exp=-127", report) - - # nearly zero - stimulus_b = [set_exponent(randint(0, 1<<32), -126) \ - for i in range(num_entries)] - yield from run_test_2(dut, stimulus_a, stimulus_b, op, get_case_fn) - count += len(stimulus_a) - print (count, "vectors passed exp=-126", report) - - # nearly inf - stimulus_b = [set_exponent(randint(0, 1<<32), 127) \ - for i in range(num_entries)] - yield from run_test_2(dut, stimulus_a, stimulus_b, op, get_case_fn) - count += len(stimulus_a) - print (count, "vectors passed exp=127", report) - - return count - -def run_edge_cases(dut, count, op, get_case_fn): - #edge cases - for testme in corner_cases: - count = yield from run_cases(dut, count, op, testme, 10, get_case_fn) - - for i in range(100000): - stimulus_a = [randint(0, 1<<32) for i in range(10)] - count = yield from run_cases(dut, count, op, stimulus_a, 10, - get_case_fn) - return count - diff --git 
a/src/ieee754/add/concurrentunit.py b/src/ieee754/add/concurrentunit.py new file mode 100644 index 00000000..c0053c8b --- /dev/null +++ b/src/ieee754/add/concurrentunit.py @@ -0,0 +1,74 @@ +# IEEE Floating Point Adder (Single Precision) +# Copyright (C) Jonathan P Dawson 2013 +# 2013-12-12 + +from math import log +from nmigen import Module +from nmigen.cli import main, verilog + +from singlepipe import PassThroughStage +from multipipe import CombMuxOutPipe +from multipipe import PriorityCombMuxInPipe + +from fpcommon.getop import FPADDBaseData +from fpcommon.denorm import FPSCData +from fpcommon.pack import FPPackData +from fpcommon.normtopack import FPNormToPack +from fpadd.specialcases import FPAddSpecialCasesDeNorm +from fpadd.addstages import FPAddAlignSingleAdd + + +def num_bits(n): + return int(log(n) / log(2)) + +class FPADDInMuxPipe(PriorityCombMuxInPipe): + def __init__(self, num_rows, iospecfn): + self.num_rows = num_rows + stage = PassThroughStage(iospecfn) + PriorityCombMuxInPipe.__init__(self, stage, p_len=self.num_rows) + + +class FPADDMuxOutPipe(CombMuxOutPipe): + def __init__(self, num_rows, iospecfn): + self.num_rows = num_rows + stage = PassThroughStage(iospecfn) + CombMuxOutPipe.__init__(self, stage, n_len=self.num_rows) + + +class ReservationStations: + """ Reservation-Station pipeline + + Input: num_rows - number of input and output Reservation Stations + + Requires: the addition of an "alu" object, an i_specfn and an o_specfn + + * fan-in on inputs (an array of FPADDBaseData: a,b,mid) + * ALU pipeline + * fan-out on outputs (an array of FPPackData: z,mid) + + Fan-in and Fan-out are combinatorial. 
+ """ + def __init__(self, num_rows): + self.num_rows = num_rows + self.inpipe = FPADDInMuxPipe(num_rows, self.i_specfn) # fan-in + self.outpipe = FPADDMuxOutPipe(num_rows, self.o_specfn) # fan-out + + self.p = self.inpipe.p # kinda annoying, + self.n = self.outpipe.n # use pipe in/out as this class in/out + self._ports = self.inpipe.ports() + self.outpipe.ports() + + def elaborate(self, platform): + m = Module() + m.submodules.inpipe = self.inpipe + m.submodules.alu = self.alu + m.submodules.outpipe = self.outpipe + + m.d.comb += self.inpipe.n.connect_to_next(self.alu.p) + m.d.comb += self.alu.connect_to_next(self.outpipe) + + return m + + def ports(self): + return self._ports + + diff --git a/src/ieee754/add/dual_add_experiment.py b/src/ieee754/add/dual_add_experiment.py new file mode 100644 index 00000000..7ec479f5 --- /dev/null +++ b/src/ieee754/add/dual_add_experiment.py @@ -0,0 +1,72 @@ +from nmigen import * +from nmigen.cli import main + +from nmigen_add_experiment import FPADD +from fpbase import FPOp + + +class Adder: + def __init__(self, width): + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.o.eq(self.a + self.b) + return m + + +class Subtractor: + def __init__(self, width): + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.o.eq(self.a - self.b) + return m + + +class ALU: + def __init__(self, width): + #self.op = Signal() + self.a = FPOp(width) + self.b = FPOp(width) + self.c = FPOp(width) + self.z = FPOp(width) + self.int_stb = Signal() + + self.add1 = FPADD(width) + self.add2 = FPADD(width) + + def elaborate(self, platform): + m = Module() + m.submodules.add1 = self.add1 + m.submodules.add2 = self.add2 + # join add1 a to a: add1.in_a = a + m.d.comb += self.add1.in_a.chain_from(self.a) + # join add1 b to b: add1.in_b = b + m.d.comb += 
self.add1.in_b.chain_from(self.b) + # join add2 a to c: add2.in_a = c + m.d.comb += self.add2.in_a.chain_from(self.c) + # join add2 b to add1 z: add2.in_b = add1.out_z + m.d.comb += self.add2.in_b.chain_inv(self.add1.out_z) + # join output from add2 to z: z = add2.out_z + m.d.comb += self.z.chain_from(self.add2.out_z) + # get at add1's stb signal + m.d.comb += self.int_stb.eq(self.add1.out_z.stb) + #with m.If(self.op): + # m.d.comb += self.o.eq(self.sub.o) + #with m.Else(): + # m.d.comb += self.o.eq(self.add.o) + return m + + +if __name__ == "__main__": + alu = ALU(width=16) + main(alu, ports=alu.a.ports() + \ + alu.b.ports() + \ + alu.c.ports() + \ + alu.z.ports()) diff --git a/src/ieee754/add/example_buf_pipe.py b/src/ieee754/add/example_buf_pipe.py new file mode 100644 index 00000000..4bb7cdf1 --- /dev/null +++ b/src/ieee754/add/example_buf_pipe.py @@ -0,0 +1,103 @@ +""" Pipeline and BufferedHandshake examples +""" + +from nmoperator import eq +from iocontrol import (PrevControl, NextControl) +from singlepipe import (PrevControl, NextControl, ControlBase, + StageCls, Stage, StageChain, + BufferedHandshake, UnbufferedPipeline) + +from nmigen import Signal, Module +from nmigen.cli import verilog, rtlil + + +class ExampleAddStage(StageCls): + """ an example of how to use the buffered pipeline, as a class instance + """ + + def ispec(self): + """ returns a tuple of input signals which will be the incoming data + """ + return (Signal(16), Signal(16)) + + def ospec(self): + """ returns an output signal which will happen to contain the sum + of the two inputs + """ + return Signal(16) + + def process(self, i): + """ process the input data (sums the values in the tuple) and returns it + """ + return i[0] + i[1] + + +class ExampleBufPipeAdd(BufferedHandshake): + """ an example of how to use the buffered pipeline, using a class instance + """ + + def __init__(self): + addstage = ExampleAddStage() + BufferedHandshake.__init__(self, addstage) + + +class ExampleStage(Stage): 
+ """ an example of how to use the buffered pipeline, in a static class + fashion + """ + + def ispec(): + return Signal(16, name="example_input_signal") + + def ospec(): + return Signal(16, name="example_output_signal") + + def process(i): + """ process the input data and returns it (adds 1) + """ + return i + 1 + + +class ExampleStageCls(StageCls): + """ an example of how to use the buffered pipeline, in a static class + fashion + """ + + def ispec(self): + return Signal(16, name="example_input_signal") + + def ospec(self): + return Signal(16, name="example_output_signal") + + def process(self, i): + """ process the input data and returns it (adds 1) + """ + return i + 1 + + +class ExampleBufPipe(BufferedHandshake): + """ an example of how to use the buffered pipeline. + """ + + def __init__(self): + BufferedHandshake.__init__(self, ExampleStage) + + +class ExamplePipeline(UnbufferedPipeline): + """ an example of how to use the unbuffered pipeline. + """ + + def __init__(self): + UnbufferedPipeline.__init__(self, ExampleStage) + + +if __name__ == '__main__': + dut = ExampleBufPipe() + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_bufpipe.il", "w") as f: + f.write(vl) + + dut = ExamplePipeline() + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_combpipe.il", "w") as f: + f.write(vl) diff --git a/src/ieee754/add/fadd_state.py b/src/ieee754/add/fadd_state.py new file mode 100644 index 00000000..7ad88786 --- /dev/null +++ b/src/ieee754/add/fadd_state.py @@ -0,0 +1,282 @@ +# IEEE Floating Point Adder (Single Precision) +# Copyright (C) Jonathan P Dawson 2013 +# 2013-12-12 + +from nmigen import Module, Signal, Cat +from nmigen.cli import main, verilog + +from fpbase import FPNumIn, FPNumOut, FPOp, Overflow, FPBase + +from singlepipe import eq + + +class FPADD(FPBase): + + def __init__(self, width, single_cycle=False): + FPBase.__init__(self) + self.width = width + self.single_cycle = single_cycle + + self.in_a = FPOp(width) + self.in_b = 
FPOp(width) + self.out_z = FPOp(width) + + def elaborate(self, platform=None): + """ creates the HDL code-fragment for FPAdd + """ + m = Module() + + # Latches + a = FPNumIn(self.in_a, self.width) + b = FPNumIn(self.in_b, self.width) + z = FPNumOut(self.width, False) + + m.submodules.fpnum_a = a + m.submodules.fpnum_b = b + m.submodules.fpnum_z = z + + m.d.comb += a.v.eq(self.in_a.v) + m.d.comb += b.v.eq(self.in_b.v) + + w = z.m_width + 4 + tot = Signal(w, reset_less=True) # sticky/round/guard, {mantissa} result, 1 overflow + + of = Overflow() + + m.submodules.overflow = of + + with m.FSM() as fsm: + + # ****** + # gets operand a + + with m.State("get_a"): + res = self.get_op(m, self.in_a, a, "get_b") + m.d.sync += eq([a, self.in_a.ack], res) + + # ****** + # gets operand b + + with m.State("get_b"): + res = self.get_op(m, self.in_b, b, "special_cases") + m.d.sync += eq([b, self.in_b.ack], res) + + # ****** + # special cases: NaNs, infs, zeros, denormalised + # NOTE: some of these are unique to add. 
see "Special Operations" + # https://steve.hollasch.net/cgindex/coding/ieeefloat.html + + with m.State("special_cases"): + + s_nomatch = Signal() + m.d.comb += s_nomatch.eq(a.s != b.s) + + m_match = Signal() + m.d.comb += m_match.eq(a.m == b.m) + + # if a is NaN or b is NaN return NaN + with m.If(a.is_nan | b.is_nan): + m.next = "put_z" + m.d.sync += z.nan(1) + + # XXX WEIRDNESS for FP16 non-canonical NaN handling + # under review + + ## if a is zero and b is NaN return -b + #with m.If(a.is_zero & (a.s==0) & b.is_nan): + # m.next = "put_z" + # m.d.sync += z.create(b.s, b.e, Cat(b.m[3:-2], ~b.m[0])) + + ## if b is zero and a is NaN return -a + #with m.Elif(b.is_zero & (b.s==0) & a.is_nan): + # m.next = "put_z" + # m.d.sync += z.create(a.s, a.e, Cat(a.m[3:-2], ~a.m[0])) + + ## if a is -zero and b is NaN return -b + #with m.Elif(a.is_zero & (a.s==1) & b.is_nan): + # m.next = "put_z" + # m.d.sync += z.create(a.s & b.s, b.e, Cat(b.m[3:-2], 1)) + + ## if b is -zero and a is NaN return -a + #with m.Elif(b.is_zero & (b.s==1) & a.is_nan): + # m.next = "put_z" + # m.d.sync += z.create(a.s & b.s, a.e, Cat(a.m[3:-2], 1)) + + # if a is inf return inf (or NaN) + with m.Elif(a.is_inf): + m.next = "put_z" + m.d.sync += z.inf(a.s) + # if a is inf and signs don't match return NaN + with m.If(b.exp_128 & s_nomatch): + m.d.sync += z.nan(1) + + # if b is inf return inf + with m.Elif(b.is_inf): + m.next = "put_z" + m.d.sync += z.inf(b.s) + + # if a is zero and b zero return signed-a/b + with m.Elif(a.is_zero & b.is_zero): + m.next = "put_z" + m.d.sync += z.create(a.s & b.s, b.e, b.m[3:-1]) + + # if a is zero return b + with m.Elif(a.is_zero): + m.next = "put_z" + m.d.sync += z.create(b.s, b.e, b.m[3:-1]) + + # if b is zero return a + with m.Elif(b.is_zero): + m.next = "put_z" + m.d.sync += z.create(a.s, a.e, a.m[3:-1]) + + # if a equal to -b return zero (+ve zero) + with m.Elif(s_nomatch & m_match & (a.e == b.e)): + m.next = "put_z" + m.d.sync += z.zero(0) + + # Denormalised Number 
checks + with m.Else(): + m.next = "align" + self.denormalise(m, a) + self.denormalise(m, b) + + # ****** + # align. + + with m.State("align"): + if not self.single_cycle: + # NOTE: this does *not* do single-cycle multi-shifting, + # it *STAYS* in the align state until exponents match + + # exponent of a greater than b: shift b down + with m.If(a.e > b.e): + m.d.sync += b.shift_down() + # exponent of b greater than a: shift a down + with m.Elif(a.e < b.e): + m.d.sync += a.shift_down() + # exponents equal: move to next stage. + with m.Else(): + m.next = "add_0" + else: + # This one however (single-cycle) will do the shift + # in one go. + + # XXX TODO: the shifter used here is quite expensive + # having only one would be better + + ediff = Signal((len(a.e), True), reset_less=True) + ediffr = Signal((len(a.e), True), reset_less=True) + m.d.comb += ediff.eq(a.e - b.e) + m.d.comb += ediffr.eq(b.e - a.e) + with m.If(ediff > 0): + m.d.sync += b.shift_down_multi(ediff) + # exponent of b greater than a: shift a down + with m.Elif(ediff < 0): + m.d.sync += a.shift_down_multi(ediffr) + + m.next = "add_0" + + # ****** + # First stage of add. covers same-sign (add) and subtract + # special-casing when mantissas are greater or equal, to + # give greatest accuracy. + + with m.State("add_0"): + m.next = "add_1" + m.d.sync += z.e.eq(a.e) + # same-sign (both negative or both positive) add mantissas + with m.If(a.s == b.s): + m.d.sync += [ + tot.eq(Cat(a.m, 0) + Cat(b.m, 0)), + z.s.eq(a.s) + ] + # a mantissa greater than b, use a + with m.Elif(a.m >= b.m): + m.d.sync += [ + tot.eq(Cat(a.m, 0) - Cat(b.m, 0)), + z.s.eq(a.s) + ] + # b mantissa greater than a, use b + with m.Else(): + m.d.sync += [ + tot.eq(Cat(b.m, 0) - Cat(a.m, 0)), + z.s.eq(b.s) + ] + + # ****** + # Second stage of add: preparation for normalisation. + # detects when tot sum is too big (tot[27] is kinda a carry bit) + + with m.State("add_1"): + m.next = "normalise_1" + # tot[27] gets set when the sum overflows. 
shift result down + with m.If(tot[-1]): + m.d.sync += [ + z.m.eq(tot[4:]), + of.m0.eq(tot[4]), + of.guard.eq(tot[3]), + of.round_bit.eq(tot[2]), + of.sticky.eq(tot[1] | tot[0]), + z.e.eq(z.e + 1) + ] + # tot[27] zero case + with m.Else(): + m.d.sync += [ + z.m.eq(tot[3:]), + of.m0.eq(tot[3]), + of.guard.eq(tot[2]), + of.round_bit.eq(tot[1]), + of.sticky.eq(tot[0]) + ] + + # ****** + # First stage of normalisation. + + with m.State("normalise_1"): + self.normalise_1(m, z, of, "normalise_2") + + # ****** + # Second stage of normalisation. + + with m.State("normalise_2"): + self.normalise_2(m, z, of, "round") + + # ****** + # rounding stage + + with m.State("round"): + self.roundz(m, z, of.roundz) + m.next = "corrections" + + # ****** + # correction stage + + with m.State("corrections"): + self.corrections(m, z, "pack") + + # ****** + # pack stage + + with m.State("pack"): + self.pack(m, z, "put_z") + + # ****** + # put_z stage + + with m.State("put_z"): + self.put_z(m, z, self.out_z, "get_a") + + return m + + +if __name__ == "__main__": + alu = FPADD(width=32) + main(alu, ports=alu.in_a.ports() + alu.in_b.ports() + alu.out_z.ports()) + + + # works... 
but don't use, just do "python fname.py convert -t v" + #print (verilog.convert(alu, ports=[ + # ports=alu.in_a.ports() + \ + # alu.in_b.ports() + \ + # alu.out_z.ports()) diff --git a/src/ieee754/add/fmul.py b/src/ieee754/add/fmul.py new file mode 100644 index 00000000..a2ba41e7 --- /dev/null +++ b/src/ieee754/add/fmul.py @@ -0,0 +1,172 @@ +from nmigen import Module, Signal, Cat, Mux, Array, Const +from nmigen.cli import main, verilog + +from fpbase import FPNumIn, FPNumOut, FPOp, Overflow, FPBase, FPState +from fpcommon.getop import FPGetOp +from singlepipe import eq + + +class FPMUL(FPBase): + + def __init__(self, width): + FPBase.__init__(self) + self.width = width + + self.in_a = FPOp(width) + self.in_b = FPOp(width) + self.out_z = FPOp(width) + + self.states = [] + + def add_state(self, state): + self.states.append(state) + return state + + def elaborate(self, platform=None): + """ creates the HDL code-fragment for FPMUL + """ + m = Module() + + # Latches + a = FPNumIn(None, self.width, False) + b = FPNumIn(None, self.width, False) + z = FPNumOut(self.width, False) + + mw = (z.m_width)*2 - 1 + 3 # sticky/round/guard bits + (2*mant) - 1 + product = Signal(mw) + + of = Overflow() + m.submodules.of = of + m.submodules.a = a + m.submodules.b = b + m.submodules.z = z + + m.d.comb += a.v.eq(self.in_a.v) + m.d.comb += b.v.eq(self.in_b.v) + + with m.FSM() as fsm: + + # ****** + # gets operand a + + with m.State("get_a"): + res = self.get_op(m, self.in_a, a, "get_b") + m.d.sync += eq([a, self.in_a.ack], res) + + # ****** + # gets operand b + + with m.State("get_b"): + res = self.get_op(m, self.in_b, b, "special_cases") + m.d.sync += eq([b, self.in_b.ack], res) + + # ****** + # special cases + + with m.State("special_cases"): + #if a or b is NaN return NaN + with m.If(a.is_nan | b.is_nan): + m.next = "put_z" + m.d.sync += z.nan(1) + #if a is inf return inf + with m.Elif(a.is_inf): + m.next = "put_z" + m.d.sync += z.inf(a.s ^ b.s) + #if b is zero return NaN + with 
m.If(b.is_zero): + m.d.sync += z.nan(1) + #if b is inf return inf + with m.Elif(b.is_inf): + m.next = "put_z" + m.d.sync += z.inf(a.s ^ b.s) + #if a is zero return NaN + with m.If(a.is_zero): + m.next = "put_z" + m.d.sync += z.nan(1) + #if a is zero return zero + with m.Elif(a.is_zero): + m.next = "put_z" + m.d.sync += z.zero(a.s ^ b.s) + #if b is zero return zero + with m.Elif(b.is_zero): + m.next = "put_z" + m.d.sync += z.zero(a.s ^ b.s) + # Denormalised Number checks + with m.Else(): + m.next = "normalise_a" + self.denormalise(m, a) + self.denormalise(m, b) + + # ****** + # normalise_a + + with m.State("normalise_a"): + self.op_normalise(m, a, "normalise_b") + + # ****** + # normalise_b + + with m.State("normalise_b"): + self.op_normalise(m, b, "multiply_0") + + #multiply_0 + with m.State("multiply_0"): + m.next = "multiply_1" + m.d.sync += [ + z.s.eq(a.s ^ b.s), + z.e.eq(a.e + b.e + 1), + product.eq(a.m * b.m * 4) + ] + + #multiply_1 + with m.State("multiply_1"): + mw = z.m_width + m.next = "normalise_1" + m.d.sync += [ + z.m.eq(product[mw+2:]), + of.guard.eq(product[mw+1]), + of.round_bit.eq(product[mw]), + of.sticky.eq(product[0:mw] != 0) + ] + + # ****** + # First stage of normalisation. + with m.State("normalise_1"): + self.normalise_1(m, z, of, "normalise_2") + + # ****** + # Second stage of normalisation. 
+ + with m.State("normalise_2"): + self.normalise_2(m, z, of, "round") + + # ****** + # rounding stage + + with m.State("round"): + self.roundz(m, z, of.roundz) + m.next = "corrections" + + # ****** + # correction stage + + with m.State("corrections"): + self.corrections(m, z, "pack") + + # ****** + # pack stage + with m.State("pack"): + self.pack(m, z, "put_z") + + # ****** + # put_z stage + + with m.State("put_z"): + self.put_z(m, z, self.out_z, "get_a") + + return m + + +if __name__ == "__main__": + alu = FPMUL(width=32) + main(alu, ports=alu.in_a.ports() + alu.in_b.ports() + alu.out_z.ports()) diff --git a/src/ieee754/add/fpadd/__init__.py b/src/ieee754/add/fpadd/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/ieee754/add/fpadd/add0.py b/src/ieee754/add/fpadd/add0.py new file mode 100644 index 00000000..76790fe2 --- /dev/null +++ b/src/ieee754/add/fpadd/add0.py @@ -0,0 +1,113 @@ +# IEEE Floating Point Adder (Single Precision) +# Copyright (C) Jonathan P Dawson 2013 +# 2013-12-12 + +from nmigen import Module, Signal, Cat, Elaboratable +from nmigen.cli import main, verilog + +from fpbase import FPNumBase +from fpbase import FPState +from fpcommon.denorm import FPSCData + + +class FPAddStage0Data: + + def __init__(self, width, id_wid): + self.z = FPNumBase(width, False) + self.out_do_z = Signal(reset_less=True) + self.oz = Signal(width, reset_less=True) + self.tot = Signal(self.z.m_width + 4, reset_less=True) + self.mid = Signal(id_wid, reset_less=True) + + def eq(self, i): + return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz), + self.tot.eq(i.tot), self.mid.eq(i.mid)] + + +class FPAddStage0Mod(Elaboratable): + + def __init__(self, width, id_wid): + self.width = width + self.id_wid = id_wid + self.i = self.ispec() + self.o = self.ospec() + + def ispec(self): + return FPSCData(self.width, self.id_wid) + + def ospec(self): + return FPAddStage0Data(self.width, self.id_wid) + + def process(self, i): + return self.o + + def 
setup(self, m, i): + """ links module to inputs and outputs + """ + m.submodules.add0 = self + m.d.comb += self.i.eq(i) + + def elaborate(self, platform): + m = Module() + m.submodules.add0_in_a = self.i.a + m.submodules.add0_in_b = self.i.b + m.submodules.add0_out_z = self.o.z + + # store intermediate tests (and zero-extended mantissas) + seq = Signal(reset_less=True) + mge = Signal(reset_less=True) + am0 = Signal(len(self.i.a.m)+1, reset_less=True) + bm0 = Signal(len(self.i.b.m)+1, reset_less=True) + m.d.comb += [seq.eq(self.i.a.s == self.i.b.s), + mge.eq(self.i.a.m >= self.i.b.m), + am0.eq(Cat(self.i.a.m, 0)), + bm0.eq(Cat(self.i.b.m, 0)) + ] + # same-sign (both negative or both positive) add mantissas + with m.If(~self.i.out_do_z): + m.d.comb += self.o.z.e.eq(self.i.a.e) + with m.If(seq): + m.d.comb += [ + self.o.tot.eq(am0 + bm0), + self.o.z.s.eq(self.i.a.s) + ] + # a mantissa greater than b, use a + with m.Elif(mge): + m.d.comb += [ + self.o.tot.eq(am0 - bm0), + self.o.z.s.eq(self.i.a.s) + ] + # b mantissa greater than a, use b + with m.Else(): + m.d.comb += [ + self.o.tot.eq(bm0 - am0), + self.o.z.s.eq(self.i.b.s) + ] + + m.d.comb += self.o.oz.eq(self.i.oz) + m.d.comb += self.o.out_do_z.eq(self.i.out_do_z) + m.d.comb += self.o.mid.eq(self.i.mid) + return m + + +class FPAddStage0(FPState): + """ First stage of add. covers same-sign (add) and subtract + special-casing when mantissas are greater or equal, to + give greatest accuracy. 
+ """ + + def __init__(self, width, id_wid): + FPState.__init__(self, "add_0") + self.mod = FPAddStage0Mod(width) + self.o = self.mod.ospec() + + def setup(self, m, i): + """ links module to inputs and outputs + """ + self.mod.setup(m, i) + + # NOTE: these could be done as combinatorial (merge add0+add1) + m.d.sync += self.o.eq(self.mod.o) + + def action(self, m): + m.next = "add_1" diff --git a/src/ieee754/add/fpadd/add1.py b/src/ieee754/add/fpadd/add1.py new file mode 100644 index 00000000..679f5176 --- /dev/null +++ b/src/ieee754/add/fpadd/add1.py @@ -0,0 +1,95 @@ +# IEEE Floating Point Adder (Single Precision) +# Copyright (C) Jonathan P Dawson 2013 +# 2013-12-12 + +from nmigen import Module, Signal, Elaboratable +from nmigen.cli import main, verilog +from math import log + +from fpbase import FPState +from fpcommon.postcalc import FPAddStage1Data +from fpadd.add0 import FPAddStage0Data + + +class FPAddStage1Mod(FPState, Elaboratable): + """ Second stage of add: preparation for normalisation. + detects when tot sum is too big (tot[27] is kinda a carry bit) + """ + + def __init__(self, width, id_wid): + self.width = width + self.id_wid = id_wid + self.i = self.ispec() + self.o = self.ospec() + + def ispec(self): + return FPAddStage0Data(self.width, self.id_wid) + + def ospec(self): + return FPAddStage1Data(self.width, self.id_wid) + + def process(self, i): + return self.o + + def setup(self, m, i): + """ links module to inputs and outputs + """ + m.submodules.add1 = self + m.submodules.add1_out_overflow = self.o.of + + m.d.comb += self.i.eq(i) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.o.z.eq(self.i.z) + # tot[-1] (MSB) gets set when the sum overflows. 
shift result down + with m.If(~self.i.out_do_z): + with m.If(self.i.tot[-1]): + m.d.comb += [ + self.o.z.m.eq(self.i.tot[4:]), + self.o.of.m0.eq(self.i.tot[4]), + self.o.of.guard.eq(self.i.tot[3]), + self.o.of.round_bit.eq(self.i.tot[2]), + self.o.of.sticky.eq(self.i.tot[1] | self.i.tot[0]), + self.o.z.e.eq(self.i.z.e + 1) + ] + # tot[-1] (MSB) zero case + with m.Else(): + m.d.comb += [ + self.o.z.m.eq(self.i.tot[3:]), + self.o.of.m0.eq(self.i.tot[3]), + self.o.of.guard.eq(self.i.tot[2]), + self.o.of.round_bit.eq(self.i.tot[1]), + self.o.of.sticky.eq(self.i.tot[0]) + ] + + m.d.comb += self.o.out_do_z.eq(self.i.out_do_z) + m.d.comb += self.o.oz.eq(self.i.oz) + m.d.comb += self.o.mid.eq(self.i.mid) + + return m + + +class FPAddStage1(FPState): + + def __init__(self, width, id_wid): + FPState.__init__(self, "add_1") + self.mod = FPAddStage1Mod(width) + self.out_z = FPNumBase(width, False) + self.out_of = Overflow() + self.norm_stb = Signal() + + def setup(self, m, i): + """ links module to inputs and outputs + """ + self.mod.setup(m, i) + + m.d.sync += self.norm_stb.eq(0) # sets to zero when not in add1 state + + m.d.sync += self.out_of.eq(self.mod.out_of) + m.d.sync += self.out_z.eq(self.mod.out_z) + m.d.sync += self.norm_stb.eq(1) + + def action(self, m): + m.next = "normalise_1" + diff --git a/src/ieee754/add/fpadd/addstages.py b/src/ieee754/add/fpadd/addstages.py new file mode 100644 index 00000000..f5703aec --- /dev/null +++ b/src/ieee754/add/fpadd/addstages.py @@ -0,0 +1,55 @@ +# IEEE Floating Point Adder (Single Precision) +# Copyright (C) Jonathan P Dawson 2013 +# 2013-12-12 + +from nmigen import Module +from nmigen.cli import main, verilog + +from singlepipe import (StageChain, SimpleHandshake, + PassThroughStage) + +from fpbase import FPState +from fpcommon.denorm import FPSCData +from fpcommon.postcalc import FPAddStage1Data +from fpadd.align import FPAddAlignSingleMod +from fpadd.add0 import FPAddStage0Mod +from fpadd.add1 import FPAddStage1Mod + + +class 
FPAddAlignSingleAdd(FPState, SimpleHandshake): + + def __init__(self, width, id_wid): + FPState.__init__(self, "align") + self.width = width + self.id_wid = id_wid + SimpleHandshake.__init__(self, self) # pipeline is its own stage + self.a1o = self.ospec() + + def ispec(self): + return FPSCData(self.width, self.id_wid) + + def ospec(self): + return FPAddStage1Data(self.width, self.id_wid) # AddStage1 ospec + + def setup(self, m, i): + """ links module to inputs and outputs + """ + + # chain AddAlignSingle, AddStage0 and AddStage1 + mod = FPAddAlignSingleMod(self.width, self.id_wid) + a0mod = FPAddStage0Mod(self.width, self.id_wid) + a1mod = FPAddStage1Mod(self.width, self.id_wid) + + chain = StageChain([mod, a0mod, a1mod]) + chain.setup(m, i) + + self.o = a1mod.o + + def process(self, i): + return self.o + + def action(self, m): + m.d.sync += self.a1o.eq(self.process(None)) + m.next = "normalise_1" + + diff --git a/src/ieee754/add/fpadd/align.py b/src/ieee754/add/fpadd/align.py new file mode 100644 index 00000000..9837a0b8 --- /dev/null +++ b/src/ieee754/add/fpadd/align.py @@ -0,0 +1,211 @@ +# IEEE Floating Point Adder (Single Precision) +# Copyright (C) Jonathan P Dawson 2013 +# 2013-12-12 + +from nmigen import Module, Signal +from nmigen.cli import main, verilog + +from fpbase import FPNumOut, FPNumIn, FPNumBase +from fpbase import MultiShiftRMerge +from fpbase import FPState +from fpcommon.denorm import FPSCData + + +class FPNumIn2Ops: + + def __init__(self, width, id_wid): + self.a = FPNumIn(None, width) + self.b = FPNumIn(None, width) + self.z = FPNumOut(width, False) + self.out_do_z = Signal(reset_less=True) + self.oz = Signal(width, reset_less=True) + self.mid = Signal(id_wid, reset_less=True) + + def eq(self, i): + return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz), + self.a.eq(i.a), self.b.eq(i.b), self.mid.eq(i.mid)] + + + +class FPAddAlignMultiMod(FPState): + + def __init__(self, width): + self.in_a = FPNumBase(width) + self.in_b = 
FPNumBase(width) + self.out_a = FPNumIn(None, width) + self.out_b = FPNumIn(None, width) + self.exp_eq = Signal(reset_less=True) + + def elaborate(self, platform): + # This one however (single-cycle) will do the shift + # in one go. + + m = Module() + + m.submodules.align_in_a = self.in_a + m.submodules.align_in_b = self.in_b + m.submodules.align_out_a = self.out_a + m.submodules.align_out_b = self.out_b + + # NOTE: this does *not* do single-cycle multi-shifting, + # it *STAYS* in the align state until exponents match + + # exponent of a greater than b: shift b down + m.d.comb += self.exp_eq.eq(0) + m.d.comb += self.out_a.eq(self.in_a) + m.d.comb += self.out_b.eq(self.in_b) + agtb = Signal(reset_less=True) + altb = Signal(reset_less=True) + m.d.comb += agtb.eq(self.in_a.e > self.in_b.e) + m.d.comb += altb.eq(self.in_a.e < self.in_b.e) + with m.If(agtb): + m.d.comb += self.out_b.shift_down(self.in_b) + # exponent of b greater than a: shift a down + with m.Elif(altb): + m.d.comb += self.out_a.shift_down(self.in_a) + # exponents equal: move to next stage. 
+ with m.Else(): + m.d.comb += self.exp_eq.eq(1) + return m + + +class FPAddAlignMulti(FPState): + + def __init__(self, width, id_wid): + FPState.__init__(self, "align") + self.mod = FPAddAlignMultiMod(width) + self.out_a = FPNumIn(None, width) + self.out_b = FPNumIn(None, width) + self.exp_eq = Signal(reset_less=True) + + def setup(self, m, in_a, in_b): + """ links module to inputs and outputs + """ + m.submodules.align = self.mod + m.d.comb += self.mod.in_a.eq(in_a) + m.d.comb += self.mod.in_b.eq(in_b) + m.d.comb += self.exp_eq.eq(self.mod.exp_eq) + m.d.sync += self.out_a.eq(self.mod.out_a) + m.d.sync += self.out_b.eq(self.mod.out_b) + + def action(self, m): + with m.If(self.exp_eq): + m.next = "add_0" + + +class FPAddAlignSingleMod: + + def __init__(self, width, id_wid): + self.width = width + self.id_wid = id_wid + self.i = self.ispec() + self.o = self.ospec() + + def ispec(self): + return FPSCData(self.width, self.id_wid) + + def ospec(self): + return FPNumIn2Ops(self.width, self.id_wid) + + def process(self, i): + return self.o + + def setup(self, m, i): + """ links module to inputs and outputs + """ + m.submodules.align = self + m.d.comb += self.i.eq(i) + + def elaborate(self, platform): + """ Aligns A against B or B against A, depending on which has the + greater exponent. This is done in a *single* cycle using + variable-width bit-shift + + the shifter used here is quite expensive in terms of gates. 
+ Mux A or B in (and out) into temporaries, as only one of them + needs to be aligned against the other + """ + m = Module() + + m.submodules.align_in_a = self.i.a + m.submodules.align_in_b = self.i.b + m.submodules.align_out_a = self.o.a + m.submodules.align_out_b = self.o.b + + # temporary (muxed) input and output to be shifted + t_inp = FPNumBase(self.width) + t_out = FPNumIn(None, self.width) + espec = (len(self.i.a.e), True) + msr = MultiShiftRMerge(self.i.a.m_width, espec) + m.submodules.align_t_in = t_inp + m.submodules.align_t_out = t_out + m.submodules.multishift_r = msr + + ediff = Signal(espec, reset_less=True) + ediffr = Signal(espec, reset_less=True) + tdiff = Signal(espec, reset_less=True) + elz = Signal(reset_less=True) + egz = Signal(reset_less=True) + + # connect multi-shifter to t_inp/out mantissa (and tdiff) + m.d.comb += msr.inp.eq(t_inp.m) + m.d.comb += msr.diff.eq(tdiff) + m.d.comb += t_out.m.eq(msr.m) + m.d.comb += t_out.e.eq(t_inp.e + tdiff) + m.d.comb += t_out.s.eq(t_inp.s) + + m.d.comb += ediff.eq(self.i.a.e - self.i.b.e) + m.d.comb += ediffr.eq(self.i.b.e - self.i.a.e) + m.d.comb += elz.eq(self.i.a.e < self.i.b.e) + m.d.comb += egz.eq(self.i.a.e > self.i.b.e) + + # default: A-exp == B-exp, A and B untouched (fall through) + m.d.comb += self.o.a.eq(self.i.a) + m.d.comb += self.o.b.eq(self.i.b) + # only one shifter (muxed) + #m.d.comb += t_out.shift_down_multi(tdiff, t_inp) + # exponent of a greater than b: shift b down + with m.If(~self.i.out_do_z): + with m.If(egz): + m.d.comb += [t_inp.eq(self.i.b), + tdiff.eq(ediff), + self.o.b.eq(t_out), + self.o.b.s.eq(self.i.b.s), # whoops forgot sign + ] + # exponent of b greater than a: shift a down + with m.Elif(elz): + m.d.comb += [t_inp.eq(self.i.a), + tdiff.eq(ediffr), + self.o.a.eq(t_out), + self.o.a.s.eq(self.i.a.s), # whoops forgot sign + ] + + m.d.comb += self.o.mid.eq(self.i.mid) + m.d.comb += self.o.z.eq(self.i.z) + m.d.comb += self.o.out_do_z.eq(self.i.out_do_z) + m.d.comb += 
self.o.oz.eq(self.i.oz)

        return m


class FPAddAlignSingle(FPState):
    """ FSM-style wrapper around FPAddAlignSingleMod: registers the
        aligned operands and advances the state machine to "add_0".

        NOTE(review): FPAddAlignSingleMod (above) exposes its outputs as
        `self.o` (an FPNumIn2Ops with .a/.b); it defines no `out_a`/`out_b`
        attributes, so the sync assignments below look stale -- confirm
        against the FSM code path before relying on this class.
    """

    def __init__(self, width, id_wid):
        FPState.__init__(self, "align")
        self.mod = FPAddAlignSingleMod(width, id_wid)
        self.out_a = FPNumIn(None, width)   # registered aligned operand A
        self.out_b = FPNumIn(None, width)   # registered aligned operand B

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, i)

        # NOTE: could be done as comb
        m.d.sync += self.out_a.eq(self.mod.out_a)
        m.d.sync += self.out_b.eq(self.mod.out_b)

    def action(self, m):
        # alignment is single-cycle: move straight on to the add stage
        m.next = "add_0"


diff --git a/src/ieee754/add/fpadd/pipeline.py b/src/ieee754/add/fpadd/pipeline.py
new file mode 100644
index 00000000..e244ee60
--- /dev/null
+++ b/src/ieee754/add/fpadd/pipeline.py
@@ -0,0 +1,59 @@
# IEEE Floating Point Adder (Single Precision)
# Copyright (C) Jonathan P Dawson 2013
# 2013-12-12

from nmigen import Module
from nmigen.cli import main, verilog

from singlepipe import (ControlBase, SimpleHandshake, PassThroughStage)
from multipipe import CombMuxOutPipe
from multipipe import PriorityCombMuxInPipe

from fpcommon.getop import FPADDBaseData
from fpcommon.denorm import FPSCData
from fpcommon.pack import FPPackData
from fpcommon.normtopack import FPNormToPack
from fpadd.specialcases import FPAddSpecialCasesDeNorm
from fpadd.addstages import FPAddAlignSingleAdd

from concurrentunit import ReservationStations, num_bits


class FPADDBasePipe(ControlBase):
    """ 3-stage FP add pipeline:

        1. special-cases detection plus denormalisation
        2. alignment plus mantissa add
        3. normalisation, rounding and packing
    """
    def __init__(self, width, id_wid):
        ControlBase.__init__(self)
        self.pipe1 = FPAddSpecialCasesDeNorm(width, id_wid)
        self.pipe2 = FPAddAlignSingleAdd(width, id_wid)
        self.pipe3 = FPNormToPack(width, id_wid)

        # chain the three stages' ready/valid handshakes together
        self._eqs = self.connect([self.pipe1, self.pipe2, self.pipe3])

    def elaborate(self, platform):
        m = ControlBase.elaborate(self, platform)
        m.submodules.scnorm = self.pipe1
        m.submodules.addalign = self.pipe2
        m.submodules.normpack = self.pipe3
        m.d.comb += self._eqs
        return m


class FPADDMuxInOut(ReservationStations):
    """ 
Reservation-Station version of FPADD pipeline. + + * fan-in on inputs (an array of FPADDBaseData: a,b,mid) + * 3-stage adder pipeline + * fan-out on outputs (an array of FPPackData: z,mid) + + Fan-in and Fan-out are combinatorial. + """ + def __init__(self, width, num_rows): + self.width = width + self.id_wid = num_bits(width) + self.alu = FPADDBasePipe(width, self.id_wid) + ReservationStations.__init__(self, num_rows) + + def i_specfn(self): + return FPADDBaseData(self.width, self.id_wid) + + def o_specfn(self): + return FPPackData(self.width, self.id_wid) diff --git a/src/ieee754/add/fpadd/specialcases.py b/src/ieee754/add/fpadd/specialcases.py new file mode 100644 index 00000000..6f9d1a08 --- /dev/null +++ b/src/ieee754/add/fpadd/specialcases.py @@ -0,0 +1,223 @@ +# IEEE Floating Point Adder (Single Precision) +# Copyright (C) Jonathan P Dawson 2013 +# 2013-12-12 + +from nmigen import Module, Signal, Cat, Const +from nmigen.cli import main, verilog +from math import log + +from fpbase import FPNumDecode +from singlepipe import SimpleHandshake, StageChain + +from fpbase import FPState, FPID +from fpcommon.getop import FPADDBaseData +from fpcommon.denorm import (FPSCData, FPAddDeNormMod) + + +class FPAddSpecialCasesMod: + """ special cases: NaNs, infs, zeros, denormalised + NOTE: some of these are unique to add. 
see "Special Operations" + https://steve.hollasch.net/cgindex/coding/ieeefloat.html + """ + + def __init__(self, width, id_wid): + self.width = width + self.id_wid = id_wid + self.i = self.ispec() + self.o = self.ospec() + + def ispec(self): + return FPADDBaseData(self.width, self.id_wid) + + def ospec(self): + return FPSCData(self.width, self.id_wid) + + def setup(self, m, i): + """ links module to inputs and outputs + """ + m.submodules.specialcases = self + m.d.comb += self.i.eq(i) + + def process(self, i): + return self.o + + def elaborate(self, platform): + m = Module() + + m.submodules.sc_out_z = self.o.z + + # decode: XXX really should move to separate stage + a1 = FPNumDecode(None, self.width) + b1 = FPNumDecode(None, self.width) + m.submodules.sc_decode_a = a1 + m.submodules.sc_decode_b = b1 + m.d.comb += [a1.v.eq(self.i.a), + b1.v.eq(self.i.b), + self.o.a.eq(a1), + self.o.b.eq(b1) + ] + + s_nomatch = Signal(reset_less=True) + m.d.comb += s_nomatch.eq(a1.s != b1.s) + + m_match = Signal(reset_less=True) + m.d.comb += m_match.eq(a1.m == b1.m) + + e_match = Signal(reset_less=True) + m.d.comb += e_match.eq(a1.e == b1.e) + + aeqmb = Signal(reset_less=True) + m.d.comb += aeqmb.eq(s_nomatch & m_match & e_match) + + abz = Signal(reset_less=True) + m.d.comb += abz.eq(a1.is_zero & b1.is_zero) + + abnan = Signal(reset_less=True) + m.d.comb += abnan.eq(a1.is_nan | b1.is_nan) + + bexp128s = Signal(reset_less=True) + m.d.comb += bexp128s.eq(b1.exp_128 & s_nomatch) + + # if a is NaN or b is NaN return NaN + with m.If(abnan): + m.d.comb += self.o.out_do_z.eq(1) + m.d.comb += self.o.z.nan(0) + + # XXX WEIRDNESS for FP16 non-canonical NaN handling + # under review + + ## if a is zero and b is NaN return -b + #with m.If(a.is_zero & (a.s==0) & b.is_nan): + # m.d.comb += self.o.out_do_z.eq(1) + # m.d.comb += z.create(b.s, b.e, Cat(b.m[3:-2], ~b.m[0])) + + ## if b is zero and a is NaN return -a + #with m.Elif(b.is_zero & (b.s==0) & a.is_nan): + # m.d.comb += 
self.o.out_do_z.eq(1)
        #    m.d.comb += z.create(a.s, a.e, Cat(a.m[3:-2], ~a.m[0]))

        ## if a is -zero and b is NaN return -b
        #with m.Elif(a.is_zero & (a.s==1) & b.is_nan):
        #    m.d.comb += self.o.out_do_z.eq(1)
        #    m.d.comb += z.create(a.s & b.s, b.e, Cat(b.m[3:-2], 1))

        ## if b is -zero and a is NaN return -a
        #with m.Elif(b.is_zero & (b.s==1) & a.is_nan):
        #    m.d.comb += self.o.out_do_z.eq(1)
        #    m.d.comb += z.create(a.s & b.s, a.e, Cat(a.m[3:-2], 1))

        # if a is inf return inf (or NaN)
        with m.Elif(a1.is_inf):
            m.d.comb += self.o.out_do_z.eq(1)
            m.d.comb += self.o.z.inf(a1.s)
            # if a is inf and signs don't match return NaN
            with m.If(bexp128s):
                m.d.comb += self.o.z.nan(0)

        # if b is inf return inf
        with m.Elif(b1.is_inf):
            m.d.comb += self.o.out_do_z.eq(1)
            m.d.comb += self.o.z.inf(b1.s)

        # if a is zero and b zero return signed-a/b
        with m.Elif(abz):
            m.d.comb += self.o.out_do_z.eq(1)
            m.d.comb += self.o.z.create(a1.s & b1.s, b1.e, b1.m[3:-1])

        # if a is zero return b
        with m.Elif(a1.is_zero):
            m.d.comb += self.o.out_do_z.eq(1)
            m.d.comb += self.o.z.create(b1.s, b1.e, b1.m[3:-1])

        # if b is zero return a
        with m.Elif(b1.is_zero):
            m.d.comb += self.o.out_do_z.eq(1)
            m.d.comb += self.o.z.create(a1.s, a1.e, a1.m[3:-1])

        # if a equal to -b return zero (+ve zero)
        with m.Elif(aeqmb):
            m.d.comb += self.o.out_do_z.eq(1)
            m.d.comb += self.o.z.zero(0)

        # Denormalised Number checks next, so pass a/b data through
        with m.Else():
            m.d.comb += self.o.out_do_z.eq(0)

        # oz mirrors the packed special-case result; mid carries the
        # reservation-station id through unconditionally
        m.d.comb += self.o.oz.eq(self.o.z.v)
        m.d.comb += self.o.mid.eq(self.i.mid)

        return m


class FPAddSpecialCases(FPState):
    """ special cases: NaNs, infs, zeros, denormalised
        NOTE: some of these are unique to add.  see "Special Operations"
        https://steve.hollasch.net/cgindex/coding/ieeefloat.html

        NOTE(review): this legacy FSM wrapper looks out of step with
        FPAddSpecialCasesMod as defined above: the Mod constructor
        requires (width, id_wid) but only width is passed; Mod.setup
        takes (m, i) yet three arguments are passed here; and the Mod
        has no `out_z` attribute (its output bundle is `self.o`).
        Confirm before using this class.
    """

    def __init__(self, width, id_wid):
        FPState.__init__(self, "special_cases")
        self.mod = FPAddSpecialCasesMod(width)
        self.out_z = self.mod.ospec()
        self.out_do_z = Signal(reset_less=True)  # "result decided early" flag

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, i, self.out_do_z)
        m.d.sync += self.out_z.v.eq(self.mod.out_z.v)  # only take the output
        m.d.sync += self.out_z.mid.eq(self.mod.o.mid)  # (and mid)

    def action(self, m):
        self.idsync(m)
        # early-out: a special case fully determined the result
        with m.If(self.out_do_z):
            m.next = "put_z"
        with m.Else():
            m.next = "denormalise"


class FPAddSpecialCasesDeNorm(FPState, SimpleHandshake):
    """ special cases: NaNs, infs, zeros, denormalised
        NOTE: some of these are unique to add. see "Special Operations"
        https://steve.hollasch.net/cgindex/coding/ieeefloat.html

        Pipeline stage combining special-case detection and
        denormalisation into a single SimpleHandshake stage.
    """

    def __init__(self, width, id_wid):
        FPState.__init__(self, "special_cases")
        self.width = width
        self.id_wid = id_wid
        SimpleHandshake.__init__(self, self)  # pipe is its own stage
        self.out = self.ospec()

    def ispec(self):
        return FPADDBaseData(self.width, self.id_wid)  # SpecialCases ispec

    def ospec(self):
        return FPSCData(self.width, self.id_wid)  # DeNorm ospec

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        # chain the two combinatorial sub-modules back-to-back
        smod = FPAddSpecialCasesMod(self.width, self.id_wid)
        dmod = FPAddDeNormMod(self.width, self.id_wid)

        chain = StageChain([smod, dmod])
        chain.setup(m, i)

        # only needed for break-out (early-out)
        # self.out_do_z = smod.o.out_do_z

        # output of the chain is the denorm module's output
        self.o = dmod.o

    def process(self, i):
        return self.o

    def action(self, m):
        # for break-out (early-out)
        #with m.If(self.out_do_z):
        #    m.next = "put_z"
        #with m.Else():
        m.d.sync += self.out.eq(self.process(None))
        m.next = "align"


diff --git a/src/ieee754/add/fpadd/statemachine.py b/src/ieee754/add/fpadd/statemachine.py
new file mode 100644
index 00000000..4418b3fa
--- 
/dev/null
+++ b/src/ieee754/add/fpadd/statemachine.py
@@ -0,0 +1,376 @@
# IEEE Floating Point Adder (Single Precision)
# Copyright (C) Jonathan P Dawson 2013
# 2013-12-12

from nmigen import Module, Signal, Cat, Mux, Array, Const
from nmigen.cli import main, verilog
from math import log

from fpbase import FPOpIn, FPOpOut
from fpbase import Trigger
from singlepipe import (StageChain, SimpleHandshake)

from fpbase import FPState, FPID
from fpcommon.getop import (FPGetOp, FPADDBaseData, FPGet2Op)
from fpcommon.denorm import (FPSCData, FPAddDeNorm)
from fpcommon.postcalc import FPAddStage1Data
from fpcommon.postnormalise import (FPNorm1Data,
                                    FPNorm1Single, FPNorm1Multi)
from fpcommon.roundz import (FPRoundData, FPRound)
from fpcommon.corrections import FPCorrections
from fpcommon.pack import (FPPackData, FPPackMod, FPPack)
from fpcommon.normtopack import FPNormToPack
from fpcommon.putz import (FPPutZ, FPPutZIdx)

from fpadd.specialcases import (FPAddSpecialCases, FPAddSpecialCasesDeNorm)
from fpadd.align import (FPAddAlignMulti, FPAddAlignSingle)
from fpadd.add0 import (FPAddStage0Data, FPAddStage0)
from fpadd.add1 import (FPAddStage1Mod, FPAddStage1)
from fpadd.addstages import FPAddAlignSingleAdd


class FPOpData:
    """ output bundle of the FP adder: a handshaked result operand `z`
        plus the reservation-station id `mid` routed alongside it.
    """
    def __init__(self, width, id_wid):
        self.z = FPOpOut(width)        # handshaked output operand
        self.z.data_o = Signal(width)  # backing data signal for z
        self.mid = Signal(id_wid, reset_less=True)  # reservation-station id

    def __iter__(self):
        yield self.z
        yield self.mid

    def eq(self, i):
        # copy both the operand and its routing id from another bundle
        return [self.z.eq(i.z), self.mid.eq(i.mid)]

    def ports(self):
        return list(self)


class FPADDBaseMod:

    def __init__(self, width, id_wid=None, single_cycle=False, compact=True):
        """ IEEE754 FP Add

            * width: bit-width of IEEE754. 
supported: 16, 32, 64 + * id_wid: an identifier that is sync-connected to the input + * single_cycle: True indicates each stage to complete in 1 clock + * compact: True indicates a reduced number of stages + """ + self.width = width + self.id_wid = id_wid + self.single_cycle = single_cycle + self.compact = compact + + self.in_t = Trigger() + self.i = self.ispec() + self.o = self.ospec() + + self.states = [] + + def ispec(self): + return FPADDBaseData(self.width, self.id_wid) + + def ospec(self): + return FPOpData(self.width, self.id_wid) + + def add_state(self, state): + self.states.append(state) + return state + + def elaborate(self, platform=None): + """ creates the HDL code-fragment for FPAdd + """ + m = Module() + m.submodules.out_z = self.o.z + m.submodules.in_t = self.in_t + if self.compact: + self.get_compact_fragment(m, platform) + else: + self.get_longer_fragment(m, platform) + + with m.FSM() as fsm: + + for state in self.states: + with m.State(state.state_from): + state.action(m) + + return m + + def get_longer_fragment(self, m, platform=None): + + get = self.add_state(FPGet2Op("get_ops", "special_cases", + self.width)) + get.setup(m, self.i) + a = get.out_op1 + b = get.out_op2 + get.trigger_setup(m, self.in_t.stb, self.in_t.ack) + + sc = self.add_state(FPAddSpecialCases(self.width, self.id_wid)) + sc.setup(m, a, b, self.in_mid) + + dn = self.add_state(FPAddDeNorm(self.width, self.id_wid)) + dn.setup(m, a, b, sc.in_mid) + + if self.single_cycle: + alm = self.add_state(FPAddAlignSingle(self.width, self.id_wid)) + alm.setup(m, dn.out_a, dn.out_b, dn.in_mid) + else: + alm = self.add_state(FPAddAlignMulti(self.width, self.id_wid)) + alm.setup(m, dn.out_a, dn.out_b, dn.in_mid) + + add0 = self.add_state(FPAddStage0(self.width, self.id_wid)) + add0.setup(m, alm.out_a, alm.out_b, alm.in_mid) + + add1 = self.add_state(FPAddStage1(self.width, self.id_wid)) + add1.setup(m, add0.out_tot, add0.out_z, add0.in_mid) + + if self.single_cycle: + n1 = 
self.add_state(FPNorm1Single(self.width, self.id_wid)) + n1.setup(m, add1.out_z, add1.out_of, add0.in_mid) + else: + n1 = self.add_state(FPNorm1Multi(self.width, self.id_wid)) + n1.setup(m, add1.out_z, add1.out_of, add1.norm_stb, add0.in_mid) + + rn = self.add_state(FPRound(self.width, self.id_wid)) + rn.setup(m, n1.out_z, n1.out_roundz, n1.in_mid) + + cor = self.add_state(FPCorrections(self.width, self.id_wid)) + cor.setup(m, rn.out_z, rn.in_mid) + + pa = self.add_state(FPPack(self.width, self.id_wid)) + pa.setup(m, cor.out_z, rn.in_mid) + + ppz = self.add_state(FPPutZ("pack_put_z", pa.out_z, self.out_z, + pa.in_mid, self.out_mid)) + + pz = self.add_state(FPPutZ("put_z", sc.out_z, self.out_z, + pa.in_mid, self.out_mid)) + + def get_compact_fragment(self, m, platform=None): + + get = FPGet2Op("get_ops", "special_cases", self.width, self.id_wid) + sc = FPAddSpecialCasesDeNorm(self.width, self.id_wid) + alm = FPAddAlignSingleAdd(self.width, self.id_wid) + n1 = FPNormToPack(self.width, self.id_wid) + + get.trigger_setup(m, self.in_t.stb, self.in_t.ack) + + chainlist = [get, sc, alm, n1] + chain = StageChain(chainlist, specallocate=True) + chain.setup(m, self.i) + + for mod in chainlist: + sc = self.add_state(mod) + + ppz = self.add_state(FPPutZ("pack_put_z", n1.out_z.z, self.o, + n1.out_z.mid, self.o.mid)) + + #pz = self.add_state(FPPutZ("put_z", sc.out_z.z, self.o, + # sc.o.mid, self.o.mid)) + + +class FPADDBase(FPState): + + def __init__(self, width, id_wid=None, single_cycle=False): + """ IEEE754 FP Add + + * width: bit-width of IEEE754. 
supported: 16, 32, 64 + * id_wid: an identifier that is sync-connected to the input + * single_cycle: True indicates each stage to complete in 1 clock + """ + FPState.__init__(self, "fpadd") + self.width = width + self.single_cycle = single_cycle + self.mod = FPADDBaseMod(width, id_wid, single_cycle) + self.o = self.ospec() + + self.in_t = Trigger() + self.i = self.ispec() + + self.z_done = Signal(reset_less=True) # connects to out_z Strobe + self.in_accept = Signal(reset_less=True) + self.add_stb = Signal(reset_less=True) + self.add_ack = Signal(reset=0, reset_less=True) + + def ispec(self): + return self.mod.ispec() + + def ospec(self): + return self.mod.ospec() + + def setup(self, m, i, add_stb, in_mid): + m.d.comb += [self.i.eq(i), + self.mod.i.eq(self.i), + self.z_done.eq(self.mod.o.z.trigger), + #self.add_stb.eq(add_stb), + self.mod.in_t.stb.eq(self.in_t.stb), + self.in_t.ack.eq(self.mod.in_t.ack), + self.o.mid.eq(self.mod.o.mid), + self.o.z.v.eq(self.mod.o.z.v), + self.o.z.valid_o.eq(self.mod.o.z.valid_o), + self.mod.o.z.ready_i.eq(self.o.z.ready_i_test), + ] + + m.d.sync += self.add_stb.eq(add_stb) + m.d.sync += self.add_ack.eq(0) # sets to zero when not in active state + m.d.sync += self.o.z.ready_i.eq(0) # likewise + #m.d.sync += self.in_t.stb.eq(0) + + m.submodules.fpadd = self.mod + + def action(self, m): + + # in_accept is set on incoming strobe HIGH and ack LOW. + m.d.comb += self.in_accept.eq((~self.add_ack) & (self.add_stb)) + + #with m.If(self.in_t.ack): + # m.d.sync += self.in_t.stb.eq(0) + with m.If(~self.z_done): + # not done: test for accepting an incoming operand pair + with m.If(self.in_accept): + m.d.sync += [ + self.add_ack.eq(1), # acknowledge receipt... 
+ self.in_t.stb.eq(1), # initiate add + ] + with m.Else(): + m.d.sync += [self.add_ack.eq(0), + self.in_t.stb.eq(0), + self.o.z.ready_i.eq(1), + ] + with m.Else(): + # done: acknowledge, and write out id and value + m.d.sync += [self.add_ack.eq(1), + self.in_t.stb.eq(0) + ] + m.next = "put_z" + + return + + if self.in_mid is not None: + m.d.sync += self.out_mid.eq(self.mod.out_mid) + + m.d.sync += [ + self.out_z.v.eq(self.mod.out_z.v) + ] + # move to output state on detecting z ack + with m.If(self.out_z.trigger): + m.d.sync += self.out_z.stb.eq(0) + m.next = "put_z" + with m.Else(): + m.d.sync += self.out_z.stb.eq(1) + + +class FPADD(FPID): + """ FPADD: stages as follows: + + FPGetOp (a) + | + FPGetOp (b) + | + FPAddBase---> FPAddBaseMod + | | + PutZ GetOps->Specials->Align->Add1/2->Norm->Round/Pack->PutZ + + FPAddBase is tricky: it is both a stage and *has* stages. + Connection to FPAddBaseMod therefore requires an in stb/ack + and an out stb/ack. Just as with Add1-Norm1 interaction, FPGetOp + needs to be the thing that raises the incoming stb. + """ + + def __init__(self, width, id_wid=None, single_cycle=False, rs_sz=2): + """ IEEE754 FP Add + + * width: bit-width of IEEE754. 
supported: 16, 32, 64 + * id_wid: an identifier that is sync-connected to the input + * single_cycle: True indicates each stage to complete in 1 clock + """ + self.width = width + self.id_wid = id_wid + self.single_cycle = single_cycle + + #self.out_z = FPOp(width) + self.ids = FPID(id_wid) + + rs = [] + for i in range(rs_sz): + in_a = FPOpIn(width) + in_b = FPOpIn(width) + in_a.data_i = Signal(width) + in_b.data_i = Signal(width) + in_a.name = "in_a_%d" % i + in_b.name = "in_b_%d" % i + rs.append((in_a, in_b)) + self.rs = Array(rs) + + res = [] + for i in range(rs_sz): + out_z = FPOpOut(width) + out_z.data_o = Signal(width) + out_z.name = "out_z_%d" % i + res.append(out_z) + self.res = Array(res) + + self.states = [] + + def add_state(self, state): + self.states.append(state) + return state + + def elaborate(self, platform=None): + """ creates the HDL code-fragment for FPAdd + """ + m = Module() + #m.submodules += self.rs + + in_a = self.rs[0][0] + in_b = self.rs[0][1] + + geta = self.add_state(FPGetOp("get_a", "get_b", + in_a, self.width)) + geta.setup(m, in_a) + a = geta.out_op + + getb = self.add_state(FPGetOp("get_b", "fpadd", + in_b, self.width)) + getb.setup(m, in_b) + b = getb.out_op + + ab = FPADDBase(self.width, self.id_wid, self.single_cycle) + ab = self.add_state(ab) + abd = ab.ispec() # create an input spec object for FPADDBase + m.d.sync += [abd.a.eq(a), abd.b.eq(b), abd.mid.eq(self.ids.in_mid)] + ab.setup(m, abd, getb.out_decode, self.ids.in_mid) + o = ab.o + + pz = self.add_state(FPPutZIdx("put_z", o.z, self.res, + o.mid, "get_a")) + + with m.FSM() as fsm: + + for state in self.states: + with m.State(state.state_from): + state.action(m) + + return m + + +if __name__ == "__main__": + if True: + alu = FPADD(width=32, id_wid=5, single_cycle=True) + main(alu, ports=alu.rs[0][0].ports() + \ + alu.rs[0][1].ports() + \ + alu.res[0].ports() + \ + [alu.ids.in_mid, alu.ids.out_mid]) + else: + alu = FPADDBase(width=32, id_wid=5, single_cycle=True) + main(alu, 
ports=[alu.in_a, alu.in_b] + \ + alu.in_t.ports() + \ + alu.out_z.ports() + \ + [alu.in_mid, alu.out_mid]) + + + # works... but don't use, just do "python fname.py convert -t v" + #print (verilog.convert(alu, ports=[ + # ports=alu.in_a.ports() + \ + # alu.in_b.ports() + \ + # alu.out_z.ports()) diff --git a/src/ieee754/add/fpbase.py b/src/ieee754/add/fpbase.py new file mode 100644 index 00000000..f4908592 --- /dev/null +++ b/src/ieee754/add/fpbase.py @@ -0,0 +1,733 @@ +# IEEE Floating Point Adder (Single Precision) +# Copyright (C) Jonathan P Dawson 2013 +# 2013-12-12 + +from nmigen import Signal, Cat, Const, Mux, Module, Elaboratable +from math import log +from operator import or_ +from functools import reduce + +from singlepipe import PrevControl, NextControl +from pipeline import ObjectProxy + + +class MultiShiftR: + + def __init__(self, width): + self.width = width + self.smax = int(log(width) / log(2)) + self.i = Signal(width, reset_less=True) + self.s = Signal(self.smax, reset_less=True) + self.o = Signal(width, reset_less=True) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.o.eq(self.i >> self.s) + return m + + +class MultiShift: + """ Generates variable-length single-cycle shifter from a series + of conditional tests on each bit of the left/right shift operand. + Each bit tested produces output shifted by that number of bits, + in a binary fashion: bit 1 if set shifts by 1 bit, bit 2 if set + shifts by 2 bits, each partial result cascading to the next Mux. + + Could be adapted to do arithmetic shift by taking copies of the + MSB instead of zeros. 
+ """ + + def __init__(self, width): + self.width = width + self.smax = int(log(width) / log(2)) + + def lshift(self, op, s): + res = op << s + return res[:len(op)] + res = op + for i in range(self.smax): + zeros = [0] * (1<> s + return res[:len(op)] + res = op + for i in range(self.smax): + zeros = [0] * (1< 0) + m.d.comb += self.exp_lt_n126.eq(self.exp_sub_n126 < 0) + m.d.comb += self.exp_gt127.eq(self.e > self.P127) + m.d.comb += self.exp_n127.eq(self.e == self.N127) + m.d.comb += self.exp_n126.eq(self.e == self.N126) + m.d.comb += self.m_zero.eq(self.m == self.mzero) + m.d.comb += self.m_msbzero.eq(self.m[self.e_start] == 0) + + return m + + def _is_nan(self): + return (self.exp_128) & (~self.m_zero) + + def _is_inf(self): + return (self.exp_128) & (self.m_zero) + + def _is_zero(self): + return (self.exp_n127) & (self.m_zero) + + def _is_overflowed(self): + return self.exp_gt127 + + def _is_denormalised(self): + return (self.exp_n126) & (self.m_msbzero) + + def __iter__(self): + yield self.s + yield self.e + yield self.m + + def eq(self, inp): + return [self.s.eq(inp.s), self.e.eq(inp.e), self.m.eq(inp.m)] + + +class FPNumOut(FPNumBase): + """ Floating-point Number Class + + Contains signals for an incoming copy of the value, decoded into + sign / exponent / mantissa. + Also contains encoding functions, creation and recognition of + zero, NaN and inf (all signed) + + Four extra bits are included in the mantissa: the top bit + (m[-1]) is effectively a carry-overflow. 
The other three are + guard (m[2]), round (m[1]), and sticky (m[0]) + """ + def __init__(self, width, m_extra=True): + FPNumBase.__init__(self, width, m_extra) + + def elaborate(self, platform): + m = FPNumBase.elaborate(self, platform) + + return m + + def create(self, s, e, m): + """ creates a value from sign / exponent / mantissa + + bias is added here, to the exponent + """ + return [ + self.v[-1].eq(s), # sign + self.v[self.e_start:self.e_end].eq(e + self.P127), # exp (add on bias) + self.v[0:self.e_start].eq(m) # mantissa + ] + + def nan(self, s): + return self.create(s, self.P128, 1<<(self.e_start-1)) + + def inf(self, s): + return self.create(s, self.P128, 0) + + def zero(self, s): + return self.create(s, self.N127, 0) + + def create2(self, s, e, m): + """ creates a value from sign / exponent / mantissa + + bias is added here, to the exponent + """ + e = e + self.P127 # exp (add on bias) + return Cat(m[0:self.e_start], + e[0:self.e_end-self.e_start], + s) + + def nan2(self, s): + return self.create2(s, self.P128, self.msb1) + + def inf2(self, s): + return self.create2(s, self.P128, self.mzero) + + def zero2(self, s): + return self.create2(s, self.N127, self.mzero) + + +class MultiShiftRMerge(Elaboratable): + """ shifts down (right) and merges lower bits into m[0]. 
+ m[0] is the "sticky" bit, basically + """ + def __init__(self, width, s_max=None): + if s_max is None: + s_max = int(log(width) / log(2)) + self.smax = s_max + self.m = Signal(width, reset_less=True) + self.inp = Signal(width, reset_less=True) + self.diff = Signal(s_max, reset_less=True) + self.width = width + + def elaborate(self, platform): + m = Module() + + rs = Signal(self.width, reset_less=True) + m_mask = Signal(self.width, reset_less=True) + smask = Signal(self.width, reset_less=True) + stickybit = Signal(reset_less=True) + maxslen = Signal(self.smax, reset_less=True) + maxsleni = Signal(self.smax, reset_less=True) + + sm = MultiShift(self.width-1) + m0s = Const(0, self.width-1) + mw = Const(self.width-1, len(self.diff)) + m.d.comb += [maxslen.eq(Mux(self.diff > mw, mw, self.diff)), + maxsleni.eq(Mux(self.diff > mw, 0, mw-self.diff)), + ] + + m.d.comb += [ + # shift mantissa by maxslen, mask by inverse + rs.eq(sm.rshift(self.inp[1:], maxslen)), + m_mask.eq(sm.rshift(~m0s, maxsleni)), + smask.eq(self.inp[1:] & m_mask), + # sticky bit combines all mask (and mantissa low bit) + stickybit.eq(smask.bool() | self.inp[0]), + # mantissa result contains m[0] already. + self.m.eq(Cat(stickybit, rs)) + ] + return m + + +class FPNumShift(FPNumBase, Elaboratable): + """ Floating-point Number Class for shifting + """ + def __init__(self, mainm, op, inv, width, m_extra=True): + FPNumBase.__init__(self, width, m_extra) + self.latch_in = Signal() + self.mainm = mainm + self.inv = inv + self.op = op + + def elaborate(self, platform): + m = FPNumBase.elaborate(self, platform) + + m.d.comb += self.s.eq(op.s) + m.d.comb += self.e.eq(op.e) + m.d.comb += self.m.eq(op.m) + + with self.mainm.State("align"): + with m.If(self.e < self.inv.e): + m.d.sync += self.shift_down() + + return m + + def shift_down(self, inp): + """ shifts a mantissa down by one. 
exponent is increased to compensate + + accuracy is lost as a result in the mantissa however there are 3 + guard bits (the latter of which is the "sticky" bit) + """ + return [self.e.eq(inp.e + 1), + self.m.eq(Cat(inp.m[0] | inp.m[1], inp.m[2:], 0)) + ] + + def shift_down_multi(self, diff): + """ shifts a mantissa down. exponent is increased to compensate + + accuracy is lost as a result in the mantissa however there are 3 + guard bits (the latter of which is the "sticky" bit) + + this code works by variable-shifting the mantissa by up to + its maximum bit-length: no point doing more (it'll still be + zero). + + the sticky bit is computed by shifting a batch of 1s by + the same amount, which will introduce zeros. it's then + inverted and used as a mask to get the LSBs of the mantissa. + those are then |'d into the sticky bit. + """ + sm = MultiShift(self.width) + mw = Const(self.m_width-1, len(diff)) + maxslen = Mux(diff > mw, mw, diff) + rs = sm.rshift(self.m[1:], maxslen) + maxsleni = mw - maxslen + m_mask = sm.rshift(self.m1s[1:], maxsleni) # shift and invert + + stickybits = reduce(or_, self.m[1:] & m_mask) | self.m[0] + return [self.e.eq(self.e + diff), + self.m.eq(Cat(stickybits, rs)) + ] + + def shift_up_multi(self, diff): + """ shifts a mantissa up. exponent is decreased to compensate + """ + sm = MultiShift(self.width) + mw = Const(self.m_width, len(diff)) + maxslen = Mux(diff > mw, mw, diff) + + return [self.e.eq(self.e - diff), + self.m.eq(sm.lshift(self.m, maxslen)) + ] + + +class FPNumDecode(FPNumBase): + """ Floating-point Number Class + + Contains signals for an incoming copy of the value, decoded into + sign / exponent / mantissa. + Also contains encoding functions, creation and recognition of + zero, NaN and inf (all signed) + + Four extra bits are included in the mantissa: the top bit + (m[-1]) is effectively a carry-overflow. 
The other three are + guard (m[2]), round (m[1]), and sticky (m[0]) + """ + def __init__(self, op, width, m_extra=True): + FPNumBase.__init__(self, width, m_extra) + self.op = op + + def elaborate(self, platform): + m = FPNumBase.elaborate(self, platform) + + m.d.comb += self.decode(self.v) + + return m + + def decode(self, v): + """ decodes a latched value into sign / exponent / mantissa + + bias is subtracted here, from the exponent. exponent + is extended to 10 bits so that subtract 127 is done on + a 10-bit number + """ + args = [0] * self.m_extra + [v[0:self.e_start]] # pad with extra zeros + #print ("decode", self.e_end) + return [self.m.eq(Cat(*args)), # mantissa + self.e.eq(v[self.e_start:self.e_end] - self.P127), # exp + self.s.eq(v[-1]), # sign + ] + +class FPNumIn(FPNumBase): + """ Floating-point Number Class + + Contains signals for an incoming copy of the value, decoded into + sign / exponent / mantissa. + Also contains encoding functions, creation and recognition of + zero, NaN and inf (all signed) + + Four extra bits are included in the mantissa: the top bit + (m[-1]) is effectively a carry-overflow. The other three are + guard (m[2]), round (m[1]), and sticky (m[0]) + """ + def __init__(self, op, width, m_extra=True): + FPNumBase.__init__(self, width, m_extra) + self.latch_in = Signal() + self.op = op + + def decode2(self, m): + """ decodes a latched value into sign / exponent / mantissa + + bias is subtracted here, from the exponent. exponent + is extended to 10 bits so that subtract 127 is done on + a 10-bit number + """ + v = self.v + args = [0] * self.m_extra + [v[0:self.e_start]] # pad with extra zeros + #print ("decode", self.e_end) + res = ObjectProxy(m, pipemode=False) + res.m = Cat(*args) # mantissa + res.e = v[self.e_start:self.e_end] - self.P127 # exp + res.s = v[-1] # sign + return res + + def decode(self, v): + """ decodes a latched value into sign / exponent / mantissa + + bias is subtracted here, from the exponent. 
exponent + is extended to 10 bits so that subtract 127 is done on + a 10-bit number + """ + args = [0] * self.m_extra + [v[0:self.e_start]] # pad with extra zeros + #print ("decode", self.e_end) + return [self.m.eq(Cat(*args)), # mantissa + self.e.eq(v[self.e_start:self.e_end] - self.P127), # exp + self.s.eq(v[-1]), # sign + ] + + def shift_down(self, inp): + """ shifts a mantissa down by one. exponent is increased to compensate + + accuracy is lost as a result in the mantissa however there are 3 + guard bits (the latter of which is the "sticky" bit) + """ + return [self.e.eq(inp.e + 1), + self.m.eq(Cat(inp.m[0] | inp.m[1], inp.m[2:], 0)) + ] + + def shift_down_multi(self, diff, inp=None): + """ shifts a mantissa down. exponent is increased to compensate + + accuracy is lost as a result in the mantissa however there are 3 + guard bits (the latter of which is the "sticky" bit) + + this code works by variable-shifting the mantissa by up to + its maximum bit-length: no point doing more (it'll still be + zero). + + the sticky bit is computed by shifting a batch of 1s by + the same amount, which will introduce zeros. it's then + inverted and used as a mask to get the LSBs of the mantissa. + those are then |'d into the sticky bit. + """ + if inp is None: + inp = self + sm = MultiShift(self.width) + mw = Const(self.m_width-1, len(diff)) + maxslen = Mux(diff > mw, mw, diff) + rs = sm.rshift(inp.m[1:], maxslen) + maxsleni = mw - maxslen + m_mask = sm.rshift(self.m1s[1:], maxsleni) # shift and invert + + #stickybit = reduce(or_, inp.m[1:] & m_mask) | inp.m[0] + stickybit = (inp.m[1:] & m_mask).bool() | inp.m[0] + return [self.e.eq(inp.e + diff), + self.m.eq(Cat(stickybit, rs)) + ] + + def shift_up_multi(self, diff): + """ shifts a mantissa up. 
exponent is decreased to compensate
        """
        sm = MultiShift(self.width)
        mw = Const(self.m_width, len(diff))
        # clamp shift to mantissa width -- no point shifting further
        maxslen = Mux(diff > mw, mw, diff)

        return [self.e.eq(self.e - diff),
                self.m.eq(sm.lshift(self.m, maxslen))
               ]

class Trigger(Elaboratable):
    """ stb/ack handshake pair: `trigger` is high in any cycle where
        both stb and ack are asserted simultaneously.
    """
    def __init__(self):

        self.stb = Signal(reset=0)               # strobe (request)
        self.ack = Signal()                      # acknowledge
        self.trigger = Signal(reset_less=True)   # stb & ack (transaction)

    def elaborate(self, platform):
        m = Module()
        m.d.comb += self.trigger.eq(self.stb & self.ack)
        return m

    def eq(self, inp):
        # copy the handshake pair from another Trigger
        return [self.stb.eq(inp.stb),
                self.ack.eq(inp.ack)
               ]

    def ports(self):
        return [self.stb, self.ack]


class FPOpIn(PrevControl):
    """ input operand: PrevControl handshake with the data signal
        exposed under the traditional name `v`.
    """
    def __init__(self, width):
        PrevControl.__init__(self)
        self.width = width

    @property
    def v(self):
        return self.data_i

    def chain_inv(self, in_op, extra=None):
        """ connect from in_op with the ACK *inverted*.
            NOTE(review): assumes PrevControl provides stb/ack -- confirm
            against singlepipe's PrevControl (valid/ready naming).
        """
        stb = in_op.stb
        if extra is not None:
            stb = stb & extra
        return [self.v.eq(in_op.v),          # receive value
                self.stb.eq(stb),            # receive STB
                in_op.ack.eq(~self.ack),     # send ACK
               ]

    def chain_from(self, in_op, extra=None):
        """ connect from in_op, optionally gating STB with `extra` """
        stb = in_op.stb
        if extra is not None:
            stb = stb & extra
        return [self.v.eq(in_op.v),          # receive value
                self.stb.eq(stb),            # receive STB
                in_op.ack.eq(self.ack),      # send ACK
               ]


class FPOpOut(NextControl):
    """ output operand: NextControl handshake with the data signal
        exposed under the traditional name `v`.
    """
    def __init__(self, width):
        NextControl.__init__(self)
        self.width = width

    @property
    def v(self):
        return self.data_o

    def chain_inv(self, in_op, extra=None):
        """ connect from in_op with the ACK *inverted*.
            NOTE(review): assumes NextControl provides stb/ack -- confirm.
        """
        stb = in_op.stb
        if extra is not None:
            stb = stb & extra
        return [self.v.eq(in_op.v),          # receive value
                self.stb.eq(stb),            # receive STB
                in_op.ack.eq(~self.ack),     # send ACK
               ]

    def chain_from(self, in_op, extra=None):
        """ connect from in_op, optionally gating STB with `extra` """
        stb = in_op.stb
        if extra is not None:
            stb = stb & extra
        return [self.v.eq(in_op.v),          # receive value
                self.stb.eq(stb),            # receive STB
                in_op.ack.eq(self.ack),      # send ACK
               ]


class Overflow: #(Elaboratable):
    def __init__(self):
        self.guard = Signal(reset_less=True)     # tot[2]
        self.round_bit = 
Signal(reset_less=True) # tot[1] + self.sticky = Signal(reset_less=True) # tot[0] + self.m0 = Signal(reset_less=True) # mantissa zero bit + + self.roundz = Signal(reset_less=True) + + def __iter__(self): + yield self.guard + yield self.round_bit + yield self.sticky + yield self.m0 + + def eq(self, inp): + return [self.guard.eq(inp.guard), + self.round_bit.eq(inp.round_bit), + self.sticky.eq(inp.sticky), + self.m0.eq(inp.m0)] + + def elaborate(self, platform): + m = Module() + m.d.comb += self.roundz.eq(self.guard & \ + (self.round_bit | self.sticky | self.m0)) + return m + + +class FPBase: + """ IEEE754 Floating Point Base Class + + contains common functions for FP manipulation, such as + extracting and packing operands, normalisation, denormalisation, + rounding etc. + """ + + def get_op(self, m, op, v, next_state): + """ this function moves to the next state and copies the operand + when both stb and ack are 1. + acknowledgement is sent by setting ack to ZERO. + """ + res = v.decode2(m) + ack = Signal() + with m.If((op.ready_o) & (op.valid_i_test)): + m.next = next_state + # op is latched in from FPNumIn class on same ack/stb + m.d.comb += ack.eq(0) + with m.Else(): + m.d.comb += ack.eq(1) + return [res, ack] + + def denormalise(self, m, a): + """ denormalises a number. this is probably the wrong name for + this function. for normalised numbers (exponent != minimum) + one *extra* bit (the implicit 1) is added *back in*. + for denormalised numbers, the mantissa is left alone + and the exponent increased by 1. + + both cases *effectively multiply the number stored by 2*, + which has to be taken into account when extracting the result. 
+ """ + with m.If(a.exp_n127): + m.d.sync += a.e.eq(a.N126) # limit a exponent + with m.Else(): + m.d.sync += a.m[-1].eq(1) # set top mantissa bit + + def op_normalise(self, m, op, next_state): + """ operand normalisation + NOTE: just like "align", this one keeps going round every clock + until the result's exponent is within acceptable "range" + """ + with m.If((op.m[-1] == 0)): # check last bit of mantissa + m.d.sync +=[ + op.e.eq(op.e - 1), # DECREASE exponent + op.m.eq(op.m << 1), # shift mantissa UP + ] + with m.Else(): + m.next = next_state + + def normalise_1(self, m, z, of, next_state): + """ first stage normalisation + + NOTE: just like "align", this one keeps going round every clock + until the result's exponent is within acceptable "range" + NOTE: the weirdness of reassigning guard and round is due to + the extra mantissa bits coming from tot[0..2] + """ + with m.If((z.m[-1] == 0) & (z.e > z.N126)): + m.d.sync += [ + z.e.eq(z.e - 1), # DECREASE exponent + z.m.eq(z.m << 1), # shift mantissa UP + z.m[0].eq(of.guard), # steal guard bit (was tot[2]) + of.guard.eq(of.round_bit), # steal round_bit (was tot[1]) + of.round_bit.eq(0), # reset round bit + of.m0.eq(of.guard), + ] + with m.Else(): + m.next = next_state + + def normalise_2(self, m, z, of, next_state): + """ second stage normalisation + + NOTE: just like "align", this one keeps going round every clock + until the result's exponent is within acceptable "range" + NOTE: the weirdness of reassigning guard and round is due to + the extra mantissa bits coming from tot[0..2] + """ + with m.If(z.e < z.N126): + m.d.sync +=[ + z.e.eq(z.e + 1), # INCREASE exponent + z.m.eq(z.m >> 1), # shift mantissa DOWN + of.guard.eq(z.m[0]), + of.m0.eq(z.m[1]), + of.round_bit.eq(of.guard), + of.sticky.eq(of.sticky | of.round_bit) + ] + with m.Else(): + m.next = next_state + + def roundz(self, m, z, roundz): + """ performs rounding on the output. 
TODO: different kinds of rounding + """ + with m.If(roundz): + m.d.sync += z.m.eq(z.m + 1) # mantissa rounds up + with m.If(z.m == z.m1s): # all 1s + m.d.sync += z.e.eq(z.e + 1) # exponent rounds up + + def corrections(self, m, z, next_state): + """ denormalisation and sign-bug corrections + """ + m.next = next_state + # denormalised, correct exponent to zero + with m.If(z.is_denormalised): + m.d.sync += z.e.eq(z.N127) + + def pack(self, m, z, next_state): + """ packs the result into the output (detects overflow->Inf) + """ + m.next = next_state + # if overflow occurs, return inf + with m.If(z.is_overflowed): + m.d.sync += z.inf(z.s) + with m.Else(): + m.d.sync += z.create(z.s, z.e, z.m) + + def put_z(self, m, z, out_z, next_state): + """ put_z: stores the result in the output. raises stb and waits + for ack to be set to 1 before moving to the next state. + resets stb back to zero when that occurs, as acknowledgement. + """ + m.d.sync += [ + out_z.v.eq(z.v) + ] + with m.If(out_z.valid_o & out_z.ready_i_test): + m.d.sync += out_z.valid_o.eq(0) + m.next = next_state + with m.Else(): + m.d.sync += out_z.valid_o.eq(1) + + +class FPState(FPBase): + def __init__(self, state_from): + self.state_from = state_from + + def set_inputs(self, inputs): + self.inputs = inputs + for k,v in inputs.items(): + setattr(self, k, v) + + def set_outputs(self, outputs): + self.outputs = outputs + for k,v in outputs.items(): + setattr(self, k, v) + + +class FPID: + def __init__(self, id_wid): + self.id_wid = id_wid + if self.id_wid: + self.in_mid = Signal(id_wid, reset_less=True) + self.out_mid = Signal(id_wid, reset_less=True) + else: + self.in_mid = None + self.out_mid = None + + def idsync(self, m): + if self.id_wid is not None: + m.d.sync += self.out_mid.eq(self.in_mid) + + diff --git a/src/ieee754/add/fpcommon/__init__.py b/src/ieee754/add/fpcommon/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/ieee754/add/fpcommon/corrections.py 
# IEEE Floating Point Adder (Single Precision)
# Copyright (C) Jonathan P Dawson 2013
# 2013-12-12

from nmigen import Module, Elaboratable
from nmigen.cli import main, verilog
from fpbase import FPState
from fpcommon.roundz import FPRoundData


class FPCorrectionsMod(Elaboratable):
    """ combinatorial post-round correction: if the (non-bypassed) result
        is denormalised, clamp its exponent to N127.  everything else is
        passed straight through (mid, z, out_do_z, oz).
    """

    def __init__(self, width, id_wid):
        self.width = width
        self.id_wid = id_wid
        self.i = self.ispec()
        self.out_z = self.ospec()

    def ispec(self):
        return FPRoundData(self.width, self.id_wid)

    def ospec(self):
        return FPRoundData(self.width, self.id_wid)

    def process(self, i):
        return self.out_z

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        m.submodules.corrections = self
        m.d.comb += self.i.eq(i)

    def elaborate(self, platform):
        m = Module()
        m.submodules.corr_in_z = self.i.z
        m.submodules.corr_out_z = self.out_z.z
        m.d.comb += self.out_z.eq(self.i)  # copies mid, z, out_do_z
        with m.If(~self.i.out_do_z):       # only when not bypassed
            with m.If(self.i.z.is_denormalised):
                m.d.comb += self.out_z.z.e.eq(self.i.z.N127)
        return m


class FPCorrections(FPState):
    """ FSM wrapper around FPCorrectionsMod: latches the corrected
        result and proceeds to the "pack" state.
    """

    def __init__(self, width, id_wid):
        FPState.__init__(self, "corrections")
        # bugfix: FPCorrectionsMod requires id_wid (was FPCorrectionsMod(width))
        self.mod = FPCorrectionsMod(width, id_wid)
        self.out_z = self.ospec()

    def ispec(self):
        return self.mod.ispec()

    def ospec(self):
        return self.mod.ospec()

    def setup(self, m, in_z):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, in_z)

        m.d.sync += self.out_z.eq(self.mod.out_z)
        # bugfix: the Mod's output is out_z, not o (was self.mod.o.mid)
        m.d.sync += self.out_z.mid.eq(self.mod.out_z.mid)

    def action(self, m):
        m.next = "pack"
# IEEE Floating Point Adder (Single Precision)
# Copyright (C) Jonathan P Dawson 2013
# 2013-12-12

from nmigen import Module, Signal, Cat, Mux, Array, Const, Elaboratable
from nmigen.lib.coding import PriorityEncoder
from nmigen.cli import main, verilog
from math import log

from fpbase import FPNumIn, FPNumOut, FPNumBase, FPOpIn
from fpbase import Overflow, FPBase, MultiShiftRMerge, Trigger
from fpbase import FPState
from singlepipe import (ControlBase, StageChain, SimpleHandshake,
                        PassThroughStage, PrevControl)
from multipipe import CombMuxOutPipe
from multipipe import PriorityCombMuxInPipe
import nmoperator


class FPSCData:
    """ data record between special-cases and denormalisation:
        operands a/b, bypass result z/oz/out_do_z, and muxid.
    """

    def __init__(self, width, id_wid):
        self.a = FPNumBase(width, True)
        self.b = FPNumBase(width, True)
        self.z = FPNumOut(width, False)
        self.oz = Signal(width, reset_less=True)
        self.out_do_z = Signal(reset_less=True)
        self.mid = Signal(id_wid, reset_less=True)

    def __iter__(self):
        yield from self.a
        yield from self.b
        yield from self.z
        yield self.oz
        yield self.out_do_z
        yield self.mid

    def eq(self, i):
        return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz),
                self.a.eq(i.a), self.b.eq(i.b), self.mid.eq(i.mid)]


class FPAddDeNormMod(FPState):
    """ combinatorial denormalisation of both operands (a and b):
        exponent clamped to N126 for denormals, otherwise the implicit
        top mantissa bit is restored.  bypass signals pass through.

        NOTE(review): subclasses FPState but never calls FPState.__init__;
        it is used purely as a pipeline stage -- confirm whether
        Elaboratable would be the more appropriate base class.
    """

    def __init__(self, width, id_wid):
        self.width = width
        self.id_wid = id_wid
        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        return FPSCData(self.width, self.id_wid)

    def ospec(self):
        return FPSCData(self.width, self.id_wid)

    def process(self, i):
        return self.o

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        m.submodules.denormalise = self
        m.d.comb += self.i.eq(i)

    def elaborate(self, platform):
        m = Module()
        m.submodules.denorm_in_a = self.i.a
        m.submodules.denorm_in_b = self.i.b
        m.submodules.denorm_out_a = self.o.a
        m.submodules.denorm_out_b = self.o.b

        with m.If(~self.i.out_do_z):
            # XXX hmmm, don't like repeating identical code
            m.d.comb += self.o.a.eq(self.i.a)
            with m.If(self.i.a.exp_n127):
                m.d.comb += self.o.a.e.eq(self.i.a.N126)  # limit a exponent
            with m.Else():
                m.d.comb += self.o.a.m[-1].eq(1)  # set top mantissa bit

            m.d.comb += self.o.b.eq(self.i.b)
            with m.If(self.i.b.exp_n127):
                m.d.comb += self.o.b.e.eq(self.i.b.N126)  # limit b exponent
            with m.Else():
                m.d.comb += self.o.b.m[-1].eq(1)  # set top mantissa bit

        m.d.comb += self.o.mid.eq(self.i.mid)
        m.d.comb += self.o.z.eq(self.i.z)
        m.d.comb += self.o.out_do_z.eq(self.i.out_do_z)
        m.d.comb += self.o.oz.eq(self.i.oz)

        return m


class FPAddDeNorm(FPState):
    """ FSM wrapper around FPAddDeNormMod: latches denormalised a/b
        and proceeds to the "align" state.
    """

    def __init__(self, width, id_wid):
        FPState.__init__(self, "denormalise")
        # bugfix: FPAddDeNormMod requires id_wid (was FPAddDeNormMod(width))
        self.mod = FPAddDeNormMod(width, id_wid)
        self.out_a = FPNumBase(width)
        self.out_b = FPNumBase(width)

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, i)

        # bugfix: the Mod exposes its outputs as o.a / o.b,
        # not out_a / out_b (which do not exist on it)
        m.d.sync += self.out_a.eq(self.mod.o.a)
        m.d.sync += self.out_b.eq(self.mod.o.b)

    def action(self, m):
        # Denormalised Number checks
        m.next = "align"


class FPGetOpMod(Elaboratable):
    """ single-operand receiver: latches in_op.v into out_op when the
        handshake (ready_o & valid_i_test) fires; out_decode flags it.
    """

    def __init__(self, width):
        self.in_op = FPOpIn(width)
        self.in_op.data_i = Signal(width)
        self.out_op = Signal(width)
        self.out_decode = Signal(reset_less=True)

    def elaborate(self, platform):
        m = Module()
        m.d.comb += self.out_decode.eq((self.in_op.ready_o) & \
                                       (self.in_op.valid_i_test))
        m.submodules.get_op_in = self.in_op
        #m.submodules.get_op_out = self.out_op
        with m.If(self.out_decode):
            m.d.comb += [
                self.out_op.eq(self.in_op.v),
            ]
        return m


class FPGetOp(FPState):
    """ gets one operand, latching it and de-asserting ready_o as the
        acknowledgement, then moves to out_state.
    """

    def __init__(self, in_state, out_state, in_op, width):
        FPState.__init__(self, in_state)
        self.out_state = out_state
        self.mod = FPGetOpMod(width)
        self.in_op = in_op
        self.out_op = Signal(width)
        self.out_decode = Signal(reset_less=True)

    def setup(self, m, in_op):
        """ links module to inputs and outputs
        """
        setattr(m.submodules, self.state_from, self.mod)
        m.d.comb += nmoperator.eq(self.mod.in_op, in_op)
        m.d.comb += self.out_decode.eq(self.mod.out_decode)

    def action(self, m):
        with m.If(self.out_decode):
            m.next = self.out_state
            m.d.sync += [
                self.in_op.ready_o.eq(0),
                self.out_op.eq(self.mod.out_op)
            ]
        with m.Else():
            m.d.sync += self.in_op.ready_o.eq(1)


class FPNumBase2Ops:
    """ a pair of FPNumBase operands plus muxid """

    def __init__(self, width, id_wid, m_extra=True):
        self.a = FPNumBase(width, m_extra)
        self.b = FPNumBase(width, m_extra)
        self.mid = Signal(id_wid, reset_less=True)

    def eq(self, i):
        return [self.a.eq(i.a), self.b.eq(i.b), self.mid.eq(i.mid)]

    def ports(self):
        return [self.a, self.b, self.mid]


class FPADDBaseData:
    """ raw (un-decoded) input record for the adder: a, b, muxid """

    def __init__(self, width, id_wid):
        self.width = width
        self.id_wid = id_wid
        self.a = Signal(width)
        self.b = Signal(width)
        self.mid = Signal(id_wid, reset_less=True)

    def eq(self, i):
        return [self.a.eq(i.a), self.b.eq(i.b), self.mid.eq(i.mid)]

    def ports(self):
        return [self.a, self.b, self.mid]


class FPGet2OpMod(PrevControl):
    """ two-operand receiver: copies data_i to o when the PrevControl
        trigger fires.
    """

    def __init__(self, width, id_wid):
        PrevControl.__init__(self)
        self.width = width
        self.id_wid = id_wid
        self.data_i = self.ispec()
        self.i = self.data_i
        self.o = self.ospec()

    def ispec(self):
        return FPADDBaseData(self.width, self.id_wid)

    def ospec(self):
        return FPADDBaseData(self.width, self.id_wid)

    def process(self, i):
        return self.o

    def elaborate(self, platform):
        m = PrevControl.elaborate(self, platform)
        with m.If(self.trigger):
            m.d.comb += [
                self.o.eq(self.data_i),
            ]
        return m
class FPGet2Op(FPState):
    """ gets both operands via FPGet2OpMod, latching them and
        de-asserting ready_o as the acknowledgement.
    """

    def __init__(self, in_state, out_state, width, id_wid):
        FPState.__init__(self, in_state)
        self.out_state = out_state
        self.mod = FPGet2OpMod(width, id_wid)
        self.o = self.ospec()
        self.in_stb = Signal(reset_less=True)
        self.out_ack = Signal(reset_less=True)
        self.out_decode = Signal(reset_less=True)

    def ispec(self):
        return self.mod.ispec()

    def ospec(self):
        return self.mod.ospec()

    def trigger_setup(self, m, in_stb, in_ack):
        """ links stb/ack
        """
        m.d.comb += self.mod.valid_i.eq(in_stb)
        m.d.comb += in_ack.eq(self.mod.ready_o)

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        m.submodules.get_ops = self.mod
        m.d.comb += self.mod.i.eq(i)
        m.d.comb += self.out_ack.eq(self.mod.ready_o)
        m.d.comb += self.out_decode.eq(self.mod.trigger)

    def process(self, i):
        return self.o

    def action(self, m):
        with m.If(self.out_decode):
            m.next = self.out_state
            m.d.sync += [
                self.mod.ready_o.eq(0),
                self.o.eq(self.mod.o),
            ]
        with m.Else():
            m.d.sync += self.mod.ready_o.eq(1)


class FPNormToPack(FPState, SimpleHandshake):
    """ chains normalisation, rounding, corrections and packing into a
        single pipeline stage (FPAddStage1Data in, FPPackData out).
    """

    def __init__(self, width, id_wid):
        FPState.__init__(self, "normalise_1")
        self.id_wid = id_wid
        self.width = width
        SimpleHandshake.__init__(self, self)  # pipeline is its own stage

    def ispec(self):
        return FPAddStage1Data(self.width, self.id_wid)  # Norm1ModSingle ispec

    def ospec(self):
        return FPPackData(self.width, self.id_wid)  # FPPackMod ospec

    def setup(self, m, i):
        """ links module to inputs and outputs
        """

        # Normalisation, Rounding Corrections, Pack - in a chain
        nmod = FPNorm1ModSingle(self.width, self.id_wid)
        rmod = FPRoundMod(self.width, self.id_wid)
        cmod = FPCorrectionsMod(self.width, self.id_wid)
        pmod = FPPackMod(self.width, self.id_wid)
        stages = [nmod, rmod, cmod, pmod]
        chain = StageChain(stages)
        chain.setup(m, i)
        self.out_z = pmod.ospec()

        self.o = pmod.o

    def process(self, i):
        return self.o

    def action(self, m):
        m.d.sync += self.out_z.eq(self.process(None))
        m.next = "pack_put_z"


class FPPackData(Object):
    """ packed result record: final bit-pattern z plus muxid """

    def __init__(self, width, id_wid):
        Object.__init__(self)
        self.z = Signal(width, reset_less=True)
        self.mid = Signal(id_wid, reset_less=True)


class FPPackMod(Elaboratable):
    """ combinatorial packer: creates the final FP bit-pattern from
        sign/exponent/mantissa (detecting overflow -> Inf), or passes
        the bypass value oz through when out_do_z is set.
    """

    def __init__(self, width, id_wid):
        self.width = width
        self.id_wid = id_wid
        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        return FPRoundData(self.width, self.id_wid)

    def ospec(self):
        return FPPackData(self.width, self.id_wid)

    def process(self, i):
        return self.o

    def setup(self, m, in_z):
        """ links module to inputs and outputs
        """
        m.submodules.pack = self
        m.d.comb += self.i.eq(in_z)

    def elaborate(self, platform):
        m = Module()
        z = FPNumOut(self.width, False)
        m.submodules.pack_in_z = self.i.z
        m.submodules.pack_out_z = z
        m.d.comb += self.o.mid.eq(self.i.mid)
        with m.If(~self.i.out_do_z):
            with m.If(self.i.z.is_overflowed):
                m.d.comb += z.inf(self.i.z.s)          # overflow -> Inf
            with m.Else():
                m.d.comb += z.create(self.i.z.s, self.i.z.e, self.i.z.m)
        with m.Else():
            m.d.comb += z.v.eq(self.i.oz)              # bypass value
        m.d.comb += self.o.z.eq(z.v)
        return m


class FPPack(FPState):
    """ FSM wrapper around FPPackMod: latches the packed result and
        proceeds to the "pack_put_z" state.
    """

    def __init__(self, width, id_wid):
        FPState.__init__(self, "pack")
        # bugfix: FPPackMod requires id_wid (was FPPackMod(width))
        self.mod = FPPackMod(width, id_wid)
        self.out_z = self.ospec()

    def ispec(self):
        return self.mod.ispec()

    def ospec(self):
        return self.mod.ospec()

    def setup(self, m, in_z):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, in_z)

        # bugfix: FPPackData has .z/.mid (not .v) and the Mod's output
        # is .o (not .out_z): was out_z.v.eq(mod.out_z.v) / mod.o.mid
        m.d.sync += self.out_z.z.eq(self.mod.o.z)
        m.d.sync += self.out_z.mid.eq(self.mod.o.mid)

    def action(self, m):
        m.next = "pack_put_z"


class FPAddStage1Data:
    """ post-add data record: result z, bypass (oz/out_do_z),
        rounding bits (of) and muxid.
    """

    def __init__(self, width, id_wid):
        self.z = FPNumBase(width, False)
        self.out_do_z = Signal(reset_less=True)
        self.oz = Signal(width, reset_less=True)
        self.of = Overflow()
        self.mid = Signal(id_wid, reset_less=True)

    def __iter__(self):
        yield from self.z
        yield self.out_do_z
        yield self.oz
        yield from self.of
        yield self.mid

    def eq(self, i):
        return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz),
                self.of.eq(i.of), self.mid.eq(i.mid)]
# IEEE Floating Point Adder (Single Precision)
# Copyright (C) Jonathan P Dawson 2013
# 2013-12-12
#
# post-normalisation, pre-normalisation and put-z modules.
# NOTE(review): the original prenormalise.py import line ended with a
# stray trailing comma ("from nmigen import Module, Signal, Cat,") --
# a syntax error; corrected here.

from nmigen import Module, Signal, Cat, Mux, Elaboratable
from nmigen.lib.coding import PriorityEncoder
from nmigen.cli import main, verilog
from math import log

from fpbase import Overflow, FPNumBase
from fpbase import MultiShiftRMerge
from fpbase import FPState
from .postcalc import FPAddStage1Data


class FPNorm1Data:
    """ post-normalisation record: result z, rounding decision roundz,
        bypass (oz/out_do_z) and muxid.
    """

    def __init__(self, width, id_wid):
        self.roundz = Signal(reset_less=True)
        self.z = FPNumBase(width, False)
        self.out_do_z = Signal(reset_less=True)
        self.oz = Signal(width, reset_less=True)
        self.mid = Signal(id_wid, reset_less=True)

    def eq(self, i):
        return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz),
                self.roundz.eq(i.roundz), self.mid.eq(i.mid)]


class FPNorm1ModSingle(Elaboratable):
    """ single-cycle normalisation: a priority-encoder counts leading
        zeros for the "decrease exponent" case, and a MultiShiftRMerge
        handles the "increase exponent" (shift-down, sticky-merging)
        case.
    """

    def __init__(self, width, id_wid):
        self.width = width
        self.id_wid = id_wid
        self.i = self.ispec()
        self.o = self.ospec()

    def ispec(self):
        return FPAddStage1Data(self.width, self.id_wid)

    def ospec(self):
        return FPNorm1Data(self.width, self.id_wid)

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        m.submodules.normalise_1 = self
        m.d.comb += self.i.eq(i)

    def process(self, i):
        return self.o

    def elaborate(self, platform):
        m = Module()

        mwid = self.o.z.m_width+2
        pe = PriorityEncoder(mwid)
        m.submodules.norm_pe = pe

        of = Overflow()
        m.d.comb += self.o.roundz.eq(of.roundz)

        m.submodules.norm1_out_z = self.o.z
        m.submodules.norm1_out_overflow = of
        m.submodules.norm1_in_z = self.i.z
        m.submodules.norm1_in_overflow = self.i.of

        i = self.ispec()
        m.submodules.norm1_insel_z = i.z
        m.submodules.norm1_insel_overflow = i.of

        espec = (len(i.z.e), True)
        ediff_n126 = Signal(espec, reset_less=True)
        msr = MultiShiftRMerge(mwid, espec)
        m.submodules.multishift_r = msr

        m.d.comb += i.eq(self.i)
        # initialise out from in (overridden below)
        m.d.comb += self.o.z.eq(i.z)
        m.d.comb += of.eq(i.of)
        # normalisation increase/decrease conditions
        decrease = Signal(reset_less=True)
        increase = Signal(reset_less=True)
        m.d.comb += decrease.eq(i.z.m_msbzero & i.z.exp_gt_n126)
        m.d.comb += increase.eq(i.z.exp_lt_n126)
        # decrease exponent
        with m.If(~self.i.out_do_z):
            with m.If(decrease):
                # *sigh* not entirely obvious: count leading zeros (clz)
                # with a PriorityEncoder: to find from the MSB
                # we reverse the order of the bits.
                temp_m = Signal(mwid, reset_less=True)
                temp_s = Signal(mwid+1, reset_less=True)
                clz = Signal((len(i.z.e), True), reset_less=True)
                # make sure that the amount to decrease by does NOT
                # go below the minimum non-INF/NaN exponent
                limclz = Mux(i.z.exp_sub_n126 > pe.o, pe.o,
                             i.z.exp_sub_n126)
                m.d.comb += [
                    # cat round and guard bits back into the mantissa
                    temp_m.eq(Cat(i.of.round_bit, i.of.guard, i.z.m)),
                    pe.i.eq(temp_m[::-1]),          # inverted
                    clz.eq(limclz),                 # count zeros from MSB down
                    temp_s.eq(temp_m << clz),       # shift mantissa UP
                    self.o.z.e.eq(i.z.e - clz),     # DECREASE exponent
                    self.o.z.m.eq(temp_s[2:]),      # exclude bits 0&1
                    of.m0.eq(temp_s[2]),            # copy of mantissa[0]
                    # overflow in bits 0..1: got shifted too (leave sticky)
                    of.guard.eq(temp_s[1]),         # guard
                    of.round_bit.eq(temp_s[0]),     # round
                ]
            # increase exponent
            with m.Elif(increase):
                temp_m = Signal(mwid+1, reset_less=True)
                m.d.comb += [
                    temp_m.eq(Cat(i.of.sticky, i.of.round_bit, i.of.guard,
                                  i.z.m)),
                    ediff_n126.eq(i.z.N126 - i.z.e),
                    # connect multi-shifter to inp/out mantissa (and ediff)
                    msr.inp.eq(temp_m),
                    msr.diff.eq(ediff_n126),
                    self.o.z.m.eq(msr.m[3:]),
                    # bugfix: these four previously read temp_s, which is
                    # the *decrease* branch's shifter output (leaked via
                    # python scoping); the shifted bits live in msr.m here
                    of.m0.eq(msr.m[3]),             # copy of mantissa[0]
                    # overflow in bits 0..2: got shifted too (merge sticky)
                    of.guard.eq(msr.m[2]),          # guard
                    of.round_bit.eq(msr.m[1]),      # round
                    of.sticky.eq(msr.m[0]),         # sticky
                    self.o.z.e.eq(i.z.e + ediff_n126),
                ]

        m.d.comb += self.o.mid.eq(self.i.mid)
        m.d.comb += self.o.out_do_z.eq(self.i.out_do_z)
        m.d.comb += self.o.oz.eq(self.i.oz)

        return m


class FPNorm1ModMulti:
    """ multi-cycle (one bit per clock) normalisation.  in_select picks
        between the fresh input (in_z/in_of) and the loop-back values
        (temp_z/temp_of); out_norm is raised while more normalisation
        is needed.
    """

    def __init__(self, width, single_cycle=True):
        self.width = width
        self.in_select = Signal(reset_less=True)
        self.in_z = FPNumBase(width, False)
        self.in_of = Overflow()
        self.temp_z = FPNumBase(width, False)
        self.temp_of = Overflow()
        self.out_z = FPNumBase(width, False)
        self.out_of = Overflow()
        # bugfix: out_norm was driven in elaborate but never created here
        self.out_norm = Signal(reset_less=True)

    def elaborate(self, platform):
        m = Module()

        m.submodules.norm1_out_z = self.out_z
        m.submodules.norm1_out_overflow = self.out_of
        m.submodules.norm1_temp_z = self.temp_z
        m.submodules.norm1_temp_of = self.temp_of
        m.submodules.norm1_in_z = self.in_z
        m.submodules.norm1_in_overflow = self.in_of

        in_z = FPNumBase(self.width, False)
        in_of = Overflow()
        m.submodules.norm1_insel_z = in_z
        m.submodules.norm1_insel_overflow = in_of

        # select which of temp or in z/of to use
        with m.If(self.in_select):
            m.d.comb += in_z.eq(self.in_z)
            m.d.comb += in_of.eq(self.in_of)
        with m.Else():
            m.d.comb += in_z.eq(self.temp_z)
            m.d.comb += in_of.eq(self.temp_of)
        # initialise out from in (overridden below)
        m.d.comb += self.out_z.eq(in_z)
        m.d.comb += self.out_of.eq(in_of)
        # normalisation increase/decrease conditions
        decrease = Signal(reset_less=True)
        increase = Signal(reset_less=True)
        m.d.comb += decrease.eq(in_z.m_msbzero & in_z.exp_gt_n126)
        m.d.comb += increase.eq(in_z.exp_lt_n126)
        m.d.comb += self.out_norm.eq(decrease | increase)  # loop-end
        # decrease exponent
        with m.If(decrease):
            m.d.comb += [
                self.out_z.e.eq(in_z.e - 1),    # DECREASE exponent
                self.out_z.m.eq(in_z.m << 1),   # shift mantissa UP
                self.out_z.m[0].eq(in_of.guard),  # steal guard (was tot[2])
                self.out_of.guard.eq(in_of.round_bit),  # round (was tot[1])
                self.out_of.round_bit.eq(0),    # reset round bit
                self.out_of.m0.eq(in_of.guard),
            ]
        # increase exponent
        with m.Elif(increase):
            m.d.comb += [
                self.out_z.e.eq(in_z.e + 1),    # INCREASE exponent
                self.out_z.m.eq(in_z.m >> 1),   # shift mantissa DOWN
                self.out_of.guard.eq(in_z.m[0]),
                self.out_of.m0.eq(in_z.m[1]),
                self.out_of.round_bit.eq(in_of.guard),
                self.out_of.sticky.eq(in_of.sticky | in_of.round_bit)
            ]

        return m


class FPNorm1Single(FPState):
    """ FSM wrapper around the single-cycle normaliser """

    def __init__(self, width, id_wid, single_cycle=True):
        FPState.__init__(self, "normalise_1")
        # bugfix: FPNorm1ModSingle requires id_wid (was ...Single(width))
        self.mod = FPNorm1ModSingle(width, id_wid)
        self.o = self.ospec()
        self.out_z = FPNumBase(width, False)
        self.out_roundz = Signal(reset_less=True)

    def ispec(self):
        return self.mod.ispec()

    def ospec(self):
        return self.mod.ospec()

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, i)

    def action(self, m):
        m.next = "round"


class FPNorm1Multi(FPState):
    """ FSM wrapper around the multi-cycle normaliser: loops in this
        state until out_norm drops, then latches roundz and moves on.

        NOTE(review): this calls self.mod.setup(...) but FPNorm1ModMulti
        does not define a setup method -- confirm intended wiring.
    """

    def __init__(self, width, id_wid):
        FPState.__init__(self, "normalise_1")
        self.mod = FPNorm1ModMulti(width)
        self.stb = Signal(reset_less=True)
        self.ack = Signal(reset=0, reset_less=True)
        self.out_norm = Signal(reset_less=True)
        self.in_accept = Signal(reset_less=True)
        self.temp_z = FPNumBase(width)
        self.temp_of = Overflow()
        self.out_z = FPNumBase(width)
        self.out_roundz = Signal(reset_less=True)

    def setup(self, m, in_z, in_of, norm_stb):
        """ links module to inputs and outputs
        """
        self.mod.setup(m, in_z, in_of, norm_stb,
                       self.in_accept, self.temp_z, self.temp_of,
                       self.out_z, self.out_norm)

        m.d.comb += self.stb.eq(norm_stb)
        m.d.sync += self.ack.eq(0)  # sets to zero when not in normalise_1 state

    def action(self, m):
        m.d.comb += self.in_accept.eq((~self.ack) & (self.stb))
        m.d.sync += self.temp_of.eq(self.mod.out_of)
        m.d.sync += self.temp_z.eq(self.out_z)
        with m.If(self.out_norm):
            with m.If(self.in_accept):
                m.d.sync += [
                    self.ack.eq(1),
                ]
            with m.Else():
                m.d.sync += self.ack.eq(0)
        with m.Else():
            # normalisation not required (or done).
            m.next = "round"
            m.d.sync += self.ack.eq(1)
            m.d.sync += self.out_roundz.eq(self.mod.out_of.roundz)


class FPNormaliseModSingle:
    """ pre-normalisation: shift the mantissa up (decreasing the
        exponent) until its MSB is set, using a clz via PriorityEncoder.
    """

    def __init__(self, width):
        self.width = width
        self.in_z = self.ispec()
        self.out_z = self.ospec()

    def ispec(self):
        return FPNumBase(self.width, False)

    def ospec(self):
        return FPNumBase(self.width, False)

    def setup(self, m, i):
        """ links module to inputs and outputs
        """
        m.submodules.normalise = self
        # bugfix: input attribute is in_z (self.i does not exist here)
        m.d.comb += self.in_z.eq(i)

    def elaborate(self, platform):
        m = Module()

        mwid = self.out_z.m_width+2
        pe = PriorityEncoder(mwid)
        m.submodules.norm_pe = pe

        m.submodules.norm1_out_z = self.out_z
        m.submodules.norm1_in_z = self.in_z

        in_z = FPNumBase(self.width, False)
        in_of = Overflow()
        m.submodules.norm1_insel_z = in_z
        m.submodules.norm1_insel_overflow = in_of

        espec = (len(in_z.e), True)
        ediff_n126 = Signal(espec, reset_less=True)
        msr = MultiShiftRMerge(mwid, espec)
        m.submodules.multishift_r = msr

        m.d.comb += in_z.eq(self.in_z)
        # NOTE(review): the original also copied self.in_of and drove
        # self.out_of, but neither attribute exists on this class
        # (AttributeError); in_of is left at its zero default here.
        # initialise out from in (overridden below)
        m.d.comb += self.out_z.eq(in_z)
        # normalisation decrease condition
        decrease = Signal(reset_less=True)
        m.d.comb += decrease.eq(in_z.m_msbzero)
        # decrease exponent
        with m.If(decrease):
            # *sigh* not entirely obvious: count leading zeros (clz)
            # with a PriorityEncoder: to find from the MSB
            # we reverse the order of the bits.
            temp_m = Signal(mwid, reset_less=True)
            temp_s = Signal(mwid+1, reset_less=True)
            clz = Signal((len(in_z.e), True), reset_less=True)
            m.d.comb += [
                # cat round and guard bits back into the mantissa
                temp_m.eq(Cat(in_of.round_bit, in_of.guard, in_z.m)),
                pe.i.eq(temp_m[::-1]),          # inverted
                clz.eq(pe.o),                   # count zeros from MSB down
                temp_s.eq(temp_m << clz),       # shift mantissa UP
                self.out_z.e.eq(in_z.e - clz),  # DECREASE exponent
                self.out_z.m.eq(temp_s[2:]),    # exclude bits 0&1
            ]

        return m


class FPPutZ(FPState):
    """ stores the result in the output, raising valid_o and waiting
        for ready to be acknowledged before moving to to_state.
    """

    def __init__(self, state, in_z, out_z, in_mid, out_mid, to_state=None):
        FPState.__init__(self, state)
        if to_state is None:
            to_state = "get_ops"
        self.to_state = to_state
        self.in_z = in_z
        self.out_z = out_z
        self.in_mid = in_mid
        self.out_mid = out_mid

    def action(self, m):
        if self.in_mid is not None:
            m.d.sync += self.out_mid.eq(self.in_mid)
        m.d.sync += [
          self.out_z.z.v.eq(self.in_z)
        ]
        with m.If(self.out_z.z.valid_o & self.out_z.z.ready_i_test):
            m.d.sync += self.out_z.z.valid_o.eq(0)
            m.next = self.to_state
        with m.Else():
            m.d.sync += self.out_z.z.valid_o.eq(1)
m.d.comb += [outz_stb.eq(self.out_zs[self.in_mid].valid_o), + outz_ack.eq(self.out_zs[self.in_mid].ready_i_test), + ] + m.d.sync += [ + self.out_zs[self.in_mid].v.eq(self.in_z.v) + ] + with m.If(outz_stb & outz_ack): + m.d.sync += self.out_zs[self.in_mid].valid_o.eq(0) + m.next = self.to_state + with m.Else(): + m.d.sync += self.out_zs[self.in_mid].valid_o.eq(1) + diff --git a/src/ieee754/add/fpcommon/roundz.py b/src/ieee754/add/fpcommon/roundz.py new file mode 100644 index 00000000..420d6669 --- /dev/null +++ b/src/ieee754/add/fpcommon/roundz.py @@ -0,0 +1,82 @@ +# IEEE Floating Point Adder (Single Precision) +# Copyright (C) Jonathan P Dawson 2013 +# 2013-12-12 + +from nmigen import Module, Signal, Elaboratable +from nmigen.cli import main, verilog + +from fpbase import FPNumBase +from fpbase import FPState +from fpcommon.postnormalise import FPNorm1Data + + +class FPRoundData: + + def __init__(self, width, id_wid): + self.z = FPNumBase(width, False) + self.out_do_z = Signal(reset_less=True) + self.oz = Signal(width, reset_less=True) + self.mid = Signal(id_wid, reset_less=True) + + def eq(self, i): + return [self.z.eq(i.z), self.out_do_z.eq(i.out_do_z), self.oz.eq(i.oz), + self.mid.eq(i.mid)] + + +class FPRoundMod(Elaboratable): + + def __init__(self, width, id_wid): + self.width = width + self.id_wid = id_wid + self.i = self.ispec() + self.out_z = self.ospec() + + def ispec(self): + return FPNorm1Data(self.width, self.id_wid) + + def ospec(self): + return FPRoundData(self.width, self.id_wid) + + def process(self, i): + return self.out_z + + def setup(self, m, i): + m.submodules.roundz = self + m.d.comb += self.i.eq(i) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.out_z.eq(self.i) # copies mid, z, out_do_z + with m.If(~self.i.out_do_z): + with m.If(self.i.roundz): + m.d.comb += self.out_z.z.m.eq(self.i.z.m + 1) # mantissa up + with m.If(self.i.z.m == self.i.z.m1s): # all 1s + m.d.comb += self.out_z.z.e.eq(self.i.z.e + 1) # exponent up + + 
return m + + +class FPRound(FPState): + + def __init__(self, width, id_wid): + FPState.__init__(self, "round") + self.mod = FPRoundMod(width, id_wid) + self.out_z = self.ospec() + + def ispec(self): + return self.mod.ispec() + + def ospec(self): + return self.mod.ospec() + + def setup(self, m, i): + """ links module to inputs and outputs + """ + self.mod.setup(m, i) + + self.idsync(m) + m.d.sync += self.out_z.eq(self.mod.out_z) + m.d.sync += self.out_z.mid.eq(self.mod.out_z.mid) + + def action(self, m): + m.next = "corrections" diff --git a/src/ieee754/add/fsqrt.py b/src/ieee754/add/fsqrt.py new file mode 100644 index 00000000..02449b0f --- /dev/null +++ b/src/ieee754/add/fsqrt.py @@ -0,0 +1,256 @@ +from sfpy import Float32 + + +# XXX DO NOT USE, fails on num=65536. wark-wark... +def sqrtsimple(num): + res = 0 + bit = 1 + + while (bit < num): + bit <<= 2 + + while (bit != 0): + if (num >= res + bit): + num -= res + bit + res = (res >> 1) + bit + else: + res >>= 1 + bit >>= 2 + + return res + + +def sqrt(num): + D = num # D is input (from num) + Q = 0 # quotient + R = 0 # remainder + for i in range(64, -1, -1): # negative ranges are weird...
+ + R = (R<<2)|((D>>(i+i))&3) + + if R >= 0: + R -= ((Q<<2)|1) # -Q01 + else: + R += ((Q<<2)|3) # +Q11 + + Q <<= 1 + if R >= 0: + Q |= 1 # new Q + + if R < 0: + R = R + ((Q<<1)|1) + + return Q, R + + +# grabbed these from unit_test_single (convenience, this is just experimenting) + +def get_mantissa(x): + return 0x7fffff & x + +def get_exponent(x): + return ((x & 0x7f800000) >> 23) - 127 + +def set_exponent(x, e): + return (x & ~0x7f800000) | ((e+127) << 23) + +def get_sign(x): + return ((x & 0x80000000) >> 31) + +# convert s/e/m to FP32 +def create_fp32(s, e, m): + """ receive sign, exponent, mantissa, return FP32 """ + return set_exponent((s << 31) | get_mantissa(m), e) + +# convert FP32 to s/e/m +def decode_fp32(x): + """ receive FP32, return sign, exponent, mantissa """ + return get_sign(x), get_exponent(x), get_mantissa(x) + + +# main function, takes mantissa and exponent as separate arguments +# returns a tuple, sqrt'd mantissa, sqrt'd exponent + +def main(mantissa, exponent): + if exponent & 1 != 0: + # shift mantissa up, subtract 1 from exp to compensate + mantissa <<= 1 + exponent -= 1 + m, r = sqrt(mantissa) + return m, r, exponent >> 1 + + +#normalization function +def normalise(s, m, e, lowbits): + if (lowbits >= 2): + m += 1 + if get_mantissa(m) == ((1<<24)-1): + e += 1 + return s, m, e + + +def fsqrt_test(x): + + xbits = x.bits + print ("x", x, type(x)) + sq_test = x.sqrt() + print ("sqrt", sq_test) + + print (xbits, type(xbits)) + s, e, m = decode_fp32(xbits) + print("x decode", s, e, m, hex(m)) + + m |= 1<<23 # set top bit (the missing "1" from mantissa) + m <<= 27 + + sm, sr, se = main(m, e) + lowbits = sm & 0x3 + sm >>= 2 + sm = get_mantissa(sm) + #sm += 2 + + s, sm, se = normalise(s, sm, se, lowbits) + + print("our sqrt", s, se, sm, hex(sm), bin(sm), "lowbits", lowbits, + "rem", hex(sr)) + if lowbits >= 2: + print ("probably needs rounding (+1 on mantissa)") + + sq_xbits = sq_test.bits + s, e, m = decode_fp32(sq_xbits) + print ("sf32 sqrt", s, e, m,
hex(m), bin(m)) + print () + +if __name__ == '__main__': + + # quick test up to 1000 of two sqrt functions + for Q in range(1, int(1e4)): + print(Q, sqrt(Q), sqrtsimple(Q), int(Q**0.5)) + assert int(Q**0.5) == sqrtsimple(Q), "Q sqrtsimpl fail %d" % Q + assert int(Q**0.5) == sqrt(Q)[0], "Q sqrt fail %d" % Q + + # quick mantissa/exponent demo + for e in range(26): + for m in range(26): + ms, mr, es = main(m, e) + print("m:%d e:%d sqrt: m:%d-%d e:%d" % (m, e, ms, mr, es)) + + x = Float32(1234.123456789) + fsqrt_test(x) + x = Float32(32.1) + fsqrt_test(x) + x = Float32(16.0) + fsqrt_test(x) + x = Float32(8.0) + fsqrt_test(x) + x = Float32(8.5) + fsqrt_test(x) + x = Float32(3.14159265358979323) + fsqrt_test(x) + x = Float32(12.99392923123123) + fsqrt_test(x) + x = Float32(0.123456) + fsqrt_test(x) + + + + +""" + +Notes: +https://pdfs.semanticscholar.org/5060/4e9aff0e37089c4ab9a376c3f35761ffe28b.pdf + +//This is the main code of integer sqrt function found here:http://verilogcodes.blogspot.com/2017/11/a-verilog-function-for-finding-square-root.html +// + +module testbench; + +reg [15:0] sqr; + +//Verilog function to find square root of a 32 bit number. +//The output is 16 bit. +function [15:0] sqrt; + input [31:0] num; //declare input + //intermediate signals. + reg [31:0] a; + reg [15:0] q; + reg [17:0] left,right,r; + integer i; +begin + //initialize all the variables. + a = num; + q = 0; + i = 0; + left = 0; //input to adder/sub + right = 0; //input to adder/sub + r = 0; //remainder + //run the calculations for 16 iterations. + for(i=0;i<16;i=i+1) begin + right = {q,r[17],1'b1}; + left = {r[15:0],a[31:30]}; + a = {a[29:0],2'b00}; //left shift by 2 bits. + if (r[17] == 1) //add if r is negative + r = left + right; + else //subtract if r is positive + r = left - right; + q = {q[14:0],!r[17]}; + end + sqrt = q; //final assignment of output. 
+end +endfunction //end of Function + + +c version (from paper linked from URL) + +unsigned squart(D, r) /*Non-Restoring sqrt*/ + unsigned D; /*D:32-bit unsigned integer to be square rooted */ + int *r; +{ + unsigned Q = 0; /*Q:16-bit unsigned integer (root)*/ + int R = 0; /*R:17-bit integer (remainder)*/ + int i; + for (i = 15;i>=0;i--) /*for each root bit*/ + { + if (R>=0) + { /*new remainder:*/ + R = R<<2)|((D>>(i+i))&3); + R = R-((Q<<2)|1); /*-Q01*/ + } + else + { /*new remainder:*/ + R = R<<2)|((D>>(i+i))&3); + R = R+((Q<<2)|3); /*+Q11*/ + } + if (R>=0) Q = Q<<1)|1; /*new Q:*/ + else Q = Q<<1)|0; /*new Q:*/ + } + + /*remainder adjusting*/ + if (R<0) R = R+((Q<<1)|1); + *r = R; /*return remainder*/ + return(Q); /*return root*/ +} + +From wikipedia page: + +short isqrt(short num) { + short res = 0; + short bit = 1 << 14; // The second-to-top bit is set: 1 << 30 for 32 bits + + // "bit" starts at the highest power of four <= the argument. + while (bit > num) + bit >>= 2; + + while (bit != 0) { + if (num >= res + bit) { + num -= res + bit; + res = (res >> 1) + bit; + } + else + res >>= 1; + bit >>= 2; + } + return res; +} + +""" diff --git a/src/ieee754/add/function_unit.py b/src/ieee754/add/function_unit.py new file mode 100644 index 00000000..108c84f3 --- /dev/null +++ b/src/ieee754/add/function_unit.py @@ -0,0 +1,44 @@ +from nmigen import Signal, Cat, Const, Mux, Module, Array +from nmigen.cli import main, verilog + +from nmigen_add_experiment import FPADD +from rstation_row import ReservationStationRow + +from math import log + +class FunctionUnit: + + def __init__(self, width, num_units): + """ Function Unit + + * width: bit-width of IEEE754. 
supported: 16, 32, 64 + * num_units: number of Reservation Stations + """ + self.width = width + + fus = [] + bsz = int(log(width) / log(2)) + for i in range(num_units): + mid = Const(i, bsz) + rs = ReservationStationRow(width, mid) + rs.name = "RS%d" % i + fus.append(rs) + self.fus = Array(fus) + + def elaborate(self, platform=None): + """ creates the HDL code-fragment for ReservationStationRow + """ + m = Module() + + return m + + +if __name__ == "__main__": + rs = ReservationStationRow(width=32, id_wid=Const(1,4)) + main(rs, ports=[rs.in_a, rs.in_b, rs.out_z]) + + # works... but don't use, just do "python fname.py convert -t v" + #print (verilog.convert(alu, ports=[ + # ports=alu.in_a.ports() + \ + # alu.in_b.ports() + \ + # alu.out_z.ports()) diff --git a/src/ieee754/add/inputgroup.py b/src/ieee754/add/inputgroup.py new file mode 100644 index 00000000..e1b775d4 --- /dev/null +++ b/src/ieee754/add/inputgroup.py @@ -0,0 +1,115 @@ +from nmigen import Module, Signal, Cat, Array, Const +from nmigen.lib.coding import PriorityEncoder +from math import log + +from fpbase import Trigger + + +class FPGetSyncOpsMod: + def __init__(self, width, num_ops=2): + self.width = width + self.num_ops = num_ops + inops = [] + outops = [] + for i in range(num_ops): + inops.append(Signal(width, reset_less=True)) + outops.append(Signal(width, reset_less=True)) + self.in_op = inops + self.out_op = outops + self.stb = Signal(num_ops) + self.ack = Signal() + self.ready = Signal(reset_less=True) + self.out_decode = Signal(reset_less=True) + + def elaborate(self, platform): + m = Module() + m.d.comb += self.ready.eq(self.stb == Const(-1, (self.num_ops, False))) + m.d.comb += self.out_decode.eq(self.ack & self.ready) + with m.If(self.out_decode): + for i in range(self.num_ops): + m.d.comb += [ + self.out_op[i].eq(self.in_op[i]), + ] + return m + + def ports(self): + return self.in_op + self.out_op + [self.stb, self.ack] + + +class FPOps(Trigger): + def __init__(self, width, num_ops):
Trigger.__init__(self) + self.width = width + self.num_ops = num_ops + + res = [] + for i in range(num_ops): + res.append(Signal(width)) + self.v = Array(res) + + def ports(self): + res = [] + for i in range(self.num_ops): + res.append(self.v[i]) + res.append(self.ack) + res.append(self.stb) + return res + + +class InputGroup: + def __init__(self, width, num_ops=2, num_rows=4): + self.width = width + self.num_ops = num_ops + self.num_rows = num_rows + self.mmax = int(log(self.num_rows) / log(2)) + self.rs = [] + self.mid = Signal(self.mmax, reset_less=True) # multiplex id + for i in range(num_rows): + self.rs.append(FPGetSyncOpsMod(width, num_ops)) + self.rs = Array(self.rs) + + self.out_op = FPOps(width, num_ops) + + def elaborate(self, platform): + m = Module() + + pe = PriorityEncoder(self.num_rows) + m.submodules.selector = pe + m.submodules.out_op = self.out_op + m.submodules += self.rs + + # connect priority encoder + in_ready = [] + for i in range(self.num_rows): + in_ready.append(self.rs[i].ready) + m.d.comb += pe.i.eq(Cat(*in_ready)) + + active = Signal(reset_less=True) + out_en = Signal(reset_less=True) + m.d.comb += active.eq(~pe.n) # encoder active + m.d.comb += out_en.eq(active & self.out_op.trigger) + + # encoder active: ack relevant input, record MID, pass output + with m.If(out_en): + rs = self.rs[pe.o] + m.d.sync += self.mid.eq(pe.o) + m.d.sync += rs.ack.eq(0) + m.d.sync += self.out_op.stb.eq(0) + for j in range(self.num_ops): + m.d.sync += self.out_op.v[j].eq(rs.out_op[j]) + with m.Else(): + m.d.sync += self.out_op.stb.eq(1) + # acks all default to zero + for i in range(self.num_rows): + m.d.sync += self.rs[i].ack.eq(1) + + return m + + def ports(self): + res = [] + for i in range(self.num_rows): + inop = self.rs[i] + res += inop.in_op + [inop.stb] + return self.out_op.ports() + res + [self.mid] + + diff --git a/src/ieee754/add/iocontrol.py b/src/ieee754/add/iocontrol.py new file mode 100644 index 00000000..3d823c9b --- /dev/null +++ 
b/src/ieee754/add/iocontrol.py @@ -0,0 +1,306 @@ +""" IO Control API + + Associated development bugs: + * http://bugs.libre-riscv.org/show_bug.cgi?id=64 + * http://bugs.libre-riscv.org/show_bug.cgi?id=57 + + Stage API: + --------- + + stage requires compliance with a strict API that may be + implemented in several means, including as a static class. + + Stages do not HOLD data, and they definitely do not contain + signalling (ready/valid). They do however specify the FORMAT + of the incoming and outgoing data, and they provide a means to + PROCESS that data (from incoming format to outgoing format). + + Stage Blocks really must be combinatorial blocks. It would be ok + to have input come in from sync'd sources (clock-driven) however by + doing so they would no longer be deterministic, and chaining such + blocks with such side-effects together could result in unexpected, + unpredictable, unreproduceable behaviour. + So generally to be avoided, then unless you know what you are doing. + + the methods of a stage instance must be as follows: + + * ispec() - Input data format specification. Takes a bit of explaining. + The requirements are: something that eventually derives from + nmigen Value must be returned *OR* an iterator or iterable + or sequence (list, tuple etc.) or generator must *yield* + thing(s) that (eventually) derive from the nmigen Value class. + + Complex to state, very simple in practice: + see test_buf_pipe.py for over 25 worked examples. + + * ospec() - Output data format specification. + format requirements identical to ispec. + + * process(m, i) - Optional function for processing ispec-formatted data. + returns a combinatorial block of a result that + may be assigned to the output, by way of the "nmoperator.eq" + function. Note that what is returned here can be + extremely flexible. Even a dictionary can be returned + as long as it has fields that match precisely with the + Record into which its values is intended to be assigned. 
+ Again: see example unit tests for details. + + * setup(m, i) - Optional function for setting up submodules. + may be used for more complex stages, to link + the input (i) to submodules. must take responsibility + for adding those submodules to the module (m). + the submodules must be combinatorial blocks and + must have their inputs and output linked combinatorially. + + Both StageCls (for use with non-static classes) and Stage (for use + by static classes) are abstract classes from which, for convenience + and as a courtesy to other developers, anything conforming to the + Stage API may *choose* to derive. See Liskov Substitution Principle: + https://en.wikipedia.org/wiki/Liskov_substitution_principle + + StageChain: + ---------- + + A useful combinatorial wrapper around stages that chains them together + and then presents a Stage-API-conformant interface. By presenting + the same API as the stages it wraps, it can clearly be used recursively. + + ControlBase: + ----------- + + The base class for pipelines. Contains previous and next ready/valid/data. + Also has an extremely useful "connect" function that can be used to + connect a chain of pipelines and present the exact same prev/next + ready/valid/data API. + + Note: pipelines basically do not become pipelines as such until + handed to a derivative of ControlBase. ControlBase itself is *not* + strictly considered a pipeline class. Wishbone and AXI4 (master or + slave) could be derived from ControlBase, for example. 
+""" + +from nmigen import Signal, Cat, Const, Module, Value, Elaboratable +from nmigen.cli import verilog, rtlil +from nmigen.hdl.rec import Record + +from collections.abc import Sequence, Iterable +from collections import OrderedDict + +import nmoperator + + +class Object: + def __init__(self): + self.fields = OrderedDict() + + def __setattr__(self, k, v): + print ("kv", k, v) + if (k.startswith('_') or k in ["fields", "name", "src_loc"] or + k in dir(Object) or "fields" not in self.__dict__): + return object.__setattr__(self, k, v) + self.fields[k] = v + + def __getattr__(self, k): + if k in self.__dict__: + return object.__getattr__(self, k) + try: + return self.fields[k] + except KeyError as e: + raise AttributeError(e) + + def __iter__(self): + for x in self.fields.values(): # OrderedDict so order is preserved + if isinstance(x, Iterable): + yield from x + else: + yield x + + def eq(self, inp): + res = [] + for (k, o) in self.fields.items(): + i = getattr(inp, k) + print ("eq", o, i) + rres = o.eq(i) + if isinstance(rres, Sequence): + res += rres + else: + res.append(rres) + print (res) + return res + + def ports(self): # being called "keys" would be much better + return list(self) + + +class RecordObject(Record): + def __init__(self, layout=None, name=None): + Record.__init__(self, layout=layout or [], name=None) + + def __setattr__(self, k, v): + #print (dir(Record)) + if (k.startswith('_') or k in ["fields", "name", "src_loc"] or + k in dir(Record) or "fields" not in self.__dict__): + return object.__setattr__(self, k, v) + self.fields[k] = v + #print ("RecordObject setattr", k, v) + if isinstance(v, Record): + newlayout = {k: (k, v.layout)} + elif isinstance(v, Value): + newlayout = {k: (k, v.shape())} + else: + newlayout = {k: (k, nmoperator.shape(v))} + self.layout.fields.update(newlayout) + + def __iter__(self): + for x in self.fields.values(): # remember: fields is an OrderedDict + if isinstance(x, Iterable): + yield from x # a bit like flatten 
(nmigen.tools) + else: + yield x + + def ports(self): # would be better being called "keys" + return list(self) + + +class PrevControl(Elaboratable): + """ contains signals that come *from* the previous stage (both in and out) + * valid_i: previous stage indicating all incoming data is valid. + may be a multi-bit signal, where all bits are required + to be asserted to indicate "valid". + * ready_o: output to next stage indicating readiness to accept data + * data_i : an input - MUST be added by the USER of this class + """ + + def __init__(self, i_width=1, stage_ctl=False): + self.stage_ctl = stage_ctl + self.valid_i = Signal(i_width, name="p_valid_i") # prev >>in self + self._ready_o = Signal(name="p_ready_o") # prev < 1: + # multi-bit case: valid only when valid_i is all 1s + all1s = Const(-1, (len(self.valid_i), False)) + valid_i = (self.valid_i == all1s) + else: + # single-bit valid_i case + valid_i = self.valid_i + + # when stage indicates not ready, incoming data + # must "appear" to be not ready too + if self.stage_ctl: + valid_i = valid_i & self.s_ready_o + + return valid_i + + def elaborate(self, platform): + m = Module() + m.d.comb += self.trigger.eq(self.valid_i_test & self.ready_o) + return m + + def eq(self, i): + return [nmoperator.eq(self.data_i, i.data_i), + self.ready_o.eq(i.ready_o), + self.valid_i.eq(i.valid_i)] + + def __iter__(self): + yield self.valid_i + yield self.ready_o + if hasattr(self.data_i, "ports"): + yield from self.data_i.ports() + elif isinstance(self.data_i, Sequence): + yield from self.data_i + else: + yield self.data_i + + def ports(self): + return list(self) + + +class NextControl(Elaboratable): + """ contains the signals that go *to* the next stage (both in and out) + * valid_o: output indicating to next stage that data is valid + * ready_i: input from next stage indicating that it can accept data + * data_o : an output - MUST be added by the USER of this class + """ + def __init__(self, stage_ctl=False): + self.stage_ctl = 
stage_ctl + self.valid_o = Signal(name="n_valid_o") # self out>> next + self.ready_i = Signal(name="n_ready_i") # self < 1: + r_data = Array(r_data) + p_valid_i = Array(p_valid_i) + n_ready_in = Array(n_ready_in) + data_valid = Array(data_valid) + + nirn = Signal(reset_less=True) + m.d.comb += nirn.eq(~self.n.ready_i) + mid = self.p_mux.m_id + for i in range(p_len): + m.d.comb += data_valid[i].eq(0) + m.d.comb += n_ready_in[i].eq(1) + m.d.comb += p_valid_i[i].eq(0) + m.d.comb += self.p[i].ready_o.eq(0) + m.d.comb += p_valid_i[mid].eq(self.p_mux.active) + m.d.comb += self.p[mid].ready_o.eq(~data_valid[mid] | self.n.ready_i) + m.d.comb += n_ready_in[mid].eq(nirn & data_valid[mid]) + anyvalid = Signal(i, reset_less=True) + av = [] + for i in range(p_len): + av.append(data_valid[i]) + anyvalid = Cat(*av) + m.d.comb += self.n.valid_o.eq(anyvalid.bool()) + m.d.comb += data_valid[mid].eq(p_valid_i[mid] | \ + (n_ready_in[mid] & data_valid[mid])) + + for i in range(p_len): + vr = Signal(reset_less=True) + m.d.comb += vr.eq(self.p[i].valid_i & self.p[i].ready_o) + with m.If(vr): + m.d.comb += eq(r_data[i], self.p[i].data_i) + + m.d.comb += eq(self.n.data_o, self.process(r_data[mid])) + + return m + + +class CombMuxOutPipe(CombMultiOutPipeline): + def __init__(self, stage, n_len): + # HACK: stage is also the n-way multiplexer + CombMultiOutPipeline.__init__(self, stage, n_len=n_len, n_mux=stage) + + # HACK: n-mux is also the stage... 
so set the muxid equal to input mid + stage.m_id = self.p.data_i.mid + + + +class InputPriorityArbiter(Elaboratable): + """ arbitration module for Input-Mux pipe, baed on PriorityEncoder + """ + def __init__(self, pipe, num_rows): + self.pipe = pipe + self.num_rows = num_rows + self.mmax = int(log(self.num_rows) / log(2)) + self.m_id = Signal(self.mmax, reset_less=True) # multiplex id + self.active = Signal(reset_less=True) + + def elaborate(self, platform): + m = Module() + + assert len(self.pipe.p) == self.num_rows, \ + "must declare input to be same size" + pe = PriorityEncoder(self.num_rows) + m.submodules.selector = pe + + # connect priority encoder + in_ready = [] + for i in range(self.num_rows): + p_valid_i = Signal(reset_less=True) + m.d.comb += p_valid_i.eq(self.pipe.p[i].valid_i_test) + in_ready.append(p_valid_i) + m.d.comb += pe.i.eq(Cat(*in_ready)) # array of input "valids" + m.d.comb += self.active.eq(~pe.n) # encoder active (one input valid) + m.d.comb += self.m_id.eq(pe.o) # output one active input + + return m + + def ports(self): + return [self.m_id, self.active] + + + +class PriorityCombMuxInPipe(CombMultiInPipeline): + """ an example of how to use the combinatorial pipeline. 
+ """ + + def __init__(self, stage, p_len=2): + p_mux = InputPriorityArbiter(self, p_len) + CombMultiInPipeline.__init__(self, stage, p_len, p_mux) + + +if __name__ == '__main__': + + dut = PriorityCombMuxInPipe(ExampleStage) + vl = rtlil.convert(dut, ports=dut.ports()) + with open("test_combpipe.il", "w") as f: + f.write(vl) diff --git a/src/ieee754/add/nmigen_add_experiment.py b/src/ieee754/add/nmigen_add_experiment.py new file mode 100644 index 00000000..ecb1d35b --- /dev/null +++ b/src/ieee754/add/nmigen_add_experiment.py @@ -0,0 +1,28 @@ +# IEEE Floating Point Adder (Single Precision) +# Copyright (C) Jonathan P Dawson 2013 +# 2013-12-12 + +from nmigen.cli import main, verilog +from fpadd.statemachine import FPADDBase, FPADD +from fpadd.pipeline import FPADDMuxInOut + +if __name__ == "__main__": + if True: + alu = FPADD(width=32, id_wid=5, single_cycle=True) + main(alu, ports=alu.rs[0][0].ports() + \ + alu.rs[0][1].ports() + \ + alu.res[0].ports() + \ + [alu.ids.in_mid, alu.ids.out_mid]) + else: + alu = FPADDBase(width=32, id_wid=5, single_cycle=True) + main(alu, ports=[alu.in_a, alu.in_b] + \ + alu.in_t.ports() + \ + alu.out_z.ports() + \ + [alu.in_mid, alu.out_mid]) + + + # works... 
but don't use, just do "python fname.py convert -t v" + #print (verilog.convert(alu, ports=[ + # ports=alu.in_a.ports() + \ + # alu.in_b.ports() + \ + # alu.out_z.ports()) diff --git a/src/ieee754/add/nmigen_div_experiment.py b/src/ieee754/add/nmigen_div_experiment.py new file mode 100644 index 00000000..a7e215cb --- /dev/null +++ b/src/ieee754/add/nmigen_div_experiment.py @@ -0,0 +1,246 @@ +# IEEE Floating Point Divider (Single Precision) +# Copyright (C) Jonathan P Dawson 2013 +# 2013-12-12 + +from nmigen import Module, Signal, Const, Cat +from nmigen.cli import main, verilog + +from fpbase import FPNumIn, FPNumOut, FPOpIn, FPOpOut, Overflow, FPBase, FPState +from singlepipe import eq + +class Div: + def __init__(self, width): + self.width = width + self.quot = Signal(width) # quotient + self.dor = Signal(width) # divisor + self.dend = Signal(width) # dividend + self.rem = Signal(width) # remainder + self.count = Signal(7) # loop count + + self.czero = Const(0, width) + + def reset(self, m): + m.d.sync += [ + self.quot.eq(self.czero), + self.rem.eq(self.czero), + self.count.eq(Const(0, 7)) + ] + + +class FPDIV(FPBase): + + def __init__(self, width): + FPBase.__init__(self) + self.width = width + + self.in_a = FPOpIn(width) + self.in_b = FPOpIn(width) + self.out_z = FPOpOut(width) + + self.states = [] + + def add_state(self, state): + self.states.append(state) + return state + + def elaborate(self, platform=None): + """ creates the HDL code-fragment for FPDiv + """ + m = Module() + + # Latches + a = FPNumIn(None, self.width, False) + b = FPNumIn(None, self.width, False) + z = FPNumOut(self.width, False) + + div = Div(a.m_width*2 + 3) # double the mantissa width plus g/r/sticky + + of = Overflow() + m.submodules.in_a = a + m.submodules.in_b = b + m.submodules.z = z + m.submodules.of = of + + m.d.comb += a.v.eq(self.in_a.v) + m.d.comb += b.v.eq(self.in_b.v) + + with m.FSM() as fsm: + + # ****** + # gets operand a + + with m.State("get_a"): + res = self.get_op(m, 
self.in_a, a, "get_b") + m.d.sync += eq([a, self.in_a.ready_o], res) + + # ****** + # gets operand b + + with m.State("get_b"): + res = self.get_op(m, self.in_b, b, "special_cases") + m.d.sync += eq([b, self.in_b.ready_o], res) + + # ****** + # special cases: NaNs, infs, zeros, denormalised + # NOTE: some of these are unique to div. see "Special Operations" + # https://steve.hollasch.net/cgindex/coding/ieeefloat.html + + with m.State("special_cases"): + + # if a is NaN or b is NaN return NaN + with m.If(a.is_nan | b.is_nan): + m.next = "put_z" + m.d.sync += z.nan(1) + + # if a is Inf and b is Inf return NaN + with m.Elif(a.is_inf & b.is_inf): + m.next = "put_z" + m.d.sync += z.nan(1) + + # if a is inf return inf (or NaN if b is zero) + with m.Elif(a.is_inf): + m.next = "put_z" + m.d.sync += z.inf(a.s ^ b.s) + + # if b is inf return zero + with m.Elif(b.is_inf): + m.next = "put_z" + m.d.sync += z.zero(a.s ^ b.s) + + # if a is zero return zero (or NaN if b is zero) + with m.Elif(a.is_zero): + m.next = "put_z" + # if b is zero return NaN + with m.If(b.is_zero): + m.d.sync += z.nan(1) + with m.Else(): + m.d.sync += z.zero(a.s ^ b.s) + + # if b is zero return Inf + with m.Elif(b.is_zero): + m.next = "put_z" + m.d.sync += z.inf(a.s ^ b.s) + + # Denormalised Number checks + with m.Else(): + m.next = "normalise_a" + self.denormalise(m, a) + self.denormalise(m, b) + + # ****** + # normalise_a + + with m.State("normalise_a"): + self.op_normalise(m, a, "normalise_b") + + # ****** + # normalise_b + + with m.State("normalise_b"): + self.op_normalise(m, b, "divide_0") + + # ****** + # First stage of divide. initialise state + + with m.State("divide_0"): + m.next = "divide_1" + m.d.sync += [ + z.s.eq(a.s ^ b.s), # sign + z.e.eq(a.e - b.e), # exponent + div.dend.eq(a.m<<(a.m_width+3)), # 3 bits for g/r/sticky + div.dor.eq(b.m), + ] + div.reset(m) + + # ****** + # Second stage of divide. 
+ + with m.State("divide_1"): + m.next = "divide_2" + m.d.sync += [ + div.quot.eq(div.quot << 1), + div.rem.eq(Cat(div.dend[-1], div.rem[0:])), + div.dend.eq(div.dend << 1), + ] + + # ****** + # Third stage of divide. + # This stage ends by jumping out to divide_3 + # However it defaults to jumping to divide_1 (which comes back here) + + with m.State("divide_2"): + with m.If(div.rem >= div.dor): + m.d.sync += [ + div.quot[0].eq(1), + div.rem.eq(div.rem - div.dor), + ] + with m.If(div.count == div.width-2): + m.next = "divide_3" + with m.Else(): + m.next = "divide_1" + m.d.sync += [ + div.count.eq(div.count + 1), + ] + + # ****** + # Fourth stage of divide. + + with m.State("divide_3"): + m.next = "normalise_1" + m.d.sync += [ + z.m.eq(div.quot[3:]), + of.guard.eq(div.quot[2]), + of.round_bit.eq(div.quot[1]), + of.sticky.eq(div.quot[0] | (div.rem != 0)) + ] + + # ****** + # First stage of normalisation. + + with m.State("normalise_1"): + self.normalise_1(m, z, of, "normalise_2") + + # ****** + # Second stage of normalisation. + + with m.State("normalise_2"): + self.normalise_2(m, z, of, "round") + + # ****** + # rounding stage + + with m.State("round"): + self.roundz(m, z, of.roundz) + m.next = "corrections" + + # ****** + # correction stage + + with m.State("corrections"): + self.corrections(m, z, "pack") + + # ****** + # pack stage + + with m.State("pack"): + self.pack(m, z, "put_z") + + # ****** + # put_z stage + + with m.State("put_z"): + self.put_z(m, z, self.out_z, "get_a") + + return m + + +if __name__ == "__main__": + alu = FPDIV(width=32) + main(alu, ports=alu.in_a.ports() + alu.in_b.ports() + alu.out_z.ports()) + + + # works... 
but don't use, just do "python fname.py convert -t v" + #print (verilog.convert(alu, ports=[ + # ports=alu.in_a.ports() + \ + # alu.in_b.ports() + \ + # alu.out_z.ports()) diff --git a/src/ieee754/add/nmoperator.py b/src/ieee754/add/nmoperator.py new file mode 100644 index 00000000..bd5e5544 --- /dev/null +++ b/src/ieee754/add/nmoperator.py @@ -0,0 +1,171 @@ +""" nmigen operator functions / utils + + eq: + -- + + a strategically very important function that is identical in function + to nmigen's Signal.eq function, except it may take objects, or a list + of objects, or a tuple of objects, and where objects may also be + Records. +""" + +from nmigen import Signal, Cat, Const, Mux, Module, Value, Elaboratable +from nmigen.cli import verilog, rtlil +from nmigen.lib.fifo import SyncFIFO, SyncFIFOBuffered +from nmigen.hdl.ast import ArrayProxy +from nmigen.hdl.rec import Record, Layout + +from abc import ABCMeta, abstractmethod +from collections.abc import Sequence, Iterable +from collections import OrderedDict +from queue import Queue +import inspect + + +class Visitor2: + """ a helper class for iterating twin-argument compound data structures. + + Record is a special (unusual, recursive) case, where the input may be + specified as a dictionary (which may contain further dictionaries, + recursively), where the field names of the dictionary must match + the Record's field spec. Alternatively, an object with the same + member names as the Record may be assigned: it does not have to + *be* a Record. + + ArrayProxy is also special-cased, it's a bit messy: whilst ArrayProxy + has an eq function, the object being assigned to it (e.g. a python + object) might not. despite the *input* having an eq function, + that doesn't help us, because it's the *ArrayProxy* that's being + assigned to. so.... we cheat. use the ports() function of the + python object, enumerate them, find out the list of Signals that way, + and assign them. 
+ """ + def iterator2(self, o, i): + if isinstance(o, dict): + yield from self.dict_iter2(o, i) + + if not isinstance(o, Sequence): + o, i = [o], [i] + for (ao, ai) in zip(o, i): + #print ("visit", fn, ao, ai) + if isinstance(ao, Record): + yield from self.record_iter2(ao, ai) + elif isinstance(ao, ArrayProxy) and not isinstance(ai, Value): + yield from self.arrayproxy_iter2(ao, ai) + else: + yield (ao, ai) + + def dict_iter2(self, o, i): + for (k, v) in o.items(): + print ("d-iter", v, i[k]) + yield (v, i[k]) + return res + + def _not_quite_working_with_all_unit_tests_record_iter2(self, ao, ai): + print ("record_iter2", ao, ai, type(ao), type(ai)) + if isinstance(ai, Value): + if isinstance(ao, Sequence): + ao, ai = [ao], [ai] + for o, i in zip(ao, ai): + yield (o, i) + return + for idx, (field_name, field_shape, _) in enumerate(ao.layout): + if isinstance(field_shape, Layout): + val = ai.fields + else: + val = ai + if hasattr(val, field_name): # check for attribute + val = getattr(val, field_name) + else: + val = val[field_name] # dictionary-style specification + yield from self.iterator2(ao.fields[field_name], val) + + def record_iter2(self, ao, ai): + for idx, (field_name, field_shape, _) in enumerate(ao.layout): + if isinstance(field_shape, Layout): + val = ai.fields + else: + val = ai + if hasattr(val, field_name): # check for attribute + val = getattr(val, field_name) + else: + val = val[field_name] # dictionary-style specification + yield from self.iterator2(ao.fields[field_name], val) + + def arrayproxy_iter2(self, ao, ai): + for p in ai.ports(): + op = getattr(ao, p.name) + print ("arrayproxy - p", p, p.name) + yield from self.iterator2(op, p) + + +class Visitor: + """ a helper class for iterating single-argument compound data structures. + similar to Visitor2. 
+ """ + def iterate(self, i): + """ iterate a compound structure recursively using yield + """ + if not isinstance(i, Sequence): + i = [i] + for ai in i: + #print ("iterate", ai) + if isinstance(ai, Record): + #print ("record", list(ai.layout)) + yield from self.record_iter(ai) + elif isinstance(ai, ArrayProxy) and not isinstance(ai, Value): + yield from self.array_iter(ai) + else: + yield ai + + def record_iter(self, ai): + for idx, (field_name, field_shape, _) in enumerate(ai.layout): + if isinstance(field_shape, Layout): + val = ai.fields + else: + val = ai + if hasattr(val, field_name): # check for attribute + val = getattr(val, field_name) + else: + val = val[field_name] # dictionary-style specification + #print ("recidx", idx, field_name, field_shape, val) + yield from self.iterate(val) + + def array_iter(self, ai): + for p in ai.ports(): + yield from self.iterate(p) + + +def eq(o, i): + """ makes signals equal: a helper routine which identifies if it is being + passed a list (or tuple) of objects, or signals, or Records, and calls + the objects' eq function. + """ + res = [] + for (ao, ai) in Visitor2().iterator2(o, i): + rres = ao.eq(ai) + if not isinstance(rres, Sequence): + rres = [rres] + res += rres + return res + + +def shape(i): + #print ("shape", i) + r = 0 + for part in list(i): + #print ("shape?", part) + s, _ = part.shape() + r += s + return r, False + + +def cat(i): + """ flattens a compound structure recursively using Cat + """ + from nmigen.tools import flatten + #res = list(flatten(i)) # works (as of nmigen commit f22106e5) HOWEVER... + res = list(Visitor().iterate(i)) # needed because input may be a sequence + return Cat(*res) + + diff --git a/src/ieee754/add/pipeline.py b/src/ieee754/add/pipeline.py new file mode 100644 index 00000000..afcee743 --- /dev/null +++ b/src/ieee754/add/pipeline.py @@ -0,0 +1,394 @@ +""" Example 5: Making use of PyRTL and Introspection. 
""" + +from collections.abc import Sequence + +from nmigen import Signal +from nmigen.hdl.rec import Record +from nmigen import tracer +from nmigen.compat.fhdl.bitcontainer import value_bits_sign +from contextlib import contextmanager + +from nmoperator import eq +from singlepipe import StageCls, ControlBase, BufferedHandshake +from singlepipe import UnbufferedPipeline + + +# The following example shows how pyrtl can be used to make some interesting +# hardware structures using python introspection. In particular, this example +# makes a N-stage pipeline structure. Any specific pipeline is then a derived +# class of SimplePipeline where methods with names starting with "stage" are +# stages, and new members with names not starting with "_" are to be registered +# for the next stage. + +def like(value, rname, pipe, pipemode=False): + if isinstance(value, ObjectProxy): + return ObjectProxy.like(pipe, value, pipemode=pipemode, + name=rname, reset_less=True) + else: + return Signal(value_bits_sign(value), name=rname, + reset_less=True) + return Signal.like(value, name=rname, reset_less=True) + +def get_assigns(_assigns): + assigns = [] + for e in _assigns: + if isinstance(e, ObjectProxy): + assigns += get_assigns(e._assigns) + else: + assigns.append(e) + return assigns + + +def get_eqs(_eqs): + eqs = [] + for e in _eqs: + if isinstance(e, ObjectProxy): + eqs += get_eqs(e._eqs) + else: + eqs.append(e) + return eqs + + +class ObjectProxy: + def __init__(self, m, name=None, pipemode=False, syncmode=True): + self._m = m + if name is None: + name = tracer.get_var_name(default=None) + self.name = name + self._pipemode = pipemode + self._syncmode = syncmode + self._eqs = {} + self._assigns = [] + self._preg_map = {} + + @classmethod + def like(cls, m, value, pipemode=False, name=None, src_loc_at=0, **kwargs): + name = name or tracer.get_var_name(depth=2 + src_loc_at, + default="$like") + + src_loc_at_1 = 1 + src_loc_at + r = ObjectProxy(m, value.name, pipemode) + #for a, 
aname in value._preg_map.items(): + # r._preg_map[aname] = like(a, aname, m, pipemode) + for a in value.ports(): + aname = a.name + r._preg_map[aname] = like(a, aname, m, pipemode) + return r + + def __repr__(self): + subobjs = [] + for a in self.ports(): + aname = a.name + ai = self._preg_map[aname] + subobjs.append(repr(ai)) + return "" % subobjs + + def get_specs(self, liked=False): + res = [] + for k, v in self._preg_map.items(): + #v = like(v, k, stage._m) + res.append(v) + if isinstance(v, ObjectProxy): + res += v.get_specs() + return res + + def eq(self, i): + print ("ObjectProxy eq", self, i) + res = [] + for a in self.ports(): + aname = a.name + ai = i._preg_map[aname] + res.append(a.eq(ai)) + return res + + def ports(self): + res = [] + for aname, a in self._preg_map.items(): + if isinstance(a, Signal) or isinstance(a, ObjectProxy) or \ + isinstance(a, Record): + res.append(a) + #print ("ObjectPorts", res) + return res + + def __getattr__(self, name): + try: + v = self._preg_map[name] + return v + #return like(v, name, self._m) + except KeyError: + raise AttributeError( + 'error, no pipeline register "%s" defined for OP %s' + % (name, self.name)) + + def __setattr__(self, name, value): + if name.startswith('_') or name in ['name', 'ports', 'eq', 'like']: + # do not do anything tricky with variables starting with '_' + object.__setattr__(self, name, value) + return + #rname = "%s_%s" % (self.name, name) + rname = name + new_pipereg = like(value, rname, self._m, self._pipemode) + self._preg_map[name] = new_pipereg + #object.__setattr__(self, name, new_pipereg) + if self._pipemode: + #print ("OP pipemode", self._syncmode, new_pipereg, value) + assign = eq(new_pipereg, value) + if self._syncmode: + self._m.d.sync += assign + else: + self._m.d.comb += assign + elif self._m: + #print ("OP !pipemode assign", new_pipereg, value, type(value)) + self._m.d.comb += eq(new_pipereg, value) + else: + #print ("OP !pipemode !m", new_pipereg, value, type(value)) + 
self._assigns += eq(new_pipereg, value) + if isinstance(value, ObjectProxy): + #print ("OP, defer assigns:", value._assigns) + self._assigns += value._assigns + self._eqs.append(value._eqs) + + +class PipelineStage: + """ Pipeline builder stage with auto generation of pipeline registers. + """ + + def __init__(self, name, m, prev=None, pipemode=False, ispec=None): + self._m = m + self._stagename = name + self._preg_map = {'__nextstage__': {}} + self._prev_stage = prev + self._ispec = ispec + if ispec: + self._preg_map[self._stagename] = ispec + if prev: + print ("prev", prev._stagename, prev._preg_map) + #if prev._stagename in prev._preg_map: + # m = prev._preg_map[prev._stagename] + # self._preg_map[prev._stagename] = m + if '__nextstage__' in prev._preg_map: + m = prev._preg_map['__nextstage__'] + m = likedict(m) + self._preg_map[self._stagename] = m + #for k, v in m.items(): + #m[k] = like(v, k, self._m) + print ("make current", self._stagename, m) + self._pipemode = pipemode + self._eqs = {} + self._assigns = [] + + def __getattribute__(self, name): + if name.startswith('_'): + return object.__getattribute__(self, name) + #if name in self._preg_map['__nextstage__']: + # return self._preg_map['__nextstage__'][name] + try: + print ("getattr", name, object.__getattribute__(self, '_preg_map')) + v = self._preg_map[self._stagename][name] + return v + #return like(v, name, self._m) + except KeyError: + raise AttributeError( + 'error, no pipeline register "%s" defined for stage %s' + % (name, self._stagename)) + + def __setattr__(self, name, value): + if name.startswith('_'): + # do not do anything tricky with variables starting with '_' + object.__setattr__(self, name, value) + return + pipereg_id = self._stagename + rname = 'pipereg_' + pipereg_id + '_' + name + new_pipereg = like(value, rname, self._m, self._pipemode) + next_stage = '__nextstage__' + if next_stage not in self._preg_map: + self._preg_map[next_stage] = {} + self._preg_map[next_stage][name] = 
new_pipereg + print ("setattr", name, value, self._preg_map) + if self._pipemode: + self._eqs[name] = new_pipereg + assign = eq(new_pipereg, value) + print ("pipemode: append", new_pipereg, value, assign) + if isinstance(value, ObjectProxy): + print ("OP, assigns:", value._assigns) + self._assigns += value._assigns + self._eqs[name]._eqs = value._eqs + #self._m.d.comb += assign + self._assigns += assign + elif self._m: + print ("!pipemode: assign", new_pipereg, value) + assign = eq(new_pipereg, value) + self._m.d.sync += assign + else: + print ("!pipemode !m: defer assign", new_pipereg, value) + assign = eq(new_pipereg, value) + self._eqs[name] = new_pipereg + self._assigns += assign + if isinstance(value, ObjectProxy): + print ("OP, defer assigns:", value._assigns) + self._assigns += value._assigns + self._eqs[name]._eqs = value._eqs + +def likelist(specs): + res = [] + for v in specs: + res.append(like(v, v.name, None, pipemode=True)) + return res + +def likedict(specs): + if not isinstance(specs, dict): + return like(specs, specs.name, None, pipemode=True) + res = {} + for k, v in specs.items(): + res[k] = likedict(v) + return res + + +class AutoStage(StageCls): + def __init__(self, inspecs, outspecs, eqs, assigns): + self.inspecs, self.outspecs = inspecs, outspecs + self.eqs, self.assigns = eqs, assigns + #self.o = self.ospec() + def ispec(self): return likedict(self.inspecs) + def ospec(self): return likedict(self.outspecs) + + def process(self, i): + print ("stage process", i) + return self.eqs + + def setup(self, m, i): + print ("stage setup i", i, m) + print ("stage setup inspecs", self.inspecs) + print ("stage setup outspecs", self.outspecs) + print ("stage setup eqs", self.eqs) + #self.o = self.ospec() + m.d.comb += eq(self.inspecs, i) + #m.d.comb += eq(self.outspecs, self.eqs) + #m.d.comb += eq(self.o, i) + + +class AutoPipe(UnbufferedPipeline): + def __init__(self, stage, assigns): + UnbufferedPipeline.__init__(self, stage) + self.assigns = assigns + + 
def elaborate(self, platform): + m = UnbufferedPipeline.elaborate(self, platform) + m.d.comb += self.assigns + print ("assigns", self.assigns, m) + return m + + +class PipeManager: + def __init__(self, m, pipemode=False, pipetype=None): + self.m = m + self.pipemode = pipemode + self.pipetype = pipetype + + @contextmanager + def Stage(self, name, prev=None, ispec=None): + if ispec: + ispec = likedict(ispec) + print ("start stage", name, ispec) + stage = PipelineStage(name, None, prev, self.pipemode, ispec=ispec) + try: + yield stage, self.m #stage._m + finally: + pass + if self.pipemode: + if stage._ispec: + print ("use ispec", stage._ispec) + inspecs = stage._ispec + else: + inspecs = self.get_specs(stage, name) + #inspecs = likedict(inspecs) + outspecs = self.get_specs(stage, '__nextstage__', liked=True) + print ("stage inspecs", name, inspecs) + print ("stage outspecs", name, outspecs) + eqs = stage._eqs # get_eqs(stage._eqs) + assigns = get_assigns(stage._assigns) + print ("stage eqs", name, eqs) + print ("stage assigns", name, assigns) + s = AutoStage(inspecs, outspecs, eqs, assigns) + self.stages.append(s) + print ("end stage", name, self.pipemode, "\n") + + def get_specs(self, stage, name, liked=False): + return stage._preg_map[name] + if name in stage._preg_map: + res = [] + for k, v in stage._preg_map[name].items(): + #v = like(v, k, stage._m) + res.append(v) + #if isinstance(v, ObjectProxy): + # res += v.get_specs() + return res + return {} + + def __enter__(self): + self.stages = [] + return self + + def __exit__(self, *args): + print ("exit stage", args) + pipes = [] + cb = ControlBase() + for s in self.stages: + print ("stage specs", s, s.inspecs, s.outspecs) + if self.pipetype == 'buffered': + p = BufferedHandshake(s) + else: + p = AutoPipe(s, s.assigns) + pipes.append(p) + self.m.submodules += p + + self.m.d.comb += cb.connect(pipes) + + +class SimplePipeline: + """ Pipeline builder with auto generation of pipeline registers. 
+ """ + + def __init__(self, m): + self._m = m + self._pipeline_register_map = {} + self._current_stage_num = 0 + + def _setup(self): + stage_list = [] + for method in dir(self): + if method.startswith('stage'): + stage_list.append(method) + for stage in sorted(stage_list): + stage_method = getattr(self, stage) + stage_method() + self._current_stage_num += 1 + + def __getattr__(self, name): + try: + return self._pipeline_register_map[self._current_stage_num][name] + except KeyError: + raise AttributeError( + 'error, no pipeline register "%s" defined for stage %d' + % (name, self._current_stage_num)) + + def __setattr__(self, name, value): + if name.startswith('_'): + # do not do anything tricky with variables starting with '_' + object.__setattr__(self, name, value) + return + next_stage = self._current_stage_num + 1 + pipereg_id = str(self._current_stage_num) + 'to' + str(next_stage) + rname = 'pipereg_' + pipereg_id + '_' + name + #new_pipereg = Signal(value_bits_sign(value), name=rname, + # reset_less=True) + if isinstance(value, ObjectProxy): + new_pipereg = ObjectProxy.like(self._m, value, + name=rname, reset_less = True) + else: + new_pipereg = Signal.like(value, name=rname, reset_less = True) + if next_stage not in self._pipeline_register_map: + self._pipeline_register_map[next_stage] = {} + self._pipeline_register_map[next_stage][name] = new_pipereg + self._m.d.sync += eq(new_pipereg, value) + diff --git a/src/ieee754/add/pipeline_example.py b/src/ieee754/add/pipeline_example.py new file mode 100644 index 00000000..799caf6d --- /dev/null +++ b/src/ieee754/add/pipeline_example.py @@ -0,0 +1,204 @@ +""" Example 5: Making use of PyRTL and Introspection. """ + +from nmigen import Module, Signal, Const +from nmigen.cli import main, verilog, rtlil + + +from pipeline import SimplePipeline, ObjectProxy, PipeManager + + +class SimplePipelineExample(SimplePipeline): + """ A very simple pipeline to show how registers are inferred. 
""" + + def __init__(self, pipe): + SimplePipeline.__init__(self, pipe) + self._loopback = Signal(4) + self._setup() + + def stage0(self): + self.n = ~self._loopback + + def stage1(self): + self.n = self.n + 2 + + def stage2(self): + localv = Signal(4) + self._pipe.comb += localv.eq(2) + self.n = self.n << localv + + def stage3(self): + self.n = ~self.n + + def stage4(self): + self._pipe.sync += self._loopback.eq(self.n + 3) + + +class ObjectBasedPipelineExample(SimplePipeline): + """ A very simple pipeline to show how registers are inferred. """ + + def __init__(self, m): + SimplePipeline.__init__(self, m) + self._loopback = Signal(4) + o = ObjectProxy(m) + o.a = Signal(4) + o.b = Signal(4) + self._obj = o + self._setup() + + def stage0(self): + self.n = ~self._loopback + self.o = self._obj + + def stage1(self): + self.n = self.n + self.o.a + o = ObjectProxy(self._m) + o.c = self.n + o.d = self.o.b + self.n + Const(5) + self.o = o + + def stage2(self): + localv = Signal(4) + self._m.d.comb += localv.eq(2) + self.n = self.n << localv + o = ObjectProxy(self._m) + o.e = self.n + self.o.c + self.o.d + self.o = o + + def stage3(self): + self.n = ~self.n + self.o = self.o + self.o.e = self.o.e + self.n + + def stage4(self): + self._m.d.sync += self._loopback.eq(self.n + 3 + self.o.e) + + +class PipeModule: + + def __init__(self): + self.m = Module() + self.p = ObjectBasedPipelineExample(self.m) + + def elaborate(self, platform=None): + return self.m + + +class PipelineStageExample: + + def __init__(self): + self._loopback = Signal(4, name="loopback") + + def elaborate(self, platform=None): + + m = Module() + + with PipeManager(m, pipemode=True) as pipe: + + ispec={'loopback': self._loopback} + with pipe.Stage("first", ispec=ispec) as (p, m): + p.n = ~p.loopback + with pipe.Stage("second", p) as (p, m): + #p.n = ~self._loopback + 2 + p.n = p.n + Const(2) + with pipe.Stage("third", p) as (p, m): + #p.n = ~self._loopback + 5 + localv = Signal(4) + m.d.comb += localv.eq(2) 
+ p.n = p.n << localv + Const(1) + #p.m = p.n + 2 + + print (pipe.stages) + + return m + +class PipelineStageObjectExample: + + def __init__(self): + self.loopback = Signal(4) + + def elaborate(self, platform=None): + + m = Module() + + o = ObjectProxy(None, pipemode=False) + o.a = Signal(4) + o.b = Signal(4) + self.obj = o + + localv2 = Signal(4) + m.d.sync += localv2.eq(localv2 + 3) + + #m.d.comb += self.obj.a.eq(localv2 + 1) + #m.d.sync += self._loopback.eq(localv2) + + ispec= {'loopback': self.loopback, 'obj': self.obj} + with PipeManager(m, pipemode=True) as pipe: + + with pipe.Stage("first", ispec=ispec) as (p, m): + p.n = ~p.loopback + p.o = p.obj + with pipe.Stage("second", p) as (p, m): + #p.n = ~self.loopback + 2 + localn = Signal(4) + m.d.comb += localn.eq(p.n) + o = ObjectProxy(None, pipemode=False) + o.c = localn + o.d = p.o.b + localn + Const(5) + p.n = localn + p.o = o + with pipe.Stage("third", p) as (p, m): + #p.n = ~self._loopback + 5 + localv = Signal(4) + m.d.comb += localv.eq(2) + p.n = p.n << localv + o = ObjectProxy(None, pipemode=False) + o.e = p.n + p.o.c + p.o.d + p.o = o + + print ("stages", pipe.stages) + + return m + + +class PipelineStageObjectExample2: + + def __init__(self): + self._loopback = Signal(4) + + def elaborate(self, platform=None): + + m = Module() + + ispec= [self._loopback] + with PipeManager(m, pipemode=True) as pipe: + + with pipe.Stage("first", + ispec=ispec) as (p, m): + p.n = ~self._loopback + o = ObjectProxy(None, pipemode=False) + o.b = ~self._loopback + Const(5) + p.o = o + + print ("stages", pipe.stages) + + return m + + + +if __name__ == "__main__": + example = PipeModule() + with open("pipe_module.il", "w") as f: + f.write(rtlil.convert(example, ports=[ + example.p._loopback, + ])) + example = PipelineStageExample() + with open("pipe_stage_module.il", "w") as f: + f.write(rtlil.convert(example, ports=[ + example._loopback, + ])) + #exit(0) + example = PipelineStageObjectExample() + with 
open("pipe_stage_object_module.il", "w") as f: + f.write(rtlil.convert(example, ports=[ + example.loopback, + ])) diff --git a/src/ieee754/add/queue.py b/src/ieee754/add/queue.py new file mode 100644 index 00000000..0038953d --- /dev/null +++ b/src/ieee754/add/queue.py @@ -0,0 +1,190 @@ +# Copyright (c) 2014 - 2019 The Regents of the University of +# California (Regents). All Rights Reserved. Redistribution and use in +# source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# two paragraphs of disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# two paragraphs of disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the Regents nor the names of its contributors +# may be used to endorse or promote products derived from this +# software without specific prior written permission. +# IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, +# SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, +# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF +# REGENTS HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF +# ANY, PROVIDED HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION +# TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR +# MODIFICATIONS. 
+ +from nmigen import Module, Signal, Memory, Mux, Elaboratable +from nmigen.tools import bits_for +from nmigen.cli import main +from nmigen.lib.fifo import FIFOInterface + +# translated from https://github.com/freechipsproject/chisel3/blob/a4a29e29c3f1eed18f851dcf10bdc845571dfcb6/src/main/scala/chisel3/util/Decoupled.scala#L185 # noqa + + +class Queue(FIFOInterface, Elaboratable): + def __init__(self, width, depth, fwft=True, pipe=False): + """ Queue (FIFO) with pipe mode and first-write fall-through capability + + * :width: width of Queue data in/out + * :depth: queue depth. NOTE: may be set to 0 (this is ok) + * :fwft : first-write, fall-through mode (Chisel Queue "flow" mode) + * :pipe : pipe mode. NOTE: this mode can cause unanticipated + problems. when read is enabled, so is writeable. + therefore if read is enabled, the data ABSOLUTELY MUST + be read. + + fwft mode = True basically means that the data may be transferred + combinatorially from input to output. + + Attributes: + * level: available free space (number of unread entries) + + din = enq_data, writable = enq_ready, we = enq_valid + dout = deq_data, re = deq_ready, readable = deq_valid + """ + FIFOInterface.__init__(self, width, depth, fwft) + self.pipe = pipe + self.depth = depth + self.level = Signal(bits_for(depth)) + + def elaborate(self, platform): + m = Module() + + # set up an SRAM. 
XXX bug in Memory: cannot create SRAM of depth 1 + ram = Memory(self.width, self.depth if self.depth > 1 else 2) + m.submodules.ram_read = ram_read = ram.read_port(synchronous=False) + m.submodules.ram_write = ram_write = ram.write_port() + + # convenience names + p_ready_o = self.writable + p_valid_i = self.we + enq_data = self.din + + n_valid_o = self.readable + n_ready_i = self.re + deq_data = self.dout + + # intermediaries + ptr_width = bits_for(self.depth - 1) if self.depth > 1 else 0 + enq_ptr = Signal(ptr_width) # cyclic pointer to "insert" point (wrport) + deq_ptr = Signal(ptr_width) # cyclic pointer to "remove" point (rdport) + maybe_full = Signal() # not reset_less (set by sync) + + # temporaries + do_enq = Signal(reset_less=True) + do_deq = Signal(reset_less=True) + ptr_diff = Signal(ptr_width) + ptr_match = Signal(reset_less=True) + empty = Signal(reset_less=True) + full = Signal(reset_less=True) + enq_max = Signal(reset_less=True) + deq_max = Signal(reset_less=True) + + m.d.comb += [ptr_match.eq(enq_ptr == deq_ptr), # read-ptr = write-ptr + ptr_diff.eq(enq_ptr - deq_ptr), + enq_max.eq(enq_ptr == self.depth - 1), + deq_max.eq(deq_ptr == self.depth - 1), + empty.eq(ptr_match & ~maybe_full), + full.eq(ptr_match & maybe_full), + do_enq.eq(p_ready_o & p_valid_i), # write conditions ok + do_deq.eq(n_ready_i & n_valid_o), # read conditions ok + + # set readable and writable (NOTE: see pipe mode below) + n_valid_o.eq(~empty), # cannot read if empty! + p_ready_o.eq(~full), # cannot write if full! 
+ + # set up memory and connect to input and output + ram_write.addr.eq(enq_ptr), + ram_write.data.eq(enq_data), + ram_write.en.eq(do_enq), + ram_read.addr.eq(deq_ptr), + deq_data.eq(ram_read.data) # NOTE: overridden in fwft mode + ] + + # under write conditions, SRAM write-pointer moves on next clock + with m.If(do_enq): + m.d.sync += enq_ptr.eq(Mux(enq_max, 0, enq_ptr+1)) + + # under read conditions, SRAM read-pointer moves on next clock + with m.If(do_deq): + m.d.sync += deq_ptr.eq(Mux(deq_max, 0, deq_ptr+1)) + + # if read-but-not-write or write-but-not-read, maybe_full set + with m.If(do_enq != do_deq): + m.d.sync += maybe_full.eq(do_enq) + + # first-word fall-through: same as "flow" parameter in Chisel3 Queue + # basically instead of relying on the Memory characteristics (which + # in FPGAs do not have write-through), then when the queue is empty + # take the output directly from the input, i.e. *bypass* the SRAM. + # this done combinatorially to give the exact same characteristics + # as Memory "write-through"... without relying on a changing API + if self.fwft: + with m.If(p_valid_i): + m.d.comb += n_valid_o.eq(1) + with m.If(empty): + m.d.comb += deq_data.eq(enq_data) + m.d.comb += do_deq.eq(0) + with m.If(n_ready_i): + m.d.comb += do_enq.eq(0) + + # pipe mode: if next stage says it's ready (readable), we + # *must* declare the input ready (writeable). 
+ if self.pipe: + with m.If(n_ready_i): + m.d.comb += p_ready_o.eq(1) + + # set the count (available free space), optimise on power-of-two + if self.depth == 1 << ptr_width: # is depth a power of 2 + m.d.comb += self.level.eq( + Mux(maybe_full & ptr_match, self.depth, 0) | ptr_diff) + else: + m.d.comb += self.level.eq(Mux(ptr_match, + Mux(maybe_full, self.depth, 0), + Mux(deq_ptr > enq_ptr, + self.depth + ptr_diff, + ptr_diff))) + + return m + + +if __name__ == "__main__": + reg_stage = Queue(1, 1, pipe=True) + break_ready_chain_stage = Queue(1, 1, pipe=True, fwft=True) + m = Module() + ports = [] + + def queue_ports(queue, name_prefix): + retval = [] + for name in ["level", + "dout", + "readable", + "writable"]: + port = getattr(queue, name) + signal = Signal(port.shape(), name=name_prefix+name) + m.d.comb += signal.eq(port) + retval.append(signal) + for name in ["re", + "din", + "we"]: + port = getattr(queue, name) + signal = Signal(port.shape(), name=name_prefix+name) + m.d.comb += port.eq(signal) + retval.append(signal) + return retval + + m.submodules.reg_stage = reg_stage + ports += queue_ports(reg_stage, "reg_stage_") + m.submodules.break_ready_chain_stage = break_ready_chain_stage + ports += queue_ports(break_ready_chain_stage, "break_ready_chain_stage_") + main(m, ports=ports) diff --git a/src/ieee754/add/record_experiment.py b/src/ieee754/add/record_experiment.py new file mode 100644 index 00000000..1789c3bd --- /dev/null +++ b/src/ieee754/add/record_experiment.py @@ -0,0 +1,106 @@ +from nmigen import Module, Signal, Mux, Const, Elaboratable +from nmigen.hdl.rec import Record, Layout, DIR_NONE +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil +from nmigen.compat.fhdl.bitcontainer import value_bits_sign +from singlepipe import cat, RecordObject + + +class RecordTest: + + def __init__(self): + self.r1 = RecordObject() + self.r1.sig1 = Signal(16) + self.r1.r2 = RecordObject() + self.r1.r2.sig2 = Signal(16) + self.r1.r3 = 
RecordObject() + self.r1.r3.sig3 = Signal(16) + self.sig123 = Signal(48) + + def elaborate(self, platform): + m = Module() + + sig1 = Signal(16) + m.d.comb += sig1.eq(self.r1.sig1) + sig2 = Signal(16) + m.d.comb += sig2.eq(self.r1.r2.sig2) + + print (self.r1.fields) + print (self.r1.shape()) + print ("width", len(self.r1)) + m.d.comb += self.sig123.eq(cat(self.r1)) + + return m + + +def testbench(dut): + yield dut.r1.sig1.eq(5) + yield dut.r1.r2.sig2.eq(10) + yield dut.r1.r3.sig3.eq(1) + + sig1 = yield dut.r1.sig1 + assert sig1 == 5 + sig2 = yield dut.r1.r2.sig2 + assert sig2 == 10 + + yield + + sig123 = yield dut.sig123 + print ("sig123", hex(sig123)) + assert sig123 == 0x1000a0005 + + + +class RecordTest2(Elaboratable): + + def __init__(self): + self.r1 = RecordObject() + self.r1.sig1 = Signal(16) + self.r1.r2 = RecordObject() + self.r1.r2.sig2 = Signal(16) + self.r1.r3 = RecordObject() + self.r1.r3.sig3 = Signal(16) + self.sig123 = Signal(48) + + def elaborate(self, platform): + m = Module() + + m.d.comb += cat(self.r1).eq(self.sig123) + + return m + + +def testbench2(dut): + + sig123 = yield dut.sig123.eq(0x1000a0005) + + yield + + sig1 = yield dut.r1.sig1 + assert sig1 == 5 + sig2 = yield dut.r1.r2.sig2 + assert sig2 == 10 + sig3 = yield dut.r1.r3.sig3 + assert sig3 == 1 + + + +###################################################################### +# Unit Tests +###################################################################### + +if __name__ == '__main__': + print ("test 1") + dut = RecordTest() + run_simulation(dut, testbench(dut), vcd_name="test_record1.vcd") + vl = rtlil.convert(dut, ports=[dut.sig123, dut.r1.sig1, dut.r1.r2.sig2]) + with open("test_record1.il", "w") as f: + f.write(vl) + + print ("test 2") + dut = RecordTest2() + run_simulation(dut, testbench2(dut), vcd_name="test_record2.vcd") + vl = rtlil.convert(dut, ports=[dut.sig123, dut.r1.sig1, dut.r1.r2.sig2]) + with open("test_record2.il", "w") as f: + f.write(vl) + diff --git 
a/src/ieee754/add/rstation_row.py b/src/ieee754/add/rstation_row.py new file mode 100644 index 00000000..aeb58732 --- /dev/null +++ b/src/ieee754/add/rstation_row.py @@ -0,0 +1,39 @@ +from nmigen import Signal, Cat, Const, Mux, Module + +from nmigen.cli import main, verilog + +from fpbase import FPNumIn, FPNumOut, FPOp, Overflow, FPBase, FPNumBase +from fpbase import MultiShiftRMerge + +class ReservationStationRow: + + def __init__(self, width, id_wid): + """ Reservation Station row + + * width: bit-width of IEEE754. supported: 16, 32, 64 + * id_wid: an identifier to be passed through to the FunctionUnit + """ + self.width = width + + self.in_a = Signal(width) + self.in_b = Signal(width) + self.id_wid = id_wid + self.out_z = Signal(width) + + def elaborate(self, platform=None): + """ creates the HDL code-fragment for ReservationStationRow + """ + m = Module() + + return m + + +if __name__ == "__main__": + rs = ReservationStationRow(width=32, id_wid=Const(1,4)) + main(alu, ports=[rs.in_a, rs.in_b, rs.out_z] + + # works... but don't use, just do "python fname.py convert -t v" + #print (verilog.convert(alu, ports=[ + # ports=alu.in_a.ports() + \ + # alu.in_b.ports() + \ + # alu.out_z.ports()) diff --git a/src/ieee754/add/singlepipe.py b/src/ieee754/add/singlepipe.py new file mode 100644 index 00000000..68b62e43 --- /dev/null +++ b/src/ieee754/add/singlepipe.py @@ -0,0 +1,829 @@ +""" Pipeline API. For multi-input and multi-output variants, see multipipe. + + Associated development bugs: + * http://bugs.libre-riscv.org/show_bug.cgi?id=64 + * http://bugs.libre-riscv.org/show_bug.cgi?id=57 + + Important: see Stage API (stageapi.py) in combination with below + + RecordBasedStage: + ---------------- + + A convenience class that takes an input shape, output shape, a + "processing" function and an optional "setup" function. Honestly + though, there's not much more effort to just... create a class + that returns a couple of Records (see ExampleAddRecordStage in + examples). 
+ + PassThroughStage: + ---------------- + + A convenience class that takes a single function as a parameter, + that is chain-called to create the exact same input and output spec. + It has a process() function that simply returns its input. + + Instances of this class are completely redundant if handed to + StageChain, however when passed to UnbufferedPipeline they + can be used to introduce a single clock delay. + + ControlBase: + ----------- + + The base class for pipelines. Contains previous and next ready/valid/data. + Also has an extremely useful "connect" function that can be used to + connect a chain of pipelines and present the exact same prev/next + ready/valid/data API. + + Note: pipelines basically do not become pipelines as such until + handed to a derivative of ControlBase. ControlBase itself is *not* + strictly considered a pipeline class. Wishbone and AXI4 (master or + slave) could be derived from ControlBase, for example. + UnbufferedPipeline: + ------------------ + + A simple stalling clock-synchronised pipeline that has no buffering + (unlike BufferedHandshake). Data flows on *every* clock cycle when + the conditions are right (this is nominally when the input is valid + and the output is ready). + + A stall anywhere along the line will result in a stall back-propagating + down the entire chain. The BufferedHandshake by contrast will buffer + incoming data, allowing previous stages one clock cycle's grace before + also having to stall. + + An advantage of the UnbufferedPipeline over the Buffered one is + that the amount of logic needed (number of gates) is greatly + reduced (no second set of buffers basically) + + The disadvantage of the UnbufferedPipeline is that the valid/ready + logic, if chained together, is *combinatorial*, resulting in + progressively larger gate delay. + + PassThroughHandshake: + ------------------ + + A Control class that introduces a single clock delay, passing its + data through unaltered. 
Unlike RegisterPipeline (which relies + on UnbufferedPipeline and PassThroughStage) it handles ready/valid + itself. + + RegisterPipeline: + ---------------- + + A convenience class that, because UnbufferedPipeline introduces a single + clock delay, when its stage is a PassThroughStage, it results in a Pipeline + stage that, duh, delays its (unmodified) input by one clock cycle. + + BufferedHandshake: + ---------------- + + nmigen implementation of buffered pipeline stage, based on zipcpu: + https://zipcpu.com/blog/2017/08/14/strategies-for-pipelining.html + + this module requires quite a bit of thought to understand how it works + (and why it is needed in the first place). reading the above is + *strongly* recommended. + + unlike john dawson's IEEE754 FPU STB/ACK signalling, which requires + the STB / ACK signals to raise and lower (on separate clocks) before + data may proceeed (thus only allowing one piece of data to proceed + on *ALTERNATE* cycles), the signalling here is a true pipeline + where data will flow on *every* clock when the conditions are right. + + input acceptance conditions are when: + * incoming previous-stage strobe (p.valid_i) is HIGH + * outgoing previous-stage ready (p.ready_o) is LOW + + output transmission conditions are when: + * outgoing next-stage strobe (n.valid_o) is HIGH + * outgoing next-stage ready (n.ready_i) is LOW + + the tricky bit is when the input has valid data and the output is not + ready to accept it. if it wasn't for the clock synchronisation, it + would be possible to tell the input "hey don't send that data, we're + not ready". unfortunately, it's not possible to "change the past": + the previous stage *has no choice* but to pass on its data. + + therefore, the incoming data *must* be accepted - and stored: that + is the responsibility / contract that this stage *must* accept. + on the same clock, it's possible to tell the input that it must + not send any more data. this is the "stall" condition. 
+ + we now effectively have *two* possible pieces of data to "choose" from: + the buffered data, and the incoming data. the decision as to which + to process and output is based on whether we are in "stall" or not. + i.e. when the next stage is no longer ready, the output comes from + the buffer if a stall had previously occurred, otherwise it comes + direct from processing the input. + + this allows us to respect a synchronous "travelling STB" with what + dan calls a "buffered handshake". + + it's quite a complex state machine! + + SimpleHandshake + --------------- + + Synchronised pipeline, Based on: + https://github.com/ZipCPU/dbgbus/blob/master/hexbus/rtl/hbdeword.v +""" + +from nmigen import Signal, Mux, Module, Elaboratable +from nmigen.cli import verilog, rtlil +from nmigen.hdl.rec import Record + +from queue import Queue +import inspect + +from iocontrol import (PrevControl, NextControl, Object, RecordObject) +from stageapi import (_spec, StageCls, Stage, StageChain, StageHelper) +import nmoperator + + +class RecordBasedStage(Stage): + """ convenience class which provides a Records-based layout. + honestly it's a lot easier just to create a direct Records-based + class (see ExampleAddRecordStage) + """ + def __init__(self, in_shape, out_shape, processfn, setupfn=None): + self.in_shape = in_shape + self.out_shape = out_shape + self.__process = processfn + self.__setup = setupfn + def ispec(self): return Record(self.in_shape) + def ospec(self): return Record(self.out_shape) + def process(seif, i): return self.__process(i) + def setup(seif, m, i): return self.__setup(m, i) + + +class PassThroughStage(StageCls): + """ a pass-through stage with its input data spec identical to its output, + and "passes through" its data from input to output (does nothing). + + use this basically to explicitly make any data spec Stage-compliant. + (many APIs would potentially use a static "wrap" method in e.g. 
+ StageCls to achieve a similar effect) + """ + def __init__(self, iospecfn): self.iospecfn = iospecfn + def ispec(self): return self.iospecfn() + def ospec(self): return self.iospecfn() + + +class ControlBase(StageHelper, Elaboratable): + """ Common functions for Pipeline API. Note: a "pipeline stage" only + exists (conceptually) when a ControlBase derivative is handed + a Stage (combinatorial block) + + NOTE: ControlBase derives from StageHelper, making it accidentally + compliant with the Stage API. Using those functions directly + *BYPASSES* a ControlBase instance ready/valid signalling, which + clearly should not be done without a really, really good reason. + """ + def __init__(self, stage=None, in_multi=None, stage_ctl=False): + """ Base class containing ready/valid/data to previous and next stages + + * p: contains ready/valid to the previous stage + * n: contains ready/valid to the next stage + + Except when calling Controlbase.connect(), user must also: + * add data_i member to PrevControl (p) and + * add data_o member to NextControl (n) + Calling ControlBase._new_data is a good way to do that. + """ + StageHelper.__init__(self, stage) + + # set up input and output IO ACK (prev/next ready/valid) + self.p = PrevControl(in_multi, stage_ctl) + self.n = NextControl(stage_ctl) + + # set up the input and output data + if stage is not None: + self._new_data("data") + + def _new_data(self, name): + """ allocates new data_i and data_o + """ + self.p.data_i, self.n.data_o = self.new_specs(name) + + @property + def data_r(self): + return self.process(self.p.data_i) + + def connect_to_next(self, nxt): + """ helper function to connect to the next stage data/valid/ready. + """ + return self.n.connect_to_next(nxt.p) + + def _connect_in(self, prev): + """ internal helper function to connect stage to an input source. + do not use to connect stage-to-stage! 
+ """ + return self.p._connect_in(prev.p) + + def _connect_out(self, nxt): + """ internal helper function to connect stage to an output source. + do not use to connect stage-to-stage! + """ + return self.n._connect_out(nxt.n) + + def connect(self, pipechain): + """ connects a chain (list) of Pipeline instances together and + links them to this ControlBase instance: + + in <----> self <---> out + | ^ + v | + [pipe1, pipe2, pipe3, pipe4] + | ^ | ^ | ^ + v | v | v | + out---in out--in out---in + + Also takes care of allocating data_i/data_o, by looking up + the data spec for each end of the pipechain. i.e It is NOT + necessary to allocate self.p.data_i or self.n.data_o manually: + this is handled AUTOMATICALLY, here. + + Basically this function is the direct equivalent of StageChain, + except that unlike StageChain, the Pipeline logic is followed. + + Just as StageChain presents an object that conforms to the + Stage API from a list of objects that also conform to the + Stage API, an object that calls this Pipeline connect function + has the exact same pipeline API as the list of pipline objects + it is called with. + + Thus it becomes possible to build up larger chains recursively. + More complex chains (multi-input, multi-output) will have to be + done manually. 
+ + Argument: + + * :pipechain: - a sequence of ControlBase-derived classes + (must be one or more in length) + + Returns: + + * a list of eq assignments that will need to be added in + an elaborate() to m.d.comb + """ + assert len(pipechain) > 0, "pipechain must be non-zero length" + assert self.stage is None, "do not use connect with a stage" + eqs = [] # collated list of assignment statements + + # connect inter-chain + for i in range(len(pipechain)-1): + pipe1 = pipechain[i] # earlier + pipe2 = pipechain[i+1] # later (by 1) + eqs += pipe1.connect_to_next(pipe2) # earlier n to later p + + # connect front and back of chain to ourselves + front = pipechain[0] # first in chain + end = pipechain[-1] # last in chain + self.set_specs(front, end) # sets up ispec/ospec functions + self._new_data("chain") # NOTE: REPLACES existing data + eqs += front._connect_in(self) # front p to our p + eqs += end._connect_out(self) # end n to our n + + return eqs + + def set_input(self, i): + """ helper function to set the input data (used in unit tests) + """ + return nmoperator.eq(self.p.data_i, i) + + def __iter__(self): + yield from self.p # yields ready/valid/data (data also gets yielded) + yield from self.n # ditto + + def ports(self): + return list(self) + + def elaborate(self, platform): + """ handles case where stage has dynamic ready/valid functions + """ + m = Module() + m.submodules.p = self.p + m.submodules.n = self.n + + self.setup(m, self.p.data_i) + + if not self.p.stage_ctl: + return m + + # intercept the previous (outgoing) "ready", combine with stage ready + m.d.comb += self.p.s_ready_o.eq(self.p._ready_o & self.stage.d_ready) + + # intercept the next (incoming) "ready" and combine it with data valid + sdv = self.stage.d_valid(self.n.ready_i) + m.d.comb += self.n.d_valid.eq(self.n.ready_i & sdv) + + return m + + +class BufferedHandshake(ControlBase): + """ buffered pipeline stage. data and strobe signals travel in sync. 
+ if ever the input is ready and the output is not, processed data + is shunted in a temporary register. + + Argument: stage. see Stage API above + + stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 + stage-1 p.ready_o <>in stage n.data_o out>> stage+1 + | | + process --->----^ + | | + +-- r_data ->-+ + + input data p.data_i is read (only), is processed and goes into an + intermediate result store [process()]. this is updated combinatorially. + + in a non-stall condition, the intermediate result will go into the + output (update_output). however if ever there is a stall, it goes + into r_data instead [update_buffer()]. + + when the non-stall condition is released, r_data is the first + to be transferred to the output [flush_buffer()], and the stall + condition cleared. + + on the next cycle (as long as stall is not raised again) the + input may begin to be processed and transferred directly to output. + """ + + def elaborate(self, platform): + self.m = ControlBase.elaborate(self, platform) + + result = _spec(self.stage.ospec, "r_tmp") + r_data = _spec(self.stage.ospec, "r_data") + + # establish some combinatorial temporaries + o_n_validn = Signal(reset_less=True) + n_ready_i = Signal(reset_less=True, name="n_i_rdy_data") + nir_por = Signal(reset_less=True) + nir_por_n = Signal(reset_less=True) + p_valid_i = Signal(reset_less=True) + nir_novn = Signal(reset_less=True) + nirn_novn = Signal(reset_less=True) + por_pivn = Signal(reset_less=True) + npnn = Signal(reset_less=True) + self.m.d.comb += [p_valid_i.eq(self.p.valid_i_test), + o_n_validn.eq(~self.n.valid_o), + n_ready_i.eq(self.n.ready_i_test), + nir_por.eq(n_ready_i & self.p._ready_o), + nir_por_n.eq(n_ready_i & ~self.p._ready_o), + nir_novn.eq(n_ready_i | o_n_validn), + nirn_novn.eq(~n_ready_i & o_n_validn), + npnn.eq(nir_por | nirn_novn), + por_pivn.eq(self.p._ready_o & ~p_valid_i) + ] + + # store result of processing in combinatorial temporary + self.m.d.comb += nmoperator.eq(result, self.data_r) + + # if 
not in stall condition, update the temporary register + with self.m.If(self.p.ready_o): # not stalled + self.m.d.sync += nmoperator.eq(r_data, result) # update buffer + + # data pass-through conditions + with self.m.If(npnn): + data_o = self._postprocess(result) # XXX TBD, does nothing right now + self.m.d.sync += [self.n.valid_o.eq(p_valid_i), # valid if p_valid + nmoperator.eq(self.n.data_o, data_o), # update out + ] + # buffer flush conditions (NOTE: can override data passthru conditions) + with self.m.If(nir_por_n): # not stalled + # Flush the [already processed] buffer to the output port. + data_o = self._postprocess(r_data) # XXX TBD, does nothing right now + self.m.d.sync += [self.n.valid_o.eq(1), # reg empty + nmoperator.eq(self.n.data_o, data_o), # flush + ] + # output ready conditions + self.m.d.sync += self.p._ready_o.eq(nir_novn | por_pivn) + + return self.m + + +class SimpleHandshake(ControlBase): + """ simple handshake control. data and strobe signals travel in sync. + implements the protocol used by Wishbone and AXI4. + + Argument: stage. 
see Stage API above + + stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 + stage-1 p.ready_o <>in stage n.data_o out>> stage+1 + | | + +--process->--^ + Truth Table + + Inputs Temporary Output Data + ------- ---------- ----- ---- + P P N N PiV& ~NiR& N P + i o i o PoR NoV o o + V R R V V R + + ------- - - - - + 0 0 0 0 0 0 >0 0 reg + 0 0 0 1 0 1 >1 0 reg + 0 0 1 0 0 0 0 1 process(data_i) + 0 0 1 1 0 0 0 1 process(data_i) + ------- - - - - + 0 1 0 0 0 0 >0 0 reg + 0 1 0 1 0 1 >1 0 reg + 0 1 1 0 0 0 0 1 process(data_i) + 0 1 1 1 0 0 0 1 process(data_i) + ------- - - - - + 1 0 0 0 0 0 >0 0 reg + 1 0 0 1 0 1 >1 0 reg + 1 0 1 0 0 0 0 1 process(data_i) + 1 0 1 1 0 0 0 1 process(data_i) + ------- - - - - + 1 1 0 0 1 0 1 0 process(data_i) + 1 1 0 1 1 1 1 0 process(data_i) + 1 1 1 0 1 0 1 1 process(data_i) + 1 1 1 1 1 0 1 1 process(data_i) + ------- - - - - + """ + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + r_busy = Signal() + result = _spec(self.stage.ospec, "r_tmp") + + # establish some combinatorial temporaries + n_ready_i = Signal(reset_less=True, name="n_i_rdy_data") + p_valid_i_p_ready_o = Signal(reset_less=True) + p_valid_i = Signal(reset_less=True) + m.d.comb += [p_valid_i.eq(self.p.valid_i_test), + n_ready_i.eq(self.n.ready_i_test), + p_valid_i_p_ready_o.eq(p_valid_i & self.p.ready_o), + ] + + # store result of processing in combinatorial temporary + m.d.comb += nmoperator.eq(result, self.data_r) + + # previous valid and ready + with m.If(p_valid_i_p_ready_o): + data_o = self._postprocess(result) # XXX TBD, does nothing right now + m.d.sync += [r_busy.eq(1), # output valid + nmoperator.eq(self.n.data_o, data_o), # update output + ] + # previous invalid or not ready, however next is accepting + with m.Elif(n_ready_i): + data_o = self._postprocess(result) # XXX TBD, does nothing right now + m.d.sync += [nmoperator.eq(self.n.data_o, data_o)] + # TODO: could still send data here (if there was any) + #m.d.sync += 
self.n.valid_o.eq(0) # ...so set output invalid + m.d.sync += r_busy.eq(0) # ...so set output invalid + + m.d.comb += self.n.valid_o.eq(r_busy) + # if next is ready, so is previous + m.d.comb += self.p._ready_o.eq(n_ready_i) + + return self.m + + +class UnbufferedPipeline(ControlBase): + """ A simple pipeline stage with single-clock synchronisation + and two-way valid/ready synchronised signalling. + + Note that a stall in one stage will result in the entire pipeline + chain stalling. + + Also that unlike BufferedHandshake, the valid/ready signalling does NOT + travel synchronously with the data: the valid/ready signalling + combines in a *combinatorial* fashion. Therefore, a long pipeline + chain will lengthen propagation delays. + + Argument: stage. see Stage API, above + + stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 + stage-1 p.ready_o <>in stage n.data_o out>> stage+1 + | | + r_data result + | | + +--process ->-+ + + Attributes: + ----------- + p.data_i : StageInput, shaped according to ispec + The pipeline input + p.data_o : StageOutput, shaped according to ospec + The pipeline output + r_data : input_shape according to ispec + A temporary (buffered) copy of a prior (valid) input. + This is HELD if the output is not ready. It is updated + SYNCHRONOUSLY. + result: output_shape according to ospec + The output of the combinatorial logic. it is updated + COMBINATORIALLY (no clock dependence). 
+ + Truth Table + + Inputs Temp Output Data + ------- - ----- ---- + P P N N ~NiR& N P + i o i o NoV o o + V R R V V R + + ------- - - - + 0 0 0 0 0 0 1 reg + 0 0 0 1 1 1 0 reg + 0 0 1 0 0 0 1 reg + 0 0 1 1 0 0 1 reg + ------- - - - + 0 1 0 0 0 0 1 reg + 0 1 0 1 1 1 0 reg + 0 1 1 0 0 0 1 reg + 0 1 1 1 0 0 1 reg + ------- - - - + 1 0 0 0 0 1 1 reg + 1 0 0 1 1 1 0 reg + 1 0 1 0 0 1 1 reg + 1 0 1 1 0 1 1 reg + ------- - - - + 1 1 0 0 0 1 1 process(data_i) + 1 1 0 1 1 1 0 process(data_i) + 1 1 1 0 0 1 1 process(data_i) + 1 1 1 1 0 1 1 process(data_i) + ------- - - - + + Note: PoR is *NOT* involved in the above decision-making. + """ + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + data_valid = Signal() # is data valid or not + r_data = _spec(self.stage.ospec, "r_tmp") # output type + + # some temporaries + p_valid_i = Signal(reset_less=True) + pv = Signal(reset_less=True) + buf_full = Signal(reset_less=True) + m.d.comb += p_valid_i.eq(self.p.valid_i_test) + m.d.comb += pv.eq(self.p.valid_i & self.p.ready_o) + m.d.comb += buf_full.eq(~self.n.ready_i_test & data_valid) + + m.d.comb += self.n.valid_o.eq(data_valid) + m.d.comb += self.p._ready_o.eq(~data_valid | self.n.ready_i_test) + m.d.sync += data_valid.eq(p_valid_i | buf_full) + + with m.If(pv): + m.d.sync += nmoperator.eq(r_data, self.data_r) + data_o = self._postprocess(r_data) # XXX TBD, does nothing right now + m.d.comb += nmoperator.eq(self.n.data_o, data_o) + + return self.m + +class UnbufferedPipeline2(ControlBase): + """ A simple pipeline stage with single-clock synchronisation + and two-way valid/ready synchronised signalling. + + Note that a stall in one stage will result in the entire pipeline + chain stalling. + + Also that unlike BufferedHandshake, the valid/ready signalling does NOT + travel synchronously with the data: the valid/ready signalling + combines in a *combinatorial* fashion. Therefore, a long pipeline + chain will lengthen propagation delays. 
+ + Argument: stage. see Stage API, above + + stage-1 p.valid_i >>in stage n.valid_o out>> stage+1 + stage-1 p.ready_o <>in stage n.data_o out>> stage+1 + | | | + +- process-> buf <-+ + Attributes: + ----------- + p.data_i : StageInput, shaped according to ispec + The pipeline input + p.data_o : StageOutput, shaped according to ospec + The pipeline output + buf : output_shape according to ospec + A temporary (buffered) copy of a valid output + This is HELD if the output is not ready. It is updated + SYNCHRONOUSLY. + + Inputs Temp Output Data + ------- - ----- + P P N N ~NiR& N P (buf_full) + i o i o NoV o o + V R R V V R + + ------- - - - + 0 0 0 0 0 0 1 process(data_i) + 0 0 0 1 1 1 0 reg (odata, unchanged) + 0 0 1 0 0 0 1 process(data_i) + 0 0 1 1 0 0 1 process(data_i) + ------- - - - + 0 1 0 0 0 0 1 process(data_i) + 0 1 0 1 1 1 0 reg (odata, unchanged) + 0 1 1 0 0 0 1 process(data_i) + 0 1 1 1 0 0 1 process(data_i) + ------- - - - + 1 0 0 0 0 1 1 process(data_i) + 1 0 0 1 1 1 0 reg (odata, unchanged) + 1 0 1 0 0 1 1 process(data_i) + 1 0 1 1 0 1 1 process(data_i) + ------- - - - + 1 1 0 0 0 1 1 process(data_i) + 1 1 0 1 1 1 0 reg (odata, unchanged) + 1 1 1 0 0 1 1 process(data_i) + 1 1 1 1 0 1 1 process(data_i) + ------- - - - + + Note: PoR is *NOT* involved in the above decision-making. 
+ """ + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + buf_full = Signal() # is data valid or not + buf = _spec(self.stage.ospec, "r_tmp") # output type + + # some temporaries + p_valid_i = Signal(reset_less=True) + m.d.comb += p_valid_i.eq(self.p.valid_i_test) + + m.d.comb += self.n.valid_o.eq(buf_full | p_valid_i) + m.d.comb += self.p._ready_o.eq(~buf_full) + m.d.sync += buf_full.eq(~self.n.ready_i_test & self.n.valid_o) + + data_o = Mux(buf_full, buf, self.data_r) + data_o = self._postprocess(data_o) # XXX TBD, does nothing right now + m.d.comb += nmoperator.eq(self.n.data_o, data_o) + m.d.sync += nmoperator.eq(buf, self.n.data_o) + + return self.m + + +class PassThroughHandshake(ControlBase): + """ A control block that delays by one clock cycle. + + Inputs Temporary Output Data + ------- ------------------ ----- ---- + P P N N PiV& PiV| NiR| pvr N P (pvr) + i o i o PoR ~PoR ~NoV o o + V R R V V R + + ------- - - - - - - + 0 0 0 0 0 1 1 0 1 1 odata (unchanged) + 0 0 0 1 0 1 0 0 1 0 odata (unchanged) + 0 0 1 0 0 1 1 0 1 1 odata (unchanged) + 0 0 1 1 0 1 1 0 1 1 odata (unchanged) + ------- - - - - - - + 0 1 0 0 0 0 1 0 0 1 odata (unchanged) + 0 1 0 1 0 0 0 0 0 0 odata (unchanged) + 0 1 1 0 0 0 1 0 0 1 odata (unchanged) + 0 1 1 1 0 0 1 0 0 1 odata (unchanged) + ------- - - - - - - + 1 0 0 0 0 1 1 1 1 1 process(in) + 1 0 0 1 0 1 0 0 1 0 odata (unchanged) + 1 0 1 0 0 1 1 1 1 1 process(in) + 1 0 1 1 0 1 1 1 1 1 process(in) + ------- - - - - - - + 1 1 0 0 1 1 1 1 1 1 process(in) + 1 1 0 1 1 1 0 0 1 0 odata (unchanged) + 1 1 1 0 1 1 1 1 1 1 process(in) + 1 1 1 1 1 1 1 1 1 1 process(in) + ------- - - - - - - + + """ + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + r_data = _spec(self.stage.ospec, "r_tmp") # output type + + # temporaries + p_valid_i = Signal(reset_less=True) + pvr = Signal(reset_less=True) + m.d.comb += p_valid_i.eq(self.p.valid_i_test) + m.d.comb += pvr.eq(p_valid_i & 
self.p.ready_o) + + m.d.comb += self.p.ready_o.eq(~self.n.valid_o | self.n.ready_i_test) + m.d.sync += self.n.valid_o.eq(p_valid_i | ~self.p.ready_o) + + odata = Mux(pvr, self.data_r, r_data) + m.d.sync += nmoperator.eq(r_data, odata) + r_data = self._postprocess(r_data) # XXX TBD, does nothing right now + m.d.comb += nmoperator.eq(self.n.data_o, r_data) + + return m + + +class RegisterPipeline(UnbufferedPipeline): + """ A pipeline stage that delays by one clock cycle, creating a + sync'd latch out of data_o and valid_o as an indirect byproduct + of using PassThroughStage + """ + def __init__(self, iospecfn): + UnbufferedPipeline.__init__(self, PassThroughStage(iospecfn)) + + +class FIFOControl(ControlBase): + """ FIFO Control. Uses Queue to store data, coincidentally + happens to have same valid/ready signalling as Stage API. + + data_i -> fifo.din -> FIFO -> fifo.dout -> data_o + """ + def __init__(self, depth, stage, in_multi=None, stage_ctl=False, + fwft=True, pipe=False): + """ FIFO Control + + * :depth: number of entries in the FIFO + * :stage: data processing block + * :fwft: first word fall-thru mode (non-fwft introduces delay) + * :pipe: specifies pipe mode. + + when fwft = True it indicates that transfers may occur + combinatorially through stage processing in the same clock cycle. + This requires that the Stage be a Moore FSM: + https://en.wikipedia.org/wiki/Moore_machine + + when fwft = False it indicates that all output signals are + produced only from internal registers or memory, i.e. that the + Stage is a Mealy FSM: + https://en.wikipedia.org/wiki/Mealy_machine + + data is processed (and located) as follows: + + self.p self.stage temp fn temp fn temp fp self.n + data_i->process()->result->cat->din.FIFO.dout->cat(data_o) + + yes, really: cat produces a Cat() which can be assigned to. 
+ this is how the FIFO gets de-catted without needing a de-cat + function + """ + self.fwft = fwft + self.pipe = pipe + self.fdepth = depth + ControlBase.__init__(self, stage, in_multi, stage_ctl) + + def elaborate(self, platform): + self.m = m = ControlBase.elaborate(self, platform) + + # make a FIFO with a signal of equal width to the data_o. + (fwidth, _) = nmoperator.shape(self.n.data_o) + fifo = Queue(fwidth, self.fdepth, fwft=self.fwft, pipe=self.pipe) + m.submodules.fifo = fifo + + def processfn(data_i): + # store result of processing in combinatorial temporary + result = _spec(self.stage.ospec, "r_temp") + m.d.comb += nmoperator.eq(result, self.process(data_i)) + return nmoperator.cat(result) + + ## prev: make the FIFO (Queue object) "look" like a PrevControl... + m.submodules.fp = fp = PrevControl() + fp.valid_i, fp._ready_o, fp.data_i = fifo.we, fifo.writable, fifo.din + m.d.comb += fp._connect_in(self.p, fn=processfn) + + # next: make the FIFO (Queue object) "look" like a NextControl... + m.submodules.fn = fn = NextControl() + fn.valid_o, fn.ready_i, fn.data_o = fifo.readable, fifo.re, fifo.dout + connections = fn._connect_out(self.n, fn=nmoperator.cat) + + # ok ok so we can't just do the ready/valid eqs straight: + # first 2 from connections are the ready/valid, 3rd is data. + if self.fwft: + m.d.comb += connections[:2] # combinatorial on next ready/valid + else: + m.d.sync += connections[:2] # non-fwft mode needs sync + data_o = connections[2] # get the data + data_o = self._postprocess(data_o) # XXX TBD, does nothing right now + m.d.comb += data_o + + return m + + +# aka "RegStage". 
+class UnbufferedPipeline(FIFOControl): + def __init__(self, stage, in_multi=None, stage_ctl=False): + FIFOControl.__init__(self, 1, stage, in_multi, stage_ctl, + fwft=True, pipe=False) + +# aka "BreakReadyStage" XXX had to set fwft=True to get it to work +class PassThroughHandshake(FIFOControl): + def __init__(self, stage, in_multi=None, stage_ctl=False): + FIFOControl.__init__(self, 1, stage, in_multi, stage_ctl, + fwft=True, pipe=True) + +# this is *probably* BufferedHandshake, although test #997 now succeeds. +class BufferedHandshake(FIFOControl): + def __init__(self, stage, in_multi=None, stage_ctl=False): + FIFOControl.__init__(self, 2, stage, in_multi, stage_ctl, + fwft=True, pipe=False) + + +""" +# this is *probably* SimpleHandshake (note: memory cell size=0) +class SimpleHandshake(FIFOControl): + def __init__(self, stage, in_multi=None, stage_ctl=False): + FIFOControl.__init__(self, 0, stage, in_multi, stage_ctl, + fwft=True, pipe=False) +""" diff --git a/src/ieee754/add/stageapi.py b/src/ieee754/add/stageapi.py new file mode 100644 index 00000000..9651bf79 --- /dev/null +++ b/src/ieee754/add/stageapi.py @@ -0,0 +1,271 @@ +""" Stage API + + Associated development bugs: + * http://bugs.libre-riscv.org/show_bug.cgi?id=64 + * http://bugs.libre-riscv.org/show_bug.cgi?id=57 + + Stage API: + --------- + + stage requires compliance with a strict API that may be + implemented in several means, including as a static class. + + Stages do not HOLD data, and they definitely do not contain + signalling (ready/valid). They do however specify the FORMAT + of the incoming and outgoing data, and they provide a means to + PROCESS that data (from incoming format to outgoing format). + + Stage Blocks really should be combinatorial blocks (Moore FSMs). 
+ It would be ok to have input come in from sync'd sources + (clock-driven, Mealy FSMs) however by doing so they would no longer + be deterministic, and chaining such blocks with such side-effects + together could result in unexpected, unpredictable, unreproduceable + behaviour. + + So generally to be avoided, then unless you know what you are doing. + https://en.wikipedia.org/wiki/Moore_machine + https://en.wikipedia.org/wiki/Mealy_machine + + the methods of a stage instance must be as follows: + + * ispec() - Input data format specification. Takes a bit of explaining. + The requirements are: something that eventually derives from + nmigen Value must be returned *OR* an iterator or iterable + or sequence (list, tuple etc.) or generator must *yield* + thing(s) that (eventually) derive from the nmigen Value class. + + Complex to state, very simple in practice: + see test_buf_pipe.py for over 25 worked examples. + + * ospec() - Output data format specification. + format requirements identical to ispec. + + * process(m, i) - Optional function for processing ispec-formatted data. + returns a combinatorial block of a result that + may be assigned to the output, by way of the "nmoperator.eq" + function. Note that what is returned here can be + extremely flexible. Even a dictionary can be returned + as long as it has fields that match precisely with the + Record into which its values is intended to be assigned. + Again: see example unit tests for details. + + * setup(m, i) - Optional function for setting up submodules. + may be used for more complex stages, to link + the input (i) to submodules. must take responsibility + for adding those submodules to the module (m). + the submodules must be combinatorial blocks and + must have their inputs and output linked combinatorially. 
+ + Both StageCls (for use with non-static classes) and Stage (for use + by static classes) are abstract classes from which, for convenience + and as a courtesy to other developers, anything conforming to the + Stage API may *choose* to derive. See Liskov Substitution Principle: + https://en.wikipedia.org/wiki/Liskov_substitution_principle + + StageChain: + ---------- + + A useful combinatorial wrapper around stages that chains them together + and then presents a Stage-API-conformant interface. By presenting + the same API as the stages it wraps, it can clearly be used recursively. + + StageHelper: + ---------- + + A convenience wrapper around a Stage-API-compliant "thing" which + complies with the Stage API and provides mandatory versions of + all the optional bits. +""" + +from abc import ABCMeta, abstractmethod +import inspect + +import nmoperator + + +def _spec(fn, name=None): + """ useful function that determines if "fn" has an argument "name". + if so, fn(name) is called otherwise fn() is called. + + means that ispec and ospec can be declared with *or without* + a name argument. normally it would be necessary to have + "ispec(name=None)" to achieve the same effect. + """ + if name is None: + return fn() + varnames = dict(inspect.getmembers(fn.__code__))['co_varnames'] + if 'name' in varnames: + return fn(name=name) + return fn() + + +class StageCls(metaclass=ABCMeta): + """ Class-based "Stage" API. requires instantiation (after derivation) + + see "Stage API" above.. Note: python does *not* require derivation + from this class. All that is required is that the pipelines *have* + the functions listed in this class. Derivation from this class + is therefore merely a "courtesy" to maintainers. 
+ """ + @abstractmethod + def ispec(self): pass # REQUIRED + @abstractmethod + def ospec(self): pass # REQUIRED + #@abstractmethod + #def setup(self, m, i): pass # OPTIONAL + #@abstractmethod + #def process(self, i): pass # OPTIONAL + + +class Stage(metaclass=ABCMeta): + """ Static "Stage" API. does not require instantiation (after derivation) + + see "Stage API" above. Note: python does *not* require derivation + from this class. All that is required is that the pipelines *have* + the functions listed in this class. Derivation from this class + is therefore merely a "courtesy" to maintainers. + """ + @staticmethod + @abstractmethod + def ispec(): pass + + @staticmethod + @abstractmethod + def ospec(): pass + + #@staticmethod + #@abstractmethod + #def setup(m, i): pass + + #@staticmethod + #@abstractmethod + #def process(i): pass + + +class StageHelper(Stage): + """ a convenience wrapper around something that is Stage-API-compliant. + (that "something" may be a static class, for example). 
+ + StageHelper happens to also be compliant with the Stage API, + it differs from the stage that it wraps in that all the "optional" + functions are provided (hence the designation "convenience wrapper") + """ + def __init__(self, stage): + self.stage = stage + self._ispecfn = None + self._ospecfn = None + if stage is not None: + self.set_specs(self, self) + + def ospec(self, name): + assert self._ospecfn is not None + return _spec(self._ospecfn, name) + + def ispec(self, name): + assert self._ispecfn is not None + return _spec(self._ispecfn, name) + + def set_specs(self, p, n): + """ sets up the ispecfn and ospecfn for getting input and output data + """ + if hasattr(p, "stage"): + p = p.stage + if hasattr(n, "stage"): + n = n.stage + self._ispecfn = p.ispec + self._ospecfn = n.ospec + + def new_specs(self, name): + """ allocates new ispec and ospec pair + """ + return (_spec(self.ispec, "%s_i" % name), + _spec(self.ospec, "%s_o" % name)) + + def process(self, i): + if self.stage and hasattr(self.stage, "process"): + return self.stage.process(i) + return i + + def setup(self, m, i): + if self.stage is not None and hasattr(self.stage, "setup"): + self.stage.setup(m, i) + + def _postprocess(self, i): # XXX DISABLED + return i # RETURNS INPUT + if hasattr(self.stage, "postprocess"): + return self.stage.postprocess(i) + return i + + +class StageChain(StageHelper): + """ pass in a list of stages, and they will automatically be + chained together via their input and output specs into a + combinatorial chain, to create one giant combinatorial block. + + the end result basically conforms to the exact same Stage API. + + * input to this class will be the input of the first stage + * output of first stage goes into input of second + * output of second goes into input into third + * ... (etc. etc.) 
+ * the output of this class will be the output of the last stage + + NOTE: whilst this is very similar to ControlBase.connect(), it is + *really* important to appreciate that StageChain is pure + combinatorial and bypasses (does not involve, at all, ready/valid + signalling of any kind). + + ControlBase.connect on the other hand respects, connects, and uses + ready/valid signalling. + + Arguments: + + * :chain: a chain of combinatorial blocks conforming to the Stage API + NOTE: StageChain.ispec and ospect have to have something + to return (beginning and end specs of the chain), + therefore the chain argument must be non-zero length + + * :specallocate: if set, new input and output data will be allocated + and connected (eq'd) to each chained Stage. + in some cases if this is not done, the nmigen warning + "driving from two sources, module is being flattened" + will be issued. + + NOTE: do NOT use StageChain with combinatorial blocks that have + side-effects (state-based / clock-based input) or conditional + (inter-chain) dependencies, unless you really know what you are doing. 
+ """ + def __init__(self, chain, specallocate=False): + assert len(chain) > 0, "stage chain must be non-zero length" + self.chain = chain + StageHelper.__init__(self, None) + self.setup = self._sa_setup if specallocate else self._na_setup + self.set_specs(self.chain[0], self.chain[-1]) + + def _sa_setup(self, m, i): + for (idx, c) in enumerate(self.chain): + if hasattr(c, "setup"): + c.setup(m, i) # stage may have some module stuff + ofn = self.chain[idx].ospec # last assignment survives + o = _spec(ofn, 'chainin%d' % idx) + m.d.comb += nmoperator.eq(o, c.process(i)) # process input into "o" + if idx == len(self.chain)-1: + break + ifn = self.chain[idx+1].ispec # new input on next loop + i = _spec(ifn, 'chainin%d' % (idx+1)) + m.d.comb += nmoperator.eq(i, o) # assign to next input + self.o = o + return self.o # last loop is the output + + def _na_setup(self, m, i): + for (idx, c) in enumerate(self.chain): + if hasattr(c, "setup"): + c.setup(m, i) # stage may have some module stuff + i = o = c.process(i) # store input into "o" + self.o = o + return self.o # last loop is the output + + def process(self, i): + return self.o # conform to Stage API: return last-loop output + + diff --git a/src/ieee754/add/test_add.py b/src/ieee754/add/test_add.py new file mode 100644 index 00000000..989cf482 --- /dev/null +++ b/src/ieee754/add/test_add.py @@ -0,0 +1,78 @@ +from operator import add + +from nmigen import Module, Signal +from nmigen.compat.sim import run_simulation + +from nmigen_add_experiment import FPADD + +from unit_test_single import (get_mantissa, get_exponent, get_sign, is_nan, + is_inf, is_pos_inf, is_neg_inf, + match, get_rs_case, check_rs_case, run_test, + run_edge_cases, run_corner_cases) + +def testbench(dut): + yield from check_rs_case(dut, 0x36093399, 0x7f6a12f1, 0x7f6a12f1) + yield from check_rs_case(dut, 0x006CE3EE, 0x806CE3EC, 0x00000002) + yield from check_rs_case(dut, 0x00000047, 0x80000048, 0x80000001) + yield from check_rs_case(dut, 0x000116C2, 
0x8001170A, 0x80000048) + yield from check_rs_case(dut, 0x7ed01f25, 0xff559e2c, 0xfedb1d33) + yield from check_rs_case(dut, 0, 0, 0) + yield from check_rs_case(dut, 0xFFFFFFFF, 0xC63B800A, 0x7FC00000) + yield from check_rs_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) + #yield from check_rs_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) + yield from check_rs_case(dut, 0x7F800000, 0xFF800000, 0x7FC00000) + yield from check_rs_case(dut, 0x42540000, 0xC2540000, 0x00000000) + yield from check_rs_case(dut, 0xC2540000, 0x42540000, 0x00000000) + yield from check_rs_case(dut, 0xfe34f995, 0xff5d59ad, 0xff800000) + yield from check_rs_case(dut, 0x82471f51, 0x243985f, 0x801c3790) + yield from check_rs_case(dut, 0x40000000, 0xc0000000, 0x00000000) + yield from check_rs_case(dut, 0x3F800000, 0x40000000, 0x40400000) + yield from check_rs_case(dut, 0x40000000, 0x3F800000, 0x40400000) + yield from check_rs_case(dut, 0x447A0000, 0x4488B000, 0x4502D800) + yield from check_rs_case(dut, 0x463B800A, 0x42BA8A3D, 0x463CF51E) + yield from check_rs_case(dut, 0x42BA8A3D, 0x463B800A, 0x463CF51E) + yield from check_rs_case(dut, 0x463B800A, 0xC2BA8A3D, 0x463A0AF6) + yield from check_rs_case(dut, 0xC2BA8A3D, 0x463B800A, 0x463A0AF6) + yield from check_rs_case(dut, 0xC63B800A, 0x42BA8A3D, 0xC63A0AF6) + yield from check_rs_case(dut, 0x42BA8A3D, 0xC63B800A, 0xC63A0AF6) + yield from check_rs_case(dut, 0x7F800000, 0x00000000, 0x7F800000) + yield from check_rs_case(dut, 0x00000000, 0x7F800000, 0x7F800000) + yield from check_rs_case(dut, 0xFF800000, 0x00000000, 0xFF800000) + yield from check_rs_case(dut, 0x00000000, 0xFF800000, 0xFF800000) + yield from check_rs_case(dut, 0x7F800000, 0x7F800000, 0x7F800000) + yield from check_rs_case(dut, 0xFF800000, 0xFF800000, 0xFF800000) + yield from check_rs_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) + yield from check_rs_case(dut, 0x00018643, 0x00FA72A4, 0x00FBF8E7) + yield from check_rs_case(dut, 0x001A2239, 0x00FA72A4, 0x010A4A6E) + yield from 
check_rs_case(dut, 0x3F7FFFFE, 0x3F7FFFFE, 0x3FFFFFFE) + yield from check_rs_case(dut, 0x7EFFFFEE, 0x7EFFFFEE, 0x7F7FFFEE) + yield from check_rs_case(dut, 0x7F7FFFEE, 0xFEFFFFEE, 0x7EFFFFEE) + yield from check_rs_case(dut, 0x7F7FFFEE, 0x756CA884, 0x7F7FFFFD) + yield from check_rs_case(dut, 0x7F7FFFEE, 0x758A0CF8, 0x7F7FFFFF) + yield from check_rs_case(dut, 0x42500000, 0x51A7A358, 0x51A7A358) + yield from check_rs_case(dut, 0x51A7A358, 0x42500000, 0x51A7A358) + yield from check_rs_case(dut, 0x4E5693A4, 0x42500000, 0x4E5693A5) + yield from check_rs_case(dut, 0x42500000, 0x4E5693A4, 0x4E5693A5) + #yield from check_rs_case(dut, 1, 0, 1) + #yield from check_rs_case(dut, 1, 1, 1) + + count = 0 + + #regression tests + stimulus_a = [0x80000000, 0x22cb525a, 0x40000000, 0x83e73d5c, + 0xbf9b1e94, 0x34082401, + 0x5e8ef81, 0x5c75da81, 0x2b017] + stimulus_b = [0xff800001, 0xadd79efa, 0xC0000000, 0x1c800000, + 0xc038ed3a, 0xb328cd45, + 0x114f3db, 0x2f642a39, 0xff3807ab] + yield from run_test(dut, stimulus_a, stimulus_b, add, get_rs_case) + count += len(stimulus_a) + print (count, "vectors passed") + + yield from run_corner_cases(dut, count, add, get_rs_case) + yield from run_edge_cases(dut, count, add, get_rs_case) + +if __name__ == '__main__': + dut = FPADD(width=32, id_wid=5, single_cycle=True) + run_simulation(dut, testbench(dut), vcd_name="test_add.vcd") + diff --git a/src/ieee754/add/test_add16.py b/src/ieee754/add/test_add16.py new file mode 100644 index 00000000..f39ae8ae --- /dev/null +++ b/src/ieee754/add/test_add16.py @@ -0,0 +1,44 @@ +from operator import add + +from nmigen import Module, Signal +from nmigen.compat.sim import run_simulation + +from nmigen_add_experiment import FPADD + +from unit_test_half import (get_mantissa, get_exponent, get_sign, is_nan, + is_inf, is_pos_inf, is_neg_inf, + match, get_case, check_case, run_test, + run_edge_cases, run_corner_cases) + +def testbench(dut): + #yield from check_case(dut, 0x7800, 0xff6f, 0xff6f) + #yield from 
check_case(dut, 0x0000, 0x7c32, 0x7e32) + #yield from check_case(dut, 0x0000, 0x7da9, 0x7fa9) + #yield from check_case(dut, 0x0000, 0x7ea0, 0x7ea0) + #yield from check_case(dut, 0x7c9a, 0x8000, 0x7e9a) + #yield from check_case(dut, 0x7d5e, 0x0000, 0x7f5e) + #yield from check_case(dut, 0x8000, 0x7c8c, 0x7e8c) + #yield from check_case(dut, 0x8000, 0xfc55, 0xfe55) + #yield from check_case(dut, 0x8000, 0x7e1a, 0x7e1a) + + #yield from check_case(dut, 0x8000, 0xfc01, 0x7e00) + yield from check_case(dut, 0xfc00, 0x7c00, 0x7e00) + yield from check_case(dut, 0x8000, 0, 0) + yield from check_case(dut, 0, 0, 0) + + count = 0 + + #regression tests + stimulus_a = [ 0x8000, 0x8000 ] + stimulus_b = [ 0x0000, 0xfc01 ] + yield from run_test(dut, stimulus_a, stimulus_b, add) + count += len(stimulus_a) + print (count, "vectors passed") + + yield from run_corner_cases(dut, count, add) + yield from run_edge_cases(dut, count, add) + +if __name__ == '__main__': + dut = FPADD(width=16, single_cycle=True) + run_simulation(dut, testbench(dut), vcd_name="test_add16.vcd") + diff --git a/src/ieee754/add/test_add64.py b/src/ieee754/add/test_add64.py new file mode 100644 index 00000000..dcca12c6 --- /dev/null +++ b/src/ieee754/add/test_add64.py @@ -0,0 +1,45 @@ +from nmigen import Module, Signal +from nmigen.compat.sim import run_simulation +from operator import add + +from nmigen_add_experiment import FPADD + +import sys +import atexit +from random import randint +from random import seed + +from unit_test_double import (get_mantissa, get_exponent, get_sign, is_nan, + is_inf, is_pos_inf, is_neg_inf, + match, get_case, check_case, run_test, + run_edge_cases, run_corner_cases) + + +def testbench(dut): + yield from check_case(dut, 0, 0, 0) + yield from check_case(dut, 0x3FF0000000000000, 0x4000000000000000, + 0x4008000000000000) + yield from check_case(dut, 0x4000000000000000, 0x3FF0000000000000, + 0x4008000000000000) + yield from check_case(dut, 0x4056C00000000000, 0x4042800000000000, + 
0x4060000000000000) + yield from check_case(dut, 0x4056C00000000000, 0x4042EA3D70A3D70A, + 0x40601A8F5C28F5C2) + + count = 0 + + #regression tests + stimulus_a = [0x3ff00000000000c5, 0xff80000000000000] + stimulus_b = [0xbd28a404211fb72b, 0x7f80000000000000] + yield from run_test(dut, stimulus_a, stimulus_b, add) + count += len(stimulus_a) + print (count, "vectors passed") + + yield from run_corner_cases(dut, count, add) + yield from run_edge_cases(dut, count, add) + + +if __name__ == '__main__': + dut = FPADD(width=64, single_cycle=False) + run_simulation(dut, testbench(dut), vcd_name="test_add64.vcd") + diff --git a/src/ieee754/add/test_add_base.py b/src/ieee754/add/test_add_base.py new file mode 100644 index 00000000..248f719a --- /dev/null +++ b/src/ieee754/add/test_add_base.py @@ -0,0 +1,94 @@ +from random import randint +from operator import add + +from nmigen import Module, Signal +from nmigen.compat.sim import run_simulation + +from nmigen_add_experiment import FPADDBase, FPADDBaseMod + +def get_case(dut, a, b, mid): + yield dut.in_mid.eq(mid) + yield dut.in_a.eq(a) + yield dut.in_b.eq(b) + yield dut.in_t.stb.eq(1) + yield + yield + yield + yield + ack = (yield dut.in_t.ack) + assert ack == 0 + + yield dut.in_t.stb.eq(0) + + yield dut.out_z.ack.eq(1) + + while True: + out_z_stb = (yield dut.out_z.stb) + if not out_z_stb: + yield + continue + out_z = yield dut.out_z.v + out_mid = yield dut.out_mid + yield dut.out_z.ack.eq(0) + yield + break + + return out_z, out_mid + +def check_case(dut, a, b, z, mid=None): + if mid is None: + mid = randint(0, 6) + out_z, out_mid = yield from get_case(dut, a, b, mid) + assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) + assert out_mid == mid, "Output mid 0x%x != expected 0x%x" % (out_mid, mid) + + + +def testbench(dut): + yield from check_case(dut, 0x36093399, 0x7f6a12f1, 0x7f6a12f1) + yield from check_case(dut, 0x006CE3EE, 0x806CE3EC, 0x00000002) + yield from check_case(dut, 0x00000047, 0x80000048, 
0x80000001) + yield from check_case(dut, 0x000116C2, 0x8001170A, 0x80000048) + yield from check_case(dut, 0x7ed01f25, 0xff559e2c, 0xfedb1d33) + yield from check_case(dut, 0, 0, 0) + yield from check_case(dut, 0xFFFFFFFF, 0xC63B800A, 0x7FC00000) + yield from check_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) + #yield from check_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) + yield from check_case(dut, 0x7F800000, 0xFF800000, 0x7FC00000) + yield from check_case(dut, 0x42540000, 0xC2540000, 0x00000000) + yield from check_case(dut, 0xC2540000, 0x42540000, 0x00000000) + yield from check_case(dut, 0xfe34f995, 0xff5d59ad, 0xff800000) + yield from check_case(dut, 0x82471f51, 0x243985f, 0x801c3790) + yield from check_case(dut, 0x40000000, 0xc0000000, 0x00000000) + yield from check_case(dut, 0x3F800000, 0x40000000, 0x40400000) + yield from check_case(dut, 0x40000000, 0x3F800000, 0x40400000) + yield from check_case(dut, 0x447A0000, 0x4488B000, 0x4502D800) + yield from check_case(dut, 0x463B800A, 0x42BA8A3D, 0x463CF51E) + yield from check_case(dut, 0x42BA8A3D, 0x463B800A, 0x463CF51E) + yield from check_case(dut, 0x463B800A, 0xC2BA8A3D, 0x463A0AF6) + yield from check_case(dut, 0xC2BA8A3D, 0x463B800A, 0x463A0AF6) + yield from check_case(dut, 0xC63B800A, 0x42BA8A3D, 0xC63A0AF6) + yield from check_case(dut, 0x42BA8A3D, 0xC63B800A, 0xC63A0AF6) + yield from check_case(dut, 0x7F800000, 0x00000000, 0x7F800000) + yield from check_case(dut, 0x00000000, 0x7F800000, 0x7F800000) + yield from check_case(dut, 0xFF800000, 0x00000000, 0xFF800000) + yield from check_case(dut, 0x00000000, 0xFF800000, 0xFF800000) + yield from check_case(dut, 0x7F800000, 0x7F800000, 0x7F800000) + yield from check_case(dut, 0xFF800000, 0xFF800000, 0xFF800000) + yield from check_case(dut, 0xFF800000, 0x7F800000, 0x7FC00000) + yield from check_case(dut, 0x00018643, 0x00FA72A4, 0x00FBF8E7) + yield from check_case(dut, 0x001A2239, 0x00FA72A4, 0x010A4A6E) + yield from check_case(dut, 0x3F7FFFFE, 0x3F7FFFFE, 0x3FFFFFFE) 
+ yield from check_case(dut, 0x7EFFFFEE, 0x7EFFFFEE, 0x7F7FFFEE) + yield from check_case(dut, 0x7F7FFFEE, 0xFEFFFFEE, 0x7EFFFFEE) + yield from check_case(dut, 0x7F7FFFEE, 0x756CA884, 0x7F7FFFFD) + yield from check_case(dut, 0x7F7FFFEE, 0x758A0CF8, 0x7F7FFFFF) + yield from check_case(dut, 0x42500000, 0x51A7A358, 0x51A7A358) + yield from check_case(dut, 0x51A7A358, 0x42500000, 0x51A7A358) + yield from check_case(dut, 0x4E5693A4, 0x42500000, 0x4E5693A5) + yield from check_case(dut, 0x42500000, 0x4E5693A4, 0x4E5693A5) + +if __name__ == '__main__': + dut = FPADDBaseMod(width=32, id_wid=5, single_cycle=True) + run_simulation(dut, testbench(dut), vcd_name="test_add.vcd") + diff --git a/src/ieee754/add/test_buf_pipe.py b/src/ieee754/add/test_buf_pipe.py new file mode 100644 index 00000000..37f2b31f --- /dev/null +++ b/src/ieee754/add/test_buf_pipe.py @@ -0,0 +1,1308 @@ +""" Unit tests for Buffered and Unbuffered pipelines + + contains useful worked examples of how to use the Pipeline API, + including: + + * Combinatorial Stage "Chaining" + * class-based data stages + * nmigen module-based data stages + * special nmigen module-based data stage, where the stage *is* the module + * Record-based data stages + * static-class data stages + * multi-stage pipelines (and how to connect them) + * how to *use* the pipelines (see Test5) - how to get data in and out + +""" + +from nmigen import Module, Signal, Mux, Const, Elaboratable +from nmigen.hdl.rec import Record +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil + +from example_buf_pipe import ExampleBufPipe, ExampleBufPipeAdd +from example_buf_pipe import ExamplePipeline, UnbufferedPipeline +from example_buf_pipe import ExampleStageCls +from example_buf_pipe import PrevControl, NextControl, BufferedHandshake +from example_buf_pipe import StageChain, ControlBase, StageCls +from singlepipe import UnbufferedPipeline2 +from singlepipe import SimpleHandshake +from singlepipe import 
PassThroughHandshake +from singlepipe import PassThroughStage +from singlepipe import FIFOControl +from singlepipe import RecordObject + +from random import randint, seed + +#seed(4) + + +def check_o_n_valid(dut, val): + o_n_valid = yield dut.n.valid_o + assert o_n_valid == val + +def check_o_n_valid2(dut, val): + o_n_valid = yield dut.n.valid_o + assert o_n_valid == val + + +def tbench(dut): + #yield dut.i_p_rst.eq(1) + yield dut.n.ready_i.eq(0) + #yield dut.p.ready_o.eq(0) + yield + yield + #yield dut.i_p_rst.eq(0) + yield dut.n.ready_i.eq(1) + yield dut.p.data_i.eq(5) + yield dut.p.valid_i.eq(1) + yield + + yield dut.p.data_i.eq(7) + yield from check_o_n_valid(dut, 0) # effects of i_p_valid delayed + yield + yield from check_o_n_valid(dut, 1) # ok *now* i_p_valid effect is felt + + yield dut.p.data_i.eq(2) + yield + yield dut.n.ready_i.eq(0) # begin going into "stall" (next stage says ready) + yield dut.p.data_i.eq(9) + yield + yield dut.p.valid_i.eq(0) + yield dut.p.data_i.eq(12) + yield + yield dut.p.data_i.eq(32) + yield dut.n.ready_i.eq(1) + yield + yield from check_o_n_valid(dut, 1) # buffer still needs to output + yield + yield from check_o_n_valid(dut, 1) # buffer still needs to output + yield + yield from check_o_n_valid(dut, 0) # buffer outputted, *now* we're done. 
+ yield + + +def tbench2(dut): + #yield dut.p.i_rst.eq(1) + yield dut.n.ready_i.eq(0) + #yield dut.p.ready_o.eq(0) + yield + yield + #yield dut.p.i_rst.eq(0) + yield dut.n.ready_i.eq(1) + yield dut.p.data_i.eq(5) + yield dut.p.valid_i.eq(1) + yield + + yield dut.p.data_i.eq(7) + yield from check_o_n_valid2(dut, 0) # effects of i_p_valid delayed 2 clocks + yield + yield from check_o_n_valid2(dut, 0) # effects of i_p_valid delayed 2 clocks + + yield dut.p.data_i.eq(2) + yield + yield from check_o_n_valid2(dut, 1) # ok *now* i_p_valid effect is felt + yield dut.n.ready_i.eq(0) # begin going into "stall" (next stage says ready) + yield dut.p.data_i.eq(9) + yield + yield dut.p.valid_i.eq(0) + yield dut.p.data_i.eq(12) + yield + yield dut.p.data_i.eq(32) + yield dut.n.ready_i.eq(1) + yield + yield from check_o_n_valid2(dut, 1) # buffer still needs to output + yield + yield from check_o_n_valid2(dut, 1) # buffer still needs to output + yield + yield from check_o_n_valid2(dut, 1) # buffer still needs to output + yield + yield from check_o_n_valid2(dut, 0) # buffer outputted, *now* we're done. 
+ yield + yield + yield + + +class Test3: + def __init__(self, dut, resultfn): + self.dut = dut + self.resultfn = resultfn + self.data = [] + for i in range(num_tests): + #data.append(randint(0, 1<<16-1)) + self.data.append(i+1) + self.i = 0 + self.o = 0 + + def send(self): + while self.o != len(self.data): + send_range = randint(0, 3) + for j in range(randint(1,10)): + if send_range == 0: + send = True + else: + send = randint(0, send_range) != 0 + o_p_ready = yield self.dut.p.ready_o + if not o_p_ready: + yield + continue + if send and self.i != len(self.data): + yield self.dut.p.valid_i.eq(1) + yield self.dut.p.data_i.eq(self.data[self.i]) + self.i += 1 + else: + yield self.dut.p.valid_i.eq(0) + yield + + def rcv(self): + while self.o != len(self.data): + stall_range = randint(0, 3) + for j in range(randint(1,10)): + stall = randint(0, stall_range) != 0 + yield self.dut.n.ready_i.eq(stall) + yield + o_n_valid = yield self.dut.n.valid_o + i_n_ready = yield self.dut.n.ready_i_test + if not o_n_valid or not i_n_ready: + continue + data_o = yield self.dut.n.data_o + self.resultfn(data_o, self.data[self.o], self.i, self.o) + self.o += 1 + if self.o == len(self.data): + break + +def resultfn_3(data_o, expected, i, o): + assert data_o == expected + 1, \ + "%d-%d data %x not match %x\n" \ + % (i, o, data_o, expected) + +def data_placeholder(): + data = [] + for i in range(num_tests): + d = PlaceHolder() + d.src1 = randint(0, 1<<16-1) + d.src2 = randint(0, 1<<16-1) + data.append(d) + return data + +def data_dict(): + data = [] + for i in range(num_tests): + data.append({'src1': randint(0, 1<<16-1), + 'src2': randint(0, 1<<16-1)}) + return data + + +class Test5: + def __init__(self, dut, resultfn, data=None, stage_ctl=False): + self.dut = dut + self.resultfn = resultfn + self.stage_ctl = stage_ctl + if data: + self.data = data + else: + self.data = [] + for i in range(num_tests): + self.data.append((randint(0, 1<<16-1), randint(0, 1<<16-1))) + self.i = 0 + self.o = 0 + + 
def send(self): + while self.o != len(self.data): + send_range = randint(0, 3) + for j in range(randint(1,10)): + if send_range == 0: + send = True + else: + send = randint(0, send_range) != 0 + #send = True + o_p_ready = yield self.dut.p.ready_o + if not o_p_ready: + yield + continue + if send and self.i != len(self.data): + yield self.dut.p.valid_i.eq(1) + for v in self.dut.set_input(self.data[self.i]): + yield v + self.i += 1 + else: + yield self.dut.p.valid_i.eq(0) + yield + + def rcv(self): + while self.o != len(self.data): + stall_range = randint(0, 3) + for j in range(randint(1,10)): + ready = randint(0, stall_range) != 0 + #ready = True + yield self.dut.n.ready_i.eq(ready) + yield + o_n_valid = yield self.dut.n.valid_o + i_n_ready = yield self.dut.n.ready_i_test + if not o_n_valid or not i_n_ready: + continue + if isinstance(self.dut.n.data_o, Record): + data_o = {} + dod = self.dut.n.data_o + for k, v in dod.fields.items(): + data_o[k] = yield v + else: + data_o = yield self.dut.n.data_o + self.resultfn(data_o, self.data[self.o], self.i, self.o) + self.o += 1 + if self.o == len(self.data): + break + +def resultfn_5(data_o, expected, i, o): + res = expected[0] + expected[1] + assert data_o == res, \ + "%d-%d data %x not match %s\n" \ + % (i, o, data_o, repr(expected)) + +def tbench4(dut): + data = [] + for i in range(num_tests): + #data.append(randint(0, 1<<16-1)) + data.append(i+1) + i = 0 + o = 0 + while True: + stall = randint(0, 3) != 0 + send = randint(0, 5) != 0 + yield dut.n.ready_i.eq(stall) + o_p_ready = yield dut.p.ready_o + if o_p_ready: + if send and i != len(data): + yield dut.p.valid_i.eq(1) + yield dut.p.data_i.eq(data[i]) + i += 1 + else: + yield dut.p.valid_i.eq(0) + yield + o_n_valid = yield dut.n.valid_o + i_n_ready = yield dut.n.ready_i_test + if o_n_valid and i_n_ready: + data_o = yield dut.n.data_o + assert data_o == data[o] + 2, "%d-%d data %x not match %x\n" \ + % (i, o, data_o, data[o]) + o += 1 + if o == len(data): + break + 
+###################################################################### +# Test 2 and 4 +###################################################################### + +class ExampleBufPipe2(ControlBase): + """ Example of how to do chained pipeline stages. + """ + + def elaborate(self, platform): + m = ControlBase.elaborate(self, platform) + + pipe1 = ExampleBufPipe() + pipe2 = ExampleBufPipe() + + m.submodules.pipe1 = pipe1 + m.submodules.pipe2 = pipe2 + + m.d.comb += self.connect([pipe1, pipe2]) + + return m + + +###################################################################### +# Test 9 +###################################################################### + +class ExampleBufPipeChain2(BufferedHandshake): + """ connects two stages together as a *single* combinatorial stage. + """ + def __init__(self): + stage1 = ExampleStageCls() + stage2 = ExampleStageCls() + combined = StageChain([stage1, stage2]) + BufferedHandshake.__init__(self, combined) + + +def data_chain2(): + data = [] + for i in range(num_tests): + data.append(randint(0, 1<<16-2)) + return data + + +def resultfn_9(data_o, expected, i, o): + res = expected + 2 + assert data_o == res, \ + "%d-%d received data %x not match expected %x\n" \ + % (i, o, data_o, res) + + +###################################################################### +# Test 6 and 10 +###################################################################### + +class SetLessThan(Elaboratable): + def __init__(self, width, signed): + self.m = Module() + self.src1 = Signal((width, signed), name="src1") + self.src2 = Signal((width, signed), name="src2") + self.output = Signal(width, name="out") + + def elaborate(self, platform): + self.m.d.comb += self.output.eq(Mux(self.src1 < self.src2, 1, 0)) + return self.m + + +class LTStage(StageCls): + """ module-based stage example + """ + def __init__(self): + self.slt = SetLessThan(16, True) + + def ispec(self, name): + return (Signal(16, name="%s_sig1" % name), + Signal(16, name="%s_sig2" % name)) 
+ + def ospec(self, name): + return Signal(16, "%s_out" % name) + + def setup(self, m, i): + self.o = Signal(16) + m.submodules.slt = self.slt + m.d.comb += self.slt.src1.eq(i[0]) + m.d.comb += self.slt.src2.eq(i[1]) + m.d.comb += self.o.eq(self.slt.output) + + def process(self, i): + return self.o + + +class LTStageDerived(SetLessThan, StageCls): + """ special version of a nmigen module where the module is also a stage + + shows that you don't actually need to combinatorially connect + to the outputs, or add the module as a submodule: just return + the module output parameter(s) from the Stage.process() function + """ + + def __init__(self): + SetLessThan.__init__(self, 16, True) + + def ispec(self): + return (Signal(16), Signal(16)) + + def ospec(self): + return Signal(16) + + def setup(self, m, i): + m.submodules.slt = self + m.d.comb += self.src1.eq(i[0]) + m.d.comb += self.src2.eq(i[1]) + + def process(self, i): + return self.output + + +class ExampleLTPipeline(UnbufferedPipeline): + """ an example of how to use the unbuffered pipeline. + """ + + def __init__(self): + stage = LTStage() + UnbufferedPipeline.__init__(self, stage) + + +class ExampleLTBufferedPipeDerived(BufferedHandshake): + """ an example of how to use the buffered pipeline. 
+ """ + + def __init__(self): + stage = LTStageDerived() + BufferedHandshake.__init__(self, stage) + + +def resultfn_6(data_o, expected, i, o): + res = 1 if expected[0] < expected[1] else 0 + assert data_o == res, \ + "%d-%d data %x not match %s\n" \ + % (i, o, data_o, repr(expected)) + + +###################################################################### +# Test 7 +###################################################################### + +class ExampleAddRecordStage(StageCls): + """ example use of a Record + """ + + record_spec = [('src1', 16), ('src2', 16)] + def ispec(self): + """ returns a Record using the specification + """ + return Record(self.record_spec) + + def ospec(self): + return Record(self.record_spec) + + def process(self, i): + """ process the input data, returning a dictionary with key names + that exactly match the Record's attributes. + """ + return {'src1': i.src1 + 1, + 'src2': i.src2 + 1} + +###################################################################### +# Test 11 +###################################################################### + +class ExampleAddRecordPlaceHolderStage(StageCls): + """ example use of a Record, with a placeholder as the processing result + """ + + record_spec = [('src1', 16), ('src2', 16)] + def ispec(self): + """ returns a Record using the specification + """ + return Record(self.record_spec) + + def ospec(self): + return Record(self.record_spec) + + def process(self, i): + """ process the input data, returning a PlaceHolder class instance + with attributes that exactly match those of the Record. + """ + o = PlaceHolder() + o.src1 = i.src1 + 1 + o.src2 = i.src2 + 1 + return o + + +# a dummy class that may have stuff assigned to instances once created +class PlaceHolder: pass + + +class ExampleAddRecordPipe(UnbufferedPipeline): + """ an example of how to use the combinatorial pipeline. 
+ """ + + def __init__(self): + stage = ExampleAddRecordStage() + UnbufferedPipeline.__init__(self, stage) + + +def resultfn_7(data_o, expected, i, o): + res = (expected['src1'] + 1, expected['src2'] + 1) + assert data_o['src1'] == res[0] and data_o['src2'] == res[1], \ + "%d-%d data %s not match %s\n" \ + % (i, o, repr(data_o), repr(expected)) + + +class ExampleAddRecordPlaceHolderPipe(UnbufferedPipeline): + """ an example of how to use the combinatorial pipeline. + """ + + def __init__(self): + stage = ExampleAddRecordPlaceHolderStage() + UnbufferedPipeline.__init__(self, stage) + + +def resultfn_test11(data_o, expected, i, o): + res1 = expected.src1 + 1 + res2 = expected.src2 + 1 + assert data_o['src1'] == res1 and data_o['src2'] == res2, \ + "%d-%d data %s not match %s\n" \ + % (i, o, repr(data_o), repr(expected)) + + +###################################################################### +# Test 8 +###################################################################### + + +class Example2OpClass: + """ an example of a class used to store 2 operands. + requires an eq function, to conform with the pipeline stage API + """ + + def __init__(self): + self.op1 = Signal(16) + self.op2 = Signal(16) + + def eq(self, i): + return [self.op1.eq(i.op1), self.op2.eq(i.op2)] + + +class ExampleAddClassStage(StageCls): + """ an example of how to use the buffered pipeline, as a class instance + """ + + def ispec(self): + """ returns an instance of an Example2OpClass. 
+ """ + return Example2OpClass() + + def ospec(self): + """ returns an output signal which will happen to contain the sum + of the two inputs + """ + return Signal(16, name="add2_out") + + def process(self, i): + """ process the input data (sums the values in the tuple) and returns it + """ + return i.op1 + i.op2 + + +class ExampleBufPipeAddClass(BufferedHandshake): + """ an example of how to use the buffered pipeline, using a class instance + """ + + def __init__(self): + addstage = ExampleAddClassStage() + BufferedHandshake.__init__(self, addstage) + + +class TestInputAdd: + """ the eq function, called by set_input, needs an incoming object + that conforms to the Example2OpClass.eq function requirements + easiest way to do that is to create a class that has the exact + same member layout (self.op1, self.op2) as Example2OpClass + """ + def __init__(self, op1, op2): + self.op1 = op1 + self.op2 = op2 + + +def resultfn_8(data_o, expected, i, o): + res = expected.op1 + expected.op2 # these are a TestInputAdd instance + assert data_o == res, \ + "%d-%d data %s res %x not match %s\n" \ + % (i, o, repr(data_o), res, repr(expected)) + +def data_2op(): + data = [] + for i in range(num_tests): + data.append(TestInputAdd(randint(0, 1<<16-1), randint(0, 1<<16-1))) + return data + + +###################################################################### +# Test 12 +###################################################################### + +class ExampleStageDelayCls(StageCls, Elaboratable): + """ an example of how to use the buffered pipeline, in a static class + fashion + """ + + def __init__(self, valid_trigger=2): + self.count = Signal(2) + self.valid_trigger = valid_trigger + + def ispec(self): + return Signal(16, name="example_input_signal") + + def ospec(self): + return Signal(16, name="example_output_signal") + + @property + def d_ready(self): + """ data is ready to be accepted when this is true + """ + return (self.count == 1)# | (self.count == 3) + return Const(1) + + 
def d_valid(self, ready_i): + """ data is valid at output when this is true + """ + return self.count == self.valid_trigger + return Const(1) + + def process(self, i): + """ process the input data and returns it (adds 1) + """ + return i + 1 + + def elaborate(self, platform): + m = Module() + m.d.sync += self.count.eq(self.count + 1) + return m + + +class ExampleBufDelayedPipe(BufferedHandshake): + + def __init__(self): + stage = ExampleStageDelayCls(valid_trigger=2) + BufferedHandshake.__init__(self, stage, stage_ctl=True) + + def elaborate(self, platform): + m = BufferedHandshake.elaborate(self, platform) + m.submodules.stage = self.stage + return m + + +def data_chain1(): + data = [] + for i in range(num_tests): + data.append(1<<((i*3)%15)) + #data.append(randint(0, 1<<16-2)) + #print (hex(data[-1])) + return data + + +def resultfn_12(data_o, expected, i, o): + res = expected + 1 + assert data_o == res, \ + "%d-%d data %x not match %x\n" \ + % (i, o, data_o, res) + + +###################################################################### +# Test 13 +###################################################################### + +class ExampleUnBufDelayedPipe(BufferedHandshake): + + def __init__(self): + stage = ExampleStageDelayCls(valid_trigger=3) + BufferedHandshake.__init__(self, stage, stage_ctl=True) + + def elaborate(self, platform): + m = BufferedHandshake.elaborate(self, platform) + m.submodules.stage = self.stage + return m + +###################################################################### +# Test 15 +###################################################################### + +class ExampleBufModeAdd1Pipe(SimpleHandshake): + + def __init__(self): + stage = ExampleStageCls() + SimpleHandshake.__init__(self, stage) + + +###################################################################### +# Test 16 +###################################################################### + +class ExampleBufModeUnBufPipe(ControlBase): + + def elaborate(self, platform): + m = 
ControlBase.elaborate(self, platform) + + pipe1 = ExampleBufModeAdd1Pipe() + pipe2 = ExampleBufAdd1Pipe() + + m.submodules.pipe1 = pipe1 + m.submodules.pipe2 = pipe2 + + m.d.comb += self.connect([pipe1, pipe2]) + + return m + +###################################################################### +# Test 17 +###################################################################### + +class ExampleUnBufAdd1Pipe2(UnbufferedPipeline2): + + def __init__(self): + stage = ExampleStageCls() + UnbufferedPipeline2.__init__(self, stage) + + +###################################################################### +# Test 18 +###################################################################### + +class PassThroughTest(PassThroughHandshake): + + def iospecfn(self): + return Signal(16, "out") + + def __init__(self): + stage = PassThroughStage(self.iospecfn) + PassThroughHandshake.__init__(self, stage) + +def resultfn_identical(data_o, expected, i, o): + res = expected + assert data_o == res, \ + "%d-%d data %x not match %x\n" \ + % (i, o, data_o, res) + + +###################################################################### +# Test 19 +###################################################################### + +class ExamplePassAdd1Pipe(PassThroughHandshake): + + def __init__(self): + stage = ExampleStageCls() + PassThroughHandshake.__init__(self, stage) + + +class ExampleBufPassThruPipe(ControlBase): + + def elaborate(self, platform): + m = ControlBase.elaborate(self, platform) + + # XXX currently fails: any other permutation works fine. 
+ # p1=u,p2=b ok p1=u,p2=u ok p1=b,p2=b ok + # also fails using UnbufferedPipeline as well + pipe1 = ExampleBufModeAdd1Pipe() + pipe2 = ExamplePassAdd1Pipe() + + m.submodules.pipe1 = pipe1 + m.submodules.pipe2 = pipe2 + + m.d.comb += self.connect([pipe1, pipe2]) + + return m + + +###################################################################### +# Test 20 +###################################################################### + +def iospecfn(): + return Signal(16, name="d_in") + +class FIFOTest16(FIFOControl): + + def __init__(self): + stage = PassThroughStage(iospecfn) + FIFOControl.__init__(self, 2, stage) + + +###################################################################### +# Test 21 +###################################################################### + +class ExampleFIFOPassThruPipe1(ControlBase): + + def elaborate(self, platform): + m = ControlBase.elaborate(self, platform) + + pipe1 = FIFOTest16() + pipe2 = FIFOTest16() + pipe3 = ExamplePassAdd1Pipe() + + m.submodules.pipe1 = pipe1 + m.submodules.pipe2 = pipe2 + m.submodules.pipe3 = pipe3 + + m.d.comb += self.connect([pipe1, pipe2, pipe3]) + + return m + + +###################################################################### +# Test 22 +###################################################################### + +class Example2OpRecord(RecordObject): + def __init__(self): + RecordObject.__init__(self) + self.op1 = Signal(16) + self.op2 = Signal(16) + + +class ExampleAddRecordObjectStage(StageCls): + + def ispec(self): + """ returns an instance of an Example2OpRecord. 
+ """ + return Example2OpRecord() + + def ospec(self): + """ returns an output signal which will happen to contain the sum + of the two inputs + """ + return Signal(16) + + def process(self, i): + """ process the input data (sums the values in the tuple) and returns it + """ + return i.op1 + i.op2 + + +class ExampleRecordHandshakeAddClass(SimpleHandshake): + + def __init__(self): + addstage = ExampleAddRecordObjectStage() + SimpleHandshake.__init__(self, stage=addstage) + + +###################################################################### +# Test 23 +###################################################################### + +def iospecfnrecord(): + return Example2OpRecord() + +class FIFOTestRecordControl(FIFOControl): + + def __init__(self): + stage = PassThroughStage(iospecfnrecord) + FIFOControl.__init__(self, 2, stage) + + +class ExampleFIFORecordObjectPipe(ControlBase): + + def elaborate(self, platform): + m = ControlBase.elaborate(self, platform) + + pipe1 = FIFOTestRecordControl() + pipe2 = ExampleRecordHandshakeAddClass() + + m.submodules.pipe1 = pipe1 + m.submodules.pipe2 = pipe2 + + m.d.comb += self.connect([pipe1, pipe2]) + + return m + + +###################################################################### +# Test 24 +###################################################################### + +class FIFOTestRecordAddStageControl(FIFOControl): + + def __init__(self): + stage = ExampleAddRecordObjectStage() + FIFOControl.__init__(self, 2, stage) + + + +###################################################################### +# Test 25 +###################################################################### + +class FIFOTestAdd16(FIFOControl): + + def __init__(self): + stage = ExampleStageCls() + FIFOControl.__init__(self, 2, stage) + + +class ExampleFIFOAdd2Pipe(ControlBase): + + def elaborate(self, platform): + m = ControlBase.elaborate(self, platform) + + pipe1 = FIFOTestAdd16() + pipe2 = FIFOTestAdd16() + + m.submodules.pipe1 = pipe1 + 
m.submodules.pipe2 = pipe2 + + m.d.comb += self.connect([pipe1, pipe2]) + + return m + + +###################################################################### +# Test 26 +###################################################################### + +def iospecfn24(): + return (Signal(16, name="src1"), Signal(16, name="src2")) + +class FIFOTest2x16(FIFOControl): + + def __init__(self): + stage = PassThroughStage(iospecfn2) + FIFOControl.__init__(self, 2, stage) + + +###################################################################### +# Test 997 +###################################################################### + +class ExampleBufPassThruPipe2(ControlBase): + + def elaborate(self, platform): + m = ControlBase.elaborate(self, platform) + + # XXX currently fails: any other permutation works fine. + # p1=u,p2=b ok p1=u,p2=u ok p1=b,p2=b ok + # also fails using UnbufferedPipeline as well + #pipe1 = ExampleUnBufAdd1Pipe() + #pipe2 = ExampleBufAdd1Pipe() + pipe1 = ExampleBufAdd1Pipe() + pipe2 = ExamplePassAdd1Pipe() + + m.submodules.pipe1 = pipe1 + m.submodules.pipe2 = pipe2 + + m.d.comb += self.connect([pipe1, pipe2]) + + return m + + +###################################################################### +# Test 998 +###################################################################### + +class ExampleBufPipe3(ControlBase): + """ Example of how to do delayed pipeline, where the stage signals + whether it is ready. 
+ """ + + def elaborate(self, platform): + m = ControlBase.elaborate(self, platform) + + pipe1 = ExampleBufDelayedPipe() + pipe2 = ExampleBufPipe() + + m.submodules.pipe1 = pipe1 + m.submodules.pipe2 = pipe2 + + m.d.comb += self.connect([pipe1, pipe2]) + + return m + +###################################################################### +# Test 999 - XXX FAILS +# http://bugs.libre-riscv.org/show_bug.cgi?id=57 +###################################################################### + +class ExampleBufAdd1Pipe(BufferedHandshake): + + def __init__(self): + stage = ExampleStageCls() + BufferedHandshake.__init__(self, stage) + + +class ExampleUnBufAdd1Pipe(UnbufferedPipeline): + + def __init__(self): + stage = ExampleStageCls() + UnbufferedPipeline.__init__(self, stage) + + +class ExampleBufUnBufPipe(ControlBase): + + def elaborate(self, platform): + m = ControlBase.elaborate(self, platform) + + # XXX currently fails: any other permutation works fine. + # p1=u,p2=b ok p1=u,p2=u ok p1=b,p2=b ok + # also fails using UnbufferedPipeline as well + #pipe1 = ExampleUnBufAdd1Pipe() + #pipe2 = ExampleBufAdd1Pipe() + pipe1 = ExampleBufAdd1Pipe() + pipe2 = ExampleUnBufAdd1Pipe() + + m.submodules.pipe1 = pipe1 + m.submodules.pipe2 = pipe2 + + m.d.comb += self.connect([pipe1, pipe2]) + + return m + + +###################################################################### +# Unit Tests +###################################################################### + +num_tests = 10 + +if __name__ == '__main__': + if False: + print ("test 1") + dut = ExampleBufPipe() + run_simulation(dut, tbench(dut), vcd_name="test_bufpipe.vcd") + + print ("test 2") + dut = ExampleBufPipe2() + run_simulation(dut, tbench2(dut), vcd_name="test_bufpipe2.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_bufpipe2.il", "w") as f: + f.write(vl) + + + print ("test 3") + dut = ExampleBufPipe() + 
test = Test3(dut, resultfn_3) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpipe3.vcd") + + print ("test 3.5") + dut = ExamplePipeline() + test = Test3(dut, resultfn_3) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_combpipe3.vcd") + + print ("test 4") + dut = ExampleBufPipe2() + run_simulation(dut, tbench4(dut), vcd_name="test_bufpipe4.vcd") + + print ("test 5") + dut = ExampleBufPipeAdd() + test = Test5(dut, resultfn_5, stage_ctl=True) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpipe5.vcd") + + print ("test 6") + dut = ExampleLTPipeline() + test = Test5(dut, resultfn_6) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_ltcomb6.vcd") + + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + list(dut.p.data_i) + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_ltcomb_pipe.il", "w") as f: + f.write(vl) + + print ("test 7") + dut = ExampleAddRecordPipe() + data=data_dict() + test = Test5(dut, resultfn_7, data=data) + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o, + dut.p.data_i.src1, dut.p.data_i.src2, + dut.n.data_o.src1, dut.n.data_o.src2] + vl = rtlil.convert(dut, ports=ports) + with open("test_recordcomb_pipe.il", "w") as f: + f.write(vl) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_addrecord.vcd") + + print ("test 8") + dut = ExampleBufPipeAddClass() + data=data_2op() + test = Test5(dut, resultfn_8, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpipe8.vcd") + + print ("test 9") + dut = ExampleBufPipeChain2() + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_bufpipechain2.il", "w") as f: + f.write(vl) + + data = data_chain2() + test = Test5(dut, resultfn_9, data=data) + run_simulation(dut, [test.send, test.rcv], + vcd_name="test_bufpipechain2.vcd") + + print ("test 10") + dut = 
ExampleLTBufferedPipeDerived() + test = Test5(dut, resultfn_6) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_ltbufpipe10.vcd") + vl = rtlil.convert(dut, ports=ports) + with open("test_ltbufpipe10.il", "w") as f: + f.write(vl) + + print ("test 11") + dut = ExampleAddRecordPlaceHolderPipe() + data=data_placeholder() + test = Test5(dut, resultfn_test11, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_addrecord.vcd") + + + print ("test 12") + dut = ExampleBufDelayedPipe() + data = data_chain1() + test = Test5(dut, resultfn_12, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpipe12.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_bufpipe12.il", "w") as f: + f.write(vl) + + print ("test 13") + dut = ExampleUnBufDelayedPipe() + data = data_chain1() + test = Test5(dut, resultfn_12, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_unbufpipe13.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_unbufpipe13.il", "w") as f: + f.write(vl) + + print ("test 15") + dut = ExampleBufModeAdd1Pipe() + data = data_chain1() + test = Test5(dut, resultfn_12, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufunbuf15.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_bufunbuf15.il", "w") as f: + f.write(vl) + + print ("test 16") + dut = ExampleBufModeUnBufPipe() + data = data_chain1() + test = Test5(dut, resultfn_9, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufunbuf16.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = 
rtlil.convert(dut, ports=ports) + with open("test_bufunbuf16.il", "w") as f: + f.write(vl) + + print ("test 17") + dut = ExampleUnBufAdd1Pipe2() + data = data_chain1() + test = Test5(dut, resultfn_12, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_unbufpipe17.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_unbufpipe17.il", "w") as f: + f.write(vl) + + print ("test 18") + dut = PassThroughTest() + data = data_chain1() + test = Test5(dut, resultfn_identical, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_passthru18.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_passthru18.il", "w") as f: + f.write(vl) + + print ("test 19") + dut = ExampleBufPassThruPipe() + data = data_chain1() + test = Test5(dut, resultfn_9, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpass19.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_bufpass19.il", "w") as f: + f.write(vl) + + print ("test 20") + dut = FIFOTest16() + data = data_chain1() + test = Test5(dut, resultfn_identical, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_fifo20.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_fifo20.il", "w") as f: + f.write(vl) + + print ("test 21") + dut = ExampleFIFOPassThruPipe1() + data = data_chain1() + test = Test5(dut, resultfn_12, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_fifopass21.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + 
[dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_fifopass21.il", "w") as f: + f.write(vl) + + print ("test 22") + dut = ExampleRecordHandshakeAddClass() + data=data_2op() + test = Test5(dut, resultfn_8, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_addrecord22.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i.op1, dut.p.data_i.op2] + \ + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_addrecord22.il", "w") as f: + f.write(vl) + + print ("test 23") + dut = ExampleFIFORecordObjectPipe() + data=data_2op() + test = Test5(dut, resultfn_8, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_addrecord23.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i.op1, dut.p.data_i.op2] + \ + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_addrecord23.il", "w") as f: + f.write(vl) + + print ("test 24") + dut = FIFOTestRecordAddStageControl() + data=data_2op() + test = Test5(dut, resultfn_8, data=data) + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i.op1, dut.p.data_i.op2] + \ + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_addrecord24.il", "w") as f: + f.write(vl) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_addrecord24.vcd") + + print ("test 25") + dut = ExampleFIFOAdd2Pipe() + data = data_chain1() + test = Test5(dut, resultfn_9, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_add2pipe25.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_add2pipe25.il", "w") as f: + f.write(vl) + + print ("test 997") + dut = ExampleBufPassThruPipe2() + data = data_chain1() + test = Test5(dut, resultfn_9, data=data) + run_simulation(dut, 
[test.send, test.rcv], vcd_name="test_bufpass997.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_bufpass997.il", "w") as f: + f.write(vl) + + print ("test 998 (fails, bug)") + dut = ExampleBufPipe3() + data = data_chain1() + test = Test5(dut, resultfn_9, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufpipe14.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_bufpipe14.il", "w") as f: + f.write(vl) + + print ("test 999 (expected to fail, which is a bug)") + dut = ExampleBufUnBufPipe() + data = data_chain1() + test = Test5(dut, resultfn_9, data=data) + run_simulation(dut, [test.send, test.rcv], vcd_name="test_bufunbuf999.vcd") + ports = [dut.p.valid_i, dut.n.ready_i, + dut.n.valid_o, dut.p.ready_o] + \ + [dut.p.data_i] + [dut.n.data_o] + vl = rtlil.convert(dut, ports=ports) + with open("test_bufunbuf999.il", "w") as f: + f.write(vl) + diff --git a/src/ieee754/add/test_div.py b/src/ieee754/add/test_div.py new file mode 100644 index 00000000..3f192338 --- /dev/null +++ b/src/ieee754/add/test_div.py @@ -0,0 +1,47 @@ +import sys +from random import randint +from random import seed +from operator import truediv + +from nmigen import Module, Signal +from nmigen.compat.sim import run_simulation + +from nmigen_div_experiment import FPDIV + +from unit_test_single import (get_mantissa, get_exponent, get_sign, is_nan, + is_inf, is_pos_inf, is_neg_inf, + match, get_case, check_case, run_test, + run_edge_cases, run_corner_cases) + + +def testbench(dut): + yield from check_case(dut, 0x80000000, 0x00000000, 0xffc00000) + yield from check_case(dut, 0x00000000, 0x80000000, 0xffc00000) + yield from check_case(dut, 0x0002b017, 0xff3807ab, 0x80000000) + yield from check_case(dut, 0x40000000, 0x3F800000, 0x40000000) + 
yield from check_case(dut, 0x3F800000, 0x40000000, 0x3F000000) + yield from check_case(dut, 0x3F800000, 0x40400000, 0x3EAAAAAB) + yield from check_case(dut, 0x40400000, 0x41F80000, 0x3DC6318C) + yield from check_case(dut, 0x41F9EB4D, 0x429A4C70, 0x3ECF52B2) + yield from check_case(dut, 0x7F7FFFFE, 0x70033181, 0x4EF9C4C8) + yield from check_case(dut, 0x7F7FFFFE, 0x70000001, 0x4EFFFFFC) + yield from check_case(dut, 0x7F7FFCFF, 0x70200201, 0x4ECCC7D5) + yield from check_case(dut, 0x70200201, 0x7F7FFCFF, 0x302003E2) + + count = 0 + + #regression tests + stimulus_a = [0xbf9b1e94, 0x34082401, 0x5e8ef81, 0x5c75da81, 0x2b017] + stimulus_b = [0xc038ed3a, 0xb328cd45, 0x114f3db, 0x2f642a39, 0xff3807ab] + yield from run_test(dut, stimulus_a, stimulus_b, truediv, get_case) + count += len(stimulus_a) + print (count, "vectors passed") + + yield from run_corner_cases(dut, count, truediv, get_case) + yield from run_edge_cases(dut, count, truediv, get_case) + + +if __name__ == '__main__': + dut = FPDIV(width=32) + run_simulation(dut, testbench(dut), vcd_name="test_div.vcd") + diff --git a/src/ieee754/add/test_div64.py b/src/ieee754/add/test_div64.py new file mode 100644 index 00000000..5a9daf23 --- /dev/null +++ b/src/ieee754/add/test_div64.py @@ -0,0 +1,67 @@ +from nmigen import Module, Signal +from nmigen.compat.sim import run_simulation + +from nmigen_div_experiment import FPDIV + +class ORGate: + def __init__(self): + self.a = Signal() + self.b = Signal() + self.x = Signal() + + def elaborate(self, platform=None): + + m = Module() + m.d.comb += self.x.eq(self.a | self.b) + + return m + +def check_case(dut, a, b, z): + yield dut.in_a.v.eq(a) + yield dut.in_a.stb.eq(1) + yield + yield + a_ack = (yield dut.in_a.ack) + assert a_ack == 0 + yield dut.in_b.v.eq(b) + yield dut.in_b.stb.eq(1) + b_ack = (yield dut.in_b.ack) + assert b_ack == 0 + + while True: + yield + out_z_stb = (yield dut.out_z.stb) + if not out_z_stb: + continue + yield dut.in_a.stb.eq(0) + yield dut.in_b.stb.eq(0) + 
yield dut.out_z.ack.eq(1) + yield + yield dut.out_z.ack.eq(0) + yield + yield + break + + out_z = yield dut.out_z.v + assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) + +def testbench(dut): + yield from check_case(dut, 0x4008000000000000, 0x3FF0000000000000, + 0x4008000000000000) + yield from check_case(dut, 0x3FF0000000000000, 0x4008000000000000, + 0x3FD5555555555555) + + if False: + yield from check_case(dut, 0x3F800000, 0x40000000, 0x3F000000) + yield from check_case(dut, 0x3F800000, 0x40400000, 0x3EAAAAAB) + yield from check_case(dut, 0x40400000, 0x41F80000, 0x3DC6318C) + yield from check_case(dut, 0x41F9EB4D, 0x429A4C70, 0x3ECF52B2) + yield from check_case(dut, 0x7F7FFFFE, 0x70033181, 0x4EF9C4C8) + yield from check_case(dut, 0x7F7FFFFE, 0x70000001, 0x4EFFFFFC) + yield from check_case(dut, 0x7F7FFCFF, 0x70200201, 0x4ECCC7D5) + yield from check_case(dut, 0x70200201, 0x7F7FFCFF, 0x302003E2) + +if __name__ == '__main__': + dut = FPDIV(width=64) + run_simulation(dut, testbench(dut), vcd_name="test_div64.vcd") + diff --git a/src/ieee754/add/test_dual.py b/src/ieee754/add/test_dual.py new file mode 100644 index 00000000..15f5c762 --- /dev/null +++ b/src/ieee754/add/test_dual.py @@ -0,0 +1,60 @@ +from sfpy import Float32 +from nmigen.compat.sim import run_simulation +from dual_add_experiment import ALU + + +def get_case(dut, a, b, c): + yield dut.a.v.eq(a) + yield dut.a.stb.eq(1) + yield + yield + a_ack = (yield dut.a.ack) + assert a_ack == 0 + + yield dut.a.stb.eq(0) + + yield dut.b.v.eq(b) + yield dut.b.stb.eq(1) + yield + yield + b_ack = (yield dut.b.ack) + assert b_ack == 0 + + yield dut.b.stb.eq(0) + + yield dut.c.v.eq(c) + yield dut.c.stb.eq(1) + yield + yield + c_ack = (yield dut.c.ack) + assert c_ack == 0 + + yield dut.c.stb.eq(0) + + yield dut.z.ack.eq(1) + + while True: + out_z_stb = (yield dut.z.stb) + if not out_z_stb: + yield + continue + + out_z = yield dut.z.v + + yield dut.z.ack.eq(0) + break + + return out_z + +def 
check_case(dut, a, b, c, z): + out_z = yield from get_case(dut, a, b, c) + assert out_z == z, "Output z 0x%x != 0x%x" % (out_z, z) + +def testbench(dut): + yield from check_case(dut, 0, 0, 0, 0) + yield from check_case(dut, 0x3F800000, 0x40000000, 0xc0000000, 0x3F800000) + +if __name__ == '__main__': + dut = ALU(width=32) + run_simulation(dut, testbench(dut), vcd_name="test_dual_add.vcd") + diff --git a/src/ieee754/add/test_fpadd_pipe.py b/src/ieee754/add/test_fpadd_pipe.py new file mode 100644 index 00000000..df25e55f --- /dev/null +++ b/src/ieee754/add/test_fpadd_pipe.py @@ -0,0 +1,126 @@ +""" key strategic example showing how to do multi-input fan-in into a + multi-stage pipeline, then multi-output fanout. + + the multiplex ID from the fan-in is passed in to the pipeline, preserved, + and used as a routing ID on the fanout. +""" + +from random import randint +from math import log +from nmigen import Module, Signal, Cat, Value +from nmigen.compat.sim import run_simulation +from nmigen.cli import verilog, rtlil + +from nmigen_add_experiment import (FPADDMuxInOut,) + +from sfpy import Float32 + +class InputTest: + def __init__(self, dut): + self.dut = dut + self.di = {} + self.do = {} + self.tlen = 10 + self.width = 32 + for mid in range(dut.num_rows): + self.di[mid] = {} + self.do[mid] = [] + for i in range(self.tlen): + op1 = randint(0, (1<> (i+1)) << 1) | (m & 1) + for l in range(i): + if m & (1<<(l+1)): + calc_m |= 1 + + assert out_e == calc_e, "Output e 0x%x != expected 0x%x" % (out_e, calc_e) + assert out_m == calc_m, "Output m 0x%x != expected 0x%x" % (out_m, calc_m) + +def testbench(dut): + m_width = dut.a.m_width + e_width = dut.a.e_width + e_max = dut.a.e_max + for j in range(200): + m = randint(0, (1<> b) & ((1<> b) & ((1<> 52) - 1023 + +def get_sign(x): + return ((x & 0x8000000000000000) >> 63) + +def is_nan(x): + return get_exponent(x) == 1024 and get_mantissa(x) != 0 + +def is_inf(x): + return get_exponent(x) == 1024 and get_mantissa(x) == 0 + +def 
is_pos_inf(x): + return is_inf(x) and not get_sign(x) + +def is_neg_inf(x): + return is_inf(x) and get_sign(x) + +def match(x, y): + return ( + (is_pos_inf(x) and is_pos_inf(y)) or + (is_neg_inf(x) and is_neg_inf(y)) or + (is_nan(x) and is_nan(y)) or + (x == y) + ) + +def get_case(dut, a, b): + yield dut.in_a.v.eq(a) + yield dut.in_a.stb.eq(1) + yield + yield + a_ack = (yield dut.in_a.ack) + assert a_ack == 0 + yield dut.in_b.v.eq(b) + yield dut.in_b.stb.eq(1) + b_ack = (yield dut.in_b.ack) + assert b_ack == 0 + + while True: + yield + out_z_stb = (yield dut.out_z.stb) + if not out_z_stb: + continue + yield dut.in_a.stb.eq(0) + yield dut.in_b.stb.eq(0) + yield dut.out_z.ack.eq(1) + yield + yield dut.out_z.ack.eq(0) + yield + yield + break + + out_z = yield dut.out_z.v + return out_z + +def check_case(dut, a, b, z): + out_z = yield from get_case(dut, a, b) + assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) + + +def run_test(dut, stimulus_a, stimulus_b, op): + + expected_responses = [] + actual_responses = [] + for a, b in zip(stimulus_a, stimulus_b): + af = Float64.from_bits(a) + bf = Float64.from_bits(b) + z = op(af, bf) + expected_responses.append(z.get_bits()) + #print (af, bf, z) + actual = yield from get_case(dut, a, b) + actual_responses.append(actual) + + if len(actual_responses) < len(expected_responses): + print ("Fail ... not enough results") + exit(0) + + for exp, act, a, b in zip(expected_responses, actual_responses, + stimulus_a, stimulus_b): + passed = match(exp, act) + + if not passed: + + print ("Fail ... 
expected:", hex(exp), "actual:", hex(act)) + + print (hex(a)) + print ("a mantissa:", a & 0x000fffffffffffff) + print ("a exponent:", ((a & 0x7ff0000000000000) >> 52)\ + - 1023) + print ("a sign:", ((a & 0x8000000000000000) >> 63)) + + print (hex(b)) + print ("b mantissa:", b & 0x000fffffffffffff) + print ("b exponent:", ((b & 0x7ff0000000000000) >> 52)\ + - 1023) + print ("b sign:", ((b & 0x8000000000000000) >> 63)) + + print (hex(exp)) + print ("expected mantissa:", exp & 0x000fffffffffffff) + print ("expected exponent:", ((exp & 0x7ff0000000000000) >> 52)\ + - 1023) + print ("expected sign:", ((exp & 0x8000000000000000) >> 63)) + + print (hex(act)) + print ("actual mantissa:", act & 0x000fffffffffffff) + print ("actual exponent:", ((act & 0x7ff0000000000000) >> 52)\ + - 1023) + print ("actual sign:", ((act & 0x8000000000000000) >> 63)) + + sys.exit(0) + + +def run_corner_cases(dut, count, op): + #corner cases + from itertools import permutations + stimulus_a = [i[0] for i in permutations([ + 0x8000000000000000, + 0x0000000000000000, + 0x7ff8000000000000, + 0xfff8000000000000, + 0x7ff0000000000000, + 0xfff0000000000000 + ], 2)] + stimulus_b = [i[1] for i in permutations([ + 0x8000000000000000, + 0x0000000000000000, + 0x7ff8000000000000, + 0xfff8000000000000, + 0x7ff0000000000000, + 0xfff0000000000000 + ], 2)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + +def run_edge_cases(dut, count, op): + #edge cases + stimulus_a = [0x8000000000000000 for i in range(1000)] + stimulus_b = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_a = [0x0000000000000000 for i in range(1000)] + stimulus_b = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = 
[0x8000000000000000 for i in range(1000)] + stimulus_a = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = [0x0000000000000000 for i in range(1000)] + stimulus_a = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_a = [0x7FF8000000000000 for i in range(1000)] + stimulus_b = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_a = [0xFFF8000000000000 for i in range(1000)] + stimulus_b = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = [0x7FF8000000000000 for i in range(1000)] + stimulus_a = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = [0xFFF8000000000000 for i in range(1000)] + stimulus_a = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_a = [0x7FF0000000000000 for i in range(1000)] + stimulus_b = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_a = [0xFFF0000000000000 for i in range(1000)] + stimulus_b = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = [0x7FF0000000000000 for i in range(1000)] + stimulus_a = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, 
op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = [0xFFF0000000000000 for i in range(1000)] + stimulus_a = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + #seed(0) + for i in range(100000): + stimulus_a = [randint(0, 1<<64) for i in range(1000)] + stimulus_b = [randint(0, 1<<64) for i in range(1000)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += 1000 + print (count, "random vectors passed") + diff --git a/src/ieee754/add/unit_test_half.py b/src/ieee754/add/unit_test_half.py new file mode 100644 index 00000000..73c9b653 --- /dev/null +++ b/src/ieee754/add/unit_test_half.py @@ -0,0 +1,211 @@ +from random import randint +from random import seed + +import sys +from sfpy import Float16 + +def get_mantissa(x): + return 0x3ff & x + +def get_exponent(x): + return ((x & 0xf800) >> 11) - 15 + +def get_sign(x): + return ((x & 0x8000) >> 15) + +def is_nan(x): + return get_exponent(x) == 16 and get_mantissa(x) != 0 + +def is_inf(x): + return get_exponent(x) == 16 and get_mantissa(x) == 0 + +def is_pos_inf(x): + return is_inf(x) and not get_sign(x) + +def is_neg_inf(x): + return is_inf(x) and get_sign(x) + +def match(x, y): + return ( + (is_pos_inf(x) and is_pos_inf(y)) or + (is_neg_inf(x) and is_neg_inf(y)) or + (is_nan(x) and is_nan(y)) or + (x == y) + ) + +def get_case(dut, a, b): + yield dut.in_a.v.eq(a) + yield dut.in_a.stb.eq(1) + yield + yield + a_ack = (yield dut.in_a.ack) + assert a_ack == 0 + yield dut.in_b.v.eq(b) + yield dut.in_b.stb.eq(1) + b_ack = (yield dut.in_b.ack) + assert b_ack == 0 + + while True: + yield + out_z_stb = (yield dut.out_z.stb) + if not out_z_stb: + continue + yield dut.in_a.stb.eq(0) + yield dut.in_b.stb.eq(0) + yield dut.out_z.ack.eq(1) + yield + yield dut.out_z.ack.eq(0) + yield + yield + break + + out_z = yield dut.out_z.v + return out_z + +def check_case(dut, a, b, z): + out_z = 
yield from get_case(dut, a, b) + assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) + + +def run_test(dut, stimulus_a, stimulus_b, op): + + expected_responses = [] + actual_responses = [] + for a, b in zip(stimulus_a, stimulus_b): + af = Float16.from_bits(a) + bf = Float16.from_bits(b) + z = op(af, bf) + expected_responses.append(z.get_bits()) + #print (af, bf, z) + actual = yield from get_case(dut, a, b) + actual_responses.append(actual) + + if len(actual_responses) < len(expected_responses): + print ("Fail ... not enough results") + exit(0) + + for expected, actual, a, b in zip(expected_responses, actual_responses, + stimulus_a, stimulus_b): + passed = match(expected, actual) + + if not passed: + + print ("Fail ... expected:", hex(expected), "actual:", hex(actual)) + + print (hex(a)) + print ("a mantissa:", get_mantissa(a)) + print ("a exponent:", get_exponent(a)) + print ("a sign:", get_sign(a)) + + print (hex(b)) + print ("b mantissa:", get_mantissa(b)) + print ("b exponent:", get_exponent(b)) + print ("b sign:", get_sign(b)) + + print (hex(expected)) + print ("expected mantissa:", get_mantissa(expected)) + print ("expected exponent:", get_exponent(expected)) + print ("expected sign:", get_sign(expected)) + + print (hex(actual)) + print ("actual mantissa:", get_mantissa(actual)) + print ("actual exponent:", get_exponent(actual)) + print ("actual sign:", get_sign(actual)) + + sys.exit(0) + +def run_corner_cases(dut, count, op): + #corner cases + corners = [0x8000, 0x0000, 0x7800, 0xf800, 0x7c00, 0xfc00] + from itertools import permutations + stimulus_a = [i[0] for i in permutations(corners, 2)] + stimulus_b = [i[1] for i in permutations(corners, 2)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + +def run_edge_cases(dut, count, op): + maxint16 = 1<<16 + maxcount = 10 + #edge cases + stimulus_a = [0x8000 for i in range(maxcount)] + stimulus_b = [randint(0, maxint16-1) 
for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_a = [0x0000 for i in range(maxcount)] + stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = [0x8000 for i in range(maxcount)] + stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = [0x0000 for i in range(maxcount)] + stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_a = [0x7800 for i in range(maxcount)] + stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_a = [0xF800 for i in range(maxcount)] + stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = [0x7800 for i in range(maxcount)] + stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = [0xF800 for i in range(maxcount)] + stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_a = [0x7C00 for i in range(maxcount)] + stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + 
+ stimulus_a = [0xFC00 for i in range(maxcount)] + stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = [0x7C00 for i in range(maxcount)] + stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + stimulus_b = [0xFC00 for i in range(maxcount)] + stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += len(stimulus_a) + print (count, "vectors passed") + + #seed(0) + for i in range(100000): + stimulus_a = [randint(0, maxint16-1) for i in range(maxcount)] + stimulus_b = [randint(0, maxint16-1) for i in range(maxcount)] + yield from run_test(dut, stimulus_a, stimulus_b, op) + count += maxcount + print (count, "random vectors passed") + diff --git a/src/ieee754/add/unit_test_single.py b/src/ieee754/add/unit_test_single.py new file mode 100644 index 00000000..2b0d9e56 --- /dev/null +++ b/src/ieee754/add/unit_test_single.py @@ -0,0 +1,255 @@ +from random import randint +from random import seed + +import sys +from sfpy import Float32 + +def get_mantissa(x): + return 0x7fffff & x + +def get_exponent(x): + return ((x & 0x7f800000) >> 23) - 127 + +def set_exponent(x, e): + return (x & ~0x7f800000) | ((e+127) << 23) + +def get_sign(x): + return ((x & 0x80000000) >> 31) + +def is_nan(x): + return get_exponent(x) == 128 and get_mantissa(x) != 0 + +def is_inf(x): + return get_exponent(x) == 128 and get_mantissa(x) == 0 + +def is_pos_inf(x): + return is_inf(x) and not get_sign(x) + +def is_neg_inf(x): + return is_inf(x) and get_sign(x) + +def match(x, y): + return ( + (is_pos_inf(x) and is_pos_inf(y)) or + (is_neg_inf(x) and is_neg_inf(y)) or + (is_nan(x) and is_nan(y)) or + (x == y) + ) + +def get_rs_case(dut, a, b, mid): + in_a, in_b = 
dut.rs[0] + out_z = dut.res[0] + yield dut.ids.in_mid.eq(mid) + yield in_a.v.eq(a) + yield in_a.valid_i.eq(1) + yield + yield + yield + yield + a_ack = (yield in_a.ready_o) + assert a_ack == 0 + + yield in_a.valid_i.eq(0) + + yield in_b.v.eq(b) + yield in_b.valid_i.eq(1) + yield + yield + b_ack = (yield in_b.ready_o) + assert b_ack == 0 + + yield in_b.valid_i.eq(0) + + yield out_z.ready_i.eq(1) + + while True: + out_z_stb = (yield out_z.valid_o) + if not out_z_stb: + yield + continue + vout_z = yield out_z.v + #out_mid = yield dut.ids.out_mid + yield out_z.ready_i.eq(0) + yield + break + + return vout_z, mid + +def check_rs_case(dut, a, b, z, mid=None): + if mid is None: + mid = randint(0, 6) + mid = 0 + out_z, out_mid = yield from get_rs_case(dut, a, b, mid) + assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) + assert out_mid == mid, "Output mid 0x%x != expected 0x%x" % (out_mid, mid) + + +def get_case(dut, a, b, mid): + #yield dut.in_mid.eq(mid) + yield dut.in_a.v.eq(a) + yield dut.in_a.valid_i_test.eq(1) + yield + yield + yield + yield + a_ack = (yield dut.in_a.ready_o) + assert a_ack == 0 + + yield dut.in_a.valid_i.eq(0) + + yield dut.in_b.v.eq(b) + yield dut.in_b.valid_i.eq(1) + yield + yield + b_ack = (yield dut.in_b.ready_o) + assert b_ack == 0 + + yield dut.in_b.valid_i.eq(0) + + yield dut.out_z.ready_i.eq(1) + + while True: + out_z_stb = (yield dut.out_z.valid_o) + if not out_z_stb: + yield + continue + out_z = yield dut.out_z.v + #out_mid = yield dut.out_mid + yield dut.out_z.ready_i.eq(0) + yield + break + + return out_z, mid # TODO: mid + +def check_case(dut, a, b, z, mid=None): + if mid is None: + mid = randint(0, 6) + mid = 0 + out_z, out_mid = yield from get_case(dut, a, b, mid) + assert out_z == z, "Output z 0x%x not equal to expected 0x%x" % (out_z, z) + assert out_mid == mid, "Output mid 0x%x != expected 0x%x" % (out_mid, mid) + + +def run_test(dut, stimulus_a, stimulus_b, op, get_case_fn): + + expected_responses = [] + 
actual_responses = [] + for a, b in zip(stimulus_a, stimulus_b): + mid = randint(0, 6) + mid = 0 + af = Float32.from_bits(a) + bf = Float32.from_bits(b) + z = op(af, bf) + expected_responses.append((z.get_bits(), mid)) + actual = yield from get_case_fn(dut, a, b, mid) + actual_responses.append(actual) + + if len(actual_responses) < len(expected_responses): + print ("Fail ... not enough results") + exit(0) + + for expected, actual, a, b in zip(expected_responses, actual_responses, + stimulus_a, stimulus_b): + passed = match(expected[0], actual[0]) + if expected[1] != actual[1]: # check mid + print ("MID failed", expected[1], actual[1]) + sys.exit(0) + + if not passed: + + expected = expected[0] + actual = actual[0] + print ("Fail ... expected:", hex(expected), "actual:", hex(actual)) + + print (hex(a)) + print ("a mantissa:", a & 0x7fffff) + print ("a exponent:", ((a & 0x7f800000) >> 23) - 127) + print ("a sign:", ((a & 0x80000000) >> 31)) + + print (hex(b)) + print ("b mantissa:", b & 0x7fffff) + print ("b exponent:", ((b & 0x7f800000) >> 23) - 127) + print ("b sign:", ((b & 0x80000000) >> 31)) + + print (hex(expected)) + print ("expected mantissa:", expected & 0x7fffff) + print ("expected exponent:", ((expected & 0x7f800000) >> 23) - 127) + print ("expected sign:", ((expected & 0x80000000) >> 31)) + + print (hex(actual)) + print ("actual mantissa:", actual & 0x7fffff) + print ("actual exponent:", ((actual & 0x7f800000) >> 23) - 127) + print ("actual sign:", ((actual & 0x80000000) >> 31)) + + sys.exit(0) + +corner_cases = [0x80000000, 0x00000000, 0x7f800000, 0xff800000, + 0x7fc00000, 0xffc00000] + +def run_corner_cases(dut, count, op, get_case_fn): + #corner cases + from itertools import permutations + stimulus_a = [i[0] for i in permutations(corner_cases, 2)] + stimulus_b = [i[1] for i in permutations(corner_cases, 2)] + yield from run_test(dut, stimulus_a, stimulus_b, op, get_case_fn) + count += len(stimulus_a) + print (count, "vectors passed") + +def 
run_test_2(dut, stimulus_a, stimulus_b, op, get_case_fn): + yield from run_test(dut, stimulus_a, stimulus_b, op, get_case_fn) + yield from run_test(dut, stimulus_b, stimulus_a, op, get_case_fn) + +def run_cases(dut, count, op, fixed_num, num_entries, get_case_fn): + if isinstance(fixed_num, int): + stimulus_a = [fixed_num for i in range(num_entries)] + report = hex(fixed_num) + else: + stimulus_a = fixed_num + report = "random" + + stimulus_b = [randint(0, 1<<32) for i in range(num_entries)] + yield from run_test_2(dut, stimulus_a, stimulus_b, op, get_case_fn) + count += len(stimulus_a) + print (count, "vectors passed 2^32", report) + + # non-canonical NaNs. + stimulus_b = [set_exponent(randint(0, 1<<32), 128) \ + for i in range(num_entries)] + yield from run_test_2(dut, stimulus_a, stimulus_b, op, get_case_fn) + count += len(stimulus_a) + print (count, "vectors passed Non-Canonical NaN", report) + + # -127 + stimulus_b = [set_exponent(randint(0, 1<<32), -127) \ + for i in range(num_entries)] + yield from run_test_2(dut, stimulus_a, stimulus_b, op, get_case_fn) + count += len(stimulus_a) + print (count, "vectors passed exp=-127", report) + + # nearly zero + stimulus_b = [set_exponent(randint(0, 1<<32), -126) \ + for i in range(num_entries)] + yield from run_test_2(dut, stimulus_a, stimulus_b, op, get_case_fn) + count += len(stimulus_a) + print (count, "vectors passed exp=-126", report) + + # nearly inf + stimulus_b = [set_exponent(randint(0, 1<<32), 127) \ + for i in range(num_entries)] + yield from run_test_2(dut, stimulus_a, stimulus_b, op, get_case_fn) + count += len(stimulus_a) + print (count, "vectors passed exp=127", report) + + return count + +def run_edge_cases(dut, count, op, get_case_fn): + #edge cases + for testme in corner_cases: + count = yield from run_cases(dut, count, op, testme, 10, get_case_fn) + + for i in range(100000): + stimulus_a = [randint(0, 1<<32) for i in range(10)] + count = yield from run_cases(dut, count, op, stimulus_a, 10, + 
get_case_fn) + return count +