refactor the buffered pipeline to a cleaner API with better separation

[ieee754fpu.git] / src / add / example_buf_pipe.py
diff --git a/src/add/example_buf_pipe.py b/src/add/example_buf_pipe.py

index 717504bcd37db33a2bce90e6168f5e64019e8674..45bed19301a5c46b3213c8ae0217405182f6a215 100644 (file)
--- a/src/add/example_buf_pipe.py
+++ b/src/add/example_buf_pipe.py
@@ -12,12 +12,12 @@
      where data will flow on *every* clock when the conditions are right.
  
      input acceptance conditions are when:
-        * incoming previous-stage strobe (i_p_stb) is HIGH
-        * outgoing previous-stage busy   (o_p_busy) is LOW
+        * incoming previous-stage strobe (i.p_valid) is HIGH
+        * outgoing previous-stage ready   (o.p_ready) is HIGH
  
      output transmission conditions are when:
-        * outgoing next-stage strobe (o_n_stb) is HIGH
-        * outgoing next-stage busy   (i_n_busy) is LOW
+        * outgoing next-stage strobe (o.n_valid) is HIGH
+        * incoming next-stage ready   (i.n_ready) is HIGH
  
      the tricky bit is when the input has valid data and the output is not
      ready to accept it.  if it wasn't for the clock synchronisation, it
@@ -25,149 +25,243 @@
      not ready".  unfortunately, it's not possible to "change the past":
      the previous stage *has no choice* but to pass on its data.
  
-    therefore, the incoming data *must* be accepted - and stored.
+    therefore, the incoming data *must* be accepted - and stored: that
+    is the responsibility / contract that this stage *must* accept.
      on the same clock, it's possible to tell the input that it must
      not send any more data.  this is the "stall" condition.
  
      we now effectively have *two* possible pieces of data to "choose" from:
      the buffered data, and the incoming data.  the decision as to which
      to process and output is based on whether we are in "stall" or not.
-    i.e. when the next stage is no longer busy, the output comes from
+    i.e. when the next stage is ready again, the output comes from
      the buffer if a stall had previously occurred, otherwise it comes
      direct from processing the input.
  
+    this allows us to respect a synchronous "travelling STB" with what
+    dan calls a "buffered handshake".
+
      it's quite a complex state machine!
  """
  
  from nmigen import Signal, Cat, Const, Mux, Module
-from nmigen.compat.sim import run_simulation
  from nmigen.cli import verilog, rtlil
+from collections.abc import Sequence
+
+
+class IOAckIn:
+
+    def __init__(self):
+        self.p_valid = Signal() # >>in - comes in from PREVIOUS stage
+        self.n_ready = Signal() # in<< - comes in from the NEXT stage
+
+
+class IOAckOut:
+
+    def __init__(self):
+        self.n_valid = Signal() # out>> - goes out to the NEXT stage
+        self.p_ready = Signal() # <<out - goes out to the PREVIOUS stage
+
+
+def eq(o, i):
+    if not isinstance(o, Sequence):
+        o, i = [o], [i]
+    res = []
+    for (ao, ai) in zip(o, i):
+        res.append(ao.eq(ai))
+    return res
+
  
-class BufPipe:
-    """ buffered pipeline stage
+class BufferedPipeline:
+    """ buffered pipeline stage.  data and strobe signals travel in sync.
+        if ever the input is ready and the output is not, processed data
+        is stored in a temporary register.
  
-        stage-1   i_p_stb  >>in   stage   o_n_stb  out>>   stage+1
-        stage-1   o_p_busy <<out  stage   i_n_busy <<in    stage+1
-        stage-1   i_data   >>in   stage   o_data   out>>   stage+1
+        stage-1   i.p_valid >>in   stage   o.n_valid out>>   stage+1
+        stage-1   o.p_ready <<out  stage   i.n_ready <<in    stage+1
+        stage-1   i.data    >>in   stage   o.data    out>>   stage+1
                                |             |
-                              +------->  process
+                            process --->----^
                                |             |
-                              +-- r_data ---+
+                              +-- r_data ->-+
+
+        input data i.data is read (only), is processed and goes into an
+        intermediate result store [process()].  this is updated combinatorially.
+
+        in a non-stall condition, the intermediate result will go into the
+        output (update_output).  however if ever there is a stall, it goes
+        into r_data instead [update_buffer()].
+
+        when the non-stall condition is released, r_data is the first
+        to be transferred to the output [flush_buffer()], and the stall
+        condition cleared.
+
+        on the next cycle (as long as stall is not raised again) the
+        input may begin to be processed and transferred directly to output.
      """
-    def __init__(self):
-        # input
-        #self.i_p_rst = Signal()    # >>in - comes in from PREVIOUS stage
-        self.i_p_stb = Signal()    # >>in - comes in from PREVIOUS stage
-        self.i_n_busy = Signal()   # in<< - comes in from the NEXT stage
-        self.i_data = Signal(32) # >>in - comes in from the PREVIOUS stage
-        #self.i_rst = Signal()
+    def __init__(self, stage):
+        """ pass in a "stage" which may be either a static class or a class
+            instance, which has three functions:
+            * ispec: returns input signals according to the input specification
+            * ospec: returns output signals to the output specification
+            * process: takes an input instance and returns processed data
+
+            i_data -> process() -> result --> o.data
+                                     |           ^
+                                     |           |
+                                     +-> r_data -+
+        """
+        # input: strobe comes in from previous stage, ready comes in from next
+        self.i = IOAckIn()
+        #self.i.p_valid = Signal()    # >>in - comes in from PREVIOUS stage
+        #self.i.n_ready = Signal()   # in<< - comes in from the NEXT stage
  
-        # buffered
-        self.r_data = Signal(32)
+        # output: strobe goes out to next stage, ready goes out to previous
+        self.o = IOAckOut()
+        #self.o.n_valid = Signal()    # out>> - goes out to the NEXT stage
+        #self.o.p_ready = Signal()   # <<out - goes out to the PREVIOUS stage
  
-        # output
-        self.o_n_stb = Signal()    # out>> - goes out to the NEXT stage
-        self.o_p_busy = Signal()   # <<out - goes out to the PREVIOUS stage
-        self.o_data = Signal(32) # out>> - goes out to the NEXT stage
+        # set up the input and output data
+        self.i.data = stage.ispec() # input type
+        self.r_data = stage.ospec() # all these are output type
+        self.result = stage.ospec()
+        self.o.data = stage.ospec()
+        self.stage = stage
  
-    def pre_process(self, d_in):
-        return d_in | 0xf0000
+    def set_input(self, i):
+        return eq(self.i.data, i)
  
-    def process(self, d_in):
-        return d_in + 1
+    def update_buffer(self):
+        """ copies the result into the intermediate register r_data,
+            which will need to be outputted on a subsequent cycle
+            prior to allowing "normal" operation.
+        """
+        return eq(self.r_data, self.result)
+
+    def update_output(self):
+        """ copies the (combinatorial) result into the output
+        """
+        return eq(self.o.data, self.result)
+
+    def flush_buffer(self):
+        """ copies the *intermediate* register r_data into the output
+        """
+        return eq(self.o.data, self.r_data)
+
+    def ports(self):
+        return [self.i.data, self.o.data]
  
      def elaborate(self, platform):
          m = Module()
  
-        o_p_busyn = Signal(reset_less=True)
-        o_n_stbn = Signal(reset_less=True)
-        i_n_busyn = Signal(reset_less=True)
-        i_p_stb_o_p_busyn = Signal(reset_less=True)
-        m.d.comb += i_n_busyn.eq(~self.i_n_busy)
-        m.d.comb += o_n_stbn.eq(~self.o_n_stb)
-        m.d.comb += o_p_busyn.eq(~self.o_p_busy)
-        m.d.comb += i_p_stb_o_p_busyn.eq(self.i_p_stb & o_p_busyn)
-
-        result = Signal(32)
-        m.d.comb += result.eq(self.process(self.i_data))
-        with m.If(o_p_busyn): # not stalled
-            m.d.sync += self.r_data.eq(result)
-
-        #with m.If(self.i_p_rst): # reset
-        #    m.d.sync += self.o_n_stb.eq(0)
-        #    m.d.sync += self.o_p_busy.eq(0)
-        with m.If(i_n_busyn): # next stage is not busy
-            with m.If(o_p_busyn): # not stalled
-                # nothing in buffer: send input direct to output
-                m.d.sync += self.o_n_stb.eq(self.i_p_stb)
-                m.d.sync += self.o_data.eq(result)
-            with m.Else(): # o_p_busy is true, and something is in our buffer.
+        # establish some combinatorial temporaries
+        o_n_validn = Signal(reset_less=True)
+        i_p_valid_o_p_ready = Signal(reset_less=True)
+        m.d.comb += [o_n_validn.eq(~self.o.n_valid),
+                     i_p_valid_o_p_ready.eq(self.i.p_valid & self.o.p_ready),
+        ]
+
+        # store result of processing in combinatorial temporary
+        with m.If(self.i.p_valid): # input is valid: process it
+            m.d.comb += eq(self.result, self.stage.process(self.i.data))
+        # if not in stall condition, update the temporary register
+        with m.If(self.o.p_ready): # not stalled
+            m.d.sync += self.update_buffer()
+
+        #with m.If(self.i.p_rst): # reset
+        #    m.d.sync += self.o.n_valid.eq(0)
+        #    m.d.sync += self.o.p_ready.eq(0)
+        with m.If(self.i.n_ready): # next stage is ready
+            with m.If(self.o.p_ready): # not stalled
+                # nothing in buffer: send (processed) input direct to output
+                m.d.sync += [self.o.n_valid.eq(self.i.p_valid),
+                             self.update_output(),
+                            ]
+            with m.Else(): # o.p_ready is false, and something is in buffer.
                  # Flush the [already processed] buffer to the output port.
-                m.d.sync += self.o_n_stb.eq(1)
-                m.d.sync += self.o_data.eq(self.r_data)
-                # ignore input, since o_p_busy is also true.
-                # also clear stall condition, declare register to be empty.
-                m.d.sync += self.o_p_busy.eq(0)
-
-        # (i_n_busy) is true here: next stage is busy
-        with m.Elif(o_n_stbn): # next stage being told "not busy"
-            m.d.sync += self.o_n_stb.eq(self.i_p_stb)
-            m.d.sync += self.o_p_busy.eq(0) # Keep the buffer empty
-            # Apply the logic to the input data, and set the output data
-            m.d.sync += self.o_data.eq(result)
-
-        # (i_n_busy) and (o_n_stb) both true:
-        with m.Elif(i_p_stb_o_p_busyn):
-            # If next stage *is* busy, and not stalled yet, accept input
-            m.d.sync += self.o_p_busy.eq(self.i_p_stb & self.o_n_stb)
-
-        with m.If(o_p_busyn): # not stalled
-            # turns out that from all of the above conditions, just
-            # always put result into buffer if not busy
-            m.d.sync += self.r_data.eq(result)
+                m.d.sync += [self.o.n_valid.eq(1),
+                             self.flush_buffer(),
+                             # clear stall condition, declare register empty.
+                             self.o.p_ready.eq(1),
+                            ]
+                # ignore input, since o.p_ready is also false.
+
+        # (i.n_ready) is false here: next stage is not ready
+        with m.Elif(o_n_validn): # next stage being told "ready"
+            m.d.sync += [self.o.n_valid.eq(self.i.p_valid),
+                         self.o.p_ready.eq(1), # Keep the buffer empty
+                         # set the output data (from comb result)
+                         self.update_output(),
+                        ]
+        # (i.n_ready) false and (o.n_valid) true:
+        with m.Elif(i_p_valid_o_p_ready):
+            # If next stage is *not* ready, and not stalled yet, accept input
+            m.d.sync += self.o.p_ready.eq(~(self.i.p_valid & self.o.n_valid))
  
          return m
  
      def ports(self):
-        return [self.i_p_stb, self.i_n_busy, self.i_data,
-                self.r_data,
-                self.o_n_stb, self.o_p_busy, self.o_data
+        return [self.i.p_valid, self.i.n_ready,
+                self.o.n_valid, self.o.p_ready,
                 ]
  
  
-def testbench(dut):
-    #yield dut.i_p_rst.eq(1)
-    yield dut.i_n_busy.eq(1)
-    yield dut.o_p_busy.eq(1)
-    yield
-    yield
-    #yield dut.i_p_rst.eq(0)
-    yield dut.i_n_busy.eq(0)
-    yield dut.i_data.eq(5)
-    yield dut.i_p_stb.eq(1)
-    yield
-    yield dut.i_data.eq(7)
-    yield
-    yield dut.i_data.eq(2)
-    yield
-    yield dut.i_n_busy.eq(1)
-    yield dut.i_data.eq(9)
-    yield
-    yield dut.i_p_stb.eq(0)
-    yield dut.i_data.eq(12)
-    yield
-    yield dut.i_data.eq(32)
-    yield dut.i_n_busy.eq(0)
-    yield
-    yield
-    yield
-    yield
+class ExampleAddStage:
+    """ an example of how to use the buffered pipeline, as a class instance
+    """
+
+    def ispec(self):
+        """ returns a tuple of input signals which will be the incoming data
+        """
+        return (Signal(16), Signal(16))
+
+    def ospec(self):
+        """ returns an output signal which will happen to contain the sum
+            of the two inputs
+        """
+        return Signal(16)
+
+    def process(self, i):
+        """ process the input data (sums the values in the tuple) and returns it
+        """
+        return i[0] + i[1]
+
+
+class ExampleBufPipeAdd(BufferedPipeline):
+    """ an example of how to use the buffered pipeline, using a class instance
+    """
+
+    def __init__(self):
+        addstage = ExampleAddStage()
+        BufferedPipeline.__init__(self, addstage)
+
+
+class ExampleStage:
+    """ an example of how to use the buffered pipeline, in a static class
+        fashion
+    """
+
+    def ispec():
+        return Signal(16)
+
+    def ospec():
+        return Signal(16)
+
+    def process(i):
+        """ process the input data and returns it (adds 1)
+        """
+        return i + 1
+
+
+class ExampleBufPipe(BufferedPipeline):
+    """ an example of how to use the buffered pipeline.
+    """
+
+    def __init__(self):
+        BufferedPipeline.__init__(self, ExampleStage)
  
  
  if __name__ == '__main__':
-    dut = BufPipe()
+    dut = ExampleBufPipe()
      vl = rtlil.convert(dut, ports=dut.ports())
      with open("test_bufpipe.il", "w") as f:
          f.write(vl)
-    run_simulation(dut, testbench(dut), vcd_name="test_bufpipe.vcd")
-