X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fadd%2Fexample_buf_pipe.py;h=6db85fc4e7e4ccedf8967cf6eb6be00077222c75;hb=c10d9619880099356e760c4ae45c8a0b18d1aeac;hp=b72e1c43904451ba0ef7f9fa78d5417da8de0a8d;hpb=0e70fec7c3df1ee97020aa5be6f358c85898a5fb;p=ieee754fpu.git diff --git a/src/add/example_buf_pipe.py b/src/add/example_buf_pipe.py index b72e1c43..6db85fc4 100644 --- a/src/add/example_buf_pipe.py +++ b/src/add/example_buf_pipe.py @@ -1,11 +1,59 @@ """ nmigen implementation of buffered pipeline stage, based on zipcpu: https://zipcpu.com/blog/2017/08/14/strategies-for-pipelining.html + + this module requires quite a bit of thought to understand how it works + (and why it is needed in the first place). reading the above is + *strongly* recommended. + + unlike john dawson's IEEE754 FPU STB/ACK signalling, which requires + the STB / ACK signals to raise and lower (on separate clocks) before + data may proceed (thus only allowing one piece of data to proceed + on *ALTERNATE* cycles), the signalling here is a true pipeline + where data will flow on *every* clock when the conditions are right. + + input acceptance conditions are when: + * incoming previous-stage strobe (i_p_stb) is HIGH + * outgoing previous-stage busy (o_p_busy) is LOW + + output transmission conditions are when: + * outgoing next-stage strobe (o_n_stb) is HIGH + * incoming next-stage busy (i_n_busy) is LOW + + the tricky bit is when the input has valid data and the output is not + ready to accept it. if it wasn't for the clock synchronisation, it + would be possible to tell the input "hey don't send that data, we're + not ready". unfortunately, it's not possible to "change the past": + the previous stage *has no choice* but to pass on its data. + + therefore, the incoming data *must* be accepted - and stored. + on the same clock, it's possible to tell the input that it must + not send any more data. this is the "stall" condition. 
+ + we now effectively have *two* possible pieces of data to "choose" from: + the buffered data, and the incoming data. the decision as to which + to process and output is based on whether we are in "stall" or not. + i.e. when the next stage is no longer busy, the output comes from + the buffer if a stall had previously occurred, otherwise it comes + direct from processing the input. + + it's quite a complex state machine! """ + from nmigen import Signal, Cat, Const, Mux, Module from nmigen.compat.sim import run_simulation from nmigen.cli import verilog, rtlil class BufPipe: + """ buffered pipeline stage + + stage-1 i_p_stb >>in stage o_n_stb out>> stage+1 + stage-1 o_p_busy <>in stage o_data out>> stage+1 + | | + +-------> process + | | + +-- r_data ---+ + """ def __init__(self): # input #self.i_p_rst = Signal() # >>in - comes in from PREVIOUS stage @@ -31,42 +79,57 @@ class BufPipe: def elaborate(self, platform): m = Module() + # establish some combinatorial temporaries + o_p_busyn = Signal(reset_less=True) + o_n_stbn = Signal(reset_less=True) + i_n_busyn = Signal(reset_less=True) i_p_stb_o_p_busyn = Signal(reset_less=True) - m.d.comb += i_p_stb_o_p_busyn.eq(self.i_p_stb & (~self.o_p_busy)) + m.d.comb += [i_n_busyn.eq(~self.i_n_busy), + o_n_stbn.eq(~self.o_n_stb), + o_p_busyn.eq(~self.o_p_busy), + i_p_stb_o_p_busyn.eq(self.i_p_stb & o_p_busyn), + ] + + # store result of processing in combinatorial temporary + result = Signal(32) + m.d.comb += result.eq(self.process(self.i_data)) + with m.If(o_p_busyn): # not stalled + m.d.sync += self.r_data.eq(result) #with m.If(self.i_p_rst): # reset # m.d.sync += self.o_n_stb.eq(0) # m.d.sync += self.o_p_busy.eq(0) - with m.If(~self.i_n_busy): # previous stage is not busy - with m.If(~self.o_p_busy): # not stalled + with m.If(i_n_busyn): # next stage is not busy + with m.If(o_p_busyn): # not stalled # nothing in buffer: send input direct to output - m.d.sync += self.o_n_stb.eq(self.i_p_stb) - m.d.sync += 
self.o_data.eq(self.process(self.i_data)) + m.d.sync += [self.o_n_stb.eq(self.i_p_stb), + self.o_data.eq(result), + ] with m.Else(): # o_p_busy is true, and something is in our buffer. - # Flush the buffer to the output port. - m.d.sync += self.o_n_stb.eq(1) - m.d.sync += self.o_data.eq(self.r_data) + # Flush the [already processed] buffer to the output port. + m.d.sync += [self.o_n_stb.eq(1), + self.o_data.eq(self.r_data), + # clear stall condition, declare register empty. + self.o_p_busy.eq(0), + ] # ignore input, since o_p_busy is also true. - # also clear stall condition, declare register to be empty. - m.d.sync += self.o_p_busy.eq(0) - - # (i_n_busy) is true here: previous stage is busy - with m.Elif(~self.o_n_stb): # next stage being told "not busy" - m.d.sync += self.o_n_stb.eq(self.i_p_stb) - m.d.sync += self.o_p_busy.eq(0) # Keep the buffer empty - # Apply the logic to the input data, and set the output data - m.d.sync += self.o_data.eq(self.process(self.i_data)) + # (i_n_busy) is true here: next stage is busy + with m.Elif(o_n_stbn): # next stage being told "not busy" + m.d.sync += [self.o_n_stb.eq(self.i_p_stb), + self.o_p_busy.eq(0), # Keep the buffer empty + # set the output data (from comb result) + self.o_data.eq(result), + ] # (i_n_busy) and (o_n_stb) both true: with m.Elif(i_p_stb_o_p_busyn): - # If next stage *is* busy, and not stalled yet, accept requested - # input and store in temporary + # If next stage *is* busy, and not stalled yet, accept input m.d.sync += self.o_p_busy.eq(self.i_p_stb & self.o_n_stb) - with m.If(~self.o_n_stb): - m.d.sync += self.r_data.eq(self.i_data) - with m.If(~self.o_p_busy): # not stalled - m.d.sync += self.r_data.eq(self.pre_process(self.i_data)) + with m.If(o_p_busyn): # not stalled + # turns out that from all of the above conditions, just + # always put result into buffer if not busy + m.d.sync += self.r_data.eq(result) return m @@ -95,12 +158,15 @@ def testbench(dut): yield dut.i_n_busy.eq(1) yield 
dut.i_data.eq(9) yield + yield dut.i_p_stb.eq(0) yield dut.i_data.eq(12) yield + yield dut.i_data.eq(32) yield dut.i_n_busy.eq(0) yield yield yield + yield if __name__ == '__main__':