rather big change to interaction between regfile and compunits on read
author Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Sat, 15 Aug 2020 21:54:50 +0000 (22:54 +0100)
committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Sat, 15 Aug 2020 21:54:50 +0000 (22:54 +0100)
regfiles are now sync-delayed by one clock from "ren".  this means that
a read-request has to be fired off and then excluded from the PriorityPicker
whilst waiting for the output to arrive on the next clock.  *then*
the "go read" signal can be fired, which brings the data (arriving 1 cycle
late from the regfile) "in sync" with its "go read".

src/soc/regfile/regfile.py
src/soc/simple/core.py
src/soc/simple/issuer.py
src/soc/simple/test/test_core.py
src/soc/simple/test/test_issuer.py

index e06695eb3272bc141fc7f3c2291088090613e50c..10b3fa0692922818d6e318b9168f014bb8e87070 100644 (file)
@@ -32,9 +32,10 @@ import operator
 
 
 class Register(Elaboratable):
-    def __init__(self, width, writethru=True):
+    def __init__(self, width, writethru=True, synced=True):
         self.width = width
         self.writethru = writethru
+        self.synced = synced
         self._rdports = []
         self._wrports = []
 
@@ -56,22 +57,26 @@ class Register(Elaboratable):
         m = Module()
         self.reg = reg = Signal(self.width, name="reg")
 
+        if self.synced:
+            domain = m.d.sync
+        else:
+            domain = m.d.comb
+
         # read ports. has write-through detection (returns data written)
         for rp in self._rdports:
-            with m.If(rp.ren == 1):
+            domain += rp.data_o.eq(0)
+            with m.If(rp.ren):
                 if self.writethru:
                     wr_detect = Signal(reset_less=False)
                     m.d.comb += wr_detect.eq(0)
                     for wp in self._wrports:
                         with m.If(wp.wen):
-                            m.d.comb += rp.data_o.eq(wp.data_i)
+                            domain += rp.data_o.eq(wp.data_i)
                             m.d.comb += wr_detect.eq(1)
                     with m.If(~wr_detect):
-                        m.d.comb += rp.data_o.eq(reg)
+                        domain += rp.data_o.eq(reg)
                 else:
-                    m.d.comb += rp.data_o.eq(reg)
-            with m.Else():
-                m.d.comb += rp.data_o.eq(0)
+                    domain += rp.data_o.eq(reg)
 
         # write ports, delayed by 1 cycle
         for wp in self._wrports:
@@ -101,10 +106,12 @@ class RegFileArray(Elaboratable):
         and read-en signals (per port).
     """
 
-    def __init__(self, width, depth):
+    def __init__(self, width, depth, synced=True):
+        self.synced = synced
         self.width = width
         self.depth = depth
-        self.regs = Array(Register(width) for _ in range(self.depth))
+        self.regs = Array(Register(width, synced=synced) \
+                          for _ in range(self.depth))
         self._rdports = []
         self._wrports = []
 
@@ -149,11 +156,22 @@ class RegFileArray(Elaboratable):
         for i, reg in enumerate(self.regs):
             setattr(m.submodules, "reg_%d" % i, reg)
 
+        if self.synced:
+            domain = m.d.sync
+        else:
+            domain = m.d.comb
+
         for (regs, p) in self._rdports:
             #print (p)
             m.d.comb += self._get_en_sig(regs, 'ren').eq(p.ren)
             ror = ortreereduce(list(regs))
-            m.d.comb += p.data_o.eq(ror)
+            if self.synced:
+                ren_delay = Signal.like(p.ren)
+                m.d.sync += ren_delay.eq(p.ren)
+                with m.If(ren_delay):
+                    m.d.comb += p.data_o.eq(ror)
+            else:
+                m.d.comb += p.data_o.eq(ror)
         for (regs, p) in self._wrports:
             m.d.comb += self._get_en_sig(regs, 'wen').eq(p.wen)
             for r in regs:
@@ -171,8 +189,9 @@ class RegFileArray(Elaboratable):
 
 class RegFileMem(Elaboratable):
     unary = False
-    def __init__(self, width, depth, fwd_bus_mode=False):
+    def __init__(self, width, depth, fwd_bus_mode=False, synced=True):
         self.fwd_bus_mode = fwd_bus_mode
+        self.synced = synced
         self.width, self.depth = width, depth
         self.memory = Memory(width=width, depth=depth)
         self._rdports = {}
@@ -183,7 +202,11 @@ class RegFileMem(Elaboratable):
         port = RecordObject([("addr", bsz),
                              ("ren", 1),
                              ("data_o", self.width)], name=name)
-        self._rdports[name] = (port, self.memory.read_port(domain="comb"))
+        if self.synced:
+            domain = "sync"
+        else:
+            domain = "comb"
+        self._rdports[name] = (port, self.memory.read_port(domain=domain))
         return port
 
     def write_port(self, name=None):
@@ -215,7 +238,12 @@ class RegFileMem(Elaboratable):
                     with m.If(~wr_detect):
                         m.d.comb += rp.data_o.eq(rport.data)
             else:
-                with m.If(rp.ren):
+                if self.synced:
+                    ren_delay = Signal.like(rp.ren)
+                    m.d.sync += ren_delay.eq(rp.ren)
+                    with m.If(ren_delay):
+                        m.d.comb += rp.data_o.eq(rport.data)
+                else:
                     m.d.comb += rp.data_o.eq(rport.data)
 
         # write ports, delayed by one cycle (in the memory itself)
@@ -384,7 +412,7 @@ def test_regfile():
 
     run_simulation(dut, regfile_sim(dut, rp, wp), vcd_name='test_regfile.vcd')
 
-    dut = RegFileMem(32, 8)
+    dut = RegFileMem(32, 8, True, False)
     rp = dut.read_port("rp1")
     wp = dut.write_port("wp1")
     vl = rtlil.convert(dut)#, ports=dut.ports())
@@ -393,7 +421,7 @@ def test_regfile():
 
     run_simulation(dut, regfile_sim(dut, rp, wp), vcd_name='test_regmem.vcd')
 
-    dut = RegFileArray(32, 8)
+    dut = RegFileArray(32, 8, False)
     rp1 = dut.read_port("read1")
     rp2 = dut.read_port("read2")
     wp = dut.write_port("write")
index 19f076ba96906b075e77c58b92917321704561c8..8e70feac073e71e44d9fbef17124f77f4c09b938 100644 (file)
@@ -222,29 +222,38 @@ class NonProductionCore(Elaboratable):
                 fu_active = fu_bitdict[funame]
                 name = "%s_%s_%s_%i" % (regfile, rpidx, funame, pi)
                 addr_en = Signal.like(reads[i], name="addr_en_"+name)
-                rp = Signal(name="rp_"+name)
-                pick = Signal()
+                pick = Signal(name="pick_"+name)     # picker input
+                rp = Signal(name="rp_"+name)         # picker output
+                delay_pick = Signal(name="dp_"+name) # read-enable "underway"
 
-                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflags[i])
+                # exclude any currently-enabled read-request (mask out active)
+                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflags[i] &
+                                ~delay_pick)
                 comb += rdpick.i[pi].eq(pick)
-                sync += fu.go_rd_i[idx].eq(rising_edge(m, rp))
+                comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick
+
                 # if picked, select read-port "reg select" number to port
                 comb += rp.eq(rdpick.o[pi] & rdpick.en_o)
+                sync += delay_pick.eq(rp) # delayed "pick"
                 comb += addr_en.eq(Mux(rp, reads[i], 0))
+
+                # the read-enable happens combinatorially (see mux-bus below)
+                # but it results in the data coming out on a one-cycle delay.
                 if rfile.unary:
                     rens.append(addr_en)
                 else:
                     addrs.append(addr_en)
                     rens.append(rp)
 
-                with m.If(rp):
+                # use the *delayed* pick signal to put requested data onto bus
+                with m.If(delay_pick):
                     # connect regfile port to input, creating fan-out Bus
                     src = fu.src_i[idx]
                     print("reg connect widths",
                           regfile, regname, pi, funame,
                           src.shape(), rport.data_o.shape())
                     # all FUs connect to same port
-                    sync += src.eq(rport.data_o)
+                    comb += src.eq(rport.data_o)
 
         # or-reduce the muxed read signals
         if rfile.unary:
index 0515611938d21afd4268f4c8355b40c251f79d53..e6f837e4d0835f867bfb20453443c3434ffeb90d 100644 (file)
@@ -135,20 +135,24 @@ class TestIssuer(Elaboratable):
 
         # read the PC
         pc = Signal(64, reset_less=True)
+        pc_ok_delay = Signal()
+        sync += pc_ok_delay.eq(~self.pc_i.ok)
         with m.If(self.pc_i.ok):
             # incoming override (start from pc_i)
             comb += pc.eq(self.pc_i.data)
         with m.Else():
-            # otherwise read StateRegs regfile for PC
+            # otherwise read StateRegs regfile for PC...
             comb += self.state_r_pc.ren.eq(1<<StateRegs.PC)
+        # ... but on a 1-clock delay
+        with m.If(pc_ok_delay):
             comb += pc.eq(self.state_r_pc.data_o)
 
         # don't write pc every cycle
-        sync += self.state_w_pc.wen.eq(0)
-        sync += self.state_w_pc.data_i.eq(0)
+        comb += self.state_w_pc.wen.eq(0)
+        comb += self.state_w_pc.data_i.eq(0)
 
         # don't read msr every cycle
-        sync += self.state_r_msr.ren.eq(0)
+        comb += self.state_r_msr.ren.eq(0)
 
         # connect up debug signals
         # TODO comb += core.icache_rst_i.eq(dbg.icache_rst_o)
@@ -187,14 +191,15 @@ class TestIssuer(Elaboratable):
                     comb += self.imem.f_valid_i.eq(1)
                     sync += cur_state.pc.eq(pc)
 
-                    # read MSR, latch it, and put it in decode "state"
-                    sync += self.state_r_msr.ren.eq(1<<StateRegs.MSR)
-                    sync += cur_state.msr.eq(self.state_r_msr.data_o)
+                    # initiate read of MSR
+                    comb += self.state_r_msr.ren.eq(1<<StateRegs.MSR)
 
                     m.next = "INSN_READ" # move to "wait for bus" phase
 
             # dummy pause to find out why simulation is not keeping up
             with m.State("INSN_READ"):
+                # one cycle later, msr read arrives
+                sync += cur_state.msr.eq(self.state_r_msr.data_o)
                 with m.If(self.imem.f_busy_o): # zzz...
                     # busy: stay in wait-read
                     comb += self.imem.a_valid_i.eq(1)
@@ -232,14 +237,15 @@ class TestIssuer(Elaboratable):
                     # this just blithely overwrites whatever pipeline
                     # updated the PC
                     with m.If(~pc_changed):
-                        sync += self.state_w_pc.wen.eq(1<<StateRegs.PC)
-                        sync += self.state_w_pc.data_i.eq(nia)
+                        comb += self.state_w_pc.wen.eq(1<<StateRegs.PC)
+                        comb += self.state_w_pc.data_i.eq(nia)
                     sync += core.e.eq(0)
                     m.next = "IDLE" # back to idle
 
         # this bit doesn't have to be in the FSM: connect up to read
         # regfiles on demand from DMI
-
+        sync += d_reg.ack.eq(0)
+        sync += d_reg.data.eq(0)
         with m.If(d_reg.req): # request for regfile access being made
             # TODO: error-check this
             # XXX should this be combinatorial?  sync better?
@@ -248,8 +254,9 @@ class TestIssuer(Elaboratable):
             else:
                 comb += self.int_r.addr.eq(d_reg.addr)
                 comb += self.int_r.ren.eq(1)
-            comb += d_reg.data.eq(self.int_r.data_o)
-            comb += d_reg.ack.eq(1)
+            # data arrives one clock later
+            sync += d_reg.data.eq(self.int_r.data_o)
+            sync += d_reg.ack.eq(1)
 
         return m
 
index af7487926cfbbf7c887d2e626479b737a4d15a12..9d751cf016eccb5fdf104cea802edb4832dbe59e 100644 (file)
@@ -49,18 +49,18 @@ def setup_regs(pdecode2, core, test):
     cr = test.cr
     crregs = core.regs.cr
     #cr = int('{:32b}'.format(cr)[::-1], 2)
-    print("cr reg", hex(cr))
+    print("setup cr reg", hex(cr))
     for i in range(8):
         #j = 7-i
         cri = (cr >> (i*4)) & 0xf
         #cri = int('{:04b}'.format(cri)[::-1], 2)
-        print("cr reg", hex(cri), i,
+        print("setup cr reg", hex(cri), i,
               crregs.regs[i].reg.shape())
         yield crregs.regs[i].reg.eq(cri)
 
     # set up XER.  "direct" write (bypass rd/write ports)
     xregs = core.regs.xer
-    print("sprs", test.sprs)
+    print("setup sprs", test.sprs)
     xer = None
     if 'XER' in test.sprs:
         xer = test.sprs['XER']
index 3e4e236a4f654fbcf2b4daeecab7b68f34a5fcc4..c4877940067237be08b12cdd9a23e9bf070f772c 100644 (file)
@@ -247,6 +247,16 @@ class TestRunner(FHDLTestCase):
                     terminated = yield issuer.dbg.terminated_o
                     print("terminated", terminated)
 
+                    if index >= len(instructions):
+                        print ("index over, send dmi stop")
+                        # stop at end
+                        yield from set_dmi(dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
+                        yield
+                        yield
+
+                    # wait one cycle for registers to settle
+                    yield
+
                     # register check
                     yield from check_regs(self, sim, core, test, code)
 
@@ -254,9 +264,15 @@ class TestRunner(FHDLTestCase):
                     yield from check_sim_memory(self, l0, sim, code)
 
                     terminated = yield issuer.dbg.terminated_o
+                    print("terminated(2)", terminated)
                     if terminated:
                         break
 
+                # stop at end
+                yield from set_dmi(dmi, DBGCore.CTRL, 1<<DBGCtrl.STOP)
+                yield
+                yield
+
                 # test of dmi reg get
                 int_reg = 9
                 yield from set_dmi(dmi, DBGCore.GSPR_IDX, int_reg) # int reg 9