Merge branch 'master' of ssh://git.libre-riscv.org:922/soc
authorTobias Platen <tplaten@posteo.de>
Tue, 30 Nov 2021 16:52:40 +0000 (17:52 +0100)
committerTobias Platen <tplaten@posteo.de>
Tue, 30 Nov 2021 16:52:40 +0000 (17:52 +0100)
src/soc/regfile/regfiles.py
src/soc/simple/core.py
src/soc/simple/test/test_core.py
src/soc/simple/test/teststate.py

index 58c7526f0665437f62e0c4f636b28fe75b954784..28f8172d74774bdc8c0a95a4406a21520256f7d6 100644 (file)
@@ -98,7 +98,7 @@ class IntRegs(RegFileMem): #class IntRegs(RegFileArray):
     * write-through capability (read on same cycle as write)
     """
     def __init__(self, svp64_en=False, regreduce_en=False):
-        super().__init__(64, 32, fwd_bus_mode=not regreduce_en)
+        super().__init__(64, 32, fwd_bus_mode=False)
         self.svp64_en = svp64_en
         self.regreduce_en = regreduce_en
         wr_spec, rd_spec = self.get_port_specs()
@@ -135,7 +135,7 @@ class FastRegs(RegFileMem, FastRegsEnum): #RegFileArray):
     Note: r/w issue are used by issuer to increment/decrement TB/DEC.
     """
     def __init__(self, svp64_en=False, regreduce_en=False):
-        super().__init__(64, FastRegsEnum.N_REGS, fwd_bus_mode=not regreduce_en)
+        super().__init__(64, FastRegsEnum.N_REGS, fwd_bus_mode=False)
         self.svp64_en = svp64_en
         self.regreduce_en = regreduce_en
         wr_spec, rd_spec = self.get_port_specs()
@@ -234,7 +234,7 @@ class SPRRegs(RegFileMem):
         else:
             n_sprs = len(SPRfull)
         super().__init__(width=64, depth=n_sprs,
-                         fwd_bus_mode=not regreduce_en)
+                         fwd_bus_mode=False)
         self.svp64_en = svp64_en
         self.regreduce_en = regreduce_en
         wr_spec, rd_spec = self.get_port_specs()
index bd770a94863e68501943e01a406483b43e01a7f6..825c7d4d92453eb52ced901a800a6fb0adafdf3a 100644 (file)
@@ -226,7 +226,7 @@ class NonProductionCore(ControlBase):
             # connect each satellite decoder and give it the instruction.
             # as subset decoders this massively reduces wire fanout given
             # the large number of ALUs
-            setattr(m.submodules, "dec_%s" % v.fn_name, v)
+            m.submodules["dec_%s" % v.fn_name] = v
             comb += v.dec.raw_opcode_in.eq(self.ireg.raw_insn_i)
             comb += v.dec.bigendian.eq(self.ireg.bigendian_i)
             # sigh due to SVP64 RA_OR_ZERO detection connect these too
@@ -427,6 +427,18 @@ class NonProductionCore(ControlBase):
             # and resolved
             with m.If(self.issue_conflict):
                 comb += busy_o.eq(1)
+            # make sure that LDST, SPR, MMU, Branch and Trap all say "busy"
+            # and do not allow overlap.  these are all the ones that
+            # are non-forward-progressing: exceptions etc. that otherwise
+            # change CoreState for some reason (MSR, PC, SVSTATE)
+            for funame, fu in fus.items():
+                if (funame.lower().startswith('ldst') or
+                    funame.lower().startswith('branch') or
+                    funame.lower().startswith('mmu') or
+                    funame.lower().startswith('spr') or
+                    funame.lower().startswith('trap')):
+                    with m.If(fu.busy_o):
+                        comb += busy_o.eq(1)
 
         # return both the function unit "enable" dict as well as the "busy".
         # the "busy-or-issued" can be passed in to the Read/Write port
@@ -470,10 +482,10 @@ class NonProductionCore(ControlBase):
                 (fspec.rdport, fspec.wrport, fspec.read, fspec.write,
                  fspec.wid, fspec.specs)
             print ("fpsec", i, fspec, len(fuspecs))
+            name = "%s_%s_%d" % (regfile, regname, i)
             ppoffs.append(pplen) # record offset for picker
             pplen += len(fspec.specs)
-            name = "rdflag_%s_%s_%d" % (regfile, regname, i)
-            rdflag = Signal(name=name, reset_less=True)
+            rdflag = Signal(name="rdflag_"+name, reset_less=True)
             comb += rdflag.eq(fspec.rdport)
             rdflags.append(rdflag)
 
@@ -481,7 +493,7 @@ class NonProductionCore(ControlBase):
 
         # create a priority picker to manage this port
         rdpickers[regfile][rpidx] = rdpick = PriorityPicker(pplen)
-        setattr(m.submodules, "rdpick_%s_%s" % (regfile, rpidx), rdpick)
+        m.submodules["rdpick_%s_%s" % (regfile, rpidx)] = rdpick
 
         rens = []
         addrs = []
@@ -501,8 +513,19 @@ class NonProductionCore(ControlBase):
                 fu_issued = fu_bitdict[funame]
 
                 # get (or set up) a latched copy of read register number
+                # and (sigh) also the read-ok flag
                 rname = "%s_%s_%s_%d" % (funame, regfile, regname, pi)
+                rhname = "%s_%s_%d" % (regfile, regname, i)
                 read = Signal.like(_read, name="read_"+name)
+                rdflag = Signal(name="rdflag_%s_%s" % (funame, rhname),
+                                reset_less=True)
+                if rhname not in fu.rf_latches:
+                    rfl = Signal(name="rdflag_latch_"+rname)
+                    fu.rf_latches[rhname] = rfl
+                    with m.If(fu.issue_i):
+                        sync += rfl.eq(rdflags[i])
+                else:
+                    rfl = fu.rf_latches[rhname]
                 if rname not in fu.rd_latches:
                     rdl = Signal.like(_read, name="rdlatch_"+rname)
                     fu.rd_latches[rname] = rdl
@@ -514,8 +537,10 @@ class NonProductionCore(ControlBase):
                 # after the read cycle, use the latched copy
                 with m.If(fu.issue_i):
                     comb += read.eq(_read)
+                    comb += rdflag.eq(rdflags[i])
                 with m.Else():
                     comb += read.eq(rdl)
+                    comb += rdflag.eq(rfl)
 
                 # connect request-read to picker input, and output to go-rd
                 addr_en = Signal.like(read, name="addr_en_"+name)
@@ -526,7 +551,7 @@ class NonProductionCore(ControlBase):
 
                 # exclude any currently-enabled read-request (mask out active)
                 # entirely block anything hazarded from being picked
-                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflags[i] &
+                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflag &
                                 ~delay_pick & ~rhazard)
                 comb += rdpick.i[pi].eq(pick)
                 comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick
@@ -561,7 +586,7 @@ class NonProductionCore(ControlBase):
                 wvchk_en = Signal(len(wvchk), name="wv_chk_addr_en_"+name)
                 issue_active = Signal(name="rd_iactive_"+name)
                 # XXX combinatorial loop here
-                comb += issue_active.eq(fu_active & rf)
+                comb += issue_active.eq(fu_active & rdflag)
                 with m.If(issue_active):
                     if rfile.unary:
                         comb += wvchk_en.eq(read)
@@ -740,7 +765,7 @@ class NonProductionCore(ControlBase):
             wvset = wv.s # write-vec bit-level hazard ctrl
             wvclr = wv.r # write-vec bit-level hazard ctrl
             wvchk = wv.q # write-after-write hazard check
-            wvchk_qint = wv.q_int # write-after-write hazard check, delayed
+            wvchk_qint = wv.q # write-after-write hazard check, NOT delayed
 
         fspecs = fspec
         if not isinstance(fspecs, list):
@@ -776,7 +801,7 @@ class NonProductionCore(ControlBase):
 
         # create a priority picker to manage this port
         wrpickers[regfile][rpidx] = wrpick = PriorityPicker(pplen)
-        setattr(m.submodules, "wrpick_%s_%s" % (regfile, rpidx), wrpick)
+        m.submodules["wrpick_%s_%s" % (regfile, rpidx)] = wrpick
 
         wsigs = []
         wens = []
@@ -911,11 +936,19 @@ class NonProductionCore(ControlBase):
             comb += wport.wen.eq(ortreereduce_sig(wens))
 
         if not self.make_hazard_vecs:
-            return
-
-        # for write-vectors
-        comb += wvclr.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
-        comb += wvset.eq(ortreereduce_sig(wvseten)) # set (issue time)
+            return [], []
+
+        # return these here rather than set wvclr/wvset directly,
+        # because there may be more than one write-port to a given
+        # regfile.  example: XER has a write-port for SO, CA, and OV
+        # and the *last one added* of those would overwrite the other
+        # two.  solution: have connect_wrports collate all the
+        # or-tree-reduced bitvector set/clear requests and drop them
+        # in as a single "thing".  this can only be done because the
+        # set/get is an unary bitvector.
+        print ("make write-vecs", regfile, regname, wvset, wvclr)
+        return (ortreereduce_sig(wvclren), # clear (regfile write)
+                ortreereduce_sig(wvseten)) # set (issue time)
 
     def connect_wrports(self, m, fu_bitdict, fu_selected):
         """connect write ports
@@ -937,6 +970,8 @@ class NonProductionCore(ControlBase):
         # same for write ports.
         # BLECH!  complex code-duplication! BLECH!
         wrpickers = {}
+        wvclrers = defaultdict(list)
+        wvseters = defaultdict(list)
         for regfile, spec in byregfiles_wr.items():
             fuspecs = byregfiles_wrspec[regfile]
             wrpickers[regfile] = {}
@@ -953,9 +988,33 @@ class NonProductionCore(ControlBase):
                     if 'fast3' in fuspecs:
                         fuspecs['fast1'].append(fuspecs.pop('fast3'))
 
+            # collate these and record them by regfile because there
+            # are sometimes more write-ports per regfile
             for (regname, fspec) in sort_fuspecs(fuspecs):
-                self.connect_wrport(m, fu_bitdict, fu_selected, wrpickers,
+                wvclren, wvseten = self.connect_wrport(m,
+                                        fu_bitdict, fu_selected,
+                                        wrpickers,
                                         regfile, regname, fspec)
+                wvclrers[regfile.lower()].append(wvclren)
+                wvseters[regfile.lower()].append(wvseten)
+
+        if not self.make_hazard_vecs:
+            return
+
+        # for write-vectors: reduce the clr-ers and set-ers down to
+        # a single set of bits.  otherwise if there are two write
+        # ports (on some regfiles), the last one doing comb += on
+        # the reg.wv[regfile] instance "wins" (and all others are ignored,
+        # whoops).  if there was only one write-port per wv regfile this would
+        # not be an issue.
+        for regfile in wvclrers.keys():
+            wv = regs.wv[regfile]
+            wvset = wv.s # write-vec bit-level hazard ctrl
+            wvclr = wv.r # write-vec bit-level hazard ctrl
+            wvclren = wvclrers[regfile]
+            wvseten = wvseters[regfile]
+            comb += wvclr.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
+            comb += wvset.eq(ortreereduce_sig(wvseten)) # set (issue time)
 
     def get_byregfiles(self, readmode):
 
@@ -976,7 +1035,8 @@ class NonProductionCore(ControlBase):
             # the issue there is that this function is actually better
             # suited at the moment
             if readmode:
-                fu.rd_latches = {}
+                fu.rd_latches = {} # read reg number latches
+                fu.rf_latches = {} # read flag latches
             else:
                 fu.wr_latches = {}
 
@@ -1048,6 +1108,7 @@ if __name__ == '__main__':
     pspec = TestMemPspec(ldst_ifacetype='testpi',
                          imem_ifacetype='',
                          addr_wid=48,
+                         allow_overlap=True,
                          mask_wid=8,
                          reg_wid=64)
     dut = NonProductionCore(pspec)
index dfafb2cbb38a54ec138d0fde9b7cb1b7b55ac8e0..479299ce6a08cc750ceece5d6a531fdc04f6e650 100644 (file)
@@ -19,6 +19,8 @@ from openpower.decoder.power_decoder import create_pdecode
 from openpower.decoder.power_decoder2 import PowerDecode2
 from openpower.decoder.selectable_int import SelectableInt
 from openpower.decoder.isa.all import ISA
+from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
+from openpower.state import CoreState
 
 # note that using SPRreduced has to be done to match the
 # PowerDecoder2 SPR map
@@ -26,6 +28,7 @@ from openpower.decoder.power_enums import SPRreduced as SPR
 from openpower.decoder.power_enums import spr_dict, Function, XER_bits
 from soc.config.test.test_loadstore import TestMemPspec
 from openpower.endian import bigendian
+from soc.regfile.regfiles import StateRegs
 
 from soc.simple.core import NonProductionCore
 from soc.experiment.compalu_multi import find_ok  # hack
@@ -40,6 +43,7 @@ from soc.fu.shift_rot.test.test_pipe_caller import ShiftRotTestCase
 from soc.fu.cr.test.test_pipe_caller import CRTestCase
 from soc.fu.branch.test.test_pipe_caller import BranchTestCase
 from soc.fu.ldst.test.test_pipe_caller import LDSTTestCase
+from openpower.test.general.overlap_hazards import HazardTestCase
 from openpower.util import spr_to_fast_reg
 
 from openpower.consts import StateRegsEnum
@@ -189,8 +193,8 @@ def set_issue(core, dec2, sim):
 
 def wait_for_busy_clear(cu):
     while True:
-        busy_o = yield cu.busy_o
-        terminate_o = yield cu.core_terminate_o
+        busy_o = yield cu.o.busy_o
+        terminate_o = yield cu.o.core_terminate_o
         if not busy_o:
             print("busy/terminate:", busy_o, terminate_o)
             break
@@ -207,32 +211,48 @@ class TestRunner(FHDLTestCase):
         m = Module()
         comb = m.d.comb
         instruction = Signal(32)
-        ivalid_i = Signal()
 
         pspec = TestMemPspec(ldst_ifacetype='testpi',
                              imem_ifacetype='',
                              addr_wid=48,
                              mask_wid=8,
+                             allow_overlap=True,
                              reg_wid=64)
 
+        cur_state = CoreState("cur") # current state (MSR/PC/SVSTATE)
+        pdecode2 = PowerDecode2(None, state=cur_state,
+                                     #opkls=IssuerDecode2ToOperand,
+                                     svp64_en=True, # self.svp64_en,
+                                     regreduce_en=False, #self.regreduce_en
+                                    )
+
         m.submodules.core = core = NonProductionCore(pspec)
-        pdecode2 = core.pdecode2
+        m.submodules.pdecode2 = pdecode2
+        core.pdecode2 = pdecode2
         l0 = core.l0
 
-        comb += core.raw_opcode_i.eq(instruction)
-        comb += core.ivalid_i.eq(ivalid_i)
+        comb += pdecode2.dec.raw_opcode_in.eq(instruction)
+        comb += pdecode2.dec.bigendian.eq(bigendian)  # little / big?
+        comb += core.i.e.eq(pdecode2.e)
+        comb += core.i.state.eq(cur_state)
+        comb += core.i.raw_insn_i.eq(instruction)
+        comb += core.i.bigendian_i.eq(bigendian)
+
+        # set the PC StateRegs read port to always send back the PC
+        stateregs = core.regs.state
+        pc_regnum = StateRegs.PC
+        comb += stateregs.r_ports['cia'].ren.eq(1<<pc_regnum)
 
         # temporary hack: says "go" immediately for both address gen and ST
         ldst = core.fus.fus['ldst0']
-        m.d.comb += ldst.ad.go.eq(ldst.ad.rel)  # link addr-go direct to rel
-        m.d.comb += ldst.st.go.eq(ldst.st.rel)  # link store-go direct to rel
+        m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o)  # link addr-go to rel
+        m.d.comb += ldst.st.go_i.eq(ldst.st.rel_o)  # link store-go to rel
 
         # nmigen Simulation
         sim = Simulator(m)
         sim.add_clock(1e-6)
 
         def process():
-            yield core.issue_i.eq(0)
             yield
 
             for test in self.test_data:
@@ -247,7 +267,7 @@ class TestRunner(FHDLTestCase):
                     instructions = list(zip(gen, program.assembly.splitlines()))
 
                     yield from setup_tst_memory(l0, test.mem)
-                    yield from setup_regs(core, test)
+                    yield from setup_regs(pdecode2, core, test)
 
                     index = sim.pc.CIA.value // 4
                     while index < len(instructions):
@@ -257,27 +277,53 @@ class TestRunner(FHDLTestCase):
                         print(code)
 
                         # ask the decoder to decode this binary data (endian'd)
-                        yield core.bigendian_i.eq(bigendian)  # little / big?
                         yield instruction.eq(ins)          # raw binary instr.
-                        yield ivalid_i.eq(1)
                         yield Settle()
-                        # fn_unit = yield pdecode2.e.fn_unit
-                        #fuval = self.funit.value
-                        #self.assertEqual(fn_unit & fuval, fuval)
-
-                        # set operand and get inputs
-                        yield from set_issue(core, pdecode2, sim)
-                        yield Settle()
-
-                        yield from wait_for_busy_clear(core)
-                        yield ivalid_i.eq(0)
-                        yield
 
                         print("sim", code)
                         # call simulated operation
                         opname = code.split(' ')[0]
                         yield from sim.call(opname)
-                        index = sim.pc.CIA.value // 4
+                        pc = sim.pc.CIA.value
+                        nia = sim.pc.NIA.value
+                        index = pc // 4
+
+                        # set the PC to the same simulated value
+                        # (core is not able to do this itself, except
+                        # for branch / TRAP)
+                        print ("after call, pc nia", pc, nia)
+                        yield stateregs.regs[pc_regnum].reg.eq(pc)
+                        yield Settle()
+
+                        yield core.p.i_valid.eq(1)
+                        yield
+                        o_ready = yield core.p.o_ready
+                        while True:
+                            if o_ready:
+                                break
+                            yield
+                            o_ready = yield core.p.o_ready
+                        yield core.p.i_valid.eq(0)
+
+                        # set operand and get inputs
+                        yield from wait_for_busy_clear(core)
+
+                        # synchronised (non-overlap) is fine to check
+                        if not core.allow_overlap:
+                            # register check
+                            yield from check_regs(self, sim, core, test, code)
+
+                            # Memory check
+                            yield from check_mem(self, sim, core, test, code)
+
+                    # non-overlap mode is only fine to check right at the end
+                    if core.allow_overlap:
+                        # wait until all settled
+                        # XXX really this should be in DMI, which should in turn
+                        # use issuer.any_busy to not send back "stopped" signal
+                        while (yield core.o.any_busy_o):
+                            yield
+                        yield Settle()
 
                         # register check
                         yield from check_regs(self, sim, core, test, code)
@@ -285,6 +331,10 @@ class TestRunner(FHDLTestCase):
                         # Memory check
                         yield from check_mem(self, sim, core, test, code)
 
+            # give a couple extra clock cycles for gtkwave display to be happy
+            yield
+            yield
+
         sim.add_sync_process(process)
         with sim.write_vcd("core_simulator.vcd", "core_simulator.gtkw",
                            traces=[]):
@@ -294,12 +344,13 @@ class TestRunner(FHDLTestCase):
 if __name__ == "__main__":
     unittest.main(exit=False)
     suite = unittest.TestSuite()
-    suite.addTest(TestRunner(LDSTTestCase().test_data))
-    suite.addTest(TestRunner(CRTestCase().test_data))
-    suite.addTest(TestRunner(ShiftRotTestCase().test_data))
+    suite.addTest(TestRunner(HazardTestCase().test_data))
+    #suite.addTest(TestRunner(LDSTTestCase().test_data))
+    #suite.addTest(TestRunner(CRTestCase().test_data))
+    #suite.addTest(TestRunner(ShiftRotTestCase().test_data))
     suite.addTest(TestRunner(LogicalTestCase().test_data))
     suite.addTest(TestRunner(ALUTestCase().test_data))
-    suite.addTest(TestRunner(BranchTestCase().test_data))
+    #suite.addTest(TestRunner(BranchTestCase().test_data))
 
     runner = unittest.TextTestRunner()
     runner.run(suite)
index 4fd1ab3b7f0e5dd4fa9fe1461ef907a9cb20701d..7da358ea7af1c51e7ce46e8635d39ff3a2f7a0a9 100644 (file)
@@ -48,6 +48,7 @@ class HDLState(State):
     def get_pc(self):
         self.pcl = []
         self.state = self.core.regs.state
+        # relies on the state.r_port being permanently held as PC
         self.pc = yield self.state.r_ports['cia'].o_data
         self.pcl.append(self.pc)
         log("class hdl pc", hex(self.pc))