Merge branch 'master' of ssh://git.libre-riscv.org:922/soc

author Tobias Platen <tplaten@posteo.de>

Tue, 30 Nov 2021 16:52:40 +0000 (17:52 +0100)

committer Tobias Platen <tplaten@posteo.de>

Tue, 30 Nov 2021 16:52:40 +0000 (17:52 +0100)
author Tobias Platen <tplaten@posteo.de>
Tue, 30 Nov 2021 16:52:40 +0000 (17:52 +0100)
committer Tobias Platen <tplaten@posteo.de>
Tue, 30 Nov 2021 16:52:40 +0000 (17:52 +0100)
diff --git a/src/soc/regfile/regfiles.py b/src/soc/regfile/regfiles.py

index 58c7526f0665437f62e0c4f636b28fe75b954784..28f8172d74774bdc8c0a95a4406a21520256f7d6 100644 (file)
--- a/src/soc/regfile/regfiles.py
+++ b/src/soc/regfile/regfiles.py
@@ -98,7 +98,7 @@ class IntRegs(RegFileMem): #class IntRegs(RegFileArray):
      * write-through capability (read on same cycle as write)
      """
      def __init__(self, svp64_en=False, regreduce_en=False):
-        super().__init__(64, 32, fwd_bus_mode=not regreduce_en)
+        super().__init__(64, 32, fwd_bus_mode=False)
          self.svp64_en = svp64_en
          self.regreduce_en = regreduce_en
          wr_spec, rd_spec = self.get_port_specs()
@@ -135,7 +135,7 @@ class FastRegs(RegFileMem, FastRegsEnum): #RegFileArray):
      Note: r/w issue are used by issuer to increment/decrement TB/DEC.
      """
      def __init__(self, svp64_en=False, regreduce_en=False):
-        super().__init__(64, FastRegsEnum.N_REGS, fwd_bus_mode=not regreduce_en)
+        super().__init__(64, FastRegsEnum.N_REGS, fwd_bus_mode=False)
          self.svp64_en = svp64_en
          self.regreduce_en = regreduce_en
          wr_spec, rd_spec = self.get_port_specs()
@@ -234,7 +234,7 @@ class SPRRegs(RegFileMem):
          else:
              n_sprs = len(SPRfull)
          super().__init__(width=64, depth=n_sprs,
-                         fwd_bus_mode=not regreduce_en)
+                         fwd_bus_mode=False)
          self.svp64_en = svp64_en
          self.regreduce_en = regreduce_en
          wr_spec, rd_spec = self.get_port_specs()
diff --git a/src/soc/simple/core.py b/src/soc/simple/core.py

index bd770a94863e68501943e01a406483b43e01a7f6..825c7d4d92453eb52ced901a800a6fb0adafdf3a 100644 (file)
--- a/src/soc/simple/core.py
+++ b/src/soc/simple/core.py
@@ -226,7 +226,7 @@ class NonProductionCore(ControlBase):
              # connect each satellite decoder and give it the instruction.
              # as subset decoders this massively reduces wire fanout given
              # the large number of ALUs
-            setattr(m.submodules, "dec_%s" % v.fn_name, v)
+            m.submodules["dec_%s" % v.fn_name] = v
              comb += v.dec.raw_opcode_in.eq(self.ireg.raw_insn_i)
              comb += v.dec.bigendian.eq(self.ireg.bigendian_i)
              # sigh due to SVP64 RA_OR_ZERO detection connect these too
@@ -427,6 +427,18 @@ class NonProductionCore(ControlBase):
              # and resolved
              with m.If(self.issue_conflict):
                  comb += busy_o.eq(1)
+            # make sure that LDST, SPR, MMU, Branch and Trap all say "busy"
+            # and do not allow overlap.  these are all the ones that
+            # are non-forward-progressing: exceptions etc. that otherwise
+            # change CoreState for some reason (MSR, PC, SVSTATE)
+            for funame, fu in fus.items():
+                if (funame.lower().startswith('ldst') or
+                    funame.lower().startswith('branch') or
+                    funame.lower().startswith('mmu') or
+                    funame.lower().startswith('spr') or
+                    funame.lower().startswith('trap')):
+                    with m.If(fu.busy_o):
+                        comb += busy_o.eq(1)
  
          # return both the function unit "enable" dict as well as the "busy".
          # the "busy-or-issued" can be passed in to the Read/Write port
@@ -470,10 +482,10 @@ class NonProductionCore(ControlBase):
                  (fspec.rdport, fspec.wrport, fspec.read, fspec.write,
                   fspec.wid, fspec.specs)
              print ("fpsec", i, fspec, len(fuspecs))
+            name = "%s_%s_%d" % (regfile, regname, i)
              ppoffs.append(pplen) # record offset for picker
              pplen += len(fspec.specs)
-            name = "rdflag_%s_%s_%d" % (regfile, regname, i)
-            rdflag = Signal(name=name, reset_less=True)
+            rdflag = Signal(name="rdflag_"+name, reset_less=True)
              comb += rdflag.eq(fspec.rdport)
              rdflags.append(rdflag)
  
@@ -481,7 +493,7 @@ class NonProductionCore(ControlBase):
  
          # create a priority picker to manage this port
          rdpickers[regfile][rpidx] = rdpick = PriorityPicker(pplen)
-        setattr(m.submodules, "rdpick_%s_%s" % (regfile, rpidx), rdpick)
+        m.submodules["rdpick_%s_%s" % (regfile, rpidx)] = rdpick
  
          rens = []
          addrs = []
@@ -501,8 +513,19 @@ class NonProductionCore(ControlBase):
                  fu_issued = fu_bitdict[funame]
  
                  # get (or set up) a latched copy of read register number
+                # and (sigh) also the read-ok flag
                  rname = "%s_%s_%s_%d" % (funame, regfile, regname, pi)
+                rhname = "%s_%s_%d" % (regfile, regname, i)
                  read = Signal.like(_read, name="read_"+name)
+                rdflag = Signal(name="rdflag_%s_%s" % (funame, rhname),
+                                reset_less=True)
+                if rhname not in fu.rf_latches:
+                    rfl = Signal(name="rdflag_latch_"+rname)
+                    fu.rf_latches[rhname] = rfl
+                    with m.If(fu.issue_i):
+                        sync += rfl.eq(rdflags[i])
+                else:
+                    rfl = fu.rf_latches[rhname]
                  if rname not in fu.rd_latches:
                      rdl = Signal.like(_read, name="rdlatch_"+rname)
                      fu.rd_latches[rname] = rdl
@@ -514,8 +537,10 @@ class NonProductionCore(ControlBase):
                  # after the read cycle, use the latched copy
                  with m.If(fu.issue_i):
                      comb += read.eq(_read)
+                    comb += rdflag.eq(rdflags[i])
                  with m.Else():
                      comb += read.eq(rdl)
+                    comb += rdflag.eq(rfl)
  
                  # connect request-read to picker input, and output to go-rd
                  addr_en = Signal.like(read, name="addr_en_"+name)
@@ -526,7 +551,7 @@ class NonProductionCore(ControlBase):
  
                  # exclude any currently-enabled read-request (mask out active)
                  # entirely block anything hazarded from being picked
-                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflags[i] &
+                comb += pick.eq(fu.rd_rel_o[idx] & fu_active & rdflag &
                                  ~delay_pick & ~rhazard)
                  comb += rdpick.i[pi].eq(pick)
                  comb += fu.go_rd_i[idx].eq(delay_pick) # pass in *delayed* pick
@@ -561,7 +586,7 @@ class NonProductionCore(ControlBase):
                  wvchk_en = Signal(len(wvchk), name="wv_chk_addr_en_"+name)
                  issue_active = Signal(name="rd_iactive_"+name)
                  # XXX combinatorial loop here
-                comb += issue_active.eq(fu_active & rf)
+                comb += issue_active.eq(fu_active & rdflag)
                  with m.If(issue_active):
                      if rfile.unary:
                          comb += wvchk_en.eq(read)
@@ -740,7 +765,7 @@ class NonProductionCore(ControlBase):
              wvset = wv.s # write-vec bit-level hazard ctrl
              wvclr = wv.r # write-vec bit-level hazard ctrl
              wvchk = wv.q # write-after-write hazard check
-            wvchk_qint = wv.q_int # write-after-write hazard check, delayed
+            wvchk_qint = wv.q # write-after-write hazard check, NOT delayed
  
          fspecs = fspec
          if not isinstance(fspecs, list):
@@ -776,7 +801,7 @@ class NonProductionCore(ControlBase):
  
          # create a priority picker to manage this port
          wrpickers[regfile][rpidx] = wrpick = PriorityPicker(pplen)
-        setattr(m.submodules, "wrpick_%s_%s" % (regfile, rpidx), wrpick)
+        m.submodules["wrpick_%s_%s" % (regfile, rpidx)] = wrpick
  
          wsigs = []
          wens = []
@@ -911,11 +936,19 @@ class NonProductionCore(ControlBase):
              comb += wport.wen.eq(ortreereduce_sig(wens))
  
          if not self.make_hazard_vecs:
-            return
-
-        # for write-vectors
-        comb += wvclr.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
-        comb += wvset.eq(ortreereduce_sig(wvseten)) # set (issue time)
+            return [], []
+
+        # return these here rather than set wvclr/wvset directly,
+        # because there may be more than one write-port to a given
+        # regfile.  example: XER has a write-port for SO, CA, and OV
+        # and the *last one added* of those would overwrite the other
+        # two.  solution: have connect_wrports collate all the
+        # or-tree-reduced bitvector set/clear requests and drop them
+        # in as a single "thing".  this can only be done because the
+        # set/clear is a unary bitvector.
+        print ("make write-vecs", regfile, regname, wvset, wvclr)
+        return (ortreereduce_sig(wvclren), # clear (regfile write)
+                ortreereduce_sig(wvseten)) # set (issue time)
  
      def connect_wrports(self, m, fu_bitdict, fu_selected):
          """connect write ports
@@ -937,6 +970,8 @@ class NonProductionCore(ControlBase):
          # same for write ports.
          # BLECH!  complex code-duplication! BLECH!
          wrpickers = {}
+        wvclrers = defaultdict(list)
+        wvseters = defaultdict(list)
          for regfile, spec in byregfiles_wr.items():
              fuspecs = byregfiles_wrspec[regfile]
              wrpickers[regfile] = {}
@@ -953,9 +988,33 @@ class NonProductionCore(ControlBase):
                      if 'fast3' in fuspecs:
                          fuspecs['fast1'].append(fuspecs.pop('fast3'))
  
+            # collate these and record them by regfile because there
+            # are sometimes more write-ports per regfile
              for (regname, fspec) in sort_fuspecs(fuspecs):
-                self.connect_wrport(m, fu_bitdict, fu_selected, wrpickers,
+                wvclren, wvseten = self.connect_wrport(m,
+                                        fu_bitdict, fu_selected,
+                                        wrpickers,
                                          regfile, regname, fspec)
+                wvclrers[regfile.lower()].append(wvclren)
+                wvseters[regfile.lower()].append(wvseten)
+
+        if not self.make_hazard_vecs:
+            return
+
+        # for write-vectors: reduce the clr-ers and set-ers down to
+        # a single set of bits.  otherwise if there are two write
+        # ports (on some regfiles), the last one doing comb += on
+        # the reg.wv[regfile] instance "wins" (and all others are ignored,
+        # whoops).  if there was only one write-port per wv regfile this would
+        # not be an issue.
+        for regfile in wvclrers.keys():
+            wv = regs.wv[regfile]
+            wvset = wv.s # write-vec bit-level hazard ctrl
+            wvclr = wv.r # write-vec bit-level hazard ctrl
+            wvclren = wvclrers[regfile]
+            wvseten = wvseters[regfile]
+            comb += wvclr.eq(ortreereduce_sig(wvclren)) # clear (regfile write)
+            comb += wvset.eq(ortreereduce_sig(wvseten)) # set (issue time)
  
      def get_byregfiles(self, readmode):
  
@@ -976,7 +1035,8 @@ class NonProductionCore(ControlBase):
              # the issue there is that this function is actually better
              # suited at the moment
              if readmode:
-                fu.rd_latches = {}
+                fu.rd_latches = {} # read reg number latches
+                fu.rf_latches = {} # read flag latches
              else:
                  fu.wr_latches = {}
  
@@ -1048,6 +1108,7 @@ if __name__ == '__main__':
      pspec = TestMemPspec(ldst_ifacetype='testpi',
                           imem_ifacetype='',
                           addr_wid=48,
+                         allow_overlap=True,
                           mask_wid=8,
                           reg_wid=64)
      dut = NonProductionCore(pspec)
diff --git a/src/soc/simple/test/test_core.py b/src/soc/simple/test/test_core.py

index dfafb2cbb38a54ec138d0fde9b7cb1b7b55ac8e0..479299ce6a08cc750ceece5d6a531fdc04f6e650 100644 (file)
--- a/src/soc/simple/test/test_core.py
+++ b/src/soc/simple/test/test_core.py
@@ -19,6 +19,8 @@ from openpower.decoder.power_decoder import create_pdecode
  from openpower.decoder.power_decoder2 import PowerDecode2
  from openpower.decoder.selectable_int import SelectableInt
  from openpower.decoder.isa.all import ISA
+from openpower.decoder.decode2execute1 import IssuerDecode2ToOperand
+from openpower.state import CoreState
  
  # note that using SPRreduced has to be done to match the
  # PowerDecoder2 SPR map
@@ -26,6 +28,7 @@ from openpower.decoder.power_enums import SPRreduced as SPR
  from openpower.decoder.power_enums import spr_dict, Function, XER_bits
  from soc.config.test.test_loadstore import TestMemPspec
  from openpower.endian import bigendian
+from soc.regfile.regfiles import StateRegs
  
  from soc.simple.core import NonProductionCore
  from soc.experiment.compalu_multi import find_ok  # hack
@@ -40,6 +43,7 @@ from soc.fu.shift_rot.test.test_pipe_caller import ShiftRotTestCase
  from soc.fu.cr.test.test_pipe_caller import CRTestCase
  from soc.fu.branch.test.test_pipe_caller import BranchTestCase
  from soc.fu.ldst.test.test_pipe_caller import LDSTTestCase
+from openpower.test.general.overlap_hazards import HazardTestCase
  from openpower.util import spr_to_fast_reg
  
  from openpower.consts import StateRegsEnum
@@ -189,8 +193,8 @@ def set_issue(core, dec2, sim):
  
  def wait_for_busy_clear(cu):
      while True:
-        busy_o = yield cu.busy_o
-        terminate_o = yield cu.core_terminate_o
+        busy_o = yield cu.o.busy_o
+        terminate_o = yield cu.o.core_terminate_o
          if not busy_o:
              print("busy/terminate:", busy_o, terminate_o)
              break
@@ -207,32 +211,48 @@ class TestRunner(FHDLTestCase):
          m = Module()
          comb = m.d.comb
          instruction = Signal(32)
-        ivalid_i = Signal()
  
          pspec = TestMemPspec(ldst_ifacetype='testpi',
                               imem_ifacetype='',
                               addr_wid=48,
                               mask_wid=8,
+                             allow_overlap=True,
                               reg_wid=64)
  
+        cur_state = CoreState("cur") # current state (MSR/PC/SVSTATE)
+        pdecode2 = PowerDecode2(None, state=cur_state,
+                                     #opkls=IssuerDecode2ToOperand,
+                                     svp64_en=True, # self.svp64_en,
+                                     regreduce_en=False, #self.regreduce_en
+                                    )
+
          m.submodules.core = core = NonProductionCore(pspec)
-        pdecode2 = core.pdecode2
+        m.submodules.pdecode2 = pdecode2
+        core.pdecode2 = pdecode2
          l0 = core.l0
  
-        comb += core.raw_opcode_i.eq(instruction)
-        comb += core.ivalid_i.eq(ivalid_i)
+        comb += pdecode2.dec.raw_opcode_in.eq(instruction)
+        comb += pdecode2.dec.bigendian.eq(bigendian)  # little / big?
+        comb += core.i.e.eq(pdecode2.e)
+        comb += core.i.state.eq(cur_state)
+        comb += core.i.raw_insn_i.eq(instruction)
+        comb += core.i.bigendian_i.eq(bigendian)
+
+        # set the PC StateRegs read port to always send back the PC
+        stateregs = core.regs.state
+        pc_regnum = StateRegs.PC
+        comb += stateregs.r_ports['cia'].ren.eq(1<<pc_regnum)
  
          # temporary hack: says "go" immediately for both address gen and ST
          ldst = core.fus.fus['ldst0']
-        m.d.comb += ldst.ad.go.eq(ldst.ad.rel)  # link addr-go direct to rel
-        m.d.comb += ldst.st.go.eq(ldst.st.rel)  # link store-go direct to rel
+        m.d.comb += ldst.ad.go_i.eq(ldst.ad.rel_o)  # link addr-go to rel
+        m.d.comb += ldst.st.go_i.eq(ldst.st.rel_o)  # link store-go to rel
  
          # nmigen Simulation
          sim = Simulator(m)
          sim.add_clock(1e-6)
  
          def process():
-            yield core.issue_i.eq(0)
              yield
  
              for test in self.test_data:
@@ -247,7 +267,7 @@ class TestRunner(FHDLTestCase):
                      instructions = list(zip(gen, program.assembly.splitlines()))
  
                      yield from setup_tst_memory(l0, test.mem)
-                    yield from setup_regs(core, test)
+                    yield from setup_regs(pdecode2, core, test)
  
                      index = sim.pc.CIA.value // 4
                      while index < len(instructions):
@@ -257,27 +277,53 @@ class TestRunner(FHDLTestCase):
                          print(code)
  
                          # ask the decoder to decode this binary data (endian'd)
-                        yield core.bigendian_i.eq(bigendian)  # little / big?
                          yield instruction.eq(ins)          # raw binary instr.
-                        yield ivalid_i.eq(1)
                          yield Settle()
-                        # fn_unit = yield pdecode2.e.fn_unit
-                        #fuval = self.funit.value
-                        #self.assertEqual(fn_unit & fuval, fuval)
-
-                        # set operand and get inputs
-                        yield from set_issue(core, pdecode2, sim)
-                        yield Settle()
-
-                        yield from wait_for_busy_clear(core)
-                        yield ivalid_i.eq(0)
-                        yield
  
                          print("sim", code)
                          # call simulated operation
                          opname = code.split(' ')[0]
                          yield from sim.call(opname)
-                        index = sim.pc.CIA.value // 4
+                        pc = sim.pc.CIA.value
+                        nia = sim.pc.NIA.value
+                        index = pc // 4
+
+                        # set the PC to the same simulated value
+                        # (core is not able to do this itself, except
+                        # for branch / TRAP)
+                        print ("after call, pc nia", pc, nia)
+                        yield stateregs.regs[pc_regnum].reg.eq(pc)
+                        yield Settle()
+
+                        yield core.p.i_valid.eq(1)
+                        yield
+                        o_ready = yield core.p.o_ready
+                        while True:
+                            if o_ready:
+                                break
+                            yield
+                            o_ready = yield core.p.o_ready
+                        yield core.p.i_valid.eq(0)
+
+                        # wait for the core to report that it is no longer busy
+                        yield from wait_for_busy_clear(core)
+
+                        # synchronised (non-overlap) is fine to check
+                        if not core.allow_overlap:
+                            # register check
+                            yield from check_regs(self, sim, core, test, code)
+
+                            # Memory check
+                            yield from check_mem(self, sim, core, test, code)
+
+                    # overlap mode is only fine to check right at the end
+                    if core.allow_overlap:
+                        # wait until all settled
+                        # XXX really this should be in DMI, which should in turn
+                        # use issuer.any_busy to not send back "stopped" signal
+                        while (yield core.o.any_busy_o):
+                            yield
+                        yield Settle()
  
                          # register check
                          yield from check_regs(self, sim, core, test, code)
@@ -285,6 +331,10 @@ class TestRunner(FHDLTestCase):
                          # Memory check
                          yield from check_mem(self, sim, core, test, code)
  
+            # give a couple extra clock cycles for gtkwave display to be happy
+            yield
+            yield
+
          sim.add_sync_process(process)
          with sim.write_vcd("core_simulator.vcd", "core_simulator.gtkw",
                             traces=[]):
@@ -294,12 +344,13 @@ class TestRunner(FHDLTestCase):
  if __name__ == "__main__":
      unittest.main(exit=False)
      suite = unittest.TestSuite()
-    suite.addTest(TestRunner(LDSTTestCase().test_data))
-    suite.addTest(TestRunner(CRTestCase().test_data))
-    suite.addTest(TestRunner(ShiftRotTestCase().test_data))
+    suite.addTest(TestRunner(HazardTestCase().test_data))
+    #suite.addTest(TestRunner(LDSTTestCase().test_data))
+    #suite.addTest(TestRunner(CRTestCase().test_data))
+    #suite.addTest(TestRunner(ShiftRotTestCase().test_data))
      suite.addTest(TestRunner(LogicalTestCase().test_data))
      suite.addTest(TestRunner(ALUTestCase().test_data))
-    suite.addTest(TestRunner(BranchTestCase().test_data))
+    #suite.addTest(TestRunner(BranchTestCase().test_data))
  
      runner = unittest.TextTestRunner()
      runner.run(suite)
diff --git a/src/soc/simple/test/teststate.py b/src/soc/simple/test/teststate.py

index 4fd1ab3b7f0e5dd4fa9fe1461ef907a9cb20701d..7da358ea7af1c51e7ce46e8635d39ff3a2f7a0a9 100644 (file)
--- a/src/soc/simple/test/teststate.py
+++ b/src/soc/simple/test/teststate.py
@@ -48,6 +48,7 @@ class HDLState(State):
      def get_pc(self):
          self.pcl = []
          self.state = self.core.regs.state
+        # relies on the state.r_port being permanently held as PC
          self.pc = yield self.state.r_ports['cia'].o_data
          self.pcl.append(self.pc)
          log("class hdl pc", hex(self.pc))
author	Tobias Platen <tplaten@posteo.de>
	Tue, 30 Nov 2021 16:52:40 +0000 (17:52 +0100)
committer	Tobias Platen <tplaten@posteo.de>
	Tue, 30 Nov 2021 16:52:40 +0000 (17:52 +0100)
src/soc/regfile/regfiles.py		patch \| blob \| history
src/soc/simple/core.py		patch \| blob \| history
src/soc/simple/test/test_core.py		patch \| blob \| history
src/soc/simple/test/teststate.py		patch \| blob \| history