add ldst_matrix.py back in, needs some work though

[soc.git] / src / experiment / score6600.py
diff --git a/src/experiment/score6600.py b/src/experiment/score6600.py

index 925445e73968571454c47f11f8afc3032f455c4d..212653b198714ed7c2291f8ed5152218185ec1c3 100644 (file)
--- a/src/experiment/score6600.py
+++ b/src/experiment/score6600.py
@@ -3,19 +3,321 @@ from nmigen.cli import verilog, rtlil
  from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
  
  from regfile.regfile import RegFileArray, treereduce
  from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
  
  from regfile.regfile import RegFileArray, treereduce
-from scoreboard.fn_unit import IntFnUnit, FPFnUnit, LDFnUnit, STFnUnit
  from scoreboard.fu_fu_matrix import FUFUDepMatrix
  from scoreboard.fu_reg_matrix import FURegDepMatrix
  from scoreboard.global_pending import GlobalPending
  from scoreboard.group_picker import GroupPicker
  from scoreboard.fu_fu_matrix import FUFUDepMatrix
  from scoreboard.fu_reg_matrix import FURegDepMatrix
  from scoreboard.global_pending import GlobalPending
  from scoreboard.group_picker import GroupPicker
-from scoreboard.issue_unit import IntFPIssueUnit, RegDecode
+from scoreboard.issue_unit import IssueUnitGroup, IssueUnitArray, RegDecode
+from scoreboard.shadow import ShadowMatrix, BranchSpeculationRecord
+from scoreboard.instruction_q import Instruction, InstructionQ
  
  from compalu import ComputationUnitNoDelay
  
  
  from compalu import ComputationUnitNoDelay
  
-from alu_hier import ALU
+from alu_hier import ALU, BranchALU
  from nmutil.latch import SRLatch
  from nmutil.latch import SRLatch
+from nmutil.nmoperator import eq
  
  
-from random import randint
+from random import randint, seed
+from copy import deepcopy
+from math import log
+
+
+class Memory(Elaboratable):
+    def __init__(self, regwid, addrw):
+        self.ddepth = regwid/8
+        depth = (1<<addrw) / self.ddepth
+        self.adr   = Signal(addrw)
+        self.dat_r = Signal(regwid)
+        self.dat_w = Signal(regwid)
+        self.we    = Signal()
+        self.mem   = Memory(width=regwid, depth=depth, init=range(0, depth))
+
+    def elaborate(self, platform):
+        m = Module()
+        m.submodules.rdport = rdport = self.mem.read_port()
+        m.submodules.wrport = wrport = self.mem.write_port()
+        m.d.comb += [
+            rdport.addr.eq(self.adr[self.ddepth:]), # ignore low bits
+            self.dat_r.eq(rdport.data),
+            wrport.addr.eq(self.adr),
+            wrport.data.eq(self.dat_w),
+            wrport.en.eq(self.we),
+        ]
+        return m
+
+
+class MemSim:
+    def __init__(self, regwid, addrw):
+        self.regwid = regwid
+        self.ddepth = regwid//8
+        depth = (1<<addrw) // self.ddepth
+        self.mem = list(range(0, depth))
+
+    def ld(self, addr):
+        return self.mem[addr>>self.ddepth]
+
+    def st(self, addr, data):
+        self.mem[addr>>self.ddepth] = data & ((1<<self.regwid)-1)
+
+
+class CompUnitsBase(Elaboratable):
+    """ Computation Unit Base class.
+
+        Amazingly, this class works recursively.  It's supposed to just
+        look after some ALUs (that can handle the same operations),
+        grouping them together, however it turns out that the same code
+        can also group *groups* of Computation Units together as well.
+
+        Basically it was intended just to concatenate the ALU's issue,
+        go_rd etc. signals together, which start out as bits and become
+        sequences.  Turns out that the same trick works just as well
+        on Computation Units!
+
+        So this class may be used recursively to present a top-level
+        sequential concatenation of all the signals in and out of
+        ALUs, whilst at the same time making it convenient to group
+        ALUs together.
+
+        At the lower level, the intent is that groups of (identical)
+        ALUs may be passed the same operation.  Even beyond that,
+        the intent is that that group of (identical) ALUs actually
+        share the *same pipeline* and as such become a "Concurrent
+        Computation Unit" as defined by Mitch Alsup (see section
+        11.4.9.3)
+    """
+    def __init__(self, rwid, units):
+        """ Inputs:
+
+            * :rwid:   bit width of register file(s) - both FP and INT
+            * :units: sequence of ALUs (or CompUnitsBase derivatives)
+        """
+        self.units = units
+        self.rwid = rwid
+        self.rwid = rwid
+        if units and isinstance(units[0], CompUnitsBase):
+            self.n_units = 0
+            for u in self.units:
+                self.n_units += u.n_units
+        else:
+            self.n_units = len(units)
+
+        n_units = self.n_units
+
+        # inputs
+        self.issue_i = Signal(n_units, reset_less=True)
+        self.go_rd_i = Signal(n_units, reset_less=True)
+        self.go_wr_i = Signal(n_units, reset_less=True)
+        self.shadown_i = Signal(n_units, reset_less=True)
+        self.go_die_i = Signal(n_units, reset_less=True)
+
+        # outputs
+        self.busy_o = Signal(n_units, reset_less=True)
+        self.rd_rel_o = Signal(n_units, reset_less=True)
+        self.req_rel_o = Signal(n_units, reset_less=True)
+
+        # in/out register data (note: not register#, actual data)
+        self.data_o = Signal(rwid, reset_less=True)
+        self.src1_i = Signal(rwid, reset_less=True)
+        self.src2_i = Signal(rwid, reset_less=True)
+        # input operand
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+
+        for i, alu in enumerate(self.units):
+            setattr(m.submodules, "comp%d" % i, alu)
+
+        go_rd_l = []
+        go_wr_l = []
+        issue_l = []
+        busy_l = []
+        req_rel_l = []
+        rd_rel_l = []
+        shadow_l = []
+        godie_l = []
+        for alu in self.units:
+            req_rel_l.append(alu.req_rel_o)
+            rd_rel_l.append(alu.rd_rel_o)
+            shadow_l.append(alu.shadown_i)
+            godie_l.append(alu.go_die_i)
+            go_wr_l.append(alu.go_wr_i)
+            go_rd_l.append(alu.go_rd_i)
+            issue_l.append(alu.issue_i)
+            busy_l.append(alu.busy_o)
+        comb += self.rd_rel_o.eq(Cat(*rd_rel_l))
+        comb += self.req_rel_o.eq(Cat(*req_rel_l))
+        comb += self.busy_o.eq(Cat(*busy_l))
+        comb += Cat(*godie_l).eq(self.go_die_i)
+        comb += Cat(*shadow_l).eq(self.shadown_i)
+        comb += Cat(*go_wr_l).eq(self.go_wr_i)
+        comb += Cat(*go_rd_l).eq(self.go_rd_i)
+        comb += Cat(*issue_l).eq(self.issue_i)
+
+        # connect data register input/output
+
+        # merge (OR) all integer FU / ALU outputs to a single value
+        # bit of a hack: treereduce needs a list with an item named "data_o"
+        if self.units:
+            data_o = treereduce(self.units)
+            comb += self.data_o.eq(data_o)
+
+        for i, alu in enumerate(self.units):
+            comb += alu.src1_i.eq(self.src1_i)
+            comb += alu.src2_i.eq(self.src2_i)
+
+        return m
+
+
+class CompUnitALUs(CompUnitsBase):
+
+    def __init__(self, rwid, opwid):
+        """ Inputs:
+
+            * :rwid:   bit width of register file(s) - both FP and INT
+            * :opwid:  operand bit width
+        """
+        self.opwid = opwid
+
+        # inputs
+        self.oper_i = Signal(opwid, reset_less=True)
+        self.imm_i = Signal(rwid, reset_less=True)
+
+        # Int ALUs
+        add = ALU(rwid)
+        sub = ALU(rwid)
+        mul = ALU(rwid)
+        shf = ALU(rwid)
+
+        units = []
+        for alu in [add, sub, mul, shf]:
+            aluopwid = 3 # extra bit for immediate mode
+            units.append(ComputationUnitNoDelay(rwid, aluopwid, alu))
+
+        CompUnitsBase.__init__(self, rwid, units)
+
+    def elaborate(self, platform):
+        m = CompUnitsBase.elaborate(self, platform)
+        comb = m.d.comb
+
+        # hand the same operation to all units, only lower 2 bits though
+        for alu in self.units:
+            comb += alu.oper_i[0:3].eq(self.oper_i)
+            comb += alu.imm_i.eq(self.imm_i)
+
+        return m
+
+
+class CompUnitBR(CompUnitsBase):
+
+    def __init__(self, rwid, opwid):
+        """ Inputs:
+
+            * :rwid:   bit width of register file(s) - both FP and INT
+            * :opwid:  operand bit width
+
+            Note: bgt unit is returned so that a shadow unit can be created
+            for it
+        """
+        self.opwid = opwid
+
+        # inputs
+        self.oper_i = Signal(opwid, reset_less=True)
+        self.imm_i = Signal(rwid, reset_less=True)
+
+        # Branch ALU and CU
+        self.bgt = BranchALU(rwid)
+        aluopwid = 3 # extra bit for immediate mode
+        self.br1 = ComputationUnitNoDelay(rwid, aluopwid, self.bgt)
+        CompUnitsBase.__init__(self, rwid, [self.br1])
+
+    def elaborate(self, platform):
+        m = CompUnitsBase.elaborate(self, platform)
+        comb = m.d.comb
+
+        # hand the same operation to all units
+        for alu in self.units:
+            comb += alu.oper_i.eq(self.oper_i)
+            comb += alu.imm_i.eq(self.imm_i)
+
+        return m
+
+
+class FunctionUnits(Elaboratable):
+
+    def __init__(self, n_regs, n_int_alus):
+        self.n_regs = n_regs
+        self.n_int_alus = n_int_alus
+
+        self.dest_i = Signal(n_regs, reset_less=True) # Dest R# in
+        self.src1_i = Signal(n_regs, reset_less=True) # oper1 R# in
+        self.src2_i = Signal(n_regs, reset_less=True) # oper2 R# in
+
+        self.g_int_rd_pend_o = Signal(n_regs, reset_less=True)
+        self.g_int_wr_pend_o = Signal(n_regs, reset_less=True)
+
+        self.dest_rsel_o = Signal(n_regs, reset_less=True) # dest reg (bot)
+        self.src1_rsel_o = Signal(n_regs, reset_less=True) # src1 reg (bot)
+        self.src2_rsel_o = Signal(n_regs, reset_less=True) # src2 reg (bot)
+
+        self.req_rel_i = Signal(n_int_alus, reset_less = True)
+        self.readable_o = Signal(n_int_alus, reset_less=True)
+        self.writable_o = Signal(n_int_alus, reset_less=True)
+
+        self.go_rd_i = Signal(n_int_alus, reset_less=True)
+        self.go_wr_i = Signal(n_int_alus, reset_less=True)
+        self.go_die_i = Signal(n_int_alus, reset_less=True)
+        self.req_rel_o = Signal(n_int_alus, reset_less=True)
+        self.fn_issue_i = Signal(n_int_alus, reset_less=True)
+
+        # Note: FURegs wr_pend_o is also outputted from here, for use in WaWGrid
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
+
+        n_intfus = self.n_int_alus
+
+        # Integer FU-FU Dep Matrix
+        intfudeps = FUFUDepMatrix(n_intfus, n_intfus)
+        m.submodules.intfudeps = intfudeps
+        # Integer FU-Reg Dep Matrix
+        intregdeps = FURegDepMatrix(n_intfus, self.n_regs)
+        m.submodules.intregdeps = intregdeps
+
+        comb += self.g_int_rd_pend_o.eq(intregdeps.rd_rsel_o)
+        comb += self.g_int_wr_pend_o.eq(intregdeps.wr_rsel_o)
+
+        comb += intregdeps.rd_pend_i.eq(intregdeps.rd_rsel_o)
+        comb += intregdeps.wr_pend_i.eq(intregdeps.wr_rsel_o)
+
+        comb += intfudeps.rd_pend_i.eq(intregdeps.rd_pend_o)
+        comb += intfudeps.wr_pend_i.eq(intregdeps.wr_pend_o)
+        self.wr_pend_o = intregdeps.wr_pend_o # also output for use in WaWGrid
+
+        comb += intfudeps.issue_i.eq(self.fn_issue_i)
+        comb += intfudeps.go_rd_i.eq(self.go_rd_i)
+        comb += intfudeps.go_wr_i.eq(self.go_wr_i)
+        comb += intfudeps.go_die_i.eq(self.go_die_i)
+        comb += self.readable_o.eq(intfudeps.readable_o)
+        comb += self.writable_o.eq(intfudeps.writable_o)
+
+        # Connect function issue / arrays, and dest/src1/src2
+        comb += intregdeps.dest_i.eq(self.dest_i)
+        comb += intregdeps.src1_i.eq(self.src1_i)
+        comb += intregdeps.src2_i.eq(self.src2_i)
+
+        comb += intregdeps.go_rd_i.eq(self.go_rd_i)
+        comb += intregdeps.go_wr_i.eq(self.go_wr_i)
+        comb += intregdeps.go_die_i.eq(self.go_die_i)
+        comb += intregdeps.issue_i.eq(self.fn_issue_i)
+
+        comb += self.dest_rsel_o.eq(intregdeps.dest_rsel_o)
+        comb += self.src1_rsel_o.eq(intregdeps.src1_rsel_o)
+        comb += self.src2_rsel_o.eq(intregdeps.src2_rsel_o)
+
+        return m
  
  
  class Scoreboard(Elaboratable):
  
  
  class Scoreboard(Elaboratable):
@@ -32,16 +334,37 @@ class Scoreboard(Elaboratable):
          self.intregs = RegFileArray(rwid, n_regs)
          self.fpregs = RegFileArray(rwid, n_regs)
  
          self.intregs = RegFileArray(rwid, n_regs)
          self.fpregs = RegFileArray(rwid, n_regs)
  
+        # issue q needs to get at these
+        self.aluissue = IssueUnitGroup(4)
+        self.brissue = IssueUnitGroup(1)
+        # and these
+        self.alu_oper_i = Signal(4, reset_less=True)
+        self.alu_imm_i = Signal(rwid, reset_less=True)
+        self.br_oper_i = Signal(4, reset_less=True)
+        self.br_imm_i = Signal(rwid, reset_less=True)
+
          # inputs
          # inputs
-        self.int_store_i = Signal(reset_less=True) # instruction is a store
          self.int_dest_i = Signal(max=n_regs, reset_less=True) # Dest R# in
          self.int_src1_i = Signal(max=n_regs, reset_less=True) # oper1 R# in
          self.int_src2_i = Signal(max=n_regs, reset_less=True) # oper2 R# in
          self.int_dest_i = Signal(max=n_regs, reset_less=True) # Dest R# in
          self.int_src1_i = Signal(max=n_regs, reset_less=True) # oper1 R# in
          self.int_src2_i = Signal(max=n_regs, reset_less=True) # oper2 R# in
+        self.reg_enable_i = Signal(reset_less=True) # enable reg decode
  
  
+        # outputs
          self.issue_o = Signal(reset_less=True) # instruction was accepted
          self.issue_o = Signal(reset_less=True) # instruction was accepted
+        self.busy_o = Signal(reset_less=True) # at least one CU is busy
+
+        # for branch speculation experiment.  branch_direction = 0 if
+        # the branch hasn't been met yet.  1 indicates "success", 2 is "fail"
+        # branch_succ and branch_fail are requests to have the current
+        # instruction be dependent on the branch unit "shadow" capability.
+        self.branch_succ_i = Signal(reset_less=True)
+        self.branch_fail_i = Signal(reset_less=True)
+        self.branch_direction_o = Signal(2, reset_less=True)
  
      def elaborate(self, platform):
          m = Module()
  
      def elaborate(self, platform):
          m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
  
          m.submodules.intregs = self.intregs
          m.submodules.fpregs = self.fpregs
  
          m.submodules.intregs = self.intregs
          m.submodules.fpregs = self.fpregs
@@ -55,68 +378,45 @@ class Scoreboard(Elaboratable):
          fp_src1 = self.fpregs.read_port("src1")
          fp_src2 = self.fpregs.read_port("src2")
  
          fp_src1 = self.fpregs.read_port("src1")
          fp_src2 = self.fpregs.read_port("src2")
  
-        # Int ALUs
-        add = ALU(self.rwid)
-        sub = ALU(self.rwid)
-        m.submodules.comp1 = comp1 = ComputationUnitNoDelay(self.rwid, 1, add)
-        m.submodules.comp2 = comp2 = ComputationUnitNoDelay(self.rwid, 1, sub)
-        int_alus = [comp1, comp2]
-
-        m.d.comb += comp1.oper_i.eq(Const(0)) # temporary/experiment: op=add
-        m.d.comb += comp2.oper_i.eq(Const(1)) # temporary/experiment: op=sub
+        # Int ALUs and Comp Units
+        n_int_alus = 5
+        cua = CompUnitALUs(self.rwid, 3)
+        cub = CompUnitBR(self.rwid, 3)
+        m.submodules.cu = cu = CompUnitsBase(self.rwid, [cua, cub])
+        bgt = cub.bgt # get at the branch computation unit
+        br1 = cub.br1
  
          # Int FUs
  
          # Int FUs
-        if_l = []
-        int_src1_pend_v = []
-        int_src2_pend_v = []
-        int_rd_pend_v = []
-        int_wr_pend_v = []
-        for i, a in enumerate(int_alus):
-            # set up Integer Function Unit, add to module (and python list)
-            fu = IntFnUnit(self.n_regs, shadow_wid=0)
-            setattr(m.submodules, "intfu%d" % i, fu)
-            if_l.append(fu)
-            # collate the read/write pending vectors (to go into global pending)
-            int_src1_pend_v.append(fu.src1_pend_o)
-            int_src2_pend_v.append(fu.src2_pend_o)
-            int_rd_pend_v.append(fu.int_rd_pend_o)
-            int_wr_pend_v.append(fu.int_wr_pend_o)
-        int_fus = Array(if_l)
+        m.submodules.intfus = intfus = FunctionUnits(self.n_regs, n_int_alus)
  
          # Count of number of FUs
  
          # Count of number of FUs
-        n_int_fus = len(if_l)
+        n_intfus = n_int_alus
          n_fp_fus = 0 # for now
  
          n_fp_fus = 0 # for now
  
-        n_fus = n_int_fus + n_fp_fus # plus FP FUs
-
-        # Integer FU-FU Dep Matrix
-        intfudeps = FUFUDepMatrix(n_int_fus, n_int_fus)
-        m.submodules.intfudeps = intfudeps
-        # Integer FU-Reg Dep Matrix
-        intregdeps = FURegDepMatrix(n_int_fus, self.n_regs)
-        m.submodules.intregdeps = intregdeps
-
          # Integer Priority Picker 1: Adder + Subtractor
          # Integer Priority Picker 1: Adder + Subtractor
-        intpick1 = GroupPicker(2) # picks between add and sub
+        intpick1 = GroupPicker(n_intfus) # picks between add, sub, mul and shf
          m.submodules.intpick1 = intpick1
  
          m.submodules.intpick1 = intpick1
  
-        # Global Pending Vectors (INT and TODO FP)
-        # NOTE: number of vectors is NOT same as number of FUs.
-        g_int_src1_pend_v = GlobalPending(self.n_regs, int_src1_pend_v)
-        g_int_src2_pend_v = GlobalPending(self.n_regs, int_src2_pend_v)
-        g_int_rd_pend_v = GlobalPending(self.n_regs, int_rd_pend_v)
-        g_int_wr_pend_v = GlobalPending(self.n_regs, int_wr_pend_v)
-        m.submodules.g_int_src1_pend_v = g_int_src1_pend_v
-        m.submodules.g_int_src2_pend_v = g_int_src2_pend_v
-        m.submodules.g_int_rd_pend_v = g_int_rd_pend_v
-        m.submodules.g_int_wr_pend_v = g_int_wr_pend_v
-
          # INT/FP Issue Unit
          regdecode = RegDecode(self.n_regs)
          m.submodules.regdecode = regdecode
          # INT/FP Issue Unit
          regdecode = RegDecode(self.n_regs)
          m.submodules.regdecode = regdecode
-        issueunit = IntFPIssueUnit(self.n_regs, n_int_fus, n_fp_fus)
+        issueunit = IssueUnitArray([self.aluissue, self.brissue])
          m.submodules.issueunit = issueunit
  
          m.submodules.issueunit = issueunit
  
+        # Shadow Matrix.  currently n_intfus shadows, to be used for
+        # write-after-write hazards.  NOTE: there is one extra for branches,
+        # so the shadow width is increased by 1
+        m.submodules.shadows = shadows = ShadowMatrix(n_intfus, n_intfus, True)
+        m.submodules.bshadow = bshadow = ShadowMatrix(n_intfus, 1, False)
+
+        # record previous instruction to cast shadow on current instruction
+        prev_shadow = Signal(n_intfus)
+
+        # Branch Speculation recorder.  tracks the success/fail state as
+        # each instruction is issued, so that when the branch occurs the
+        # allow/cancel can be issued as appropriate.
+        m.submodules.specrec = bspec = BranchSpeculationRecord(n_intfus)
+
          #---------
          # ok start wiring things together...
          # "now hear de word of de looord... dem bones dem bones dem dryy bones"
          #---------
          # ok start wiring things together...
          # "now hear de word of de looord... dem bones dem bones dem dryy bones"
@@ -126,137 +426,326 @@ class Scoreboard(Elaboratable):
          #---------
          # Issue Unit is where it starts.  set up some in/outs for this module
          #---------
          #---------
          # Issue Unit is where it starts.  set up some in/outs for this module
          #---------
-        m.d.comb += [issueunit.i.store_i.eq(self.int_store_i),
-                     regdecode.dest_i.eq(self.int_dest_i),
+        comb += [    regdecode.dest_i.eq(self.int_dest_i),
                       regdecode.src1_i.eq(self.int_src1_i),
                       regdecode.src2_i.eq(self.int_src2_i),
                       regdecode.src1_i.eq(self.int_src1_i),
                       regdecode.src2_i.eq(self.int_src2_i),
-                     regdecode.enable_i.eq(1),
-                     issueunit.i.dest_i.eq(regdecode.dest_o),
+                     regdecode.enable_i.eq(self.reg_enable_i),
                       self.issue_o.eq(issueunit.issue_o)
                      ]
                       self.issue_o.eq(issueunit.issue_o)
                      ]
-        self.int_insn_i = issueunit.i.insn_i # enabled by instruction decode
  
  
-        # connect global rd/wr pending vectors
-        m.d.comb += issueunit.i.g_wr_pend_i.eq(g_int_wr_pend_v.g_pend_o)
+        # take these to outside (issue needs them)
+        comb += cua.oper_i.eq(self.alu_oper_i)
+        comb += cua.imm_i.eq(self.alu_imm_i)
+        comb += cub.oper_i.eq(self.br_oper_i)
+        comb += cub.imm_i.eq(self.br_imm_i)
+
          # TODO: issueunit.f (FP)
  
          # and int function issue / busy arrays, and dest/src1/src2
          # TODO: issueunit.f (FP)
  
          # and int function issue / busy arrays, and dest/src1/src2
-        fn_busy_l = []
-        fn_issue_l = []
-        for i, alu in enumerate(int_alus):
-            fn_issue_l.append(fu.issue_i)
-            fn_busy_l.append(fu.busy_o)
-            m.d.sync += fu.issue_i.eq(issueunit.i.fn_issue_o[i])
-            m.d.comb += fu.dest_i.eq(self.int_dest_i)
-            m.d.comb += fu.src1_i.eq(self.int_src1_i)
-            m.d.comb += fu.src2_i.eq(self.int_src2_i)
-            # XXX sync, so as to stop a simulation infinite loop
-            m.d.comb += issueunit.i.busy_i[i].eq(fu.busy_o)
-
-        fn_issue_o = Signal(len(fn_issue_l), reset_less=True)
-        m.d.comb += fn_issue_o.eq(Cat(*fn_issue_l))
-        #fn_issue_o = issueunit.i.fn_issue_o
+        comb += intfus.dest_i.eq(regdecode.dest_o)
+        comb += intfus.src1_i.eq(regdecode.src1_o)
+        comb += intfus.src2_i.eq(regdecode.src2_o)
+
+        fn_issue_o = issueunit.fn_issue_o
+
+        comb += intfus.fn_issue_i.eq(fn_issue_o)
+        comb += issueunit.busy_i.eq(cu.busy_o)
+        comb += self.busy_o.eq(cu.busy_o.bool())
+
          #---------
          #---------
-        # connect fu-fu matrix
+        # merge shadow matrices outputs
          #---------
  
          #---------
  
-        m.d.comb += intfudeps.rd_pend_i.eq(g_int_rd_pend_v.g_pend_o)
-        m.d.comb += intfudeps.wr_pend_i.eq(g_int_wr_pend_v.g_pend_o)
+        # these are explained in ShadowMatrix docstring, and are to be
+        # connected to the FUReg and FUFU Matrices, to get them to reset
+        anydie = Signal(n_intfus, reset_less=True)
+        allshadown = Signal(n_intfus, reset_less=True)
+        shreset = Signal(n_intfus, reset_less=True)
+        comb += allshadown.eq(shadows.shadown_o & bshadow.shadown_o)
+        comb += anydie.eq(shadows.go_die_o | bshadow.go_die_o)
+        comb += shreset.eq(bspec.match_g_o | bspec.match_f_o)
  
  
-        # Group Picker... done manually for now.  TODO: cat array of pick sigs
-        go_rd_i = intfudeps.go_rd_i
-        go_wr_i = intfudeps.go_wr_i
-        m.d.sync += go_rd_i[0].eq(intpick1.go_rd_o[0]) # add rd
-        m.d.sync += go_wr_i[0].eq(intpick1.go_wr_o[0]) # add wr
+        #---------
+        # connect fu-fu matrix
+        #---------
  
  
-        m.d.sync += go_rd_i[1].eq(intpick1.go_rd_o[1]) # sub rd
-        m.d.sync += go_wr_i[1].eq(intpick1.go_wr_o[1]) # sub wr
+        # Group Picker... done manually for now.
+        go_rd_o = intpick1.go_rd_o
+        go_wr_o = intpick1.go_wr_o
+        go_rd_i = intfus.go_rd_i
+        go_wr_i = intfus.go_wr_i
+        go_die_i = intfus.go_die_i
+        # NOTE: connect to the shadowed versions so that they can "die" (reset)
+        comb += go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus]) # rd
+        comb += go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus]) # wr
+        comb += go_die_i[0:n_intfus].eq(anydie[0:n_intfus]) # die
  
  
-        m.d.comb += intfudeps.issue_i.eq(fn_issue_o)
+        # Connect Picker
+        #---------
+        comb += intpick1.rd_rel_i[0:n_intfus].eq(cu.rd_rel_o[0:n_intfus])
+        comb += intpick1.req_rel_i[0:n_intfus].eq(cu.req_rel_o[0:n_intfus])
+        int_rd_o = intfus.readable_o
+        int_wr_o = intfus.writable_o
+        comb += intpick1.readable_i[0:n_intfus].eq(int_rd_o[0:n_intfus])
+        comb += intpick1.writable_i[0:n_intfus].eq(int_wr_o[0:n_intfus])
  
          #---------
  
          #---------
-        # connect fu-dep matrix
+        # Shadow Matrix
+        #---------
+
+        comb += shadows.issue_i.eq(fn_issue_o)
+        #comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
+        comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
          #---------
          #---------
-        r_go_rd_i = intregdeps.go_rd_i
-        r_go_wr_i = intregdeps.go_wr_i
-        m.d.comb += r_go_rd_i.eq(go_rd_i)
-        m.d.comb += r_go_wr_i.eq(go_wr_i)
+        # NOTE; this setup is for the instruction order preservation...
  
  
-        m.d.comb += intregdeps.dest_i.eq(regdecode.dest_o)
-        m.d.comb += intregdeps.src1_i.eq(regdecode.src1_o)
-        m.d.comb += intregdeps.src2_i.eq(regdecode.src2_o)
-        m.d.comb += intregdeps.issue_i.eq(fn_issue_o)
+        # connect shadows / go_dies to Computation Units
+        comb += cu.shadown_i[0:n_intfus].eq(allshadown)
+        comb += cu.go_die_i[0:n_intfus].eq(anydie)
+
+        # ok connect first n_int_fu shadows to busy lines, to create an
+        # instruction-order linked-list-like arrangement, using a bit-matrix
+        # (instead of e.g. a ring buffer).
+        # XXX TODO
+
+        # when written, the shadow can be cancelled (and was good)
+        for i in range(n_intfus):
+            comb += shadows.s_good_i[i][0:n_intfus].eq(go_wr_o[0:n_intfus])
+
+        # *previous* instruction shadows *current* instruction, and, obviously,
+        # if the previous is completed (!busy) don't cast the shadow!
+        comb += prev_shadow.eq(~fn_issue_o & cu.busy_o)
+        for i in range(n_intfus):
+            comb += shadows.shadow_i[i][0:n_intfus].eq(prev_shadow)
  
  
-        # Connect Picker
          #---------
          #---------
-        m.d.comb += intpick1.req_rel_i[0].eq(int_alus[0].req_rel_o)
-        m.d.comb += intpick1.req_rel_i[1].eq(int_alus[1].req_rel_o)
-        int_readable_o = intfudeps.readable_o
-        int_writable_o = intfudeps.writable_o
-        m.d.comb += intpick1.readable_i[0].eq(int_readable_o[0]) # add rd
-        m.d.comb += intpick1.writable_i[0].eq(int_writable_o[0]) # add wr
-        m.d.comb += intpick1.readable_i[1].eq(int_readable_o[1]) # sub rd
-        m.d.comb += intpick1.writable_i[1].eq(int_writable_o[1]) # sub wr
+        # ... and this is for branch speculation.  it uses the extra bit
+        # tacked onto the ShadowMatrix (hence shadow_wid=n_intfus+1)
+        # only needs to set shadow_i, s_fail_i and s_good_i
+
+        # issue captures shadow_i (if enabled)
+        comb += bshadow.reset_i[0:n_intfus].eq(shreset[0:n_intfus])
+
+        bactive = Signal(reset_less=True)
+        comb += bactive.eq((bspec.active_i | br1.issue_i) & ~br1.go_wr_i)
+
+        # instruction being issued (fn_issue_o) has a shadow cast by the branch
+        with m.If(bactive & (self.branch_succ_i | self.branch_fail_i)):
+            comb += bshadow.issue_i.eq(fn_issue_o)
+            for i in range(n_intfus):
+                with m.If(fn_issue_o & (Const(1<<i))):
+                    comb += bshadow.shadow_i[i][0].eq(1)
+
+        # finally, we need an indicator to the test infrastructure as to
+        # whether the branch succeeded or failed, plus, link up to the
+        # "recorder" of whether the instruction was under shadow or not
+
+        with m.If(br1.issue_i):
+            sync += bspec.active_i.eq(1)
+        with m.If(self.branch_succ_i):
+            comb += bspec.good_i.eq(fn_issue_o & 0x1f)
+        with m.If(self.branch_fail_i):
+            comb += bspec.fail_i.eq(fn_issue_o & 0x1f)
+
+        # branch is active (TODO: a better signal: this is over-using the
+        # go_write signal - actually the branch should not be "writing")
+        with m.If(br1.go_wr_i):
+            sync += self.branch_direction_o.eq(br1.data_o+Const(1, 2))
+            sync += bspec.active_i.eq(0)
+            comb += bspec.br_i.eq(1)
+            # branch occurs if data == 1, failed if data == 0
+            comb += bspec.br_ok_i.eq(br1.data_o == 1)
+            for i in range(n_intfus):
+                # *expected* direction of the branch matched against *actual*
+                comb += bshadow.s_good_i[i][0].eq(bspec.match_g_o[i])
+                # ... or it didn't
+                comb += bshadow.s_fail_i[i][0].eq(bspec.match_f_o[i])
  
          #---------
          # Connect Register File(s)
          #---------
  
          #---------
          # Connect Register File(s)
          #---------
-        m.d.sync += int_dest.wen.eq(intregdeps.dest_rsel_o)
-        m.d.comb += int_src1.ren.eq(intregdeps.src1_rsel_o)
-        m.d.comb += int_src2.ren.eq(intregdeps.src2_rsel_o)
+        comb += int_dest.wen.eq(intfus.dest_rsel_o)
+        comb += int_src1.ren.eq(intfus.src1_rsel_o)
+        comb += int_src2.ren.eq(intfus.src2_rsel_o)
  
  
-        # merge (OR) all integer FU / ALU outputs to a single value
-        # bit of a hack: treereduce needs a list with an item named "dest_o"
-        dest_o = treereduce(int_alus)
-        m.d.comb += int_dest.data_i.eq(dest_o)
+        # connect ALUs to regfule
+        comb += int_dest.data_i.eq(cu.data_o)
+        comb += cu.src1_i.eq(int_src1.data_o)
+        comb += cu.src2_i.eq(int_src2.data_o)
  
  
-        # connect ALUs
-        for i, alu in enumerate(int_alus):
-            m.d.comb += alu.go_rd_i.eq(go_rd_i[i])
-            m.d.comb += alu.go_wr_i.eq(go_wr_i[i])
-            m.d.comb += alu.src1_i.eq(int_src1.data_o)
-            m.d.comb += alu.src2_i.eq(int_src2.data_o)
+        # connect ALU Computation Units
+        comb += cu.go_rd_i[0:n_intfus].eq(go_rd_o[0:n_intfus])
+        comb += cu.go_wr_i[0:n_intfus].eq(go_wr_o[0:n_intfus])
+        comb += cu.issue_i[0:n_intfus].eq(fn_issue_o[0:n_intfus])
  
          return m
  
  
          return m
  
-
      def __iter__(self):
          yield from self.intregs
          yield from self.fpregs
      def __iter__(self):
          yield from self.intregs
          yield from self.fpregs
-        yield self.int_store_i
          yield self.int_dest_i
          yield self.int_src1_i
          yield self.int_src2_i
          yield self.issue_o
          yield self.int_dest_i
          yield self.int_src1_i
          yield self.int_src2_i
          yield self.issue_o
-        #yield from self.int_src1
-        #yield from self.int_dest
-        #yield from self.int_src1
-        #yield from self.int_src2
-        #yield from self.fp_dest
-        #yield from self.fp_src1
-        #yield from self.fp_src2
+        yield self.branch_succ_i
+        yield self.branch_fail_i
+        yield self.branch_direction_o
  
      def ports(self):
          return list(self)
  
  
      def ports(self):
          return list(self)
  
+
+class IssueToScoreboard(Elaboratable):
+
+    def __init__(self, qlen, n_in, n_out, rwid, opwid, n_regs):
+        self.qlen = qlen
+        self.n_in = n_in
+        self.n_out = n_out
+        self.rwid = rwid
+        self.opw = opwid
+        self.n_regs = n_regs
+
+        mqbits = (int(log(qlen) / log(2))+2, False)
+        self.p_add_i = Signal(mqbits) # instructions to add (from data_i)
+        self.p_ready_o = Signal() # instructions were added
+        self.data_i = Instruction.nq(n_in, "data_i", rwid, opwid)
+
+        self.busy_o = Signal(reset_less=True) # at least one CU is busy
+        self.qlen_o = Signal(mqbits, reset_less=True)
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
+
+        iq = InstructionQ(self.rwid, self.opw, self.qlen, self.n_in, self.n_out)
+        sc = Scoreboard(self.rwid, self.n_regs)
+        m.submodules.iq = iq
+        m.submodules.sc = sc
+
+        # get at the regfile for testing
+        self.intregs = sc.intregs
+
+        # and the "busy" signal and instruction queue length
+        comb += self.busy_o.eq(sc.busy_o)
+        comb += self.qlen_o.eq(iq.qlen_o)
+
+        # link up instruction queue
+        comb += iq.p_add_i.eq(self.p_add_i)
+        comb += self.p_ready_o.eq(iq.p_ready_o)
+        for i in range(self.n_in):
+            comb += eq(iq.data_i[i], self.data_i[i])
+
+        # take instruction and process it.  note that it's possible to
+        # "inspect" the queue contents *without* actually removing the
+        # items.  items are only removed when the
+
+        # in "waiting" state
+        wait_issue_br = Signal()
+        wait_issue_alu = Signal()
+
+        with m.If(wait_issue_br | wait_issue_alu):
+            # set instruction pop length to 1 if the unit accepted
+            with m.If(wait_issue_br & (sc.brissue.fn_issue_o != 0)):
+                with m.If(iq.qlen_o != 0):
+                    comb += iq.n_sub_i.eq(1)
+            with m.If(wait_issue_alu & (sc.aluissue.fn_issue_o != 0)):
+                with m.If(iq.qlen_o != 0):
+                    comb += iq.n_sub_i.eq(1)
+
+        # see if some instruction(s) are here.  note that this is
+        # "inspecting" the in-place queue.  note also that on the
+        # cycle following "waiting" for fn_issue_o to be set, the
+        # "resetting" done above (insn_i=0) could be re-ASSERTed.
+        with m.If(iq.qlen_o != 0):
+            # get the operands and operation
+            imm = iq.data_o[0].imm_i
+            dest = iq.data_o[0].dest_i
+            src1 = iq.data_o[0].src1_i
+            src2 = iq.data_o[0].src2_i
+            op = iq.data_o[0].oper_i
+            opi = iq.data_o[0].opim_i # immediate set
+
+            # set the src/dest regs
+            comb += sc.int_dest_i.eq(dest)
+            comb += sc.int_src1_i.eq(src1)
+            comb += sc.int_src2_i.eq(src2)
+            comb += sc.reg_enable_i.eq(1) # enable the regfile
+
+            # choose a Function-Unit-Group
+            with m.If((op & (0x3<<2)) != 0): # branch
+                comb += sc.brissue.insn_i.eq(1)
+                comb += sc.br_oper_i.eq(Cat(op[0:2], opi))
+                comb += sc.br_imm_i.eq(imm)
+                comb += wait_issue_br.eq(1)
+            with m.Else():                   # alu
+                comb += sc.aluissue.insn_i.eq(1)
+                comb += sc.alu_oper_i.eq(Cat(op[0:2], opi))
+                comb += sc.alu_imm_i.eq(imm)
+                comb += wait_issue_alu.eq(1)
+
+            # XXX TODO
+            # these indicate that the instruction is to be made
+            # shadow-dependent on
+            # (either) branch success or branch fail
+            #yield sc.branch_fail_i.eq(branch_fail)
+            #yield sc.branch_succ_i.eq(branch_success)
+
+        return m
+
+    def __iter__(self):
+        yield self.p_ready_o
+        for o in self.data_i:
+            yield from list(o)
+        yield self.p_add_i
+
+    def ports(self):
+        return list(self)
+
+
  IADD = 0
  ISUB = 1
  IADD = 0
  ISUB = 1
+IMUL = 2
+ISHF = 3
+IBGT = 4
+IBLT = 5
+IBEQ = 6
+IBNE = 7
  
  class RegSim:
      def __init__(self, rwidth, nregs):
          self.rwidth = rwidth
          self.regs = [0] * nregs
  
  
  class RegSim:
      def __init__(self, rwidth, nregs):
          self.rwidth = rwidth
          self.regs = [0] * nregs
  
-    def op(self, op, src1, src2, dest):
-        src1 = self.regs[src1]
-        src2 = self.regs[src2]
+    def op(self, op, op_imm, imm, src1, src2, dest):
+        maxbits = (1 << self.rwidth) - 1
+        src1 = self.regs[src1] & maxbits
+        if op_imm:
+            src2 = imm
+        else:
+            src2 = self.regs[src2] & maxbits
          if op == IADD:
          if op == IADD:
-            val = (src1 + src2) & ((1<<(self.rwidth))-1)
+            val = src1 + src2
          elif op == ISUB:
          elif op == ISUB:
-            val = (src1 - src2) & ((1<<(self.rwidth))-1)
-        self.regs[dest] = val
+            val = src1 - src2
+        elif op == IMUL:
+            val = src1 * src2
+        elif op == ISHF:
+            val = src1 >> (src2 & maxbits)
+        elif op == IBGT:
+            val = int(src1 > src2)
+        elif op == IBLT:
+            val = int(src1 < src2)
+        elif op == IBEQ:
+            val = int(src1 == src2)
+        elif op == IBNE:
+            val = int(src1 != src2)
+        val &= maxbits
+        self.setval(dest, val)
+        return val
  
      def setval(self, dest, val):
  
      def setval(self, dest, val):
+        print ("sim setval", dest, hex(val))
          self.regs[dest] = val
  
      def dump(self, dut):
          self.regs[dest] = val
  
      def dump(self, dut):
@@ -273,14 +762,50 @@ class RegSim:
                  yield from self.dump(dut)
                  assert False
  
                  yield from self.dump(dut)
                  assert False
  
-def int_instr(dut, alusim, op, src1, src2, dest):
-    for i in range(len(dut.int_insn_i)):
-        yield dut.int_insn_i[i].eq(0)
+def instr_q(dut, op, op_imm, imm, src1, src2, dest,
+            branch_success, branch_fail):
+    instrs = [{'oper_i': op, 'dest_i': dest, 'imm_i': imm, 'opim_i': op_imm,
+               'src1_i': src1, 'src2_i': src2}]
+
+    sendlen = 1
+    for idx in range(sendlen):
+        yield from eq(dut.data_i[idx], instrs[idx])
+        di = yield dut.data_i[idx]
+        print ("senddata %d %x" % (idx, di))
+    yield dut.p_add_i.eq(sendlen)
+    yield
+    o_p_ready = yield dut.p_ready_o
+    while not o_p_ready:
+        yield
+        o_p_ready = yield dut.p_ready_o
+
+    yield dut.p_add_i.eq(0)
+
+
+def int_instr(dut, op, imm, src1, src2, dest, branch_success, branch_fail):
+    yield from disable_issue(dut)
      yield dut.int_dest_i.eq(dest)
      yield dut.int_src1_i.eq(src1)
      yield dut.int_src2_i.eq(src2)
      yield dut.int_dest_i.eq(dest)
      yield dut.int_src1_i.eq(src1)
      yield dut.int_src2_i.eq(src2)
-    yield dut.int_insn_i[op].eq(1)
-    alusim.op(op, src1, src2, dest)
+    if (op & (0x3<<2)) != 0: # branch
+        yield dut.brissue.insn_i.eq(1)
+        yield dut.br_oper_i.eq(Const(op & 0x3, 2))
+        yield dut.br_imm_i.eq(imm)
+        dut_issue = dut.brissue
+    else:
+        yield dut.aluissue.insn_i.eq(1)
+        yield dut.alu_oper_i.eq(Const(op & 0x3, 2))
+        yield dut.alu_imm_i.eq(imm)
+        dut_issue = dut.aluissue
+    yield dut.reg_enable_i.eq(1)
+
+    # these indicate that the instruction is to be made shadow-dependent on
+    # (either) branch success or branch fail
+    yield dut.branch_fail_i.eq(branch_fail)
+    yield dut.branch_succ_i.eq(branch_success)
+
+    yield
+    yield from wait_for_issue(dut, dut_issue)
  
  
  def print_reg(dut, rnums):
  
  
  def print_reg(dut, rnums):
@@ -292,85 +817,310 @@ def print_reg(dut, rnums):
      print ("reg %s: %s" % (','.join(rnums), ','.join(rs)))
  
  
      print ("reg %s: %s" % (','.join(rnums), ','.join(rs)))
  
  
-def scoreboard_sim(dut, alusim):
-    yield dut.int_store_i.eq(0)
+def create_random_ops(dut, n_ops, shadowing=False, max_opnums=3):
+    insts = []
+    for i in range(n_ops):
+        src1 = randint(1, dut.n_regs-1)
+        src2 = randint(1, dut.n_regs-1)
+        imm = randint(1, (1<<dut.rwid)-1)
+        dest = randint(1, dut.n_regs-1)
+        op = randint(0, max_opnums)
+        opi = 0 if randint(0, 2) else 1 # set true if random is nonzero
+
+        if shadowing:
+            insts.append((src1, src2, dest, op, opi, imm, (0, 0)))
+        else:
+            insts.append((src1, src2, dest, op, opi, imm))
+    return insts
+
+
+def wait_for_busy_clear(dut):
+    while True:
+        busy_o = yield dut.busy_o
+        if not busy_o:
+            break
+        print ("busy",)
+        yield
  
  
-    for i in range(1, dut.n_regs):
-        yield dut.intregs.regs[i].reg.eq(i)
-        alusim.setval(i, i)
+def disable_issue(dut):
+    yield dut.aluissue.insn_i.eq(0)
+    yield dut.brissue.insn_i.eq(0)
  
  
-    if False:
-        yield from int_instr(dut, alusim, IADD, 4, 3, 5)
-        yield from print_reg(dut, [3,4,5])
-        yield
-        yield from int_instr(dut, alusim, IADD, 5, 2, 5)
-        yield from print_reg(dut, [3,4,5])
-        yield
-        yield from int_instr(dut, alusim, ISUB, 5, 1, 3)
-        yield from print_reg(dut, [3,4,5])
-        yield
-        for i in range(len(dut.int_insn_i)):
-            yield dut.int_insn_i[i].eq(0)
-        yield from print_reg(dut, [3,4,5])
-        yield
-        yield from print_reg(dut, [3,4,5])
+
+def wait_for_issue(dut, dut_issue):
+    while True:
+        issue_o = yield dut_issue.fn_issue_o
+        if issue_o:
+            yield from disable_issue(dut)
+            yield dut.reg_enable_i.eq(0)
+            break
+        print ("busy",)
+        #yield from print_reg(dut, [1,2,3])
          yield
          yield
-        yield from print_reg(dut, [3,4,5])
+    #yield from print_reg(dut, [1,2,3])
+
+def scoreboard_branch_sim(dut, alusim):
+
+    iseed = 3
+
+    for i in range(1):
+
+        print ("rseed", iseed)
+        seed(iseed)
+        iseed += 1
+
+        yield dut.branch_direction_o.eq(0)
+
+        # set random values in the registers
+        for i in range(1, dut.n_regs):
+            val = 31+i*3
+            val = randint(0, (1<<alusim.rwidth)-1)
+            yield dut.intregs.regs[i].reg.eq(val)
+            alusim.setval(i, val)
+
+        if False:
+            # create some instructions: branches create a tree
+            insts = create_random_ops(dut, 1, True, 1)
+            #insts.append((6, 6, 1, 2, (0, 0)))
+            #insts.append((4, 3, 3, 0, (0, 0)))
+
+            src1 = randint(1, dut.n_regs-1)
+            src2 = randint(1, dut.n_regs-1)
+            #op = randint(4, 7)
+            op = 4 # only BGT at the moment
+
+            branch_ok = create_random_ops(dut, 1, True, 1)
+            branch_fail = create_random_ops(dut, 1, True, 1)
+
+            insts.append((src1, src2, (branch_ok, branch_fail), op, (0, 0)))
+
+        if True:
+            insts = []
+            insts.append( (3, 5, 2, 0, (0, 0)) )
+            branch_ok = []
+            branch_fail = []
+            #branch_ok.append  ( (5, 7, 5, 1, (1, 0)) )
+            branch_ok.append( None )
+            branch_fail.append( (1, 1, 2, 0, (0, 1)) )
+            #branch_fail.append( None )
+            insts.append( (6, 4, (branch_ok, branch_fail), 4, (0, 0)) )
+
+        siminsts = deepcopy(insts)
+
+        # issue instruction(s)
+        i = -1
+        instrs = insts
+        branch_direction = 0
+        while instrs:
+            yield
+            yield
+            i += 1
+            branch_direction = yield dut.branch_direction_o # way branch went
+            (src1, src2, dest, op, (shadow_on, shadow_off)) = insts.pop(0)
+            if branch_direction == 1 and shadow_on:
+                print ("skip", i, src1, src2, dest, op, shadow_on, shadow_off)
+                continue # branch was "success" and this is a "failed"... skip
+            if branch_direction == 2 and shadow_off:
+                print ("skip", i, src1, src2, dest, op, shadow_on, shadow_off)
+                continue # branch was "fail" and this is a "success"... skip
+            if branch_direction != 0:
+                shadow_on = 0
+                shadow_off = 0
+            is_branch = op >= 4
+            if is_branch:
+                branch_ok, branch_fail = dest
+                dest = src2
+                # ok zip up the branch success / fail instructions and
+                # drop them into the queue, one marked "to have branch success"
+                # the other to be marked shadow branch "fail".
+                # one out of each of these will be cancelled
+                for ok, fl in zip(branch_ok, branch_fail):
+                    if ok:
+                        instrs.append((ok[0], ok[1], ok[2], ok[3], (1, 0)))
+                    if fl:
+                        instrs.append((fl[0], fl[1], fl[2], fl[3], (0, 1)))
+            print ("instr %d: (%d, %d, %d, %d, (%d, %d))" % \
+                            (i, src1, src2, dest, op, shadow_on, shadow_off))
+            yield from int_instr(dut, op, src1, src2, dest,
+                                 shadow_on, shadow_off)
+
+        # wait for all instructions to stop before checking
          yield
          yield
-
+        yield from wait_for_busy_clear(dut)
+
+        i = -1
+        while siminsts:
+            instr = siminsts.pop(0)
+            if instr is None:
+                continue
+            (src1, src2, dest, op, (shadow_on, shadow_off)) = instr
+            i += 1
+            is_branch = op >= 4
+            if is_branch:
+                branch_ok, branch_fail = dest
+                dest = src2
+            print ("sim %d: (%d, %d, %d, %d, (%d, %d))" % \
+                            (i, src1, src2, dest, op, shadow_on, shadow_off))
+            branch_res = alusim.op(op, src1, src2, dest)
+            if is_branch:
+                if branch_res:
+                    siminsts += branch_ok
+                else:
+                    siminsts += branch_fail
+
+        # check status
          yield from alusim.check(dut)
          yield from alusim.check(dut)
+        yield from alusim.dump(dut)
  
  
-    for i in range(4):
-        src1 = randint(1, dut.n_regs-1)
-        src2 = randint(1, dut.n_regs-1)
+
+def scoreboard_sim(dut, alusim):
+
+    seed(0)
+
+    for i in range(50):
+
+        # set random values in the registers
+        for i in range(1, dut.n_regs):
+            val = randint(0, (1<<alusim.rwidth)-1)
+            #val = 31+i*3
+            #val = i
+            yield dut.intregs.regs[i].reg.eq(val)
+            alusim.setval(i, val)
+
+        # create some instructions (some random, some regression tests)
+        instrs = []
+        if True:
+            instrs = create_random_ops(dut, 15, True, 4)
+
+        if False:
+            instrs.append( (1, 2, 2, 1, 1, 20, (0, 0)) )
+
+        if False:
+            instrs.append( (7, 3, 2, 4, (0, 0)) )
+            instrs.append( (7, 6, 6, 2, (0, 0)) )
+            instrs.append( (1, 7, 2, 2, (0, 0)) )
+
+        if False:
+            instrs.append((2, 3, 3, 0, 0, 0, (0, 0)))
+            instrs.append((5, 3, 3, 1, 0, 0, (0, 0)))
+            instrs.append((3, 5, 5, 2, 0, 0, (0, 0)))
+            instrs.append((5, 3, 3, 3, 0, 0, (0, 0)))
+            instrs.append((3, 5, 5, 0, 0, 0, (0, 0)))
+
+        if False:
+            instrs.append( (3, 3, 4, 0, 0, 13979, (0, 0)))
+            instrs.append( (6, 4, 1, 2, 0, 40976, (0, 0)))
+            instrs.append( (1, 4, 7, 4, 1, 23652, (0, 0)))
+
+        if False:
+            instrs.append((5, 6, 2, 1))
+            instrs.append((2, 2, 4, 0))
+            #instrs.append((2, 2, 3, 1))
+
+        if False:
+            instrs.append((2, 1, 2, 3))
+
+        if False:
+            instrs.append((2, 6, 2, 1))
+            instrs.append((2, 1, 2, 0))
+
+        if False:
+            instrs.append((1, 2, 7, 2))
+            instrs.append((7, 1, 5, 0))
+            instrs.append((4, 4, 1, 1))
+
+        if False:
+            instrs.append((5, 6, 2, 2))
+            instrs.append((1, 1, 4, 1))
+            instrs.append((6, 5, 3, 0))
+
+        if False:
+            # Write-after-Write Hazard
+            instrs.append( (3, 6, 7, 2) )
+            instrs.append( (4, 4, 7, 1) )
+
+        if False:
+            # self-read/write-after-write followed by Read-after-Write
+            instrs.append((1, 1, 1, 1))
+            instrs.append((1, 5, 3, 0))
+
+        if False:
+            # Read-after-Write followed by self-read-after-write
+            instrs.append((5, 6, 1, 2))
+            instrs.append((1, 1, 1, 1))
+
+        if False:
+            # self-read-write sandwich
+            instrs.append((5, 6, 1, 2))
+            instrs.append((1, 1, 1, 1))
+            instrs.append((1, 5, 3, 0))
+
+        if False:
+            # very weird failure
+            instrs.append( (5, 2, 5, 2) )
+            instrs.append( (2, 6, 3, 0) )
+            instrs.append( (4, 2, 2, 1) )
+
+        if False:
+            v1 = 4
+            yield dut.intregs.regs[5].reg.eq(v1)
+            alusim.setval(5, v1)
+            yield dut.intregs.regs[3].reg.eq(5)
+            alusim.setval(3, 5)
+            instrs.append((5, 3, 3, 4, (0, 0)))
+            instrs.append((4, 2, 1, 2, (0, 1)))
+
+        if False:
+            v1 = 6
+            yield dut.intregs.regs[5].reg.eq(v1)
+            alusim.setval(5, v1)
+            yield dut.intregs.regs[3].reg.eq(5)
+            alusim.setval(3, 5)
+            instrs.append((5, 3, 3, 4, (0, 0)))
+            instrs.append((4, 2, 1, 2, (1, 0)))
+
+        if False:
+            instrs.append( (4, 3, 5, 1, 0, (0, 0)) )
+            instrs.append( (5, 2, 3, 1, 0, (0, 0)) )
+            instrs.append( (7, 1, 5, 2, 0, (0, 0)) )
+            instrs.append( (5, 6, 6, 4, 0, (0, 0)) )
+            instrs.append( (7, 5, 2, 2, 0, (1, 0)) )
+            instrs.append( (1, 7, 5, 0, 0, (0, 1)) )
+            instrs.append( (1, 6, 1, 2, 0, (1, 0)) )
+            instrs.append( (1, 6, 7, 3, 0, (0, 0)) )
+            instrs.append( (6, 7, 7, 0, 0, (0, 0)) )
+
+        # issue instruction(s), wait for issue to be free before proceeding
+        for i, instr in enumerate(instrs):
+            src1, src2, dest, op, opi, imm, (br_ok, br_fail) = instr
+
+            print ("instr %d: (%d, %d, %d, %d, %d, %d)" % \
+                    (i, src1, src2, dest, op, opi, imm))
+            alusim.op(op, opi, imm, src1, src2, dest)
+            yield from instr_q(dut, op, opi, imm, src1, src2, dest,
+                               br_ok, br_fail)
+
+        # wait for all instructions to stop before checking
          while True:
          while True:
-            dest = randint(1, dut.n_regs-1)
-            break
-            if dest not in [src1, src2]:
+            iqlen = yield dut.qlen_o
+            if iqlen == 0:
                  break
                  break
-        src1 = 1
-        src2 = 7
-        dest = src2
-
-        op = randint(0, 1)
-        op = 0
-        print ("random %d: %d %d %d %d\n" % (i, op, src1, src2, dest))
-        yield from int_instr(dut, alusim, op, src1, src2, dest)
-        yield from print_reg(dut, [3,4,5])
+            yield
          yield
          yield
-        yield from print_reg(dut, [3,4,5])
-        for i in range(len(dut.int_insn_i)):
-            yield dut.int_insn_i[i].eq(0)
          yield
          yield
          yield
          yield
+        yield
+        yield from wait_for_busy_clear(dut)
  
  
-
-    yield
-    yield from print_reg(dut, [3,4,5])
-    yield
-    yield from print_reg(dut, [3,4,5])
-    yield
-    yield
-    yield
-    yield
-    yield from alusim.check(dut)
-
-
-def explore_groups(dut):
-    from nmigen.hdl.ir import Fragment
-    from nmigen.hdl.xfrm import LHSGroupAnalyzer
-
-    fragment = dut.elaborate(platform=None)
-    fr = Fragment.get(fragment, platform=None)
-
-    groups = LHSGroupAnalyzer()(fragment._statements)
-
-    print (groups)
+        # check status
+        yield from alusim.check(dut)
+        yield from alusim.dump(dut)
  
  
  def test_scoreboard():
  
  
  def test_scoreboard():
-    dut = Scoreboard(32, 8)
-    alusim = RegSim(32, 8)
+    dut = IssueToScoreboard(2, 1, 1, 16, 8, 8)
+    alusim = RegSim(16, 8)
+    memsim = MemSim(16, 16)
      vl = rtlil.convert(dut, ports=dut.ports())
      with open("test_scoreboard6600.il", "w") as f:
          f.write(vl)
      vl = rtlil.convert(dut, ports=dut.ports())
      with open("test_scoreboard6600.il", "w") as f:
          f.write(vl)
@@ -378,6 +1128,9 @@ def test_scoreboard():
      run_simulation(dut, scoreboard_sim(dut, alusim),
                          vcd_name='test_scoreboard6600.vcd')
  
      run_simulation(dut, scoreboard_sim(dut, alusim),
                          vcd_name='test_scoreboard6600.vcd')
  
+    #run_simulation(dut, scoreboard_branch_sim(dut, alusim),
+    #                    vcd_name='test_scoreboard6600.vcd')
+
  
  if __name__ == '__main__':
      test_scoreboard()
  
  if __name__ == '__main__':
      test_scoreboard()