Handle newer nMigen adding a "bench" hierarchy root in VCD files

[soc.git] / src / soc / experiment / test / test_compalu_multi.py
diff --git a/src/soc/experiment/test/test_compalu_multi.py b/src/soc/experiment/test/test_compalu_multi.py

index 7c3d122ca6f9f4339d4ecfe076a211dd78e6d1fa..2f2c51d1c18888187c4d540e54fb5604d9b8e236 100644 (file)
--- a/src/soc/experiment/test/test_compalu_multi.py
+++ b/src/soc/experiment/test/test_compalu_multi.py
@@ -12,9 +12,10 @@ Documented at http://libre-soc.org/3d_gpu/architecture/compunit
  
  from soc.experiment.alu_fsm import Shifter, CompFSMOpSubset
  from soc.fu.alu.alu_input_record import CompALUOpSubset
+from soc.fu.cr.cr_input_record import CompCROpSubset
  from soc.experiment.alu_hier import ALU, DummyALU
  from soc.experiment.compalu_multi import MultiCompUnit
-from soc.decoder.power_enums import MicrOp
+from openpower.decoder.power_enums import MicrOp
  from nmutil.gtkw import write_gtkw
  from nmigen import Module, Signal
  from nmigen.cli import rtlil
@@ -57,6 +58,7 @@ class OperandProducer:
          # transaction parameters, passed via signals
          self.delay = Signal(8)
          self.data = Signal.like(self.port)
+        self.data_valid = False
          # add ourselves to the simulation process list
          sim.add_sync_process(self._process)
  
@@ -71,6 +73,7 @@ class OperandProducer:
                  yield
                  yield Settle()
              # read the transaction parameters
+            assert self.data_valid, "an unexpected operand was consumed"
              delay = (yield self.delay)
              data = (yield self.data)
              # wait for `delay` cycles
@@ -81,6 +84,7 @@ class OperandProducer:
              yield self.port.eq(data)
              yield self.count.eq(self.count + 1)
              yield
+            self.data_valid = False
              yield self.go_i.eq(0)
              yield self.port.eq(0)
  
@@ -98,6 +102,7 @@ class OperandProducer:
          """
          yield self.data.eq(data)
          yield self.delay.eq(delay)
+        self.data_valid = True
  
  
  class ResultConsumer:
@@ -126,6 +131,7 @@ class ResultConsumer:
          # transaction parameters, passed via signals
          self.delay = Signal(8)
          self.expected = Signal.like(self.port)
+        self.expecting = False
          # add ourselves to the simulation process list
          sim.add_sync_process(self._process)
  
@@ -140,6 +146,7 @@ class ResultConsumer:
                  yield
                  yield Settle()
              # read the transaction parameters
+            assert self.expecting, "an unexpected result was produced"
              delay = (yield self.delay)
              expected = (yield self.expected)
              # wait for `delay` cycles
@@ -170,6 +177,7 @@ class ResultConsumer:
          """
          yield self.expected.eq(expected)
          yield self.delay.eq(delay)
+        self.expecting = True
  
  
  def op_sim(dut, a, b, op, inv_a=0, imm=0, imm_ok=0, zero_a=0):
@@ -278,102 +286,225 @@ def scoreboard_sim_fsm(dut, producers, consumers):
      yield from op_sim_fsm(21, 0, 0, 21, [1, 1, 1])
  
  
-def scoreboard_sim_dummy(dut):
-    result = yield from op_sim(dut, 5, 2, MicrOp.OP_NOP, inv_a=0,
-                               imm=8, imm_ok=1)
-    assert result == 5, result
-
-    result = yield from op_sim(dut, 9, 2, MicrOp.OP_NOP, inv_a=0,
-                               imm=8, imm_ok=1)
-    assert result == 9, result
+def scoreboard_sim_dummy(op):
+    yield from op.issue([5, 2, 0], MicrOp.OP_NOP, [5],
+                        src_delays=[0, 2, 1], dest_delays=[0])
+    yield from op.issue([9, 2, 0], MicrOp.OP_NOP, [9],
+                        src_delays=[2, 1, 0], dest_delays=[2])
+    # test all combinations of masked input ports
+    yield from op.issue([5, 2, 0], MicrOp.OP_NOP, [0],
+                        rdmaskn=[1, 0, 0],
+                        src_delays=[0, 2, 1], dest_delays=[0])
+    yield from op.issue([9, 2, 0], MicrOp.OP_NOP, [9],
+                        rdmaskn=[0, 1, 0],
+                        src_delays=[2, 1, 0], dest_delays=[2])
+    yield from op.issue([5, 2, 0], MicrOp.OP_NOP, [5],
+                        rdmaskn=[0, 0, 1],
+                        src_delays=[2, 1, 0], dest_delays=[2])
+    yield from op.issue([9, 2, 0], MicrOp.OP_NOP, [9],
+                        rdmaskn=[0, 1, 1],
+                        src_delays=[2, 1, 0], dest_delays=[2])
+    yield from op.issue([9, 2, 0], MicrOp.OP_NOP, [0],
+                        rdmaskn=[1, 1, 0],
+                        src_delays=[2, 1, 0], dest_delays=[2])
+    yield from op.issue([9, 2, 0], MicrOp.OP_NOP, [0],
+                        rdmaskn=[1, 1, 1],
+                        src_delays=[2, 1, 0], dest_delays=[2])
  
  
  class OpSim:
      """ALU Operation issuer
  
      Issues operations to the DUT"""
-    def __init__(self, dut, producers, consumers):
+    def __init__(self, dut, sim):
          self.op_count = 0
          self.zero_a_count = 0
          self.imm_ok_count = 0
+        self.rdmaskn_count = [0] * len(dut.src_i)
+        self.wrmask_count = [0] * len(dut.dest)
          self.dut = dut
-        self.producers = producers
-        self.consumers = consumers
-
-    def issue(self, a, b, op, expected, delays,
-              inv_a=0, imm=0, imm_ok=0, zero_a=0):
+        # create one operand producer for each input port
+        self.producers = list()
+        for i in range(len(dut.src_i)):
+            self.producers.append(OperandProducer(sim, dut, i))
+        # create one result consumer for each output port
+        self.consumers = list()
+        for i in range(len(dut.dest)):
+            self.consumers.append(ResultConsumer(sim, dut, i))
+
+    def issue(self, src_i, op, expected, src_delays, dest_delays,
+              inv_a=0, imm=0, imm_ok=0, zero_a=0, rc=0,
+              rdmaskn=None, wrmask=None):
          """Executes the issue operation"""
          dut = self.dut
          producers = self.producers
          consumers = self.consumers
-        print("issue", a, b, op, expected)
+        if rdmaskn is None:
+            rdmaskn = [0] * len(src_i)
+        if wrmask is None:
+            wrmask = [0] * len(expected)
          yield dut.issue_i.eq(0)
          yield
          # forward data and delays to the producers and consumers
+        # first, send special cases (with zero_a and/or imm_ok)
          if not zero_a:
-            yield from producers[0].send(a, delays[0])
+            yield from producers[0].send(src_i[0], src_delays[0])
          if not imm_ok:
-            yield from producers[1].send(b, delays[1])
-        yield from consumers[0].receive(expected, delays[2])
+            yield from producers[1].send(src_i[1], src_delays[1])
+        # then, send the rest (if any)
+        for i in range(2, len(producers)):
+            yield from producers[i].send(src_i[i], src_delays[i])
+        for i in range(len(consumers)):
+            yield from consumers[i].receive(expected[i], dest_delays[i])
          # submit operation, and assert issue_i for one cycle
          yield dut.oper_i.insn_type.eq(op)
-        yield dut.oper_i.invert_in.eq(inv_a)
-        yield dut.oper_i.imm_data.data.eq(imm)
-        yield dut.oper_i.imm_data.ok.eq(imm_ok)
-        yield dut.oper_i.zero_a.eq(zero_a)
+        if hasattr(dut.oper_i, "invert_in"):
+            yield dut.oper_i.invert_in.eq(inv_a)
+        if hasattr(dut.oper_i, "imm_data"):
+            yield dut.oper_i.imm_data.data.eq(imm)
+            yield dut.oper_i.imm_data.ok.eq(imm_ok)
+        if hasattr(dut.oper_i, "zero_a"):
+            yield dut.oper_i.zero_a.eq(zero_a)
+        if hasattr(dut.oper_i, "rc"):
+            yield dut.oper_i.rc.rc.eq(rc)
+        if hasattr(dut, "rdmaskn"):
+            rdmaskn_bits = 0
+            for i in range(len(rdmaskn)):
+                rdmaskn_bits |= rdmaskn[i] << i
+            yield dut.rdmaskn.eq(rdmaskn_bits)
          yield dut.issue_i.eq(1)
          yield
          yield dut.issue_i.eq(0)
+        # deactivate decoder inputs along with issue_i, so we can be sure they
+        # were latched at the correct cycle
+        # note: rdmaskn is not latched, and must be held as long as
+        # busy_o is active
+        # See: https://bugs.libre-soc.org/show_bug.cgi?id=336#c44
+        yield self.dut.oper_i.insn_type.eq(0)
+        if hasattr(dut.oper_i, "invert_in"):
+            yield self.dut.oper_i.invert_in.eq(0)
+        if hasattr(dut.oper_i, "imm_data"):
+            yield self.dut.oper_i.imm_data.data.eq(0)
+            yield self.dut.oper_i.imm_data.ok.eq(0)
+        if hasattr(dut.oper_i, "zero_a"):
+            yield self.dut.oper_i.zero_a.eq(0)
+        if hasattr(dut.oper_i, "rc"):
+            yield dut.oper_i.rc.rc.eq(0)
          # wait for busy to be negated
          yield Settle()
          while (yield dut.busy_o):
              yield
              yield Settle()
+        # now, deactivate rdmaskn
+        if hasattr(dut, "rdmaskn"):
+            yield dut.rdmaskn.eq(0)
          # update the operation count
          self.op_count = (self.op_count + 1) & 255
-        # On zero_a and imm_ok executions, the producer counters will fall
-        # behind. But, by summing the following counts, the invariant is
+        # On zero_a, imm_ok and rdmaskn executions, the producer counters will
+        # fall behind. But, by summing the following counts, the invariant is
          # preserved.
-        if zero_a:
-            self.zero_a_count = self.zero_a_count + 1
-        if imm_ok:
-            self.imm_ok_count = self.imm_ok_count + 1
+        if zero_a and not rdmaskn[0]:
+            self.zero_a_count += 1
+        if imm_ok and not rdmaskn[1]:
+            self.imm_ok_count += 1
+        for i in range(len(rdmaskn)):
+            if rdmaskn[i]:
+                self.rdmaskn_count[i] += 1
+        for i in range(len(wrmask)):
+            if wrmask[i]:
+                self.wrmask_count[i] += 1
          # check that producers and consumers have the same count
          # this assures that no data was left unused or was lost
-        assert (yield producers[0].count) + self.zero_a_count == self.op_count
-        assert (yield producers[1].count) + self.imm_ok_count == self.op_count
-        assert (yield consumers[0].count) == self.op_count
+        # first, check special cases (zero_a and imm_ok)
+        port_a_cnt = \
+            (yield producers[0].count) \
+            + self.zero_a_count \
+            + self.rdmaskn_count[0]
+        port_b_cnt = \
+            (yield producers[1].count) \
+            + self.imm_ok_count \
+            + self.rdmaskn_count[1]
+        assert port_a_cnt == self.op_count
+        assert port_b_cnt == self.op_count
+        # then, check the rest (if any)
+        for i in range(2, len(producers)):
+            port_cnt = (yield producers[i].count) + self.rdmaskn_count[i]
+            assert port_cnt == self.op_count
+        # check write counter
+        for i in range(len(consumers)):
+            port_cnt = (yield consumers[i].count) + self.wrmask_count[i]
+            assert port_cnt == self.op_count
  
  
  def scoreboard_sim(op):
+    # the following test cases have rc=0, so no CR output is expected
      # zero (no) input operands test
      # 0 + 8 = 8
-    yield from op.issue(5, 2, MicrOp.OP_ADD,
+    yield from op.issue([5, 2], MicrOp.OP_ADD, [8, 0],
                          zero_a=1, imm=8, imm_ok=1,
-                        expected=8, delays=[0, 2, 0])
+                        wrmask=[0, 1],
+                        src_delays=[0, 2], dest_delays=[0, 0])
      # 5 + 8 = 13
-    yield from op.issue(5, 2, MicrOp.OP_ADD,
+    yield from op.issue([5, 2], MicrOp.OP_ADD, [13, 0],
                          inv_a=0, imm=8, imm_ok=1,
-                        expected=13, delays=[2, 0, 2])
+                        wrmask=[0, 1],
+                        src_delays=[2, 0], dest_delays=[2, 0])
      # 5 + 2 = 7
-    yield from op.issue(5, 2, MicrOp.OP_ADD,
-                        expected=7, delays=[1, 1, 1])
+    yield from op.issue([5, 2], MicrOp.OP_ADD, [7, 0],
+                        wrmask=[0, 1],
+                        src_delays=[1, 1], dest_delays=[1, 0])
      # (-6) + 2 = (-4)
-    yield from op.issue(5, 2, MicrOp.OP_ADD, inv_a=1,
-                        expected=65532, delays=[1, 2, 0])
+    yield from op.issue([5, 2], MicrOp.OP_ADD, [65532, 0],
+                        inv_a=1,
+                        wrmask=[0, 1],
+                        src_delays=[1, 2], dest_delays=[0, 0])
      # 0 + 2 = 2
-    yield from op.issue(5, 2, MicrOp.OP_ADD, zero_a=1,
-                        expected=2, delays=[2, 0, 1])
+    yield from op.issue([5, 2], MicrOp.OP_ADD, [2, 0],
+                        zero_a=1,
+                        wrmask=[0, 1],
+                        src_delays=[2, 0], dest_delays=[1, 0])
+
+    # test all combinations of masked input ports
+    # NOP makes neither a request nor a response
+    yield from op.issue([5, 2], MicrOp.OP_NOP, [0, 0],
+                        rdmaskn=[1, 1], wrmask=[1, 1],
+                        src_delays=[1, 2], dest_delays=[1, 0])
+    # sign_extend(0x80) = 0xFF80
+    yield from op.issue([0x80, 2], MicrOp.OP_EXTS, [0xFF80, 0],
+                        rdmaskn=[0, 1], wrmask=[0, 1],
+                        src_delays=[2, 1], dest_delays=[0, 0])
+    # sign_extend(0x80) = 0xFF80
+    yield from op.issue([2, 0x80], MicrOp.OP_EXTSWSLI, [0xFF80, 0],
+                        rdmaskn=[1, 0], wrmask=[0, 1],
+                        src_delays=[1, 2], dest_delays=[1, 0])
  
      # test combinatorial zero-delay operation
-    # In the test ALU, any operation other than ADD, MUL or SHR
+    # In the test ALU, any operation other than ADD, MUL, EXTS or SHR
      # is zero-delay, and do a subtraction.
-    yield from op.issue(5, 2, MicrOp.OP_NOP,
-                        expected=3, delays=[0, 1, 2])
+    # 5 - 2 = 3
+    yield from op.issue([5, 2], MicrOp.OP_CMP, [3, 0],
+                        wrmask=[0, 1],
+                        src_delays=[0, 1], dest_delays=[2, 0])
+
+    # test with rc=1, so expect results on the CR output port
+    # 5 + 2 = 7
+    # 7 > 0 => CR = 0b100
+    yield from op.issue([5, 2], MicrOp.OP_ADD, [7, 0b100],
+                        rc=1,
+                        src_delays=[1, 1], dest_delays=[1, 0])
+    # sign_extend(0x80) = 0xFF80
+    # -128 < 0 => CR = 0b010
+    yield from op.issue([0x80, 2], MicrOp.OP_EXTS, [0xFF80, 0b010],
+                        rc=1, rdmaskn=[0, 1],
+                        src_delays=[2, 1], dest_delays=[0, 2])
+    # 5 - 5 = 0
+    # 0 == 0 => CR = 0b001
+    yield from op.issue([5, 2], MicrOp.OP_CMP, [0, 0b001],
+                        imm=5, imm_ok=1, rc=1,
+                        src_delays=[0, 1], dest_delays=[2, 1])
  
  
  def test_compunit_fsm():
-    top = "top.cu" if is_engine_pysim() else "cu"
      style = {
          'in': {'color': 'orange'},
          'out': {'color': 'yellow'},
@@ -394,22 +525,23 @@ def test_compunit_fsm():
              'src2_i[7:0]']),
          ('result port', 'out', [
              'cu_wr__rel_o', 'cu_wr__go_i', 'dest1_o[7:0]']),
-        ('alu', {'module': top+'.alu'}, [
+        ('alu', {'submodule': 'alu'}, [
              ('prev port', 'in', [
                  'op__sdir', 'p_data_i[7:0]', 'p_shift_i[7:0]',
-                'p_valid_i', 'p_ready_o']),
+                ({'submodule': 'p'},
+                    ['p_i_valid', 'p_o_ready'])]),
              ('next port', 'out', [
-                'n_data_o[7:0]', 'n_valid_o', 'n_ready_i']),
-        ]),
-        ('debug', {'module': 'top'},
-         ['src1_count[7:0]', 'src2_count[7:0]', 'dest1_count[7:0]'])
+                'n_data_o[7:0]',
+                ({'submodule': 'n'},
+                    ['n_o_valid', 'n_i_ready'])])]),
+        ('debug', {'module': 'bench'},
+            ['src1_count[7:0]', 'src2_count[7:0]', 'dest1_count[7:0]'])]
  
-    ]
      write_gtkw(
          "test_compunit_fsm1.gtkw",
          "test_compunit_fsm1.vcd",
          traces, style,
-        module=top
+        module='bench.top.cu'
      )
      m = Module()
      alu = Shifter(8)
@@ -443,7 +575,7 @@ def test_compunit():
  
      m = Module()
      alu = ALU(16)
-    dut = MultiCompUnit(16, alu, CompALUOpSubset)
+    dut = MultiCompUnit(16, alu, CompALUOpSubset, n_dst=2)
      m.submodules.cu = dut
  
      vl = rtlil.convert(dut, ports=dut.ports())
@@ -453,236 +585,19 @@ def test_compunit():
      sim = Simulator(m)
      sim.add_clock(1e-6)
  
-    # create one operand producer for each input port
-    prod_a = OperandProducer(sim, dut, 0)
-    prod_b = OperandProducer(sim, dut, 1)
-    # create an result consumer for the output port
-    cons = ResultConsumer(sim, dut, 0)
      # create an operation issuer
-    op = OpSim(dut, [prod_a, prod_b], [cons])
+    op = OpSim(dut, sim)
      sim.add_sync_process(wrap(scoreboard_sim(op)))
      sim_writer = sim.write_vcd('test_compunit1.vcd')
      with sim_writer:
          sim.run()
  
  
-class CompUnitParallelTest:
-    def __init__(self, dut):
-        self.dut = dut
-
-        # Operation cycle should not take longer than this:
-        self.MAX_BUSY_WAIT = 50
-
-        # Minimum duration in which issue_i will be kept inactive,
-        # during which busy_o must remain low.
-        self.MIN_BUSY_LOW = 5
-
-        # Number of cycles to stall until the assertion of go.
-        # One value, for each port. Can be zero, for no delay.
-        self.RD_GO_DELAY = [0, 3]
-
-        # store common data for the input operation of the processes
-        # input operation:
-        self.op = 0
-        self.inv_a = self.zero_a = 0
-        self.imm = self.imm_ok = 0
-        self.imm_control = (0, 0)
-        self.rdmaskn = (0, 0)
-        # input data:
-        self.operands = (0, 0)
-
-        # Indicates completion of the sub-processes
-        self.rd_complete = [False, False]
-
-    def driver(self):
-        print("Begin parallel test.")
-        yield from self.operation(5, 2, MicrOp.OP_ADD)
-
-    def operation(self, a, b, op, inv_a=0, imm=0, imm_ok=0, zero_a=0,
-                  rdmaskn=(0, 0)):
-        # store data for the operation
-        self.operands = (a, b)
-        self.op = op
-        self.inv_a = inv_a
-        self.imm = imm
-        self.imm_ok = imm_ok
-        self.zero_a = zero_a
-        self.imm_control = (zero_a, imm_ok)
-        self.rdmaskn = rdmaskn
-
-        # Initialize completion flags
-        self.rd_complete = [False, False]
-
-        # trigger operation cycle
-        yield from self.issue()
-
-        # check that the sub-processes completed, before the busy_o cycle ended
-        for completion in self.rd_complete:
-            assert completion
-
-    def issue(self):
-        # issue_i starts inactive
-        yield self.dut.issue_i.eq(0)
-
-        for n in range(self.MIN_BUSY_LOW):
-            yield
-            # busy_o must remain inactive. It cannot rise on its own.
-            busy_o = yield self.dut.busy_o
-            assert not busy_o
-
-        # activate issue_i to begin the operation cycle
-        yield self.dut.issue_i.eq(1)
-
-        # at the same time, present the operation
-        yield self.dut.oper_i.insn_type.eq(self.op)
-        yield self.dut.oper_i.invert_in.eq(self.inv_a)
-        yield self.dut.oper_i.imm_data.data.eq(self.imm)
-        yield self.dut.oper_i.imm_data.ok.eq(self.imm_ok)
-        yield self.dut.oper_i.zero_a.eq(self.zero_a)
-        rdmaskn = self.rdmaskn[0] | (self.rdmaskn[1] << 1)
-        yield self.dut.rdmaskn.eq(rdmaskn)
-
-        # give one cycle for the CompUnit to latch the data
-        yield
-
-        # busy_o must keep being low in this cycle, because issue_i was
-        # low on the previous cycle.
-        # It cannot rise on its own.
-        # Also, busy_o and issue_i must never be active at the same time, ever.
-        busy_o = yield self.dut.busy_o
-        assert not busy_o
-
-        # Lower issue_i
-        yield self.dut.issue_i.eq(0)
-
-        # deactivate inputs along with issue_i, so we can be sure the data
-        # was latched at the correct cycle
-        # note: rdmaskn must be held, while busy_o is active
-        # TODO: deactivate rdmaskn when the busy_o cycle ends
-        yield self.dut.oper_i.insn_type.eq(0)
-        yield self.dut.oper_i.invert_in.eq(0)
-        yield self.dut.oper_i.imm_data.data.eq(0)
-        yield self.dut.oper_i.imm_data.ok.eq(0)
-        yield self.dut.oper_i.zero_a.eq(0)
-        yield
-
-        # wait for busy_o to lower
-        # timeout after self.MAX_BUSY_WAIT cycles
-        for n in range(self.MAX_BUSY_WAIT):
-            # sample busy_o in the current cycle
-            busy_o = yield self.dut.busy_o
-            if not busy_o:
-                # operation cycle ends when busy_o becomes inactive
-                break
-            yield
-
-        # if busy_o is still active, a timeout has occurred
-        # TODO: Uncomment this, once the test is complete:
-        # assert not busy_o
-
-        if busy_o:
-            print("If you are reading this, "
-                  "it's because the above test failed, as expected,\n"
-                  "with a timeout. It must pass, once the test is complete.")
-            return
-
-        print("If you are reading this, "
-              "it's because the above test unexpectedly passed.")
-
-    def rd(self, rd_idx):
-        # wait for issue_i to rise
-        while True:
-            issue_i = yield self.dut.issue_i
-            if issue_i:
-                break
-            # issue_i has not risen yet, so rd must keep low
-            rel = yield self.dut.rd.rel_o[rd_idx]
-            assert not rel
-            yield
-
-        # we do not want rd to rise on an immediate operand
-        # if it is immediate, exit the process
-        # likewise, if the read mask is active
-        # TODO: don't exit the process, monitor rd instead to ensure it
-        #       doesn't rise on its own
-        if self.rdmaskn[rd_idx] or self.imm_control[rd_idx]:
-            self.rd_complete[rd_idx] = True
-            return
-
-        # issue_i has risen. rel must rise on the next cycle
-        rel = yield self.dut.rd.rel_o[rd_idx]
-        assert not rel
-
-        # stall for additional cycles. Check that rel doesn't fall on its own
-        for n in range(self.RD_GO_DELAY[rd_idx]):
-            yield
-            rel = yield self.dut.rd.rel_o[rd_idx]
-            assert rel
-
-        # Before asserting "go", make sure "rel" has risen.
-        # The use of Settle allows "go" to be set combinatorially,
-        # rising on the same cycle as "rel".
-        yield Settle()
-        rel = yield self.dut.rd.rel_o[rd_idx]
-        assert rel
-
-        # assert go for one cycle, passing along the operand value
-        yield self.dut.rd.go_i[rd_idx].eq(1)
-        yield self.dut.src_i[rd_idx].eq(self.operands[rd_idx])
-        # check that the operand was sent to the alu
-        # TODO: Properly check the alu protocol
-        yield Settle()
-        alu_input = yield self.dut.get_in(rd_idx)
-        assert alu_input == self.operands[rd_idx]
-        yield
-
-        # rel must keep high, since go was inactive in the last cycle
-        rel = yield self.dut.rd.rel_o[rd_idx]
-        assert rel
-
-        # finish the go one-clock pulse
-        yield self.dut.rd.go_i[rd_idx].eq(0)
-        yield self.dut.src_i[rd_idx].eq(0)
-        yield
-
-        # rel must have gone low in response to go being high
-        # on the previous cycle
-        rel = yield self.dut.rd.rel_o[rd_idx]
-        assert not rel
-
-        self.rd_complete[rd_idx] = True
-
-        # TODO: check that rel doesn't rise again until the end of the
-        #       busy_o cycle
-
-    def wr(self, wr_idx):
-        # monitor self.dut.wr.req[rd_idx] and sets dut.wr.go[idx] for one cycle
-        yield
-        # TODO: also when dut.wr.go is set, check the output against the
-        # self.expected_o and assert.  use dut.get_out(wr_idx) to do so.
-
-    def run_simulation(self, vcd_name):
-        m = Module()
-        m.submodules.cu = self.dut
-        sim = Simulator(m)
-        sim.add_clock(1e-6)
-
-        sim.add_sync_process(wrap(self.driver()))
-        sim.add_sync_process(wrap(self.rd(0)))
-        sim.add_sync_process(wrap(self.rd(1)))
-        sim.add_sync_process(wrap(self.wr(0)))
-        sim_writer = sim.write_vcd(vcd_name)
-        with sim_writer:
-            sim.run()
-
-
  def test_compunit_regspec2_fsm():
  
      inspec = [('INT', 'data', '0:15'),
-              ('INT', 'shift', '0:15'),
-              ]
-    outspec = [('INT', 'data', '0:15'),
-               ]
+              ('INT', 'shift', '0:15')]
+    outspec = [('INT', 'data', '0:15')]
  
      regspec = (inspec, outspec)
  
@@ -712,23 +627,66 @@ def test_compunit_regspec2_fsm():
  
  def test_compunit_regspec3():
  
+    style = {
+        'in': {'color': 'orange'},
+        'out': {'color': 'yellow'},
+    }
+    traces = [
+        'clk',
+        ('operation port', {'color': 'red'}, [
+            'cu_issue_i', 'cu_busy_o',
+            {'comment': 'operation'},
+            ('oper_i_None__insn_type'
+             + ('' if is_engine_pysim() else '[6:0]'),
+             {'display': 'insn_type'})]),
+        ('operand 1 port', 'in', [
+            ('cu_rdmaskn_i[2:0]', {'bit': 2}),
+            ('cu_rd__rel_o[2:0]', {'bit': 2}),
+            ('cu_rd__go_i[2:0]', {'bit': 2}),
+            'src1_i[15:0]']),
+        ('operand 2 port', 'in', [
+            ('cu_rdmaskn_i[2:0]', {'bit': 1}),
+            ('cu_rd__rel_o[2:0]', {'bit': 1}),
+            ('cu_rd__go_i[2:0]', {'bit': 1}),
+            'src2_i[15:0]']),
+        ('operand 3 port', 'in', [
+            ('cu_rdmaskn_i[2:0]', {'bit': 0}),
+            ('cu_rd__rel_o[2:0]', {'bit': 0}),
+            ('cu_rd__go_i[2:0]', {'bit': 0}),
+            'src1_i[15:0]']),
+        ('result port', 'out', [
+            'cu_wrmask_o', 'cu_wr__rel_o', 'cu_wr__go_i', 'dest1_o[15:0]']),
+        ('alu', {'submodule': 'alu'}, [
+            ('prev port', 'in', [
+                'oper_i_None__insn_type', 'i1[15:0]',
+                'i_valid', 'o_ready']),
+            ('next port', 'out', [
+                'alu_o[15:0]', 'o_valid', 'i_ready'])])]
+
+    write_gtkw("test_compunit_regspec3.gtkw",
+               "test_compunit_regspec3.vcd",
+               traces, style,
+               clk_period=1e-6,
+               module='bench.top.cu')
+
      inspec = [('INT', 'a', '0:15'),
                ('INT', 'b', '0:15'),
                ('INT', 'c', '0:15')]
-    outspec = [('INT', 'o', '0:15'),
-               ]
+    outspec = [('INT', 'o', '0:15')]
  
      regspec = (inspec, outspec)
  
      m = Module()
      alu = DummyALU(16)
-    dut = MultiCompUnit(regspec, alu, CompALUOpSubset)
+    dut = MultiCompUnit(regspec, alu, CompCROpSubset)
      m.submodules.cu = dut
  
      sim = Simulator(m)
      sim.add_clock(1e-6)
  
-    sim.add_sync_process(wrap(scoreboard_sim_dummy(dut)))
+    # create an operation issuer
+    op = OpSim(dut, sim)
+    sim.add_sync_process(wrap(scoreboard_sim_dummy(op)))
      sim_writer = sim.write_vcd('test_compunit_regspec3.vcd')
      with sim_writer:
          sim.run()
@@ -745,40 +703,54 @@ def test_compunit_regspec1():
          ('operation port', {'color': 'red'}, [
              'cu_issue_i', 'cu_busy_o',
              {'comment': 'operation'},
-            ('oper_i_None__insn_type', {'display': 'insn_type'}),
+            ('oper_i_None__insn_type'
+             + ('' if is_engine_pysim() else '[6:0]'),
+             {'display': 'insn_type'}),
              ('oper_i_None__invert_in', {'display': 'invert_in'}),
              ('oper_i_None__imm_data__data[63:0]', {'display': 'data[63:0]'}),
-            ('oper_i_None__imm_data__imm_ok', {'display': 'imm_ok'}),
-            ('oper_i_None__zero_a', {'display': 'zero_a'})]),
+            ('oper_i_None__imm_data__ok', {'display': 'imm_ok'}),
+            ('oper_i_None__zero_a', {'display': 'zero_a'}),
+            ('oper_i_None__rc__rc', {'display': 'rc'})]),
          ('operand 1 port', 'in', [
+            ('cu_rdmaskn_i[1:0]', {'bit': 1}),
              ('cu_rd__rel_o[1:0]', {'bit': 1}),
              ('cu_rd__go_i[1:0]', {'bit': 1}),
              'src1_i[15:0]']),
          ('operand 2 port', 'in', [
+            ('cu_rdmaskn_i[1:0]', {'bit': 0}),
              ('cu_rd__rel_o[1:0]', {'bit': 0}),
              ('cu_rd__go_i[1:0]', {'bit': 0}),
              'src2_i[15:0]']),
          ('result port', 'out', [
-            'cu_wr__rel_o', 'cu_wr__go_i', 'dest1_o[15:0]']),
-        ('alu', {'module': 'top.cu.alu'}, [
+            ('cu_wrmask_o[1:0]', {'bit': 1}),
+            ('cu_wr__rel_o[1:0]', {'bit': 1}),
+            ('cu_wr__go_i[1:0]', {'bit': 1}),
+            'dest1_o[15:0]']),
+        ('cr port', 'out', [
+            ('cu_wrmask_o[1:0]', {'bit': 0}),
+            ('cu_wr__rel_o[1:0]', {'bit': 0}),
+            ('cu_wr__go_i[1:0]', {'bit': 0}),
+            'dest2_o[15:0]']),
+        ('alu', {'submodule': 'alu'}, [
              ('prev port', 'in', [
-                'op__insn_type', 'op__invert_i', 'a[15:0]', 'b[15:0]',
-                'valid_i', 'ready_o']),
+                'op__insn_type', 'op__invert_in', 'a[15:0]', 'b[15:0]',
+                'i_valid', 'o_ready']),
              ('next port', 'out', [
-                'alu_o[15:0]', 'valid_o', 'ready_i'])]),
-        ('debug', {'module': 'top'},
+                'alu_o[15:0]', 'o_valid', 'i_ready',
+                'alu_o_ok', 'alu_cr_ok'])]),
+        ('debug', {'module': 'bench'},
              ['src1_count[7:0]', 'src2_count[7:0]', 'dest1_count[7:0]'])]
  
      write_gtkw("test_compunit_regspec1.gtkw",
                 "test_compunit_regspec1.vcd",
                 traces, style,
                 clk_period=1e-6,
-               module='top.cu')
+               module='bench.top.cu')
  
      inspec = [('INT', 'a', '0:15'),
                ('INT', 'b', '0:15')]
      outspec = [('INT', 'o', '0:15'),
-               ]
+               ('INT', 'cr', '0:15')]
  
      regspec = (inspec, outspec)
  
@@ -794,24 +766,16 @@ def test_compunit_regspec1():
      sim = Simulator(m)
      sim.add_clock(1e-6)
  
-    # create one operand producer for each input port
-    prod_a = OperandProducer(sim, dut, 0)
-    prod_b = OperandProducer(sim, dut, 1)
-    # create an result consumer for the output port
-    cons = ResultConsumer(sim, dut, 0)
      # create an operation issuer
-    op = OpSim(dut, [prod_a, prod_b], [cons])
+    op = OpSim(dut, sim)
      sim.add_sync_process(wrap(scoreboard_sim(op)))
      sim_writer = sim.write_vcd('test_compunit_regspec1.vcd',
-                               traces=[prod_a.count,
-                                       prod_b.count,
-                                       cons.count])
+                               traces=[op.producers[0].count,
+                                       op.producers[1].count,
+                                       op.consumers[0].count])
      with sim_writer:
          sim.run()
  
-    test = CompUnitParallelTest(dut)
-    test.run_simulation("test_compunit_parallel.vcd")
-
  
  if __name__ == '__main__':
      test_compunit()