src/soc/experiment/alu_hier.py

   1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
   2
   3 This ALU is *deliberately* designed to add in (unnecessary) delays into
   4 different operations so as to be able to test the 6600-style matrices
   5 and the CompUnits.  Countdown timers wait for (defined) periods before
   6 indicating that the output is valid
   7
   8 A "real" integer ALU would place the answers onto the output bus after
   9 only one cycle (sync)
  10 """
  11
  12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
  13 from nmigen.hdl.rec import Record, Layout
  14 from nmigen.cli import main
  15 from nmigen.cli import verilog, rtlil
  16 from nmigen.compat.sim import run_simulation
  17 from nmutil.extend import exts
  18 from nmutil.gtkw import write_gtkw
  19
  20 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  21 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  22 from nmutil.sim_tmp_alternative import (Simulator, nmigen_sim_top_module,
  23                                         is_engine_pysim)
  24
  25 from soc.decoder.power_enums import MicrOp, Function, CryIn
  26
  27 from soc.fu.alu.alu_input_record import CompALUOpSubset
  28 from soc.fu.cr.cr_input_record import CompCROpSubset
  29
  30 import operator
  31
  32
  33 class Adder(Elaboratable):
  34     def __init__(self, width):
  35         self.invert_in = Signal()
  36         self.a = Signal(width)
  37         self.b = Signal(width)
  38         self.o = Signal(width, name="add_o")
  39
  40     def elaborate(self, platform):
  41         m = Module()
  42         with m.If(self.invert_in):
  43             m.d.comb += self.o.eq((~self.a) + self.b)
  44         with m.Else():
  45             m.d.comb += self.o.eq(self.a + self.b)
  46         return m
  47
  48
  49 class Subtractor(Elaboratable):
  50     def __init__(self, width):
  51         self.a = Signal(width)
  52         self.b = Signal(width)
  53         self.o = Signal(width, name="sub_o")
  54
  55     def elaborate(self, platform):
  56         m = Module()
  57         m.d.comb += self.o.eq(self.a - self.b)
  58         return m
  59
  60
  61 class Multiplier(Elaboratable):
  62     def __init__(self, width):
  63         self.a = Signal(width)
  64         self.b = Signal(width)
  65         self.o = Signal(width, name="mul_o")
  66
  67     def elaborate(self, platform):
  68         m = Module()
  69         m.d.comb += self.o.eq(self.a * self.b)
  70         return m
  71
  72
  73 class Shifter(Elaboratable):
  74     def __init__(self, width):
  75         self.width = width
  76         self.a = Signal(width)
  77         self.b = Signal(width)
  78         self.o = Signal(width, name="shf_o")
  79
  80     def elaborate(self, platform):
  81         m = Module()
  82         btrunc = Signal(self.width)
  83         m.d.comb += btrunc.eq(self.b & Const((1 << self.width)-1))
  84         m.d.comb += self.o.eq(self.a >> btrunc)
  85         return m
  86
  87
  88 class SignExtend(Elaboratable):
  89     def __init__(self, width):
  90         self.width = width
  91         self.a = Signal(width)
  92         self.o = Signal(width, name="exts_o")
  93
  94     def elaborate(self, platform):
  95         m = Module()
  96         m.d.comb += self.o.eq(exts(self.a, 8, self.width))
  97         return m
  98
  99
 100 class Dummy:
 101     pass
 102
 103
 104 class DummyALU(Elaboratable):
 105     def __init__(self, width):
 106         self.p = Dummy()  # make look like nmutil pipeline API
 107         self.p.data_i = Dummy()
 108         self.p.data_i.ctx = Dummy()
 109         self.n = Dummy()  # make look like nmutil pipeline API
 110         self.n.data_o = Dummy()
 111         self.p.valid_i = Signal()
 112         self.p.ready_o = Signal()
 113         self.n.ready_i = Signal()
 114         self.n.valid_o = Signal()
 115         self.counter = Signal(4)
 116         self.op = CompCROpSubset()
 117         i = []
 118         i.append(Signal(width, name="i1"))
 119         i.append(Signal(width, name="i2"))
 120         i.append(Signal(width, name="i3"))
 121         self.i = Array(i)
 122         self.a, self.b, self.c = i[0], i[1], i[2]
 123         self.out = Array([Signal(width, name="alu_o")])
 124         self.o = self.out[0]
 125         self.width = width
 126         # more "look like nmutil pipeline API"
 127         self.p.data_i.ctx.op = self.op
 128         self.p.data_i.a = self.a
 129         self.p.data_i.b = self.b
 130         self.p.data_i.c = self.c
 131         self.n.data_o.o = self.o
 132
 133     def elaborate(self, platform):
 134         m = Module()
 135
 136         go_now = Signal(reset_less=True)  # testing no-delay ALU
 137
 138         with m.If(self.p.valid_i):
 139             # input is valid. next check, if we already said "ready" or not
 140             with m.If(~self.p.ready_o):
 141                 # we didn't say "ready" yet, so say so and initialise
 142                 m.d.sync += self.p.ready_o.eq(1)
 143
 144                 m.d.sync += self.o.eq(self.a)
 145                 m.d.comb += go_now.eq(1)
 146                 m.d.sync += self.counter.eq(1)
 147
 148         with m.Else():
 149             # input says no longer valid, so drop ready as well.
 150             # a "proper" ALU would have had to sync in the opcode and a/b ops
 151             m.d.sync += self.p.ready_o.eq(0)
 152
 153         # ok so the counter's running: when it gets to 1, fire the output
 154         with m.If((self.counter == 1) | go_now):
 155             # set the output as valid if the recipient is ready for it
 156             m.d.sync += self.n.valid_o.eq(1)
 157         with m.If(self.n.ready_i & self.n.valid_o):
 158             m.d.sync += self.n.valid_o.eq(0)
 159             # recipient said it was ready: reset back to known-good.
 160             m.d.sync += self.counter.eq(0)  # reset the counter
 161             m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
 162
 163         # countdown to 1 (transition from 1 to 0 only on acknowledgement)
 164         with m.If(self.counter > 1):
 165             m.d.sync += self.counter.eq(self.counter - 1)
 166
 167         return m
 168
 169     def __iter__(self):
 170         yield from self.op.ports()
 171         yield self.a
 172         yield self.b
 173         yield self.c
 174         yield self.o
 175
 176     def ports(self):
 177         return list(self)
 178
 179
 180 class ALU(Elaboratable):
 181     def __init__(self, width):
 182         self.p = Dummy()  # make look like nmutil pipeline API
 183         self.p.data_i = Dummy()
 184         self.p.data_i.ctx = Dummy()
 185         self.n = Dummy()  # make look like nmutil pipeline API
 186         self.n.data_o = Dummy()
 187         self.p.valid_i = Signal()
 188         self.p.ready_o = Signal()
 189         self.n.ready_i = Signal()
 190         self.n.valid_o = Signal()
 191         self.counter = Signal(4)
 192         self.op = CompALUOpSubset(name="op")
 193         i = []
 194         i.append(Signal(width, name="i1"))
 195         i.append(Signal(width, name="i2"))
 196         self.i = Array(i)
 197         self.a, self.b = i[0], i[1]
 198         self.out = Array([Signal(width, name="alu_o")])
 199         self.o = self.out[0]
 200         self.width = width
 201         # more "look like nmutil pipeline API"
 202         self.p.data_i.ctx.op = self.op
 203         self.p.data_i.a = self.a
 204         self.p.data_i.b = self.b
 205         self.n.data_o.o = self.o
 206
 207     def elaborate(self, platform):
 208         m = Module()
 209         add = Adder(self.width)
 210         mul = Multiplier(self.width)
 211         shf = Shifter(self.width)
 212         sub = Subtractor(self.width)
 213         ext_sign = SignExtend(self.width)
 214
 215         m.submodules.add = add
 216         m.submodules.mul = mul
 217         m.submodules.shf = shf
 218         m.submodules.sub = sub
 219         m.submodules.ext_sign = ext_sign
 220
 221         # really should not activate absolutely all ALU inputs like this
 222         for mod in [add, mul, shf, sub]:
 223             m.d.comb += [
 224                 mod.a.eq(self.a),
 225                 mod.b.eq(self.b),
 226             ]
 227         m.d.comb += ext_sign.a.eq(self.a)
 228
 229         # pass invert (and carry later)
 230         m.d.comb += add.invert_in.eq(self.op.invert_in)
 231
 232         go_now = Signal(reset_less=True)  # testing no-delay ALU
 233
 234         # ALU sequencer is idle when the count is zero
 235         alu_idle = Signal(reset_less=True)
 236         m.d.comb += alu_idle.eq(self.counter == 0)
 237
 238         # ALU sequencer is done when the count is one
 239         alu_done = Signal(reset_less=True)
 240         m.d.comb += alu_done.eq(self.counter == 1)
 241
 242         # select handshake handling according to ALU type
 243         with m.If(go_now):
 244             # with a combinatorial, no-delay ALU, just pass through
 245             # the handshake signals to the other side
 246             m.d.comb += self.p.ready_o.eq(self.n.ready_i)
 247             m.d.comb += self.n.valid_o.eq(self.p.valid_i)
 248         with m.Else():
 249             # sequential ALU handshake:
 250             # ready_o responds to valid_i, but only if the ALU is idle
 251             m.d.comb += self.p.ready_o.eq(alu_idle)
 252             # select the internally generated valid_o, above
 253             m.d.comb += self.n.valid_o.eq(alu_done)
 254
 255         # hold the ALU result until ready_o is asserted
 256         alu_r = Signal(self.width)
 257
 258         with m.If(alu_idle):
 259             with m.If(self.p.valid_i):
 260
 261                 # as this is a "fake" pipeline, just grab the output right now
 262                 with m.If(self.op.insn_type == MicrOp.OP_ADD):
 263                     m.d.sync += alu_r.eq(add.o)
 264                 with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64):
 265                     m.d.sync += alu_r.eq(mul.o)
 266                 with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
 267                     m.d.sync += alu_r.eq(shf.o)
 268                 with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
 269                     m.d.sync += alu_r.eq(ext_sign.o)
 270                 # SUB is zero-delay, no need to register
 271
 272                 # NOTE: all of these are fake, just something to test
 273
 274                 # MUL, to take 5 instructions
 275                 with m.If(self.op.insn_type == MicrOp.OP_MUL_L64):
 276                     m.d.sync += self.counter.eq(5)
 277                 # SHIFT to take 1, straight away
 278                 with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
 279                     m.d.sync += self.counter.eq(1)
 280                 # ADD/SUB to take 3
 281                 with m.Elif(self.op.insn_type == MicrOp.OP_ADD):
 282                     m.d.sync += self.counter.eq(3)
 283                 # EXTS to take 1
 284                 with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
 285                     m.d.sync += self.counter.eq(1)
 286                 # others to take no delay
 287                 with m.Else():
 288                     m.d.comb += go_now.eq(1)
 289
 290         with m.Elif(~alu_done | self.n.ready_i):
 291             # decrement the counter while the ALU is neither idle nor finished
 292             m.d.sync += self.counter.eq(self.counter - 1)
 293
 294         # choose between zero-delay output, or registered
 295         with m.If(go_now):
 296             m.d.comb += self.o.eq(sub.o)
 297         # only present the result at the last computation cycle
 298         with m.Elif(alu_done):
 299             m.d.comb += self.o.eq(alu_r)
 300
 301         return m
 302
 303     def __iter__(self):
 304         yield from self.op.ports()
 305         yield self.a
 306         yield self.b
 307         yield self.o
 308         yield self.p.valid_i
 309         yield self.p.ready_o
 310         yield self.n.valid_o
 311         yield self.n.ready_i
 312
 313     def ports(self):
 314         return list(self)
 315
 316
 317 class BranchOp(Elaboratable):
 318     def __init__(self, width, op):
 319         self.a = Signal(width)
 320         self.b = Signal(width)
 321         self.o = Signal(width)
 322         self.op = op
 323
 324     def elaborate(self, platform):
 325         m = Module()
 326         m.d.comb += self.o.eq(Mux(self.op(self.a, self.b), 1, 0))
 327         return m
 328
 329
 330 class BranchALU(Elaboratable):
 331     def __init__(self, width):
 332         self.p = Dummy()  # make look like nmutil pipeline API
 333         self.p.data_i = Dummy()
 334         self.p.data_i.ctx = Dummy()
 335         self.n = Dummy()  # make look like nmutil pipeline API
 336         self.n.data_o = Dummy()
 337         self.p.valid_i = Signal()
 338         self.p.ready_o = Signal()
 339         self.n.ready_i = Signal()
 340         self.n.valid_o = Signal()
 341         self.counter = Signal(4)
 342         self.op = Signal(2)
 343         i = []
 344         i.append(Signal(width, name="i1"))
 345         i.append(Signal(width, name="i2"))
 346         self.i = Array(i)
 347         self.a, self.b = i[0], i[1]
 348         self.out = Array([Signal(width)])
 349         self.o = self.out[0]
 350         self.width = width
 351
 352     def elaborate(self, platform):
 353         m = Module()
 354         bgt = BranchOp(self.width, operator.gt)
 355         blt = BranchOp(self.width, operator.lt)
 356         beq = BranchOp(self.width, operator.eq)
 357         bne = BranchOp(self.width, operator.ne)
 358
 359         m.submodules.bgt = bgt
 360         m.submodules.blt = blt
 361         m.submodules.beq = beq
 362         m.submodules.bne = bne
 363         for mod in [bgt, blt, beq, bne]:
 364             m.d.comb += [
 365                 mod.a.eq(self.a),
 366                 mod.b.eq(self.b),
 367             ]
 368
 369         go_now = Signal(reset_less=True)  # testing no-delay ALU
 370         with m.If(self.p.valid_i):
 371             # input is valid. next check, if we already said "ready" or not
 372             with m.If(~self.p.ready_o):
 373                 # we didn't say "ready" yet, so say so and initialise
 374                 m.d.sync += self.p.ready_o.eq(1)
 375
 376                 # as this is a "fake" pipeline, just grab the output right now
 377                 with m.Switch(self.op):
 378                     for i, mod in enumerate([bgt, blt, beq, bne]):
 379                         with m.Case(i):
 380                             m.d.sync += self.o.eq(mod.o)
 381                 # branch to take 5 cycles (fake)
 382                 m.d.sync += self.counter.eq(5)
 383                 #m.d.comb += go_now.eq(1)
 384         with m.Else():
 385             # input says no longer valid, so drop ready as well.
 386             # a "proper" ALU would have had to sync in the opcode and a/b ops
 387             m.d.sync += self.p.ready_o.eq(0)
 388
 389         # ok so the counter's running: when it gets to 1, fire the output
 390         with m.If((self.counter == 1) | go_now):
 391             # set the output as valid if the recipient is ready for it
 392             m.d.sync += self.n.valid_o.eq(1)
 393         with m.If(self.n.ready_i & self.n.valid_o):
 394             m.d.sync += self.n.valid_o.eq(0)
 395             # recipient said it was ready: reset back to known-good.
 396             m.d.sync += self.counter.eq(0)  # reset the counter
 397             m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
 398
 399         # countdown to 1 (transition from 1 to 0 only on acknowledgement)
 400         with m.If(self.counter > 1):
 401             m.d.sync += self.counter.eq(self.counter - 1)
 402
 403         return m
 404
 405     def __iter__(self):
 406         yield self.op
 407         yield self.a
 408         yield self.b
 409         yield self.o
 410
 411     def ports(self):
 412         return list(self)
 413
 414
 415 def run_op(dut, a, b, op, inv_a=0):
 416     yield dut.a.eq(a)
 417     yield dut.b.eq(b)
 418     yield dut.op.insn_type.eq(op)
 419     yield dut.op.invert_in.eq(inv_a)
 420     yield dut.n.ready_i.eq(0)
 421     yield dut.p.valid_i.eq(1)
 422     yield dut.n.ready_i.eq(1)
 423     yield
 424
 425     # wait for the ALU to accept our input data
 426     while not (yield dut.p.ready_o):
 427         yield
 428
 429     yield dut.p.valid_i.eq(0)
 430     yield dut.a.eq(0)
 431     yield dut.b.eq(0)
 432     yield dut.op.insn_type.eq(0)
 433     yield dut.op.invert_in.eq(0)
 434
 435     # wait for the ALU to present the output data
 436     while not (yield dut.n.valid_o):
 437         yield
 438
 439     # latch the result and lower read_i
 440     result = yield dut.o
 441     yield dut.n.ready_i.eq(0)
 442
 443     return result
 444
 445
 446 def alu_sim(dut):
 447     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD)
 448     print("alu_sim add", result)
 449     assert (result == 8)
 450
 451     result = yield from run_op(dut, 2, 3, MicrOp.OP_MUL_L64)
 452     print("alu_sim mul", result)
 453     assert (result == 6)
 454
 455     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD, inv_a=1)
 456     print("alu_sim add-inv", result)
 457     assert (result == 65533)
 458
 459     # test zero-delay ALU
 460     # don't have OP_SUB, so use any other
 461     result = yield from run_op(dut, 5, 3, MicrOp.OP_NOP)
 462     print("alu_sim sub", result)
 463     assert (result == 2)
 464
 465     result = yield from run_op(dut, 13, 2, MicrOp.OP_SHR)
 466     print("alu_sim shr", result)
 467     assert (result == 3)
 468
 469
 470 def test_alu():
 471     alu = ALU(width=16)
 472     write_alu_gtkw("test_alusim.gtkw", clk_period=10e-9)
 473     run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd')
 474
 475     vl = rtlil.convert(alu, ports=alu.ports())
 476     with open("test_alu.il", "w") as f:
 477         f.write(vl)
 478
 479
 480 def test_alu_parallel():
 481     # Compare with the sequential test implementation, above.
 482     m = Module()
 483     m.submodules.alu = dut = ALU(width=16)
 484     write_alu_gtkw("test_alu_parallel.gtkw", sub_module='alu',
 485                    pysim=is_engine_pysim())
 486
 487     sim = Simulator(m)
 488     sim.add_clock(1e-6)
 489
 490     def send(a, b, op, inv_a=0):
 491         # present input data and assert valid_i
 492         yield dut.a.eq(a)
 493         yield dut.b.eq(b)
 494         yield dut.op.insn_type.eq(op)
 495         yield dut.op.invert_in.eq(inv_a)
 496         yield dut.p.valid_i.eq(1)
 497         yield
 498         # wait for ready_o to be asserted
 499         while not (yield dut.p.ready_o):
 500             yield
 501         # clear input data and negate valid_i
 502         # if send is called again immediately afterwards, there will be no
 503         # visible transition (they will not be negated, after all)
 504         yield dut.p.valid_i.eq(0)
 505         yield dut.a.eq(0)
 506         yield dut.b.eq(0)
 507         yield dut.op.insn_type.eq(0)
 508         yield dut.op.invert_in.eq(0)
 509
 510     def receive():
 511         # signal readiness to receive data
 512         yield dut.n.ready_i.eq(1)
 513         yield
 514         # wait for valid_o to be asserted
 515         while not (yield dut.n.valid_o):
 516             yield
 517         # read result
 518         result = yield dut.o
 519         # negate ready_i
 520         # if receive is called again immediately afterwards, there will be no
 521         # visible transition (it will not be negated, after all)
 522         yield dut.n.ready_i.eq(0)
 523         return result
 524
 525     def producer():
 526         # send a few test cases, interspersed with wait states
 527         # note that, for this test, we do not wait for the result to be ready,
 528         # before presenting the next input
 529         # 5 + 3
 530         yield from send(5, 3, MicrOp.OP_ADD)
 531         yield
 532         yield
 533         # 2 * 3
 534         yield from send(2, 3, MicrOp.OP_MUL_L64)
 535         # (-5) + 3
 536         yield from send(5, 3, MicrOp.OP_ADD, inv_a=1)
 537         yield
 538         # 5 - 3
 539         # note that this is a zero-delay operation
 540         yield from send(5, 3, MicrOp.OP_NOP)
 541         yield
 542         yield
 543         # 13 >> 2
 544         yield from send(13, 2, MicrOp.OP_SHR)
 545         # sign extent 13
 546         yield from send(13, 2, MicrOp.OP_EXTS)
 547         # sign extend -128 (8 bits)
 548         yield from send(0x80, 2, MicrOp.OP_EXTS)
 549
 550     def consumer():
 551         # receive and check results, interspersed with wait states
 552         # the consumer is not in step with the producer, but the
 553         # order of the results are preserved
 554         yield
 555         # 5 + 3 = 8
 556         result = yield from receive()
 557         assert (result == 8)
 558         # 2 * 3 = 6
 559         result = yield from receive()
 560         assert (result == 6)
 561         yield
 562         yield
 563         # (-5) + 3 = -2
 564         result = yield from receive()
 565         assert (result == 65533)  # unsigned equivalent to -2
 566         # 5 - 3 = 2
 567         # note that this is a zero-delay operation
 568         # this, and the previous result, will be received back-to-back
 569         # (check the output waveform to see this)
 570         result = yield from receive()
 571         assert (result == 2)
 572         yield
 573         yield
 574         # 13 >> 2 = 3
 575         result = yield from receive()
 576         assert (result == 3)
 577         # sign extent 13 = 13
 578         result = yield from receive()
 579         assert (result == 13)
 580         # sign extend -128 (8 bits) = -128 (16 bits)
 581         result = yield from receive()
 582         assert (result == 0xFF80)
 583
 584     sim.add_sync_process(producer)
 585     sim.add_sync_process(consumer)
 586     sim_writer = sim.write_vcd("test_alu_parallel.vcd")
 587     with sim_writer:
 588         sim.run()
 589
 590
 591 def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
 592                    pysim=True):
 593     """Common function to write the GTKWave documents for this module"""
 594     gtkwave_desc = [
 595         'clk',
 596         'i1[15:0]',
 597         'i2[15:0]',
 598         'op__insn_type' if pysim else 'op__insn_type[6:0]',
 599         'op__invert_in',
 600         'valid_i',
 601         'ready_o',
 602         'valid_o',
 603         'ready_i',
 604         'alu_o[15:0]',
 605     ]
 606     # determine the module name of the DUT
 607     module = 'top'
 608     if sub_module is not None:
 609         module = nmigen_sim_top_module + sub_module
 610     vcd_name = gtkw_name.replace('.gtkw', '.vcd')
 611     write_gtkw(gtkw_name, vcd_name, gtkwave_desc, module=module,
 612                loc=__file__, clk_period=clk_period, base='signed')
 613
 614
 615 if __name__ == "__main__":
 616     test_alu()
 617     test_alu_parallel()
 618
 619     # alu = BranchALU(width=16)
 620     # vl = rtlil.convert(alu, ports=alu.ports())
 621     # with open("test_branch_alu.il", "w") as f:
 622     #     f.write(vl)