src/soc/experiment/compalu_multi.py

   1 from nmigen.compat.sim import run_simulation
   2 from nmigen.cli import verilog, rtlil
   3 from nmigen import Module, Signal, Mux, Elaboratable, Repl, Array, Record
   4 from nmigen.hdl.rec import (DIR_FANIN, DIR_FANOUT)
   5
   6 from nmutil.latch import SRLatch, latchregister
   7 from nmutil.iocontrol import RecordObject
   8
   9 from soc.decoder.power_decoder2 import Data
  10 from soc.decoder.power_enums import InternalOp
  11 from soc.fu.regspec import RegSpec, RegSpecALUAPI
  12
  13
  14 """ Computation Unit (aka "ALU Manager").
  15
  16     This module runs a "revolving door" set of three latches, based on
  17     * Issue
  18     * Go_Read
  19     * Go_Write
  20     where one of them cannot be set on any given cycle.
  21
  22     * When issue is first raised, a busy signal is sent out.
  23       The src1 and src2 registers and the operand can be latched in
  24       at this point
  25
  26     * Read request is set, which is acknowledged through the Scoreboard
  27       to the priority picker, which generates (one and only one) Go_Read
  28       at a time.  One of those will (eventually) be this Computation Unit.
  29
  30     * Once Go_Read is set, the src1/src2/operand latch door shuts (locking
  31       src1/src2/operand in place), and the ALU is told to proceed.
  32
  33     * when the ALU pipeline is ready, this activates "write request release",
  34       and the ALU's output is captured into a temporary register.
  35
  36     * Write request release is *HELD UP* (prevented from proceeding) if shadowN
  37       is asserted LOW.  This is how all speculation, precise exceptions,
  38       predication - everything - is achieved.
  39
  40     * Write request release will go through a similar process as Read request,
  41       resulting (eventually) in Go_Write being asserted.
  42
  43     * When Go_Write is asserted, two things happen: (1) the data in the temp
  44       register is placed combinatorially onto the output, and (2) the
  45       req_l latch is cleared, busy is dropped, and the Comp Unit is back
  46       through its revolving door to do another task.
  47
  48     Note that the read and write latches are held synchronously for one cycle,
  49     i.e. that when Go_Read comes in, one cycle is given in which the incoming
  50     register (broadcast over a Regfile Read Port) may have time to be latched.
  51
  52     It is REQUIRED that Go_Read be held valid only for one cycle, and it is
  53     REQUIRED that the corresponding Read_Req be dropped exactly one cycle after
  54     Go_Read is asserted HI.
  55
  56     Likewise for Go_Write: this is asserted for one cycle, and Req_Writes must
  57     likewise be dropped exactly one cycle after assertion of Go_Write.
  58
  59     When Go_Die is asserted then strictly speaking the entire FSM should be
  60     fully reset and that includes sending a cancellation request to the ALU.
  61     (XXX TODO: alu "go die" is not presently wired up)
  62 """
  63
  64 def go_record(n, name):
  65     r = Record([('go', n, DIR_FANIN),
  66                 ('rel', n, DIR_FANOUT)], name=name)
  67     r.go.reset_less = True
  68     r.rel.reset_less = True
  69     return r
  70
  71 # see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
  72
  73 class CompUnitRecord(RegSpec, RecordObject):
  74     """CompUnitRecord
  75
  76     base class for Computation Units, to provide a uniform API
  77     and allow "record.connect" etc. to be used, particularly when
  78     it comes to connecting multiple Computation Units up as a block
  79     (very laborious)
  80
  81     LDSTCompUnitRecord should derive from this class and add the
  82     additional signals it requires
  83
  84     :subkls:      the class (not an instance) needed to construct the opcode
  85     :rwid:        either an integer (specifies width of all regs) or a "regspec"
  86
  87     see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
  88     """
  89     def __init__(self, subkls, rwid, n_src=None, n_dst=None, name=None):
  90         RegSpec.__init__(self, rwid, n_src, n_dst)
  91         RecordObject.__init__(self, name)
  92         self._subkls = subkls
  93
  94         # create source operands
  95         src = []
  96         for i in range(n_src):
  97             j = i + 1 # name numbering to match src1/src2
  98             name = "src%d_i" % j
  99             rw = self._get_srcwid(i)
 100             sreg = Signal(rw, name=name, reset_less=True)
 101             setattr(self, name, sreg)
 102             src.append(sreg)
 103         self._src_i = src
 104
 105         # create dest operands
 106         dst = []
 107         for i in range(n_dst):
 108             j = i + 1 # name numbering to match dest1/2...
 109             name = "dest%d_i" % j
 110             rw = self._get_dstwid(i)
 111             dreg = Signal(rw, name=name, reset_less=True)
 112             setattr(self, name, dreg)
 113             dst.append(dreg)
 114         self._dest = dst
 115
 116         # operation / data input
 117         self.oper_i = subkls() # operand
 118
 119         # create read/write and other scoreboard signalling
 120         self.rd = go_record(n_src, name="rd") # read in, req out
 121         self.wr = go_record(n_dst, name="wr") # write in, req out
 122         self.issue_i = Signal(reset_less=True) # fn issue in
 123         self.shadown_i = Signal(reset=1) # shadow function, defaults to ON
 124         self.go_die_i = Signal() # go die (reset)
 125
 126         # output (busy/done)
 127         self.busy_o = Signal(reset_less=True) # fn busy out
 128         self.done_o = Signal(reset_less=True)
 129
 130
 131 class MultiCompUnit(RegSpecALUAPI, Elaboratable):
 132     def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1):
 133         """MultiCompUnit
 134
 135         * :rwid:        width of register latches (TODO: allocate per regspec)
 136         * :alu:         the ALU (pipeline, FSM) - must conform to nmutil Pipe API
 137         * :opsubsetkls: the subset of Decode2ExecuteType
 138         * :n_src:       number of src operands
 139         * :n_dst:       number of destination operands
 140         """
 141         RegSpecALUAPI.__init__(self, rwid, alu)
 142         self.n_src, self.n_dst = n_src, n_dst
 143         self.opsubsetkls = opsubsetkls
 144         self.cu = cu = CompUnitRecord(opsubsetkls, rwid, n_src, n_dst)
 145
 146         # convenience names for src operands
 147         for i in range(n_src):
 148             j = i + 1 # name numbering to match src1/src2
 149             name = "src%d_i" % j
 150             setattr(self, name, getattr(cu, name))
 151
 152         # convenience names for dest operands
 153         for i in range(n_dst):
 154             j = i + 1 # name numbering to match dest1/2...
 155             name = "dest%d_i" % j
 156             setattr(self, name, getattr(cu, name))
 157
 158         # more convenience names
 159         self.rd = cu.rd
 160         self.wr = cu.wr
 161         self.go_rd_i = self.rd.go # temporary naming
 162         self.go_wr_i = self.wr.go # temporary naming
 163         self.rd_rel_o = self.rd.rel # temporary naming
 164         self.req_rel_o = self.wr.rel # temporary naming
 165         self.issue_i = cu.issue_i
 166         self.shadown_i = cu.shadown_i
 167         self.go_die_i = cu.go_die_i
 168
 169         # operation / data input
 170         self.oper_i = cu.oper_i
 171         self.src_i = cu._src_i
 172
 173         self.busy_o = cu.busy_o
 174         self.dest = cu._dest
 175         self.data_o = self.dest[0] # Dest out
 176         self.done_o = cu.done_o
 177
 178     def elaborate(self, platform):
 179         m = Module()
 180         m.submodules.alu = self.alu
 181         m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
 182         m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
 183         m.submodules.req_l = req_l = SRLatch(False, self.n_dst, name="req")
 184         m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
 185         m.submodules.rok_l = rok_l = SRLatch(sync=False, name="rdok")
 186
 187         # ALU only proceeds when all src are ready.  rd_rel_o is delayed
 188         # so combine it with go_rd_i.  if all bits are set we're good
 189         all_rd = Signal(reset_less=True)
 190         m.d.comb += all_rd.eq(self.busy_o & rok_l.q &
 191                     (((~self.rd.rel) | self.rd.go).all()))
 192
 193         # write_requests all done
 194         # req_done works because any one of the last of the writes
 195         # is enough, when combined with when read-phase is done (rst_l.q)
 196         wr_any = Signal(reset_less=True)
 197         req_done = Signal(reset_less=True)
 198         m.d.comb += self.done_o.eq(self.busy_o & ~(self.wr.rel.bool()))
 199         m.d.comb += wr_any.eq(self.wr.go.bool())
 200         m.d.comb += req_done.eq(rst_l.q & wr_any)
 201
 202         # shadow/go_die
 203         reset = Signal(reset_less=True)
 204         rst_r = Signal(reset_less=True) # reset latch off
 205         reset_w = Signal(self.n_dst, reset_less=True)
 206         reset_r = Signal(self.n_src, reset_less=True)
 207         m.d.comb += reset.eq(req_done | self.go_die_i)
 208         m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
 209         m.d.comb += reset_w.eq(self.wr.go | Repl(self.go_die_i, self.n_dst))
 210         m.d.comb += reset_r.eq(self.rd.go | Repl(self.go_die_i, self.n_src))
 211
 212         # read-done,wr-proceed latch
 213         m.d.comb += rok_l.s.eq(self.issue_i)  # set up when issue starts
 214         m.d.comb += rok_l.r.eq(self.alu.p.ready_o) # off when ALU acknowledges
 215
 216         # wr-done, back-to-start latch
 217         m.d.comb += rst_l.s.eq(all_rd)     # set when read-phase is fully done
 218         m.d.comb += rst_l.r.eq(rst_r)        # *off* on issue
 219
 220         # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
 221         m.d.sync += opc_l.s.eq(self.issue_i)       # set on issue
 222         m.d.sync += opc_l.r.eq(self.alu.n.valid_o & req_done) # reset on ALU
 223
 224         # src operand latch (not using go_wr_i)
 225         m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src))
 226         m.d.sync += src_l.r.eq(reset_r)
 227
 228         # dest operand latch (not using issue_i)
 229         m.d.sync += req_l.s.eq(Repl(all_rd, self.n_dst))
 230         m.d.sync += req_l.r.eq(reset_w)
 231
 232         # create a latch/register for the operand
 233         oper_r = self.opsubsetkls()
 234         latchregister(m, self.oper_i, oper_r, self.issue_i, "oper_r")
 235
 236         # and for each output from the ALU
 237         drl = []
 238         for i in range(self.n_dst):
 239             name = "data_r%d" % i
 240             data_r = Signal(self.cu._get_srcwid(i), name=name, reset_less=True)
 241             latchregister(m, self.get_out(i), data_r, req_l.q[i], name)
 242             drl.append(data_r)
 243
 244         # pass the operation to the ALU
 245         m.d.comb += self.get_op().eq(oper_r)
 246
 247         # create list of src/alu-src/src-latch.  override 1st and 2nd one below.
 248         # in the case, for ALU and Logical pipelines, we assume RB is the 2nd operand
 249         # in the input "regspec".  see for example soc.fu.alu.pipe_data.ALUInputData
 250         # TODO: assume RA is the 1st operand, zero_a detection is needed.
 251         sl = []
 252         for i in range(self.n_src):
 253             sl.append([self.src_i[i], self.get_in(i), src_l.q[i]])
 254
 255         # if the operand subset has "zero_a" we implicitly assume that means
 256         # src_i[0] is an INT register type where zero can be multiplexed in, instead.
 257         # see https://bugs.libre-soc.org/show_bug.cgi?id=336
 258         #if hasattr(oper_r, "zero_a"):
 259             # select zero immediate if opcode says so.  however also change the latch
 260             # to trigger *from* the opcode latch instead.
 261             # ...
 262             # ...
 263
 264         # if the operand subset has "imm_data" we implicitly assume that means
 265         # "this is an INT ALU/Logical FU jobbie, RB is multiplexed with the immediate"
 266         if hasattr(oper_r, "imm_data"):
 267             # select immediate if opcode says so.  however also change the latch
 268             # to trigger *from* the opcode latch instead.
 269             op_is_imm = oper_r.imm_data.imm_ok
 270             src2_or_imm = Signal(self.cu._get_srcwid(1), reset_less=True)
 271             src_sel = Signal(reset_less=True)
 272             m.d.comb += src_sel.eq(Mux(op_is_imm, opc_l.q, src_l.q[1]))
 273             m.d.comb += src2_or_imm.eq(Mux(op_is_imm, oper_r.imm_data.imm,
 274                                                       self.src2_i))
 275             # overwrite 2nd src-latch with immediate-muxed stuff
 276             sl[1][0] = src2_or_imm
 277             sl[1][2] = src_sel
 278
 279         # create a latch/register for src1/src2 (even if it is a copy of an immediate)
 280         for i in range(self.n_src):
 281             src, alusrc, latch = sl[i]
 282             latchregister(m, src, alusrc, latch, name="src_r%d" % i)
 283
 284         # -----
 285         # outputs
 286         # -----
 287
 288         # all request signals gated by busy_o.  prevents picker problems
 289         m.d.comb += self.busy_o.eq(opc_l.q) # busy out
 290         bro = Repl(self.busy_o, self.n_src)
 291         m.d.comb += self.rd.rel.eq(src_l.q & bro) # src1/src2 req rel
 292
 293         # on a go_read, tell the ALU we're accepting data.
 294         # NOTE: this spells TROUBLE if the ALU isn't ready!
 295         # go_read is only valid for one clock!
 296         with m.If(all_rd):                           # src operands ready, GO!
 297             with m.If(~self.alu.p.ready_o):          # no ACK yet
 298                 m.d.comb += self.alu.p.valid_i.eq(1) # so indicate valid
 299
 300         brd = Repl(self.busy_o & self.shadown_i, self.n_dst)
 301         # only proceed if ALU says its output is valid
 302         with m.If(self.alu.n.valid_o):
 303             # when ALU ready, write req release out. waits for shadow
 304             m.d.comb += self.wr.rel.eq(req_l.q & brd)
 305             # when output latch is ready, and ALU says ready, accept ALU output
 306             with m.If(reset):
 307                 m.d.comb += self.alu.n.ready_i.eq(1) # tells ALU "thanks got it"
 308
 309         # output the data from the latch on go_write
 310         for i in range(self.n_dst):
 311             with m.If(self.wr.go[i]):
 312                 m.d.comb += self.dest[i].eq(drl[i])
 313
 314         return m
 315
 316     def __iter__(self):
 317         yield self.rd.go
 318         yield self.wr.go
 319         yield self.issue_i
 320         yield self.shadown_i
 321         yield self.go_die_i
 322         yield from self.oper_i.ports()
 323         yield self.src1_i
 324         yield self.src2_i
 325         yield self.busy_o
 326         yield self.rd.rel
 327         yield self.wr.rel
 328         yield self.data_o
 329
 330     def ports(self):
 331         return list(self)
 332
 333
 334 def op_sim(dut, a, b, op, inv_a=0, imm=0, imm_ok=0):
 335     yield dut.issue_i.eq(0)
 336     yield
 337     yield dut.src_i[0].eq(a)
 338     yield dut.src_i[1].eq(b)
 339     yield dut.oper_i.insn_type.eq(op)
 340     yield dut.oper_i.invert_a.eq(inv_a)
 341     yield dut.oper_i.imm_data.imm.eq(imm)
 342     yield dut.oper_i.imm_data.imm_ok.eq(imm_ok)
 343     yield dut.issue_i.eq(1)
 344     yield
 345     yield dut.issue_i.eq(0)
 346     yield
 347     yield dut.rd.go.eq(0b11)
 348     while True:
 349         yield
 350         rd_rel_o = yield dut.rd.rel
 351         print ("rd_rel", rd_rel_o)
 352         if rd_rel_o:
 353             break
 354     yield
 355     yield dut.rd.go.eq(0)
 356     req_rel_o = yield dut.wr.rel
 357     result = yield dut.data_o
 358     print ("req_rel", req_rel_o, result)
 359     while True:
 360         req_rel_o = yield dut.wr.rel
 361         result = yield dut.data_o
 362         print ("req_rel", req_rel_o, result)
 363         if req_rel_o:
 364             break
 365         yield
 366     yield dut.wr.go[0].eq(1)
 367     yield
 368     result = yield dut.data_o
 369     print ("result", result)
 370     yield dut.wr.go[0].eq(0)
 371     yield
 372     return result
 373
 374
 375 def scoreboard_sim(dut):
 376     result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD, inv_a=0,
 377                                     imm=8, imm_ok=1)
 378     assert result == 13
 379
 380     result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD)
 381     assert result == 7
 382
 383     result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD, inv_a=1)
 384     assert result == 65532
 385
 386
 387 def test_compunit():
 388     from alu_hier import ALU
 389     from soc.fu.alu.alu_input_record import CompALUOpSubset
 390
 391     m = Module()
 392     alu = ALU(16)
 393     dut = MultiCompUnit(16, alu, CompALUOpSubset)
 394     m.submodules.cu = dut
 395
 396     vl = rtlil.convert(dut, ports=dut.ports())
 397     with open("test_compunit1.il", "w") as f:
 398         f.write(vl)
 399
 400     run_simulation(m, scoreboard_sim(dut), vcd_name='test_compunit1.vcd')
 401
 402
 403 def test_compunit_regspec1():
 404     from alu_hier import ALU
 405     from soc.fu.alu.alu_input_record import CompALUOpSubset
 406
 407     inspec = [('INT', 'a', '0:15'),
 408               ('INT', 'b', '0:15')]
 409     outspec = [('INT', 'o', '0:15'),
 410               ]
 411
 412     regspec = (inspec, outspec)
 413
 414     m = Module()
 415     alu = ALU(16)
 416     dut = MultiCompUnit(regspec, alu, CompALUOpSubset)
 417     m.submodules.cu = dut
 418
 419     vl = rtlil.convert(dut, ports=dut.ports())
 420     with open("test_compunit_regspec1.il", "w") as f:
 421         f.write(vl)
 422
 423     run_simulation(m, scoreboard_sim(dut), vcd_name='test_compunit1.vcd')
 424
 425
 426 if __name__ == '__main__':
 427     test_compunit()
 428     test_compunit_regspec1()