src/soc/experiment/compalu_multi.py

   1 from nmigen.compat.sim import run_simulation
   2 from nmigen.cli import verilog, rtlil
   3 from nmigen import Module, Signal, Mux, Elaboratable, Repl, Array, Record
   4 from nmigen.hdl.rec import (DIR_FANIN, DIR_FANOUT)
   5
   6 from nmutil.latch import SRLatch, latchregister
   7 from nmutil.iocontrol import RecordObject
   8
   9 from soc.decoder.power_decoder2 import Data
  10 from soc.decoder.power_enums import InternalOp
  11
  12
  13 """ Computation Unit (aka "ALU Manager").
  14
  15     This module runs a "revolving door" set of three latches, based on
  16     * Issue
  17     * Go_Read
  18     * Go_Write
  19     where one of them cannot be set on any given cycle.
  20
  21     * When issue is first raised, a busy signal is sent out.
  22       The src1 and src2 registers and the operand can be latched in
  23       at this point
  24
  25     * Read request is set, which is acknowledged through the Scoreboard
  26       to the priority picker, which generates (one and only one) Go_Read
  27       at a time.  One of those will (eventually) be this Computation Unit.
  28
  29     * Once Go_Read is set, the src1/src2/operand latch door shuts (locking
  30       src1/src2/operand in place), and the ALU is told to proceed.
  31
  32     * when the ALU pipeline is ready, this activates "write request release",
  33       and the ALU's output is captured into a temporary register.
  34
  35     * Write request release is *HELD UP* (prevented from proceeding) if shadowN
  36       is asserted LOW.  This is how all speculation, precise exceptions,
  37       predication - everything - is achieved.
  38
  39     * Write request release will go through a similar process as Read request,
  40       resulting (eventually) in Go_Write being asserted.
  41
  42     * When Go_Write is asserted, two things happen: (1) the data in the temp
  43       register is placed combinatorially onto the output, and (2) the
  44       req_l latch is cleared, busy is dropped, and the Comp Unit is back
  45       through its revolving door to do another task.
  46
  47     Note that the read and write latches are held synchronously for one cycle,
  48     i.e. that when Go_Read comes in, one cycle is given in which the incoming
  49     register (broadcast over a Regfile Read Port) may have time to be latched.
  50
  51     It is REQUIRED that Go_Read be held valid only for one cycle, and it is
  52     REQUIRED that the corresponding Read_Req be dropped exactly one cycle after
  53     Go_Read is asserted HI.
  54
  55     Likewise for Go_Write: this is asserted for one cycle, and Req_Writes must
  56     likewise be dropped exactly one cycle after assertion of Go_Write.
  57
  58     When Go_Die is asserted then strictly speaking the entire FSM should be
  59     fully reset and that includes sending a cancellation request to the ALU.
  60     (XXX TODO: alu "go die" is not presently wired up)
  61 """
  62
  63 def go_record(n, name):
  64     r = Record([('go', n, DIR_FANIN),
  65                 ('rel', n, DIR_FANOUT)], name=name)
  66     r.go.reset_less = True
  67     r.rel.reset_less = True
  68     return r
  69
  70 # see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
  71 def get_regspec_bitwidth(regspec, srcdest, idx):
  72     bitspec = regspec[srcdest][idx]
  73     wid = 0
  74     print (bitspec)
  75     for ranges in bitspec[2].split(","):
  76         ranges = ranges.split(":")
  77         print (ranges)
  78         if len(ranges) == 1: # only one bit
  79             wid += 1
  80         else:
  81             start, end = map(int, ranges)
  82             wid += (end-start)+1
  83     return wid
  84
  85
  86 class CompUnitRecord(RecordObject):
  87     """CompUnitRecord
  88
  89     base class for Computation Units, to provide a uniform API
  90     and allow "record.connect" etc. to be used, particularly when
  91     it comes to connecting multiple Computation Units up as a block
  92     (very laborious)
  93
  94     LDSTCompUnitRecord should derive from this class and add the
  95     additional signals it requires
  96
  97     :subkls:      the class (not an instance) needed to construct the opcode
  98     :rwid:        either an integer (specifies width of all regs) or a "regspec"
  99
 100     see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
 101     """
 102     def __init__(self, subkls, rwid, n_src=None, n_dst=None, name=None):
 103         RecordObject.__init__(self, name)
 104         self._rwid = rwid
 105         if isinstance(rwid, int):
 106             # rwid: integer (covers all registers)
 107             self._n_src, self._n_dst = n_src, n_dst
 108         else:
 109             # rwid: a regspec.
 110             self._n_src, self._n_dst = len(rwid[0]), len(rwid[1])
 111         self._subkls = subkls
 112
 113         src = []
 114         for i in range(n_src):
 115             j = i + 1 # name numbering to match src1/src2
 116             name = "src%d_i" % j
 117             rw = self._get_srcwid(i)
 118             sreg = Signal(rw, name=name, reset_less=True)
 119             setattr(self, name, sreg)
 120             src.append(sreg)
 121         self._src_i = src
 122
 123         dst = []
 124         for i in range(n_dst):
 125             j = i + 1 # name numbering to match dest1/2...
 126             name = "dest%d_i" % j
 127             rw = self._get_dstwid(i)
 128             dreg = Signal(rw, name=name, reset_less=True)
 129             setattr(self, name, dreg)
 130             dst.append(dreg)
 131         self._dest = dst
 132
 133         self.rd = go_record(n_src, name="rd") # read in, req out
 134         self.wr = go_record(n_dst, name="wr") # write in, req out
 135         self.issue_i = Signal(reset_less=True) # fn issue in
 136         self.shadown_i = Signal(reset=1) # shadow function, defaults to ON
 137         self.go_die_i = Signal() # go die (reset)
 138
 139         # operation / data input
 140         self.oper_i = subkls() # operand
 141
 142         # output (busy/done)
 143         self.busy_o = Signal(reset_less=True) # fn busy out
 144         self.done_o = Signal(reset_less=True)
 145
 146     def _get_dstwid(self, i):
 147         if isinstance(self._rwid, int):
 148             return self._rwid
 149         return get_regspec_bitwidth(self._rwid, 1, i)
 150
 151     def _get_srcwid(self, i):
 152         if isinstance(self._rwid, int):
 153             return self._rwid
 154         return get_regspec_bitwidth(self._rwid, 0, i)
 155
 156
 157 class RegSpecALUAPI:
 158     def __init__(self, rwid, alu):
 159         """RegSpecAPI
 160
 161         * :rwid:       regspec
 162         * :alu:        ALU covered by this regspec
 163         """
 164         self.rwid = rwid
 165         self.alu = alu # actual ALU - set as a "submodule" of the CU
 166
 167     def get_out(self, i):
 168         if isinstance(self.rwid, int): # old - testing - API (rwid is int)
 169             return self.alu.out[i]
 170         # regspec-based API: look up variable through regspec according to row number
 171         return getattr(self.alu.n.data_o, self.rwid[1][i][1])
 172
 173     def get_in(self, i):
 174         if isinstance(self.rwid, int): # old - testing - API (rwid is int)
 175             return self.alu.i[i]
 176         # regspec-based API: look up variable through regspec according to row number
 177         return getattr(self.alu.p.data_i, self.rwid[0][i][1])
 178
 179     def get_op(self):
 180         if isinstance(self.rwid, int): # old - testing - API (rwid is int)
 181             return self.alu.op
 182         return self.alu.p.data_i.ctx.op
 183
 184
 185 class MultiCompUnit(RegSpecALUAPI, Elaboratable):
 186     def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1):
 187         """MultiCompUnit
 188
 189         * :rwid:        width of register latches (TODO: allocate per regspec)
 190         * :alu:         the ALU (pipeline, FSM) - must conform to nmutil Pipe API
 191         * :opsubsetkls: the subset of Decode2ExecuteType
 192         * :n_src:       number of src operands
 193         * :n_dst:       number of destination operands
 194         """
 195         RegSpecALUAPI.__init__(self, rwid, alu)
 196         self.n_src, self.n_dst = n_src, n_dst
 197         self.opsubsetkls = opsubsetkls
 198         self.cu = cu = CompUnitRecord(opsubsetkls, rwid, n_src, n_dst)
 199
 200         for i in range(n_src):
 201             j = i + 1 # name numbering to match src1/src2
 202             name = "src%d_i" % j
 203             setattr(self, name, getattr(cu, name))
 204
 205         for i in range(n_dst):
 206             j = i + 1 # name numbering to match dest1/2...
 207             name = "dest%d_i" % j
 208             setattr(self, name, getattr(cu, name))
 209
 210         # convenience names
 211         self.rd = cu.rd
 212         self.wr = cu.wr
 213         self.go_rd_i = self.rd.go # temporary naming
 214         self.go_wr_i = self.wr.go # temporary naming
 215         self.rd_rel_o = self.rd.rel # temporary naming
 216         self.req_rel_o = self.wr.rel # temporary naming
 217         self.issue_i = cu.issue_i
 218         self.shadown_i = cu.shadown_i
 219         self.go_die_i = cu.go_die_i
 220
 221         # operation / data input
 222         self.oper_i = cu.oper_i
 223         self.src_i = cu._src_i
 224
 225         self.busy_o = cu.busy_o
 226         self.dest = cu._dest
 227         self.data_o = self.dest[0] # Dest out
 228         self.done_o = cu.done_o
 229
 230     def elaborate(self, platform):
 231         m = Module()
 232         m.submodules.alu = self.alu
 233         m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
 234         m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
 235         m.submodules.req_l = req_l = SRLatch(False, self.n_dst, name="req")
 236         m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
 237         m.submodules.rok_l = rok_l = SRLatch(sync=False, name="rdok")
 238
 239         # ALU only proceeds when all src are ready.  rd_rel_o is delayed
 240         # so combine it with go_rd_i.  if all bits are set we're good
 241         all_rd = Signal(reset_less=True)
 242         m.d.comb += all_rd.eq(self.busy_o & rok_l.q &
 243                     (((~self.rd.rel) | self.rd.go).all()))
 244
 245         # write_requests all done
 246         # req_done works because any one of the last of the writes
 247         # is enough, when combined with when read-phase is done (rst_l.q)
 248         wr_any = Signal(reset_less=True)
 249         req_done = Signal(reset_less=True)
 250         m.d.comb += self.done_o.eq(self.busy_o & ~(self.wr.rel.bool()))
 251         m.d.comb += wr_any.eq(self.wr.go.bool())
 252         m.d.comb += req_done.eq(rst_l.q & wr_any)
 253
 254         # shadow/go_die
 255         reset = Signal(reset_less=True)
 256         rst_r = Signal(reset_less=True) # reset latch off
 257         reset_w = Signal(self.n_dst, reset_less=True)
 258         reset_r = Signal(self.n_src, reset_less=True)
 259         m.d.comb += reset.eq(req_done | self.go_die_i)
 260         m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
 261         m.d.comb += reset_w.eq(self.wr.go | Repl(self.go_die_i, self.n_dst))
 262         m.d.comb += reset_r.eq(self.rd.go | Repl(self.go_die_i, self.n_src))
 263
 264         # read-done,wr-proceed latch
 265         m.d.comb += rok_l.s.eq(self.issue_i)  # set up when issue starts
 266         m.d.comb += rok_l.r.eq(self.alu.p.ready_o) # off when ALU acknowledges
 267
 268         # wr-done, back-to-start latch
 269         m.d.comb += rst_l.s.eq(all_rd)     # set when read-phase is fully done
 270         m.d.comb += rst_l.r.eq(rst_r)        # *off* on issue
 271
 272         # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
 273         m.d.sync += opc_l.s.eq(self.issue_i)       # set on issue
 274         m.d.sync += opc_l.r.eq(self.alu.n.valid_o & req_done) # reset on ALU
 275
 276         # src operand latch (not using go_wr_i)
 277         m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src))
 278         m.d.sync += src_l.r.eq(reset_r)
 279
 280         # dest operand latch (not using issue_i)
 281         m.d.sync += req_l.s.eq(Repl(all_rd, self.n_dst))
 282         m.d.sync += req_l.r.eq(reset_w)
 283
 284         # create a latch/register for the operand
 285         oper_r = self.opsubsetkls()
 286         latchregister(m, self.oper_i, oper_r, self.issue_i, "oper_r")
 287
 288         # and for each output from the ALU
 289         drl = []
 290         for i in range(self.n_dst):
 291             name = "data_r%d" % i
 292             data_r = Signal(self.cu._get_srcwid(i), name=name, reset_less=True)
 293             latchregister(m, self.get_out(i), data_r, req_l.q[i], name)
 294             drl.append(data_r)
 295
 296         # pass the operation to the ALU
 297         m.d.comb += self.get_op().eq(oper_r)
 298
 299         # create list of src/alu-src/src-latch.  override 1st and 2nd one below.
 300         # in the case, for ALU and Logical pipelines, we assume RB is the 2nd operand
 301         # in the input "regspec".  see for example soc.fu.alu.pipe_data.ALUInputData
 302         # TODO: assume RA is the 1st operand, zero_a detection is needed.
 303         sl = []
 304         for i in range(self.n_src):
 305             sl.append([self.src_i[i], self.get_in(i), src_l.q[i]])
 306
 307         # if the operand subset has "zero_a" we implicitly assume that means
 308         # src_i[0] is an INT register type where zero can be multiplexed in, instead.
 309         # see https://bugs.libre-soc.org/show_bug.cgi?id=336
 310         #if hasattr(oper_r, "zero_a"):
 311             # select zero immediate if opcode says so.  however also change the latch
 312             # to trigger *from* the opcode latch instead.
 313             # ...
 314             # ...
 315
 316         # if the operand subset has "imm_data" we implicitly assume that means
 317         # "this is an INT ALU/Logical FU jobbie, RB is multiplexed with the immediate"
 318         if hasattr(oper_r, "imm_data"):
 319             # select immediate if opcode says so.  however also change the latch
 320             # to trigger *from* the opcode latch instead.
 321             op_is_imm = oper_r.imm_data.imm_ok
 322             src2_or_imm = Signal(self.cu._get_srcwid(1), reset_less=True)
 323             src_sel = Signal(reset_less=True)
 324             m.d.comb += src_sel.eq(Mux(op_is_imm, opc_l.q, src_l.q[1]))
 325             m.d.comb += src2_or_imm.eq(Mux(op_is_imm, oper_r.imm_data.imm,
 326                                                       self.src2_i))
 327             # overwrite 2nd src-latch with immediate-muxed stuff
 328             sl[1][0] = src2_or_imm
 329             sl[1][2] = src_sel
 330
 331         # create a latch/register for src1/src2 (even if it is a copy of an immediate)
 332         for i in range(self.n_src):
 333             src, alusrc, latch = sl[i]
 334             latchregister(m, src, alusrc, latch, name="src_r%d" % i)
 335
 336         # -----
 337         # outputs
 338         # -----
 339
 340         # all request signals gated by busy_o.  prevents picker problems
 341         m.d.comb += self.busy_o.eq(opc_l.q) # busy out
 342         bro = Repl(self.busy_o, self.n_src)
 343         m.d.comb += self.rd.rel.eq(src_l.q & bro) # src1/src2 req rel
 344
 345         # on a go_read, tell the ALU we're accepting data.
 346         # NOTE: this spells TROUBLE if the ALU isn't ready!
 347         # go_read is only valid for one clock!
 348         with m.If(all_rd):                           # src operands ready, GO!
 349             with m.If(~self.alu.p.ready_o):          # no ACK yet
 350                 m.d.comb += self.alu.p.valid_i.eq(1) # so indicate valid
 351
 352         brd = Repl(self.busy_o & self.shadown_i, self.n_dst)
 353         # only proceed if ALU says its output is valid
 354         with m.If(self.alu.n.valid_o):
 355             # when ALU ready, write req release out. waits for shadow
 356             m.d.comb += self.wr.rel.eq(req_l.q & brd)
 357             # when output latch is ready, and ALU says ready, accept ALU output
 358             with m.If(reset):
 359                 m.d.comb += self.alu.n.ready_i.eq(1) # tells ALU "thanks got it"
 360
 361         # output the data from the latch on go_write
 362         for i in range(self.n_dst):
 363             with m.If(self.wr.go[i]):
 364                 m.d.comb += self.dest[i].eq(drl[i])
 365
 366         return m
 367
 368     def __iter__(self):
 369         yield self.rd.go
 370         yield self.wr.go
 371         yield self.issue_i
 372         yield self.shadown_i
 373         yield self.go_die_i
 374         yield from self.oper_i.ports()
 375         yield self.src1_i
 376         yield self.src2_i
 377         yield self.busy_o
 378         yield self.rd.rel
 379         yield self.wr.rel
 380         yield self.data_o
 381
 382     def ports(self):
 383         return list(self)
 384
 385
def op_sim(dut, a, b, op, inv_a=0, imm=0, imm_ok=0):
    """Simulation process: drive one operation through the Computation Unit.

    * :dut:     the unit under test (its issue_i, src_i, oper_i, rd, wr and
                data_o attributes are used)
    * :a:       value applied to src_i[0]
    * :b:       value applied to src_i[1]
    * :op:      value applied to oper_i.insn_type
    * :inv_a:   value applied to oper_i.invert_a
    * :imm:     value applied to oper_i.imm_data.imm
    * :imm_ok:  value applied to oper_i.imm_data.imm_ok

    Returns the value read from data_o while wr.go[0] is asserted.

    This is a generator intended to be run with "yield from" inside a
    simulation process.  NOTE(review): a bare "yield" is presumably one
    clock step of the nmigen compat simulator - confirm against its docs.
    """
    # start with issue de-asserted
    yield dut.issue_i.eq(0)
    yield
    # set up operands and operation, then raise issue
    yield dut.src_i[0].eq(a)
    yield dut.src_i[1].eq(b)
    yield dut.oper_i.insn_type.eq(op)
    yield dut.oper_i.invert_a.eq(inv_a)
    yield dut.oper_i.imm_data.imm.eq(imm)
    yield dut.oper_i.imm_data.imm_ok.eq(imm_ok)
    yield dut.issue_i.eq(1)
    yield
    # drop issue again after one step
    yield dut.issue_i.eq(0)
    yield
    # assert go-read on both source operands and wait until the unit
    # shows a (non-zero) read-request release
    yield dut.rd.go.eq(0b11)
    while True:
        yield
        rd_rel_o = yield dut.rd.rel
        print ("rd_rel", rd_rel_o)
        if rd_rel_o:
            break
    yield
    # read phase over: drop go-read
    yield dut.rd.go.eq(0)
    # diagnostic snapshot before entering the wait loop
    req_rel_o = yield dut.wr.rel
    result = yield dut.data_o
    print ("req_rel", req_rel_o, result)
    # wait until the unit raises a write-request release
    while True:
        req_rel_o = yield dut.wr.rel
        result = yield dut.data_o
        print ("req_rel", req_rel_o, result)
        if req_rel_o:
            break
        yield
    # assert go-write on destination 0 and sample the output
    yield dut.wr.go[0].eq(1)
    yield
    result = yield dut.data_o
    print ("result", result)
    # release go-write before returning
    yield dut.wr.go[0].eq(0)
    yield
    return result
 425
 426
 427 def scoreboard_sim(dut):
 428     result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD, inv_a=0,
 429                                     imm=8, imm_ok=1)
 430     assert result == 13
 431
 432     result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD)
 433     assert result == 7
 434
 435     result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD, inv_a=1)
 436     assert result == 65532
 437
 438
 439 def test_compunit():
 440     from alu_hier import ALU
 441     from soc.fu.alu.alu_input_record import CompALUOpSubset
 442
 443     m = Module()
 444     alu = ALU(16)
 445     dut = MultiCompUnit(16, alu, CompALUOpSubset)
 446     m.submodules.cu = dut
 447
 448     vl = rtlil.convert(dut, ports=dut.ports())
 449     with open("test_compunit1.il", "w") as f:
 450         f.write(vl)
 451
 452     run_simulation(m, scoreboard_sim(dut), vcd_name='test_compunit1.vcd')
 453
 454
 455 def test_compunit_regspec1():
 456     from alu_hier import ALU
 457     from soc.fu.alu.alu_input_record import CompALUOpSubset
 458
 459     inspec = [('INT', 'a', '0:15'),
 460               ('INT', 'b', '0:15')]
 461     outspec = [('INT', 'o', '0:15'),
 462               ]
 463
 464     regspec = (inspec, outspec)
 465
 466     m = Module()
 467     alu = ALU(16)
 468     dut = MultiCompUnit(regspec, alu, CompALUOpSubset)
 469     m.submodules.cu = dut
 470
 471     vl = rtlil.convert(dut, ports=dut.ports())
 472     with open("test_compunit_regspec1.il", "w") as f:
 473         f.write(vl)
 474
 475     run_simulation(m, scoreboard_sim(dut), vcd_name='test_compunit1.vcd')
 476
 477
 478 if __name__ == '__main__':
 479     test_compunit()
 480     test_compunit_regspec1()