1 from nmigen
.compat
.sim
import run_simulation
2 from nmigen
.cli
import verilog
, rtlil
3 from nmigen
import Module
, Const
, Signal
, Array
, Cat
, Elaboratable
5 from regfile
.regfile
import RegFileArray
, treereduce
6 from scoreboard
.fu_fu_matrix
import FUFUDepMatrix
7 from scoreboard
.fu_reg_matrix
import FURegDepMatrix
8 from scoreboard
.global_pending
import GlobalPending
9 from scoreboard
.group_picker
import GroupPicker
10 from scoreboard
.issue_unit
import IssueUnitGroup
, IssueUnitArray
, RegDecode
11 from scoreboard
.shadow
import ShadowMatrix
, BranchSpeculationRecord
12 from scoreboard
.instruction_q
import Instruction
, InstructionQ
13 from scoreboard
.memfu
import MemFunctionUnits
15 from compalu
import ComputationUnitNoDelay
16 from compldst
import LDSTCompUnit
18 from alu_hier
import ALU
, BranchALU
19 from nmutil
.latch
import SRLatch
20 from nmutil
.nmoperator
import eq
22 from random
import randint
, seed
23 from copy
import deepcopy
class Memory(Elaboratable):
    """Thin wrapper exposing one read port and one write port of a memory.

    Attributes created by the constructor:

    * :adr:   address input (addrw bits), shared by both ports
    * :dat_r: data read from the read port (regwid bits)
    * :dat_w: data presented to the write port (regwid bits)
    * :we:    write enable, wired to the write port's ``en``
    * :mem:   the underlying nmigen memory, initialised to 0..depth-1
    """

    def __init__(self, regwid, addrw):
        """
        * :regwid: bit width of one memory word
        * :addrw:  bit width of the address input
        """
        # This class is itself called Memory, so the bare name would call
        # this constructor again (with keyword arguments it does not
        # accept).  Import the nmigen primitive under an alias instead.
        from nmigen import Memory as NMigenMemory

        # integer (floor) division: these values are used as a slice bound
        # and as a range() limit, neither of which accepts a float
        self.ddepth = regwid // 8
        depth = (1 << addrw) // self.ddepth
        self.adr = Signal(addrw)
        self.dat_r = Signal(regwid)
        self.dat_w = Signal(regwid)
        # elaborate() reads self.we, so it has to exist
        # NOTE(review): assumed to be a 1-bit enable - confirm
        self.we = Signal()
        self.mem = NMigenMemory(width=regwid, depth=depth,
                                init=range(0, depth))

    def elaborate(self, platform):
        """Create the read/write ports and wire them to the attributes."""
        m = Module()
        m.submodules.rdport = rdport = self.mem.read_port()
        m.submodules.wrport = wrport = self.mem.write_port()
        # NOTE(review): the read address drops the low self.ddepth bits
        # whereas the write address uses self.adr unsliced - confirm that
        # the asymmetry is intended.
        m.d.comb += [
            rdport.addr.eq(self.adr[self.ddepth:]),  # ignore low bits
            self.dat_r.eq(rdport.data),
            wrport.addr.eq(self.adr),
            wrport.data.eq(self.dat_w),
            wrport.en.eq(self.we),
        ]
        return m
def __init__(self, regwid, addrw):
    """Set up a plain-Python memory model.

    * :regwid: bit width of one memory word (st() masks data to this)
    * :addrw:  bit width of an address

    The backing store is a list holding 0..depth-1, where depth is
    (1<<addrw) // (regwid//8).
    """
    # st() masks stored data with (1<<self.regwid)-1, so keep the width
    self.regwid = regwid
    self.ddepth = regwid // 8
    depth = (1 << addrw) // self.ddepth
    self.mem = list(range(0, depth))
59 return self
.mem
[addr
>>self
.ddepth
]
def st(self, addr, data):
    """Store data, masked to self.regwid bits, at slot addr>>self.ddepth."""
    value = data & ((1 << self.regwid) - 1)
    slots = self.mem
    slots[addr >> self.ddepth] = value
65 class CompUnitsBase(Elaboratable
):
66 """ Computation Unit Base class.
68 Amazingly, this class works recursively. It's supposed to just
69 look after some ALUs (that can handle the same operations),
70 grouping them together, however it turns out that the same code
71 can also group *groups* of Computation Units together as well.
73 Basically it was intended just to concatenate the ALU's issue,
74 go_rd etc. signals together, which start out as bits and become
75 sequences. Turns out that the same trick works just as well
78 So this class may be used recursively to present a top-level
79 sequential concatenation of all the signals in and out of
80 ALUs, whilst at the same time making it convenient to group
83 At the lower level, the intent is that groups of (identical)
84 ALUs may be passed the same operation. Even beyond that,
85 the intent is that that group of (identical) ALUs actually
86 share the *same pipeline* and as such become a "Concurrent
87 Computation Unit" as defined by Mitch Alsup (see section
90 def __init__(self
, rwid
, units
, ldstmode
=False):
93 * :rwid: bit width of register file(s) - both FP and INT
94 * :units: sequence of ALUs (or CompUnitsBase derivatives)
97 self
.ldstmode
= ldstmode
100 if units
and isinstance(units
[0], CompUnitsBase
):
103 self
.n_units
+= u
.n_units
105 self
.n_units
= len(units
)
107 n_units
= self
.n_units
110 self
.issue_i
= Signal(n_units
, reset_less
=True)
111 self
.go_rd_i
= Signal(n_units
, reset_less
=True)
112 self
.go_wr_i
= Signal(n_units
, reset_less
=True)
113 self
.shadown_i
= Signal(n_units
, reset_less
=True)
114 self
.go_die_i
= Signal(n_units
, reset_less
=True)
116 self
.go_ad_i
= Signal(n_units
, reset_less
=True)
119 self
.busy_o
= Signal(n_units
, reset_less
=True)
120 self
.rd_rel_o
= Signal(n_units
, reset_less
=True)
121 self
.req_rel_o
= Signal(n_units
, reset_less
=True)
123 self
.adr_rel_o
= Signal(n_units
, reset_less
=True)
124 self
.sto_rel_o
= Signal(n_units
, reset_less
=True)
125 self
.req_rel_o
= Signal(n_units
, reset_less
=True)
126 self
.load_mem_o
= Signal(n_units
, reset_less
=True)
127 self
.stwd_mem_o
= Signal(n_units
, reset_less
=True)
129 # in/out register data (note: not register#, actual data)
130 self
.data_o
= Signal(rwid
, reset_less
=True)
131 self
.src1_i
= Signal(rwid
, reset_less
=True)
132 self
.src2_i
= Signal(rwid
, reset_less
=True)
135 def elaborate(self
, platform
):
139 for i
, alu
in enumerate(self
.units
):
140 setattr(m
.submodules
, "comp%d" % i
, alu
)
150 for alu
in self
.units
:
151 req_rel_l
.append(alu
.req_rel_o
)
152 rd_rel_l
.append(alu
.rd_rel_o
)
153 shadow_l
.append(alu
.shadown_i
)
154 godie_l
.append(alu
.go_die_i
)
155 go_wr_l
.append(alu
.go_wr_i
)
156 go_rd_l
.append(alu
.go_rd_i
)
157 issue_l
.append(alu
.issue_i
)
158 busy_l
.append(alu
.busy_o
)
159 comb
+= self
.rd_rel_o
.eq(Cat(*rd_rel_l
))
160 comb
+= self
.req_rel_o
.eq(Cat(*req_rel_l
))
161 comb
+= self
.busy_o
.eq(Cat(*busy_l
))
162 comb
+= Cat(*godie_l
).eq(self
.go_die_i
)
163 comb
+= Cat(*shadow_l
).eq(self
.shadown_i
)
164 comb
+= Cat(*go_wr_l
).eq(self
.go_wr_i
)
165 comb
+= Cat(*go_rd_l
).eq(self
.go_rd_i
)
166 comb
+= Cat(*issue_l
).eq(self
.issue_i
)
168 # connect data register input/output
170 # merge (OR) all integer FU / ALU outputs to a single value
171 # bit of a hack: treereduce needs a list with an item named "data_o"
173 data_o
= treereduce(self
.units
)
174 comb
+= self
.data_o
.eq(data_o
)
176 for i
, alu
in enumerate(self
.units
):
177 comb
+= alu
.src1_i
.eq(self
.src1_i
)
178 comb
+= alu
.src2_i
.eq(self
.src2_i
)
180 if not self
.ldstmode
:
188 for alu
in self
.units
:
189 adr_rel_l
.append(alu
.adr_rel_o
)
190 sto_rel_l
.append(alu
.sto_rel_o
)
191 ldmem_l
.append(alu
.load_mem_o
)
192 stmem_l
.append(alu
.stwd_mem_o
)
193 go_ad_l
.append(alu
.go_ad_i
)
194 comb
+= self
.adr_rel_o
.eq(Cat(*adr_rel_l
))
195 comb
+= self
.sto_rel_o
.eq(Cat(*sto_rel_l
))
196 comb
+= self
.load_mem_o
.eq(Cat(*ldmem_l
))
197 comb
+= self
.stwd_mem_o
.eq(Cat(*stmem_l
))
198 comb
+= Cat(*go_ad_l
).eq(self
.go_ad_i
)
203 class CompUnitLDSTs(CompUnitsBase
):
205 def __init__(self
, rwid
, opwid
, mem
):
208 * :rwid: bit width of register file(s) - both FP and INT
209 * :opwid: operand bit width
214 self
.oper_i
= Signal(opwid
, reset_less
=True)
215 self
.imm_i
= Signal(rwid
, reset_less
=True)
222 for alu
in [add1
, add2
]:
223 aluopwid
= 4 # see compldst.py for "internal" opcode
224 units
.append(LDSTCompUnit(rwid
, aluopwid
, alu
, mem
))
226 CompUnitsBase
.__init
__(self
, rwid
, units
, ldstmode
=True)
228 def elaborate(self
, platform
):
229 m
= CompUnitsBase
.elaborate(self
, platform
)
232 # hand the same operation to all units, 4 lower bits though
233 for alu
in self
.units
:
234 comb
+= alu
.oper_i
[0:4].eq(self
.oper_i
)
235 comb
+= alu
.imm_i
.eq(self
.imm_i
)
236 comb
+= alu
.isalu_i
.eq(0)
241 class CompUnitALUs(CompUnitsBase
):
243 def __init__(self
, rwid
, opwid
, n_alus
):
246 * :rwid: bit width of register file(s) - both FP and INT
247 * :opwid: operand bit width
252 self
.oper_i
= Signal(opwid
, reset_less
=True)
253 self
.imm_i
= Signal(rwid
, reset_less
=True)
257 for i
in range(n_alus
):
258 alus
.append(ALU(rwid
))
262 aluopwid
= 3 # extra bit for immediate mode
263 units
.append(ComputationUnitNoDelay(rwid
, aluopwid
, alu
))
265 CompUnitsBase
.__init
__(self
, rwid
, units
)
267 def elaborate(self
, platform
):
268 m
= CompUnitsBase
.elaborate(self
, platform
)
271 # hand the same operation to all units, only lower 3 bits though
272 for alu
in self
.units
:
273 comb
+= alu
.oper_i
[0:3].eq(self
.oper_i
)
274 comb
+= alu
.imm_i
.eq(self
.imm_i
)
279 class CompUnitBR(CompUnitsBase
):
281 def __init__(self
, rwid
, opwid
):
284 * :rwid: bit width of register file(s) - both FP and INT
285 * :opwid: operand bit width
287 Note: bgt unit is returned so that a shadow unit can be created
293 self
.oper_i
= Signal(opwid
, reset_less
=True)
294 self
.imm_i
= Signal(rwid
, reset_less
=True)
297 self
.bgt
= BranchALU(rwid
)
298 aluopwid
= 3 # extra bit for immediate mode
299 self
.br1
= ComputationUnitNoDelay(rwid
, aluopwid
, self
.bgt
)
300 CompUnitsBase
.__init
__(self
, rwid
, [self
.br1
])
302 def elaborate(self
, platform
):
303 m
= CompUnitsBase
.elaborate(self
, platform
)
306 # hand the same operation to all units
307 for alu
in self
.units
:
308 comb
+= alu
.oper_i
.eq(self
.oper_i
)
309 comb
+= alu
.imm_i
.eq(self
.imm_i
)
314 class FunctionUnits(Elaboratable
):
316 def __init__(self
, n_regs
, n_int_alus
):
318 self
.n_int_alus
= n_int_alus
320 self
.dest_i
= Signal(n_regs
, reset_less
=True) # Dest R# in
321 self
.src1_i
= Signal(n_regs
, reset_less
=True) # oper1 R# in
322 self
.src2_i
= Signal(n_regs
, reset_less
=True) # oper2 R# in
324 self
.g_int_rd_pend_o
= Signal(n_regs
, reset_less
=True)
325 self
.g_int_wr_pend_o
= Signal(n_regs
, reset_less
=True)
327 self
.dest_rsel_o
= Signal(n_regs
, reset_less
=True) # dest reg (bot)
328 self
.src1_rsel_o
= Signal(n_regs
, reset_less
=True) # src1 reg (bot)
329 self
.src2_rsel_o
= Signal(n_regs
, reset_less
=True) # src2 reg (bot)
331 self
.readable_o
= Signal(n_int_alus
, reset_less
=True)
332 self
.writable_o
= Signal(n_int_alus
, reset_less
=True)
334 self
.go_rd_i
= Signal(n_int_alus
, reset_less
=True)
335 self
.go_wr_i
= Signal(n_int_alus
, reset_less
=True)
336 self
.go_die_i
= Signal(n_int_alus
, reset_less
=True)
337 self
.fn_issue_i
= Signal(n_int_alus
, reset_less
=True)
339 # Note: FURegs wr_pend_o is also outputted from here, for use in WaWGrid
341 def elaborate(self
, platform
):
346 n_intfus
= self
.n_int_alus
348 # Integer FU-FU Dep Matrix
349 intfudeps
= FUFUDepMatrix(n_intfus
, n_intfus
)
350 m
.submodules
.intfudeps
= intfudeps
351 # Integer FU-Reg Dep Matrix
352 intregdeps
= FURegDepMatrix(n_intfus
, self
.n_regs
, 2)
353 m
.submodules
.intregdeps
= intregdeps
355 comb
+= self
.g_int_rd_pend_o
.eq(intregdeps
.v_rd_rsel_o
)
356 comb
+= self
.g_int_wr_pend_o
.eq(intregdeps
.v_wr_rsel_o
)
358 comb
+= intregdeps
.rd_pend_i
.eq(intregdeps
.v_rd_rsel_o
)
359 comb
+= intregdeps
.wr_pend_i
.eq(intregdeps
.v_wr_rsel_o
)
361 comb
+= intfudeps
.rd_pend_i
.eq(intregdeps
.rd_pend_o
)
362 comb
+= intfudeps
.wr_pend_i
.eq(intregdeps
.wr_pend_o
)
363 self
.wr_pend_o
= intregdeps
.wr_pend_o
# also output for use in WaWGrid
365 comb
+= intfudeps
.issue_i
.eq(self
.fn_issue_i
)
366 comb
+= intfudeps
.go_rd_i
.eq(self
.go_rd_i
)
367 comb
+= intfudeps
.go_wr_i
.eq(self
.go_wr_i
)
368 comb
+= intfudeps
.go_die_i
.eq(self
.go_die_i
)
369 comb
+= self
.readable_o
.eq(intfudeps
.readable_o
)
370 comb
+= self
.writable_o
.eq(intfudeps
.writable_o
)
372 # Connect function issue / arrays, and dest/src1/src2
373 comb
+= intregdeps
.dest_i
.eq(self
.dest_i
)
374 comb
+= intregdeps
.src_i
[0].eq(self
.src1_i
)
375 comb
+= intregdeps
.src_i
[1].eq(self
.src2_i
)
377 comb
+= intregdeps
.go_rd_i
.eq(self
.go_rd_i
)
378 comb
+= intregdeps
.go_wr_i
.eq(self
.go_wr_i
)
379 comb
+= intregdeps
.go_die_i
.eq(self
.go_die_i
)
380 comb
+= intregdeps
.issue_i
.eq(self
.fn_issue_i
)
382 comb
+= self
.dest_rsel_o
.eq(intregdeps
.dest_rsel_o
)
383 comb
+= self
.src1_rsel_o
.eq(intregdeps
.src_rsel_o
[0])
384 comb
+= self
.src2_rsel_o
.eq(intregdeps
.src_rsel_o
[1])
389 class Scoreboard(Elaboratable
):
390 def __init__(self
, rwid
, n_regs
):
393 * :rwid: bit width of register file(s) - both FP and INT
394 * :n_regs: depth of register file(s) - number of FP and INT regs
400 self
.intregs
= RegFileArray(rwid
, n_regs
)
401 self
.fpregs
= RegFileArray(rwid
, n_regs
)
403 # issue q needs to get at these
404 self
.aluissue
= IssueUnitGroup(4)
405 self
.brissue
= IssueUnitGroup(1)
407 self
.alu_oper_i
= Signal(4, reset_less
=True)
408 self
.alu_imm_i
= Signal(rwid
, reset_less
=True)
409 self
.br_oper_i
= Signal(4, reset_less
=True)
410 self
.br_imm_i
= Signal(rwid
, reset_less
=True)
413 self
.int_dest_i
= Signal(max=n_regs
, reset_less
=True) # Dest R# in
414 self
.int_src1_i
= Signal(max=n_regs
, reset_less
=True) # oper1 R# in
415 self
.int_src2_i
= Signal(max=n_regs
, reset_less
=True) # oper2 R# in
416 self
.reg_enable_i
= Signal(reset_less
=True) # enable reg decode
419 self
.issue_o
= Signal(reset_less
=True) # instruction was accepted
420 self
.busy_o
= Signal(reset_less
=True) # at least one CU is busy
422 # for branch speculation experiment. branch_direction = 0 if
423 # the branch hasn't been met yet. 1 indicates "success", 2 is "fail"
424 # branch_succ and branch_fail are requests to have the current
425 # instruction be dependent on the branch unit "shadow" capability.
426 self
.branch_succ_i
= Signal(reset_less
=True)
427 self
.branch_fail_i
= Signal(reset_less
=True)
428 self
.branch_direction_o
= Signal(2, reset_less
=True)
430 def elaborate(self
, platform
):
435 m
.submodules
.intregs
= self
.intregs
436 m
.submodules
.fpregs
= self
.fpregs
439 int_dest
= self
.intregs
.write_port("dest")
440 int_src1
= self
.intregs
.read_port("src1")
441 int_src2
= self
.intregs
.read_port("src2")
443 fp_dest
= self
.fpregs
.write_port("dest")
444 fp_src1
= self
.fpregs
.read_port("src1")
445 fp_src2
= self
.fpregs
.read_port("src2")
447 # Int ALUs and BR ALUs
449 cua
= CompUnitALUs(self
.rwid
, 3, n_alus
=4)
450 cub
= CompUnitBR(self
.rwid
, 3) # 1 BR ALUs
454 cul
= CompUnitLDSTs(self
.rwid
, 3, None)
457 m
.submodules
.cu
= cu
= CompUnitsBase(self
.rwid
, [cua
, cub
, cul
])
458 bgt
= cub
.bgt
# get at the branch computation unit
462 m
.submodules
.intfus
= intfus
= FunctionUnits(self
.n_regs
, n_int_alus
)
465 m
.submodules
.memfus
= memfus
= MemFunctionUnits(n_ldsts
, 11)
467 # Count of number of FUs
468 n_intfus
= n_int_alus
469 n_fp_fus
= 0 # for now
471 # Integer Priority Picker 1: Adder + Subtractor (and LD/ST)
472 intpick1
= GroupPicker(n_intfus
) # picks 1 reader and 1 writer to intreg
473 m
.submodules
.intpick1
= intpick1
476 regdecode
= RegDecode(self
.n_regs
)
477 m
.submodules
.regdecode
= regdecode
478 issueunit
= IssueUnitArray([self
.aluissue
, self
.brissue
])
479 m
.submodules
.issueunit
= issueunit
481 # Shadow Matrix. currently n_intfus shadows, to be used for
482 # write-after-write hazards. NOTE: there is one extra for branches,
483 # so the shadow width is increased by 1
484 m
.submodules
.shadows
= shadows
= ShadowMatrix(n_intfus
, n_intfus
, True)
485 m
.submodules
.bshadow
= bshadow
= ShadowMatrix(n_intfus
, 1, False)
487 # record previous instruction to cast shadow on current instruction
488 prev_shadow
= Signal(n_intfus
)
490 # Branch Speculation recorder. tracks the success/fail state as
491 # each instruction is issued, so that when the branch occurs the
492 # allow/cancel can be issued as appropriate.
493 m
.submodules
.specrec
= bspec
= BranchSpeculationRecord(n_intfus
)
496 # ok start wiring things together...
497 # "now hear de word of de looord... dem bones dem bones dem dryy bones"
498 # https://www.youtube.com/watch?v=pYb8Wm6-QfA
502 # Issue Unit is where it starts. set up some in/outs for this module
504 comb
+= [ regdecode
.dest_i
.eq(self
.int_dest_i
),
505 regdecode
.src1_i
.eq(self
.int_src1_i
),
506 regdecode
.src2_i
.eq(self
.int_src2_i
),
507 regdecode
.enable_i
.eq(self
.reg_enable_i
),
508 self
.issue_o
.eq(issueunit
.issue_o
)
511 # take these to outside (issue needs them)
512 comb
+= cua
.oper_i
.eq(self
.alu_oper_i
)
513 comb
+= cua
.imm_i
.eq(self
.alu_imm_i
)
514 comb
+= cub
.oper_i
.eq(self
.br_oper_i
)
515 comb
+= cub
.imm_i
.eq(self
.br_imm_i
)
517 # TODO: issueunit.f (FP)
519 # and int function issue / busy arrays, and dest/src1/src2
520 comb
+= intfus
.dest_i
.eq(regdecode
.dest_o
)
521 comb
+= intfus
.src1_i
.eq(regdecode
.src1_o
)
522 comb
+= intfus
.src2_i
.eq(regdecode
.src2_o
)
524 fn_issue_o
= issueunit
.fn_issue_o
526 comb
+= intfus
.fn_issue_i
.eq(fn_issue_o
)
527 comb
+= issueunit
.busy_i
.eq(cu
.busy_o
)
528 comb
+= self
.busy_o
.eq(cu
.busy_o
.bool())
531 # merge shadow matrices outputs
534 # these are explained in ShadowMatrix docstring, and are to be
535 # connected to the FUReg and FUFU Matrices, to get them to reset
536 anydie
= Signal(n_intfus
, reset_less
=True)
537 allshadown
= Signal(n_intfus
, reset_less
=True)
538 shreset
= Signal(n_intfus
, reset_less
=True)
539 comb
+= allshadown
.eq(shadows
.shadown_o
& bshadow
.shadown_o
)
540 comb
+= anydie
.eq(shadows
.go_die_o | bshadow
.go_die_o
)
541 comb
+= shreset
.eq(bspec
.match_g_o | bspec
.match_f_o
)
544 # connect fu-fu matrix
547 # Group Picker... done manually for now.
548 go_rd_o
= intpick1
.go_rd_o
549 go_wr_o
= intpick1
.go_wr_o
550 go_rd_i
= intfus
.go_rd_i
551 go_wr_i
= intfus
.go_wr_i
552 go_die_i
= intfus
.go_die_i
553 # NOTE: connect to the shadowed versions so that they can "die" (reset)
554 comb
+= go_rd_i
[0:n_intfus
].eq(go_rd_o
[0:n_intfus
]) # rd
555 comb
+= go_wr_i
[0:n_intfus
].eq(go_wr_o
[0:n_intfus
]) # wr
556 comb
+= go_die_i
[0:n_intfus
].eq(anydie
[0:n_intfus
]) # die
560 comb
+= intpick1
.rd_rel_i
[0:n_intfus
].eq(cu
.rd_rel_o
[0:n_intfus
])
561 comb
+= intpick1
.req_rel_i
[0:n_intfus
].eq(cu
.req_rel_o
[0:n_intfus
])
562 int_rd_o
= intfus
.readable_o
563 int_wr_o
= intfus
.writable_o
564 comb
+= intpick1
.readable_i
[0:n_intfus
].eq(int_rd_o
[0:n_intfus
])
565 comb
+= intpick1
.writable_i
[0:n_intfus
].eq(int_wr_o
[0:n_intfus
])
571 comb
+= shadows
.issue_i
.eq(fn_issue_o
)
572 #comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
573 comb
+= shadows
.reset_i
[0:n_intfus
].eq(bshadow
.go_die_o
[0:n_intfus
])
575 # NOTE; this setup is for the instruction order preservation...
577 # connect shadows / go_dies to Computation Units
578 comb
+= cu
.shadown_i
[0:n_intfus
].eq(allshadown
)
579 comb
+= cu
.go_die_i
[0:n_intfus
].eq(anydie
)
581 # ok connect first n_int_fu shadows to busy lines, to create an
582 # instruction-order linked-list-like arrangement, using a bit-matrix
583 # (instead of e.g. a ring buffer).
586 # when written, the shadow can be cancelled (and was good)
587 for i
in range(n_intfus
):
588 comb
+= shadows
.s_good_i
[i
][0:n_intfus
].eq(go_wr_o
[0:n_intfus
])
590 # *previous* instruction shadows *current* instruction, and, obviously,
591 # if the previous is completed (!busy) don't cast the shadow!
592 comb
+= prev_shadow
.eq(~fn_issue_o
& cu
.busy_o
)
593 for i
in range(n_intfus
):
594 comb
+= shadows
.shadow_i
[i
][0:n_intfus
].eq(prev_shadow
)
597 # ... and this is for branch speculation. it uses the extra bit
598 # tacked onto the ShadowMatrix (hence shadow_wid=n_intfus+1)
599 # only needs to set shadow_i, s_fail_i and s_good_i
601 # issue captures shadow_i (if enabled)
602 comb
+= bshadow
.reset_i
[0:n_intfus
].eq(shreset
[0:n_intfus
])
604 bactive
= Signal(reset_less
=True)
605 comb
+= bactive
.eq((bspec
.active_i | br1
.issue_i
) & ~br1
.go_wr_i
)
607 # instruction being issued (fn_issue_o) has a shadow cast by the branch
608 with m
.If(bactive
& (self
.branch_succ_i | self
.branch_fail_i
)):
609 comb
+= bshadow
.issue_i
.eq(fn_issue_o
)
610 for i
in range(n_intfus
):
611 with m
.If(fn_issue_o
& (Const(1<<i
))):
612 comb
+= bshadow
.shadow_i
[i
][0].eq(1)
614 # finally, we need an indicator to the test infrastructure as to
615 # whether the branch succeeded or failed, plus, link up to the
616 # "recorder" of whether the instruction was under shadow or not
618 with m
.If(br1
.issue_i
):
619 sync
+= bspec
.active_i
.eq(1)
620 with m
.If(self
.branch_succ_i
):
621 comb
+= bspec
.good_i
.eq(fn_issue_o
& 0x1f)
622 with m
.If(self
.branch_fail_i
):
623 comb
+= bspec
.fail_i
.eq(fn_issue_o
& 0x1f)
625 # branch is active (TODO: a better signal: this is over-using the
626 # go_write signal - actually the branch should not be "writing")
627 with m
.If(br1
.go_wr_i
):
628 sync
+= self
.branch_direction_o
.eq(br1
.data_o
+Const(1, 2))
629 sync
+= bspec
.active_i
.eq(0)
630 comb
+= bspec
.br_i
.eq(1)
631 # branch occurs if data == 1, failed if data == 0
632 comb
+= bspec
.br_ok_i
.eq(br1
.data_o
== 1)
633 for i
in range(n_intfus
):
634 # *expected* direction of the branch matched against *actual*
635 comb
+= bshadow
.s_good_i
[i
][0].eq(bspec
.match_g_o
[i
])
637 comb
+= bshadow
.s_fail_i
[i
][0].eq(bspec
.match_f_o
[i
])
640 # Connect Register File(s)
642 comb
+= int_dest
.wen
.eq(intfus
.dest_rsel_o
)
643 comb
+= int_src1
.ren
.eq(intfus
.src1_rsel_o
)
644 comb
+= int_src2
.ren
.eq(intfus
.src2_rsel_o
)
646 # connect ALUs to regfile
647 comb
+= int_dest
.data_i
.eq(cu
.data_o
)
648 comb
+= cu
.src1_i
.eq(int_src1
.data_o
)
649 comb
+= cu
.src2_i
.eq(int_src2
.data_o
)
651 # connect ALU Computation Units
652 comb
+= cu
.go_rd_i
[0:n_intfus
].eq(go_rd_o
[0:n_intfus
])
653 comb
+= cu
.go_wr_i
[0:n_intfus
].eq(go_wr_o
[0:n_intfus
])
654 comb
+= cu
.issue_i
[0:n_intfus
].eq(fn_issue_o
[0:n_intfus
])
659 yield from self
.intregs
660 yield from self
.fpregs
661 yield self
.int_dest_i
662 yield self
.int_src1_i
663 yield self
.int_src2_i
665 yield self
.branch_succ_i
666 yield self
.branch_fail_i
667 yield self
.branch_direction_o
673 class IssueToScoreboard(Elaboratable
):
675 def __init__(self
, qlen
, n_in
, n_out
, rwid
, opwid
, n_regs
):
683 mqbits
= (int(log(qlen
) / log(2))+2, False)
684 self
.p_add_i
= Signal(mqbits
) # instructions to add (from data_i)
685 self
.p_ready_o
= Signal() # instructions were added
686 self
.data_i
= Instruction
.nq(n_in
, "data_i", rwid
, opwid
)
688 self
.busy_o
= Signal(reset_less
=True) # at least one CU is busy
689 self
.qlen_o
= Signal(mqbits
, reset_less
=True)
691 def elaborate(self
, platform
):
696 iq
= InstructionQ(self
.rwid
, self
.opw
, self
.qlen
, self
.n_in
, self
.n_out
)
697 sc
= Scoreboard(self
.rwid
, self
.n_regs
)
701 # get at the regfile for testing
702 self
.intregs
= sc
.intregs
704 # and the "busy" signal and instruction queue length
705 comb
+= self
.busy_o
.eq(sc
.busy_o
)
706 comb
+= self
.qlen_o
.eq(iq
.qlen_o
)
708 # link up instruction queue
709 comb
+= iq
.p_add_i
.eq(self
.p_add_i
)
710 comb
+= self
.p_ready_o
.eq(iq
.p_ready_o
)
711 for i
in range(self
.n_in
):
712 comb
+= eq(iq
.data_i
[i
], self
.data_i
[i
])
714 # take instruction and process it. note that it's possible to
715 # "inspect" the queue contents *without* actually removing the
716 # items. items are only removed when the
719 wait_issue_br
= Signal()
720 wait_issue_alu
= Signal()
722 with m
.If(wait_issue_br | wait_issue_alu
):
723 # set instruction pop length to 1 if the unit accepted
724 with m
.If(wait_issue_br
& (sc
.brissue
.fn_issue_o
!= 0)):
725 with m
.If(iq
.qlen_o
!= 0):
726 comb
+= iq
.n_sub_i
.eq(1)
727 with m
.If(wait_issue_alu
& (sc
.aluissue
.fn_issue_o
!= 0)):
728 with m
.If(iq
.qlen_o
!= 0):
729 comb
+= iq
.n_sub_i
.eq(1)
731 # see if some instruction(s) are here. note that this is
732 # "inspecting" the in-place queue. note also that on the
733 # cycle following "waiting" for fn_issue_o to be set, the
734 # "resetting" done above (insn_i=0) could be re-ASSERTed.
735 with m
.If(iq
.qlen_o
!= 0):
736 # get the operands and operation
737 imm
= iq
.data_o
[0].imm_i
738 dest
= iq
.data_o
[0].dest_i
739 src1
= iq
.data_o
[0].src1_i
740 src2
= iq
.data_o
[0].src2_i
741 op
= iq
.data_o
[0].oper_i
742 opi
= iq
.data_o
[0].opim_i
# immediate set
744 # set the src/dest regs
745 comb
+= sc
.int_dest_i
.eq(dest
)
746 comb
+= sc
.int_src1_i
.eq(src1
)
747 comb
+= sc
.int_src2_i
.eq(src2
)
748 comb
+= sc
.reg_enable_i
.eq(1) # enable the regfile
750 # choose a Function-Unit-Group
751 with m
.If((op
& (0x3<<2)) != 0): # branch
752 comb
+= sc
.brissue
.insn_i
.eq(1)
753 comb
+= sc
.br_oper_i
.eq(Cat(op
[0:2], opi
))
754 comb
+= sc
.br_imm_i
.eq(imm
)
755 comb
+= wait_issue_br
.eq(1)
757 comb
+= sc
.aluissue
.insn_i
.eq(1)
758 comb
+= sc
.alu_oper_i
.eq(Cat(op
[0:2], opi
))
759 comb
+= sc
.alu_imm_i
.eq(imm
)
760 comb
+= wait_issue_alu
.eq(1)
763 # these indicate that the instruction is to be made
764 # shadow-dependent on
765 # (either) branch success or branch fail
766 #yield sc.branch_fail_i.eq(branch_fail)
767 #yield sc.branch_succ_i.eq(branch_success)
773 for o
in self
.data_i
:
def __init__(self, rwidth, nregs):
    """Set up a plain-Python register file model.

    * :rwidth: bit width of each register
    * :nregs:  number of registers, all starting at zero
    """
    # op() derives its value mask from self.rwidth, so keep the width
    self.rwidth = rwidth
    self.regs = [0] * nregs
795 def op(self
, op
, op_imm
, imm
, src1
, src2
, dest
):
796 maxbits
= (1 << self
.rwidth
) - 1
797 src1
= self
.regs
[src1
] & maxbits
801 src2
= self
.regs
[src2
] & maxbits
809 val
= src1
>> (src2
& maxbits
)
811 val
= int(src1
> src2
)
813 val
= int(src1
< src2
)
815 val
= int(src1
== src2
)
817 val
= int(src1
!= src2
)
819 self
.setval(dest
, val
)
def setval(self, dest, val):
    """Log the write, then record val as the content of register dest."""
    shown = hex(val)
    print ("sim setval", dest, shown)
    self.regs[dest] = val
827 for i
, val
in enumerate(self
.regs
):
828 reg
= yield dut
.intregs
.regs
[i
].reg
829 okstr
= "OK" if reg
== val
else "!ok"
830 print("reg %d expected %x received %x %s" % (i
, val
, reg
, okstr
))
832 def check(self
, dut
):
833 for i
, val
in enumerate(self
.regs
):
834 reg
= yield dut
.intregs
.regs
[i
].reg
836 print("reg %d expected %x received %x\n" % (i
, val
, reg
))
837 yield from self
.dump(dut
)
840 def instr_q(dut
, op
, op_imm
, imm
, src1
, src2
, dest
,
841 branch_success
, branch_fail
):
842 instrs
= [{'oper_i': op
, 'dest_i': dest
, 'imm_i': imm
, 'opim_i': op_imm
,
843 'src1_i': src1
, 'src2_i': src2
}]
846 for idx
in range(sendlen
):
847 yield from eq(dut
.data_i
[idx
], instrs
[idx
])
848 di
= yield dut
.data_i
[idx
]
849 print ("senddata %d %x" % (idx
, di
))
850 yield dut
.p_add_i
.eq(sendlen
)
852 o_p_ready
= yield dut
.p_ready_o
855 o_p_ready
= yield dut
.p_ready_o
857 yield dut
.p_add_i
.eq(0)
860 def int_instr(dut
, op
, imm
, src1
, src2
, dest
, branch_success
, branch_fail
):
861 yield from disable_issue(dut
)
862 yield dut
.int_dest_i
.eq(dest
)
863 yield dut
.int_src1_i
.eq(src1
)
864 yield dut
.int_src2_i
.eq(src2
)
865 if (op
& (0x3<<2)) != 0: # branch
866 yield dut
.brissue
.insn_i
.eq(1)
867 yield dut
.br_oper_i
.eq(Const(op
& 0x3, 2))
868 yield dut
.br_imm_i
.eq(imm
)
869 dut_issue
= dut
.brissue
871 yield dut
.aluissue
.insn_i
.eq(1)
872 yield dut
.alu_oper_i
.eq(Const(op
& 0x3, 2))
873 yield dut
.alu_imm_i
.eq(imm
)
874 dut_issue
= dut
.aluissue
875 yield dut
.reg_enable_i
.eq(1)
877 # these indicate that the instruction is to be made shadow-dependent on
878 # (either) branch success or branch fail
879 yield dut
.branch_fail_i
.eq(branch_fail
)
880 yield dut
.branch_succ_i
.eq(branch_success
)
883 yield from wait_for_issue(dut
, dut_issue
)
886 def print_reg(dut
, rnums
):
889 reg
= yield dut
.intregs
.regs
[rnum
].reg
890 rs
.append("%x" % reg
)
891 rnums
= map(str, rnums
)
892 print ("reg %s: %s" % (','.join(rnums
), ','.join(rs
)))
895 def create_random_ops(dut
, n_ops
, shadowing
=False, max_opnums
=3):
897 for i
in range(n_ops
):
898 src1
= randint(1, dut
.n_regs
-1)
899 src2
= randint(1, dut
.n_regs
-1)
900 imm
= randint(1, (1<<dut
.rwid
)-1)
901 dest
= randint(1, dut
.n_regs
-1)
902 op
= randint(0, max_opnums
)
903 opi
= 0 if randint(0, 2) else 1 # set true if random is nonzero
906 insts
.append((src1
, src2
, dest
, op
, opi
, imm
, (0, 0)))
908 insts
.append((src1
, src2
, dest
, op
, opi
, imm
))
912 def wait_for_busy_clear(dut
):
914 busy_o
= yield dut
.busy_o
def disable_issue(dut):
    """Yield assignments clearing insn_i on the ALU, then the branch, issue unit."""
    for unit_name in ("aluissue", "brissue"):
        yield getattr(dut, unit_name).insn_i.eq(0)
925 def wait_for_issue(dut
, dut_issue
):
927 issue_o
= yield dut_issue
.fn_issue_o
929 yield from disable_issue(dut
)
930 yield dut
.reg_enable_i
.eq(0)
933 #yield from print_reg(dut, [1,2,3])
935 #yield from print_reg(dut, [1,2,3])
937 def scoreboard_branch_sim(dut
, alusim
):
943 print ("rseed", iseed
)
947 yield dut
.branch_direction_o
.eq(0)
949 # set random values in the registers
950 for i
in range(1, dut
.n_regs
):
952 val
= randint(0, (1<<alusim
.rwidth
)-1)
953 yield dut
.intregs
.regs
[i
].reg
.eq(val
)
954 alusim
.setval(i
, val
)
957 # create some instructions: branches create a tree
958 insts
= create_random_ops(dut
, 1, True, 1)
959 #insts.append((6, 6, 1, 2, (0, 0)))
960 #insts.append((4, 3, 3, 0, (0, 0)))
962 src1
= randint(1, dut
.n_regs
-1)
963 src2
= randint(1, dut
.n_regs
-1)
965 op
= 4 # only BGT at the moment
967 branch_ok
= create_random_ops(dut
, 1, True, 1)
968 branch_fail
= create_random_ops(dut
, 1, True, 1)
970 insts
.append((src1
, src2
, (branch_ok
, branch_fail
), op
, (0, 0)))
974 insts
.append( (3, 5, 2, 0, (0, 0)) )
977 #branch_ok.append ( (5, 7, 5, 1, (1, 0)) )
978 branch_ok
.append( None )
979 branch_fail
.append( (1, 1, 2, 0, (0, 1)) )
980 #branch_fail.append( None )
981 insts
.append( (6, 4, (branch_ok
, branch_fail
), 4, (0, 0)) )
983 siminsts
= deepcopy(insts
)
985 # issue instruction(s)
993 branch_direction
= yield dut
.branch_direction_o
# way branch went
994 (src1
, src2
, dest
, op
, (shadow_on
, shadow_off
)) = insts
.pop(0)
995 if branch_direction
== 1 and shadow_on
:
996 print ("skip", i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
)
997 continue # branch was "success" and this is a "failed"... skip
998 if branch_direction
== 2 and shadow_off
:
999 print ("skip", i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
)
1000 continue # branch was "fail" and this is a "success"... skip
1001 if branch_direction
!= 0:
1006 branch_ok
, branch_fail
= dest
1008 # ok zip up the branch success / fail instructions and
1009 # drop them into the queue, one marked "to have branch success"
1010 # the other to be marked shadow branch "fail".
1011 # one out of each of these will be cancelled
1012 for ok
, fl
in zip(branch_ok
, branch_fail
):
1014 instrs
.append((ok
[0], ok
[1], ok
[2], ok
[3], (1, 0)))
1016 instrs
.append((fl
[0], fl
[1], fl
[2], fl
[3], (0, 1)))
1017 print ("instr %d: (%d, %d, %d, %d, (%d, %d))" % \
1018 (i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
))
1019 yield from int_instr(dut
, op
, src1
, src2
, dest
,
1020 shadow_on
, shadow_off
)
1022 # wait for all instructions to stop before checking
1024 yield from wait_for_busy_clear(dut
)
1028 instr
= siminsts
.pop(0)
1031 (src1
, src2
, dest
, op
, (shadow_on
, shadow_off
)) = instr
1035 branch_ok
, branch_fail
= dest
1037 print ("sim %d: (%d, %d, %d, %d, (%d, %d))" % \
1038 (i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
))
1039 branch_res
= alusim
.op(op
, src1
, src2
, dest
)
1042 siminsts
+= branch_ok
1044 siminsts
+= branch_fail
1047 yield from alusim
.check(dut
)
1048 yield from alusim
.dump(dut
)
1051 def scoreboard_sim(dut
, alusim
):
1057 # set random values in the registers
1058 for i
in range(1, dut
.n_regs
):
1059 val
= randint(0, (1<<alusim
.rwidth
)-1)
1062 yield dut
.intregs
.regs
[i
].reg
.eq(val
)
1063 alusim
.setval(i
, val
)
1065 # create some instructions (some random, some regression tests)
1068 instrs
= create_random_ops(dut
, 15, True, 4)
1071 instrs
.append( (1, 2, 2, 1, 1, 20, (0, 0)) )
1074 instrs
.append( (7, 3, 2, 4, (0, 0)) )
1075 instrs
.append( (7, 6, 6, 2, (0, 0)) )
1076 instrs
.append( (1, 7, 2, 2, (0, 0)) )
1079 instrs
.append((2, 3, 3, 0, 0, 0, (0, 0)))
1080 instrs
.append((5, 3, 3, 1, 0, 0, (0, 0)))
1081 instrs
.append((3, 5, 5, 2, 0, 0, (0, 0)))
1082 instrs
.append((5, 3, 3, 3, 0, 0, (0, 0)))
1083 instrs
.append((3, 5, 5, 0, 0, 0, (0, 0)))
1086 instrs
.append( (3, 3, 4, 0, 0, 13979, (0, 0)))
1087 instrs
.append( (6, 4, 1, 2, 0, 40976, (0, 0)))
1088 instrs
.append( (1, 4, 7, 4, 1, 23652, (0, 0)))
1091 instrs
.append((5, 6, 2, 1))
1092 instrs
.append((2, 2, 4, 0))
1093 #instrs.append((2, 2, 3, 1))
1096 instrs
.append((2, 1, 2, 3))
1099 instrs
.append((2, 6, 2, 1))
1100 instrs
.append((2, 1, 2, 0))
1103 instrs
.append((1, 2, 7, 2))
1104 instrs
.append((7, 1, 5, 0))
1105 instrs
.append((4, 4, 1, 1))
1108 instrs
.append((5, 6, 2, 2))
1109 instrs
.append((1, 1, 4, 1))
1110 instrs
.append((6, 5, 3, 0))
1113 # Write-after-Write Hazard
1114 instrs
.append( (3, 6, 7, 2) )
1115 instrs
.append( (4, 4, 7, 1) )
1118 # self-read/write-after-write followed by Read-after-Write
1119 instrs
.append((1, 1, 1, 1))
1120 instrs
.append((1, 5, 3, 0))
1123 # Read-after-Write followed by self-read-after-write
1124 instrs
.append((5, 6, 1, 2))
1125 instrs
.append((1, 1, 1, 1))
1128 # self-read-write sandwich
1129 instrs
.append((5, 6, 1, 2))
1130 instrs
.append((1, 1, 1, 1))
1131 instrs
.append((1, 5, 3, 0))
1134 # very weird failure
1135 instrs
.append( (5, 2, 5, 2) )
1136 instrs
.append( (2, 6, 3, 0) )
1137 instrs
.append( (4, 2, 2, 1) )
1141 yield dut
.intregs
.regs
[5].reg
.eq(v1
)
1142 alusim
.setval(5, v1
)
1143 yield dut
.intregs
.regs
[3].reg
.eq(5)
1145 instrs
.append((5, 3, 3, 4, (0, 0)))
1146 instrs
.append((4, 2, 1, 2, (0, 1)))
1150 yield dut
.intregs
.regs
[5].reg
.eq(v1
)
1151 alusim
.setval(5, v1
)
1152 yield dut
.intregs
.regs
[3].reg
.eq(5)
1154 instrs
.append((5, 3, 3, 4, (0, 0)))
1155 instrs
.append((4, 2, 1, 2, (1, 0)))
1158 instrs
.append( (4, 3, 5, 1, 0, (0, 0)) )
1159 instrs
.append( (5, 2, 3, 1, 0, (0, 0)) )
1160 instrs
.append( (7, 1, 5, 2, 0, (0, 0)) )
1161 instrs
.append( (5, 6, 6, 4, 0, (0, 0)) )
1162 instrs
.append( (7, 5, 2, 2, 0, (1, 0)) )
1163 instrs
.append( (1, 7, 5, 0, 0, (0, 1)) )
1164 instrs
.append( (1, 6, 1, 2, 0, (1, 0)) )
1165 instrs
.append( (1, 6, 7, 3, 0, (0, 0)) )
1166 instrs
.append( (6, 7, 7, 0, 0, (0, 0)) )
1168 # issue instruction(s), wait for issue to be free before proceeding
1169 for i
, instr
in enumerate(instrs
):
1170 src1
, src2
, dest
, op
, opi
, imm
, (br_ok
, br_fail
) = instr
1172 print ("instr %d: (%d, %d, %d, %d, %d, %d)" % \
1173 (i
, src1
, src2
, dest
, op
, opi
, imm
))
1174 alusim
.op(op
, opi
, imm
, src1
, src2
, dest
)
1175 yield from instr_q(dut
, op
, opi
, imm
, src1
, src2
, dest
,
1178 # wait for all instructions to stop before checking
1180 iqlen
= yield dut
.qlen_o
1188 yield from wait_for_busy_clear(dut
)
1191 yield from alusim
.check(dut
)
1192 yield from alusim
.dump(dut
)
1195 def test_scoreboard():
1196 dut
= IssueToScoreboard(2, 1, 1, 16, 8, 8)
1197 alusim
= RegSim(16, 8)
1198 memsim
= MemSim(16, 16)
1199 vl
= rtlil
.convert(dut
, ports
=dut
.ports())
1200 with
open("test_scoreboard6600.il", "w") as f
:
1203 run_simulation(dut
, scoreboard_sim(dut
, alusim
),
1204 vcd_name
='test_scoreboard6600.vcd')
1206 #run_simulation(dut, scoreboard_branch_sim(dut, alusim),
1207 # vcd_name='test_scoreboard6600.vcd')
1210 if __name__
== '__main__':