1 from nmigen
.compat
.sim
import run_simulation
2 from nmigen
.cli
import verilog
, rtlil
3 from nmigen
import Module
, Const
, Signal
, Array
, Cat
, Elaboratable
, Memory
5 from regfile
.regfile
import RegFileArray
, treereduce
6 from scoreboard
.fu_fu_matrix
import FUFUDepMatrix
7 from scoreboard
.fu_reg_matrix
import FURegDepMatrix
8 from scoreboard
.global_pending
import GlobalPending
9 from scoreboard
.group_picker
import GroupPicker
10 from scoreboard
.issue_unit
import IssueUnitGroup
, IssueUnitArray
, RegDecode
11 from scoreboard
.shadow
import ShadowMatrix
, BranchSpeculationRecord
12 from scoreboard
.instruction_q
import Instruction
, InstructionQ
13 from scoreboard
.memfu
import MemFunctionUnits
15 from compalu
import ComputationUnitNoDelay
16 from compldst
import LDSTCompUnit
18 from alu_hier
import ALU
, BranchALU
19 from nmutil
.latch
import SRLatch
20 from nmutil
.nmoperator
import eq
22 from random
import randint
, seed
23 from copy
import deepcopy
27 class TestMemory(Elaboratable
):
28 def __init__(self
, regwid
, addrw
):
29 self
.ddepth
= 1 # regwid //8
30 depth
= (1<<addrw
) // self
.ddepth
31 self
.adr
= Signal(addrw
)
32 self
.dat_r
= Signal(regwid
)
33 self
.dat_w
= Signal(regwid
)
35 self
.mem
= Memory(width
=regwid
, depth
=depth
, init
=range(0, depth
))
37 def elaborate(self
, platform
):
39 m
.submodules
.rdport
= rdport
= self
.mem
.read_port()
40 m
.submodules
.wrport
= wrport
= self
.mem
.write_port()
42 rdport
.addr
.eq(self
.adr
[self
.ddepth
:]), # ignore low bits
43 self
.dat_r
.eq(rdport
.data
),
44 wrport
.addr
.eq(self
.adr
),
45 wrport
.data
.eq(self
.dat_w
),
46 wrport
.en
.eq(self
.we
),
52 def __init__(self
, regwid
, addrw
):
54 self
.ddepth
= 1 # regwid//8
55 depth
= (1<<addrw
) // self
.ddepth
56 self
.mem
= list(range(0, depth
))
59 return self
.mem
[addr
>>self
.ddepth
]
def st(self, addr, data):
    """Store ``data`` into the simulated memory at ``addr``.

    The slot written is ``addr`` shifted right by ``self.ddepth`` bits, and
    the stored value is ``data`` truncated to the low ``self.regwid`` bits.
    """
    # keep only the low regwid bits of the value
    truncated = data & ((1 << self.regwid) - 1)
    # drop the low ddepth address bits to obtain the list index
    slot = addr >> self.ddepth
    self.mem[slot] = truncated
65 class CompUnitsBase(Elaboratable
):
66 """ Computation Unit Base class.
68 Amazingly, this class works recursively. It's supposed to just
69 look after some ALUs (that can handle the same operations),
70 grouping them together, however it turns out that the same code
71 can also group *groups* of Computation Units together as well.
73 Basically it was intended just to concatenate the ALU's issue,
74 go_rd etc. signals together, which start out as bits and become
75 sequences. Turns out that the same trick works just as well
78 So this class may be used recursively to present a top-level
79 sequential concatenation of all the signals in and out of
80 ALUs, whilst at the same time making it convenient to group
83 At the lower level, the intent is that groups of (identical)
84 ALUs may be passed the same operation. Even beyond that,
85 the intent is that that group of (identical) ALUs actually
86 share the *same pipeline* and as such become a "Concurrent
87 Computation Unit" as defined by Mitch Alsup (see section
90 def __init__(self
, rwid
, units
, ldstmode
=False):
93 * :rwid: bit width of register file(s) - both FP and INT
94 * :units: sequence of ALUs (or CompUnitsBase derivatives)
97 self
.ldstmode
= ldstmode
100 if units
and isinstance(units
[0], CompUnitsBase
):
103 self
.n_units
+= u
.n_units
105 self
.n_units
= len(units
)
107 n_units
= self
.n_units
110 self
.issue_i
= Signal(n_units
, reset_less
=True)
111 self
.go_rd_i
= Signal(n_units
, reset_less
=True)
112 self
.go_wr_i
= Signal(n_units
, reset_less
=True)
113 self
.shadown_i
= Signal(n_units
, reset_less
=True)
114 self
.go_die_i
= Signal(n_units
, reset_less
=True)
116 self
.go_ad_i
= Signal(n_units
, reset_less
=True)
119 self
.busy_o
= Signal(n_units
, reset_less
=True)
120 self
.rd_rel_o
= Signal(n_units
, reset_less
=True)
121 self
.req_rel_o
= Signal(n_units
, reset_less
=True)
123 self
.adr_rel_o
= Signal(n_units
, reset_less
=True)
124 self
.sto_rel_o
= Signal(n_units
, reset_less
=True)
125 self
.req_rel_o
= Signal(n_units
, reset_less
=True)
126 self
.load_mem_o
= Signal(n_units
, reset_less
=True)
127 self
.stwd_mem_o
= Signal(n_units
, reset_less
=True)
129 # in/out register data (note: not register#, actual data)
130 self
.data_o
= Signal(rwid
, reset_less
=True)
131 self
.src1_i
= Signal(rwid
, reset_less
=True)
132 self
.src2_i
= Signal(rwid
, reset_less
=True)
135 def elaborate(self
, platform
):
139 for i
, alu
in enumerate(self
.units
):
140 setattr(m
.submodules
, "comp%d" % i
, alu
)
150 for alu
in self
.units
:
151 req_rel_l
.append(alu
.req_rel_o
)
152 rd_rel_l
.append(alu
.rd_rel_o
)
153 shadow_l
.append(alu
.shadown_i
)
154 godie_l
.append(alu
.go_die_i
)
155 go_wr_l
.append(alu
.go_wr_i
)
156 go_rd_l
.append(alu
.go_rd_i
)
157 issue_l
.append(alu
.issue_i
)
158 busy_l
.append(alu
.busy_o
)
159 comb
+= self
.rd_rel_o
.eq(Cat(*rd_rel_l
))
160 comb
+= self
.req_rel_o
.eq(Cat(*req_rel_l
))
161 comb
+= self
.busy_o
.eq(Cat(*busy_l
))
162 comb
+= Cat(*godie_l
).eq(self
.go_die_i
)
163 comb
+= Cat(*shadow_l
).eq(self
.shadown_i
)
164 comb
+= Cat(*go_wr_l
).eq(self
.go_wr_i
)
165 comb
+= Cat(*go_rd_l
).eq(self
.go_rd_i
)
166 comb
+= Cat(*issue_l
).eq(self
.issue_i
)
168 # connect data register input/output
170 # merge (OR) all integer FU / ALU outputs to a single value
171 # bit of a hack: treereduce needs a list with an item named "data_o"
173 data_o
= treereduce(self
.units
)
174 comb
+= self
.data_o
.eq(data_o
)
176 for i
, alu
in enumerate(self
.units
):
177 comb
+= alu
.src1_i
.eq(self
.src1_i
)
178 comb
+= alu
.src2_i
.eq(self
.src2_i
)
180 if not self
.ldstmode
:
188 for alu
in self
.units
:
189 adr_rel_l
.append(alu
.adr_rel_o
)
190 sto_rel_l
.append(alu
.sto_rel_o
)
191 ldmem_l
.append(alu
.load_mem_o
)
192 stmem_l
.append(alu
.stwd_mem_o
)
193 go_ad_l
.append(alu
.go_ad_i
)
194 comb
+= self
.adr_rel_o
.eq(Cat(*adr_rel_l
))
195 comb
+= self
.sto_rel_o
.eq(Cat(*sto_rel_l
))
196 comb
+= self
.load_mem_o
.eq(Cat(*ldmem_l
))
197 comb
+= self
.stwd_mem_o
.eq(Cat(*stmem_l
))
198 comb
+= Cat(*go_ad_l
).eq(self
.go_ad_i
)
203 class CompUnitLDSTs(CompUnitsBase
):
205 def __init__(self
, rwid
, opwid
, n_ldsts
, mem
):
208 * :rwid: bit width of register file(s) - both FP and INT
209 * :opwid: operand bit width
214 self
.oper_i
= Signal(opwid
, reset_less
=True)
215 self
.imm_i
= Signal(rwid
, reset_less
=True)
219 for i
in range(n_ldsts
):
220 alus
.append(ALU(rwid
))
224 aluopwid
= 4 # see compldst.py for "internal" opcode
225 units
.append(LDSTCompUnit(rwid
, aluopwid
, alu
, mem
))
227 CompUnitsBase
.__init
__(self
, rwid
, units
, ldstmode
=True)
229 def elaborate(self
, platform
):
230 m
= CompUnitsBase
.elaborate(self
, platform
)
233 # hand the same operation to all units, 4 lower bits though
234 for alu
in self
.units
:
235 comb
+= alu
.oper_i
[0:4].eq(self
.oper_i
)
236 comb
+= alu
.imm_i
.eq(self
.imm_i
)
237 comb
+= alu
.isalu_i
.eq(0)
242 class CompUnitALUs(CompUnitsBase
):
244 def __init__(self
, rwid
, opwid
, n_alus
):
247 * :rwid: bit width of register file(s) - both FP and INT
248 * :opwid: operand bit width
253 self
.oper_i
= Signal(opwid
, reset_less
=True)
254 self
.imm_i
= Signal(rwid
, reset_less
=True)
258 for i
in range(n_alus
):
259 alus
.append(ALU(rwid
))
263 aluopwid
= 3 # extra bit for immediate mode
264 units
.append(ComputationUnitNoDelay(rwid
, aluopwid
, alu
))
266 CompUnitsBase
.__init
__(self
, rwid
, units
)
268 def elaborate(self
, platform
):
269 m
= CompUnitsBase
.elaborate(self
, platform
)
272 # hand the same operation to all units, only lower 3 bits though
273 for alu
in self
.units
:
274 comb
+= alu
.oper_i
[0:3].eq(self
.oper_i
)
275 comb
+= alu
.imm_i
.eq(self
.imm_i
)
280 class CompUnitBR(CompUnitsBase
):
282 def __init__(self
, rwid
, opwid
):
285 * :rwid: bit width of register file(s) - both FP and INT
286 * :opwid: operand bit width
288 Note: bgt unit is returned so that a shadow unit can be created
294 self
.oper_i
= Signal(opwid
, reset_less
=True)
295 self
.imm_i
= Signal(rwid
, reset_less
=True)
298 self
.bgt
= BranchALU(rwid
)
299 aluopwid
= 3 # extra bit for immediate mode
300 self
.br1
= ComputationUnitNoDelay(rwid
, aluopwid
, self
.bgt
)
301 CompUnitsBase
.__init
__(self
, rwid
, [self
.br1
])
303 def elaborate(self
, platform
):
304 m
= CompUnitsBase
.elaborate(self
, platform
)
307 # hand the same operation to all units
308 for alu
in self
.units
:
309 comb
+= alu
.oper_i
.eq(self
.oper_i
)
310 comb
+= alu
.imm_i
.eq(self
.imm_i
)
315 class FunctionUnits(Elaboratable
):
317 def __init__(self
, n_regs
, n_int_alus
):
319 self
.n_int_alus
= n_int_alus
321 self
.dest_i
= Signal(n_regs
, reset_less
=True) # Dest R# in
322 self
.src1_i
= Signal(n_regs
, reset_less
=True) # oper1 R# in
323 self
.src2_i
= Signal(n_regs
, reset_less
=True) # oper2 R# in
325 self
.g_int_rd_pend_o
= Signal(n_regs
, reset_less
=True)
326 self
.g_int_wr_pend_o
= Signal(n_regs
, reset_less
=True)
328 self
.dest_rsel_o
= Signal(n_regs
, reset_less
=True) # dest reg (bot)
329 self
.src1_rsel_o
= Signal(n_regs
, reset_less
=True) # src1 reg (bot)
330 self
.src2_rsel_o
= Signal(n_regs
, reset_less
=True) # src2 reg (bot)
332 self
.readable_o
= Signal(n_int_alus
, reset_less
=True)
333 self
.writable_o
= Signal(n_int_alus
, reset_less
=True)
335 self
.go_rd_i
= Signal(n_int_alus
, reset_less
=True)
336 self
.go_wr_i
= Signal(n_int_alus
, reset_less
=True)
337 self
.go_die_i
= Signal(n_int_alus
, reset_less
=True)
338 self
.fn_issue_i
= Signal(n_int_alus
, reset_less
=True)
340 # Note: FURegs wr_pend_o is also outputted from here, for use in WaWGrid
342 def elaborate(self
, platform
):
347 n_intfus
= self
.n_int_alus
349 # Integer FU-FU Dep Matrix
350 intfudeps
= FUFUDepMatrix(n_intfus
, n_intfus
)
351 m
.submodules
.intfudeps
= intfudeps
352 # Integer FU-Reg Dep Matrix
353 intregdeps
= FURegDepMatrix(n_intfus
, self
.n_regs
, 2)
354 m
.submodules
.intregdeps
= intregdeps
356 comb
+= self
.g_int_rd_pend_o
.eq(intregdeps
.v_rd_rsel_o
)
357 comb
+= self
.g_int_wr_pend_o
.eq(intregdeps
.v_wr_rsel_o
)
359 comb
+= intregdeps
.rd_pend_i
.eq(intregdeps
.v_rd_rsel_o
)
360 comb
+= intregdeps
.wr_pend_i
.eq(intregdeps
.v_wr_rsel_o
)
362 comb
+= intfudeps
.rd_pend_i
.eq(intregdeps
.rd_pend_o
)
363 comb
+= intfudeps
.wr_pend_i
.eq(intregdeps
.wr_pend_o
)
364 self
.wr_pend_o
= intregdeps
.wr_pend_o
# also output for use in WaWGrid
366 comb
+= intfudeps
.issue_i
.eq(self
.fn_issue_i
)
367 comb
+= intfudeps
.go_rd_i
.eq(self
.go_rd_i
)
368 comb
+= intfudeps
.go_wr_i
.eq(self
.go_wr_i
)
369 comb
+= intfudeps
.go_die_i
.eq(self
.go_die_i
)
370 comb
+= self
.readable_o
.eq(intfudeps
.readable_o
)
371 comb
+= self
.writable_o
.eq(intfudeps
.writable_o
)
373 # Connect function issue / arrays, and dest/src1/src2
374 comb
+= intregdeps
.dest_i
.eq(self
.dest_i
)
375 comb
+= intregdeps
.src_i
[0].eq(self
.src1_i
)
376 comb
+= intregdeps
.src_i
[1].eq(self
.src2_i
)
378 comb
+= intregdeps
.go_rd_i
.eq(self
.go_rd_i
)
379 comb
+= intregdeps
.go_wr_i
.eq(self
.go_wr_i
)
380 comb
+= intregdeps
.go_die_i
.eq(self
.go_die_i
)
381 comb
+= intregdeps
.issue_i
.eq(self
.fn_issue_i
)
383 comb
+= self
.dest_rsel_o
.eq(intregdeps
.dest_rsel_o
)
384 comb
+= self
.src1_rsel_o
.eq(intregdeps
.src_rsel_o
[0])
385 comb
+= self
.src2_rsel_o
.eq(intregdeps
.src_rsel_o
[1])
390 class Scoreboard(Elaboratable
):
391 def __init__(self
, rwid
, n_regs
):
394 * :rwid: bit width of register file(s) - both FP and INT
395 * :n_regs: depth of register file(s) - number of FP and INT regs
401 self
.intregs
= RegFileArray(rwid
, n_regs
)
402 self
.fpregs
= RegFileArray(rwid
, n_regs
)
404 # issue q needs to get at these
405 self
.aluissue
= IssueUnitGroup(2)
406 self
.lsissue
= IssueUnitGroup(2)
407 self
.brissue
= IssueUnitGroup(1)
409 self
.alu_oper_i
= Signal(4, reset_less
=True)
410 self
.alu_imm_i
= Signal(rwid
, reset_less
=True)
411 self
.br_oper_i
= Signal(4, reset_less
=True)
412 self
.br_imm_i
= Signal(rwid
, reset_less
=True)
413 self
.ls_oper_i
= Signal(4, reset_less
=True)
414 self
.ls_imm_i
= Signal(rwid
, reset_less
=True)
417 self
.int_dest_i
= Signal(max=n_regs
, reset_less
=True) # Dest R# in
418 self
.int_src1_i
= Signal(max=n_regs
, reset_less
=True) # oper1 R# in
419 self
.int_src2_i
= Signal(max=n_regs
, reset_less
=True) # oper2 R# in
420 self
.reg_enable_i
= Signal(reset_less
=True) # enable reg decode
423 self
.issue_o
= Signal(reset_less
=True) # instruction was accepted
424 self
.busy_o
= Signal(reset_less
=True) # at least one CU is busy
426 # for branch speculation experiment. branch_direction = 0 if
427 # the branch hasn't been met yet. 1 indicates "success", 2 is "fail"
428 # branch_succ and branch_fail are requests to have the current
429 # instruction be dependent on the branch unit "shadow" capability.
430 self
.branch_succ_i
= Signal(reset_less
=True)
431 self
.branch_fail_i
= Signal(reset_less
=True)
432 self
.branch_direction_o
= Signal(2, reset_less
=True)
434 def elaborate(self
, platform
):
439 m
.submodules
.intregs
= self
.intregs
440 m
.submodules
.fpregs
= self
.fpregs
443 int_dest
= self
.intregs
.write_port("dest")
444 int_src1
= self
.intregs
.read_port("src1")
445 int_src2
= self
.intregs
.read_port("src2")
447 fp_dest
= self
.fpregs
.write_port("dest")
448 fp_src1
= self
.fpregs
.read_port("src1")
449 fp_src2
= self
.fpregs
.read_port("src2")
451 # Int ALUs and BR ALUs
453 cua
= CompUnitALUs(self
.rwid
, 3, n_alus
=self
.aluissue
.n_insns
)
454 cub
= CompUnitBR(self
.rwid
, 3) # 1 BR ALUs
458 cul
= CompUnitLDSTs(self
.rwid
, 4, self
.lsissue
.n_insns
, None)
461 m
.submodules
.cu
= cu
= CompUnitsBase(self
.rwid
, [cua
, cul
, cub
])
462 bgt
= cub
.bgt
# get at the branch computation unit
466 m
.submodules
.intfus
= intfus
= FunctionUnits(self
.n_regs
, n_int_alus
)
469 m
.submodules
.memfus
= memfus
= MemFunctionUnits(n_ldsts
, 5)
471 # Count of number of FUs
472 n_intfus
= n_int_alus
473 n_fp_fus
= 0 # for now
475 # Integer Priority Picker 1: Adder + Subtractor (and LD/ST)
476 intpick1
= GroupPicker(n_intfus
) # picks 1 reader and 1 writer to intreg
477 m
.submodules
.intpick1
= intpick1
480 regdecode
= RegDecode(self
.n_regs
)
481 m
.submodules
.regdecode
= regdecode
482 issueunit
= IssueUnitArray([self
.aluissue
, self
.lsissue
, self
.brissue
])
483 m
.submodules
.issueunit
= issueunit
485 # Shadow Matrix. currently n_intfus shadows, to be used for
486 # write-after-write hazards. NOTE: there is one extra for branches,
487 # so the shadow width is increased by 1
488 m
.submodules
.shadows
= shadows
= ShadowMatrix(n_intfus
, n_intfus
, True)
489 m
.submodules
.bshadow
= bshadow
= ShadowMatrix(n_intfus
, 1, False)
491 # record previous instruction to cast shadow on current instruction
492 prev_shadow
= Signal(n_intfus
)
494 # Branch Speculation recorder. tracks the success/fail state as
495 # each instruction is issued, so that when the branch occurs the
496 # allow/cancel can be issued as appropriate.
497 m
.submodules
.specrec
= bspec
= BranchSpeculationRecord(n_intfus
)
500 # ok start wiring things together...
501 # "now hear de word of de looord... dem bones dem bones dem dryy bones"
502 # https://www.youtube.com/watch?v=pYb8Wm6-QfA
506 # Issue Unit is where it starts. set up some in/outs for this module
508 comb
+= [ regdecode
.dest_i
.eq(self
.int_dest_i
),
509 regdecode
.src1_i
.eq(self
.int_src1_i
),
510 regdecode
.src2_i
.eq(self
.int_src2_i
),
511 regdecode
.enable_i
.eq(self
.reg_enable_i
),
512 self
.issue_o
.eq(issueunit
.issue_o
)
515 # take these to outside (issue needs them)
516 comb
+= cua
.oper_i
.eq(self
.alu_oper_i
)
517 comb
+= cua
.imm_i
.eq(self
.alu_imm_i
)
518 comb
+= cub
.oper_i
.eq(self
.br_oper_i
)
519 comb
+= cub
.imm_i
.eq(self
.br_imm_i
)
520 comb
+= cul
.oper_i
.eq(self
.ls_oper_i
)
521 comb
+= cul
.imm_i
.eq(self
.ls_imm_i
)
523 # TODO: issueunit.f (FP)
525 # and int function issue / busy arrays, and dest/src1/src2
526 comb
+= intfus
.dest_i
.eq(regdecode
.dest_o
)
527 comb
+= intfus
.src1_i
.eq(regdecode
.src1_o
)
528 comb
+= intfus
.src2_i
.eq(regdecode
.src2_o
)
530 fn_issue_o
= issueunit
.fn_issue_o
532 comb
+= intfus
.fn_issue_i
.eq(fn_issue_o
)
533 comb
+= issueunit
.busy_i
.eq(cu
.busy_o
)
534 comb
+= self
.busy_o
.eq(cu
.busy_o
.bool())
537 # merge shadow matrices outputs
540 # these are explained in ShadowMatrix docstring, and are to be
541 # connected to the FUReg and FUFU Matrices, to get them to reset
542 anydie
= Signal(n_intfus
, reset_less
=True)
543 allshadown
= Signal(n_intfus
, reset_less
=True)
544 shreset
= Signal(n_intfus
, reset_less
=True)
545 comb
+= allshadown
.eq(shadows
.shadown_o
& bshadow
.shadown_o
)
546 comb
+= anydie
.eq(shadows
.go_die_o | bshadow
.go_die_o
)
547 comb
+= shreset
.eq(bspec
.match_g_o | bspec
.match_f_o
)
550 # connect fu-fu matrix
553 # Group Picker... done manually for now.
554 go_rd_o
= intpick1
.go_rd_o
555 go_wr_o
= intpick1
.go_wr_o
556 go_rd_i
= intfus
.go_rd_i
557 go_wr_i
= intfus
.go_wr_i
558 go_die_i
= intfus
.go_die_i
559 # NOTE: connect to the shadowed versions so that they can "die" (reset)
560 comb
+= go_rd_i
[0:n_intfus
].eq(go_rd_o
[0:n_intfus
]) # rd
561 comb
+= go_wr_i
[0:n_intfus
].eq(go_wr_o
[0:n_intfus
]) # wr
562 comb
+= go_die_i
[0:n_intfus
].eq(anydie
[0:n_intfus
]) # die
566 comb
+= intpick1
.rd_rel_i
[0:n_intfus
].eq(cu
.rd_rel_o
[0:n_intfus
])
567 comb
+= intpick1
.req_rel_i
[0:n_intfus
].eq(cu
.req_rel_o
[0:n_intfus
])
568 int_rd_o
= intfus
.readable_o
569 int_wr_o
= intfus
.writable_o
570 comb
+= intpick1
.readable_i
[0:n_intfus
].eq(int_rd_o
[0:n_intfus
])
571 comb
+= intpick1
.writable_i
[0:n_intfus
].eq(int_wr_o
[0:n_intfus
])
577 comb
+= shadows
.issue_i
.eq(fn_issue_o
)
578 #comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
579 comb
+= shadows
.reset_i
[0:n_intfus
].eq(bshadow
.go_die_o
[0:n_intfus
])
581 # NOTE: this setup is for the instruction order preservation...
583 # connect shadows / go_dies to Computation Units
584 comb
+= cu
.shadown_i
[0:n_intfus
].eq(allshadown
)
585 comb
+= cu
.go_die_i
[0:n_intfus
].eq(anydie
)
587 # ok connect first n_int_fu shadows to busy lines, to create an
588 # instruction-order linked-list-like arrangement, using a bit-matrix
589 # (instead of e.g. a ring buffer).
592 # when written, the shadow can be cancelled (and was good)
593 for i
in range(n_intfus
):
594 comb
+= shadows
.s_good_i
[i
][0:n_intfus
].eq(go_wr_o
[0:n_intfus
])
596 # *previous* instruction shadows *current* instruction, and, obviously,
597 # if the previous is completed (!busy) don't cast the shadow!
598 comb
+= prev_shadow
.eq(~fn_issue_o
& cu
.busy_o
)
599 for i
in range(n_intfus
):
600 comb
+= shadows
.shadow_i
[i
][0:n_intfus
].eq(prev_shadow
)
603 # ... and this is for branch speculation. it uses the extra bit
604 # tacked onto the ShadowMatrix (hence shadow_wid=n_intfus+1)
605 # only needs to set shadow_i, s_fail_i and s_good_i
607 # issue captures shadow_i (if enabled)
608 comb
+= bshadow
.reset_i
[0:n_intfus
].eq(shreset
[0:n_intfus
])
610 bactive
= Signal(reset_less
=True)
611 comb
+= bactive
.eq((bspec
.active_i | br1
.issue_i
) & ~br1
.go_wr_i
)
613 # instruction being issued (fn_issue_o) has a shadow cast by the branch
614 with m
.If(bactive
& (self
.branch_succ_i | self
.branch_fail_i
)):
615 comb
+= bshadow
.issue_i
.eq(fn_issue_o
)
616 for i
in range(n_intfus
):
617 with m
.If(fn_issue_o
& (Const(1<<i
))):
618 comb
+= bshadow
.shadow_i
[i
][0].eq(1)
620 # finally, we need an indicator to the test infrastructure as to
621 # whether the branch succeeded or failed, plus, link up to the
622 # "recorder" of whether the instruction was under shadow or not
624 with m
.If(br1
.issue_i
):
625 sync
+= bspec
.active_i
.eq(1)
626 with m
.If(self
.branch_succ_i
):
627 comb
+= bspec
.good_i
.eq(fn_issue_o
& 0x1f) # XXX MAGIC CONSTANT
628 with m
.If(self
.branch_fail_i
):
629 comb
+= bspec
.fail_i
.eq(fn_issue_o
& 0x1f) # XXX MAGIC CONSTANT
631 # branch is active (TODO: a better signal: this is over-using the
632 # go_write signal - actually the branch should not be "writing")
633 with m
.If(br1
.go_wr_i
):
634 sync
+= self
.branch_direction_o
.eq(br1
.data_o
+Const(1, 2))
635 sync
+= bspec
.active_i
.eq(0)
636 comb
+= bspec
.br_i
.eq(1)
637 # branch occurs if data == 1, failed if data == 0
638 comb
+= bspec
.br_ok_i
.eq(br1
.data_o
== 1)
639 for i
in range(n_intfus
):
640 # *expected* direction of the branch matched against *actual*
641 comb
+= bshadow
.s_good_i
[i
][0].eq(bspec
.match_g_o
[i
])
643 comb
+= bshadow
.s_fail_i
[i
][0].eq(bspec
.match_f_o
[i
])
646 # Connect Register File(s)
648 comb
+= int_dest
.wen
.eq(intfus
.dest_rsel_o
)
649 comb
+= int_src1
.ren
.eq(intfus
.src1_rsel_o
)
650 comb
+= int_src2
.ren
.eq(intfus
.src2_rsel_o
)
652 # connect ALUs to regfile
653 comb
+= int_dest
.data_i
.eq(cu
.data_o
)
654 comb
+= cu
.src1_i
.eq(int_src1
.data_o
)
655 comb
+= cu
.src2_i
.eq(int_src2
.data_o
)
657 # connect ALU Computation Units
658 comb
+= cu
.go_rd_i
[0:n_intfus
].eq(go_rd_o
[0:n_intfus
])
659 comb
+= cu
.go_wr_i
[0:n_intfus
].eq(go_wr_o
[0:n_intfus
])
660 comb
+= cu
.issue_i
[0:n_intfus
].eq(fn_issue_o
[0:n_intfus
])
665 yield from self
.intregs
666 yield from self
.fpregs
667 yield self
.int_dest_i
668 yield self
.int_src1_i
669 yield self
.int_src2_i
671 yield self
.branch_succ_i
672 yield self
.branch_fail_i
673 yield self
.branch_direction_o
679 class IssueToScoreboard(Elaboratable
):
681 def __init__(self
, qlen
, n_in
, n_out
, rwid
, opwid
, n_regs
):
689 mqbits
= (int(log(qlen
) / log(2))+2, False)
690 self
.p_add_i
= Signal(mqbits
) # instructions to add (from data_i)
691 self
.p_ready_o
= Signal() # instructions were added
692 self
.data_i
= Instruction
.nq(n_in
, "data_i", rwid
, opwid
)
694 self
.busy_o
= Signal(reset_less
=True) # at least one CU is busy
695 self
.qlen_o
= Signal(mqbits
, reset_less
=True)
697 def elaborate(self
, platform
):
702 iq
= InstructionQ(self
.rwid
, self
.opw
, self
.qlen
, self
.n_in
, self
.n_out
)
703 sc
= Scoreboard(self
.rwid
, self
.n_regs
)
704 mem
= TestMemory(self
.rwid
, 8) # not too big, takes too long
707 m
.submodules
.mem
= mem
709 # get at the regfile for testing
710 self
.intregs
= sc
.intregs
712 # and the "busy" signal and instruction queue length
713 comb
+= self
.busy_o
.eq(sc
.busy_o
)
714 comb
+= self
.qlen_o
.eq(iq
.qlen_o
)
716 # link up instruction queue
717 comb
+= iq
.p_add_i
.eq(self
.p_add_i
)
718 comb
+= self
.p_ready_o
.eq(iq
.p_ready_o
)
719 for i
in range(self
.n_in
):
720 comb
+= eq(iq
.data_i
[i
], self
.data_i
[i
])
722 # take instruction and process it. note that it's possible to
723 # "inspect" the queue contents *without* actually removing the
724 # items. items are only removed when the
727 wait_issue_br
= Signal()
728 wait_issue_alu
= Signal()
729 wait_issue_ls
= Signal()
731 with m
.If(wait_issue_br | wait_issue_alu | wait_issue_ls
):
732 # set instruction pop length to 1 if the unit accepted
733 with m
.If(wait_issue_ls
& (sc
.lsissue
.fn_issue_o
!= 0)):
734 with m
.If(iq
.qlen_o
!= 0):
735 comb
+= iq
.n_sub_i
.eq(1)
736 with m
.If(wait_issue_br
& (sc
.brissue
.fn_issue_o
!= 0)):
737 with m
.If(iq
.qlen_o
!= 0):
738 comb
+= iq
.n_sub_i
.eq(1)
739 with m
.If(wait_issue_alu
& (sc
.aluissue
.fn_issue_o
!= 0)):
740 with m
.If(iq
.qlen_o
!= 0):
741 comb
+= iq
.n_sub_i
.eq(1)
743 # see if some instruction(s) are here. note that this is
744 # "inspecting" the in-place queue. note also that on the
745 # cycle following "waiting" for fn_issue_o to be set, the
746 # "resetting" done above (insn_i=0) could be re-ASSERTed.
747 with m
.If(iq
.qlen_o
!= 0):
748 # get the operands and operation
749 imm
= iq
.data_o
[0].imm_i
750 dest
= iq
.data_o
[0].dest_i
751 src1
= iq
.data_o
[0].src1_i
752 src2
= iq
.data_o
[0].src2_i
753 op
= iq
.data_o
[0].oper_i
754 opi
= iq
.data_o
[0].opim_i
# immediate set
756 # set the src/dest regs
757 comb
+= sc
.int_dest_i
.eq(dest
)
758 comb
+= sc
.int_src1_i
.eq(src1
)
759 comb
+= sc
.int_src2_i
.eq(src2
)
760 comb
+= sc
.reg_enable_i
.eq(1) # enable the regfile
762 # choose a Function-Unit-Group
763 with m
.If((op
& (0x3<<2)) != 0): # branch
764 comb
+= sc
.br_oper_i
.eq(Cat(op
[0:2], opi
))
765 comb
+= sc
.br_imm_i
.eq(imm
)
766 comb
+= sc
.brissue
.insn_i
.eq(1)
767 comb
+= wait_issue_br
.eq(1)
768 with m
.Elif((op
& (0x3<<4)) != 0): # ld/st
774 comb
+= sc
.ls_oper_i
.eq(Cat(op
[0], opi
[0], op
[4:6]))
775 comb
+= sc
.ls_imm_i
.eq(imm
)
776 comb
+= sc
.lsissue
.insn_i
.eq(1)
777 comb
+= wait_issue_ls
.eq(1)
779 comb
+= sc
.alu_oper_i
.eq(Cat(op
[0:2], opi
))
780 comb
+= sc
.alu_imm_i
.eq(imm
)
781 comb
+= sc
.aluissue
.insn_i
.eq(1)
782 comb
+= wait_issue_alu
.eq(1)
785 # these indicate that the instruction is to be made
786 # shadow-dependent on
787 # (either) branch success or branch fail
788 #yield sc.branch_fail_i.eq(branch_fail)
789 #yield sc.branch_succ_i.eq(branch_success)
795 for o
in self
.data_i
:
814 def __init__(self
, rwidth
, nregs
):
816 self
.regs
= [0] * nregs
818 def op(self
, op
, op_imm
, imm
, src1
, src2
, dest
):
819 maxbits
= (1 << self
.rwidth
) - 1
820 src1
= self
.regs
[src1
] & maxbits
824 src2
= self
.regs
[src2
] & maxbits
832 val
= src1
>> (src2
& maxbits
)
834 val
= int(src1
> src2
)
836 val
= int(src1
< src2
)
838 val
= int(src1
== src2
)
840 val
= int(src1
!= src2
)
842 return 0 # LD/ST TODO
844 self
.setval(dest
, val
)
def setval(self, dest, val):
    """Set the simulated register ``dest`` to ``val``.

    A trace line ("sim setval", the register number and the value in hex)
    is printed to stdout before ``self.regs[dest]`` is overwritten.
    """
    trace = ("sim setval", dest, hex(val))
    print(*trace)
    self.regs[dest] = val
852 for i
, val
in enumerate(self
.regs
):
853 reg
= yield dut
.intregs
.regs
[i
].reg
854 okstr
= "OK" if reg
== val
else "!ok"
855 print("reg %d expected %x received %x %s" % (i
, val
, reg
, okstr
))
857 def check(self
, dut
):
858 for i
, val
in enumerate(self
.regs
):
859 reg
= yield dut
.intregs
.regs
[i
].reg
861 print("reg %d expected %x received %x\n" % (i
, val
, reg
))
862 yield from self
.dump(dut
)
865 def instr_q(dut
, op
, op_imm
, imm
, src1
, src2
, dest
,
866 branch_success
, branch_fail
):
867 instrs
= [{'oper_i': op
, 'dest_i': dest
, 'imm_i': imm
, 'opim_i': op_imm
,
868 'src1_i': src1
, 'src2_i': src2
}]
871 for idx
in range(sendlen
):
872 yield from eq(dut
.data_i
[idx
], instrs
[idx
])
873 di
= yield dut
.data_i
[idx
]
874 print ("senddata %d %x" % (idx
, di
))
875 yield dut
.p_add_i
.eq(sendlen
)
877 o_p_ready
= yield dut
.p_ready_o
880 o_p_ready
= yield dut
.p_ready_o
882 yield dut
.p_add_i
.eq(0)
885 def int_instr(dut
, op
, imm
, src1
, src2
, dest
, branch_success
, branch_fail
):
886 yield from disable_issue(dut
)
887 yield dut
.int_dest_i
.eq(dest
)
888 yield dut
.int_src1_i
.eq(src1
)
889 yield dut
.int_src2_i
.eq(src2
)
890 if (op
& (0x3<<2)) != 0: # branch
891 yield dut
.brissue
.insn_i
.eq(1)
892 yield dut
.br_oper_i
.eq(Const(op
& 0x3, 2))
893 yield dut
.br_imm_i
.eq(imm
)
894 dut_issue
= dut
.brissue
896 yield dut
.aluissue
.insn_i
.eq(1)
897 yield dut
.alu_oper_i
.eq(Const(op
& 0x3, 2))
898 yield dut
.alu_imm_i
.eq(imm
)
899 dut_issue
= dut
.aluissue
900 yield dut
.reg_enable_i
.eq(1)
902 # these indicate that the instruction is to be made shadow-dependent on
903 # (either) branch success or branch fail
904 yield dut
.branch_fail_i
.eq(branch_fail
)
905 yield dut
.branch_succ_i
.eq(branch_success
)
908 yield from wait_for_issue(dut
, dut_issue
)
911 def print_reg(dut
, rnums
):
914 reg
= yield dut
.intregs
.regs
[rnum
].reg
915 rs
.append("%x" % reg
)
916 rnums
= map(str, rnums
)
917 print ("reg %s: %s" % (','.join(rnums
), ','.join(rs
)))
920 def create_random_ops(dut
, n_ops
, shadowing
=False, max_opnums
=3):
922 for i
in range(n_ops
):
923 src1
= randint(1, dut
.n_regs
-1)
924 src2
= randint(1, dut
.n_regs
-1)
925 imm
= randint(1, (1<<dut
.rwid
)-1)
926 dest
= randint(1, dut
.n_regs
-1)
927 op
= randint(0, max_opnums
)
928 opi
= 0 if randint(0, 2) else 1 # set true if random is nonzero
931 insts
.append((src1
, src2
, dest
, op
, opi
, imm
, (0, 0)))
933 insts
.append((src1
, src2
, dest
, op
, opi
, imm
))
937 def wait_for_busy_clear(dut
):
939 busy_o
= yield dut
.busy_o
def disable_issue(dut):
    """Yield assignments that clear ``insn_i`` on every issue group of ``dut``.

    This is a generator meant to be driven with ``yield from`` inside a
    simulation process.  It yields, in order, ``insn_i.eq(0)`` for the
    ``aluissue``, ``brissue`` and ``lsissue`` attributes of ``dut``.
    """
    # attributes are looked up lazily, one per yielded statement
    for group_name in ("aluissue", "brissue", "lsissue"):
        issue_group = getattr(dut, group_name)
        yield issue_group.insn_i.eq(0)
951 def wait_for_issue(dut
, dut_issue
):
953 issue_o
= yield dut_issue
.fn_issue_o
955 yield from disable_issue(dut
)
956 yield dut
.reg_enable_i
.eq(0)
959 #yield from print_reg(dut, [1,2,3])
961 #yield from print_reg(dut, [1,2,3])
963 def scoreboard_branch_sim(dut
, alusim
):
969 print ("rseed", iseed
)
973 yield dut
.branch_direction_o
.eq(0)
975 # set random values in the registers
976 for i
in range(1, dut
.n_regs
):
978 val
= randint(0, (1<<alusim
.rwidth
)-1)
979 yield dut
.intregs
.regs
[i
].reg
.eq(val
)
980 alusim
.setval(i
, val
)
983 # create some instructions: branches create a tree
984 insts
= create_random_ops(dut
, 1, True, 1)
985 #insts.append((6, 6, 1, 2, (0, 0)))
986 #insts.append((4, 3, 3, 0, (0, 0)))
988 src1
= randint(1, dut
.n_regs
-1)
989 src2
= randint(1, dut
.n_regs
-1)
991 op
= 4 # only BGT at the moment
993 branch_ok
= create_random_ops(dut
, 1, True, 1)
994 branch_fail
= create_random_ops(dut
, 1, True, 1)
996 insts
.append((src1
, src2
, (branch_ok
, branch_fail
), op
, (0, 0)))
1000 insts
.append( (3, 5, 2, 0, (0, 0)) )
1003 #branch_ok.append ( (5, 7, 5, 1, (1, 0)) )
1004 branch_ok
.append( None )
1005 branch_fail
.append( (1, 1, 2, 0, (0, 1)) )
1006 #branch_fail.append( None )
1007 insts
.append( (6, 4, (branch_ok
, branch_fail
), 4, (0, 0)) )
1009 siminsts
= deepcopy(insts
)
1011 # issue instruction(s)
1014 branch_direction
= 0
1019 branch_direction
= yield dut
.branch_direction_o
# way branch went
1020 (src1
, src2
, dest
, op
, (shadow_on
, shadow_off
)) = insts
.pop(0)
1021 if branch_direction
== 1 and shadow_on
:
1022 print ("skip", i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
)
1023 continue # branch was "success" and this is a "failed"... skip
1024 if branch_direction
== 2 and shadow_off
:
1025 print ("skip", i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
)
1026 continue # branch was "fail" and this is a "success"... skip
1027 if branch_direction
!= 0:
1032 branch_ok
, branch_fail
= dest
1034 # ok zip up the branch success / fail instructions and
1035 # drop them into the queue, one marked "to have branch success"
1036 # the other to be marked shadow branch "fail".
1037 # one out of each of these will be cancelled
1038 for ok
, fl
in zip(branch_ok
, branch_fail
):
1040 instrs
.append((ok
[0], ok
[1], ok
[2], ok
[3], (1, 0)))
1042 instrs
.append((fl
[0], fl
[1], fl
[2], fl
[3], (0, 1)))
1043 print ("instr %d: (%d, %d, %d, %d, (%d, %d))" % \
1044 (i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
))
1045 yield from int_instr(dut
, op
, src1
, src2
, dest
,
1046 shadow_on
, shadow_off
)
1048 # wait for all instructions to stop before checking
1050 yield from wait_for_busy_clear(dut
)
1054 instr
= siminsts
.pop(0)
1057 (src1
, src2
, dest
, op
, (shadow_on
, shadow_off
)) = instr
1061 branch_ok
, branch_fail
= dest
1063 print ("sim %d: (%d, %d, %d, %d, (%d, %d))" % \
1064 (i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
))
1065 branch_res
= alusim
.op(op
, src1
, src2
, dest
)
1068 siminsts
+= branch_ok
1070 siminsts
+= branch_fail
1073 yield from alusim
.check(dut
)
1074 yield from alusim
.dump(dut
)
1077 def scoreboard_sim(dut
, alusim
):
1083 # set random values in the registers
1084 for i
in range(1, dut
.n_regs
):
1085 val
= randint(0, (1<<alusim
.rwidth
)-1)
1088 yield dut
.intregs
.regs
[i
].reg
.eq(val
)
1089 alusim
.setval(i
, val
)
1091 # create some instructions (some random, some regression tests)
1094 instrs
= create_random_ops(dut
, 15, True, 4)
1096 if True: # LD test (with immediate)
1097 instrs
.append( (1, 2, 2, 0x10, 1, 20, (0, 0)) )
1100 instrs
.append( (1, 2, 2, 1, 1, 20, (0, 0)) )
1103 instrs
.append( (7, 3, 2, 4, (0, 0)) )
1104 instrs
.append( (7, 6, 6, 2, (0, 0)) )
1105 instrs
.append( (1, 7, 2, 2, (0, 0)) )
1108 instrs
.append((2, 3, 3, 0, 0, 0, (0, 0)))
1109 instrs
.append((5, 3, 3, 1, 0, 0, (0, 0)))
1110 instrs
.append((3, 5, 5, 2, 0, 0, (0, 0)))
1111 instrs
.append((5, 3, 3, 3, 0, 0, (0, 0)))
1112 instrs
.append((3, 5, 5, 0, 0, 0, (0, 0)))
1115 instrs
.append( (3, 3, 4, 0, 0, 13979, (0, 0)))
1116 instrs
.append( (6, 4, 1, 2, 0, 40976, (0, 0)))
1117 instrs
.append( (1, 4, 7, 4, 1, 23652, (0, 0)))
1120 instrs
.append((5, 6, 2, 1))
1121 instrs
.append((2, 2, 4, 0))
1122 #instrs.append((2, 2, 3, 1))
1125 instrs
.append((2, 1, 2, 3))
1128 instrs
.append((2, 6, 2, 1))
1129 instrs
.append((2, 1, 2, 0))
1132 instrs
.append((1, 2, 7, 2))
1133 instrs
.append((7, 1, 5, 0))
1134 instrs
.append((4, 4, 1, 1))
1137 instrs
.append((5, 6, 2, 2))
1138 instrs
.append((1, 1, 4, 1))
1139 instrs
.append((6, 5, 3, 0))
1142 # Write-after-Write Hazard
1143 instrs
.append( (3, 6, 7, 2) )
1144 instrs
.append( (4, 4, 7, 1) )
1147 # self-read/write-after-write followed by Read-after-Write
1148 instrs
.append((1, 1, 1, 1))
1149 instrs
.append((1, 5, 3, 0))
1152 # Read-after-Write followed by self-read-after-write
1153 instrs
.append((5, 6, 1, 2))
1154 instrs
.append((1, 1, 1, 1))
1157 # self-read-write sandwich
1158 instrs
.append((5, 6, 1, 2))
1159 instrs
.append((1, 1, 1, 1))
1160 instrs
.append((1, 5, 3, 0))
1163 # very weird failure
1164 instrs
.append( (5, 2, 5, 2) )
1165 instrs
.append( (2, 6, 3, 0) )
1166 instrs
.append( (4, 2, 2, 1) )
1170 yield dut
.intregs
.regs
[5].reg
.eq(v1
)
1171 alusim
.setval(5, v1
)
1172 yield dut
.intregs
.regs
[3].reg
.eq(5)
1174 instrs
.append((5, 3, 3, 4, (0, 0)))
1175 instrs
.append((4, 2, 1, 2, (0, 1)))
1179 yield dut
.intregs
.regs
[5].reg
.eq(v1
)
1180 alusim
.setval(5, v1
)
1181 yield dut
.intregs
.regs
[3].reg
.eq(5)
1183 instrs
.append((5, 3, 3, 4, (0, 0)))
1184 instrs
.append((4, 2, 1, 2, (1, 0)))
1187 instrs
.append( (4, 3, 5, 1, 0, (0, 0)) )
1188 instrs
.append( (5, 2, 3, 1, 0, (0, 0)) )
1189 instrs
.append( (7, 1, 5, 2, 0, (0, 0)) )
1190 instrs
.append( (5, 6, 6, 4, 0, (0, 0)) )
1191 instrs
.append( (7, 5, 2, 2, 0, (1, 0)) )
1192 instrs
.append( (1, 7, 5, 0, 0, (0, 1)) )
1193 instrs
.append( (1, 6, 1, 2, 0, (1, 0)) )
1194 instrs
.append( (1, 6, 7, 3, 0, (0, 0)) )
1195 instrs
.append( (6, 7, 7, 0, 0, (0, 0)) )
1197 # issue instruction(s), wait for issue to be free before proceeding
1198 for i
, instr
in enumerate(instrs
):
1199 src1
, src2
, dest
, op
, opi
, imm
, (br_ok
, br_fail
) = instr
1201 print ("instr %d: (%d, %d, %d, %d, %d, %d)" % \
1202 (i
, src1
, src2
, dest
, op
, opi
, imm
))
1203 alusim
.op(op
, opi
, imm
, src1
, src2
, dest
)
1204 yield from instr_q(dut
, op
, opi
, imm
, src1
, src2
, dest
,
1207 # wait for all instructions to stop before checking
1209 iqlen
= yield dut
.qlen_o
1217 yield from wait_for_busy_clear(dut
)
1220 yield from alusim
.check(dut
)
1221 yield from alusim
.dump(dut
)
1224 def test_scoreboard():
1225 dut
= IssueToScoreboard(2, 1, 1, 16, 8, 8)
1226 alusim
= RegSim(16, 8)
1227 memsim
= MemSim(16, 16)
1228 vl
= rtlil
.convert(dut
, ports
=dut
.ports())
1229 with
open("test_scoreboard6600.il", "w") as f
:
1232 run_simulation(dut
, scoreboard_sim(dut
, alusim
),
1233 vcd_name
='test_scoreboard6600.vcd')
1235 #run_simulation(dut, scoreboard_branch_sim(dut, alusim),
1236 # vcd_name='test_scoreboard6600.vcd')
1239 if __name__
== '__main__':