1 from nmigen
.compat
.sim
import run_simulation
2 from nmigen
.cli
import verilog
, rtlil
3 from nmigen
import Module
, Const
, Signal
, Array
, Cat
, Elaboratable
, Memory
5 from regfile
.regfile
import RegFileArray
, treereduce
6 from scoreboard
.fu_fu_matrix
import FUFUDepMatrix
7 from scoreboard
.fu_reg_matrix
import FURegDepMatrix
8 from scoreboard
.global_pending
import GlobalPending
9 from scoreboard
.group_picker
import GroupPicker
10 from scoreboard
.issue_unit
import IssueUnitGroup
, IssueUnitArray
, RegDecode
11 from scoreboard
.shadow
import ShadowMatrix
, BranchSpeculationRecord
12 from scoreboard
.instruction_q
import Instruction
, InstructionQ
13 from scoreboard
.memfu
import MemFunctionUnits
15 from compalu
import ComputationUnitNoDelay
16 from compldst
import LDSTCompUnit
18 from alu_hier
import ALU
, BranchALU
19 from nmutil
.latch
import SRLatch
20 from nmutil
.nmoperator
import eq
22 from random
import randint
, seed
23 from copy
import deepcopy
class TestMemory(Elaboratable):
    """Test memory: an nmigen ``Memory`` with one read and one write port.

    * :regwid: width in bits of each memory word (``dat_r`` / ``dat_w``)
    * :addrw:  width in bits of the shared address input ``adr``

    The memory is initialised so that word N holds the value N.
    """

    def __init__(self, regwid, addrw):
        self.ddepth = 1  # regwid //8
        depth = (1 << addrw) // self.ddepth
        self.adr = Signal(addrw)     # address, shared by both ports
        self.dat_r = Signal(regwid)  # data read from the read port
        self.dat_w = Signal(regwid)  # data presented to the write port
        self.we = Signal()           # write enable, consumed by elaborate()
        self.mem = Memory(width=regwid, depth=depth, init=range(0, depth))

    def elaborate(self, platform):
        """Create both memory ports and wire them to the public signals."""
        m = Module()
        m.submodules.rdport = rdport = self.mem.read_port()
        m.submodules.wrport = wrport = self.mem.write_port()
        m.d.comb += [
            rdport.addr.eq(self.adr[self.ddepth:]),  # ignore low bits
            self.dat_r.eq(rdport.data),
            wrport.addr.eq(self.adr),
            wrport.data.eq(self.dat_w),
            wrport.en.eq(self.we),
        ]
        return m
class MemSim:
    """Pure-Python memory model used alongside the hardware test memory.

    * :regwid: width in bits of each stored word; stores are masked to it
    * :addrw:  address width; the model holds ``(1 << addrw) // ddepth`` words

    Word N initially holds the value N.
    """

    def __init__(self, regwid, addrw):
        self.regwid = regwid  # needed by st() to mask the stored value
        self.ddepth = 1  # regwid//8
        depth = (1 << addrw) // self.ddepth
        self.mem = list(range(0, depth))

    def ld(self, addr):
        """Return the word selected by *addr* (low ``ddepth`` bits dropped)."""
        return self.mem[addr >> self.ddepth]

    def st(self, addr, data):
        """Store *data*, truncated to ``regwid`` bits, at *addr*."""
        self.mem[addr >> self.ddepth] = data & ((1 << self.regwid) - 1)
class CompUnitsBase(Elaboratable):
    """ Computation Unit Base class.

        Groups a sequence of ALU Computation Units that handle the same
        operations.  The per-unit control bits (issue, go_rd, go_wr,
        shadow, go_die, busy, release requests) are concatenated into
        signals that are n_units wide.

        The class also works recursively: the "units" may themselves be
        CompUnitsBase instances, in which case n_units is the sum of the
        members' n_units and the concatenation covers all of them.

        The data output of every unit is merged into the single data_o,
        and src1_i / src2_i are broadcast to every unit.
    """

    def __init__(self, rwid, units, ldstmode=False):
        """ Inputs:

            * :rwid:   bit width of register file(s) - both FP and INT
            * :units: sequence of ALUs (or CompUnitsBase derivatives)
            * :ldstmode: when True, elaborate() also concatenates the
                         load/store signals (adr_rel_o, sto_rel_o,
                         load_mem_o, stwd_mem_o, go_ad_i) of the units
        """
        self.units = units
        self.ldstmode = ldstmode
        self.rwid = rwid
        if units and isinstance(units[0], CompUnitsBase):
            # group of groups: total width is the sum of the members
            self.n_units = sum(u.n_units for u in units)
        else:
            self.n_units = len(units)

        n_units = self.n_units

        # inputs
        self.issue_i = Signal(n_units, reset_less=True)
        self.go_rd_i = Signal(n_units, reset_less=True)
        self.go_wr_i = Signal(n_units, reset_less=True)
        self.shadown_i = Signal(n_units, reset_less=True)
        self.go_die_i = Signal(n_units, reset_less=True)
        self.go_ad_i = Signal(n_units, reset_less=True)

        # outputs
        self.busy_o = Signal(n_units, reset_less=True)
        self.rd_rel_o = Signal(n_units, reset_less=True)
        # req_rel_o is created exactly once (it used to be assigned twice,
        # the first Signal being silently discarded)
        self.req_rel_o = Signal(n_units, reset_less=True)
        self.adr_rel_o = Signal(n_units, reset_less=True)
        self.sto_rel_o = Signal(n_units, reset_less=True)
        self.load_mem_o = Signal(n_units, reset_less=True)
        self.stwd_mem_o = Signal(n_units, reset_less=True)

        # in/out register data (note: not register#, actual data)
        self.data_o = Signal(rwid, reset_less=True)
        self.src1_i = Signal(rwid, reset_less=True)
        self.src2_i = Signal(rwid, reset_less=True)

    def elaborate(self, platform):
        """Register every unit as a submodule and wire up the group."""
        m = Module()
        comb = m.d.comb

        for i, alu in enumerate(self.units):
            setattr(m.submodules, "comp%d" % i, alu)

        units = self.units

        # outputs: concatenate each unit's bit(s) into the group signal
        comb += self.rd_rel_o.eq(Cat(*[alu.rd_rel_o for alu in units]))
        comb += self.req_rel_o.eq(Cat(*[alu.req_rel_o for alu in units]))
        comb += self.busy_o.eq(Cat(*[alu.busy_o for alu in units]))
        # inputs: split the group signal back out across the units
        comb += Cat(*[alu.go_die_i for alu in units]).eq(self.go_die_i)
        comb += Cat(*[alu.shadown_i for alu in units]).eq(self.shadown_i)
        comb += Cat(*[alu.go_wr_i for alu in units]).eq(self.go_wr_i)
        comb += Cat(*[alu.go_rd_i for alu in units]).eq(self.go_rd_i)
        comb += Cat(*[alu.issue_i for alu in units]).eq(self.issue_i)

        # connect data register input/output

        # merge (OR) all integer FU / ALU outputs to a single value
        # bit of a hack: treereduce needs a list with an item named "data_o"
        if units:
            data_o = treereduce(units)
            comb += self.data_o.eq(data_o)

        for alu in units:
            comb += alu.src1_i.eq(self.src1_i)
            comb += alu.src2_i.eq(self.src2_i)

        if not self.ldstmode:
            return m

        # load/store mode only: the units also carry address / store
        # release and memory-access signals
        comb += self.adr_rel_o.eq(Cat(*[alu.adr_rel_o for alu in units]))
        comb += self.sto_rel_o.eq(Cat(*[alu.sto_rel_o for alu in units]))
        comb += self.load_mem_o.eq(Cat(*[alu.load_mem_o for alu in units]))
        comb += self.stwd_mem_o.eq(Cat(*[alu.stwd_mem_o for alu in units]))
        comb += Cat(*[alu.go_ad_i for alu in units]).eq(self.go_ad_i)

        return m
class CompUnitLDSTs(CompUnitsBase):
    """Group of two load/store Computation Units sharing one operation."""

    def __init__(self, rwid, opwid, mem):
        """ Inputs:

            * :rwid:   bit width of register file(s) - both FP and INT
            * :opwid:  operand bit width
            * :mem:    passed unchanged to every LDSTCompUnit
        """
        self.opwid = opwid

        # inputs
        self.oper_i = Signal(opwid, reset_less=True)
        self.imm_i = Signal(rwid, reset_less=True)

        # one ALU per LD/ST unit
        # NOTE(review): the construction of add1/add2 is not visible in
        # the reviewed text; ALU(rwid) mirrors CompUnitALUs - confirm.
        add1 = ALU(rwid)
        add2 = ALU(rwid)

        units = []
        for alu in [add1, add2]:
            aluopwid = 4  # see compldst.py for "internal" opcode
            units.append(LDSTCompUnit(rwid, aluopwid, alu, mem))

        CompUnitsBase.__init__(self, rwid, units, ldstmode=True)

    def elaborate(self, platform):
        """Extend the base wiring with the shared operation / immediate."""
        m = CompUnitsBase.elaborate(self, platform)
        comb = m.d.comb

        # hand the same operation to all units, 4 lower bits though
        for alu in self.units:
            comb += alu.oper_i[0:4].eq(self.oper_i)
            comb += alu.imm_i.eq(self.imm_i)
            comb += alu.isalu_i.eq(0)

        return m
class CompUnitALUs(CompUnitsBase):
    """Group of n_alus integer ALU Computation Units sharing one operation."""

    def __init__(self, rwid, opwid, n_alus):
        """ Inputs:

            * :rwid:   bit width of register file(s) - both FP and INT
            * :opwid:  operand bit width
            * :n_alus: number of ALU / Computation Unit pairs to create
        """
        self.opwid = opwid

        # inputs
        self.oper_i = Signal(opwid, reset_less=True)
        self.imm_i = Signal(rwid, reset_less=True)

        # Int ALUs
        alus = [ALU(rwid) for _ in range(n_alus)]

        # one Computation Unit wrapped around each ALU
        units = []
        for alu in alus:
            aluopwid = 3  # extra bit for immediate mode
            units.append(ComputationUnitNoDelay(rwid, aluopwid, alu))

        CompUnitsBase.__init__(self, rwid, units)

    def elaborate(self, platform):
        """Extend the base wiring with the shared operation / immediate."""
        m = CompUnitsBase.elaborate(self, platform)
        comb = m.d.comb

        # hand the same operation to all units, only lower 3 bits though
        for alu in self.units:
            comb += alu.oper_i[0:3].eq(self.oper_i)
            comb += alu.imm_i.eq(self.imm_i)

        return m
class CompUnitBR(CompUnitsBase):
    """Group holding the single branch Computation Unit."""

    def __init__(self, rwid, opwid):
        """ Inputs:

            * :rwid:   bit width of register file(s) - both FP and INT
            * :opwid:  operand bit width

            Note: bgt unit is returned so that a shadow unit can be created
            (it is kept as self.bgt; the Computation Unit wrapping it is
            kept as self.br1)
        """
        self.opwid = opwid

        # inputs
        self.oper_i = Signal(opwid, reset_less=True)
        self.imm_i = Signal(rwid, reset_less=True)

        # Branch ALU and its Computation Unit
        self.bgt = BranchALU(rwid)
        aluopwid = 3  # extra bit for immediate mode
        self.br1 = ComputationUnitNoDelay(rwid, aluopwid, self.bgt)
        CompUnitsBase.__init__(self, rwid, [self.br1])

    def elaborate(self, platform):
        """Extend the base wiring with the shared operation / immediate."""
        m = CompUnitsBase.elaborate(self, platform)
        comb = m.d.comb

        # hand the same operation to all units
        for alu in self.units:
            comb += alu.oper_i.eq(self.oper_i)
            comb += alu.imm_i.eq(self.imm_i)

        return m
class FunctionUnits(Elaboratable):
    """Integer dependency tracking: FU-FU and FU-Reg matrices wired together.

    * :n_regs:     width of the register-select vectors (dest/src1/src2)
    * :n_int_alus: number of function units tracked by both matrices
    """

    def __init__(self, n_regs, n_int_alus):
        self.n_regs = n_regs  # read by elaborate() to size the FU-Reg matrix
        self.n_int_alus = n_int_alus

        self.dest_i = Signal(n_regs, reset_less=True)  # Dest R# in
        self.src1_i = Signal(n_regs, reset_less=True)  # oper1 R# in
        self.src2_i = Signal(n_regs, reset_less=True)  # oper2 R# in

        self.g_int_rd_pend_o = Signal(n_regs, reset_less=True)
        self.g_int_wr_pend_o = Signal(n_regs, reset_less=True)

        self.dest_rsel_o = Signal(n_regs, reset_less=True)  # dest reg (bot)
        self.src1_rsel_o = Signal(n_regs, reset_less=True)  # src1 reg (bot)
        self.src2_rsel_o = Signal(n_regs, reset_less=True)  # src2 reg (bot)

        self.readable_o = Signal(n_int_alus, reset_less=True)
        self.writable_o = Signal(n_int_alus, reset_less=True)

        self.go_rd_i = Signal(n_int_alus, reset_less=True)
        self.go_wr_i = Signal(n_int_alus, reset_less=True)
        self.go_die_i = Signal(n_int_alus, reset_less=True)
        self.fn_issue_i = Signal(n_int_alus, reset_less=True)

        # Note: FURegs wr_pend_o is also outputted from here, for use in WaWGrid
        # (the attribute itself is only created by elaborate())

    def elaborate(self, platform):
        """Instantiate both dependency matrices and connect them."""
        m = Module()
        comb = m.d.comb

        n_intfus = self.n_int_alus

        # Integer FU-FU Dep Matrix
        intfudeps = FUFUDepMatrix(n_intfus, n_intfus)
        m.submodules.intfudeps = intfudeps
        # Integer FU-Reg Dep Matrix
        intregdeps = FURegDepMatrix(n_intfus, self.n_regs, 2)
        m.submodules.intregdeps = intregdeps

        # global (per-register) pending vectors, exported ...
        comb += self.g_int_rd_pend_o.eq(intregdeps.v_rd_rsel_o)
        comb += self.g_int_wr_pend_o.eq(intregdeps.v_wr_rsel_o)

        # ... and fed back into the FU-Reg matrix
        comb += intregdeps.rd_pend_i.eq(intregdeps.v_rd_rsel_o)
        comb += intregdeps.wr_pend_i.eq(intregdeps.v_wr_rsel_o)

        # per-FU pending vectors go from the FU-Reg to the FU-FU matrix
        comb += intfudeps.rd_pend_i.eq(intregdeps.rd_pend_o)
        comb += intfudeps.wr_pend_i.eq(intregdeps.wr_pend_o)
        self.wr_pend_o = intregdeps.wr_pend_o  # also output for use in WaWGrid

        comb += intfudeps.issue_i.eq(self.fn_issue_i)
        comb += intfudeps.go_rd_i.eq(self.go_rd_i)
        comb += intfudeps.go_wr_i.eq(self.go_wr_i)
        comb += intfudeps.go_die_i.eq(self.go_die_i)
        comb += self.readable_o.eq(intfudeps.readable_o)
        comb += self.writable_o.eq(intfudeps.writable_o)

        # Connect function issue / arrays, and dest/src1/src2
        comb += intregdeps.dest_i.eq(self.dest_i)
        comb += intregdeps.src_i[0].eq(self.src1_i)
        comb += intregdeps.src_i[1].eq(self.src2_i)

        comb += intregdeps.go_rd_i.eq(self.go_rd_i)
        comb += intregdeps.go_wr_i.eq(self.go_wr_i)
        comb += intregdeps.go_die_i.eq(self.go_die_i)
        comb += intregdeps.issue_i.eq(self.fn_issue_i)

        comb += self.dest_rsel_o.eq(intregdeps.dest_rsel_o)
        comb += self.src1_rsel_o.eq(intregdeps.src_rsel_o[0])
        comb += self.src2_rsel_o.eq(intregdeps.src_rsel_o[1])

        return m
389 class Scoreboard(Elaboratable
):
def __init__(self, rwid, n_regs):
    """ Inputs:

        * :rwid:   bit width of register file(s) - both FP and INT
        * :n_regs: depth of register file(s) - number of FP and INT regs
    """
    # both are read back by elaborate(), so they must be kept
    self.rwid = rwid
    self.n_regs = n_regs

    # Register Files
    self.intregs = RegFileArray(rwid, n_regs)
    self.fpregs = RegFileArray(rwid, n_regs)

    # issue q needs to get at these
    self.aluissue = IssueUnitGroup(4)
    self.brissue = IssueUnitGroup(1)
    self.lsissue = IssueUnitGroup(1)
    # and these: operation and immediate for each Function-Unit-Group
    self.alu_oper_i = Signal(4, reset_less=True)
    self.alu_imm_i = Signal(rwid, reset_less=True)
    self.br_oper_i = Signal(4, reset_less=True)
    self.br_imm_i = Signal(rwid, reset_less=True)
    self.ls_oper_i = Signal(4, reset_less=True)
    self.ls_imm_i = Signal(rwid, reset_less=True)

    # inputs
    self.int_dest_i = Signal(max=n_regs, reset_less=True)  # Dest R# in
    self.int_src1_i = Signal(max=n_regs, reset_less=True)  # oper1 R# in
    self.int_src2_i = Signal(max=n_regs, reset_less=True)  # oper2 R# in
    self.reg_enable_i = Signal(reset_less=True)  # enable reg decode

    # outputs
    self.issue_o = Signal(reset_less=True)  # instruction was accepted
    self.busy_o = Signal(reset_less=True)  # at least one CU is busy

    # for branch speculation experiment.  branch_direction = 0 if
    # the branch hasn't been met yet.  1 indicates "success", 2 is "fail"
    # branch_succ and branch_fail are requests to have the current
    # instruction be dependent on the branch unit "shadow" capability.
    self.branch_succ_i = Signal(reset_less=True)
    self.branch_fail_i = Signal(reset_less=True)
    self.branch_direction_o = Signal(2, reset_less=True)
433 def elaborate(self
, platform
):
438 m
.submodules
.intregs
= self
.intregs
439 m
.submodules
.fpregs
= self
.fpregs
442 int_dest
= self
.intregs
.write_port("dest")
443 int_src1
= self
.intregs
.read_port("src1")
444 int_src2
= self
.intregs
.read_port("src2")
446 fp_dest
= self
.fpregs
.write_port("dest")
447 fp_src1
= self
.fpregs
.read_port("src1")
448 fp_src2
= self
.fpregs
.read_port("src2")
450 # Int ALUs and BR ALUs
452 cua
= CompUnitALUs(self
.rwid
, 3, n_alus
=4)
453 cub
= CompUnitBR(self
.rwid
, 3) # 1 BR ALUs
457 cul
= CompUnitLDSTs(self
.rwid
, 3, None)
460 m
.submodules
.cu
= cu
= CompUnitsBase(self
.rwid
, [cua
, cub
, cul
])
461 bgt
= cub
.bgt
# get at the branch computation unit
465 m
.submodules
.intfus
= intfus
= FunctionUnits(self
.n_regs
, n_int_alus
)
468 m
.submodules
.memfus
= memfus
= MemFunctionUnits(n_ldsts
, 5)
470 # Count of number of FUs
471 n_intfus
= n_int_alus
472 n_fp_fus
= 0 # for now
474 # Integer Priority Picker 1: Adder + Subtractor (and LD/ST)
475 intpick1
= GroupPicker(n_intfus
) # picks 1 reader and 1 writer to intreg
476 m
.submodules
.intpick1
= intpick1
479 regdecode
= RegDecode(self
.n_regs
)
480 m
.submodules
.regdecode
= regdecode
481 issueunit
= IssueUnitArray([self
.aluissue
, self
.brissue
, self
.lsissue
])
482 m
.submodules
.issueunit
= issueunit
484 # Shadow Matrix. currently n_intfus shadows, to be used for
485 # write-after-write hazards. NOTE: there is one extra for branches,
486 # so the shadow width is increased by 1
487 m
.submodules
.shadows
= shadows
= ShadowMatrix(n_intfus
, n_intfus
, True)
488 m
.submodules
.bshadow
= bshadow
= ShadowMatrix(n_intfus
, 1, False)
490 # record previous instruction to cast shadow on current instruction
491 prev_shadow
= Signal(n_intfus
)
493 # Branch Speculation recorder. tracks the success/fail state as
494 # each instruction is issued, so that when the branch occurs the
495 # allow/cancel can be issued as appropriate.
496 m
.submodules
.specrec
= bspec
= BranchSpeculationRecord(n_intfus
)
499 # ok start wiring things together...
500 # "now hear de word of de looord... dem bones dem bones dem dryy bones"
501 # https://www.youtube.com/watch?v=pYb8Wm6-QfA
505 # Issue Unit is where it starts. set up some in/outs for this module
507 comb
+= [ regdecode
.dest_i
.eq(self
.int_dest_i
),
508 regdecode
.src1_i
.eq(self
.int_src1_i
),
509 regdecode
.src2_i
.eq(self
.int_src2_i
),
510 regdecode
.enable_i
.eq(self
.reg_enable_i
),
511 self
.issue_o
.eq(issueunit
.issue_o
)
514 # take these to outside (issue needs them)
515 comb
+= cua
.oper_i
.eq(self
.alu_oper_i
)
516 comb
+= cua
.imm_i
.eq(self
.alu_imm_i
)
517 comb
+= cub
.oper_i
.eq(self
.br_oper_i
)
518 comb
+= cub
.imm_i
.eq(self
.br_imm_i
)
519 comb
+= cul
.oper_i
.eq(self
.ls_oper_i
)
520 comb
+= cul
.imm_i
.eq(self
.ls_imm_i
)
522 # TODO: issueunit.f (FP)
524 # and int function issue / busy arrays, and dest/src1/src2
525 comb
+= intfus
.dest_i
.eq(regdecode
.dest_o
)
526 comb
+= intfus
.src1_i
.eq(regdecode
.src1_o
)
527 comb
+= intfus
.src2_i
.eq(regdecode
.src2_o
)
529 fn_issue_o
= issueunit
.fn_issue_o
531 comb
+= intfus
.fn_issue_i
.eq(fn_issue_o
)
532 comb
+= issueunit
.busy_i
.eq(cu
.busy_o
)
533 comb
+= self
.busy_o
.eq(cu
.busy_o
.bool())
536 # merge shadow matrices outputs
539 # these are explained in ShadowMatrix docstring, and are to be
540 # connected to the FUReg and FUFU Matrices, to get them to reset
541 anydie
= Signal(n_intfus
, reset_less
=True)
542 allshadown
= Signal(n_intfus
, reset_less
=True)
543 shreset
= Signal(n_intfus
, reset_less
=True)
544 comb
+= allshadown
.eq(shadows
.shadown_o
& bshadow
.shadown_o
)
545 comb
+= anydie
.eq(shadows
.go_die_o | bshadow
.go_die_o
)
546 comb
+= shreset
.eq(bspec
.match_g_o | bspec
.match_f_o
)
549 # connect fu-fu matrix
552 # Group Picker... done manually for now.
553 go_rd_o
= intpick1
.go_rd_o
554 go_wr_o
= intpick1
.go_wr_o
555 go_rd_i
= intfus
.go_rd_i
556 go_wr_i
= intfus
.go_wr_i
557 go_die_i
= intfus
.go_die_i
558 # NOTE: connect to the shadowed versions so that they can "die" (reset)
559 comb
+= go_rd_i
[0:n_intfus
].eq(go_rd_o
[0:n_intfus
]) # rd
560 comb
+= go_wr_i
[0:n_intfus
].eq(go_wr_o
[0:n_intfus
]) # wr
561 comb
+= go_die_i
[0:n_intfus
].eq(anydie
[0:n_intfus
]) # die
565 comb
+= intpick1
.rd_rel_i
[0:n_intfus
].eq(cu
.rd_rel_o
[0:n_intfus
])
566 comb
+= intpick1
.req_rel_i
[0:n_intfus
].eq(cu
.req_rel_o
[0:n_intfus
])
567 int_rd_o
= intfus
.readable_o
568 int_wr_o
= intfus
.writable_o
569 comb
+= intpick1
.readable_i
[0:n_intfus
].eq(int_rd_o
[0:n_intfus
])
570 comb
+= intpick1
.writable_i
[0:n_intfus
].eq(int_wr_o
[0:n_intfus
])
576 comb
+= shadows
.issue_i
.eq(fn_issue_o
)
577 #comb += shadows.reset_i[0:n_intfus].eq(bshadow.go_die_o[0:n_intfus])
578 comb
+= shadows
.reset_i
[0:n_intfus
].eq(bshadow
.go_die_o
[0:n_intfus
])
580 # NOTE: this setup is for the instruction order preservation...
582 # connect shadows / go_dies to Computation Units
583 comb
+= cu
.shadown_i
[0:n_intfus
].eq(allshadown
)
584 comb
+= cu
.go_die_i
[0:n_intfus
].eq(anydie
)
586 # ok connect first n_int_fu shadows to busy lines, to create an
587 # instruction-order linked-list-like arrangement, using a bit-matrix
588 # (instead of e.g. a ring buffer).
591 # when written, the shadow can be cancelled (and was good)
592 for i
in range(n_intfus
):
593 comb
+= shadows
.s_good_i
[i
][0:n_intfus
].eq(go_wr_o
[0:n_intfus
])
595 # *previous* instruction shadows *current* instruction, and, obviously,
596 # if the previous is completed (!busy) don't cast the shadow!
597 comb
+= prev_shadow
.eq(~fn_issue_o
& cu
.busy_o
)
598 for i
in range(n_intfus
):
599 comb
+= shadows
.shadow_i
[i
][0:n_intfus
].eq(prev_shadow
)
602 # ... and this is for branch speculation. it uses the extra bit
603 # tacked onto the ShadowMatrix (hence shadow_wid=n_intfus+1)
604 # only needs to set shadow_i, s_fail_i and s_good_i
606 # issue captures shadow_i (if enabled)
607 comb
+= bshadow
.reset_i
[0:n_intfus
].eq(shreset
[0:n_intfus
])
609 bactive
= Signal(reset_less
=True)
610 comb
+= bactive
.eq((bspec
.active_i | br1
.issue_i
) & ~br1
.go_wr_i
)
612 # instruction being issued (fn_issue_o) has a shadow cast by the branch
613 with m
.If(bactive
& (self
.branch_succ_i | self
.branch_fail_i
)):
614 comb
+= bshadow
.issue_i
.eq(fn_issue_o
)
615 for i
in range(n_intfus
):
616 with m
.If(fn_issue_o
& (Const(1<<i
))):
617 comb
+= bshadow
.shadow_i
[i
][0].eq(1)
619 # finally, we need an indicator to the test infrastructure as to
620 # whether the branch succeeded or failed, plus, link up to the
621 # "recorder" of whether the instruction was under shadow or not
623 with m
.If(br1
.issue_i
):
624 sync
+= bspec
.active_i
.eq(1)
625 with m
.If(self
.branch_succ_i
):
626 comb
+= bspec
.good_i
.eq(fn_issue_o
& 0x1f)
627 with m
.If(self
.branch_fail_i
):
628 comb
+= bspec
.fail_i
.eq(fn_issue_o
& 0x1f)
630 # branch is active (TODO: a better signal: this is over-using the
631 # go_write signal - actually the branch should not be "writing")
632 with m
.If(br1
.go_wr_i
):
633 sync
+= self
.branch_direction_o
.eq(br1
.data_o
+Const(1, 2))
634 sync
+= bspec
.active_i
.eq(0)
635 comb
+= bspec
.br_i
.eq(1)
636 # branch occurs if data == 1, failed if data == 0
637 comb
+= bspec
.br_ok_i
.eq(br1
.data_o
== 1)
638 for i
in range(n_intfus
):
639 # *expected* direction of the branch matched against *actual*
640 comb
+= bshadow
.s_good_i
[i
][0].eq(bspec
.match_g_o
[i
])
642 comb
+= bshadow
.s_fail_i
[i
][0].eq(bspec
.match_f_o
[i
])
645 # Connect Register File(s)
647 comb
+= int_dest
.wen
.eq(intfus
.dest_rsel_o
)
648 comb
+= int_src1
.ren
.eq(intfus
.src1_rsel_o
)
649 comb
+= int_src2
.ren
.eq(intfus
.src2_rsel_o
)
651 # connect ALUs to regfile
652 comb
+= int_dest
.data_i
.eq(cu
.data_o
)
653 comb
+= cu
.src1_i
.eq(int_src1
.data_o
)
654 comb
+= cu
.src2_i
.eq(int_src2
.data_o
)
656 # connect ALU Computation Units
657 comb
+= cu
.go_rd_i
[0:n_intfus
].eq(go_rd_o
[0:n_intfus
])
658 comb
+= cu
.go_wr_i
[0:n_intfus
].eq(go_wr_o
[0:n_intfus
])
659 comb
+= cu
.issue_i
[0:n_intfus
].eq(fn_issue_o
[0:n_intfus
])
664 yield from self
.intregs
665 yield from self
.fpregs
666 yield self
.int_dest_i
667 yield self
.int_src1_i
668 yield self
.int_src2_i
670 yield self
.branch_succ_i
671 yield self
.branch_fail_i
672 yield self
.branch_direction_o
678 class IssueToScoreboard(Elaboratable
):
def __init__(self, qlen, n_in, n_out, rwid, opwid, n_regs):
    """ Inputs:

        * :qlen:   instruction queue length (passed to InstructionQ)
        * :n_in:   number of instruction input slots (data_i entries)
        * :n_out:  number of instruction output slots of the queue
        * :rwid:   bit width of register file(s)
        * :opwid:  operand bit width
        * :n_regs: depth of register file(s)

        Raises ValueError if qlen is not at least 1.
    """
    # every one of these is read back by elaborate()
    self.qlen = qlen
    self.n_in = n_in
    self.n_out = n_out
    self.rwid = rwid
    self.opw = opwid
    self.n_regs = n_regs

    if qlen < 1:
        raise ValueError("qlen must be at least 1, got %r" % (qlen,))
    # floor(log2(qlen)) + 2 bits, unsigned.  bit_length() gives the exact
    # integer result, with no floating-point rounding and no need for
    # math.log to be in scope.
    mqbits = (qlen.bit_length() - 1 + 2, False)
    self.p_add_i = Signal(mqbits)  # instructions to add (from data_i)
    self.p_ready_o = Signal()  # instructions were added
    self.data_i = Instruction.nq(n_in, "data_i", rwid, opwid)

    self.busy_o = Signal(reset_less=True)  # at least one CU is busy
    self.qlen_o = Signal(mqbits, reset_less=True)
696 def elaborate(self
, platform
):
701 iq
= InstructionQ(self
.rwid
, self
.opw
, self
.qlen
, self
.n_in
, self
.n_out
)
702 sc
= Scoreboard(self
.rwid
, self
.n_regs
)
703 mem
= TestMemory(self
.rwid
, 8) # not too big, takes too long
706 m
.submodules
.mem
= mem
708 # get at the regfile for testing
709 self
.intregs
= sc
.intregs
711 # and the "busy" signal and instruction queue length
712 comb
+= self
.busy_o
.eq(sc
.busy_o
)
713 comb
+= self
.qlen_o
.eq(iq
.qlen_o
)
715 # link up instruction queue
716 comb
+= iq
.p_add_i
.eq(self
.p_add_i
)
717 comb
+= self
.p_ready_o
.eq(iq
.p_ready_o
)
718 for i
in range(self
.n_in
):
719 comb
+= eq(iq
.data_i
[i
], self
.data_i
[i
])
721 # take instruction and process it. note that it's possible to
722 # "inspect" the queue contents *without* actually removing the
723 # items. items are only removed when the
726 wait_issue_br
= Signal()
727 wait_issue_alu
= Signal()
728 wait_issue_ls
= Signal()
730 with m
.If(wait_issue_br | wait_issue_alu | wait_issue_ls
):
731 # set instruction pop length to 1 if the unit accepted
732 with m
.If(wait_issue_ls
& (sc
.lsissue
.fn_issue_o
!= 0)):
733 with m
.If(iq
.qlen_o
!= 0):
734 comb
+= iq
.n_sub_i
.eq(1)
735 with m
.If(wait_issue_br
& (sc
.brissue
.fn_issue_o
!= 0)):
736 with m
.If(iq
.qlen_o
!= 0):
737 comb
+= iq
.n_sub_i
.eq(1)
738 with m
.If(wait_issue_alu
& (sc
.aluissue
.fn_issue_o
!= 0)):
739 with m
.If(iq
.qlen_o
!= 0):
740 comb
+= iq
.n_sub_i
.eq(1)
742 # see if some instruction(s) are here. note that this is
743 # "inspecting" the in-place queue. note also that on the
744 # cycle following "waiting" for fn_issue_o to be set, the
745 # "resetting" done above (insn_i=0) could be re-ASSERTed.
746 with m
.If(iq
.qlen_o
!= 0):
747 # get the operands and operation
748 imm
= iq
.data_o
[0].imm_i
749 dest
= iq
.data_o
[0].dest_i
750 src1
= iq
.data_o
[0].src1_i
751 src2
= iq
.data_o
[0].src2_i
752 op
= iq
.data_o
[0].oper_i
753 opi
= iq
.data_o
[0].opim_i
# immediate set
755 # set the src/dest regs
756 comb
+= sc
.int_dest_i
.eq(dest
)
757 comb
+= sc
.int_src1_i
.eq(src1
)
758 comb
+= sc
.int_src2_i
.eq(src2
)
759 comb
+= sc
.reg_enable_i
.eq(1) # enable the regfile
761 # choose a Function-Unit-Group
762 with m
.If((op
& (0x3<<2)) != 0): # branch
763 comb
+= sc
.br_oper_i
.eq(Cat(op
[0:2], opi
))
764 comb
+= sc
.br_imm_i
.eq(imm
)
765 comb
+= sc
.brissue
.insn_i
.eq(1)
766 comb
+= wait_issue_br
.eq(1)
767 with m
.Elif((op
& (0x3<<4)) != 0): # ld/st
772 comb
+= sc
.ls_oper_i
.eq(Cat(op
[0], opi
, op
[4:5]))
773 comb
+= sc
.ls_imm_i
.eq(imm
)
774 comb
+= sc
.lsissue
.insn_i
.eq(1)
775 comb
+= wait_issue_ls
.eq(1)
777 comb
+= sc
.alu_oper_i
.eq(Cat(op
[0:2], opi
))
778 comb
+= sc
.alu_imm_i
.eq(imm
)
779 comb
+= sc
.aluissue
.insn_i
.eq(1)
780 comb
+= wait_issue_alu
.eq(1)
783 # these indicate that the instruction is to be made
784 # shadow-dependent on
785 # (either) branch success or branch fail
786 #yield sc.branch_fail_i.eq(branch_fail)
787 #yield sc.branch_succ_i.eq(branch_success)
793 for o
in self
.data_i
:
811 def __init__(self
, rwidth
, nregs
):
813 self
.regs
= [0] * nregs
815 def op(self
, op
, op_imm
, imm
, src1
, src2
, dest
):
816 maxbits
= (1 << self
.rwidth
) - 1
817 src1
= self
.regs
[src1
] & maxbits
821 src2
= self
.regs
[src2
] & maxbits
829 val
= src1
>> (src2
& maxbits
)
831 val
= int(src1
> src2
)
833 val
= int(src1
< src2
)
835 val
= int(src1
== src2
)
837 val
= int(src1
!= src2
)
839 self
.setval(dest
, val
)
def setval(self, dest, val):
    """Record *val* as the expected content of register *dest*.

    The assignment is logged to stdout (value shown in hex) before the
    software register model is updated.
    """
    shown = hex(val)
    print("sim setval", dest, shown)
    self.regs[dest] = val
847 for i
, val
in enumerate(self
.regs
):
848 reg
= yield dut
.intregs
.regs
[i
].reg
849 okstr
= "OK" if reg
== val
else "!ok"
850 print("reg %d expected %x received %x %s" % (i
, val
, reg
, okstr
))
852 def check(self
, dut
):
853 for i
, val
in enumerate(self
.regs
):
854 reg
= yield dut
.intregs
.regs
[i
].reg
856 print("reg %d expected %x received %x\n" % (i
, val
, reg
))
857 yield from self
.dump(dut
)
860 def instr_q(dut
, op
, op_imm
, imm
, src1
, src2
, dest
,
861 branch_success
, branch_fail
):
862 instrs
= [{'oper_i': op
, 'dest_i': dest
, 'imm_i': imm
, 'opim_i': op_imm
,
863 'src1_i': src1
, 'src2_i': src2
}]
866 for idx
in range(sendlen
):
867 yield from eq(dut
.data_i
[idx
], instrs
[idx
])
868 di
= yield dut
.data_i
[idx
]
869 print ("senddata %d %x" % (idx
, di
))
870 yield dut
.p_add_i
.eq(sendlen
)
872 o_p_ready
= yield dut
.p_ready_o
875 o_p_ready
= yield dut
.p_ready_o
877 yield dut
.p_add_i
.eq(0)
880 def int_instr(dut
, op
, imm
, src1
, src2
, dest
, branch_success
, branch_fail
):
881 yield from disable_issue(dut
)
882 yield dut
.int_dest_i
.eq(dest
)
883 yield dut
.int_src1_i
.eq(src1
)
884 yield dut
.int_src2_i
.eq(src2
)
885 if (op
& (0x3<<2)) != 0: # branch
886 yield dut
.brissue
.insn_i
.eq(1)
887 yield dut
.br_oper_i
.eq(Const(op
& 0x3, 2))
888 yield dut
.br_imm_i
.eq(imm
)
889 dut_issue
= dut
.brissue
891 yield dut
.aluissue
.insn_i
.eq(1)
892 yield dut
.alu_oper_i
.eq(Const(op
& 0x3, 2))
893 yield dut
.alu_imm_i
.eq(imm
)
894 dut_issue
= dut
.aluissue
895 yield dut
.reg_enable_i
.eq(1)
897 # these indicate that the instruction is to be made shadow-dependent on
898 # (either) branch success or branch fail
899 yield dut
.branch_fail_i
.eq(branch_fail
)
900 yield dut
.branch_succ_i
.eq(branch_success
)
903 yield from wait_for_issue(dut
, dut_issue
)
def print_reg(dut, rnums):
    """Simulation generator: read and print the given integer registers.

    For each register number in *rnums*, yields
    ``dut.intregs.regs[rnum].reg`` and formats the value sent back as
    hex.  Once all registers are read, prints one line of the form
    ``reg 1,2,3: a,b,c``.
    """
    # materialise once: rnums is walked twice (reads, then the label), so
    # a one-shot iterator would otherwise print an empty register list
    rnums = list(rnums)
    rs = []
    for rnum in rnums:
        reg = yield dut.intregs.regs[rnum].reg
        rs.append("%x" % reg)
    labels = map(str, rnums)
    print("reg %s: %s" % (','.join(labels), ','.join(rs)))
def create_random_ops(dut, n_ops, shadowing=False, max_opnums=3):
    """Return a list of *n_ops* randomly generated instruction tuples.

    * :dut:        object providing ``n_regs`` and ``rwid``
    * :n_ops:      number of instructions to generate
    * :shadowing:  when True each tuple gets a trailing ``(0, 0)`` entry
    * :max_opnums: highest operation number that may be drawn (inclusive)

    Each tuple is ``(src1, src2, dest, op, opi, imm)``, plus ``(0, 0)``
    when *shadowing* is set.  Register numbers are drawn from
    1..n_regs-1 and the immediate from 1..(1<<rwid)-1.
    """
    insts = []
    for _ in range(n_ops):
        # the order of the randint() calls is kept fixed so that a given
        # seed keeps producing the same instruction stream
        src1 = randint(1, dut.n_regs-1)
        src2 = randint(1, dut.n_regs-1)
        imm = randint(1, (1 << dut.rwid)-1)
        dest = randint(1, dut.n_regs-1)
        op = randint(0, max_opnums)
        # immediate-mode flag: 1 only when the draw (0..2) comes out zero
        opi = 0 if randint(0, 2) else 1

        if shadowing:
            insts.append((src1, src2, dest, op, opi, imm, (0, 0)))
        else:
            insts.append((src1, src2, dest, op, opi, imm))
    return insts
932 def wait_for_busy_clear(dut
):
934 busy_o
= yield dut
.busy_o
def disable_issue(dut):
    """Simulation generator: de-assert insn_i on the ALU and branch groups.

    Yields two assignment commands, in order: ``aluissue.insn_i`` set to 0,
    then ``brissue.insn_i`` set to 0.
    """
    alu_group = dut.aluissue
    yield alu_group.insn_i.eq(0)
    branch_group = dut.brissue
    yield branch_group.insn_i.eq(0)
945 def wait_for_issue(dut
, dut_issue
):
947 issue_o
= yield dut_issue
.fn_issue_o
949 yield from disable_issue(dut
)
950 yield dut
.reg_enable_i
.eq(0)
953 #yield from print_reg(dut, [1,2,3])
955 #yield from print_reg(dut, [1,2,3])
957 def scoreboard_branch_sim(dut
, alusim
):
963 print ("rseed", iseed
)
967 yield dut
.branch_direction_o
.eq(0)
969 # set random values in the registers
970 for i
in range(1, dut
.n_regs
):
972 val
= randint(0, (1<<alusim
.rwidth
)-1)
973 yield dut
.intregs
.regs
[i
].reg
.eq(val
)
974 alusim
.setval(i
, val
)
977 # create some instructions: branches create a tree
978 insts
= create_random_ops(dut
, 1, True, 1)
979 #insts.append((6, 6, 1, 2, (0, 0)))
980 #insts.append((4, 3, 3, 0, (0, 0)))
982 src1
= randint(1, dut
.n_regs
-1)
983 src2
= randint(1, dut
.n_regs
-1)
985 op
= 4 # only BGT at the moment
987 branch_ok
= create_random_ops(dut
, 1, True, 1)
988 branch_fail
= create_random_ops(dut
, 1, True, 1)
990 insts
.append((src1
, src2
, (branch_ok
, branch_fail
), op
, (0, 0)))
994 insts
.append( (3, 5, 2, 0, (0, 0)) )
997 #branch_ok.append ( (5, 7, 5, 1, (1, 0)) )
998 branch_ok
.append( None )
999 branch_fail
.append( (1, 1, 2, 0, (0, 1)) )
1000 #branch_fail.append( None )
1001 insts
.append( (6, 4, (branch_ok
, branch_fail
), 4, (0, 0)) )
1003 siminsts
= deepcopy(insts
)
1005 # issue instruction(s)
1008 branch_direction
= 0
1013 branch_direction
= yield dut
.branch_direction_o
# way branch went
1014 (src1
, src2
, dest
, op
, (shadow_on
, shadow_off
)) = insts
.pop(0)
1015 if branch_direction
== 1 and shadow_on
:
1016 print ("skip", i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
)
1017 continue # branch was "success" and this is a "failed"... skip
1018 if branch_direction
== 2 and shadow_off
:
1019 print ("skip", i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
)
1020 continue # branch was "fail" and this is a "success"... skip
1021 if branch_direction
!= 0:
1026 branch_ok
, branch_fail
= dest
1028 # ok zip up the branch success / fail instructions and
1029 # drop them into the queue, one marked "to have branch success"
1030 # the other to be marked shadow branch "fail".
1031 # one out of each of these will be cancelled
1032 for ok
, fl
in zip(branch_ok
, branch_fail
):
1034 instrs
.append((ok
[0], ok
[1], ok
[2], ok
[3], (1, 0)))
1036 instrs
.append((fl
[0], fl
[1], fl
[2], fl
[3], (0, 1)))
1037 print ("instr %d: (%d, %d, %d, %d, (%d, %d))" % \
1038 (i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
))
1039 yield from int_instr(dut
, op
, src1
, src2
, dest
,
1040 shadow_on
, shadow_off
)
1042 # wait for all instructions to stop before checking
1044 yield from wait_for_busy_clear(dut
)
1048 instr
= siminsts
.pop(0)
1051 (src1
, src2
, dest
, op
, (shadow_on
, shadow_off
)) = instr
1055 branch_ok
, branch_fail
= dest
1057 print ("sim %d: (%d, %d, %d, %d, (%d, %d))" % \
1058 (i
, src1
, src2
, dest
, op
, shadow_on
, shadow_off
))
1059 branch_res
= alusim
.op(op
, src1
, src2
, dest
)
1062 siminsts
+= branch_ok
1064 siminsts
+= branch_fail
1067 yield from alusim
.check(dut
)
1068 yield from alusim
.dump(dut
)
1071 def scoreboard_sim(dut
, alusim
):
1077 # set random values in the registers
1078 for i
in range(1, dut
.n_regs
):
1079 val
= randint(0, (1<<alusim
.rwidth
)-1)
1082 yield dut
.intregs
.regs
[i
].reg
.eq(val
)
1083 alusim
.setval(i
, val
)
1085 # create some instructions (some random, some regression tests)
1088 instrs
= create_random_ops(dut
, 15, True, 4)
1091 instrs
.append( (1, 2, 2, 1, 1, 20, (0, 0)) )
1094 instrs
.append( (7, 3, 2, 4, (0, 0)) )
1095 instrs
.append( (7, 6, 6, 2, (0, 0)) )
1096 instrs
.append( (1, 7, 2, 2, (0, 0)) )
1099 instrs
.append((2, 3, 3, 0, 0, 0, (0, 0)))
1100 instrs
.append((5, 3, 3, 1, 0, 0, (0, 0)))
1101 instrs
.append((3, 5, 5, 2, 0, 0, (0, 0)))
1102 instrs
.append((5, 3, 3, 3, 0, 0, (0, 0)))
1103 instrs
.append((3, 5, 5, 0, 0, 0, (0, 0)))
1106 instrs
.append( (3, 3, 4, 0, 0, 13979, (0, 0)))
1107 instrs
.append( (6, 4, 1, 2, 0, 40976, (0, 0)))
1108 instrs
.append( (1, 4, 7, 4, 1, 23652, (0, 0)))
1111 instrs
.append((5, 6, 2, 1))
1112 instrs
.append((2, 2, 4, 0))
1113 #instrs.append((2, 2, 3, 1))
1116 instrs
.append((2, 1, 2, 3))
1119 instrs
.append((2, 6, 2, 1))
1120 instrs
.append((2, 1, 2, 0))
1123 instrs
.append((1, 2, 7, 2))
1124 instrs
.append((7, 1, 5, 0))
1125 instrs
.append((4, 4, 1, 1))
1128 instrs
.append((5, 6, 2, 2))
1129 instrs
.append((1, 1, 4, 1))
1130 instrs
.append((6, 5, 3, 0))
1133 # Write-after-Write Hazard
1134 instrs
.append( (3, 6, 7, 2) )
1135 instrs
.append( (4, 4, 7, 1) )
1138 # self-read/write-after-write followed by Read-after-Write
1139 instrs
.append((1, 1, 1, 1))
1140 instrs
.append((1, 5, 3, 0))
1143 # Read-after-Write followed by self-read-after-write
1144 instrs
.append((5, 6, 1, 2))
1145 instrs
.append((1, 1, 1, 1))
1148 # self-read-write sandwich
1149 instrs
.append((5, 6, 1, 2))
1150 instrs
.append((1, 1, 1, 1))
1151 instrs
.append((1, 5, 3, 0))
1154 # very weird failure
1155 instrs
.append( (5, 2, 5, 2) )
1156 instrs
.append( (2, 6, 3, 0) )
1157 instrs
.append( (4, 2, 2, 1) )
1161 yield dut
.intregs
.regs
[5].reg
.eq(v1
)
1162 alusim
.setval(5, v1
)
1163 yield dut
.intregs
.regs
[3].reg
.eq(5)
1165 instrs
.append((5, 3, 3, 4, (0, 0)))
1166 instrs
.append((4, 2, 1, 2, (0, 1)))
1170 yield dut
.intregs
.regs
[5].reg
.eq(v1
)
1171 alusim
.setval(5, v1
)
1172 yield dut
.intregs
.regs
[3].reg
.eq(5)
1174 instrs
.append((5, 3, 3, 4, (0, 0)))
1175 instrs
.append((4, 2, 1, 2, (1, 0)))
1178 instrs
.append( (4, 3, 5, 1, 0, (0, 0)) )
1179 instrs
.append( (5, 2, 3, 1, 0, (0, 0)) )
1180 instrs
.append( (7, 1, 5, 2, 0, (0, 0)) )
1181 instrs
.append( (5, 6, 6, 4, 0, (0, 0)) )
1182 instrs
.append( (7, 5, 2, 2, 0, (1, 0)) )
1183 instrs
.append( (1, 7, 5, 0, 0, (0, 1)) )
1184 instrs
.append( (1, 6, 1, 2, 0, (1, 0)) )
1185 instrs
.append( (1, 6, 7, 3, 0, (0, 0)) )
1186 instrs
.append( (6, 7, 7, 0, 0, (0, 0)) )
1188 # issue instruction(s), wait for issue to be free before proceeding
1189 for i
, instr
in enumerate(instrs
):
1190 src1
, src2
, dest
, op
, opi
, imm
, (br_ok
, br_fail
) = instr
1192 print ("instr %d: (%d, %d, %d, %d, %d, %d)" % \
1193 (i
, src1
, src2
, dest
, op
, opi
, imm
))
1194 alusim
.op(op
, opi
, imm
, src1
, src2
, dest
)
1195 yield from instr_q(dut
, op
, opi
, imm
, src1
, src2
, dest
,
1198 # wait for all instructions to stop before checking
1200 iqlen
= yield dut
.qlen_o
1208 yield from wait_for_busy_clear(dut
)
1211 yield from alusim
.check(dut
)
1212 yield from alusim
.dump(dut
)
def test_scoreboard():
    """Build the issue-queue + scoreboard DUT, dump RTLIL, run the sim.

    Writes ``test_scoreboard6600.il`` (RTLIL of the design) and runs
    ``scoreboard_sim`` with a VCD trace in ``test_scoreboard6600.vcd``.
    """
    dut = IssueToScoreboard(2, 1, 1, 16, 8, 8)
    alusim = RegSim(16, 8)
    memsim = MemSim(16, 16)  # created but not yet handed to the simulation
    vl = rtlil.convert(dut, ports=dut.ports())
    with open("test_scoreboard6600.il", "w") as f:
        # the converted design must actually be written out; an empty
        # "with" body would leave a zero-length file
        f.write(vl)

    run_simulation(dut, scoreboard_sim(dut, alusim),
                   vcd_name='test_scoreboard6600.vcd')

    #run_simulation(dut, scoreboard_branch_sim(dut, alusim),
    #                    vcd_name='test_scoreboard6600.vcd')
1230 if __name__
== '__main__':