getting there with instruction overlapping
[soc.git] / src / experiment / cscore.py
1 from nmigen.compat.sim import run_simulation
2 from nmigen.cli import verilog, rtlil
3 from nmigen import Module, Const, Signal, Array, Cat, Elaboratable
4
5 from regfile.regfile import RegFileArray, treereduce
6 from scoreboard.fn_unit import IntFnUnit, FPFnUnit, LDFnUnit, STFnUnit
7 from scoreboard.fu_fu_matrix import FUFUDepMatrix
8 from scoreboard.fu_reg_matrix import FURegDepMatrix
9 from scoreboard.global_pending import GlobalPending
10 from scoreboard.group_picker import GroupPicker
11 from scoreboard.issue_unit import IntFPIssueUnit, RegDecode
12
13 from compalu import ComputationUnitNoDelay
14
15 from alu_hier import ALU
16 from nmutil.latch import SRLatch
17
18 from random import randint
19
20
class Scoreboard(Elaboratable):
    """Experimental scoreboard wiring for two integer computation units.

    The module instantiates an integer and an FP register file, two
    ALU-backed ComputationUnitNoDelay units, one IntFnUnit per computation
    unit, a two-way GroupPicker, four GlobalPending vectors, a RegDecode
    and an IntFPIssueUnit, and connects them together in elaborate().

    Note that ``int_insn_i`` (the instruction-select input) is only bound
    as an attribute while elaborate() runs; it does not exist on a freshly
    constructed instance.
    """
    def __init__(self, rwid, n_regs):
        """ Inputs:

            * :rwid:   bit width of register file(s) - both FP and INT
            * :n_regs: depth of register file(s) - number of FP and INT regs
        """
        self.rwid = rwid
        self.n_regs = n_regs

        # Register Files
        self.intregs = RegFileArray(rwid, n_regs)
        self.fpregs = RegFileArray(rwid, n_regs)

        # inputs
        self.int_store_i = Signal(reset_less=True) # instruction is a store
        self.int_dest_i = Signal(max=n_regs, reset_less=True) # Dest R# in
        self.int_src1_i = Signal(max=n_regs, reset_less=True) # oper1 R# in
        self.int_src2_i = Signal(max=n_regs, reset_less=True) # oper2 R# in

        self.issue_o = Signal(reset_less=True) # instruction was accepted

    def elaborate(self, platform):
        """Build and return the Module containing all scoreboard wiring.

        Side effect: binds ``self.int_insn_i`` to the issue unit's
        ``insn_i`` input so that test-benches can drive it.  Every call
        creates a fresh set of function units, pickers and pending vectors.
        """
        m = Module()

        m.submodules.intregs = self.intregs
        m.submodules.fpregs = self.fpregs

        # register ports
        int_dest = self.intregs.write_port("dest")
        int_src1 = self.intregs.read_port("src1")
        int_src2 = self.intregs.read_port("src2")

        # NOTE(review): the three FP ports are requested here but are not
        # referenced again anywhere in this method.
        fp_dest = self.fpregs.write_port("dest")
        fp_src1 = self.fpregs.read_port("src1")
        fp_src2 = self.fpregs.read_port("src2")

        # Int ALUs
        add = ALU(self.rwid)
        sub = ALU(self.rwid)
        m.submodules.comp1 = comp1 = ComputationUnitNoDelay(self.rwid, 1, add)
        m.submodules.comp2 = comp2 = ComputationUnitNoDelay(self.rwid, 1, sub)
        int_alus = [comp1, comp2]

        m.d.comb += comp1.oper_i.eq(Const(0)) # temporary/experiment: op=add
        m.d.comb += comp2.oper_i.eq(Const(1)) # temporary/experiment: op=sub

        # Int FUs
        if_l = []
        int_src1_pend_v = []
        int_src2_pend_v = []
        int_rd_pend_v = []
        int_wr_pend_v = []
        # one IntFnUnit per computation unit; the loop only needs the index,
        # the computation unit itself ("a") is not used in the body.
        for i, a in enumerate(int_alus):
            # set up Integer Function Unit, add to module (and python list)
            fu = IntFnUnit(self.n_regs, shadow_wid=0)
            setattr(m.submodules, "intfu%d" % i, fu)
            if_l.append(fu)
            # collate the read/write pending vectors (to go into global pending)
            int_src1_pend_v.append(fu.src1_pend_o)
            int_src2_pend_v.append(fu.src2_pend_o)
            int_rd_pend_v.append(fu.int_rd_pend_o)
            int_wr_pend_v.append(fu.int_wr_pend_o)
        # NOTE(review): int_fus is built but not used below; if_l is indexed
        # directly instead.
        int_fus = Array(if_l)

        # Count of number of FUs
        n_int_fus = len(if_l)
        n_fp_fus = 0 # for now

        # NOTE(review): n_fus is computed but not used below.
        n_fus = n_int_fus + n_fp_fus # plus FP FUs

        # XXX replaced by array of FUs? *FnUnit
        # # Integer FU-FU Dep Matrix
        # m.submodules.intfudeps = FUFUDepMatrix(n_int_fus, n_int_fus)
        # Integer FU-Reg Dep Matrix
        # intregdeps = FURegDepMatrix(self.n_regs, n_int_fus)
        # m.submodules.intregdeps = intregdeps

        # Integer Priority Picker 1: Adder + Subtractor
        intpick1 = GroupPicker(2) # picks between add and sub
        m.submodules.intpick1 = intpick1

        # Global Pending Vectors (INT and FP)
        # NOTE: number of vectors is NOT same as number of FUs.
        g_int_src1_pend_v = GlobalPending(self.n_regs, int_src1_pend_v)
        g_int_src2_pend_v = GlobalPending(self.n_regs, int_src2_pend_v)
        g_int_rd_pend_v = GlobalPending(self.n_regs, int_rd_pend_v, True)
        g_int_wr_pend_v = GlobalPending(self.n_regs, int_wr_pend_v, True)
        m.submodules.g_int_src1_pend_v = g_int_src1_pend_v
        m.submodules.g_int_src2_pend_v = g_int_src2_pend_v
        m.submodules.g_int_rd_pend_v = g_int_rd_pend_v
        m.submodules.g_int_wr_pend_v = g_int_wr_pend_v

        # INT/FP Issue Unit
        regdecode = RegDecode(self.n_regs)
        m.submodules.regdecode = regdecode
        issueunit = IntFPIssueUnit(self.n_regs, n_int_fus, n_fp_fus)
        m.submodules.issueunit = issueunit

        #---------
        # ok start wiring things together...
        # "now hear de word of de looord... dem bones dem bones dem dryy bones"
        # https://www.youtube.com/watch?v=pYb8Wm6-QfA
        #---------

        #---------
        # Issue Unit is where it starts.  set up some in/outs for this module
        #---------
        # the register decoder is permanently enabled; only its decoded
        # destination (dest_o) is forwarded to the issue unit here.
        m.d.comb += [issueunit.i.store_i.eq(self.int_store_i),
                     regdecode.dest_i.eq(self.int_dest_i),
                     regdecode.src1_i.eq(self.int_src1_i),
                     regdecode.src2_i.eq(self.int_src2_i),
                     regdecode.enable_i.eq(1),
                     issueunit.i.dest_i.eq(regdecode.dest_o),
                     self.issue_o.eq(issueunit.issue_o)
                    ]
        # exposed as an attribute (not created in __init__): only available
        # once elaborate() has been called.
        self.int_insn_i = issueunit.i.insn_i # enabled by instruction decode

        # connect global rd/wr pending vectors
        m.d.comb += issueunit.i.g_wr_pend_i.eq(g_int_wr_pend_v.g_pend_o)
        # TODO: issueunit.f (FP)

        # and int function issue / busy arrays, and dest/src1/src2
        fn_issue_l = []
        fn_busy_l = []
        for i, fu in enumerate(if_l):
            fn_issue_l.append(fu.issue_i)
            fn_busy_l.append(fu.busy_o)
            # issue is registered (sync): the FU sees it one clock after the
            # issue unit asserts fn_issue_o[i]
            m.d.sync += fu.issue_i.eq(issueunit.i.fn_issue_o[i])
            # every FU receives the same raw (undecoded) register numbers
            m.d.comb += fu.dest_i.eq(self.int_dest_i)
            m.d.comb += fu.src1_i.eq(self.int_src1_i)
            m.d.comb += fu.src2_i.eq(self.int_src2_i)
            # XXX sync, so as to stop a simulation infinite loop
            # NOTE(review): the assignment directly below is comb, not sync;
            # the sync assignment in this loop is fu.issue_i above - confirm
            # which statement the XXX remark refers to.
            m.d.comb += issueunit.i.busy_i[i].eq(fu.busy_o)

        #---------
        # connect Function Units
        #---------

        # XXX sync, again to avoid an infinite loop.  is it the right thing???

        # Group Picker... done manually for now.  TODO: cat array of pick sigs
        m.d.sync += if_l[0].go_rd_i.eq(intpick1.go_rd_o[0]) # add rd
        m.d.sync += if_l[0].go_wr_i.eq(intpick1.go_wr_o[0]) # add wr

        m.d.sync += if_l[1].go_rd_i.eq(intpick1.go_rd_o[1]) # subtract rd
        m.d.sync += if_l[1].go_wr_i.eq(intpick1.go_wr_o[1]) # subtract wr

        # Connect INT Fn Unit global wr/rd pending
        for fu in if_l:
            m.d.comb += fu.g_int_wr_pend_i.eq(g_int_wr_pend_v.g_pend_o)
            m.d.comb += fu.g_int_rd_pend_i.eq(g_int_rd_pend_v.g_pend_o)

        # Connect Picker
        #---------
        # release requests come from the computation units, readable/writable
        # qualifiers from the matching function units
        m.d.comb += intpick1.req_rel_i[0].eq(int_alus[0].req_rel_o)
        m.d.comb += intpick1.req_rel_i[1].eq(int_alus[1].req_rel_o)
        m.d.comb += intpick1.readable_i[0].eq(if_l[0].int_readable_o) # add rd
        m.d.comb += intpick1.writable_i[0].eq(if_l[0].int_writable_o) # add wr
        m.d.comb += intpick1.readable_i[1].eq(if_l[1].int_readable_o) # sub rd
        m.d.comb += intpick1.writable_i[1].eq(if_l[1].int_writable_o) # sub wr

        #---------
        # Connect Register File(s)
        #---------
        # write-enable is only driven while either FU has go_wr_i asserted
        with m.If(if_l[0].go_wr_i | if_l[1].go_wr_i):
            m.d.comb += int_dest.wen.eq(g_int_wr_pend_v.g_pend_o)
        #with m.If(intpick1.go_rd_o):
        # NOTE(review): with the guard above commented out, the read-enables
        # are driven unconditionally from the global src pending vectors -
        # confirm this is intended.
        m.d.comb += int_src1.ren.eq(g_int_src1_pend_v.g_pend_o)
        m.d.comb += int_src2.ren.eq(g_int_src2_pend_v.g_pend_o)

        # merge (OR) all integer FU / ALU outputs to a single value
        # bit of a hack: treereduce needs a list with an item named "dest_o"
        dest_o = treereduce(int_alus)
        m.d.comb += int_dest.data_i.eq(dest_o)

        # connect ALUs
        for i, alu in enumerate(int_alus):
            # go_rd/go_wr are registered copies of the picker outputs, the
            # same signals that also feed the function units above
            m.d.sync += alu.go_rd_i.eq(intpick1.go_rd_o[i])
            m.d.sync += alu.go_wr_i.eq(intpick1.go_wr_o[i])
            # fn_issue_l[i] is the matching function unit's issue_i
            m.d.comb += alu.issue_i.eq(fn_issue_l[i])
            #m.d.comb += fn_busy_l[i].eq(alu.busy_o)  # XXX ignore, use fnissue
            # both computation units read from the same two read ports
            m.d.comb += alu.src1_i.eq(int_src1.data_o)
            m.d.comb += alu.src2_i.eq(int_src2.data_o)
            m.d.comb += if_l[i].req_rel_i.eq(alu.req_rel_o) # pipe out ready

        return m


    def __iter__(self):
        """Yield everything both register files yield, then this module's
        own input signals and issue_o (int_insn_i is not included)."""
        yield from self.intregs
        yield from self.fpregs
        yield self.int_store_i
        yield self.int_dest_i
        yield self.int_src1_i
        yield self.int_src2_i
        yield self.issue_o
        #yield from self.int_src1
        #yield from self.int_dest
        #yield from self.int_src1
        #yield from self.int_src2
        #yield from self.fp_dest
        #yield from self.fp_src1
        #yield from self.fp_src2

    def ports(self):
        """Return everything produced by __iter__ as a list."""
        return list(self)
228
# Opcode numbers understood by RegSim.op().  The test-bench also uses them
# as the index of the instruction-select bit it raises on the DUT.
IADD = 0
ISUB = 1


class RegSim:
    """Pure-python reference model of the integer register file.

    Every instruction issued to the hardware is mirrored here so that the
    expected register contents can be compared against the simulated ones.
    """

    def __init__(self, rwidth, nregs):
        """
        * :rwidth: register width in bits; results of op() wrap to this width
        * :nregs:  number of registers, all initialised to zero
        """
        self.rwidth = rwidth
        self.regs = [0] * nregs

    def op(self, op, src1, src2, dest):
        """Apply ``regs[dest] = regs[src1] <op> regs[src2]``.

        * :op:   IADD or ISUB
        * :src1: register number of the first operand
        * :src2: register number of the second operand
        * :dest: register number receiving the (width-masked) result

        Raises ValueError for any other opcode, without modifying the
        register model.
        """
        lhs = self.regs[src1]
        rhs = self.regs[src2]
        mask = (1 << self.rwidth) - 1
        if op == IADD:
            val = (lhs + rhs) & mask
        elif op == ISUB:
            val = (lhs - rhs) & mask
        else:
            # an unknown opcode previously surfaced as an UnboundLocalError
            raise ValueError("unknown integer op %r" % (op,))
        self.regs[dest] = val

    def setval(self, dest, val):
        """Store ``val`` directly into register ``dest`` (no masking)."""
        self.regs[dest] = val

    def dump(self, dut):
        """Simulation generator: print expected vs. received for every reg.

        Yields ``dut.intregs.regs[i].reg`` for each register; the value sent
        back into the generator is treated as that register's contents.
        """
        for i, val in enumerate(self.regs):
            reg = yield dut.intregs.regs[i].reg
            okstr = "OK" if reg == val else "!ok"
            print("reg %d expected %x received %x %s" % (i, val, reg, okstr))

    def check(self, dut):
        """Simulation generator: verify the DUT registers match the model.

        On the first mismatch the offending register is printed, a full
        dump() follows and AssertionError is raised.  The exception is
        raised explicitly so that the check still fires under ``python -O``.
        """
        for i, val in enumerate(self.regs):
            reg = yield dut.intregs.regs[i].reg
            if reg != val:
                print("reg %d expected %x received %x\n" % (i, val, reg))
                yield from self.dump(dut)
                raise AssertionError(
                    "reg %d expected %x received %x" % (i, val, reg))
262
def int_instr(dut, alusim, op, src1, src2, dest):
    """Simulation generator: present one integer instruction to the DUT.

    All bits of ``dut.int_insn_i`` are cleared, the destination and source
    register numbers are driven, and finally only bit ``op`` of
    ``dut.int_insn_i`` is set.  The same operation is then applied to the
    reference model ``alusim``.
    """
    # start from a clean slate: no instruction selected
    width = len(dut.int_insn_i)
    bit = 0
    while bit < width:
        yield dut.int_insn_i[bit].eq(0)
        bit += 1

    # register numbers, in the order dest, src1, src2
    operands = (
        (dut.int_dest_i, dest),
        (dut.int_src1_i, src1),
        (dut.int_src2_i, src2),
    )
    for port, regnum in operands:
        yield port.eq(regnum)

    # select the requested operation and mirror it in the software model
    yield dut.int_insn_i[op].eq(1)
    alusim.op(op, src1, src2, dest)
271
272
def print_reg(dut, rnums):
    """Simulation generator: print the listed integer registers in hex.

    For every register number in ``rnums`` the signal
    ``dut.intregs.regs[rnum].reg`` is yielded; the value sent back into the
    generator is formatted as hexadecimal.  One line is printed containing
    the comma-separated register numbers followed by the comma-separated
    values.
    """
    hexvals = []
    for rnum in rnums:
        value = yield dut.intregs.regs[rnum].reg
        hexvals.append("%x" % value)
    labels = ','.join(map(str, rnums))
    values = ','.join(hexvals)
    print ("reg %s: %s" % (labels, values))
280
281
def scoreboard_sim(dut, alusim):
    """Simulation generator driving the scoreboard with two instructions.

    * :dut:    the Scoreboard under test (must already be elaborated, as
               ``dut.int_insn_i`` is accessed)
    * :alusim: software register model, kept in step with the hardware

    Registers 1..n_regs-1 are preloaded with their own index in both the
    DUT and the model.  Two instructions are then issued one after the
    other; each one is held until ``dut.issue_o`` reads non-zero.  After a
    fixed number of idle clocks the DUT registers are checked against the
    model and dumped.
    """
    yield dut.int_store_i.eq(0)

    # preload: register i holds the value i (register 0 is left alone)
    for i in range(1, dut.n_regs):
        yield dut.intregs.regs[i].reg.eq(i)
        alusim.setval(i, i)

    if False: # disabled hand-written sequence, kept for experimentation
        yield from int_instr(dut, alusim, IADD, 4, 3, 5)
        yield from print_reg(dut, [3,4,5])
        yield
        yield from int_instr(dut, alusim, IADD, 5, 2, 5)
        yield from print_reg(dut, [3,4,5])
        yield
        yield from int_instr(dut, alusim, ISUB, 5, 1, 3)
        yield from print_reg(dut, [3,4,5])
        yield
        for bit in range(len(dut.int_insn_i)):
            yield dut.int_insn_i[bit].eq(0)
        yield from print_reg(dut, [3,4,5])
        yield
        yield from print_reg(dut, [3,4,5])
        yield
        yield from print_reg(dut, [3,4,5])
        yield

        yield from alusim.check(dut)

    for i in range(2):
        # random operands; the fixed-pattern section further down currently
        # overrides all of them
        src1 = randint(1, dut.n_regs-1)
        src2 = randint(1, dut.n_regs-1)
        # a retry loop forcing dest to differ from both sources used to sit
        # here but was short-circuited by an unconditional break, so a
        # single draw is all that ever happened
        dest = randint(1, dut.n_regs-1)
        op = randint(0, 1)

        if False: # disabled fixed pattern
            if i % 2 == 0:
                src1 = 6
                src2 = 6
                dest = 1
            else:
                src1 = 1
                src2 = 7
                dest = 2
            #src1 = 2
            #src2 = 3
            #dest = 2

            op = i

        if True: # active fixed pattern: two adds
            if i == 0:
                src1 = 2
                src2 = 3
                dest = 3
            else:
                src1 = 5
                src2 = 4
                dest = 7

            #op = (i+1) % 2
            op = 0

        print("random %d: %d %d %d %d\n" % (i, op, src1, src2, dest))
        yield from int_instr(dut, alusim, op, src1, src2, dest)
        yield from print_reg(dut, [3,4,5])

        # hold the instruction until the issue unit reports it as accepted
        while True:
            yield
            issue_o = yield dut.issue_o
            if issue_o:
                yield from print_reg(dut, [3,4,5])
                # deselect the instruction; a separate index name keeps the
                # outer instruction counter untouched
                for bit in range(len(dut.int_insn_i)):
                    yield dut.int_insn_i[bit].eq(0)
                yield
                break
            # plain string: the old trailing comma made this print a tuple
            print("busy")
            yield from print_reg(dut, [3,4,5])

    # let things settle: four clocks with a register print after each ...
    for _ in range(4):
        yield
        yield from print_reg(dut, [3,4,5])
    # ... followed by nine silent clocks
    for _ in range(9):
        yield

    yield from alusim.check(dut)
    yield from alusim.dump(dut)
382
383
def explore_groups(dut):
    """Elaborate ``dut`` and print the result of running nmigen's
    LHSGroupAnalyzer over the statements of the elaborated object.

    NOTE(review): the Fragment obtained from Fragment.get() is not used
    afterwards; the analyzer is fed ``_statements`` of the object returned
    by ``dut.elaborate()`` - confirm this is what was intended.
    """
    from nmigen.hdl.ir import Fragment
    from nmigen.hdl.xfrm import LHSGroupAnalyzer

    elaborated = dut.elaborate(platform=None)
    # kept for parity with the existing behaviour; result currently unused
    fr = Fragment.get(elaborated, platform=None)

    analyzer = LHSGroupAnalyzer()
    lhs_groups = analyzer(elaborated._statements)

    print (lhs_groups)
394
395
def test_scoreboard():
    """Build a 32-bit, 8-register Scoreboard, write its RTLIL to
    ``test_scoreboard.il`` and run scoreboard_sim against it, producing
    ``test_scoreboard.vcd``.
    """
    rwid, n_regs = 32, 8
    dut = Scoreboard(rwid, n_regs)
    alusim = RegSim(rwid, n_regs)

    # conversion runs before the simulation, as it did originally
    il_text = rtlil.convert(dut, ports=dut.ports())
    with open("test_scoreboard.il", "w") as il_file:
        il_file.write(il_text)

    sim_process = scoreboard_sim(dut, alusim)
    run_simulation(dut, sim_process, vcd_name='test_scoreboard.vcd')
405
406
# run the scoreboard test-bench when this file is executed as a script
if __name__ == '__main__':
    test_scoreboard()