1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to the next multiple of 8)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64
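
# Worked example with the defaults above (illustrative only; the prints
# below report the live values):
#   ROW_SIZE = 8 bytes, ROW_PER_LINE = 8, BRAM_ROWS = 16*8 = 128
#   INSN_PER_ROW = 2, INSN_BITS = 1, ROW_BITS = 7, ROW_LINE_BITS = 3
#   LINE_OFF_BITS = 6, ROW_OFF_BITS = 3, INDEX_BITS = 4
#   SET_SIZE_BITS = 10, TAG_BITS = 56 - 10 = 46, TAG_WIDTH = 48
#   WAY_BITS = 2, TAG_RAM_WIDTH = 46*4 = 184
#   TLB_BITS = 6, TLB_EA_TAG_BITS = 64 - (12+6) = 46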

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)
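
# Concrete split with this file's defaults (16 lines of 64 bytes),
# illustrative only:
#   get_row(addr)   = addr[3:10]   -- 7-bit BRAM row within a way
#   get_index(addr) = addr[6:10]   -- 4-bit line (tag RAM) index
#   get_tag(addr)   = addr[10:56]  -- 46-bit tag
# e.g. real address 0x3F48 -> row 0x69, index 0xD, tag 0xF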

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags
def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS),  # one valid bit per way
                  ('tag', TAG_RAM_WIDTH),
                  ]
    return Array(Record(tag_layout, name="tag%d" % x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBArray():
    tlb_layout = [('valid', 1),
                  ('tag', TLB_EA_TAG_BITS),
                  ('pte', TLB_PTE_BITS)
                  ]
    return Array(Record(tlb_layout, name="tlb%d" % x) for x in range(TLB_SIZE))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether the address is in the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
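
# e.g. with ROW_LINE_BITS == 3: next_row(0b0100111) == 0b0100000 -- only
# the low 3 bits (row-within-line) wrap; the upper (line) bits are kept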

# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
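
# e.g. with INSN_BITS == 1 (two instructions per 64-bit row), address
# bit 2 alone selects the low or high 32-bit word of the row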

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row (ways are packed TAG_BITS apart,
# matching TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS)
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)
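
# Note: with TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS, way i occupies bits
# i*TAG_BITS to (i+1)*TAG_BITS-1 of a tagset row; selecting TAG_WIDTH-wide
# words here would overrun the row for the last way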

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
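
# e.g. with TLB_LG_PGSZ == 12 and TLB_BITS == 6 this XOR-folds effective
# address bits [12:18] ^ [18:24] ^ [24:30] into a 6-bit direct-mapped index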


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(Elaboratable):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache")

        self.log_out = Signal(54)

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if NUM_WAYS == 0:
            return

        m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index)  # select victim
        comb += plru_victim.eq(plru.o_index)  # selected victim
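
        # Note (inferred from the wiring above): PLRUs holds one PLRU per
        # line; 'index' picks which line's PLRU state is updated on a hit,
        # while 'isel' picks which line's victim way is presented on
        # o_index for use by the reload machine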

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb[tlb_req_index].pte)
        comb += ttag.eq(itlb[tlb_req_index].tag)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb[tlb_req_index].valid)

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)  # TLB_BITS-wide index (hash_ea output)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb[i].valid.eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb[wr_index].valid.eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb[wr_index].pte.eq(m_in.pte)
            sync += itlb[wr_index].valid.eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(Const(0, ROW_OFF_BITS),
                                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE])
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += ctag.eq(cache_tags[req_index].tag)
        comb += cvb.eq(cache_tags[req_index].valid)
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                     "cache miss nia:%x IR:%x SM:%x idx:%x "
                     " way:%x tag:%x RA:%x", i_in.nia,
                     i_in.virt_mode, i_in.stop_mark, req_index,
                     replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            tagset, cache_tags):
        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        cv = Signal(NUM_WAYS)  # one valid bit per way
        comb += cv.eq(cache_tags[req_index].valid)
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_tags[req_index].valid.eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index].tag)
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].tag.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_tags, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                cv = Signal(NUM_WAYS)  # one valid bit per way
                comb += cv.eq(cache_tags[r.store_index].valid)
                comb += cv.bit_select(replace_way, 1).eq(
                            r.store_valid & ~inval_in)
                sync += cache_tags[r.store_index].valid.eq(cv)

                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

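        # note: sel is all-ones (full 64-bit reads); adr drops the 3 LSBs
        # of the byte address because the wishbone bus is 64-bit (8-byte)
        # word-addressed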
        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_tags[i].valid.eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index, tagset, cache_tags)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_tags, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()

        # TLB Array
        itlb = TLBArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)  # TLB_BITS-wide index
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # fake-up the wishbone stall signal to comply with pipeline mode
        # same thing is done in dcache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
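    # (so each 64-bit word holds 2*i in the low half and 2*i+1 in the high
    # half, and the expected "instruction" at byte address A is A // 4 --
    # which is what icache_sim asserts)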
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)