1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface
from soc.minerva.units.fetch import FetchUnitInterface


# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 64
# Number of ways
NUM_WAYS = 2
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32-bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit field counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
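# (i.e. TAG_BITS rounded up to a multiple of 8: 44 -> 48 with the defaults)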

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
145 print("TAG_BITS =", TAG_BITS)
146 print("TLB_BITS =", TLB_BITS)
147 print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
148 print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
149 print("TLB_PTE_BITS =", TLB_PTE_BITS)
150 print("TLB_SIZE =", TLB_SIZE)
151 print("WAY_BITS =", WAY_BITS)
152
# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

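# With the default geometry above (LINE_SIZE=64, ROW_SIZE=8, NUM_LINES=64,
# NUM_WAYS=2, REAL_ADDR_BITS=56) these work out to:
#
#   ROW_PER_LINE  = 8    INSN_PER_ROW = 2    BRAM_ROWS     = 512
#   INSN_BITS     = 1    ROW_BITS     = 9    ROW_LINE_BITS = 3
#   LINE_OFF_BITS = 6    ROW_OFF_BITS = 3    INDEX_BITS    = 6
#   SET_SIZE_BITS = 12   TAG_BITS     = 44   WAY_BITS      = 1
#
# i.e. a 56-bit real address decomposes as tag[44] | index[6] | offset[6].
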
# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" %x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBValidArray():
    return Array(Signal(name="tlb_valid%d" % x)
                 for x in range(TLB_SIZE))

def TLBRecord(name):
    tlb_layout = [ ('tag', TLB_EA_TAG_BITS),
                   ('pte', TLB_PTE_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBArray():
    return Array(TLBRecord("tlb%d" % x) for x in range(TLB_SIZE))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
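# e.g. with ROW_LINE_BITS=3 the adder is only 3 bits wide: row 0b110_111
# increments to 0b110_000, wrapping within the same cache line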

# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
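# e.g. with INSN_PER_ROW=2 (64-bit rows) addr bit 2 selects the low or
# high 32-bit word of the row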

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
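# e.g. with TLB_LG_PGSZ=12 and TLB_BITS=6 this XOR-folds effective-address
# bits [12:18], [18:24] and [24:30] into a 6-bit direct-mapped TLB index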


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2
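    # Transitions: IDLE latches the reload request and starts the wishbone
    # cycle on a miss (-> CLR_TAG); CLR_TAG picks the victim way, clears its
    # valid bit and writes the new tag (-> WAIT_ACK); WAIT_ACK streams the
    # line in one row per ack and returns to IDLE after the last row.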


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(FetchUnitInterface, Elaboratable):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self, pspec):
        FetchUnitInterface.__init__(self, pspec)
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache_wb")

        self.log_out = Signal(54)

        # use FetchUnitInterface, helps keep some unit tests running
        self.use_fetch_iface = False

    def use_fetch_interface(self):
        self.use_fetch_iface = True

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if NUM_WAYS <= 1:
            return  # no victim selection needed with a single way

        m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index) # select victim
        comb += plru_victim.eq(plru.o_index) # selected victim

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        # use an *asynchronous* Memory read port here (combinatorial)
        m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
        tlb = TLBRecord("tlb_rdport")
        pte, ttag = tlb.pte, tlb.tag

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += rd_tlb.addr.eq(tlb_req_index)
        comb += tlb.eq(rd_tlb.data)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid[tlb_req_index])

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb, itlb_valid):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            tlb = TLBRecord("tlb_wrport")
            comb += tlb.tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            comb += tlb.pte.eq(m_in.pte)
            comb += wr_tlb.en.eq(1)
            comb += wr_tlb.addr.eq(wr_index)
            comb += wr_tlb.data.eq(tlb)
            sync += itlb_valid[wr_index].eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valids, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb
        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE]
                          )
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += rd_tag.addr.eq(req_index)
        comb += ctag.eq(rd_tag.data)
        comb += cvb.eq(cache_valids.word_select(req_index, NUM_WAYS))
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                     "cache miss nia:%x IR:%x SM:%x idx:%x "
                     " way:%x tag:%x RA:%x", i_in.nia,
                     i_in.virt_mode, i_in.stop_mark, req_index,
                     replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read. We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            cache_valids):
        comb = m.d.comb
        sync = m.d.sync
        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
                                            granularity=TAG_BITS)

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        idx = req_index*NUM_WAYS + replace_way # 2D index, 1st dim: NUM_WAYS
        sync += cache_valids.bit_select(idx, 1).eq(0)

        # use write-port "granularity" to select the tag to write to
        # TODO: the Memory should be multiplied-up (by NUM_TAGS)
        tagset = Signal(TAG_RAM_WIDTH)
        comb += tagset.eq(r.store_tag << (replace_way*TAG_BITS))
        comb += wr_tag.en.eq(1<<replace_way)
        comb += wr_tag.addr.eq(r.store_index)
        comb += wr_tag.data.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_valids, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word? We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                idx = r.store_index*NUM_WAYS + replace_way # 2D index again
                valid = r.store_valid & ~inval_in
                sync += cache_valids.bit_select(idx, 1).eq(valid)
                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_valids, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            sync += cache_valids.eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index,
                                             cache_valids)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_valids, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                    ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Cache-Ways "valid" indicators. This is a 2D Signal, by the
        # number of ways and the number of lines.
        cache_valids = Signal(NUM_WAYS*NUM_LINES)

        # TLB Array
        itlb = TLBArray()
        itlb_valid = TLBValidArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        self.tlbmem = Memory(depth=TLB_SIZE,
                             width=TLB_EA_TAG_BITS+TLB_PTE_BITS)
        self.tagmem = Memory(depth=NUM_LINES, width=TAG_RAM_WIDTH)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb, itlb_valid)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_valids,
                         access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way,
                         cache_valids,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        # don't connect up to FetchUnitInterface so that some unit tests
        # can continue to operate
        if not self.use_fetch_iface:
            return m

        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
        # so needs checking and iterative revising
        i_in, bus, i_out = self.i_in, self.bus, self.i_out
        comb += i_in.req.eq(self.a_i_valid)
        comb += i_in.nia.eq(self.a_pc_i)
        comb += self.stall_in.eq(self.a_stall_i)
        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
        comb += self.f_badaddr_o.eq(i_out.nia)
        comb += self.f_instr_o.eq(i_out.insn)
        comb += self.f_busy_o.eq(~i_out.valid) # probably

        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
        ibus = self.ibus
        comb += ibus.adr.eq(self.bus.adr)
        comb += ibus.dat_w.eq(self.bus.dat_w)
        comb += ibus.sel.eq(self.bus.sel)
        comb += ibus.cyc.eq(self.bus.cyc)
        comb += ibus.stb.eq(self.bus.stb)
        comb += ibus.we.eq(self.bus.we)

        comb += self.bus.dat_r.eq(ibus.dat_r)
        comb += self.bus.ack.eq(ibus.ack)
        if hasattr(ibus, "stall"):
            comb += self.bus.stall.eq(ibus.stall)
        else:
            # fake-up the wishbone stall signal to comply with pipeline mode
            # same thing is done in dcache.py
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_in.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert ~valid
    for i in range(30):
        yield
    yield
    insn = yield i_out.insn
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=32,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=64,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))
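    # each 64-bit word thus holds 2*i in its low half and 2*i+1 in its
    # high half, so the "instruction" at byte address A is simply A>>2
    # (0x4 -> 1, 0x8 -> 2, 0x40 -> 0x10), matching the assertions in
    # icache_sim() above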

    test_icache(mem)