1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display
from nmutil.latch import SRLatch

#from nmutil.plru import PLRU
from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface
from soc.minerva.units.fetch import FetchUnitInterface


# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap
from nmigen.cli import main, rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 64
# Number of ways
NUM_WAYS = 2
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit field counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
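# (i.e. TAG_BITS rounded up to a whole number of bytes:
#  44 -> 48 with the default geometry above)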

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

126 print("BRAM_ROWS =", BRAM_ROWS)
127 print("INDEX_BITS =", INDEX_BITS)
128 print("INSN_BITS =", INSN_BITS)
129 print("INSN_PER_ROW =", INSN_PER_ROW)
130 print("LINE_SIZE =", LINE_SIZE)
131 print("LINE_OFF_BITS =", LINE_OFF_BITS)
132 print("LOG_LENGTH =", LOG_LENGTH)
133 print("NUM_LINES =", NUM_LINES)
134 print("NUM_WAYS =", NUM_WAYS)
135 print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
136 print("ROW_BITS =", ROW_BITS)
137 print("ROW_OFF_BITS =", ROW_OFF_BITS)
138 print("ROW_LINE_BITS =", ROW_LINE_BITS)
139 print("ROW_PER_LINE =", ROW_PER_LINE)
140 print("ROW_SIZE =", ROW_SIZE)
141 print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
142 print("SET_SIZE_BITS =", SET_SIZE_BITS)
143 print("SIM =", SIM)
144 print("TAG_BITS =", TAG_BITS)
145 print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
146 print("TAG_BITS =", TAG_BITS)
147 print("TLB_BITS =", TLB_BITS)
148 print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
149 print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
150 print("TLB_PTE_BITS =", TLB_PTE_BITS)
151 print("TLB_SIZE =", TLB_SIZE)
152 print("WAY_BITS =", WAY_BITS)
153
# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0
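# e.g. ispow2(64) is True, ispow2(48) is False, ispow2(0) is False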

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)
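#
# With the actual geometry above (64 lines of 64 bytes, 8-byte rows),
# the 56-bit real address decodes instead as:
#
#   real_addr[0:3]   ROW_OFF_BITS  (3)  - byte within an 8-byte row
#   real_addr[3:6]   ROW_LINE_BITS (3)  - row within a line
#   real_addr[6:12]  INDEX_BITS    (6)  - cache line (set) index
#   real_addr[12:56] TAG_BITS      (44) - tag, compared for a hit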

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBRecord(name):
    tlb_layout = [ ('tag', TLB_EA_TAG_BITS),
                   ('pte', TLB_PTE_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBArray():
    return Array(TLBRecord("tlb%d" % x) for x in range(TLB_SIZE))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether the address is in the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
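# e.g. with ROW_LINE_BITS=3, next_row(0b010_111) is 0b010_000: the
# row-within-line bits wrap while the upper (line index) bits stay put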

# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
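# with the default 64-bit rows, INSN_BITS is 1 and addr[2] simply
# selects the low or high 32-bit instruction of the row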

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
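# with TLB_LG_PGSZ=12 and TLB_BITS=6 this XOR-folds ea[12:18],
# ea[18:24] and ea[24:30] down to a 6-bit direct-mapped TLB index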


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(FetchUnitInterface, Elaboratable):
    """64 bit set-associative icache. All instructions are 4B aligned."""
    def __init__(self, pspec):
        FetchUnitInterface.__init__(self, pspec)
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache_wb")

        self.log_out = Signal(54)

        # use FetchUnitInterface, helps keep some unit tests running
        self.use_fetch_iface = False

    def use_fetch_interface(self):
        self.use_fetch_iface = True

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
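            # note wr_sel is all-ones or all-zeroes: a reload always
            # writes the full row, never a partial (per-byte) write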

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if NUM_WAYS == 0:
            return

        m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index) # select victim
        comb += plru_victim.eq(plru.o_index) # selected victim

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        # use an *asynchronous* Memory read port here (combinatorial)
        m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
        tlb = TLBRecord("tlb_rdport")
        pte, ttag = tlb.pte, tlb.tag

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += rd_tlb.addr.eq(tlb_req_index)
        comb += tlb.eq(rd_tlb.data)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid.q.bit_select(tlb_req_index, 1))

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb, itlb_valid):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)
        wr_unary = Signal(TLB_SIZE)
        comb += wr_index.eq(hash_ea(m_in.addr))
        comb += wr_unary.eq(1<<wr_index)
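        # wr_unary is the one-hot form of wr_index, used to set/reset
        # exactly one bit of the itlb_valid SRLatch (e.g. index 3 -> bit 3)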

        m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            comb += itlb_valid.r.eq(-1)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            comb += itlb_valid.r.eq(wr_unary)

        with m.Elif(m_in.tlbld):
            tlb = TLBRecord("tlb_wrport")
            comb += tlb.tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            comb += tlb.pte.eq(m_in.pte)
            comb += wr_tlb.en.eq(1)
            comb += wr_tlb.addr.eq(wr_index)
            comb += wr_tlb.data.eq(tlb)
            comb += itlb_valid.s.eq(wr_unary)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valids, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb
        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE]
                          )
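        # hitcond lets a fetch hit on a row of the line *currently
        # being reloaded*, as soon as that row's valid flag is set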
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += rd_tag.addr.eq(req_index)
        comb += ctag.eq(rd_tag.data)
        comb += cvb.eq(cache_valids.q.word_select(req_index, NUM_WAYS))
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                "cache miss nia:%x IR:%x SM:%x idx:%x "
                " way:%x tag:%x RA:%x", i_in.nia,
                i_in.virt_mode, i_in.stop_mark, req_index,
                replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            cache_valids):
        comb = m.d.comb
        sync = m.d.sync
        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
                                            granularity=TAG_BITS)

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        idx = req_index*NUM_WAYS + replace_way # 2D index, 1st dim: NUM_WAYS
        comb += cache_valids.r.eq(1<<idx)

        # use write-port "granularity" to select the tag to write to
        # TODO: the Memory should be multiplied-up (by NUM_TAGS)
        tagset = Signal(TAG_RAM_WIDTH)
        comb += tagset.eq(r.store_tag << (replace_way*TAG_BITS))
        comb += wr_tag.en.eq(1<<replace_way)
        comb += wr_tag.addr.eq(r.store_index)
        comb += wr_tag.data.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_valids, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid, unless an invalidation
                # arrived while it was being reloaded (previously
                # "valid" was computed but never used: gate the set)
                idx = r.store_index*NUM_WAYS + replace_way # 2D index again
                valid = r.store_valid & ~inval_in
                with m.If(valid):
                    comb += cache_valids.s.eq(1<<idx)
                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_valids, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            comb += cache_valids.r.eq(-1)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index,
                                             cache_valids)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_valids, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
            ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Cache-Ways "valid" indicators.  This is a 2D Signal, by the
        # number of ways and the number of lines.
        vec = SRLatch(sync=True, llen=NUM_WAYS*NUM_LINES, name="cachevalids")
        m.submodules.cache_valids = cache_valids = vec
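        # the flattened 1D bit position is (line*NUM_WAYS + way),
        # matching the "2D index" calculations in the miss handlers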

        # TLB Array
        itlb = TLBArray()
        vec = SRLatch(sync=False, llen=TLB_SIZE, name="tlbvalids")
        m.submodules.itlb_valids = itlb_valid = vec

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        self.tlbmem = Memory(depth=TLB_SIZE, width=TLB_EA_TAG_BITS+TLB_PTE_BITS)
        self.tagmem = Memory(depth=NUM_LINES, width=TAG_RAM_WIDTH)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb, itlb_valid)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_valids,
                         access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way,
                         cache_valids,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        # don't connect up to FetchUnitInterface so that some unit tests
        # can continue to operate
        if not self.use_fetch_iface:
            return m

        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
        # so needs checking and iterative revising
        i_in, bus, i_out = self.i_in, self.bus, self.i_out
        comb += i_in.req.eq(self.a_i_valid)
        comb += i_in.nia.eq(self.a_pc_i)
        comb += self.stall_in.eq(self.a_stall_i)
        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
        comb += self.f_badaddr_o.eq(i_out.nia)
        comb += self.f_instr_o.eq(i_out.insn)
        comb += self.f_busy_o.eq(~i_out.valid) # probably

        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
        ibus = self.ibus
        comb += ibus.adr.eq(self.bus.adr)
        comb += ibus.dat_w.eq(self.bus.dat_w)
        comb += ibus.sel.eq(self.bus.sel)
        comb += ibus.cyc.eq(self.bus.cyc)
        comb += ibus.stb.eq(self.bus.stb)
        comb += ibus.we.eq(self.bus.we)

        comb += self.bus.dat_r.eq(ibus.dat_r)
        comb += self.bus.ack.eq(ibus.ack)
        if hasattr(ibus, "stall"):
            comb += self.bus.stall.eq(ibus.stall)
        else:
            # fake-up the wishbone stall signal to comply with pipeline mode
            # same thing is done in dcache.py
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_in.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    insn = yield i_out.insn
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=32,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=64,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
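    # (each 64-bit word i holds 2*i in the low half and 2*i+1 in the
    # high half: these are the values that icache_sim() asserts)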
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)