1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """
28
from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface
from soc.minerva.units.fetch import FetchUnitInterface


# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap
from nmigen.cli import main, rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle

SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 64
# Number of ways
NUM_WAYS = 2
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

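# With the defaults (TLB_SIZE=64, TLB_LG_PGSZ=12) this gives TLB_BITS=6 and
# TLB_EA_TAG_BITS = 64 - (12 + 6) = 46, so each ITLB entry is a 110-bit
# {tag, pte} pair (see TLBRecord below) in a 64-entry direct-mapped table.
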
125 print("BRAM_ROWS =", BRAM_ROWS)
126 print("INDEX_BITS =", INDEX_BITS)
127 print("INSN_BITS =", INSN_BITS)
128 print("INSN_PER_ROW =", INSN_PER_ROW)
129 print("LINE_SIZE =", LINE_SIZE)
130 print("LINE_OFF_BITS =", LINE_OFF_BITS)
131 print("LOG_LENGTH =", LOG_LENGTH)
132 print("NUM_LINES =", NUM_LINES)
133 print("NUM_WAYS =", NUM_WAYS)
134 print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
135 print("ROW_BITS =", ROW_BITS)
136 print("ROW_OFF_BITS =", ROW_OFF_BITS)
137 print("ROW_LINE_BITS =", ROW_LINE_BITS)
138 print("ROW_PER_LINE =", ROW_PER_LINE)
139 print("ROW_SIZE =", ROW_SIZE)
140 print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
141 print("SET_SIZE_BITS =", SET_SIZE_BITS)
142 print("SIM =", SIM)
143 print("TAG_BITS =", TAG_BITS)
144 print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
145 print("TAG_BITS =", TAG_BITS)
146 print("TLB_BITS =", TLB_BITS)
147 print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
148 print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
149 print("TLB_PTE_BITS =", TLB_PTE_BITS)
150 print("TLB_SIZE =", TLB_SIZE)
151 print("WAY_BITS =", WAY_BITS)
152
# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

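# For reference, with the default geometry above (NUM_LINES=64, LINE_SIZE=64,
# ROW_SIZE=8, REAL_ADDR_BITS=56) the actual split is: ROW_OFF_BITS=3,
# ROW_LINE_BITS=3, LINE_OFF_BITS=6, INDEX_BITS=6, ROW_BITS=9,
# SET_SIZE_BITS=12 and TAG_BITS = 56 - 12 = 44, i.e.
# real_addr = {tag[43:0], index[5:0], line_offset[5:0]}.
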
# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags for a set
# into a single row (TAG_RAM_WIDTH wide).
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="tag%d" % x) \
                 for x in range(NUM_LINES))

def CacheValidsArray():
    return Array(Signal(NUM_WAYS, name="tag_valids%d" % x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" %x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBValidArray():
    return Array(Signal(name="tlb_valid%d" % x)
                 for x in range(TLB_SIZE))

def TLBRecord(name):
    tlb_layout = [ ('tag', TLB_EA_TAG_BITS),
                   ('pte', TLB_PTE_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBArray():
    return Array(TLBRecord("tlb%d" % x) for x in range(TLB_SIZE))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

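# With the default geometry these slices work out to
# get_index(addr) == addr[6:12] (6-bit line index) and
# get_row(addr) == addr[3:12] (9-bit BRAM row index).
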
# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether the address is in the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

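# Note: "last" here is the row-within-line index at which the reload stops
# (r.end_row_ix). The reload starts at the requested row and wraps around
# the line, so end_row_ix is one row before the starting row
# (mod ROW_PER_LINE) -- see icache_miss_idle below.
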
# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

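# e.g. with ROW_LINE_BITS=3, a row value of 0b101_111 (index 0b101, last
# row 0b111) becomes 0b101_000: the 3 low bits increment and wrap, the
# upper index bits pass through unchanged.
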
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

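# With INSN_PER_ROW=2 (64-bit rows) this is a single bit, addr[2], which
# selects the low or high 32-bit instruction within the 64-bit BRAM row.
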
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

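# Tags for all ways of a set are packed into one TAG_RAM_WIDTH-wide row:
# way i occupies bits [i*TAG_BITS : (i+1)*TAG_BITS]. With TAG_BITS=44 and
# NUM_WAYS=2 the row is 88 bits, way 0 in [0:44] and way 1 in [44:88].
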
# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh

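# With TLB_LG_PGSZ=12 and TLB_BITS=6 this folds the effective address as
#     hsh = addr[12:18] ^ addr[18:24] ^ addr[24:30]
# giving a 6-bit index into the 64-entry direct-mapped ITLB.
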

# Cache reload state machine
@unique
class State(Enum):
    IDLE     = 0
    CLR_TAG  = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way      = Signal(WAY_BITS)
        self.hit_nia      = Signal(64)
        self.hit_smark    = Signal()
        self.hit_valid    = Signal()

        # Cache miss state (reload state machine)
        self.state        = Signal(State, reset=State.IDLE)
        self.wb           = WBMasterOut("wb")
        self.req_adr      = Signal(64)
        self.store_way    = Signal(WAY_BITS)
        self.store_index  = Signal(INDEX_BITS)
        self.store_row    = Signal(ROW_BITS)
        self.store_tag    = Signal(TAG_BITS)
        self.store_valid  = Signal()
        self.end_row_ix   = Signal(ROW_LINE_BITS)
        self.rows_valid   = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()

class ICache(FetchUnitInterface, Elaboratable):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self, pspec):
        FetchUnitInterface.__init__(self, pspec)
        self.i_in           = Fetch1ToICacheType(name="i_in")
        self.i_out          = ICacheToDecode1Type(name="i_out")

        self.m_in           = MMUToICacheType(name="m_in")

        self.stall_in       = Signal()
        self.stall_out      = Signal()
        self.flush_in       = Signal()
        self.inval_in       = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache_wb")

        self.log_out        = Signal(54)

        # use FetchUnitInterface, helps keep some unit tests running
        self.use_fetch_iface = False

    def use_fetch_interface(self):
        self.use_fetch_iface = True

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out    = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel   = Signal(ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if NUM_WAYS == 0:
            return


        m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index) # select victim
        comb += plru_victim.eq(plru.o_index) # selected victim

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        # use an *asynchronous* Memory read port here (combinatorial)
        m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
        tlb = TLBRecord("tlb_rdport")
        pte, ttag = tlb.pte, tlb.tag

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += rd_tlb.addr.eq(tlb_req_index)
        comb += tlb.eq(rd_tlb.data)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid[tlb_req_index])

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb, itlb_valid):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_SIZE)
        comb += wr_index.eq(hash_ea(m_in.addr))

        m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            tlb = TLBRecord("tlb_wrport")
            comb += tlb.tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            comb += tlb.pte.eq(m_in.pte)
            comb += wr_tlb.en.eq(1)
            comb += wr_tlb.addr.eq(wr_index)
            comb += wr_tlb.data.eq(tlb)
            sync += itlb_valid[wr_index].eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valids, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb
        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit  = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                 ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE]
                          )
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += rd_tag.addr.eq(req_index)
        comb += ctag.eq(rd_tag.data)
        comb += cvb.eq(cache_valids[req_index])
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                     "cache miss nia:%x IR:%x SM:%x idx:%x "
                     " way:%x tag:%x RA:%x", i_in.nia,
                     i_in.virt_mode, i_in.stop_mark, req_index,
                     replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            cache_valids):
        comb = m.d.comb
        sync = m.d.sync
        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
                                            granularity=TAG_BITS)

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        cv = Signal(INDEX_BITS)
        comb += cv.eq(cache_valids[req_index])
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_valids[req_index].eq(cv)

        # use write-port "granularity" to select the tag to write to
        # TODO: the Memory should be multiplied-up (by NUM_TAGS)
        tagset = Signal(TAG_RAM_WIDTH)
        comb += tagset.eq(r.store_tag << (replace_way*TAG_BITS))
        comb += wr_tag.en.eq(1<<replace_way)
        comb += wr_tag.addr.eq(r.store_index)
        comb += wr_tag.data.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_valids, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word? We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                cv = Signal(INDEX_BITS)
                comb += cv.eq(cache_valids[r.store_index])
                comb += cv.bit_select(replace_way, 1).eq(
                         r.store_valid & ~inval_in)
                sync += cache_valids[r.store_index].eq(cv)

                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_valids, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valids[i].eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index,
                                             cache_valids)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_valids, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway     = Signal(WAY_BITS)
            wstate   = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                     ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m                = Module()
        comb             = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_valids     = CacheValidsArray()

        # TLB Array
        itlb            = TLBArray()
        itlb_valid      = TLBValidArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv         = Signal()

        r                = RegInternal()

        # Async signal on incoming request
        req_index        = Signal(INDEX_BITS)
        req_row          = Signal(ROW_BITS)
        req_hit_way      = Signal(WAY_BITS)
        req_tag          = Signal(TAG_BITS)
        req_is_hit       = Signal()
        req_is_miss      = Signal()
        req_laddr        = Signal(64)

        tlb_req_index    = Signal(TLB_BITS)
        real_addr        = Signal(REAL_ADDR_BITS)
        ra_valid         = Signal()
        priv_fault       = Signal()
        access_ok        = Signal()
        use_previous     = Signal()

        cache_out_row    = Signal(ROW_SIZE_BITS)

        plru_victim      = Signal(WAY_BITS)
        replace_way      = Signal(WAY_BITS)

        self.tlbmem = Memory(depth=TLB_SIZE, width=TLB_EA_TAG_BITS+TLB_PTE_BITS)
        self.tagmem = Memory(depth=NUM_LINES, width=TAG_RAM_WIDTH)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb, itlb_valid)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_valids,
                         access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way,
                         cache_valids,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        # don't connect up to FetchUnitInterface so that some unit tests
        # can continue to operate
        if not self.use_fetch_iface:
            return m

        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
        # so needs checking and iterative revising
        i_in, bus, i_out = self.i_in, self.bus, self.i_out
        comb += i_in.req.eq(self.a_i_valid)
        comb += i_in.nia.eq(self.a_pc_i)
        comb += self.stall_in.eq(self.a_stall_i)
        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
        comb += self.f_badaddr_o.eq(i_out.nia)
        comb += self.f_instr_o.eq(i_out.insn)
        comb += self.f_busy_o.eq(~i_out.valid) # probably

        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
        ibus = self.ibus
        comb += ibus.adr.eq(self.bus.adr)
        comb += ibus.dat_w.eq(self.bus.dat_w)
        comb += ibus.sel.eq(self.bus.sel)
        comb += ibus.cyc.eq(self.bus.cyc)
        comb += ibus.stb.eq(self.bus.stb)
        comb += ibus.we.eq(self.bus.we)

        comb += self.bus.dat_r.eq(ibus.dat_r)
        comb += self.bus.ack.eq(ibus.ack)
        if hasattr(ibus, "stall"):
            comb += self.bus.stall.eq(ibus.stall)
        else:
            # fake-up the wishbone stall signal to comply with pipeline mode
            # same thing is done in dcache.py
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out  = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn  = yield i_out.insn
    nia   = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia   = yield i_out.nia
    insn  = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia   = yield i_in.nia
    insn  = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    insn  = yield i_out.insn
    valid = yield i_out.valid
    insn  = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=32,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=64,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)