1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit field counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

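# Worked example, for reference only: with the defaults above
# (LINE_SIZE=64, WB_DATA_BITS=64, NUM_LINES=16, NUM_WAYS=4,
# REAL_ADDR_BITS=56) these come out as ROW_SIZE=8, ROW_PER_LINE=8,
# BRAM_ROWS=128, INSN_PER_ROW=2, INSN_BITS=1, ROW_BITS=7,
# ROW_LINE_BITS=3, LINE_OFF_BITS=6, ROW_OFF_BITS=3, INDEX_BITS=4,
# SET_SIZE_BITS=10, TAG_BITS=46 and TAG_WIDTH=48, matching the values
# printed at import time below.
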
# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags
def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS),  # one valid bit per way
                  ('tag', TAG_RAM_WIDTH),
                 ]
    return Array(Record(tag_layout, name="tag%d" % x)
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x)
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBArray():
    tlb_layout = [('valid', 1),
                  ('tag', TLB_EA_TAG_BITS),
                  ('pte', TLB_PTE_BITS)
                 ]
    return Array(Record(tlb_layout, name="tlb%d" % x)
                 for x in range(TLB_SIZE))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x)
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x)
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

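# A minimal pure-Python sketch of the two slices above, operating on
# plain ints rather than nmigen Signals (an illustration only; these
# helpers are assumptions for documentation and are not used by the
# hardware below).
def _get_index_model(addr):
    "plain-int model of get_index()"
    return (addr >> LINE_OFF_BITS) & ((1 << INDEX_BITS) - 1)

def _get_row_model(addr):
    "plain-int model of get_row()"
    return (addr >> ROW_OFF_BITS) & ((1 << ROW_BITS) - 1)

# e.g. with the default geometry:
# _get_index_model(0x100) == 4 and _get_row_model(0x100) == 32
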
# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

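# A minimal pure-Python sketch of next_row() (an assumption for
# illustration, not used by the hardware): only the low ROW_LINE_BITS
# are incremented, wrapping within the line, which is what keeps the
# generated adder small.
def _next_row_model(row):
    low = (row + 1) & ((1 << ROW_LINE_BITS) - 1)        # wraps 7 -> 0
    return (row & ~((1 << ROW_LINE_BITS) - 1)) | low

# e.g. with ROW_LINE_BITS=3: _next_row_model(0b0101111) == 0b0101000
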
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row (tags are packed TAG_BITS per way,
# see TAG_RAM_WIDTH above)
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh

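# A minimal pure-Python sketch of hash_ea() (an assumption for
# illustration, not used by the hardware): three TLB_BITS-wide fields
# of the effective address above the page offset, XOR-folded into one
# TLB_BITS-wide index.
def _hash_ea_model(addr):
    mask = (1 << TLB_BITS) - 1
    return ((addr >> TLB_LG_PGSZ) ^
            (addr >> (TLB_LG_PGSZ + TLB_BITS)) ^
            (addr >> (TLB_LG_PGSZ + 2 * TLB_BITS))) & mask
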

# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2

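# (State transitions, for reference: IDLE -> CLR_TAG on a miss;
#  CLR_TAG -> WAIT_ACK once the victim tag has been written;
#  WAIT_ACK -> IDLE when the last row of the line has been acked
#  on the wishbone bus.)
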

class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(Elaboratable):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache")

        self.log_out = Signal(54)


    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        # PLRUs are only useful with more than one way
        if NUM_WAYS <= 1:
            return

        m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index)   # select victim
        comb += plru_victim.eq(plru.o_index)  # selected victim

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb[tlb_req_index].pte)
        comb += ttag.eq(itlb[tlb_req_index].tag)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb[tlb_req_index].valid)

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb[i].valid.eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb[wr_index].valid.eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb[wr_index].pte.eq(m_in.pte)
            sync += itlb[wr_index].valid.eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(Const(0, ROW_OFF_BITS),
                                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE])
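        # (While a line is being reloaded in WAIT_ACK, rows that have
        #  already arrived are marked in r.rows_valid, so a fetch from
        #  one of those rows can hit without waiting for the whole line.)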
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += ctag.eq(cache_tags[req_index].tag)
        comb += cvb.eq(cache_tags[req_index].valid)
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display("cache miss nia:%x IR:%x SM:%x idx:%x "
                            " way:%x tag:%x RA:%x", i_in.nia,
                            i_in.virt_mode, i_in.stop_mark, req_index,
                            replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read. We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            tagset, cache_tags):
        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        # (clear the per-way valid bit for the victim way)
        cv = Signal(NUM_WAYS)
        comb += cv.eq(cache_tags[req_index].valid)
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_tags[req_index].valid.eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index].tag)
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].tag.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_tags, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word? We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                cv = Signal(NUM_WAYS)
                comb += cv.eq(cache_tags[r.store_index].valid)
                comb += cv.bit_select(replace_way, 1).eq(
                    r.store_valid & ~inval_in)
                sync += cache_tags[r.store_index].valid.eq(cv)

                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_tags[i].valid.eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index, tagset, cache_tags)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_tags, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()

        # TLB Array
        itlb = TLBArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # fake-up the wishbone stall signal to comply with pipeline mode
        # same thing is done in dcache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)