1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20 """
21
22 from enum import (Enum, unique)
23 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
24 from nmigen.cli import main, rtlil
25 from nmutil.iocontrol import RecordObject
26 from nmigen.utils import log2_int
27 from nmutil.util import Display
28
29 #from nmutil.plru import PLRU
30 from soc.experiment.cache_ram import CacheRam
31 from soc.experiment.plru import PLRU
32
33 from soc.experiment.mem_types import (Fetch1ToICacheType,
34 ICacheToDecode1Type,
35 MMUToICacheType)
36
37 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
38 WB_SEL_BITS, WBAddrType, WBDataType,
39 WBSelType, WBMasterOut, WBSlaveOut,
40 WBMasterOutVector, WBSlaveOutVector,
41 WBIOMasterOut, WBIOSlaveOut)
42
43 # for test
44 from nmigen_soc.wishbone.sram import SRAM
45 from nmigen import Memory
46 from nmutil.util import wrap
47 from nmigen.cli import main, rtlil
48
49 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
50 # Also, check out the cxxsim nmigen branch, and latest yosys from git
51 from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit field counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
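# (e.g. TAG_BITS = 46 rounds up to TAG_WIDTH = 48, the next multiple
#  of 8, so each way's tag is padded to a whole number of bytes)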

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0
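# (e.g. ispow2(64) -> True, ispow2(48) -> False, ispow2(0) -> False)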

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)
# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags for a set
# in a single RAM row.
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" %x) \
                 for x in range(NUM_LINES))

# The cache valid bits
def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" %x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" %x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


def TLBValidBitsArray():
    return Array(Signal(name="tlbvalid_%d" %x) \
                 for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" %x) \
                 for x in range(TLB_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" %x) \
                 for x in range(TLB_SIZE))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
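# (e.g. with ROW_LINE_BITS=3, row 0b0010_111 wraps to 0b0010_000: only
#  the 3 row-within-line bits increment, the line index is untouched)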

# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
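# (with INSN_PER_ROW=2, addr bit 2 selects the low or high 32-bit half
#  of the 64-bit BRAM row)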

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)
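# (a tagset packs the tags of all ways of one line side by side: way i
#  occupies bits [i*TAG_BITS : (i+1)*TAG_BITS] of the tag RAM row)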

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
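# (with TLB_LG_PGSZ=12 and TLB_BITS=6 this XOR-folds address bits
#  [12:18], [18:24] and [24:30] into a 6-bit index, so addresses that
#  differ only above the page offset don't all map to the same entry)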


# Cache reload state machine
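# Transitions: IDLE (wait for a miss) -> CLR_TAG (pick the victim way,
# write its new tag) -> WAIT_ACK (stream the line in over wishbone,
# marking rows valid as they arrive) -> IDLE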
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(Elaboratable):
    """Set associative icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        self.log_out = Signal(54)


    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        wb_in, stall_in = self.wb_in, self.stall_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(wb_in.ack & (replace_way == i))

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(r.hit_way == i):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
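    # (one PLRU per cache line: plru_victim[i] reports the currently
    #  least-recently-used way of line i, and is consulted in the
    #  CLR_TAG state to choose replace_way; a hit marks the hit way
    #  as recently used via acc_en/acc_i)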
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if NUM_WAYS > 1:
            for i in range(NUM_LINES):
                plru_acc_i = Signal(WAY_BITS)
                plru_acc_en = Signal()
                plru = PLRU(WAY_BITS)
                setattr(m.submodules, "plru_%d" % i, plru)

                comb += plru.acc_i.eq(plru_acc_i)
                comb += plru.acc_en.eq(plru_acc_en)

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru.acc_en.eq(r.hit_valid)

                comb += plru.acc_i.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru_o)

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valid_bits, cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk,
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)
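        # (e.g. with INSN_PER_ROW=2, a sequential fetch of nia 0x...4
        #  wants the high word of the row already read for 0x...0, so
        #  the BRAM read is skipped and last cycle's row data is reused)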

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(Const(0, ROW_OFF_BITS),
                                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE])
        with m.If(i_in.req):
            cvb = Signal(NUM_WAYS)
            ctag = Signal(TAG_RAM_WIDTH)
            comb += ctag.eq(cache_tags[req_index])
            comb += cvb.eq(cache_valid_bits[req_index])
            for i in range(NUM_WAYS):
                tagi = Signal(TAG_BITS, name="tag_i%d" % i)
                comb += tagi.eq(read_tag(i, ctag))
                hit_test = Signal(name="hit_test%d" % i)
                comb += hit_test.eq(i == r.store_way)
                with m.If((cvb[i] | (hitcond & hit_test))
                          & (tagi == req_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch.  In this
        # case we output the result of a mux.  The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x "
                                "tag:%x way:%x RA:%x",
                                i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display("cache miss nia:%x IR:%x SM:%x idx:%x "
                            "way:%x tag:%x RA:%x",
                            i_in.nia, i_in.virt_mode, i_in.stop_mark,
                            req_index, replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            cache_valid_bits, req_index,
                            tagset, cache_tags):

        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)
        # Force misses on that way while reloading that line
        cv = Signal(NUM_WAYS)
        comb += cv.eq(cache_valid_bits[req_index])
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_valid_bits[req_index].eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index])
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             stbs_done, cache_valid_bits):
        comb = m.d.comb
        sync = m.d.sync

        wb_in = self.wb_in

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~wb_in.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.adr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x "
                                "stbs_zero:%x stbs_done:%x",
                                r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(wb_in.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            wb_in.dat, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done &
                      is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                cv = Signal(NUM_WAYS)
                comb += cv.eq(cache_valid_bits[r.store_index])
                comb += cv.bit_select(replace_way, 1).eq(
                    r.store_valid & ~inval_in)
                sync += cache_valid_bits[r.store_index].eq(cv)

                sync += r.state.eq(State.IDLE)

            # not completed, move on to next request in row
            with m.Else():
                # Increment store row counter
                sync += r.store_row.eq(next_row(r.store_row))


    # Cache miss/reload synchronous machine
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

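        # sel is all-ones: lines are refilled by whole 64-bit rows, so
        # every byte lane is selected; adr is a 64-bit-word address,
        # hence the 3 byte-offset bits of req_adr are dropped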
        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             cache_valid_bits, req_index,
                                             tagset, cache_tags)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          stbs_done, cache_valid_bits)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
                r.req_adr[3:6], wb_in.ack, i_out.insn, i_out.valid))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage.  Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPtesArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
                         itlb_valid_bits, ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr, cache_valid_bits,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


def icache_sim(dut):
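    # naming is from the core's point of view: i_out here drives the
    # DUT's fetch1 input record, i_in receives its decode1 output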
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    yield i_in.valid.eq(0)
    yield i_out.priv_mode.eq(1)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    print(f"valid? {valid}")
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_in.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

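    # Fill memory with a recognisable pattern: 64-bit word i holds the
    # "instruction" 2*i in its low half and 2*i+1 in its high half, so
    # the insn fetched from byte address A is expected to be A//4.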
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)