1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32
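# With the defaults above (64-bit wishbone data): ROW_SIZE = 8 bytes,
# ROW_PER_LINE = 64 // 8 = 8, BRAM_ROWS = 16 * 8 = 128, and
# INSN_PER_ROW = 64 // 32 = 2 instructions per BRAM row.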

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64
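# (with the defaults: TLB_BITS = 6, and the ITLB tag covers EA bits
# 63 downto 18, i.e. TLB_EA_TAG_BITS = 64 - (12 + 6) = 46)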

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)
# (with the default geometry above -- 16 lines, REAL_ADDR_BITS = 56 --
#  INDEX_BITS is 4 and TAG_BITS is 46)

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags of a set
# in a single word, one TAG_BITS-wide slice per way.
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x)
                 for x in range(NUM_LINES))

# The cache valid bits
def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x)
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x)
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


def TLBValidBitsArray():
    return Array(Signal(name="tlbvalid_%d" % x)
                 for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" % x)
                 for x in range(TLB_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" % x)
                 for x in range(TLB_SIZE))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x)
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x)
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Returns whether the address is in the last row of a cache line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
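# (e.g. with ROW_LINE_BITS = 3, next_row(0b0010111) gives 0b0010000:
# the low 3 bits wrap within the line, the upper row bits are unchanged)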
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

# Read the instruction word for the given address
# in the current cache row
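# (with the default INSN_PER_ROW = 2, addr bit 2 simply selects the
# low or high 32-bit word of the 64-bit row)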
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
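# (an XOR-fold of three consecutive TLB_BITS-wide fields of the address
# above the page offset, so addresses that share low EA bits still
# spread out across the direct-mapped TLB)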
def hash_ea(addr):
    hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
          TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
          ] ^ addr[
          TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
          ]
    return hsh


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(Elaboratable):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache")

        self.log_out = Signal(54)


    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        wb_in, stall_in = self.wb_in, self.stall_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(wb_in.ack & (replace_way == i))

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(r.hit_way == i):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        # NUM_WAYS is an elaboration-time constant, so this is a plain
        # python conditional (the equivalent of a VHDL generate)
        if NUM_WAYS > 1:
            for i in range(NUM_LINES):
                plru_acc_i = Signal(WAY_BITS)
                plru_acc_en = Signal()
                plru = PLRU(WAY_BITS)
                setattr(m.submodules, "plru_%d" % i, plru)

                comb += plru.acc_i.eq(plru_acc_i)
                comb += plru.acc_en.eq(plru_acc_en)

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru.acc_en.eq(r.hit_valid)

                comb += plru.acc_i.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru_o)

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                i_in.nia[:TLB_LG_PGSZ],
                pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
            ))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
            )
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valid_bits, cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
            Const(0, ROW_OFF_BITS),
            real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
        ))

        # Test if pending request is a hit on any way
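        # (hitcond also lets a request hit on the line currently being
        # reloaded, once the row it needs has been marked valid in
        # r.rows_valid -- i.e. hits on partially loaded lines are
        # serviced here)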
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE])
        with m.If(i_in.req):
            cvb = Signal(NUM_WAYS)
            ctag = Signal(TAG_RAM_WIDTH)
            comb += ctag.eq(cache_tags[req_index])
            comb += cvb.eq(cache_valid_bits[req_index])
            for i in range(NUM_WAYS):
                tagi = Signal(TAG_BITS, name="tag_i%d" % i)
                comb += tagi.eq(read_tag(i, ctag))
                hit_test = Signal(name="hit_test%d" % i)
                comb += hit_test.eq(i == r.store_way)
                with m.If((cvb[i] | (hitcond & hit_test))
                          & (tagi == req_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch.  In this
        # case we output the result of a mux.  The alternative would
        # be to output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display(
                    "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                    "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                    i_in.stop_mark, req_index, req_tag,
                    req_hit_way, real_addr
                )

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                "cache miss nia:%x IR:%x SM:%x idx:%x "
                " way:%x tag:%x RA:%x", i_in.nia,
                i_in.virt_mode, i_in.stop_mark, req_index,
                replace_way, req_tag, real_addr
            )

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
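            # (the reload starts at the requested row, not the line
            # start: it wraps around the line, so end_row_ix -- one row
            # before the starting row -- is the last row to be fetched)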

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache row and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            cache_valid_bits, req_index,
                            tagset, cache_tags):

        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)
        # Force misses on that way while reloading that line
        cv = Signal(NUM_WAYS)
        comb += cv.eq(cache_valid_bits[req_index])
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_valid_bits[req_index].eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index])
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             stbs_done, cache_valid_bits):
        comb = m.d.comb
        sync = m.d.sync

        wb_in = self.wb_in

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~wb_in.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display(
                    "IS_LAST_ROW_ADDR r.wb.addr:%x "
                    "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                    "stbs_done:%x", r.wb.adr, r.end_row_ix,
                    r.wb.stb, stbs_zero, stbs_done
                )
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(
                r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
            )
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
                rarange
            )
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(wb_in.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            wb_in.dat, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done &
                      is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                cv = Signal(NUM_WAYS)
                comb += cv.eq(cache_valid_bits[r.store_index])
                comb += cv.bit_select(replace_way, 1).eq(
                    r.store_valid & ~inval_in
                )
                sync += cache_valid_bits[r.store_index].eq(cv)

                sync += r.state.eq(State.IDLE)

            # not completed, move on to next request in row
            with m.Else():
                # Increment store row counter
                sync += r.store_row.eq(next_row(r.store_row))


    # Cache miss/reload synchronous machine
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

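        # the reload always reads a full 64-bit row: sel is all-ones,
        # and the wishbone address is the row address, i.e. r.req_adr
        # with the 3 byte-offset bits dropped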
        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(
                    m, r, req_is_miss, req_laddr,
                    req_index, req_tag, replace_way,
                    real_addr
                )

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(
                        m, r, replace_way,
                        cache_valid_bits, req_index,
                        tagset, cache_tags
                    )

                self.icache_miss_wait_ack(
                    m, r, replace_way, inval_in,
                    stbs_done, cache_valid_bits
                )

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
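    # FIXME: r.real_addr (used below) is not a field of RegInternal,
    # so real_addr will need to be plumbed through before LOG_LENGTH > 0
    # can actually be enabled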
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
                r.real_addr[3:6], wb_in.ack, i_out.insn, i_out.valid
            ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPtesArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
                         itlb_valid_bits, ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr, cache_valid_bits,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


def icache_sim(dut):
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in
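    # note the deliberate cross-over: names here are from the
    # testbench's point of view, so i_out/m_out drive the DUT's inputs
    # and i_in reads back the DUT's output to decode1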

    yield i_in.valid.eq(0)
    yield i_out.priv_mode.eq(1)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    print(f"valid? {valid}")
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_in.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
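    # note: the testbench wires up the legacy wb_out/wb_in records
    # directly; the standard dut.bus Interface is not used here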

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

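    # each 64-bit word i thus holds instructions 2*i (low half) and
    # 2*i+1 (high half), so NIA 0x4 reads 0x00000001, 0x40 reads
    # 0x00000010, matching the asserts in icache_sim above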
    test_icache(mem)