1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
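# e.g. with WB_DATA_BITS=64 (as noted above), ROW_SIZE is 8 bytes,
# so a 64-byte line occupies 8 consecutive BRAM rows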
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit field counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
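# worked example with the defaults above: TAG_BITS = 56 - (6 + 4) = 46,
# so TAG_WIDTH = 46 + 7 - ((46 + 7) % 8) = 48, i.e. the tag rounded
# up to a whole number of bytes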

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
# one byte-padded (TAG_WIDTH) tag per way, matching read_tag() below
TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64
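# with the defaults above: TLB_BITS = log2(64) = 6 and
# TLB_EA_TAG_BITS = 64 - (12 + 6) = 46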

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  |  INSN_BITS    (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x)
                 for x in range(NUM_LINES))

# The cache valid bits
def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x)
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x)
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


def TLBValidBitsArray():
    return Array(Signal(name="tlbvalid_%d" % x)
                 for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" % x)
                 for x in range(TLB_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" % x)
                 for x in range(TLB_SIZE))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x)
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x)
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
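# e.g. with ROW_LINE_BITS=3, row 0b0000111 (last row of line 0) wraps
# to 0b0000000: only the low 3 bits are incremented, the upper (index)
# bits pass through unchanged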
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

# Read the instruction word for the given address
# in the current cache row
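# (e.g. with INSN_PER_ROW=2, INSN_BITS=1: bit 2 of the address picks
# the low or the high 32-bit word of the 64-bit row)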
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
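# (e.g. with TLB_LG_PGSZ=12 and TLB_BITS=6 this is
# addr[12:18] ^ addr[18:24] ^ addr[24:30])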
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(Elaboratable):
    """Set associative icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache")

        self.log_out = Signal(54)


    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        wb_in, stall_in = self.wb_in, self.stall_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True, ram_num=i)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(wb_in.ack & (replace_way == i))

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(r.hit_way == i):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        with m.If(NUM_WAYS > 1):
            for i in range(NUM_LINES):
                plru_acc_i = Signal(WAY_BITS)
                plru_acc_en = Signal()
                plru = PLRU(WAY_BITS)
                setattr(m.submodules, "plru_%d" % i, plru)

                comb += plru.acc_i.eq(plru_acc_i)
                comb += plru.acc_en.eq(plru_acc_en)

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru.acc_en.eq(r.hit_valid)

                comb += plru.acc_i.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru_o)

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        # index into the direct-mapped TLB, hence TLB_BITS wide
        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valid_bits, cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(Const(0, ROW_OFF_BITS),
                                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE])
        with m.If(i_in.req):
            cvb = Signal(NUM_WAYS)
            ctag = Signal(TAG_RAM_WIDTH)
            comb += ctag.eq(cache_tags[req_index])
            comb += cvb.eq(cache_valid_bits[req_index])
            for i in range(NUM_WAYS):
                tagi = Signal(TAG_BITS, name="tag_i%d" % i)
                comb += tagi.eq(read_tag(i, ctag))
                hit_test = Signal(name="hit_test%d" % i)
                comb += hit_test.eq(i == r.store_way)
                with m.If((cvb[i] | (hitcond & hit_test))
                          & (tagi == req_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display("cache miss nia:%x IR:%x SM:%x idx:%x "
                            "way:%x tag:%x RA:%x", i_in.nia,
                            i_in.virt_mode, i_in.stop_mark, req_index,
                            replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            cache_valid_bits, req_index,
                            tagset, cache_tags):

        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)
        # Force misses on that way while reloading that line
        # (one valid bit per way, hence NUM_WAYS wide)
        cv = Signal(NUM_WAYS)
        comb += cv.eq(cache_valid_bits[req_index])
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_valid_bits[req_index].eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index])
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             stbs_done, cache_valid_bits):
        comb = m.d.comb
        sync = m.d.sync

        wb_in = self.wb_in

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~wb_in.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.adr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(wb_in.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            wb_in.dat, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done &
                      is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                # (one valid bit per way, hence NUM_WAYS wide)
                cv = Signal(NUM_WAYS)
                comb += cv.eq(cache_valid_bits[r.store_index])
                comb += cv.bit_select(replace_way, 1).eq(
                    r.store_valid & ~inval_in)
                sync += cache_valid_bits[r.store_index].eq(cv)

                sync += r.state.eq(State.IDLE)

            # not completed, move on to next request in row
            with m.Else():
                # Increment store row counter
                sync += r.store_row.eq(next_row(r.store_row))


    # Cache miss/reload synchronous machine
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             cache_valid_bits, req_index,
                                             tagset, cache_tags)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          stbs_done, cache_valid_bits)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
                r.req_adr[3:6], wb_in.ack, i_out.insn, i_out.valid))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPtesArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # index into the direct-mapped TLB, hence TLB_BITS wide
        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
                         itlb_valid_bits, ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr, cache_valid_bits,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_out.valid
    nia = yield i_in.nia
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)