fix icache row store issue
soc.git: src/soc/experiment/icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32
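
# Worked example (illustrative only): with the defaults above,
# WB_DATA_BITS=64 gives ROW_SIZE=8 bytes, so a 64-byte line is
# 8 rows of 8 bytes, each row holding two 32-bit instructions,
# and 16 lines x 8 rows = 128 BRAM rows in total.
if WB_DATA_BITS == 64 and LINE_SIZE == 64 and NUM_LINES == 16:
    assert ROW_SIZE == 8
    assert ROW_PER_LINE == 8
    assert BRAM_ROWS == 128
    assert INSN_PER_ROW == 2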

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to the next multiple of 8)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
# TAG_RAM_WIDTH is the width of one tag RAM row: one TAG_WIDTH-aligned
# slot per way, matching the word_select() in read_tag()/write_tag()
TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64
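
# Worked example (illustrative only): with the defaults above,
# LINE_OFF_BITS=6 and INDEX_BITS=4 give SET_SIZE_BITS=10, so
# TAG_BITS = 56 - 10 = 46, padded up to TAG_WIDTH = 48 (the next
# multiple of 8), and the 64-entry TLB is indexed by TLB_BITS=6
# bits with 64 - (12 + 6) = 46 effective-address tag bits.
if REAL_ADDR_BITS == 56 and SET_SIZE_BITS == 10 and TLB_SIZE == 64:
    assert TAG_BITS == 46 and TAG_WIDTH == 48
    assert TLB_BITS == 6 and TLB_EA_TAG_BITS == 46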

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags for a set
# into a single word-wide Signal per line.
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x)
                 for x in range(NUM_LINES))

# The cache valid bits
def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x)
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x)
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBValidBitsArray():
    return Array(Signal(name="tlbvalid_%d" % x)
                 for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" % x)
                 for x in range(TLB_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" % x)
                 for x in range(TLB_SIZE))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x)
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x)
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Returns whether the address is in the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
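
# Illustrative pure-Python mirrors of the slicing helpers above (not
# used by the hardware): get_index/get_row mask out the relevant
# address fields, and next_row only increments the ROW_LINE_BITS LSBs,
# so it wraps within a cache line instead of spilling into the index.
def _get_index_int(addr):
    return (addr >> LINE_OFF_BITS) & (NUM_LINES - 1)

def _get_row_int(addr):
    return (addr >> ROW_OFF_BITS) & (BRAM_ROWS - 1)

def _next_row_int(row):
    line = row & ~(ROW_PER_LINE - 1)
    return line | ((row + 1) & (ROW_PER_LINE - 1))

if LINE_SIZE == 64 and NUM_LINES == 16 and ROW_PER_LINE == 8:
    assert _get_index_int(0x1040) == 1  # byte 0x1040 is in line index 1
    assert _get_row_int(0x1040) == 8    # ...and in BRAM row 8
    assert _next_row_int(7) == 0        # last row of line 0 wraps to row 0
    assert _next_row_int(8) == 9        # row within line 1 advances normally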

# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)
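
# Illustrative only: each way's tag sits on a TAG_WIDTH-aligned
# boundary within the tag RAM row, so way w occupies bits
# [w*TAG_WIDTH : w*TAG_WIDTH + TAG_BITS], with TAG_WIDTH-TAG_BITS
# padding bits per way (see the "partial writes" TODO at the top).
if TAG_BITS == 46 and TAG_WIDTH == 48 and NUM_WAYS == 4:
    assert [w * TAG_WIDTH for w in range(NUM_WAYS)] == [0, 48, 96, 144]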

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
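
# Illustrative pure-Python mirror of hash_ea (not used in hardware):
# the TLB index XOR-folds three TLB_BITS-wide fields of the effective
# address above the page offset.
def _hash_ea_int(addr):
    def fld(n):
        return (addr >> (TLB_LG_PGSZ + n * TLB_BITS)) & (TLB_SIZE - 1)
    return fld(0) ^ fld(1) ^ fld(2)

if TLB_LG_PGSZ == 12 and TLB_BITS == 6:
    # bits [12:18]=0x01, [18:24]=0x02, [24:30]=0x04 -> index 0x07
    assert _hash_ea_int((0x01 << 12) | (0x02 << 18) | (0x04 << 24)) == 0x07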


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(Elaboratable):
    """64 bit set associative icache.  All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache")

        self.log_out = Signal(54)

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        wb_in, stall_in = self.wb_in, self.stall_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True, ram_num=i)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(wb_in.ack & (replace_way == i))

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(r.hit_way == i):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        # NUM_WAYS is an elaboration-time constant, so use a plain
        # Python conditional: a PLRU only makes sense with >1 way
        if NUM_WAYS > 1:
            for i in range(NUM_LINES):
                plru = PLRU(WAY_BITS)
                setattr(m.submodules, "plru_%d" % i, plru)

                # PLRU interface
                comb += plru.acc_i.eq(r.hit_way)
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru.acc_en.eq(r.hit_valid)
                comb += plru_victim[i].eq(plru.lru_o)

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        # index into the TLB: log2(TLB_SIZE) bits wide
        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valid_bits, cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(Const(0, ROW_OFF_BITS),
                                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE])
        with m.If(i_in.req):
            cvb = Signal(NUM_WAYS)
            ctag = Signal(TAG_RAM_WIDTH)
            comb += ctag.eq(cache_tags[req_index])
            comb += cvb.eq(cache_valid_bits[req_index])
            for i in range(NUM_WAYS):
                tagi = Signal(TAG_BITS, name="tag_i%d" % i)
                comb += tagi.eq(read_tag(i, ctag))
                hit_test = Signal(name="hit_test%d" % i)
                comb += hit_test.eq(i == r.store_way)
                with m.If((cvb[i] | (hitcond & hit_test))
                          & (tagi == req_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch.  In this
        # case we output the result of a mux.  The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display("cache miss nia:%x IR:%x SM:%x idx:%x "
                            "way:%x tag:%x RA:%x", i_in.nia,
                            i_in.virt_mode, i_in.stop_mark, req_index,
                            replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            cache_valid_bits, req_index,
                            tagset, cache_tags):

        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)
        # Force misses on that way while reloading that line
        cv = Signal(NUM_WAYS)  # one valid bit per way
        comb += cv.eq(cache_valid_bits[req_index])
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_valid_bits[req_index].eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index])
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             stbs_done, cache_valid_bits):
        comb = m.d.comb
        sync = m.d.sync

        wb_in = self.wb_in

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~wb_in.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.adr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(wb_in.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            wb_in.dat, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                cv = Signal(NUM_WAYS)  # one valid bit per way
                comb += cv.eq(cache_valid_bits[r.store_index])
                comb += cv.bit_select(replace_way, 1).eq(
                            r.store_valid & ~inval_in)
                sync += cache_valid_bits[r.store_index].eq(cv)

                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)  # all byte-enables
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             cache_valid_bits, req_index,
                                             tagset, cache_tags)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          stbs_done, cache_valid_bits)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
                r.req_adr[3:6], wb_in.ack, i_out.insn, i_out.valid))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage.  Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPtesArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # index into the TLB: log2(TLB_SIZE) bits wide
        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
                         itlb_valid_bits, ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr, cache_valid_bits,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_out.valid
    nia = yield i_in.nia
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))
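
    # Sanity check (illustrative, assuming the loop above): each 64-bit
    # word packs two consecutive 32-bit values, so the instruction at
    # byte address 4 is 0x00000001 and at byte address 8 is 0x00000002,
    # matching the assertions in icache_sim() above.
    assert mem[0] == 0x0000000100000000
    assert mem[1] == 0x0000000300000002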

    test_icache(mem)