3 based on Anton Blanchard microwatt icache.vhdl
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
22 from enum
import Enum
, unique
23 from nmigen
import (Module
, Signal
, Elaboratable
, Cat
, Array
, Const
, Repl
)
24 from nmigen
.cli
import main
, rtlil
25 from nmutil
.iocontrol
import RecordObject
26 from nmigen
.utils
import log2_int
27 from nmutil
.util
import Display
29 #from nmutil.plru import PLRU
30 from soc
.experiment
.cache_ram
import CacheRam
31 from soc
.experiment
.plru
import PLRU
33 from soc
.experiment
.mem_types
import (Fetch1ToICacheType
,
37 from soc
.experiment
.wb_types
import (WB_ADDR_BITS
, WB_DATA_BITS
,
38 WB_SEL_BITS
, WBAddrType
, WBDataType
,
39 WBSelType
, WBMasterOut
, WBSlaveOut
,
40 WBMasterOutVector
, WBSlaveOutVector
,
41 WBIOMasterOut
, WBIOSlaveOut
)
44 from nmigen_soc
.wishbone
.sram
import SRAM
45 from nmigen
import Memory
46 from nmutil
.util
import wrap
47 from nmigen
.cli
import main
, rtlil
49 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
50 # Also, check out the cxxsim nmigen branch, and latest yosys from git
51 from nmutil
.sim_tmp_alternative
import Simulator
, Settle
56 # BRAM organisation: We never access more than wishbone_data_bits
57 # at a time so to save resources we make the array only that wide,
58 # and use consecutive indices to make a cache "line"
60 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
61 ROW_SIZE
= WB_DATA_BITS
// 8
62 # Number of lines in a set
66 # L1 ITLB number of entries (direct mapped)
68 # L1 ITLB log_2(page_size)
70 # Number of real address bits that we store
72 # Non-zero to enable log data collection
# Derived cache geometry.  Everything below is computed from ROW_SIZE,
# LINE_SIZE, NUM_LINES, NUM_WAYS, REAL_ADDR_BITS, TLB_SIZE and TLB_LG_PGSZ.

# ROW_SIZE_BITS is the width in bits of one BRAM row
ROW_SIZE_BITS = 8 * ROW_SIZE
# ROW_PER_LINE is the number of rows (wishbone transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of BRAM rows needed to hold the full icache
BRAM_ROWS = ROW_PER_LINE * NUM_LINES
# INSN_PER_ROW is the number of 32-bit instructions held in one BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit field widths within an address

# INSN_BITS: bits needed to select an instruction within a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS: bits needed to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS: bits needed to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS: bits of byte offset within a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS: bits of byte offset within a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS: bits needed to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS: log base 2 of the set size (line offset plus line index)
SET_SIZE_BITS = INDEX_BITS + LINE_OFF_BITS
# TAG_BITS: bits of the real address left over for the tag
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH: TAG_BITS rounded up to the next multiple of 8
TAG_WIDTH = 8 * ((TAG_BITS + 7) // 8)

# WAY_BITS: bits needed to select a way
WAY_BITS = log2_int(NUM_WAYS)
# TAG_RAM_WIDTH: one TAG_BITS-wide field per way.
# NOTE(review): this uses TAG_BITS rather than the byte-rounded TAG_WIDTH;
# read_tag() slices the tag row in TAG_BITS-wide words, so the two agree.
TAG_RAM_WIDTH = NUM_WAYS * TAG_BITS

# TLB_BITS: bits needed to index the TLB
TLB_BITS = log2_int(TLB_SIZE)
# TLB_EA_TAG_BITS: bits of a 64-bit address not covered by the page
# offset and the TLB index
TLB_EA_TAG_BITS = 64 - TLB_LG_PGSZ - TLB_BITS
# Dump the cache geometry (alphabetical order) as an import-time debug aid.
print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
# This line previously printed TAG_BITS a second time; TAG_WIDTH was
# computed but never reported.
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)
141 # from microwatt/utils.vhdl
143 return n
!= 0 and (n
& (n
- 1)) == 0
# Import-time sanity checks on the cache geometry.  Each check carries a
# message so a failing configuration can be identified from the traceback.
assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
# A row index is a line index followed by the row-within-line index
assert ROW_BITS == INDEX_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
# A line offset is a row-within-line index followed by a row byte offset
assert LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
# The real address splits into tag | index | line offset ...
assert REAL_ADDR_BITS == TAG_BITS + INDEX_BITS + LINE_OFF_BITS, \
    "geometry bits don't add up"
# ... and equivalently into tag | row | row offset
assert REAL_ADDR_BITS == TAG_BITS + ROW_BITS + ROW_OFF_BITS, \
    "geometry bits don't add up"
160 #-- Example of layout for 32 lines of 64 bytes:
162 #-- .. tag |index| line |
164 #-- .. | | | |00| zero (2)
165 #-- .. | | |-| | INSN_BITS (1)
166 #-- .. | |---| | ROW_LINE_BITS (3)
167 #-- .. | |--- - --| LINE_OFF_BITS (6)
168 #-- .. | |- --| ROW_OFF_BITS (3)
169 #-- .. |----- ---| | ROW_BITS (8)
170 #-- .. |-----| | INDEX_BITS (5)
171 #-- .. --------| | TAG_BITS (53)
172 # Example of layout for 32 lines of 64 bytes:
174 # .. tag |index| line |
176 # .. | | | |00| zero (2)
177 # .. | | |-| | INSN_BITS (1)
178 # .. | |---| | ROW_LINE_BITS (3)
179 # .. | |--- - --| LINE_OFF_BITS (6)
180 # .. | |- --| ROW_OFF_BITS (3)
181 # .. |----- ---| | ROW_BITS (8)
182 # .. |-----| | INDEX_BITS (5)
183 # .. --------| | TAG_BITS (53)
185 #-- The cache data BRAM organized as described above for each way
186 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
188 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
189 #-- not handle a clean (commented) definition of the cache tags as a 3d
190 #-- memory. For now, work around it by putting all the tags
193 return Array(Signal(TAG_RAM_WIDTH
, name
="cachetag_%d" %x) \
194 for x
in range(NUM_LINES
))
196 #-- The cache valid bits
def CacheValidBitsArray():
    """Return an Array of NUM_LINES signals, each NUM_WAYS bits wide.

    The signals are named "cachevalid_<n>", where <n> is the line number.
    """
    valid_rows = [Signal(NUM_WAYS, name="cachevalid_%d" % line)
                  for line in range(NUM_LINES)]
    return Array(valid_rows)
def RowPerLineValidArray():
    """Return an Array of ROW_PER_LINE single-bit signals.

    The signals are named "rows_valid_<n>", where <n> is the row number
    within the line.
    """
    row_flags = [Signal(name="rows_valid_%d" % row)
                 for row in range(ROW_PER_LINE)]
    return Array(row_flags)
206 # TODO to be passed to nmigen as ram attributes
207 # attribute ram_style : string;
208 # attribute ram_style of cache_tags : signal is "distributed";
211 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    """Return an Array of TLB_SIZE single-bit signals.

    The signals are named "tlbvalid_<n>", where <n> is the TLB entry number.
    """
    entry_flags = [Signal(name="tlbvalid_%d" % entry)
                   for entry in range(TLB_SIZE)]
    return Array(entry_flags)
217 return Array(Signal(TLB_EA_TAG_BITS
, name
="tlbtag_%d" %x) \
218 for x
in range(TLB_SIZE
))
221 return Array(Signal(TLB_PTE_BITS
, name
="tlbptes_%d" %x) \
222 for x
in range(TLB_SIZE
))
225 # Cache RAM interface
227 return Array(Signal(ROW_SIZE_BITS
, name
="cache_out_%d" %x) \
228 for x
in range(NUM_WAYS
))
230 # PLRU output interface
232 return Array(Signal(WAY_BITS
, name
="plru_out_%d" %x) \
233 for x
in range(NUM_LINES
))
235 # Return the cache line index (tag index) for an address
237 return addr
[LINE_OFF_BITS
:SET_SIZE_BITS
]
239 # Return the cache row index (data memory) for an address
241 return addr
[ROW_OFF_BITS
:SET_SIZE_BITS
]
243 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the low ROW_LINE_BITS bits of `row`.

    These bits are the index of the row within its cache line.
    """
    row_in_line = row[0:ROW_LINE_BITS]
    return row_in_line
247 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """Compare the row-within-line field of `addr` against `last`.

    The field is bits ROW_OFF_BITS up to (not including) LINE_OFF_BITS
    of `addr`.  Returns the result of the `==` comparison.
    """
    row_field = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_field == last
251 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """Compare the row-within-line index of `row` against `last`.

    The index is extracted with get_row_of_line().  Returns the result
    of the `==` comparison.
    """
    row_in_line = get_row_of_line(row)
    return row_in_line == last
255 # Return the next row in the current cache line. We use a dedicated
256 # function in order to limit the size of the generated adder to be
257 # only the bits within a cache line (3 bits with default settings)
259 row_v
= row
[0:ROW_LINE_BITS
] + 1
260 return Cat(row_v
[:ROW_LINE_BITS
], row
[ROW_LINE_BITS
:])
262 # Read the instruction word for the given address
263 # in the current cache row
def read_insn_word(addr, data):
    """Select one 32-bit word out of the cache row `data`.

    The word index is taken from the INSN_BITS bits of `addr` starting
    at bit 2, i.e. just above the two byte-offset bits of a 4-byte word.
    """
    insn_sel = addr[2:2 + INSN_BITS]
    return data.word_select(insn_sel, 32)
268 # Get the tag value from the address
270 return addr
[SET_SIZE_BITS
:REAL_ADDR_BITS
]
272 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Return the TAG_BITS-wide word number `way` of the tag row `tagset`."""
    way_tag = tagset.word_select(way, TAG_BITS)
    return way_tag
276 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Build the assignment of `tag` to the `way` slot of `tagset`.

    The slot is located with read_tag().  The `.eq()` statement is
    returned, not applied; the caller adds it to a domain.
    """
    tag_slot = read_tag(way, tagset)
    return tag_slot.eq(tag)
280 # Simple hash for direct-mapped TLB index
282 hsh
= addr
[TLB_LG_PGSZ
:TLB_LG_PGSZ
+ TLB_BITS
] ^ addr
[
283 TLB_LG_PGSZ
+ TLB_BITS
:TLB_LG_PGSZ
+ 2 * TLB_BITS
285 TLB_LG_PGSZ
+ 2 * TLB_BITS
:TLB_LG_PGSZ
+ 3 * TLB_BITS
290 # Cache reload state machine
298 class RegInternal(RecordObject
):
301 # Cache hit state (Latches for 1 cycle BRAM access)
302 self
.hit_way
= Signal(NUM_WAYS
)
303 self
.hit_nia
= Signal(64)
304 self
.hit_smark
= Signal()
305 self
.hit_valid
= Signal()
307 # Cache miss state (reload state machine)
308 self
.state
= Signal(State
, reset
=State
.IDLE
)
309 self
.wb
= WBMasterOut("wb")
310 self
.req_adr
= Signal(64)
311 self
.store_way
= Signal(NUM_WAYS
)
312 self
.store_index
= Signal(NUM_LINES
)
313 self
.store_row
= Signal(BRAM_ROWS
)
314 self
.store_tag
= Signal(TAG_BITS
)
315 self
.store_valid
= Signal()
316 self
.end_row_ix
= Signal(ROW_LINE_BITS
)
317 self
.rows_valid
= RowPerLineValidArray()
320 self
.fetch_failed
= Signal()
323 class ICache(Elaboratable
):
324 """64 bit direct mapped icache. All instructions are 4B aligned."""
326 self
.i_in
= Fetch1ToICacheType(name
="i_in")
327 self
.i_out
= ICacheToDecode1Type(name
="i_out")
329 self
.m_in
= MMUToICacheType(name
="m_in")
331 self
.stall_in
= Signal()
332 self
.stall_out
= Signal()
333 self
.flush_in
= Signal()
334 self
.inval_in
= Signal()
336 self
.wb_out
= WBMasterOut(name
="wb_out")
337 self
.wb_in
= WBSlaveOut(name
="wb_in")
339 self
.log_out
= Signal(54)
342 # Generate a cache RAM for each way
343 def rams(self
, m
, r
, cache_out_row
, use_previous
,
344 replace_way
, req_row
):
349 wb_in
, stall_in
= self
.wb_in
, self
.stall_in
351 for i
in range(NUM_WAYS
):
352 do_read
= Signal(name
="do_rd_%d" % i
)
353 do_write
= Signal(name
="do_wr_%d" % i
)
354 rd_addr
= Signal(ROW_BITS
)
355 wr_addr
= Signal(ROW_BITS
)
356 d_out
= Signal(ROW_SIZE_BITS
, name
="d_out_%d" % i
)
357 wr_sel
= Signal(ROW_SIZE
)
359 way
= CacheRam(ROW_BITS
, ROW_SIZE_BITS
, True)
360 setattr(m
.submodules
, "cacheram_%d" % i
, way
)
362 comb
+= way
.rd_en
.eq(do_read
)
363 comb
+= way
.rd_addr
.eq(rd_addr
)
364 comb
+= d_out
.eq(way
.rd_data_o
)
365 comb
+= way
.wr_sel
.eq(wr_sel
)
366 comb
+= way
.wr_addr
.eq(wr_addr
)
367 comb
+= way
.wr_data
.eq(wb_in
.dat
)
369 comb
+= do_read
.eq(~
(stall_in | use_previous
))
370 comb
+= do_write
.eq(wb_in
.ack
& (replace_way
== i
))
373 sync
+= Display("cache write adr: %x data: %lx",
374 wr_addr
, way
.wr_data
)
376 with m
.If(r
.hit_way
== i
):
377 comb
+= cache_out_row
.eq(d_out
)
379 sync
+= Display("cache read adr: %x data: %x",
382 comb
+= rd_addr
.eq(req_row
)
383 comb
+= wr_addr
.eq(r
.store_row
)
384 comb
+= wr_sel
.eq(Repl(do_write
, ROW_SIZE
))
387 def maybe_plrus(self
, m
, r
, plru_victim
):
390 with m
.If(NUM_WAYS
> 1):
391 for i
in range(NUM_LINES
):
392 plru_acc_i
= Signal(WAY_BITS
)
393 plru_acc_en
= Signal()
394 plru
= PLRU(WAY_BITS
)
395 setattr(m
.submodules
, "plru_%d" % i
, plru
)
397 comb
+= plru
.acc_i
.eq(plru_acc_i
)
398 comb
+= plru
.acc_en
.eq(plru_acc_en
)
401 with m
.If(get_index(r
.hit_nia
) == i
):
402 comb
+= plru
.acc_en
.eq(r
.hit_valid
)
404 comb
+= plru
.acc_i
.eq(r
.hit_way
)
405 comb
+= plru_victim
[i
].eq(plru
.lru_o
)
407 # TLB hit detection and real address generation
408 def itlb_lookup(self
, m
, tlb_req_index
, itlb_ptes
, itlb_tags
,
409 real_addr
, itlb_valid_bits
, ra_valid
, eaa_priv
,
410 priv_fault
, access_ok
):
416 pte
= Signal(TLB_PTE_BITS
)
417 ttag
= Signal(TLB_EA_TAG_BITS
)
419 comb
+= tlb_req_index
.eq(hash_ea(i_in
.nia
))
420 comb
+= pte
.eq(itlb_ptes
[tlb_req_index
])
421 comb
+= ttag
.eq(itlb_tags
[tlb_req_index
])
423 with m
.If(i_in
.virt_mode
):
424 comb
+= real_addr
.eq(Cat(
425 i_in
.nia
[:TLB_LG_PGSZ
],
426 pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]
429 with m
.If(ttag
== i_in
.nia
[TLB_LG_PGSZ
+ TLB_BITS
:64]):
430 comb
+= ra_valid
.eq(itlb_valid_bits
[tlb_req_index
])
432 comb
+= eaa_priv
.eq(pte
[3])
435 comb
+= real_addr
.eq(i_in
.nia
[:REAL_ADDR_BITS
])
436 comb
+= ra_valid
.eq(1)
437 comb
+= eaa_priv
.eq(1)
439 # No IAMR, so no KUEP support for now
440 comb
+= priv_fault
.eq(eaa_priv
& ~i_in
.priv_mode
)
441 comb
+= access_ok
.eq(ra_valid
& ~priv_fault
)
444 def itlb_update(self
, m
, itlb_valid_bits
, itlb_tags
, itlb_ptes
):
450 wr_index
= Signal(TLB_SIZE
)
451 comb
+= wr_index
.eq(hash_ea(m_in
.addr
))
453 with m
.If(m_in
.tlbie
& m_in
.doall
):
454 # Clear all valid bits
455 for i
in range(TLB_SIZE
):
456 sync
+= itlb_valid_bits
[i
].eq(0)
458 with m
.Elif(m_in
.tlbie
):
459 # Clear entry regardless of hit or miss
460 sync
+= itlb_valid_bits
[wr_index
].eq(0)
462 with m
.Elif(m_in
.tlbld
):
463 sync
+= itlb_tags
[wr_index
].eq(
464 m_in
.addr
[TLB_LG_PGSZ
+ TLB_BITS
:64]
466 sync
+= itlb_ptes
[wr_index
].eq(m_in
.pte
)
467 sync
+= itlb_valid_bits
[wr_index
].eq(1)
469 # Cache hit detection, output to fetch2 and other misc logic
470 def icache_comb(self
, m
, use_previous
, r
, req_index
, req_row
,
471 req_hit_way
, req_tag
, real_addr
, req_laddr
,
472 cache_valid_bits
, cache_tags
, access_ok
,
473 req_is_hit
, req_is_miss
, replace_way
,
474 plru_victim
, cache_out_row
):
478 i_in
, i_out
, wb_out
= self
.i_in
, self
.i_out
, self
.wb_out
479 flush_in
, stall_out
= self
.flush_in
, self
.stall_out
482 hit_way
= Signal(NUM_WAYS
)
484 # i_in.sequential means that i_in.nia this cycle is 4 more than
485 # last cycle. If we read more than 32 bits at a time, had a
486 # cache hit last cycle, and we don't want the first 32-bit chunk
487 # then we can keep the data we read last cycle and just use that.
488 with m
.If(i_in
.nia
[2:INSN_BITS
+2] != 0):
489 comb
+= use_previous
.eq(i_in
.sequential
& r
.hit_valid
)
491 # Extract line, row and tag from request
492 comb
+= req_index
.eq(get_index(i_in
.nia
))
493 comb
+= req_row
.eq(get_row(i_in
.nia
))
494 comb
+= req_tag
.eq(get_tag(real_addr
))
496 # Calculate address of beginning of cache row, will be
497 # used for cache miss processing if needed
498 comb
+= req_laddr
.eq(Cat(
499 Const(0, ROW_OFF_BITS
),
500 real_addr
[ROW_OFF_BITS
:REAL_ADDR_BITS
],
503 # Test if pending request is a hit on any way
505 comb
+= hitcond
.eq((r
.state
== State
.WAIT_ACK
)
506 & (req_index
== r
.store_index
)
507 & r
.rows_valid
[req_row
% ROW_PER_LINE
])
509 cvb
= Signal(NUM_WAYS
)
510 ctag
= Signal(TAG_RAM_WIDTH
)
511 comb
+= ctag
.eq(cache_tags
[req_index
])
512 comb
+= cvb
.eq(cache_valid_bits
[req_index
])
513 for i
in range(NUM_WAYS
):
514 tagi
= Signal(TAG_BITS
, name
="tag_i%d" % i
)
515 comb
+= tagi
.eq(read_tag(i
, ctag
))
516 hit_test
= Signal(name
="hit_test%d" % i
)
517 comb
+= hit_test
.eq(i
== r
.store_way
)
518 with m
.If((cvb
[i
] |
(hitcond
& hit_test
))
519 & (tagi
== req_tag
)):
520 comb
+= hit_way
.eq(i
)
523 # Generate the "hit" and "miss" signals
524 # for the synchronous blocks
525 with m
.If(i_in
.req
& access_ok
& ~flush_in
):
526 comb
+= req_is_hit
.eq(is_hit
)
527 comb
+= req_is_miss
.eq(~is_hit
)
530 comb
+= req_is_hit
.eq(0)
531 comb
+= req_is_miss
.eq(0)
533 comb
+= req_hit_way
.eq(hit_way
)
535 # The way to replace on a miss
536 with m
.If(r
.state
== State
.CLR_TAG
):
537 comb
+= replace_way
.eq(plru_victim
[r
.store_index
])
539 comb
+= replace_way
.eq(r
.store_way
)
541 # Output instruction from current cache row
543 # Note: This is a mild violation of our design principle of
544 # having pipeline stages output from a clean latch. In this
545 # case we output the result of a mux. The alternative would
546 # be output an entire row which I prefer not to do just yet
547 # as it would force fetch2 to know about some of the cache
548 # geometry information.
549 comb
+= i_out
.insn
.eq(read_insn_word(r
.hit_nia
, cache_out_row
))
550 comb
+= i_out
.valid
.eq(r
.hit_valid
)
551 comb
+= i_out
.nia
.eq(r
.hit_nia
)
552 comb
+= i_out
.stop_mark
.eq(r
.hit_smark
)
553 comb
+= i_out
.fetch_failed
.eq(r
.fetch_failed
)
555 # Stall fetch1 if we have a miss on cache or TLB
556 # or a protection fault
557 comb
+= stall_out
.eq(~
(is_hit
& access_ok
))
559 # Wishbone requests output (from the cache miss reload machine)
560 comb
+= wb_out
.eq(r
.wb
)
562 # Cache hit synchronous machine
563 def icache_hit(self
, m
, use_previous
, r
, req_is_hit
, req_hit_way
,
564 req_index
, req_tag
, real_addr
):
567 i_in
, stall_in
= self
.i_in
, self
.stall_in
568 flush_in
= self
.flush_in
570 # keep outputs to fetch2 unchanged on a stall
571 # except that flush or reset sets valid to 0
572 # If use_previous, keep the same data as last
573 # cycle and use the second half
574 with m
.If(stall_in | use_previous
):
576 sync
+= r
.hit_valid
.eq(0)
578 # On a hit, latch the request for the next cycle,
579 # when the BRAM data will be available on the
580 # cache_out output of the corresponding way
581 sync
+= r
.hit_valid
.eq(req_is_hit
)
583 with m
.If(req_is_hit
):
584 sync
+= r
.hit_way
.eq(req_hit_way
)
586 "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x " \
587 "way:%x RA:%x", i_in
.nia
, i_in
.virt_mode
, \
588 i_in
.stop_mark
, req_index
, req_tag
, \
589 req_hit_way
, real_addr
594 with m
.If(~stall_in
):
595 # Send stop marks and NIA down regardless of validity
596 sync
+= r
.hit_smark
.eq(i_in
.stop_mark
)
597 sync
+= r
.hit_nia
.eq(i_in
.nia
)
599 def icache_miss_idle(self
, m
, r
, req_is_miss
, req_laddr
,
600 req_index
, req_tag
, replace_way
, real_addr
):
606 # Reset per-row valid flags, only used in WAIT_ACK
607 for i
in range(ROW_PER_LINE
):
608 sync
+= r
.rows_valid
[i
].eq(0)
610 # We need to read a cache line
611 with m
.If(req_is_miss
):
613 "cache miss nia:%x IR:%x SM:%x idx:%x "
614 " way:%x tag:%x RA:%x", i_in
.nia
,
615 i_in
.virt_mode
, i_in
.stop_mark
, req_index
,
616 replace_way
, req_tag
, real_addr
619 # Keep track of our index and way for subsequent stores
620 st_row
= Signal(BRAM_ROWS
)
621 comb
+= st_row
.eq(get_row(req_laddr
))
622 sync
+= r
.store_index
.eq(req_index
)
623 sync
+= r
.store_row
.eq(st_row
)
624 sync
+= r
.store_tag
.eq(req_tag
)
625 sync
+= r
.store_valid
.eq(1)
626 sync
+= r
.end_row_ix
.eq(get_row_of_line(st_row
) - 1)
628 # Prep for first wishbone read. We calculate the address
629 # of the start of the cache line and start the WB cycle.
630 sync
+= r
.req_adr
.eq(req_laddr
)
631 sync
+= r
.wb
.cyc
.eq(1)
632 sync
+= r
.wb
.stb
.eq(1)
634 # Track that we had one request sent
635 sync
+= r
.state
.eq(State
.CLR_TAG
)
637 def icache_miss_clr_tag(self
, m
, r
, replace_way
,
638 cache_valid_bits
, req_index
,
644 # Get victim way from plru
645 sync
+= r
.store_way
.eq(replace_way
)
646 # Force misses on that way while reloading that line
647 cv
= Signal(INDEX_BITS
)
648 comb
+= cv
.eq(cache_valid_bits
[req_index
])
649 comb
+= cv
.bit_select(replace_way
, 1).eq(0)
650 sync
+= cache_valid_bits
[req_index
].eq(cv
)
652 for i
in range(NUM_WAYS
):
653 with m
.If(i
== replace_way
):
654 comb
+= tagset
.eq(cache_tags
[r
.store_index
])
655 comb
+= write_tag(i
, tagset
, r
.store_tag
)
656 sync
+= cache_tags
[r
.store_index
].eq(tagset
)
658 sync
+= r
.state
.eq(State
.WAIT_ACK
)
660 def icache_miss_wait_ack(self
, m
, r
, replace_way
, inval_in
,
661 stbs_done
, cache_valid_bits
):
667 # Requests are all sent if stb is 0
669 comb
+= stbs_zero
.eq(r
.wb
.stb
== 0)
670 comb
+= stbs_done
.eq(stbs_zero
)
672 # If we are still sending requests, was one accepted?
673 with m
.If(~wb_in
.stall
& ~stbs_zero
):
674 # That was the last word ? # We are done sending.
675 # Clear stb and set stbs_done # so we can handle
676 # an eventual last ack on # the same cycle.
677 with m
.If(is_last_row_addr(r
.req_adr
, r
.end_row_ix
)):
679 "IS_LAST_ROW_ADDR r.wb.addr:%x " \
680 "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x " \
681 "stbs_done:%x", r
.wb
.adr
, r
.end_row_ix
,
682 r
.wb
.stb
, stbs_zero
, stbs_done
684 sync
+= r
.wb
.stb
.eq(0)
685 comb
+= stbs_done
.eq(1)
687 # Calculate the next row address
688 rarange
= Signal(LINE_OFF_BITS
- ROW_OFF_BITS
)
690 r
.req_adr
[ROW_OFF_BITS
:LINE_OFF_BITS
] + 1
692 sync
+= r
.req_adr
[ROW_OFF_BITS
:LINE_OFF_BITS
].eq(
695 sync
+= Display("RARANGE r.req_adr:%x rarange:%x "
696 "stbs_zero:%x stbs_done:%x",
697 r
.req_adr
, rarange
, stbs_zero
, stbs_done
)
699 # Incoming acks processing
700 with m
.If(wb_in
.ack
):
701 sync
+= Display("WB_IN_ACK data:%x stbs_zero:%x "
703 wb_in
.dat
, stbs_zero
, stbs_done
)
705 sync
+= r
.rows_valid
[r
.store_row
% ROW_PER_LINE
].eq(1)
707 # Check for completion
708 with m
.If(stbs_done
&
709 is_last_row(r
.store_row
, r
.end_row_ix
)):
710 # Complete wishbone cycle
711 sync
+= r
.wb
.cyc
.eq(0)
712 sync
+= r
.req_adr
.eq(0) # be nice, clear addr
714 # Cache line is now valid
715 cv
= Signal(INDEX_BITS
)
716 comb
+= cv
.eq(cache_valid_bits
[r
.store_index
])
717 comb
+= cv
.bit_select(replace_way
, 1).eq(
718 r
.store_valid
& ~inval_in
720 sync
+= cache_valid_bits
[r
.store_index
].eq(cv
)
722 sync
+= r
.state
.eq(State
.IDLE
)
724 # not completed, move on to next request in row
726 # Increment store row counter
727 sync
+= r
.store_row
.eq(next_row(r
.store_row
))
730 # Cache miss/reload synchronous machine
731 def icache_miss(self
, m
, cache_valid_bits
, r
, req_is_miss
,
732 req_index
, req_laddr
, req_tag
, replace_way
,
733 cache_tags
, access_ok
, real_addr
):
737 i_in
, wb_in
, m_in
= self
.i_in
, self
.wb_in
, self
.m_in
738 stall_in
, flush_in
= self
.stall_in
, self
.flush_in
739 inval_in
= self
.inval_in
741 tagset
= Signal(TAG_RAM_WIDTH
)
744 comb
+= r
.wb
.sel
.eq(-1)
745 comb
+= r
.wb
.adr
.eq(r
.req_adr
[3:])
747 # Process cache invalidations
749 for i
in range(NUM_LINES
):
750 sync
+= cache_valid_bits
[i
].eq(0)
751 sync
+= r
.store_valid
.eq(0)
754 with m
.Switch(r
.state
):
756 with m
.Case(State
.IDLE
):
757 self
.icache_miss_idle(
758 m
, r
, req_is_miss
, req_laddr
,
759 req_index
, req_tag
, replace_way
,
763 with m
.Case(State
.CLR_TAG
, State
.WAIT_ACK
):
764 with m
.If(r
.state
== State
.CLR_TAG
):
765 self
.icache_miss_clr_tag(
767 cache_valid_bits
, req_index
,
771 self
.icache_miss_wait_ack(
772 m
, r
, replace_way
, inval_in
,
773 stbs_done
, cache_valid_bits
776 # TLB miss and protection fault processing
777 with m
.If(flush_in | m_in
.tlbld
):
778 sync
+= r
.fetch_failed
.eq(0)
779 with m
.Elif(i_in
.req
& ~access_ok
& ~stall_in
):
780 sync
+= r
.fetch_failed
.eq(1)
782 # icache_log: if LOG_LENGTH > 0 generate
783 def icache_log(self
, m
, req_hit_way
, ra_valid
, access_ok
,
784 req_is_miss
, req_is_hit
, lway
, wstate
, r
):
788 wb_in
, i_out
= self
.wb_in
, self
.i_out
789 log_out
, stall_out
= self
.log_out
, self
.stall_out
791 # Output data to logger
792 for i
in range(LOG_LENGTH
):
793 # Output data to logger
794 log_data
= Signal(54)
795 lway
= Signal(NUM_WAYS
)
798 sync
+= lway
.eq(req_hit_way
)
801 with m
.If(r
.state
!= State
.IDLE
):
804 sync
+= log_data
.eq(Cat(
805 ra_valid
, access_ok
, req_is_miss
, req_is_hit
,
806 lway
, wstate
, r
.hit_nia
[2:6], r
.fetch_failed
,
807 stall_out
, wb_in
.stall
, r
.wb
.cyc
, r
.wb
.stb
,
808 r
.real_addr
[3:6], wb_in
.ack
, i_out
.insn
, i_out
.valid
810 comb
+= log_out
.eq(log_data
)
812 def elaborate(self
, platform
):
817 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
818 cache_tags
= CacheTagArray()
819 cache_valid_bits
= CacheValidBitsArray()
821 itlb_valid_bits
= TLBValidBitsArray()
822 itlb_tags
= TLBTagArray()
823 itlb_ptes
= TLBPtesArray()
824 # TODO to be passed to nmigen as ram attributes
825 # attribute ram_style of itlb_tags : signal is "distributed";
826 # attribute ram_style of itlb_ptes : signal is "distributed";
828 # Privilege bit from PTE EAA field
833 # Async signal on incoming request
834 req_index
= Signal(NUM_LINES
)
835 req_row
= Signal(BRAM_ROWS
)
836 req_hit_way
= Signal(NUM_WAYS
)
837 req_tag
= Signal(TAG_BITS
)
838 req_is_hit
= Signal()
839 req_is_miss
= Signal()
840 req_laddr
= Signal(64)
842 tlb_req_index
= Signal(TLB_SIZE
)
843 real_addr
= Signal(REAL_ADDR_BITS
)
845 priv_fault
= Signal()
847 use_previous
= Signal()
849 cache_out_row
= Signal(ROW_SIZE_BITS
)
851 plru_victim
= PLRUOut()
852 replace_way
= Signal(NUM_WAYS
)
854 # call sub-functions putting everything together,
855 # using shared signals established above
856 self
.rams(m
, r
, cache_out_row
, use_previous
, replace_way
, req_row
)
857 self
.maybe_plrus(m
, r
, plru_victim
)
858 self
.itlb_lookup(m
, tlb_req_index
, itlb_ptes
, itlb_tags
, real_addr
,
859 itlb_valid_bits
, ra_valid
, eaa_priv
, priv_fault
,
861 self
.itlb_update(m
, itlb_valid_bits
, itlb_tags
, itlb_ptes
)
862 self
.icache_comb(m
, use_previous
, r
, req_index
, req_row
, req_hit_way
,
863 req_tag
, real_addr
, req_laddr
, cache_valid_bits
,
864 cache_tags
, access_ok
, req_is_hit
, req_is_miss
,
865 replace_way
, plru_victim
, cache_out_row
)
866 self
.icache_hit(m
, use_previous
, r
, req_is_hit
, req_hit_way
,
867 req_index
, req_tag
, real_addr
)
868 self
.icache_miss(m
, cache_valid_bits
, r
, req_is_miss
, req_index
,
869 req_laddr
, req_tag
, replace_way
, cache_tags
,
870 access_ok
, real_addr
)
871 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
872 # req_is_miss, req_is_hit, lway, wstate, r)
882 yield i_in
.valid
.eq(0)
883 yield i_out
.priv_mode
.eq(1)
884 yield i_out
.req
.eq(0)
885 yield i_out
.nia
.eq(0)
886 yield i_out
.stop_mark
.eq(0)
887 yield m_out
.tlbld
.eq(0)
888 yield m_out
.tlbie
.eq(0)
889 yield m_out
.addr
.eq(0)
890 yield m_out
.pte
.eq(0)
895 yield i_out
.req
.eq(1)
896 yield i_out
.nia
.eq(Const(0x0000000000000004, 64))
900 valid
= yield i_in
.valid
901 nia
= yield i_out
.nia
902 insn
= yield i_in
.insn
903 print(f
"valid? {valid}")
905 assert insn
== 0x00000001, \
906 "insn @%x=%x expected 00000001" % (nia
, insn
)
907 yield i_out
.req
.eq(0)
913 yield i_out
.req
.eq(1)
914 yield i_out
.nia
.eq(Const(0x0000000000000008, 64))
917 valid
= yield i_in
.valid
919 insn
= yield i_in
.insn
921 assert insn
== 0x00000002, \
922 "insn @%x=%x expected 00000002" % (nia
, insn
)
926 yield i_out
.req
.eq(1)
927 yield i_out
.nia
.eq(Const(0x0000000000000040, 64))
931 valid
= yield i_in
.valid
932 nia
= yield i_out
.nia
933 insn
= yield i_in
.insn
935 assert insn
== 0x00000010, \
936 "insn @%x=%x expected 00000010" % (nia
, insn
)
938 # test something that aliases
939 yield i_out
.req
.eq(1)
940 yield i_out
.nia
.eq(Const(0x0000000000000100, 64))
943 valid
= yield i_in
.valid
948 insn
= yield i_in
.insn
949 valid
= yield i_in
.valid
950 insn
= yield i_in
.insn
952 assert insn
== 0x00000040, \
953 "insn @%x=%x expected 00000040" % (nia
, insn
)
954 yield i_out
.req
.eq(0)
958 def test_icache(mem
):
961 memory
= Memory(width
=64, depth
=512, init
=mem
)
962 sram
= SRAM(memory
=memory
, granularity
=8)
966 m
.submodules
.icache
= dut
967 m
.submodules
.sram
= sram
969 m
.d
.comb
+= sram
.bus
.cyc
.eq(dut
.wb_out
.cyc
)
970 m
.d
.comb
+= sram
.bus
.stb
.eq(dut
.wb_out
.stb
)
971 m
.d
.comb
+= sram
.bus
.we
.eq(dut
.wb_out
.we
)
972 m
.d
.comb
+= sram
.bus
.sel
.eq(dut
.wb_out
.sel
)
973 m
.d
.comb
+= sram
.bus
.adr
.eq(dut
.wb_out
.adr
)
974 m
.d
.comb
+= sram
.bus
.dat_w
.eq(dut
.wb_out
.dat
)
976 m
.d
.comb
+= dut
.wb_in
.ack
.eq(sram
.bus
.ack
)
977 m
.d
.comb
+= dut
.wb_in
.dat
.eq(sram
.bus
.dat_r
)
983 sim
.add_sync_process(wrap(icache_sim(dut
)))
984 with sim
.write_vcd('test_icache.vcd'):
987 if __name__
== '__main__':
989 vl
= rtlil
.convert(dut
, ports
=[])
990 with
open("test_icache.il", "w") as f
:
995 mem
.append((i
*2)|
((i
*2+1)<<32))