3 based on Anton Blanchard microwatt icache.vhdl
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allows for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
29 from enum
import (Enum
, unique
)
30 from nmigen
import (Module
, Signal
, Elaboratable
, Cat
, Array
, Const
, Repl
,
32 from nmigen
.cli
import main
, rtlil
33 from nmutil
.iocontrol
import RecordObject
34 from nmigen
.utils
import log2_int
35 from nmigen
.lib
.coding
import Decoder
36 from nmutil
.util
import Display
38 #from nmutil.plru import PLRU
39 from soc
.experiment
.plru
import PLRU
, PLRUs
40 from soc
.experiment
.cache_ram
import CacheRam
42 from soc
.experiment
.mem_types
import (Fetch1ToICacheType
,
46 from soc
.experiment
.wb_types
import (WB_ADDR_BITS
, WB_DATA_BITS
,
47 WB_SEL_BITS
, WBAddrType
, WBDataType
,
48 WBSelType
, WBMasterOut
, WBSlaveOut
,
51 from nmigen_soc
.wishbone
.bus
import Interface
52 from soc
.minerva
.units
.fetch
import FetchUnitInterface
56 from soc
.bus
.sram
import SRAM
57 from nmigen
import Memory
58 from nmutil
.util
import wrap
59 from nmigen
.cli
import main
, rtlil
61 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
62 # Also, check out the cxxsim nmigen branch, and latest yosys from git
63 from nmutil
.sim_tmp_alternative
import Simulator
, Settle
68 # BRAM organisation: We never access more than wishbone_data_bits
69 # at a time so to save resources we make the array only that wide,
70 # and use consecutive indices to make a cache "line"
72 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
73 ROW_SIZE
= WB_DATA_BITS
// 8
74 # Number of lines in a set
78 # L1 ITLB number of entries (direct mapped)
80 # L1 ITLB log_2(page_size)
82 # Number of real address bits that we store
84 # Non-zero to enable log data collection
87 ROW_SIZE_BITS
= ROW_SIZE
* 8
88 # ROW_PER_LINE is the number of row (wishbone) transactions in a line
89 ROW_PER_LINE
= LINE_SIZE
// ROW_SIZE
90 # BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
91 BRAM_ROWS
= NUM_LINES
* ROW_PER_LINE
92 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
93 INSN_PER_ROW
= ROW_SIZE_BITS
// 32
95 # Bit fields counts in the address
97 # INSN_BITS is the number of bits to select an instruction in a row
98 INSN_BITS
= log2_int(INSN_PER_ROW
)
99 # ROW_BITS is the number of bits to select a row
100 ROW_BITS
= log2_int(BRAM_ROWS
)
101 # ROW_LINE_BITS is the number of bits to select a row within a line
102 ROW_LINE_BITS
= log2_int(ROW_PER_LINE
)
103 # LINE_OFF_BITS is the number of bits for the offset in a cache line
104 LINE_OFF_BITS
= log2_int(LINE_SIZE
)
105 # ROW_OFF_BITS is the number of bits for the offset in a row
106 ROW_OFF_BITS
= log2_int(ROW_SIZE
)
107 # INDEX_BITS is the number of bits to select a cache line
108 INDEX_BITS
= log2_int(NUM_LINES
)
109 # SET_SIZE_BITS is the log base 2 of the set size
110 SET_SIZE_BITS
= LINE_OFF_BITS
+ INDEX_BITS
111 # TAG_BITS is the number of bits of the tag part of the address
112 TAG_BITS
= REAL_ADDR_BITS
- SET_SIZE_BITS
113 # TAG_WIDTH is the width in bits of each way of the tag RAM
114 TAG_WIDTH
= TAG_BITS
+ 7 - ((TAG_BITS
+ 7) % 8)
116 # WAY_BITS is the number of bits to select a way
117 WAY_BITS
= log2_int(NUM_WAYS
)
118 TAG_RAM_WIDTH
= TAG_BITS
* NUM_WAYS
121 TLB_BITS
= log2_int(TLB_SIZE
)
122 TLB_EA_TAG_BITS
= 64 - (TLB_LG_PGSZ
+ TLB_BITS
)
# Dump the derived cache geometry at import time so simulation and
# synthesis logs record exactly which configuration was built.
print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
# was a duplicated "TAG_BITS" print; TAG_WIDTH (the byte-rounded per-way
# tag width) was never reported, which is almost certainly what this
# copy-pasted line was meant to show
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)
153 # from microwatt/utils.vhdl
155 return n
!= 0 and (n
& (n
- 1)) == 0
# Sanity-check the derived cache geometry at import time: every constant
# must be internally consistent, otherwise the address slicing helpers
# below silently extract the wrong fields.
# Fix: the first assert previously had no diagnostic message, unlike all
# of its siblings; give it one for consistency.
assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not a multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"
171 # Example of layout for 32 lines of 64 bytes:
173 # .. tag |index| line |
175 # .. | | | |00| zero (2)
176 # .. | | |-| | INSN_BITS (1)
177 # .. | |---| | ROW_LINE_BITS (3)
178 # .. | |--- - --| LINE_OFF_BITS (6)
179 # .. | |- --| ROW_OFF_BITS (3)
180 # .. |----- ---| | ROW_BITS (8)
181 # .. |-----| | INDEX_BITS (5)
182 # .. --------| | TAG_BITS (53)
184 # The cache data BRAM organized as described above for each way
185 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
187 # The cache tags LUTRAM has a row per set. Vivado is a pain and will
188 # not handle a clean (commented) definition of the cache tags as a 3d
189 # memory. For now, work around it by putting all the tags
191 tag_layout
= [('valid', NUM_WAYS
),
192 ('tag', TAG_RAM_WIDTH
),
194 return Array(Record(tag_layout
, name
="tag%d" % x
) for x
in range(NUM_LINES
))
def RowPerLineValidArray():
    """Build an Array of single-bit Signals, one per row of a cache line
    (ROW_PER_LINE entries), used as per-row "loaded" flags during reload."""
    valids = []
    for x in range(ROW_PER_LINE):
        valids.append(Signal(name="rows_valid_%d" %x))
    return Array(valids)
201 # TODO to be passed to nigmen as ram attributes
202 # attribute ram_style : string;
203 # attribute ram_style of cache_tags : signal is "distributed";
206 tlb_layout
= [('valid', 1),
207 ('tag', TLB_EA_TAG_BITS
),
208 ('pte', TLB_PTE_BITS
)
210 return Array(Record(tlb_layout
, name
="tlb%d" % x
) for x
in range(TLB_SIZE
))
212 # Cache RAM interface
214 return Array(Signal(ROW_SIZE_BITS
, name
="cache_out_%d" %x) \
215 for x
in range(NUM_WAYS
))
217 # PLRU output interface
219 return Array(Signal(WAY_BITS
, name
="plru_out_%d" %x) \
220 for x
in range(NUM_LINES
))
222 # Return the cache line index (tag index) for an address
224 return addr
[LINE_OFF_BITS
:SET_SIZE_BITS
]
226 # Return the cache row index (data memory) for an address
228 return addr
[ROW_OFF_BITS
:SET_SIZE_BITS
]
230 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of *row* within its cache line (ROW_LINE_BITS wide).

    The intermediate truncation to ROW_BITS mirrors the original's
    double slice; the final slice keeps only the row-within-line field.
    """
    row_idx = row[:ROW_BITS]
    return row_idx[:ROW_LINE_BITS]
234 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """True when the row-within-line field of address *addr* equals *last*,
    the index of the final row in a cache line."""
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
238 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """True when BRAM row index *row* is the last row of its cache line."""
    row_in_line = get_row_of_line(row)
    return row_in_line == last
242 # Return the next row in the current cache line. We use a dedicated
243 # function in order to limit the size of the generated adder to be
244 # only the bits within a cache line (3 bits with default settings)
246 row_v
= row
[0:ROW_LINE_BITS
] + 1
247 return Cat(row_v
[:ROW_LINE_BITS
], row
[ROW_LINE_BITS
:])
249 # Read the instruction word for the given address
250 # in the current cache row
def read_insn_word(addr, data):
    """Mux out the 32-bit instruction selected by *addr* from cache row
    *data* (the row currently being read out of the BRAM)."""
    insn_sel = addr[2:INSN_BITS+2]
    return data.word_select(insn_sel, 32)
255 # Get the tag value from the address
257 return addr
[SET_SIZE_BITS
:REAL_ADDR_BITS
]
259 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Extract the TAG_BITS-wide tag of *way* from packed tag row *tagset*.

    Ways are selected TAG_WIDTH (byte-rounded tag width) apart, then the
    padding bits above TAG_BITS are dropped.
    """
    # NOTE(review): stride here is TAG_WIDTH, but TAG_RAM_WIDTH above is
    # sized TAG_BITS * NUM_WAYS -- confirm the packed row really reserves
    # TAG_WIDTH per way, otherwise the top way's slot overruns the row.
    slot = tagset.word_select(way, TAG_WIDTH)
    return slot[:TAG_BITS]
263 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Return an assignment that stores *tag* into the slot of *way*
    within packed tag row *tagset* (slot located via read_tag)."""
    slot = read_tag(way, tagset)
    return slot.eq(tag)
267 # Simple hash for direct-mapped TLB index
269 hsh
= addr
[TLB_LG_PGSZ
:TLB_LG_PGSZ
+ TLB_BITS
] ^ addr
[
270 TLB_LG_PGSZ
+ TLB_BITS
:TLB_LG_PGSZ
+ 2 * TLB_BITS
272 TLB_LG_PGSZ
+ 2 * TLB_BITS
:TLB_LG_PGSZ
+ 3 * TLB_BITS
277 # Cache reload state machine
285 class RegInternal(RecordObject
):
288 # Cache hit state (Latches for 1 cycle BRAM access)
289 self
.hit_way
= Signal(WAY_BITS
)
290 self
.hit_nia
= Signal(64)
291 self
.hit_smark
= Signal()
292 self
.hit_valid
= Signal()
294 # Cache miss state (reload state machine)
295 self
.state
= Signal(State
, reset
=State
.IDLE
)
296 self
.wb
= WBMasterOut("wb")
297 self
.req_adr
= Signal(64)
298 self
.store_way
= Signal(WAY_BITS
)
299 self
.store_index
= Signal(INDEX_BITS
)
300 self
.store_row
= Signal(ROW_BITS
)
301 self
.store_tag
= Signal(TAG_BITS
)
302 self
.store_valid
= Signal()
303 self
.end_row_ix
= Signal(ROW_LINE_BITS
)
304 self
.rows_valid
= RowPerLineValidArray()
307 self
.fetch_failed
= Signal()
310 class ICache(FetchUnitInterface
, Elaboratable
):
311 """64 bit direct mapped icache. All instructions are 4B aligned."""
312 def __init__(self
, pspec
):
313 FetchUnitInterface
.__init
__(self
, pspec
)
314 self
.i_in
= Fetch1ToICacheType(name
="i_in")
315 self
.i_out
= ICacheToDecode1Type(name
="i_out")
317 self
.m_in
= MMUToICacheType(name
="m_in")
319 self
.stall_in
= Signal()
320 self
.stall_out
= Signal()
321 self
.flush_in
= Signal()
322 self
.inval_in
= Signal()
324 # standard naming (wired to non-standard for compatibility)
325 self
.bus
= Interface(addr_width
=32,
332 self
.log_out
= Signal(54)
334 # use FetchUnitInterface, helps keep some unit tests running
335 self
.use_fetch_iface
= False
    def use_fetch_interface(self):
        """Enable driving the FetchUnitInterface ports.

        When set, elaborate() wires i_in/i_out through the standard
        FetchUnitInterface signals (a_pc_i, f_instr_o, ...); when left
        False they stay unconnected so legacy unit tests keep working.
        """
        self.use_fetch_iface = True
340 # Generate a cache RAM for each way
341 def rams(self
, m
, r
, cache_out_row
, use_previous
,
342 replace_way
, req_row
):
347 bus
, stall_in
= self
.bus
, self
.stall_in
349 # read condition (for every cache ram)
351 comb
+= do_read
.eq(~
(stall_in | use_previous
))
353 rd_addr
= Signal(ROW_BITS
)
354 wr_addr
= Signal(ROW_BITS
)
355 comb
+= rd_addr
.eq(req_row
)
356 comb
+= wr_addr
.eq(r
.store_row
)
358 # binary-to-unary converters: replace-way enabled by bus.ack,
359 # hit-way left permanently enabled
360 m
.submodules
.replace_way_e
= re
= Decoder(NUM_WAYS
)
361 m
.submodules
.hit_way_e
= he
= Decoder(NUM_WAYS
)
362 comb
+= re
.i
.eq(replace_way
)
363 comb
+= re
.n
.eq(~bus
.ack
)
364 comb
+= he
.i
.eq(r
.hit_way
)
366 for i
in range(NUM_WAYS
):
367 do_write
= Signal(name
="do_wr_%d" % i
)
368 d_out
= Signal(ROW_SIZE_BITS
, name
="d_out_%d" % i
)
369 wr_sel
= Signal(ROW_SIZE
, name
="wr_sel_%d" % i
)
371 way
= CacheRam(ROW_BITS
, ROW_SIZE_BITS
, TRACE
=True, ram_num
=i
)
372 m
.submodules
["cacheram_%d" % i
] = way
374 comb
+= way
.rd_en
.eq(do_read
)
375 comb
+= way
.rd_addr
.eq(rd_addr
)
376 comb
+= d_out
.eq(way
.rd_data_o
)
377 comb
+= way
.wr_sel
.eq(wr_sel
)
378 comb
+= way
.wr_addr
.eq(wr_addr
)
379 comb
+= way
.wr_data
.eq(bus
.dat_r
)
381 comb
+= do_write
.eq(re
.o
[i
])
384 sync
+= Display("cache write adr: %x data: %lx",
385 wr_addr
, way
.wr_data
)
388 comb
+= cache_out_row
.eq(d_out
)
390 sync
+= Display("cache read adr: %x data: %x",
393 comb
+= wr_sel
.eq(Repl(do_write
, ROW_SIZE
))
396 def maybe_plrus(self
, m
, r
, plru_victim
):
403 m
.submodules
.plrus
= plru
= PLRUs(NUM_LINES
, WAY_BITS
)
404 comb
+= plru
.way
.eq(r
.hit_way
)
405 comb
+= plru
.valid
.eq(r
.hit_valid
)
406 comb
+= plru
.index
.eq(get_index(r
.hit_nia
))
407 comb
+= plru
.isel
.eq(r
.store_index
) # select victim
408 comb
+= plru_victim
.eq(plru
.o_index
) # selected victim
410 # TLB hit detection and real address generation
411 def itlb_lookup(self
, m
, tlb_req_index
, itlb
,
412 real_addr
, ra_valid
, eaa_priv
,
413 priv_fault
, access_ok
):
419 pte
= Signal(TLB_PTE_BITS
)
420 ttag
= Signal(TLB_EA_TAG_BITS
)
422 comb
+= tlb_req_index
.eq(hash_ea(i_in
.nia
))
423 comb
+= pte
.eq(itlb
[tlb_req_index
].pte
)
424 comb
+= ttag
.eq(itlb
[tlb_req_index
].tag
)
426 with m
.If(i_in
.virt_mode
):
427 comb
+= real_addr
.eq(Cat(
428 i_in
.nia
[:TLB_LG_PGSZ
],
429 pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]
432 with m
.If(ttag
== i_in
.nia
[TLB_LG_PGSZ
+ TLB_BITS
:64]):
433 comb
+= ra_valid
.eq(itlb
[tlb_req_index
].valid
)
435 comb
+= eaa_priv
.eq(pte
[3])
438 comb
+= real_addr
.eq(i_in
.nia
[:REAL_ADDR_BITS
])
439 comb
+= ra_valid
.eq(1)
440 comb
+= eaa_priv
.eq(1)
442 # No IAMR, so no KUEP support for now
443 comb
+= priv_fault
.eq(eaa_priv
& ~i_in
.priv_mode
)
444 comb
+= access_ok
.eq(ra_valid
& ~priv_fault
)
447 def itlb_update(self
, m
, itlb
):
453 wr_index
= Signal(TLB_SIZE
)
454 comb
+= wr_index
.eq(hash_ea(m_in
.addr
))
456 with m
.If(m_in
.tlbie
& m_in
.doall
):
457 # Clear all valid bits
458 for i
in range(TLB_SIZE
):
459 sync
+= itlb
[i
].valid
.eq(0)
461 with m
.Elif(m_in
.tlbie
):
462 # Clear entry regardless of hit or miss
463 sync
+= itlb
[wr_index
].valid
.eq(0)
465 with m
.Elif(m_in
.tlbld
):
466 sync
+= itlb
[wr_index
].tag
.eq(m_in
.addr
[TLB_LG_PGSZ
+ TLB_BITS
:64])
467 sync
+= itlb
[wr_index
].pte
.eq(m_in
.pte
)
468 sync
+= itlb
[wr_index
].valid
.eq(1)
470 # Cache hit detection, output to fetch2 and other misc logic
471 def icache_comb(self
, m
, use_previous
, r
, req_index
, req_row
,
472 req_hit_way
, req_tag
, real_addr
, req_laddr
,
473 cache_tags
, access_ok
,
474 req_is_hit
, req_is_miss
, replace_way
,
475 plru_victim
, cache_out_row
):
479 i_in
, i_out
, bus
= self
.i_in
, self
.i_out
, self
.bus
480 flush_in
, stall_out
= self
.flush_in
, self
.stall_out
483 hit_way
= Signal(WAY_BITS
)
485 # i_in.sequential means that i_in.nia this cycle is 4 more than
486 # last cycle. If we read more than 32 bits at a time, had a
487 # cache hit last cycle, and we don't want the first 32-bit chunk
488 # then we can keep the data we read last cycle and just use that.
489 with m
.If(i_in
.nia
[2:INSN_BITS
+2] != 0):
490 comb
+= use_previous
.eq(i_in
.sequential
& r
.hit_valid
)
492 # Extract line, row and tag from request
493 comb
+= req_index
.eq(get_index(i_in
.nia
))
494 comb
+= req_row
.eq(get_row(i_in
.nia
))
495 comb
+= req_tag
.eq(get_tag(real_addr
))
497 # Calculate address of beginning of cache row, will be
498 # used for cache miss processing if needed
499 comb
+= req_laddr
.eq(Cat(
500 Const(0, ROW_OFF_BITS
),
501 real_addr
[ROW_OFF_BITS
:REAL_ADDR_BITS
],
504 # Test if pending request is a hit on any way
506 comb
+= hitcond
.eq((r
.state
== State
.WAIT_ACK
)
507 & (req_index
== r
.store_index
)
508 & r
.rows_valid
[req_row
% ROW_PER_LINE
]
510 # i_in.req asserts Decoder active
511 cvb
= Signal(NUM_WAYS
)
512 ctag
= Signal(TAG_RAM_WIDTH
)
513 comb
+= ctag
.eq(cache_tags
[req_index
].tag
)
514 comb
+= cvb
.eq(cache_tags
[req_index
].valid
)
515 m
.submodules
.store_way_e
= se
= Decoder(NUM_WAYS
)
516 comb
+= se
.i
.eq(r
.store_way
)
517 comb
+= se
.n
.eq(~i_in
.req
)
518 for i
in range(NUM_WAYS
):
519 tagi
= Signal(TAG_BITS
, name
="tag_i%d" % i
)
520 hit_test
= Signal(name
="hit_test%d" % i
)
521 is_tag_hit
= Signal(name
="is_tag_hit_%d" % i
)
522 comb
+= tagi
.eq(read_tag(i
, ctag
))
523 comb
+= hit_test
.eq(se
.o
[i
])
524 comb
+= is_tag_hit
.eq((cvb
[i
] |
(hitcond
& hit_test
)) &
526 with m
.If(is_tag_hit
):
527 comb
+= hit_way
.eq(i
)
530 # Generate the "hit" and "miss" signals
531 # for the synchronous blocks
532 with m
.If(i_in
.req
& access_ok
& ~flush_in
):
533 comb
+= req_is_hit
.eq(is_hit
)
534 comb
+= req_is_miss
.eq(~is_hit
)
536 comb
+= req_hit_way
.eq(hit_way
)
538 # The way to replace on a miss
539 with m
.If(r
.state
== State
.CLR_TAG
):
540 comb
+= replace_way
.eq(plru_victim
)
542 comb
+= replace_way
.eq(r
.store_way
)
544 # Output instruction from current cache row
546 # Note: This is a mild violation of our design principle of
547 # having pipeline stages output from a clean latch. In this
548 # case we output the result of a mux. The alternative would
549 # be output an entire row which I prefer not to do just yet
550 # as it would force fetch2 to know about some of the cache
551 # geometry information.
552 comb
+= i_out
.insn
.eq(read_insn_word(r
.hit_nia
, cache_out_row
))
553 comb
+= i_out
.valid
.eq(r
.hit_valid
)
554 comb
+= i_out
.nia
.eq(r
.hit_nia
)
555 comb
+= i_out
.stop_mark
.eq(r
.hit_smark
)
556 comb
+= i_out
.fetch_failed
.eq(r
.fetch_failed
)
558 # Stall fetch1 if we have a miss on cache or TLB
559 # or a protection fault
560 comb
+= stall_out
.eq(~
(is_hit
& access_ok
))
562 # Wishbone requests output (from the cache miss reload machine)
563 comb
+= bus
.we
.eq(r
.wb
.we
)
564 comb
+= bus
.adr
.eq(r
.wb
.adr
)
565 comb
+= bus
.sel
.eq(r
.wb
.sel
)
566 comb
+= bus
.stb
.eq(r
.wb
.stb
)
567 comb
+= bus
.dat_w
.eq(r
.wb
.dat
)
568 comb
+= bus
.cyc
.eq(r
.wb
.cyc
)
570 # Cache hit synchronous machine
571 def icache_hit(self
, m
, use_previous
, r
, req_is_hit
, req_hit_way
,
572 req_index
, req_tag
, real_addr
):
575 i_in
, stall_in
= self
.i_in
, self
.stall_in
576 flush_in
= self
.flush_in
578 # keep outputs to fetch2 unchanged on a stall
579 # except that flush or reset sets valid to 0
580 # If use_previous, keep the same data as last
581 # cycle and use the second half
582 with m
.If(stall_in | use_previous
):
584 sync
+= r
.hit_valid
.eq(0)
586 # On a hit, latch the request for the next cycle,
587 # when the BRAM data will be available on the
588 # cache_out output of the corresponding way
589 sync
+= r
.hit_valid
.eq(req_is_hit
)
591 with m
.If(req_is_hit
):
592 sync
+= r
.hit_way
.eq(req_hit_way
)
593 sync
+= Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
594 "way:%x RA:%x", i_in
.nia
, i_in
.virt_mode
,
595 i_in
.stop_mark
, req_index
, req_tag
,
596 req_hit_way
, real_addr
)
598 with m
.If(~stall_in
):
599 # Send stop marks and NIA down regardless of validity
600 sync
+= r
.hit_smark
.eq(i_in
.stop_mark
)
601 sync
+= r
.hit_nia
.eq(i_in
.nia
)
603 def icache_miss_idle(self
, m
, r
, req_is_miss
, req_laddr
,
604 req_index
, req_tag
, replace_way
, real_addr
):
610 # Reset per-row valid flags, only used in WAIT_ACK
611 for i
in range(ROW_PER_LINE
):
612 sync
+= r
.rows_valid
[i
].eq(0)
614 # We need to read a cache line
615 with m
.If(req_is_miss
):
617 "cache miss nia:%x IR:%x SM:%x idx:%x "
618 " way:%x tag:%x RA:%x", i_in
.nia
,
619 i_in
.virt_mode
, i_in
.stop_mark
, req_index
,
620 replace_way
, req_tag
, real_addr
)
622 # Keep track of our index and way for subsequent stores
623 st_row
= Signal(ROW_BITS
)
624 comb
+= st_row
.eq(get_row(req_laddr
))
625 sync
+= r
.store_index
.eq(req_index
)
626 sync
+= r
.store_row
.eq(st_row
)
627 sync
+= r
.store_tag
.eq(req_tag
)
628 sync
+= r
.store_valid
.eq(1)
629 sync
+= r
.end_row_ix
.eq(get_row_of_line(st_row
) - 1)
631 # Prep for first wishbone read. We calculate the address
632 # of the start of the cache line and start the WB cycle.
633 sync
+= r
.req_adr
.eq(req_laddr
)
634 sync
+= r
.wb
.cyc
.eq(1)
635 sync
+= r
.wb
.stb
.eq(1)
637 # Track that we had one request sent
638 sync
+= r
.state
.eq(State
.CLR_TAG
)
640 def icache_miss_clr_tag(self
, m
, r
, replace_way
,
646 # Get victim way from plru
647 sync
+= r
.store_way
.eq(replace_way
)
649 # Force misses on that way while reloading that line
650 cv
= Signal(INDEX_BITS
)
651 comb
+= cv
.eq(cache_tags
[req_index
].valid
)
652 comb
+= cv
.bit_select(replace_way
, 1).eq(0)
653 sync
+= cache_tags
[req_index
].valid
.eq(cv
)
655 for i
in range(NUM_WAYS
):
656 with m
.If(i
== replace_way
):
657 comb
+= tagset
.eq(cache_tags
[r
.store_index
].tag
)
658 comb
+= write_tag(i
, tagset
, r
.store_tag
)
659 sync
+= cache_tags
[r
.store_index
].tag
.eq(tagset
)
661 sync
+= r
.state
.eq(State
.WAIT_ACK
)
663 def icache_miss_wait_ack(self
, m
, r
, replace_way
, inval_in
,
664 cache_tags
, stbs_done
):
670 # Requests are all sent if stb is 0
672 comb
+= stbs_zero
.eq(r
.wb
.stb
== 0)
673 comb
+= stbs_done
.eq(stbs_zero
)
675 # If we are still sending requests, was one accepted?
676 with m
.If(~bus
.stall
& ~stbs_zero
):
677 # That was the last word? We are done sending.
678 # Clear stb and set stbs_done so we can handle
679 # an eventual last ack on the same cycle.
680 with m
.If(is_last_row_addr(r
.req_adr
, r
.end_row_ix
)):
681 sync
+= Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
682 "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
683 "stbs_done:%x", r
.wb
.adr
, r
.end_row_ix
,
684 r
.wb
.stb
, stbs_zero
, stbs_done
)
685 sync
+= r
.wb
.stb
.eq(0)
686 comb
+= stbs_done
.eq(1)
688 # Calculate the next row address
689 rarange
= Signal(LINE_OFF_BITS
- ROW_OFF_BITS
)
690 comb
+= rarange
.eq(r
.req_adr
[ROW_OFF_BITS
:LINE_OFF_BITS
] + 1)
691 sync
+= r
.req_adr
[ROW_OFF_BITS
:LINE_OFF_BITS
].eq(rarange
)
692 sync
+= Display("RARANGE r.req_adr:%x rarange:%x "
693 "stbs_zero:%x stbs_done:%x",
694 r
.req_adr
, rarange
, stbs_zero
, stbs_done
)
696 # Incoming acks processing
698 sync
+= Display("WB_IN_ACK data:%x stbs_zero:%x "
700 bus
.dat_r
, stbs_zero
, stbs_done
)
702 sync
+= r
.rows_valid
[r
.store_row
% ROW_PER_LINE
].eq(1)
704 # Check for completion
705 with m
.If(stbs_done
& is_last_row(r
.store_row
, r
.end_row_ix
)):
706 # Complete wishbone cycle
707 sync
+= r
.wb
.cyc
.eq(0)
708 # be nice, clear addr
709 sync
+= r
.req_adr
.eq(0)
711 # Cache line is now valid
712 cv
= Signal(INDEX_BITS
)
713 comb
+= cv
.eq(cache_tags
[r
.store_index
].valid
)
714 comb
+= cv
.bit_select(replace_way
, 1).eq(
715 r
.store_valid
& ~inval_in
)
716 sync
+= cache_tags
[r
.store_index
].valid
.eq(cv
)
718 sync
+= r
.state
.eq(State
.IDLE
)
720 # move on to next request in row
721 # Increment store row counter
722 sync
+= r
.store_row
.eq(next_row(r
.store_row
))
724 # Cache miss/reload synchronous machine
725 def icache_miss(self
, m
, r
, req_is_miss
,
726 req_index
, req_laddr
, req_tag
, replace_way
,
727 cache_tags
, access_ok
, real_addr
):
731 i_in
, bus
, m_in
= self
.i_in
, self
.bus
, self
.m_in
732 stall_in
, flush_in
= self
.stall_in
, self
.flush_in
733 inval_in
= self
.inval_in
735 tagset
= Signal(TAG_RAM_WIDTH
)
738 comb
+= r
.wb
.sel
.eq(-1)
739 comb
+= r
.wb
.adr
.eq(r
.req_adr
[3:])
741 # Process cache invalidations
743 for i
in range(NUM_LINES
):
744 sync
+= cache_tags
[i
].valid
.eq(0)
745 sync
+= r
.store_valid
.eq(0)
748 with m
.Switch(r
.state
):
750 with m
.Case(State
.IDLE
):
751 self
.icache_miss_idle(m
, r
, req_is_miss
, req_laddr
,
752 req_index
, req_tag
, replace_way
,
755 with m
.Case(State
.CLR_TAG
, State
.WAIT_ACK
):
756 with m
.If(r
.state
== State
.CLR_TAG
):
757 self
.icache_miss_clr_tag(m
, r
, replace_way
,
758 req_index
, tagset
, cache_tags
)
760 self
.icache_miss_wait_ack(m
, r
, replace_way
, inval_in
,
761 cache_tags
, stbs_done
)
763 # TLB miss and protection fault processing
764 with m
.If(flush_in | m_in
.tlbld
):
765 sync
+= r
.fetch_failed
.eq(0)
766 with m
.Elif(i_in
.req
& ~access_ok
& ~stall_in
):
767 sync
+= r
.fetch_failed
.eq(1)
769 # icache_log: if LOG_LENGTH > 0 generate
770 def icache_log(self
, m
, req_hit_way
, ra_valid
, access_ok
,
771 req_is_miss
, req_is_hit
, lway
, wstate
, r
):
775 bus
, i_out
= self
.bus
, self
.i_out
776 log_out
, stall_out
= self
.log_out
, self
.stall_out
778 # Output data to logger
779 for i
in range(LOG_LENGTH
):
780 log_data
= Signal(54)
781 lway
= Signal(WAY_BITS
)
784 sync
+= lway
.eq(req_hit_way
)
787 with m
.If(r
.state
!= State
.IDLE
):
790 sync
+= log_data
.eq(Cat(
791 ra_valid
, access_ok
, req_is_miss
, req_is_hit
,
792 lway
, wstate
, r
.hit_nia
[2:6], r
.fetch_failed
,
793 stall_out
, bus
.stall
, r
.wb
.cyc
, r
.wb
.stb
,
794 r
.real_addr
[3:6], bus
.ack
, i_out
.insn
, i_out
.valid
796 comb
+= log_out
.eq(log_data
)
798 def elaborate(self
, platform
):
803 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
804 cache_tags
= CacheTagArray()
809 # TODO to be passed to nmigen as ram attributes
810 # attribute ram_style of itlb_tags : signal is "distributed";
811 # attribute ram_style of itlb_ptes : signal is "distributed";
813 # Privilege bit from PTE EAA field
818 # Async signal on incoming request
819 req_index
= Signal(INDEX_BITS
)
820 req_row
= Signal(ROW_BITS
)
821 req_hit_way
= Signal(WAY_BITS
)
822 req_tag
= Signal(TAG_BITS
)
823 req_is_hit
= Signal()
824 req_is_miss
= Signal()
825 req_laddr
= Signal(64)
827 tlb_req_index
= Signal(TLB_BITS
)
828 real_addr
= Signal(REAL_ADDR_BITS
)
830 priv_fault
= Signal()
832 use_previous
= Signal()
834 cache_out_row
= Signal(ROW_SIZE_BITS
)
836 plru_victim
= Signal(WAY_BITS
)
837 replace_way
= Signal(WAY_BITS
)
839 # fake-up the wishbone stall signal to comply with pipeline mode
840 # same thing is done in dcache.py
841 comb
+= self
.bus
.stall
.eq(self
.bus
.cyc
& ~self
.bus
.ack
)
843 # call sub-functions putting everything together,
844 # using shared signals established above
845 self
.rams(m
, r
, cache_out_row
, use_previous
, replace_way
, req_row
)
846 self
.maybe_plrus(m
, r
, plru_victim
)
847 self
.itlb_lookup(m
, tlb_req_index
, itlb
, real_addr
,
848 ra_valid
, eaa_priv
, priv_fault
,
850 self
.itlb_update(m
, itlb
)
851 self
.icache_comb(m
, use_previous
, r
, req_index
, req_row
, req_hit_way
,
852 req_tag
, real_addr
, req_laddr
,
853 cache_tags
, access_ok
, req_is_hit
, req_is_miss
,
854 replace_way
, plru_victim
, cache_out_row
)
855 self
.icache_hit(m
, use_previous
, r
, req_is_hit
, req_hit_way
,
856 req_index
, req_tag
, real_addr
)
857 self
.icache_miss(m
, r
, req_is_miss
, req_index
,
858 req_laddr
, req_tag
, replace_way
, cache_tags
,
859 access_ok
, real_addr
)
860 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
861 # req_is_miss, req_is_hit, lway, wstate, r)
863 # don't connect up to FetchUnitInterface so that some unit tests
864 # can continue to operate
865 if not self
.use_fetch_iface
:
868 # connect to FetchUnitInterface. FetchUnitInterface is undocumented
869 # so needs checking and iterative revising
870 i_in
, bus
, i_out
= self
.i_in
, self
.bus
, self
.i_out
871 comb
+= i_in
.req
.eq(self
.a_i_valid
)
872 comb
+= i_in
.nia
.eq(self
.a_pc_i
)
873 comb
+= self
.stall_in
.eq(self
.a_stall_i
)
874 comb
+= self
.f_fetch_err_o
.eq(i_out
.fetch_failed
)
875 comb
+= self
.f_badaddr_o
.eq(i_out
.nia
)
876 comb
+= self
.f_instr_o
.eq(i_out
.insn
)
877 comb
+= self
.f_busy_o
.eq(~i_out
.valid
) # probably
879 # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
881 comb
+= ibus
.adr
.eq(self
.bus
.adr
)
882 comb
+= ibus
.dat_w
.eq(self
.bus
.dat_w
)
883 comb
+= ibus
.sel
.eq(self
.bus
.sel
)
884 comb
+= ibus
.cyc
.eq(self
.bus
.cyc
)
885 comb
+= ibus
.stb
.eq(self
.bus
.stb
)
886 comb
+= ibus
.we
.eq(self
.bus
.we
)
888 comb
+= self
.bus
.dat_r
.eq(ibus
.dat_r
)
889 comb
+= self
.bus
.ack
.eq(ibus
.ack
)
890 if hasattr(ibus
, "stall"):
891 comb
+= self
.bus
.stall
.eq(ibus
.stall
)
901 yield i_in
.priv_mode
.eq(1)
904 yield i_in
.stop_mark
.eq(0)
905 yield m_out
.tlbld
.eq(0)
906 yield m_out
.tlbie
.eq(0)
907 yield m_out
.addr
.eq(0)
908 yield m_out
.pte
.eq(0)
914 # miss, stalls for a bit
916 yield i_in
.nia
.eq(Const(0x0000000000000004, 64))
918 valid
= yield i_out
.valid
921 valid
= yield i_out
.valid
924 insn
= yield i_out
.insn
925 nia
= yield i_out
.nia
926 assert insn
== 0x00000001, \
927 "insn @%x=%x expected 00000001" % (nia
, insn
)
933 yield i_in
.nia
.eq(Const(0x0000000000000008, 64))
935 valid
= yield i_out
.valid
938 valid
= yield i_out
.valid
941 nia
= yield i_out
.nia
942 insn
= yield i_out
.insn
944 assert insn
== 0x00000002, \
945 "insn @%x=%x expected 00000002" % (nia
, insn
)
949 yield i_in
.nia
.eq(Const(0x0000000000000040, 64))
951 valid
= yield i_out
.valid
954 valid
= yield i_out
.valid
958 insn
= yield i_out
.insn
959 assert insn
== 0x00000010, \
960 "insn @%x=%x expected 00000010" % (nia
, insn
)
962 # test something that aliases (this only works because
963 # the unit test SRAM is a depth of 512)
965 yield i_in
.nia
.eq(Const(0x0000000000000100, 64))
968 valid
= yield i_out
.valid
973 insn
= yield i_out
.insn
974 valid
= yield i_out
.valid
975 insn
= yield i_out
.insn
977 assert insn
== 0x00000040, \
978 "insn @%x=%x expected 00000040" % (nia
, insn
)
982 def test_icache(mem
):
983 from soc
.config
.test
.test_loadstore
import TestMemPspec
984 pspec
= TestMemPspec(addr_wid
=32,
990 memory
= Memory(width
=64, depth
=512, init
=mem
)
991 sram
= SRAM(memory
=memory
, granularity
=8)
995 m
.submodules
.icache
= dut
996 m
.submodules
.sram
= sram
998 m
.d
.comb
+= sram
.bus
.cyc
.eq(dut
.bus
.cyc
)
999 m
.d
.comb
+= sram
.bus
.stb
.eq(dut
.bus
.stb
)
1000 m
.d
.comb
+= sram
.bus
.we
.eq(dut
.bus
.we
)
1001 m
.d
.comb
+= sram
.bus
.sel
.eq(dut
.bus
.sel
)
1002 m
.d
.comb
+= sram
.bus
.adr
.eq(dut
.bus
.adr
)
1003 m
.d
.comb
+= sram
.bus
.dat_w
.eq(dut
.bus
.dat_w
)
1005 m
.d
.comb
+= dut
.bus
.ack
.eq(sram
.bus
.ack
)
1006 m
.d
.comb
+= dut
.bus
.dat_r
.eq(sram
.bus
.dat_r
)
1012 sim
.add_sync_process(wrap(icache_sim(dut
)))
1013 with sim
.write_vcd('test_icache.vcd'):
1017 if __name__
== '__main__':
1018 from soc
.config
.test
.test_loadstore
import TestMemPspec
1019 pspec
= TestMemPspec(addr_wid
=64,
1024 vl
= rtlil
.convert(dut
, ports
=[])
1025 with
open("test_icache.il", "w") as f
:
1028 # set up memory every 32-bits with incrementing values 0 1 2 ...
1030 for i
in range(512):
1031 mem
.append((i
*2) |
((i
*2+1)<<32))