3 based on Anton Blanchard microwatt icache.vhdl
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allows for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
29 from enum
import (Enum
, unique
)
30 from nmigen
import (Module
, Signal
, Elaboratable
, Cat
, Array
, Const
, Repl
,
32 from nmigen
.cli
import main
, rtlil
33 from nmutil
.iocontrol
import RecordObject
34 from nmigen
.utils
import log2_int
35 from nmigen
.lib
.coding
import Decoder
36 from nmutil
.util
import Display
37 from nmutil
.latch
import SRLatch
39 #from nmutil.plru import PLRU
40 from soc
.experiment
.plru
import PLRU
, PLRUs
41 from soc
.experiment
.cache_ram
import CacheRam
43 from soc
.experiment
.mem_types
import (Fetch1ToICacheType
,
47 from soc
.experiment
.wb_types
import (WB_ADDR_BITS
, WB_DATA_BITS
,
48 WB_SEL_BITS
, WBAddrType
, WBDataType
,
49 WBSelType
, WBMasterOut
, WBSlaveOut
,
52 from nmigen_soc
.wishbone
.bus
import Interface
53 from soc
.minerva
.units
.fetch
import FetchUnitInterface
57 from soc
.bus
.sram
import SRAM
58 from nmigen
import Memory
59 from nmutil
.util
import wrap
60 from nmigen
.cli
import main
, rtlil
62 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
63 # Also, check out the cxxsim nmigen branch, and latest yosys from git
64 from nmutil
.sim_tmp_alternative
import Simulator
, Settle
69 # BRAM organisation: We never access more than wishbone_data_bits
70 # at a time so to save resources we make the array only that wide,
71 # and use consecutive indices to make a cache "line"
73 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
74 ROW_SIZE
= WB_DATA_BITS
// 8
75 # Number of lines in a set
79 # L1 ITLB number of entries (direct mapped)
81 # L1 ITLB log_2(page_size)
83 # Number of real address bits that we store
85 # Non-zero to enable log data collection
88 ROW_SIZE_BITS
= ROW_SIZE
* 8
89 # ROW_PER_LINE is the number of row (wishbone) transactions in a line
90 ROW_PER_LINE
= LINE_SIZE
// ROW_SIZE
91 # BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
92 BRAM_ROWS
= NUM_LINES
* ROW_PER_LINE
93 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
94 INSN_PER_ROW
= ROW_SIZE_BITS
// 32
96 # Bit fields counts in the address
98 # INSN_BITS is the number of bits to select an instruction in a row
99 INSN_BITS
= log2_int(INSN_PER_ROW
)
100 # ROW_BITS is the number of bits to select a row
101 ROW_BITS
= log2_int(BRAM_ROWS
)
102 # ROW_LINE_BITS is the number of bits to select a row within a line
103 ROW_LINE_BITS
= log2_int(ROW_PER_LINE
)
104 # LINE_OFF_BITS is the number of bits for the offset in a cache line
105 LINE_OFF_BITS
= log2_int(LINE_SIZE
)
106 # ROW_OFF_BITS is the number of bits for the offset in a row
107 ROW_OFF_BITS
= log2_int(ROW_SIZE
)
108 # INDEX_BITS is the number of bits to select a cache line
109 INDEX_BITS
= log2_int(NUM_LINES
)
110 # SET_SIZE_BITS is the log base 2 of the set size
111 SET_SIZE_BITS
= LINE_OFF_BITS
+ INDEX_BITS
112 # TAG_BITS is the number of bits of the tag part of the address
113 TAG_BITS
= REAL_ADDR_BITS
- SET_SIZE_BITS
114 # TAG_WIDTH is the width in bits of each way of the tag RAM
115 TAG_WIDTH
= TAG_BITS
+ 7 - ((TAG_BITS
+ 7) % 8)
117 # WAY_BITS is the number of bits to select a way
118 WAY_BITS
= log2_int(NUM_WAYS
)
119 TAG_RAM_WIDTH
= TAG_BITS
* NUM_WAYS
122 TLB_BITS
= log2_int(TLB_SIZE
)
123 TLB_EA_TAG_BITS
= 64 - (TLB_LG_PGSZ
+ TLB_BITS
)
# Report the computed cache geometry at import time, as a sanity aid
# when experimenting with the cache parameters above.
print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
# was a duplicate "TAG_BITS" print; report TAG_WIDTH instead, which is
# computed above but was previously never printed
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)
154 # from microwatt/utils.vhdl
156 return n
!= 0 and (n
& (n
- 1)) == 0
# Sanity-check the cache geometry at import time so that an
# inconsistent parameter choice fails loudly with a diagnostic,
# rather than producing a subtly broken cache.
# (The first assert previously carried no message, unlike its siblings.)
assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"
172 # Example of layout for 32 lines of 64 bytes:
174 # .. tag |index| line |
176 # .. | | | |00| zero (2)
177 # .. | | |-| | INSN_BITS (1)
178 # .. | |---| | ROW_LINE_BITS (3)
179 # .. | |--- - --| LINE_OFF_BITS (6)
180 # .. | |- --| ROW_OFF_BITS (3)
181 # .. |----- ---| | ROW_BITS (8)
182 # .. |-----| | INDEX_BITS (5)
183 # .. --------| | TAG_BITS (53)
185 # The cache data BRAM organized as described above for each way
186 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
def RowPerLineValidArray():
    """Build the per-line array of "row valid" flags.

    One single-bit Signal per wishbone row of a cache line, used by the
    reload machinery via ``r.rows_valid``.
    """
    flags = []
    for idx in range(ROW_PER_LINE):
        flags.append(Signal(name="rows_valid_%d" % idx))
    return Array(flags)
193 # TODO to be passed to nmigen as ram attributes
194 # attribute ram_style : string;
195 # attribute ram_style of cache_tags : signal is "distributed";
198 tlb_layout
= [ ('tag', TLB_EA_TAG_BITS
),
199 ('pte', TLB_PTE_BITS
)
201 return Record(tlb_layout
, name
=name
)
204 return Array(TLBRecord("tlb%d" % x
) for x
in range(TLB_SIZE
))
206 # PLRU output interface
208 return Array(Signal(WAY_BITS
, name
="plru_out_%d" %x) \
209 for x
in range(NUM_LINES
))
211 # Return the cache line index (tag index) for an address
213 return addr
[LINE_OFF_BITS
:SET_SIZE_BITS
]
215 # Return the cache row index (data memory) for an address
217 return addr
[ROW_OFF_BITS
:SET_SIZE_BITS
]
219 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of *row* within its cache line.

    Narrows to ROW_BITS first, then keeps the low ROW_LINE_BITS bits.
    """
    row_bits = row[:ROW_BITS]
    return row_bits[:ROW_LINE_BITS]
223 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """True when *addr* selects the final row (== *last*) of its line."""
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
227 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """True when *row* is the final row (== *last*) of its cache line."""
    row_in_line = get_row_of_line(row)
    return row_in_line == last
231 # Return the next row in the current cache line. We use a dedicated
232 # function in order to limit the size of the generated adder to be
233 # only the bits within a cache line (3 bits with default settings)
235 row_v
= row
[0:ROW_LINE_BITS
] + 1
236 return Cat(row_v
[:ROW_LINE_BITS
], row
[ROW_LINE_BITS
:])
238 # Read the instruction word for the given address
239 # in the current cache row
def read_insn_word(addr, data):
    """Select the 32-bit instruction at *addr* out of BRAM row *data*.

    Bits [2 : INSN_BITS+2] of the address pick which 32-bit word of the
    row holds the instruction (bits [0:2] are the byte-in-word offset).
    """
    insn_sel = addr[2:INSN_BITS + 2]
    return data.word_select(insn_sel, 32)
244 # Get the tag value from the address
246 return addr
[SET_SIZE_BITS
:REAL_ADDR_BITS
]
248 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Extract the TAG_BITS-wide tag slot for *way* from tag-RAM row *tagset*.

    Equivalent to ``tagset.word_select(way, TAG_BITS)``; the returned
    slice is assignable (see write_tag).
    """
    return tagset.bit_select(way * TAG_BITS, TAG_BITS)
252 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Return an assignment driving *way*'s tag slot in *tagset* to *tag*."""
    slot = read_tag(way, tagset)
    return slot.eq(tag)
256 # Simple hash for direct-mapped TLB index
258 hsh
= (addr
[TLB_LG_PGSZ
:TLB_LG_PGSZ
+ TLB_BITS
] ^
259 addr
[TLB_LG_PGSZ
+ TLB_BITS
:TLB_LG_PGSZ
+ 2 * TLB_BITS
] ^
260 addr
[TLB_LG_PGSZ
+ 2 * TLB_BITS
:TLB_LG_PGSZ
+ 3 * TLB_BITS
])
264 # Cache reload state machine
272 class RegInternal(RecordObject
):
275 # Cache hit state (Latches for 1 cycle BRAM access)
276 self
.hit_way
= Signal(WAY_BITS
)
277 self
.hit_nia
= Signal(64)
278 self
.hit_smark
= Signal()
279 self
.hit_valid
= Signal()
281 # Cache miss state (reload state machine)
282 self
.state
= Signal(State
, reset
=State
.IDLE
)
283 self
.wb
= WBMasterOut("wb")
284 self
.req_adr
= Signal(64)
285 self
.store_way
= Signal(WAY_BITS
)
286 self
.store_index
= Signal(INDEX_BITS
)
287 self
.store_row
= Signal(ROW_BITS
)
288 self
.store_tag
= Signal(TAG_BITS
)
289 self
.store_valid
= Signal()
290 self
.end_row_ix
= Signal(ROW_LINE_BITS
)
291 self
.rows_valid
= RowPerLineValidArray()
294 self
.fetch_failed
= Signal()
297 class ICache(FetchUnitInterface
, Elaboratable
):
298 """64 bit direct mapped icache. All instructions are 4B aligned."""
299 def __init__(self
, pspec
):
300 FetchUnitInterface
.__init
__(self
, pspec
)
301 self
.i_in
= Fetch1ToICacheType(name
="i_in")
302 self
.i_out
= ICacheToDecode1Type(name
="i_out")
304 self
.m_in
= MMUToICacheType(name
="m_in")
306 self
.stall_in
= Signal()
307 self
.stall_out
= Signal()
308 self
.flush_in
= Signal()
309 self
.inval_in
= Signal()
311 # standard naming (wired to non-standard for compatibility)
312 self
.bus
= Interface(addr_width
=32,
319 self
.log_out
= Signal(54)
321 # use FetchUnitInterface, helps keep some unit tests running
322 self
.use_fetch_iface
= False
324 def use_fetch_interface(self
):
325 self
.use_fetch_iface
= True
327 # Generate a cache RAM for each way
328 def rams(self
, m
, r
, cache_out_row
, use_previous
,
329 replace_way
, req_row
):
334 bus
, stall_in
= self
.bus
, self
.stall_in
336 # read condition (for every cache ram)
338 comb
+= do_read
.eq(~
(stall_in | use_previous
))
340 rd_addr
= Signal(ROW_BITS
)
341 wr_addr
= Signal(ROW_BITS
)
342 comb
+= rd_addr
.eq(req_row
)
343 comb
+= wr_addr
.eq(r
.store_row
)
345 # binary-to-unary converters: replace-way enabled by bus.ack,
346 # hit-way left permanently enabled
347 m
.submodules
.replace_way_e
= re
= Decoder(NUM_WAYS
)
348 m
.submodules
.hit_way_e
= he
= Decoder(NUM_WAYS
)
349 comb
+= re
.i
.eq(replace_way
)
350 comb
+= re
.n
.eq(~bus
.ack
)
351 comb
+= he
.i
.eq(r
.hit_way
)
353 for i
in range(NUM_WAYS
):
354 do_write
= Signal(name
="do_wr_%d" % i
)
355 d_out
= Signal(ROW_SIZE_BITS
, name
="d_out_%d" % i
)
356 wr_sel
= Signal(ROW_SIZE
, name
="wr_sel_%d" % i
)
358 way
= CacheRam(ROW_BITS
, ROW_SIZE_BITS
, TRACE
=True, ram_num
=i
)
359 m
.submodules
["cacheram_%d" % i
] = way
361 comb
+= way
.rd_en
.eq(do_read
)
362 comb
+= way
.rd_addr
.eq(rd_addr
)
363 comb
+= d_out
.eq(way
.rd_data_o
)
364 comb
+= way
.wr_sel
.eq(wr_sel
)
365 comb
+= way
.wr_addr
.eq(wr_addr
)
366 comb
+= way
.wr_data
.eq(bus
.dat_r
)
368 comb
+= do_write
.eq(re
.o
[i
])
371 sync
+= Display("cache write adr: %x data: %lx",
372 wr_addr
, way
.wr_data
)
375 comb
+= cache_out_row
.eq(d_out
)
377 sync
+= Display("cache read adr: %x data: %x",
380 comb
+= wr_sel
.eq(Repl(do_write
, ROW_SIZE
))
383 def maybe_plrus(self
, m
, r
, plru_victim
):
390 m
.submodules
.plrus
= plru
= PLRUs(NUM_LINES
, WAY_BITS
)
391 comb
+= plru
.way
.eq(r
.hit_way
)
392 comb
+= plru
.valid
.eq(r
.hit_valid
)
393 comb
+= plru
.index
.eq(get_index(r
.hit_nia
))
394 comb
+= plru
.isel
.eq(r
.store_index
) # select victim
395 comb
+= plru_victim
.eq(plru
.o_index
) # selected victim
397 # TLB hit detection and real address generation
398 def itlb_lookup(self
, m
, tlb_req_index
, itlb
, itlb_valid
,
399 real_addr
, ra_valid
, eaa_priv
,
400 priv_fault
, access_ok
):
406 # use an *asynchronous* Memory read port here (combinatorial)
407 m
.submodules
.rd_tlb
= rd_tlb
= self
.tlbmem
.read_port(domain
="comb")
408 tlb
= TLBRecord("tlb_rdport")
409 pte
, ttag
= tlb
.pte
, tlb
.tag
411 comb
+= tlb_req_index
.eq(hash_ea(i_in
.nia
))
412 comb
+= rd_tlb
.addr
.eq(tlb_req_index
)
413 comb
+= tlb
.eq(rd_tlb
.data
)
415 with m
.If(i_in
.virt_mode
):
416 comb
+= real_addr
.eq(Cat(i_in
.nia
[:TLB_LG_PGSZ
],
417 pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]))
419 with m
.If(ttag
== i_in
.nia
[TLB_LG_PGSZ
+ TLB_BITS
:64]):
420 comb
+= ra_valid
.eq(itlb_valid
.q
.bit_select(tlb_req_index
, 1))
422 comb
+= eaa_priv
.eq(pte
[3])
425 comb
+= real_addr
.eq(i_in
.nia
[:REAL_ADDR_BITS
])
426 comb
+= ra_valid
.eq(1)
427 comb
+= eaa_priv
.eq(1)
429 # No IAMR, so no KUEP support for now
430 comb
+= priv_fault
.eq(eaa_priv
& ~i_in
.priv_mode
)
431 comb
+= access_ok
.eq(ra_valid
& ~priv_fault
)
434 def itlb_update(self
, m
, itlb
, itlb_valid
):
440 wr_index
= Signal(TLB_BITS
)
441 wr_unary
= Signal(TLB_SIZE
)
442 comb
+= wr_index
.eq(hash_ea(m_in
.addr
))
443 comb
+= wr_unary
.eq(1<<wr_index
)
445 m
.submodules
.wr_tlb
= wr_tlb
= self
.tlbmem
.write_port()
447 with m
.If(m_in
.tlbie
& m_in
.doall
):
448 # Clear all valid bits
449 comb
+= itlb_valid
.r
.eq(-1)
451 with m
.Elif(m_in
.tlbie
):
452 # Clear entry regardless of hit or miss
453 comb
+= itlb_valid
.r
.eq(wr_unary
)
455 with m
.Elif(m_in
.tlbld
):
456 tlb
= TLBRecord("tlb_wrport")
457 comb
+= tlb
.tag
.eq(m_in
.addr
[TLB_LG_PGSZ
+ TLB_BITS
:64])
458 comb
+= tlb
.pte
.eq(m_in
.pte
)
459 comb
+= wr_tlb
.en
.eq(1)
460 comb
+= wr_tlb
.addr
.eq(wr_index
)
461 comb
+= wr_tlb
.data
.eq(tlb
)
462 comb
+= itlb_valid
.s
.eq(wr_unary
)
464 # Cache hit detection, output to fetch2 and other misc logic
465 def icache_comb(self
, m
, use_previous
, r
, req_index
, req_row
,
466 req_hit_way
, req_tag
, real_addr
, req_laddr
,
467 cache_valids
, access_ok
,
468 req_is_hit
, req_is_miss
, replace_way
,
469 plru_victim
, cache_out_row
):
472 m
.submodules
.rd_tag
= rd_tag
= self
.tagmem
.read_port(domain
="comb")
474 i_in
, i_out
, bus
= self
.i_in
, self
.i_out
, self
.bus
475 flush_in
, stall_out
= self
.flush_in
, self
.stall_out
478 hit_way
= Signal(WAY_BITS
)
480 # i_in.sequential means that i_in.nia this cycle is 4 more than
481 # last cycle. If we read more than 32 bits at a time, had a
482 # cache hit last cycle, and we don't want the first 32-bit chunk
483 # then we can keep the data we read last cycle and just use that.
484 with m
.If(i_in
.nia
[2:INSN_BITS
+2] != 0):
485 comb
+= use_previous
.eq(i_in
.sequential
& r
.hit_valid
)
487 # Extract line, row and tag from request
488 comb
+= req_index
.eq(get_index(i_in
.nia
))
489 comb
+= req_row
.eq(get_row(i_in
.nia
))
490 comb
+= req_tag
.eq(get_tag(real_addr
))
492 # Calculate address of beginning of cache row, will be
493 # used for cache miss processing if needed
494 comb
+= req_laddr
.eq(Cat(
495 Const(0, ROW_OFF_BITS
),
496 real_addr
[ROW_OFF_BITS
:REAL_ADDR_BITS
],
499 # Test if pending request is a hit on any way
501 comb
+= hitcond
.eq((r
.state
== State
.WAIT_ACK
)
502 & (req_index
== r
.store_index
)
503 & r
.rows_valid
[req_row
% ROW_PER_LINE
]
505 # i_in.req asserts Decoder active
506 cvb
= Signal(NUM_WAYS
)
507 ctag
= Signal(TAG_RAM_WIDTH
)
508 comb
+= rd_tag
.addr
.eq(req_index
)
509 comb
+= ctag
.eq(rd_tag
.data
)
510 comb
+= cvb
.eq(cache_valids
.q
.word_select(req_index
, NUM_WAYS
))
511 m
.submodules
.store_way_e
= se
= Decoder(NUM_WAYS
)
512 comb
+= se
.i
.eq(r
.store_way
)
513 comb
+= se
.n
.eq(~i_in
.req
)
514 for i
in range(NUM_WAYS
):
515 tagi
= Signal(TAG_BITS
, name
="tag_i%d" % i
)
516 hit_test
= Signal(name
="hit_test%d" % i
)
517 is_tag_hit
= Signal(name
="is_tag_hit_%d" % i
)
518 comb
+= tagi
.eq(read_tag(i
, ctag
))
519 comb
+= hit_test
.eq(se
.o
[i
])
520 comb
+= is_tag_hit
.eq((cvb
[i
] |
(hitcond
& hit_test
)) &
522 with m
.If(is_tag_hit
):
523 comb
+= hit_way
.eq(i
)
526 # Generate the "hit" and "miss" signals
527 # for the synchronous blocks
528 with m
.If(i_in
.req
& access_ok
& ~flush_in
):
529 comb
+= req_is_hit
.eq(is_hit
)
530 comb
+= req_is_miss
.eq(~is_hit
)
532 comb
+= req_hit_way
.eq(hit_way
)
534 # The way to replace on a miss
535 with m
.If(r
.state
== State
.CLR_TAG
):
536 comb
+= replace_way
.eq(plru_victim
)
538 comb
+= replace_way
.eq(r
.store_way
)
540 # Output instruction from current cache row
542 # Note: This is a mild violation of our design principle of
543 # having pipeline stages output from a clean latch. In this
544 # case we output the result of a mux. The alternative would
545 # be output an entire row which I prefer not to do just yet
546 # as it would force fetch2 to know about some of the cache
547 # geometry information.
548 comb
+= i_out
.insn
.eq(read_insn_word(r
.hit_nia
, cache_out_row
))
549 comb
+= i_out
.valid
.eq(r
.hit_valid
)
550 comb
+= i_out
.nia
.eq(r
.hit_nia
)
551 comb
+= i_out
.stop_mark
.eq(r
.hit_smark
)
552 comb
+= i_out
.fetch_failed
.eq(r
.fetch_failed
)
554 # Stall fetch1 if we have a miss on cache or TLB
555 # or a protection fault
556 comb
+= stall_out
.eq(~
(is_hit
& access_ok
))
558 # Wishbone requests output (from the cache miss reload machine)
559 comb
+= bus
.we
.eq(r
.wb
.we
)
560 comb
+= bus
.adr
.eq(r
.wb
.adr
)
561 comb
+= bus
.sel
.eq(r
.wb
.sel
)
562 comb
+= bus
.stb
.eq(r
.wb
.stb
)
563 comb
+= bus
.dat_w
.eq(r
.wb
.dat
)
564 comb
+= bus
.cyc
.eq(r
.wb
.cyc
)
566 # Cache hit synchronous machine
567 def icache_hit(self
, m
, use_previous
, r
, req_is_hit
, req_hit_way
,
568 req_index
, req_tag
, real_addr
):
571 i_in
, stall_in
= self
.i_in
, self
.stall_in
572 flush_in
= self
.flush_in
574 # keep outputs to fetch2 unchanged on a stall
575 # except that flush or reset sets valid to 0
576 # If use_previous, keep the same data as last
577 # cycle and use the second half
578 with m
.If(stall_in | use_previous
):
580 sync
+= r
.hit_valid
.eq(0)
582 # On a hit, latch the request for the next cycle,
583 # when the BRAM data will be available on the
584 # cache_out output of the corresponding way
585 sync
+= r
.hit_valid
.eq(req_is_hit
)
587 with m
.If(req_is_hit
):
588 sync
+= r
.hit_way
.eq(req_hit_way
)
589 sync
+= Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
590 "way:%x RA:%x", i_in
.nia
, i_in
.virt_mode
,
591 i_in
.stop_mark
, req_index
, req_tag
,
592 req_hit_way
, real_addr
)
594 with m
.If(~stall_in
):
595 # Send stop marks and NIA down regardless of validity
596 sync
+= r
.hit_smark
.eq(i_in
.stop_mark
)
597 sync
+= r
.hit_nia
.eq(i_in
.nia
)
599 def icache_miss_idle(self
, m
, r
, req_is_miss
, req_laddr
,
600 req_index
, req_tag
, replace_way
, real_addr
):
606 # Reset per-row valid flags, only used in WAIT_ACK
607 for i
in range(ROW_PER_LINE
):
608 sync
+= r
.rows_valid
[i
].eq(0)
610 # We need to read a cache line
611 with m
.If(req_is_miss
):
613 "cache miss nia:%x IR:%x SM:%x idx:%x "
614 " way:%x tag:%x RA:%x", i_in
.nia
,
615 i_in
.virt_mode
, i_in
.stop_mark
, req_index
,
616 replace_way
, req_tag
, real_addr
)
618 # Keep track of our index and way for subsequent stores
619 st_row
= Signal(ROW_BITS
)
620 comb
+= st_row
.eq(get_row(req_laddr
))
621 sync
+= r
.store_index
.eq(req_index
)
622 sync
+= r
.store_row
.eq(st_row
)
623 sync
+= r
.store_tag
.eq(req_tag
)
624 sync
+= r
.store_valid
.eq(1)
625 sync
+= r
.end_row_ix
.eq(get_row_of_line(st_row
) - 1)
627 # Prep for first wishbone read. We calculate the address
628 # of the start of the cache line and start the WB cycle.
629 sync
+= r
.req_adr
.eq(req_laddr
)
630 sync
+= r
.wb
.cyc
.eq(1)
631 sync
+= r
.wb
.stb
.eq(1)
633 # Track that we had one request sent
634 sync
+= r
.state
.eq(State
.CLR_TAG
)
636 def icache_miss_clr_tag(self
, m
, r
, replace_way
,
641 m
.submodules
.wr_tag
= wr_tag
= self
.tagmem
.write_port(
642 granularity
=TAG_BITS
)
644 # Get victim way from plru
645 sync
+= r
.store_way
.eq(replace_way
)
647 # Force misses on that way while reloading that line
648 idx
= req_index
*NUM_WAYS
+ replace_way
# 2D index, 1st dim: NUM_WAYS
649 comb
+= cache_valids
.r
.eq(1<<idx
)
651 # use write-port "granularity" to select the tag to write to
652 # TODO: the Memory should be multipled-up (by NUM_TAGS)
653 tagset
= Signal(TAG_RAM_WIDTH
)
654 comb
+= tagset
.eq(r
.store_tag
<< (replace_way
*TAG_BITS
))
655 comb
+= wr_tag
.en
.eq(1<<replace_way
)
656 comb
+= wr_tag
.addr
.eq(r
.store_index
)
657 comb
+= wr_tag
.data
.eq(tagset
)
659 sync
+= r
.state
.eq(State
.WAIT_ACK
)
661 def icache_miss_wait_ack(self
, m
, r
, replace_way
, inval_in
,
662 cache_valids
, stbs_done
):
668 # Requests are all sent if stb is 0
670 comb
+= stbs_zero
.eq(r
.wb
.stb
== 0)
671 comb
+= stbs_done
.eq(stbs_zero
)
673 # If we are still sending requests, was one accepted?
674 with m
.If(~bus
.stall
& ~stbs_zero
):
675 # That was the last word? We are done sending.
676 # Clear stb and set stbs_done so we can handle
677 # an eventual last ack on the same cycle.
678 with m
.If(is_last_row_addr(r
.req_adr
, r
.end_row_ix
)):
679 sync
+= Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
680 "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
681 "stbs_done:%x", r
.wb
.adr
, r
.end_row_ix
,
682 r
.wb
.stb
, stbs_zero
, stbs_done
)
683 sync
+= r
.wb
.stb
.eq(0)
684 comb
+= stbs_done
.eq(1)
686 # Calculate the next row address
687 rarange
= Signal(LINE_OFF_BITS
- ROW_OFF_BITS
)
688 comb
+= rarange
.eq(r
.req_adr
[ROW_OFF_BITS
:LINE_OFF_BITS
] + 1)
689 sync
+= r
.req_adr
[ROW_OFF_BITS
:LINE_OFF_BITS
].eq(rarange
)
690 sync
+= Display("RARANGE r.req_adr:%x rarange:%x "
691 "stbs_zero:%x stbs_done:%x",
692 r
.req_adr
, rarange
, stbs_zero
, stbs_done
)
694 # Incoming acks processing
696 sync
+= Display("WB_IN_ACK data:%x stbs_zero:%x "
698 bus
.dat_r
, stbs_zero
, stbs_done
)
700 sync
+= r
.rows_valid
[r
.store_row
% ROW_PER_LINE
].eq(1)
702 # Check for completion
703 with m
.If(stbs_done
& is_last_row(r
.store_row
, r
.end_row_ix
)):
704 # Complete wishbone cycle
705 sync
+= r
.wb
.cyc
.eq(0)
706 # be nice, clear addr
707 sync
+= r
.req_adr
.eq(0)
709 # Cache line is now valid
710 idx
= r
.store_index
*NUM_WAYS
+ replace_way
# 2D index again
711 valid
= r
.store_valid
& ~inval_in
712 comb
+= cache_valids
.s
.eq(1<<idx
)
713 sync
+= r
.state
.eq(State
.IDLE
)
715 # move on to next request in row
716 # Increment store row counter
717 sync
+= r
.store_row
.eq(next_row(r
.store_row
))
719 # Cache miss/reload synchronous machine
720 def icache_miss(self
, m
, r
, req_is_miss
,
721 req_index
, req_laddr
, req_tag
, replace_way
,
722 cache_valids
, access_ok
, real_addr
):
726 i_in
, bus
, m_in
= self
.i_in
, self
.bus
, self
.m_in
727 stall_in
, flush_in
= self
.stall_in
, self
.flush_in
728 inval_in
= self
.inval_in
732 comb
+= r
.wb
.sel
.eq(-1)
733 comb
+= r
.wb
.adr
.eq(r
.req_adr
[3:])
735 # Process cache invalidations
737 comb
+= cache_valids
.r
.eq(-1)
738 sync
+= r
.store_valid
.eq(0)
741 with m
.Switch(r
.state
):
743 with m
.Case(State
.IDLE
):
744 self
.icache_miss_idle(m
, r
, req_is_miss
, req_laddr
,
745 req_index
, req_tag
, replace_way
,
748 with m
.Case(State
.CLR_TAG
, State
.WAIT_ACK
):
749 with m
.If(r
.state
== State
.CLR_TAG
):
750 self
.icache_miss_clr_tag(m
, r
, replace_way
,
754 self
.icache_miss_wait_ack(m
, r
, replace_way
, inval_in
,
755 cache_valids
, stbs_done
)
757 # TLB miss and protection fault processing
758 with m
.If(flush_in | m_in
.tlbld
):
759 sync
+= r
.fetch_failed
.eq(0)
760 with m
.Elif(i_in
.req
& ~access_ok
& ~stall_in
):
761 sync
+= r
.fetch_failed
.eq(1)
763 # icache_log: if LOG_LENGTH > 0 generate
764 def icache_log(self
, m
, req_hit_way
, ra_valid
, access_ok
,
765 req_is_miss
, req_is_hit
, lway
, wstate
, r
):
769 bus
, i_out
= self
.bus
, self
.i_out
770 log_out
, stall_out
= self
.log_out
, self
.stall_out
772 # Output data to logger
773 for i
in range(LOG_LENGTH
):
774 log_data
= Signal(54)
775 lway
= Signal(WAY_BITS
)
778 sync
+= lway
.eq(req_hit_way
)
781 with m
.If(r
.state
!= State
.IDLE
):
784 sync
+= log_data
.eq(Cat(
785 ra_valid
, access_ok
, req_is_miss
, req_is_hit
,
786 lway
, wstate
, r
.hit_nia
[2:6], r
.fetch_failed
,
787 stall_out
, bus
.stall
, r
.wb
.cyc
, r
.wb
.stb
,
788 r
.real_addr
[3:6], bus
.ack
, i_out
.insn
, i_out
.valid
790 comb
+= log_out
.eq(log_data
)
792 def elaborate(self
, platform
):
797 # Cache-Ways "valid" indicators. this is a 2D Signal, by the
798 # number of ways and the number of lines.
799 vec
= SRLatch(sync
=True, llen
=NUM_WAYS
*NUM_LINES
, name
="cachevalids")
800 m
.submodules
.cache_valids
= cache_valids
= vec
804 vec
= SRLatch(sync
=False, llen
=TLB_SIZE
, name
="tlbvalids")
805 m
.submodules
.itlb_valids
= itlb_valid
= vec
807 # TODO to be passed to nmigen as ram attributes
808 # attribute ram_style of itlb_tags : signal is "distributed";
809 # attribute ram_style of itlb_ptes : signal is "distributed";
811 # Privilege bit from PTE EAA field
816 # Async signal on incoming request
817 req_index
= Signal(INDEX_BITS
)
818 req_row
= Signal(ROW_BITS
)
819 req_hit_way
= Signal(WAY_BITS
)
820 req_tag
= Signal(TAG_BITS
)
821 req_is_hit
= Signal()
822 req_is_miss
= Signal()
823 req_laddr
= Signal(64)
825 tlb_req_index
= Signal(TLB_BITS
)
826 real_addr
= Signal(REAL_ADDR_BITS
)
828 priv_fault
= Signal()
830 use_previous
= Signal()
832 cache_out_row
= Signal(ROW_SIZE_BITS
)
834 plru_victim
= Signal(WAY_BITS
)
835 replace_way
= Signal(WAY_BITS
)
837 self
.tlbmem
= Memory(depth
=TLB_SIZE
, width
=TLB_EA_TAG_BITS
+TLB_PTE_BITS
)
838 self
.tagmem
= Memory(depth
=NUM_LINES
, width
=TAG_RAM_WIDTH
)
840 # call sub-functions putting everything together,
841 # using shared signals established above
842 self
.rams(m
, r
, cache_out_row
, use_previous
, replace_way
, req_row
)
843 self
.maybe_plrus(m
, r
, plru_victim
)
844 self
.itlb_lookup(m
, tlb_req_index
, itlb
, itlb_valid
, real_addr
,
845 ra_valid
, eaa_priv
, priv_fault
,
847 self
.itlb_update(m
, itlb
, itlb_valid
)
848 self
.icache_comb(m
, use_previous
, r
, req_index
, req_row
, req_hit_way
,
849 req_tag
, real_addr
, req_laddr
,
851 access_ok
, req_is_hit
, req_is_miss
,
852 replace_way
, plru_victim
, cache_out_row
)
853 self
.icache_hit(m
, use_previous
, r
, req_is_hit
, req_hit_way
,
854 req_index
, req_tag
, real_addr
)
855 self
.icache_miss(m
, r
, req_is_miss
, req_index
,
856 req_laddr
, req_tag
, replace_way
,
858 access_ok
, real_addr
)
859 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
860 # req_is_miss, req_is_hit, lway, wstate, r)
862 # don't connect up to FetchUnitInterface so that some unit tests
863 # can continue to operate
864 if not self
.use_fetch_iface
:
867 # connect to FetchUnitInterface. FetchUnitInterface is undocumented
868 # so needs checking and iterative revising
869 i_in
, bus
, i_out
= self
.i_in
, self
.bus
, self
.i_out
870 comb
+= i_in
.req
.eq(self
.a_i_valid
)
871 comb
+= i_in
.nia
.eq(self
.a_pc_i
)
872 comb
+= self
.stall_in
.eq(self
.a_stall_i
)
873 comb
+= self
.f_fetch_err_o
.eq(i_out
.fetch_failed
)
874 comb
+= self
.f_badaddr_o
.eq(i_out
.nia
)
875 comb
+= self
.f_instr_o
.eq(i_out
.insn
)
876 comb
+= self
.f_busy_o
.eq(~i_out
.valid
) # probably
878 # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
880 comb
+= ibus
.adr
.eq(self
.bus
.adr
)
881 comb
+= ibus
.dat_w
.eq(self
.bus
.dat_w
)
882 comb
+= ibus
.sel
.eq(self
.bus
.sel
)
883 comb
+= ibus
.cyc
.eq(self
.bus
.cyc
)
884 comb
+= ibus
.stb
.eq(self
.bus
.stb
)
885 comb
+= ibus
.we
.eq(self
.bus
.we
)
887 comb
+= self
.bus
.dat_r
.eq(ibus
.dat_r
)
888 comb
+= self
.bus
.ack
.eq(ibus
.ack
)
889 if hasattr(ibus
, "stall"):
890 comb
+= self
.bus
.stall
.eq(ibus
.stall
)
892 # fake-up the wishbone stall signal to comply with pipeline mode
893 # same thing is done in dcache.py
894 comb
+= self
.bus
.stall
.eq(self
.bus
.cyc
& ~self
.bus
.ack
)
904 yield i_in
.priv_mode
.eq(1)
907 yield i_in
.stop_mark
.eq(0)
908 yield m_out
.tlbld
.eq(0)
909 yield m_out
.tlbie
.eq(0)
910 yield m_out
.addr
.eq(0)
911 yield m_out
.pte
.eq(0)
917 # miss, stalls for a bit
919 yield i_in
.nia
.eq(Const(0x0000000000000004, 64))
921 valid
= yield i_out
.valid
924 valid
= yield i_out
.valid
927 insn
= yield i_out
.insn
928 nia
= yield i_out
.nia
929 assert insn
== 0x00000001, \
930 "insn @%x=%x expected 00000001" % (nia
, insn
)
936 yield i_in
.nia
.eq(Const(0x0000000000000008, 64))
938 valid
= yield i_out
.valid
941 valid
= yield i_out
.valid
944 nia
= yield i_out
.nia
945 insn
= yield i_out
.insn
947 assert insn
== 0x00000002, \
948 "insn @%x=%x expected 00000002" % (nia
, insn
)
952 yield i_in
.nia
.eq(Const(0x0000000000000040, 64))
954 valid
= yield i_out
.valid
957 valid
= yield i_out
.valid
961 insn
= yield i_out
.insn
962 assert insn
== 0x00000010, \
963 "insn @%x=%x expected 00000010" % (nia
, insn
)
965 # test something that aliases (this only works because
966 # the unit test SRAM is a depth of 512)
968 yield i_in
.nia
.eq(Const(0x0000000000000100, 64))
971 valid
= yield i_out
.valid
976 insn
= yield i_out
.insn
977 valid
= yield i_out
.valid
978 insn
= yield i_out
.insn
980 assert insn
== 0x00000040, \
981 "insn @%x=%x expected 00000040" % (nia
, insn
)
985 def test_icache(mem
):
986 from soc
.config
.test
.test_loadstore
import TestMemPspec
987 pspec
= TestMemPspec(addr_wid
=32,
993 memory
= Memory(width
=64, depth
=512, init
=mem
)
994 sram
= SRAM(memory
=memory
, granularity
=8)
998 m
.submodules
.icache
= dut
999 m
.submodules
.sram
= sram
1001 m
.d
.comb
+= sram
.bus
.cyc
.eq(dut
.bus
.cyc
)
1002 m
.d
.comb
+= sram
.bus
.stb
.eq(dut
.bus
.stb
)
1003 m
.d
.comb
+= sram
.bus
.we
.eq(dut
.bus
.we
)
1004 m
.d
.comb
+= sram
.bus
.sel
.eq(dut
.bus
.sel
)
1005 m
.d
.comb
+= sram
.bus
.adr
.eq(dut
.bus
.adr
)
1006 m
.d
.comb
+= sram
.bus
.dat_w
.eq(dut
.bus
.dat_w
)
1008 m
.d
.comb
+= dut
.bus
.ack
.eq(sram
.bus
.ack
)
1009 m
.d
.comb
+= dut
.bus
.dat_r
.eq(sram
.bus
.dat_r
)
1015 sim
.add_sync_process(wrap(icache_sim(dut
)))
1016 with sim
.write_vcd('test_icache.vcd'):
1020 if __name__
== '__main__':
1021 from soc
.config
.test
.test_loadstore
import TestMemPspec
1022 pspec
= TestMemPspec(addr_wid
=64,
1027 vl
= rtlil
.convert(dut
, ports
=[])
1028 with
open("test_icache.il", "w") as f
:
1031 # set up memory every 32-bits with incrementing values 0 1 2 ...
1033 for i
in range(512):
1034 mem
.append((i
*2) |
((i
*2+1)<<32))