3 based on Anton Blanchard microwatt icache.vhdl
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allows for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
29 from enum
import (Enum
, unique
)
30 from nmigen
import (Module
, Signal
, Elaboratable
, Cat
, Array
, Const
, Repl
,
32 from nmigen
.cli
import main
, rtlil
33 from nmutil
.iocontrol
import RecordObject
34 from nmigen
.utils
import log2_int
35 from nmigen
.lib
.coding
import Decoder
36 from nmutil
.util
import Display
37 from nmutil
.latch
import SRLatch
39 #from nmutil.plru import PLRU
40 from soc
.experiment
.plru
import PLRU
, PLRUs
41 from soc
.experiment
.cache_ram
import CacheRam
43 from soc
.experiment
.mem_types
import (Fetch1ToICacheType
,
47 from soc
.experiment
.wb_types
import (WB_ADDR_BITS
, WB_DATA_BITS
,
48 WB_SEL_BITS
, WBAddrType
, WBDataType
,
49 WBSelType
, WBMasterOut
, WBSlaveOut
,
52 from nmigen_soc
.wishbone
.bus
import Interface
53 from soc
.minerva
.units
.fetch
import FetchUnitInterface
57 from soc
.bus
.sram
import SRAM
58 from nmigen
import Memory
59 from nmutil
.util
import wrap
60 from nmigen
.cli
import main
, rtlil
62 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
63 # Also, check out the cxxsim nmigen branch, and latest yosys from git
64 from nmutil
.sim_tmp_alternative
import Simulator
, Settle
66 # from microwatt/utils.vhdl
68 return n
!= 0 and (n
& (n
- 1)) == 0
71 # Non-zero to enable log data collection
75 def __init__(self
, self
.LINE_SIZE
= 64
76 self
.NUM_LINE
= 16 # Number of lines in a set
77 self
.NUM_WAYS
= 1, # Number of ways
78 self
.TLB_SIZE
= 64, # L1 ITLB number of entries
79 self
.TLB_LG_PGSZ
= 12): # L1 ITLB log_2(page_size)
81 self
.NUM_LINE
= 16 # Number of lines in a set
82 self
.NUM_WAYS
= 1 # Number of ways
83 self
.TLB_SIZE
= 64 # L1 ITLB number of entries
84 self
.TLB_LG_PGSZ
= 12 # L1 ITLB log_2(page_size)
86 # BRAM organisation: We never access more than wishbone_data_bits
87 # at a time so to save resources we make the array only that wide,
88 # and use consecutive indices for to make a cache "line"
90 # self.ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
91 self
.ROW_SIZE
= WB_DATA_BITS
// 8
92 # Number of real address bits that we store
93 self
.REAL_ADDR_BITS
= 56
95 self
.ROW_SIZE_BITS
= self
.ROW_SIZE
* 8
96 # ROW_PER_LINE is the number of row (wishbone) transactions in a line
97 self
.ROW_PER_LINE
= self
.LINE_SIZE
// self
.ROW_SIZE
98 # BRAM_ROWS is the number of rows in BRAM
99 # needed to represent the full icache
100 self
.BRAM_ROWS
= self
.NUM_LINE
* self
.ROW_PER_LINE
101 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
102 self
.INSN_PER_ROW
= self
.ROW_SIZE_BITS
// 32
104 # Bit fields counts in the address
106 # INSN_BITS is the number of bits to select an instruction in a row
107 self
.INSN_BITS
= log2_int(self
.INSN_PER_ROW
)
108 # ROW_BITS is the number of bits to select a row
109 self
.ROW_BITS
= log2_int(self
.BRAM_ROWS
)
110 # ROW_LINE_BITS is the number of bits to select a row within a line
111 self
.ROW_LINE_BITS
= log2_int(self
.ROW_PER_LINE
)
112 # LINE_OFF_BITS is the number of bits for the offset in a cache line
113 self
.LINE_OFF_BITS
= log2_int(self
.LINE_SIZE
)
114 # ROW_OFF_BITS is the number of bits for the offset in a row
115 self
.ROW_OFF_BITS
= log2_int(self
.ROW_SIZE
)
116 # INDEX_BITS is the number of bits to select a cache line
117 self
.INDEX_BITS
= log2_int(self
.NUM_LINE
)
118 # SET_SIZE_BITS is the log base 2 of the set size
119 self
.SET_SIZE_BITS
= self
.LINE_OFF_BITS
+ self
.INDEX_BITS
120 # TAG_BITS is the number of bits of the tag part of the address
121 self
.TAG_BITS
= self
.REAL_ADDR_BITS
- self
.SET_SIZE_BITS
122 # TAG_WIDTH is the width in bits of each way of the tag RAM
123 self
.TAG_WIDTH
= self
.TAG_BITS
+ 7 - ((self
.TAG_BITS
+ 7) % 8)
125 # WAY_BITS is the number of bits to select a way
126 self
.WAY_BITS
= log2_int(self
.NUM_WAYS
)
127 self
.TAG_RAM_WIDTH
= self
.TAG_BITS
* self
.NUM_WAYS
130 self
.TL_BITS
= log2_int(self
.TLB_SIZE
)
131 self
.TLB_EA_TAG_BITS
= 64 - (self
.TLB_LG_PGSZ
+ self
.TL_BITS
)
132 self
.TLB_PTE_BITS
= 64
134 print("self.BRAM_ROWS =", self
.BRAM_ROWS
)
135 print("self.INDEX_BITS =", self
.INDEX_BITS
)
136 print("self.INSN_BITS =", self
.INSN_BITS
)
137 print("self.INSN_PER_ROW =", self
.INSN_PER_ROW
)
138 print("self.LINE_SIZE =", self
.LINE_SIZE
)
139 print("self.LINE_OFF_BITS =", self
.LINE_OFF_BITS
)
140 print("LOG_LENGTH =", LOG_LENGTH
)
141 print("self.NUM_LINE =", self
.NUM_LINE
)
142 print("self.NUM_WAYS =", self
.NUM_WAYS
)
143 print("self.REAL_ADDR_BITS =", self
.REAL_ADDR_BITS
)
144 print("self.ROW_BITS =", self
.ROW_BITS
)
145 print("self.ROW_OFF_BITS =", self
.ROW_OFF_BITS
)
146 print("self.ROW_LINE_BITS =", self
.ROW_LINE_BITS
)
147 print("self.ROW_PER_LINE =", self
.ROW_PER_LINE
)
148 print("self.ROW_SIZE =", self
.ROW_SIZE
)
149 print("self.ROW_SIZE_BITS =", self
.ROW_SIZE_BITS
)
150 print("self.SET_SIZE_BITS =", self
.SET_SIZE_BITS
)
152 print("self.TAG_BITS =", self
.TAG_BITS
)
153 print("self.TAG_RAM_WIDTH =", self
.TAG_RAM_WIDTH
)
154 print("self.TAG_BITS =", self
.TAG_BITS
)
155 print("self.TL_BITS =", self
.TL_BITS
)
156 print("self.TLB_EA_TAG_BITS =", self
.TLB_EA_TAG_BITS
)
157 print("self.TLB_LG_PGSZ =", self
.TLB_LG_PGSZ
)
158 print("self.TLB_PTE_BITS =", self
.TLB_PTE_BITS
)
159 print("self.TLB_SIZE =", self
.TLB_SIZE
)
160 print("self.WAY_BITS =", self
.WAY_BITS
)
162 assert self
.LINE_SIZE
% self
.ROW_SIZE
== 0
163 assert ispow2(self
.LINE_SIZE
), "self.LINE_SIZE not power of 2"
164 assert ispow2(self
.NUM_LINE
), "self.NUM_LINE not power of 2"
165 assert ispow2(self
.ROW_PER_LINE
), "self.ROW_PER_LINE not power of 2"
166 assert ispow2(self
.INSN_PER_ROW
), "self.INSN_PER_ROW not power of 2"
167 assert (self
.ROW_BITS
== (self
.INDEX_BITS
+ self
.ROW_LINE_BITS
)), \
168 "geometry bits don't add up"
169 assert (self
.LINE_OFF_BITS
== (self
.ROW_OFF_BITS
+ self
.ROW_LINE_BITS
)), \
170 "geometry bits don't add up"
171 assert (self
.REAL_ADDR_BITS
== (self
.TAG_BITS
+ self
.INDEX_BITS
+ self
.LINE_OFF_BITS
)), \
172 "geometry bits don't add up"
173 assert (self
.REAL_ADDR_BITS
== (self
.TAG_BITS
+ self
.ROW_BITS
+ self
.ROW_OFF_BITS
)), \
174 "geometry bits don't add up"
176 # Example of layout for 32 lines of 64 bytes:
178 # .. tag |index| line |
180 # .. | | | |00| zero (2)
181 # .. | | |-| | self.INSN_BITS (1)
182 # .. | |---| | self.ROW_LINE_BITS (3)
183 # .. | |--- - --| self.LINE_OFF_BITS (6)
184 # .. | |- --| self.ROW_OFF_BITS (3)
185 # .. |----- ---| | self.ROW_BITS (8)
186 # .. |-----| | self.INDEX_BITS (5)
187 # .. --------| | self.TAG_BITS (53)
189 # The cache data BRAM organized as described above for each way
190 #subtype cache_row_t is std_ulogic_vector(self.ROW_SIZE_BITS-1 downto 0);
192 def RowPerLineValidArray():
193 return Array(Signal(name
="rows_valid_%d" %x) \
194 for x
in range(self
.ROW_PER_LINE
))
197 # TODO to be passed to nigmen as ram attributes
198 # attribute ram_style : string;
199 # attribute ram_style of cache_tags : signal is "distributed";
202 tlb_layout
= [ ('tag', self
.TLB_EA_TAG_BITS
),
203 ('pte', self
.TLB_PTE_BITS
)
205 return Record(tlb_layout
, name
=name
)
208 return Array(TLBRecord("tlb%d" % x
) for x
in range(self
.TLB_SIZE
))
210 # PLRU output interface
212 return Array(Signal(self
.WAY_BITS
, name
="plru_out_%d" %x) \
213 for x
in range(self
.NUM_LINE
))
215 # Return the cache line index (tag index) for an address
217 return addr
[self
.LINE_OFF_BITS
:self
.SET_SIZE_BITS
]
219 # Return the cache row index (data memory) for an address
221 return addr
[self
.ROW_OFF_BITS
:self
.SET_SIZE_BITS
]
223 # Return the index of a row within a line
224 def get_row_of_line(row
):
225 return row
[:self
.ROW_BITS
][:self
.ROW_LINE_BITS
]
227 # Returns whether this is the last row of a line
228 def is_last_row_addr(addr
, last
):
229 return addr
[self
.ROW_OFF_BITS
:self
.LINE_OFF_BITS
] == last
231 # Returns whether this is the last row of a line
232 def is_last_row(row
, last
):
233 return get_row_of_line(row
) == last
235 # Return the next row in the current cache line. We use a dedicated
236 # function in order to limit the size of the generated adder to be
237 # only the bits within a cache line (3 bits with default settings)
239 row_v
= row
[0:self
.ROW_LINE_BITS
] + 1
240 return Cat(row_v
[:self
.ROW_LINE_BITS
], row
[self
.ROW_LINE_BITS
:])
242 # Read the instruction word for the given address
243 # in the current cache row
244 def read_insn_word(addr
, data
):
245 word
= addr
[2:self
.INSN_BITS
+2]
246 return data
.word_select(word
, 32)
248 # Get the tag value from the address
250 return addr
[self
.SET_SIZE_BITS
:self
.REAL_ADDR_BITS
]
252 # Read a tag from a tag memory row
253 def read_tag(way
, tagset
):
254 return tagset
.word_select(way
, self
.TAG_BITS
)
256 # Write a tag to tag memory row
257 def write_tag(way
, tagset
, tag
):
258 return read_tag(way
, tagset
).eq(tag
)
260 # Simple hash for direct-mapped TLB index
262 hsh
= (addr
[self
.TLB_LG_PGSZ
:self
.TLB_LG_PGSZ
+ self
.TL_BITS
] ^
263 addr
[self
.TLB_LG_PGSZ
+ self
.TL_BITS
:self
.TLB_LG_PGSZ
+ 2 * self
.TL_BITS
] ^
264 addr
[self
.TLB_LG_PGSZ
+ 2 * self
.TL_BITS
:self
.TLB_LG_PGSZ
+ 3 * self
.TL_BITS
])
268 # Cache reload state machine
276 class RegInternal(RecordObject
):
279 # Cache hit state (Latches for 1 cycle BRAM access)
280 self
.hit_way
= Signal(self
.WAY_BITS
)
281 self
.hit_nia
= Signal(64)
282 self
.hit_smark
= Signal()
283 self
.hit_valid
= Signal()
285 # Cache miss state (reload state machine)
286 self
.state
= Signal(State
, reset
=State
.IDLE
)
287 self
.wb
= WBMasterOut("wb")
288 self
.req_adr
= Signal(64)
289 self
.store_way
= Signal(self
.WAY_BITS
)
290 self
.store_index
= Signal(self
.INDEX_BITS
)
291 self
.store_row
= Signal(self
.ROW_BITS
)
292 self
.store_tag
= Signal(self
.TAG_BITS
)
293 self
.store_valid
= Signal()
294 self
.end_row_ix
= Signal(self
.ROW_LINE_BITS
)
295 self
.rows_valid
= RowPerLineValidArray()
298 self
.fetch_failed
= Signal()
301 class ICache(FetchUnitInterface
, Elaboratable
):
302 """64 bit direct mapped icache. All instructions are 4B aligned."""
303 def __init__(self
, pspec
):
304 FetchUnitInterface
.__init
__(self
, pspec
)
305 self
.i_in
= Fetch1ToICacheType(name
="i_in")
306 self
.i_out
= ICacheToDecode1Type(name
="i_out")
308 self
.m_in
= MMUToICacheType(name
="m_in")
310 self
.stall_in
= Signal()
311 self
.stall_out
= Signal()
312 self
.flush_in
= Signal()
313 self
.inval_in
= Signal()
315 # standard naming (wired to non-standard for compatibility)
316 self
.bus
= Interface(addr_width
=32,
323 self
.log_out
= Signal(54)
325 # use FetchUnitInterface, helps keep some unit tests running
326 self
.use_fetch_iface
= False
    def use_fetch_interface(self):
        # opt in to the FetchUnitInterface wiring; left False by default
        # so that existing unit tests (which drive i_in/i_out directly)
        # keep working
        self.use_fetch_iface = True
331 # Generate a cache RAM for each way
332 def rams(self
, m
, r
, cache_out_row
, use_previous
,
333 replace_way
, req_row
):
338 bus
, stall_in
= self
.bus
, self
.stall_in
340 # read condition (for every cache ram)
342 comb
+= do_read
.eq(~
(stall_in | use_previous
))
344 rd_addr
= Signal(self
.ROW_BITS
)
345 wr_addr
= Signal(self
.ROW_BITS
)
346 comb
+= rd_addr
.eq(req_row
)
347 comb
+= wr_addr
.eq(r
.store_row
)
349 # binary-to-unary converters: replace-way enabled by bus.ack,
350 # hit-way left permanently enabled
351 m
.submodules
.replace_way_e
= re
= Decoder(self
.NUM_WAYS
)
352 m
.submodules
.hit_way_e
= he
= Decoder(self
.NUM_WAYS
)
353 comb
+= re
.i
.eq(replace_way
)
354 comb
+= re
.n
.eq(~bus
.ack
)
355 comb
+= he
.i
.eq(r
.hit_way
)
357 for i
in range(self
.NUM_WAYS
):
358 do_write
= Signal(name
="do_wr_%d" % i
)
359 d_out
= Signal(self
.ROW_SIZE_BITS
, name
="d_out_%d" % i
)
360 wr_sel
= Signal(self
.ROW_SIZE
, name
="wr_sel_%d" % i
)
362 way
= CacheRam(self
.ROW_BITS
, self
.ROW_SIZE_BITS
, TRACE
=True, ram_num
=i
)
363 m
.submodules
["cacheram_%d" % i
] = way
365 comb
+= way
.rd_en
.eq(do_read
)
366 comb
+= way
.rd_addr
.eq(rd_addr
)
367 comb
+= d_out
.eq(way
.rd_data_o
)
368 comb
+= way
.wr_sel
.eq(wr_sel
)
369 comb
+= way
.wr_addr
.eq(wr_addr
)
370 comb
+= way
.wr_data
.eq(bus
.dat_r
)
372 comb
+= do_write
.eq(re
.o
[i
])
375 sync
+= Display("cache write adr: %x data: %lx",
376 wr_addr
, way
.wr_data
)
379 comb
+= cache_out_row
.eq(d_out
)
381 sync
+= Display("cache read adr: %x data: %x",
384 comb
+= wr_sel
.eq(Repl(do_write
, self
.ROW_SIZE
))
387 def maybe_plrus(self
, m
, r
, plru_victim
):
390 if self
.NUM_WAYS
== 0:
394 m
.submodules
.plrus
= plru
= PLRUs(self
.NUM_LINE
, self
.WAY_BITS
)
395 comb
+= plru
.way
.eq(r
.hit_way
)
396 comb
+= plru
.valid
.eq(r
.hit_valid
)
397 comb
+= plru
.index
.eq(get_index(r
.hit_nia
))
398 comb
+= plru
.isel
.eq(r
.store_index
) # select victim
399 comb
+= plru_victim
.eq(plru
.o_index
) # selected victim
401 # TLB hit detection and real address generation
402 def itlb_lookup(self
, m
, tlb_req_index
, itlb
, itlb_valid
,
403 real_addr
, ra_valid
, eaa_priv
,
404 priv_fault
, access_ok
):
410 # use an *asynchronous* Memory read port here (combinatorial)
411 m
.submodules
.rd_tlb
= rd_tlb
= self
.tlbmem
.read_port(domain
="comb")
412 tlb
= TLBRecord("tlb_rdport")
413 pte
, ttag
= tlb
.pte
, tlb
.tag
415 comb
+= tlb_req_index
.eq(hash_ea(i_in
.nia
))
416 comb
+= rd_tlb
.addr
.eq(tlb_req_index
)
417 comb
+= tlb
.eq(rd_tlb
.data
)
419 with m
.If(i_in
.virt_mode
):
420 comb
+= real_addr
.eq(Cat(i_in
.nia
[:self
.TLB_LG_PGSZ
],
421 pte
[self
.TLB_LG_PGSZ
:self
.REAL_ADDR_BITS
]))
423 with m
.If(ttag
== i_in
.nia
[self
.TLB_LG_PGSZ
+ self
.TL_BITS
:64]):
424 comb
+= ra_valid
.eq(itlb_valid
.q
.bit_select(tlb_req_index
, 1))
426 comb
+= eaa_priv
.eq(pte
[3])
429 comb
+= real_addr
.eq(i_in
.nia
[:self
.REAL_ADDR_BITS
])
430 comb
+= ra_valid
.eq(1)
431 comb
+= eaa_priv
.eq(1)
433 # No IAMR, so no KUEP support for now
434 comb
+= priv_fault
.eq(eaa_priv
& ~i_in
.priv_mode
)
435 comb
+= access_ok
.eq(ra_valid
& ~priv_fault
)
438 def itlb_update(self
, m
, itlb
, itlb_valid
):
444 wr_index
= Signal(self
.TL_BITS
)
445 wr_unary
= Signal(self
.TLB_SIZE
)
446 comb
+= wr_index
.eq(hash_ea(m_in
.addr
))
447 comb
+= wr_unary
.eq(1<<wr_index
)
449 m
.submodules
.wr_tlb
= wr_tlb
= self
.tlbmem
.write_port()
450 sync
+= itlb_valid
.s
.eq(0)
451 sync
+= itlb_valid
.r
.eq(0)
453 with m
.If(m_in
.tlbie
& m_in
.doall
):
454 # Clear all valid bits
455 sync
+= itlb_valid
.r
.eq(-1)
457 with m
.Elif(m_in
.tlbie
):
458 # Clear entry regardless of hit or miss
459 sync
+= itlb_valid
.r
.eq(wr_unary
)
461 with m
.Elif(m_in
.tlbld
):
462 tlb
= TLBRecord("tlb_wrport")
463 comb
+= tlb
.tag
.eq(m_in
.addr
[self
.TLB_LG_PGSZ
+ self
.TL_BITS
:64])
464 comb
+= tlb
.pte
.eq(m_in
.pte
)
465 comb
+= wr_tlb
.en
.eq(1)
466 comb
+= wr_tlb
.addr
.eq(wr_index
)
467 comb
+= wr_tlb
.data
.eq(tlb
)
468 sync
+= itlb_valid
.s
.eq(wr_unary
)
470 # Cache hit detection, output to fetch2 and other misc logic
471 def icache_comb(self
, m
, use_previous
, r
, req_index
, req_row
,
472 req_hit_way
, req_tag
, real_addr
, req_laddr
,
473 cache_valids
, access_ok
,
474 req_is_hit
, req_is_miss
, replace_way
,
475 plru_victim
, cache_out_row
):
478 m
.submodules
.rd_tag
= rd_tag
= self
.tagmem
.read_port(domain
="comb")
480 i_in
, i_out
, bus
= self
.i_in
, self
.i_out
, self
.bus
481 flush_in
, stall_out
= self
.flush_in
, self
.stall_out
484 hit_way
= Signal(self
.WAY_BITS
)
486 # i_in.sequential means that i_in.nia this cycle is 4 more than
487 # last cycle. If we read more than 32 bits at a time, had a
488 # cache hit last cycle, and we don't want the first 32-bit chunk
489 # then we can keep the data we read last cycle and just use that.
490 with m
.If(i_in
.nia
[2:self
.INSN_BITS
+2] != 0):
491 comb
+= use_previous
.eq(i_in
.sequential
& r
.hit_valid
)
493 # Extract line, row and tag from request
494 comb
+= req_index
.eq(get_index(i_in
.nia
))
495 comb
+= req_row
.eq(get_row(i_in
.nia
))
496 comb
+= req_tag
.eq(get_tag(real_addr
))
498 # Calculate address of beginning of cache row, will be
499 # used for cache miss processing if needed
500 comb
+= req_laddr
.eq(Cat(
501 Const(0, self
.ROW_OFF_BITS
),
502 real_addr
[self
.ROW_OFF_BITS
:self
.REAL_ADDR_BITS
],
505 # Test if pending request is a hit on any way
507 comb
+= hitcond
.eq((r
.state
== State
.WAIT_ACK
)
508 & (req_index
== r
.store_index
)
509 & r
.rows_valid
[req_row
% self
.ROW_PER_LINE
]
511 # i_in.req asserts Decoder active
512 cvb
= Signal(self
.NUM_WAYS
)
513 ctag
= Signal(self
.TAG_RAM_WIDTH
)
514 comb
+= rd_tag
.addr
.eq(req_index
)
515 comb
+= ctag
.eq(rd_tag
.data
)
516 comb
+= cvb
.eq(cache_valids
.q
.word_select(req_index
, self
.NUM_WAYS
))
517 m
.submodules
.store_way_e
= se
= Decoder(self
.NUM_WAYS
)
518 comb
+= se
.i
.eq(r
.store_way
)
519 comb
+= se
.n
.eq(~i_in
.req
)
520 for i
in range(self
.NUM_WAYS
):
521 tagi
= Signal(self
.TAG_BITS
, name
="tag_i%d" % i
)
522 hit_test
= Signal(name
="hit_test%d" % i
)
523 is_tag_hit
= Signal(name
="is_tag_hit_%d" % i
)
524 comb
+= tagi
.eq(read_tag(i
, ctag
))
525 comb
+= hit_test
.eq(se
.o
[i
])
526 comb
+= is_tag_hit
.eq((cvb
[i
] |
(hitcond
& hit_test
)) &
528 with m
.If(is_tag_hit
):
529 comb
+= hit_way
.eq(i
)
532 # Generate the "hit" and "miss" signals
533 # for the synchronous blocks
534 with m
.If(i_in
.req
& access_ok
& ~flush_in
):
535 comb
+= req_is_hit
.eq(is_hit
)
536 comb
+= req_is_miss
.eq(~is_hit
)
538 comb
+= req_hit_way
.eq(hit_way
)
540 # The way to replace on a miss
541 with m
.If(r
.state
== State
.CLR_TAG
):
542 comb
+= replace_way
.eq(plru_victim
)
544 comb
+= replace_way
.eq(r
.store_way
)
546 # Output instruction from current cache row
548 # Note: This is a mild violation of our design principle of
549 # having pipeline stages output from a clean latch. In this
550 # case we output the result of a mux. The alternative would
551 # be output an entire row which I prefer not to do just yet
552 # as it would force fetch2 to know about some of the cache
553 # geometry information.
554 comb
+= i_out
.insn
.eq(read_insn_word(r
.hit_nia
, cache_out_row
))
555 comb
+= i_out
.valid
.eq(r
.hit_valid
)
556 comb
+= i_out
.nia
.eq(r
.hit_nia
)
557 comb
+= i_out
.stop_mark
.eq(r
.hit_smark
)
558 comb
+= i_out
.fetch_failed
.eq(r
.fetch_failed
)
560 # Stall fetch1 if we have a miss on cache or TLB
561 # or a protection fault
562 comb
+= stall_out
.eq(~
(is_hit
& access_ok
))
564 # Wishbone requests output (from the cache miss reload machine)
565 comb
+= bus
.we
.eq(r
.wb
.we
)
566 comb
+= bus
.adr
.eq(r
.wb
.adr
)
567 comb
+= bus
.sel
.eq(r
.wb
.sel
)
568 comb
+= bus
.stb
.eq(r
.wb
.stb
)
569 comb
+= bus
.dat_w
.eq(r
.wb
.dat
)
570 comb
+= bus
.cyc
.eq(r
.wb
.cyc
)
572 # Cache hit synchronous machine
573 def icache_hit(self
, m
, use_previous
, r
, req_is_hit
, req_hit_way
,
574 req_index
, req_tag
, real_addr
):
577 i_in
, stall_in
= self
.i_in
, self
.stall_in
578 flush_in
= self
.flush_in
580 # keep outputs to fetch2 unchanged on a stall
581 # except that flush or reset sets valid to 0
582 # If use_previous, keep the same data as last
583 # cycle and use the second half
584 with m
.If(stall_in | use_previous
):
586 sync
+= r
.hit_valid
.eq(0)
588 # On a hit, latch the request for the next cycle,
589 # when the BRAM data will be available on the
590 # cache_out output of the corresponding way
591 sync
+= r
.hit_valid
.eq(req_is_hit
)
593 with m
.If(req_is_hit
):
594 sync
+= r
.hit_way
.eq(req_hit_way
)
595 sync
+= Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
596 "way:%x RA:%x", i_in
.nia
, i_in
.virt_mode
,
597 i_in
.stop_mark
, req_index
, req_tag
,
598 req_hit_way
, real_addr
)
600 with m
.If(~stall_in
):
601 # Send stop marks and NIA down regardless of validity
602 sync
+= r
.hit_smark
.eq(i_in
.stop_mark
)
603 sync
+= r
.hit_nia
.eq(i_in
.nia
)
605 def icache_miss_idle(self
, m
, r
, req_is_miss
, req_laddr
,
606 req_index
, req_tag
, replace_way
, real_addr
):
612 # Reset per-row valid flags, only used in WAIT_ACK
613 for i
in range(self
.ROW_PER_LINE
):
614 sync
+= r
.rows_valid
[i
].eq(0)
616 # We need to read a cache line
617 with m
.If(req_is_miss
):
619 "cache miss nia:%x IR:%x SM:%x idx:%x "
620 " way:%x tag:%x RA:%x", i_in
.nia
,
621 i_in
.virt_mode
, i_in
.stop_mark
, req_index
,
622 replace_way
, req_tag
, real_addr
)
624 # Keep track of our index and way for subsequent stores
625 st_row
= Signal(self
.ROW_BITS
)
626 comb
+= st_row
.eq(get_row(req_laddr
))
627 sync
+= r
.store_index
.eq(req_index
)
628 sync
+= r
.store_row
.eq(st_row
)
629 sync
+= r
.store_tag
.eq(req_tag
)
630 sync
+= r
.store_valid
.eq(1)
631 sync
+= r
.end_row_ix
.eq(get_row_of_line(st_row
) - 1)
633 # Prep for first wishbone read. We calculate the address
634 # of the start of the cache line and start the WB cycle.
635 sync
+= r
.req_adr
.eq(req_laddr
)
636 sync
+= r
.wb
.cyc
.eq(1)
637 sync
+= r
.wb
.stb
.eq(1)
639 # Track that we had one request sent
640 sync
+= r
.state
.eq(State
.CLR_TAG
)
642 def icache_miss_clr_tag(self
, m
, r
, replace_way
,
647 m
.submodules
.wr_tag
= wr_tag
= self
.tagmem
.write_port(
648 granularity
=self
.TAG_BITS
)
650 # Get victim way from plru
651 sync
+= r
.store_way
.eq(replace_way
)
653 # Force misses on that way while reloading that line
654 idx
= req_index
*self
.NUM_WAYS
+ replace_way
# 2D index, 1st dim: self.NUM_WAYS
655 comb
+= cache_valids
.r
.eq(1<<idx
)
657 # use write-port "granularity" to select the tag to write to
658 # TODO: the Memory should be multipled-up (by NUM_TAGS)
659 tagset
= Signal(self
.TAG_RAM_WIDTH
)
660 comb
+= tagset
.eq(r
.store_tag
<< (replace_way
*self
.TAG_BITS
))
661 comb
+= wr_tag
.en
.eq(1<<replace_way
)
662 comb
+= wr_tag
.addr
.eq(r
.store_index
)
663 comb
+= wr_tag
.data
.eq(tagset
)
665 sync
+= r
.state
.eq(State
.WAIT_ACK
)
667 def icache_miss_wait_ack(self
, m
, r
, replace_way
, inval_in
,
668 cache_valids
, stbs_done
):
674 # Requests are all sent if stb is 0
676 comb
+= stbs_zero
.eq(r
.wb
.stb
== 0)
677 comb
+= stbs_done
.eq(stbs_zero
)
679 # If we are still sending requests, was one accepted?
680 with m
.If(~bus
.stall
& ~stbs_zero
):
681 # That was the last word? We are done sending.
682 # Clear stb and set stbs_done so we can handle
683 # an eventual last ack on the same cycle.
684 with m
.If(is_last_row_addr(r
.req_adr
, r
.end_row_ix
)):
685 sync
+= Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
686 "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
687 "stbs_done:%x", r
.wb
.adr
, r
.end_row_ix
,
688 r
.wb
.stb
, stbs_zero
, stbs_done
)
689 sync
+= r
.wb
.stb
.eq(0)
690 comb
+= stbs_done
.eq(1)
692 # Calculate the next row address
693 rarange
= Signal(self
.LINE_OFF_BITS
- self
.ROW_OFF_BITS
)
694 comb
+= rarange
.eq(r
.req_adr
[self
.ROW_OFF_BITS
:self
.LINE_OFF_BITS
] + 1)
695 sync
+= r
.req_adr
[self
.ROW_OFF_BITS
:self
.LINE_OFF_BITS
].eq(rarange
)
696 sync
+= Display("RARANGE r.req_adr:%x rarange:%x "
697 "stbs_zero:%x stbs_done:%x",
698 r
.req_adr
, rarange
, stbs_zero
, stbs_done
)
700 # Incoming acks processing
702 sync
+= Display("WB_IN_ACK data:%x stbs_zero:%x "
704 bus
.dat_r
, stbs_zero
, stbs_done
)
706 sync
+= r
.rows_valid
[r
.store_row
% self
.ROW_PER_LINE
].eq(1)
708 # Check for completion
709 with m
.If(stbs_done
& is_last_row(r
.store_row
, r
.end_row_ix
)):
710 # Complete wishbone cycle
711 sync
+= r
.wb
.cyc
.eq(0)
712 # be nice, clear addr
713 sync
+= r
.req_adr
.eq(0)
715 # Cache line is now valid
716 idx
= r
.store_index
*self
.NUM_WAYS
+ replace_way
# 2D index again
717 valid
= r
.store_valid
& ~inval_in
718 comb
+= cache_valids
.s
.eq(1<<idx
)
719 sync
+= r
.state
.eq(State
.IDLE
)
721 # move on to next request in row
722 # Increment store row counter
723 sync
+= r
.store_row
.eq(next_row(r
.store_row
))
725 # Cache miss/reload synchronous machine
726 def icache_miss(self
, m
, r
, req_is_miss
,
727 req_index
, req_laddr
, req_tag
, replace_way
,
728 cache_valids
, access_ok
, real_addr
):
732 i_in
, bus
, m_in
= self
.i_in
, self
.bus
, self
.m_in
733 stall_in
, flush_in
= self
.stall_in
, self
.flush_in
734 inval_in
= self
.inval_in
738 comb
+= r
.wb
.sel
.eq(-1)
739 comb
+= r
.wb
.adr
.eq(r
.req_adr
[3:])
741 # Process cache invalidations
743 comb
+= cache_valids
.r
.eq(-1)
744 sync
+= r
.store_valid
.eq(0)
747 with m
.Switch(r
.state
):
749 with m
.Case(State
.IDLE
):
750 self
.icache_miss_idle(m
, r
, req_is_miss
, req_laddr
,
751 req_index
, req_tag
, replace_way
,
754 with m
.Case(State
.CLR_TAG
, State
.WAIT_ACK
):
755 with m
.If(r
.state
== State
.CLR_TAG
):
756 self
.icache_miss_clr_tag(m
, r
, replace_way
,
760 self
.icache_miss_wait_ack(m
, r
, replace_way
, inval_in
,
761 cache_valids
, stbs_done
)
763 # TLB miss and protection fault processing
764 with m
.If(flush_in | m_in
.tlbld
):
765 sync
+= r
.fetch_failed
.eq(0)
766 with m
.Elif(i_in
.req
& ~access_ok
& ~stall_in
):
767 sync
+= r
.fetch_failed
.eq(1)
769 # icache_log: if LOG_LENGTH > 0 generate
770 def icache_log(self
, m
, req_hit_way
, ra_valid
, access_ok
,
771 req_is_miss
, req_is_hit
, lway
, wstate
, r
):
775 bus
, i_out
= self
.bus
, self
.i_out
776 log_out
, stall_out
= self
.log_out
, self
.stall_out
778 # Output data to logger
779 for i
in range(LOG_LENGTH
):
780 log_data
= Signal(54)
781 lway
= Signal(self
.WAY_BITS
)
784 sync
+= lway
.eq(req_hit_way
)
787 with m
.If(r
.state
!= State
.IDLE
):
790 sync
+= log_data
.eq(Cat(
791 ra_valid
, access_ok
, req_is_miss
, req_is_hit
,
792 lway
, wstate
, r
.hit_nia
[2:6], r
.fetch_failed
,
793 stall_out
, bus
.stall
, r
.wb
.cyc
, r
.wb
.stb
,
794 r
.real_addr
[3:6], bus
.ack
, i_out
.insn
, i_out
.valid
796 comb
+= log_out
.eq(log_data
)
798 def elaborate(self
, platform
):
803 # Cache-Ways "valid" indicators. this is a 2D Signal, by the
804 # number of ways and the number of lines.
805 vec
= SRLatch(sync
=True, llen
=self
.NUM_WAYS
*self
.NUM_LINE
, name
="cachevalids")
806 m
.submodules
.cache_valids
= cache_valids
= vec
810 vec
= SRLatch(sync
=False, llen
=self
.TLB_SIZE
, name
="tlbvalids")
811 m
.submodules
.itlb_valids
= itlb_valid
= vec
813 # TODO to be passed to nmigen as ram attributes
814 # attribute ram_style of itlb_tags : signal is "distributed";
815 # attribute ram_style of itlb_ptes : signal is "distributed";
817 # Privilege bit from PTE EAA field
822 # Async signal on incoming request
823 req_index
= Signal(self
.INDEX_BITS
)
824 req_row
= Signal(self
.ROW_BITS
)
825 req_hit_way
= Signal(self
.WAY_BITS
)
826 req_tag
= Signal(self
.TAG_BITS
)
827 req_is_hit
= Signal()
828 req_is_miss
= Signal()
829 req_laddr
= Signal(64)
831 tlb_req_index
= Signal(self
.TL_BITS
)
832 real_addr
= Signal(self
.REAL_ADDR_BITS
)
834 priv_fault
= Signal()
836 use_previous
= Signal()
838 cache_out_row
= Signal(self
.ROW_SIZE_BITS
)
840 plru_victim
= Signal(self
.WAY_BITS
)
841 replace_way
= Signal(self
.WAY_BITS
)
843 self
.tlbmem
= Memory(depth
=self
.TLB_SIZE
, width
=self
.TLB_EA_TAG_BITS
+self
.TLB_PTE_BITS
)
844 self
.tagmem
= Memory(depth
=self
.NUM_LINE
, width
=self
.TAG_RAM_WIDTH
)
846 # call sub-functions putting everything together,
847 # using shared signals established above
848 self
.rams(m
, r
, cache_out_row
, use_previous
, replace_way
, req_row
)
849 self
.maybe_plrus(m
, r
, plru_victim
)
850 self
.itlb_lookup(m
, tlb_req_index
, itlb
, itlb_valid
, real_addr
,
851 ra_valid
, eaa_priv
, priv_fault
,
853 self
.itlb_update(m
, itlb
, itlb_valid
)
854 self
.icache_comb(m
, use_previous
, r
, req_index
, req_row
, req_hit_way
,
855 req_tag
, real_addr
, req_laddr
,
857 access_ok
, req_is_hit
, req_is_miss
,
858 replace_way
, plru_victim
, cache_out_row
)
859 self
.icache_hit(m
, use_previous
, r
, req_is_hit
, req_hit_way
,
860 req_index
, req_tag
, real_addr
)
861 self
.icache_miss(m
, r
, req_is_miss
, req_index
,
862 req_laddr
, req_tag
, replace_way
,
864 access_ok
, real_addr
)
865 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
866 # req_is_miss, req_is_hit, lway, wstate, r)
868 # don't connect up to FetchUnitInterface so that some unit tests
869 # can continue to operate
870 if not self
.use_fetch_iface
:
873 # connect to FetchUnitInterface. FetchUnitInterface is undocumented
874 # so needs checking and iterative revising
875 i_in
, bus
, i_out
= self
.i_in
, self
.bus
, self
.i_out
876 comb
+= i_in
.req
.eq(self
.a_i_valid
)
877 comb
+= i_in
.nia
.eq(self
.a_pc_i
)
878 comb
+= self
.stall_in
.eq(self
.a_stall_i
)
879 comb
+= self
.f_fetch_err_o
.eq(i_out
.fetch_failed
)
880 comb
+= self
.f_badaddr_o
.eq(i_out
.nia
)
881 comb
+= self
.f_instr_o
.eq(i_out
.insn
)
882 comb
+= self
.f_busy_o
.eq(~i_out
.valid
) # probably
884 # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
886 comb
+= ibus
.adr
.eq(self
.bus
.adr
)
887 comb
+= ibus
.dat_w
.eq(self
.bus
.dat_w
)
888 comb
+= ibus
.sel
.eq(self
.bus
.sel
)
889 comb
+= ibus
.cyc
.eq(self
.bus
.cyc
)
890 comb
+= ibus
.stb
.eq(self
.bus
.stb
)
891 comb
+= ibus
.we
.eq(self
.bus
.we
)
893 comb
+= self
.bus
.dat_r
.eq(ibus
.dat_r
)
894 comb
+= self
.bus
.ack
.eq(ibus
.ack
)
895 if hasattr(ibus
, "stall"):
896 comb
+= self
.bus
.stall
.eq(ibus
.stall
)
898 # fake-up the wishbone stall signal to comply with pipeline mode
899 # same thing is done in dcache.py
900 comb
+= self
.bus
.stall
.eq(self
.bus
.cyc
& ~self
.bus
.ack
)
910 yield i_in
.priv_mode
.eq(1)
913 yield i_in
.stop_mark
.eq(0)
914 yield m_out
.tlbld
.eq(0)
915 yield m_out
.tlbie
.eq(0)
916 yield m_out
.addr
.eq(0)
917 yield m_out
.pte
.eq(0)
923 # miss, stalls for a bit
925 yield i_in
.nia
.eq(Const(0x0000000000000004, 64))
927 valid
= yield i_out
.valid
930 valid
= yield i_out
.valid
933 insn
= yield i_out
.insn
934 nia
= yield i_out
.nia
935 assert insn
== 0x00000001, \
936 "insn @%x=%x expected 00000001" % (nia
, insn
)
942 yield i_in
.nia
.eq(Const(0x0000000000000008, 64))
944 valid
= yield i_out
.valid
947 valid
= yield i_out
.valid
950 nia
= yield i_out
.nia
951 insn
= yield i_out
.insn
953 assert insn
== 0x00000002, \
954 "insn @%x=%x expected 00000002" % (nia
, insn
)
958 yield i_in
.nia
.eq(Const(0x0000000000000040, 64))
960 valid
= yield i_out
.valid
963 valid
= yield i_out
.valid
967 insn
= yield i_out
.insn
968 assert insn
== 0x00000010, \
969 "insn @%x=%x expected 00000010" % (nia
, insn
)
971 # test something that aliases (this only works because
972 # the unit test SRAM is a depth of 512)
974 yield i_in
.nia
.eq(Const(0x0000000000000100, 64))
977 valid
= yield i_out
.valid
982 insn
= yield i_out
.insn
983 valid
= yield i_out
.valid
984 insn
= yield i_out
.insn
986 assert insn
== 0x00000040, \
987 "insn @%x=%x expected 00000040" % (nia
, insn
)
991 def test_icache(mem
):
992 from soc
.config
.test
.test_loadstore
import TestMemPspec
993 pspec
= TestMemPspec(addr_wid
=32,
999 memory
= Memory(width
=64, depth
=512, init
=mem
)
1000 sram
= SRAM(memory
=memory
, granularity
=8)
1004 m
.submodules
.icache
= dut
1005 m
.submodules
.sram
= sram
1007 m
.d
.comb
+= sram
.bus
.cyc
.eq(dut
.bus
.cyc
)
1008 m
.d
.comb
+= sram
.bus
.stb
.eq(dut
.bus
.stb
)
1009 m
.d
.comb
+= sram
.bus
.we
.eq(dut
.bus
.we
)
1010 m
.d
.comb
+= sram
.bus
.sel
.eq(dut
.bus
.sel
)
1011 m
.d
.comb
+= sram
.bus
.adr
.eq(dut
.bus
.adr
)
1012 m
.d
.comb
+= sram
.bus
.dat_w
.eq(dut
.bus
.dat_w
)
1014 m
.d
.comb
+= dut
.bus
.ack
.eq(sram
.bus
.ack
)
1015 m
.d
.comb
+= dut
.bus
.dat_r
.eq(sram
.bus
.dat_r
)
1021 sim
.add_sync_process(wrap(icache_sim(dut
)))
1022 with sim
.write_vcd('test_icache.vcd'):
1026 if __name__
== '__main__':
1027 from soc
.config
.test
.test_loadstore
import TestMemPspec
1028 pspec
= TestMemPspec(addr_wid
=64,
1033 vl
= rtlil
.convert(dut
, ports
=[])
1034 with
open("test_icache.il", "w") as f
:
1037 # set up memory every 32-bits with incrementing values 0 1 2 ...
1039 for i
in range(512):
1040 mem
.append((i
*2) |
((i
*2+1)<<32))