58bdb6c5fcce78c1b88b880b24a115ae23d36e47
3 based on Anton Blanchard microwatt icache.vhdl
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
29 from enum
import (Enum
, unique
)
30 from nmigen
import (Module
, Signal
, Elaboratable
, Cat
, Array
, Const
, Repl
,
32 from nmigen
.cli
import main
, rtlil
33 from nmutil
.iocontrol
import RecordObject
34 from nmigen
.utils
import log2_int
35 from nmigen
.lib
.coding
import Decoder
36 from nmutil
.util
import Display
37 from nmutil
.latch
import SRLatch
39 #from nmutil.plru import PLRU
40 from soc
.experiment
.plru
import PLRU
, PLRUs
41 from soc
.experiment
.cache_ram
import CacheRam
43 from soc
.experiment
.mem_types
import (Fetch1ToICacheType
,
47 from soc
.experiment
.wb_types
import (WB_ADDR_BITS
, WB_DATA_BITS
,
48 WB_SEL_BITS
, WBAddrType
, WBDataType
,
49 WBSelType
, WBMasterOut
, WBSlaveOut
,
52 from nmigen_soc
.wishbone
.bus
import Interface
53 from soc
.minerva
.units
.fetch
import FetchUnitInterface
57 from soc
.bus
.sram
import SRAM
58 from nmigen
import Memory
59 from nmutil
.util
import wrap
60 from nmigen
.cli
import main
, rtlil
62 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
63 # Also, check out the cxxsim nmigen branch, and latest yosys from git
64 from nmutil
.sim_tmp_alternative
import Simulator
, Settle
66 # from microwatt/utils.vhdl
68 return n
!= 0 and (n
& (n
- 1)) == 0
71 # Non-zero to enable log data collection
75 def __init__(self
, LINE_SIZE
= 64,
76 NUM_LINES
= 16, # Number of lines in a set
77 NUM_WAYS
= 1, # Number of ways
78 TLB_SIZE
= 64, # L1 ITLB number of entries
79 TLB_LG_PGSZ
= 12): # L1 ITLB log_2(page_size)
80 self
.LINE_SIZE
= LINE_SIZE
81 self
.NUM_LINES
= NUM_LINES
82 self
.NUM_WAYS
= NUM_WAYS
83 self
.TLB_SIZE
= TLB_SIZE
84 self
.TLB_LG_PGSZ
= TLB_LG_PGSZ
86 # BRAM organisation: We never access more than wishbone_data_bits
87 # at a time so to save resources we make the array only that wide,
88 # and use consecutive indices to make a cache "line"
90 # self.ROW_SIZE is the width in bytes of the BRAM
91 # (based on WB, so 64-bits)
92 self
.ROW_SIZE
= WB_DATA_BITS
// 8
93 # Number of real address bits that we store
94 self
.REAL_ADDR_BITS
= 56
96 self
.ROW_SIZE_BITS
= self
.ROW_SIZE
* 8
97 # ROW_PER_LINE is the number of row (wishbone) transactions in a line
98 self
.ROW_PER_LINE
= self
.LINE_SIZE
// self
.ROW_SIZE
99 # BRAM_ROWS is the number of rows in BRAM
100 # needed to represent the full icache
101 self
.BRAM_ROWS
= self
.NUM_LINES
* self
.ROW_PER_LINE
102 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
103 self
.INSN_PER_ROW
= self
.ROW_SIZE_BITS
// 32
105 # Bit fields counts in the address
107 # INSN_BITS is the number of bits to select an instruction in a row
108 self
.INSN_BITS
= log2_int(self
.INSN_PER_ROW
)
109 # ROW_BITS is the number of bits to select a row
110 self
.ROW_BITS
= log2_int(self
.BRAM_ROWS
)
111 # ROW_LINE_BITS is the number of bits to select a row within a line
112 self
.ROW_LINE_BITS
= log2_int(self
.ROW_PER_LINE
)
113 # LINE_OFF_BITS is the number of bits for the offset in a cache line
114 self
.LINE_OFF_BITS
= log2_int(self
.LINE_SIZE
)
115 # ROW_OFF_BITS is the number of bits for the offset in a row
116 self
.ROW_OFF_BITS
= log2_int(self
.ROW_SIZE
)
117 # INDEX_BITS is the number of bits to select a cache line
118 self
.INDEX_BITS
= log2_int(self
.NUM_LINES
)
119 # SET_SIZE_BITS is the log base 2 of the set size
120 self
.SET_SIZE_BITS
= self
.LINE_OFF_BITS
+ self
.INDEX_BITS
121 # TAG_BITS is the number of bits of the tag part of the address
122 self
.TAG_BITS
= self
.REAL_ADDR_BITS
- self
.SET_SIZE_BITS
123 # TAG_WIDTH is the width in bits of each way of the tag RAM
124 self
.TAG_WIDTH
= self
.TAG_BITS
+ 7 - ((self
.TAG_BITS
+ 7) % 8)
126 # WAY_BITS is the number of bits to select a way
127 self
.WAY_BITS
= log2_int(self
.NUM_WAYS
)
128 self
.TAG_RAM_WIDTH
= self
.TAG_BITS
* self
.NUM_WAYS
131 self
.TL_BITS
= log2_int(self
.TLB_SIZE
)
132 self
.TLB_EA_TAG_BITS
= 64 - (self
.TLB_LG_PGSZ
+ self
.TL_BITS
)
133 self
.TLB_PTE_BITS
= 64
135 print("self.BRAM_ROWS =", self
.BRAM_ROWS
)
136 print("self.INDEX_BITS =", self
.INDEX_BITS
)
137 print("self.INSN_BITS =", self
.INSN_BITS
)
138 print("self.INSN_PER_ROW =", self
.INSN_PER_ROW
)
139 print("self.LINE_SIZE =", self
.LINE_SIZE
)
140 print("self.LINE_OFF_BITS =", self
.LINE_OFF_BITS
)
141 print("LOG_LENGTH =", LOG_LENGTH
)
142 print("self.NUM_LINES =", self
.NUM_LINES
)
143 print("self.NUM_WAYS =", self
.NUM_WAYS
)
144 print("self.REAL_ADDR_BITS =", self
.REAL_ADDR_BITS
)
145 print("self.ROW_BITS =", self
.ROW_BITS
)
146 print("self.ROW_OFF_BITS =", self
.ROW_OFF_BITS
)
147 print("self.ROW_LINE_BITS =", self
.ROW_LINE_BITS
)
148 print("self.ROW_PER_LINE =", self
.ROW_PER_LINE
)
149 print("self.ROW_SIZE =", self
.ROW_SIZE
)
150 print("self.ROW_SIZE_BITS =", self
.ROW_SIZE_BITS
)
151 print("self.SET_SIZE_BITS =", self
.SET_SIZE_BITS
)
153 print("self.TAG_BITS =", self
.TAG_BITS
)
154 print("self.TAG_RAM_WIDTH =", self
.TAG_RAM_WIDTH
)
155 print("self.TAG_BITS =", self
.TAG_BITS
)
156 print("self.TL_BITS =", self
.TL_BITS
)
157 print("self.TLB_EA_TAG_BITS =", self
.TLB_EA_TAG_BITS
)
158 print("self.TLB_LG_PGSZ =", self
.TLB_LG_PGSZ
)
159 print("self.TLB_PTE_BITS =", self
.TLB_PTE_BITS
)
160 print("self.TLB_SIZE =", self
.TLB_SIZE
)
161 print("self.WAY_BITS =", self
.WAY_BITS
)
163 assert self
.LINE_SIZE
% self
.ROW_SIZE
== 0
164 assert ispow2(self
.LINE_SIZE
), "self.LINE_SIZE not power of 2"
165 assert ispow2(self
.NUM_LINES
), "self.NUM_LINES not power of 2"
166 assert ispow2(self
.ROW_PER_LINE
), "self.ROW_PER_LINE not power of 2"
167 assert ispow2(self
.INSN_PER_ROW
), "self.INSN_PER_ROW not power of 2"
168 assert (self
.ROW_BITS
== (self
.INDEX_BITS
+ self
.ROW_LINE_BITS
)), \
169 "geometry bits don't add up"
170 assert (self
.LINE_OFF_BITS
==
171 (self
.ROW_OFF_BITS
+ self
.ROW_LINE_BITS
)), \
172 "geometry bits don't add up"
173 assert (self
.REAL_ADDR_BITS
==
174 (self
.TAG_BITS
+ self
.INDEX_BITS
+ self
.LINE_OFF_BITS
)), \
175 "geometry bits don't add up"
176 assert (self
.REAL_ADDR_BITS
==
177 (self
.TAG_BITS
+ self
.ROW_BITS
+ self
.ROW_OFF_BITS
)), \
178 "geometry bits don't add up"
180 # Example of layout for 32 lines of 64 bytes:
182 # .. tag |index| line |
184 # .. | | | |00| zero (2)
185 # .. | | |-| | self.INSN_BITS (1)
186 # .. | |---| | self.ROW_LINE_BITS (3)
187 # .. | |--- - --| self.LINE_OFF_BITS (6)
188 # .. | |- --| self.ROW_OFF_BITS (3)
189 # .. |----- ---| | self.ROW_BITS (8)
190 # .. |-----| | self.INDEX_BITS (5)
191 # .. --------| | self.TAG_BITS (53)
193 # The cache data BRAM organized as described above for each way
194 #subtype cache_row_t is std_ulogic_vector(self.ROW_SIZE_BITS-1 downto 0);
def RowPerLineValidArray(self):
    """Return an Array of one valid-flag Signal per row of a cache line.

    Used by the reload state machine to track which rows of the line
    being refilled have already arrived from the bus.
    """
    flags = [Signal(name="rows_valid_%d" % x)
             for x in range(self.ROW_PER_LINE)]
    return Array(flags)
201 # TODO to be passed to nmigen as ram attributes
202 # attribute ram_style : string;
203 # attribute ram_style of cache_tags : signal is "distributed";
def TLBRecord(self, name):
    """Return a Record describing one ITLB entry.

    Fields: 'tag' (effective-address tag, TLB_EA_TAG_BITS wide) and
    'pte' (page table entry, TLB_PTE_BITS wide).
    """
    layout = [
        ('tag', self.TLB_EA_TAG_BITS),
        ('pte', self.TLB_PTE_BITS),
    ]
    return Record(layout, name=name)
212 return Array(self
.TLBRecord("tlb%d" % x
) for x
in range(self
.TLB_SIZE
))
214 # PLRU output interface
216 return Array(Signal(self
.WAY_BITS
, name
="plru_out_%d" %x) \
217 for x
in range(self
.NUM_LINES
))
def get_index(self, addr):
    """Return the cache line index (tag index) field of *addr*.

    Selects the bits between the line offset and the end of the
    set-index field.
    """
    lo = self.LINE_OFF_BITS
    hi = self.SET_SIZE_BITS
    return addr[lo:hi]
def get_row(self, addr):
    """Return the cache row index (data-memory BRAM row) field of *addr*.

    Selects the bits between the row offset and the end of the
    set-index field.
    """
    lo = self.ROW_OFF_BITS
    hi = self.SET_SIZE_BITS
    return addr[lo:hi]
def get_row_of_line(self, row):
    """Return the index of *row* within its cache line.

    The outer slice keeps only the row-index bits; the inner slice then
    keeps the low ROW_LINE_BITS, i.e. the position within the line.
    """
    row_bits = row[:self.ROW_BITS]
    return row_bits[:self.ROW_LINE_BITS]
def is_last_row_addr(self, addr, last):
    """Return whether *addr* points at the last row of its cache line.

    Compares the row-within-line field of the address against *last*
    (the precomputed index of the final row).
    """
    row_in_line = addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS]
    return row_in_line == last
def is_last_row(self, row, last):
    """Return whether *row* is the last row of its cache line.

    Delegates to get_row_of_line() to extract the within-line position,
    then compares it against *last*.
    """
    row_of_line = self.get_row_of_line(row)
    return row_of_line == last
def next_row(self, row):
    """Return the next row within the current cache line.

    A dedicated function so that the generated adder is only as wide as
    the within-line row counter (ROW_LINE_BITS, 3 bits with default
    settings) instead of the full row index.
    """
    nbits = self.ROW_LINE_BITS
    # increment only the within-line bits; the slice on the result
    # discards the carry-out so the counter wraps within the line
    incremented = row[0:nbits] + 1
    # upper (line-select) bits pass through unchanged
    return Cat(incremented[:nbits], row[nbits:])
def read_insn_word(self, addr, data):
    """Return the 32-bit instruction selected by *addr* from row *data*.

    Address bits [2 : INSN_BITS+2] pick which 32-bit word of the
    (wider) BRAM row holds the instruction.
    """
    word_idx = addr[2:self.INSN_BITS + 2]
    return data.word_select(word_idx, 32)
def get_tag(self, addr):
    """Return the tag field of *addr*.

    The tag is the top of the real address, above the set-index and
    line-offset fields.
    """
    lo = self.SET_SIZE_BITS
    hi = self.REAL_ADDR_BITS
    return addr[lo:hi]
def read_tag(self, way, tagset):
    """Return way *way*'s tag out of tag-memory row *tagset*.

    Each way occupies a TAG_BITS-wide word within the row.
    """
    width = self.TAG_BITS
    return tagset.word_select(way, width)
def write_tag(self, way, tagset, tag):
    """Return an assignment storing *tag* into way *way* of *tagset*.

    Reuses read_tag() to locate the destination slot, then assigns
    *tag* to it.
    """
    tag_slot = self.read_tag(way, tagset)
    return tag_slot.eq(tag)
264 # Simple hash for direct-mapped TLB index
265 def hash_ea(self
, addr
):
266 hsh
= (addr
[self
.TLB_LG_PGSZ
:self
.TLB_LG_PGSZ
+ self
.TL_BITS
] ^
267 addr
[self
.TLB_LG_PGSZ
+ self
.TL_BITS
:
268 self
.TLB_LG_PGSZ
+ 2 * self
.TL_BITS
] ^
269 addr
[self
.TLB_LG_PGSZ
+ 2 * self
.TL_BITS
:
270 self
.TLB_LG_PGSZ
+ 3 * self
.TL_BITS
])
274 # Cache reload state machine
282 class RegInternal(RecordObject
):
283 def __init__(self
, cfg
):
285 # Cache hit state (Latches for 1 cycle BRAM access)
286 self
.hit_way
= Signal(cfg
.WAY_BITS
)
287 self
.hit_nia
= Signal(64)
288 self
.hit_smark
= Signal()
289 self
.hit_valid
= Signal()
291 # Cache miss state (reload state machine)
292 self
.state
= Signal(State
, reset
=State
.IDLE
)
293 self
.wb
= WBMasterOut("wb")
294 self
.req_adr
= Signal(64)
295 self
.store_way
= Signal(cfg
.WAY_BITS
)
296 self
.store_index
= Signal(cfg
.INDEX_BITS
)
297 self
.store_row
= Signal(cfg
.ROW_BITS
)
298 self
.store_tag
= Signal(cfg
.TAG_BITS
)
299 self
.store_valid
= Signal()
300 self
.end_row_ix
= Signal(cfg
.ROW_LINE_BITS
)
301 self
.rows_valid
= cfg
.RowPerLineValidArray()
304 self
.fetch_failed
= Signal()
307 class ICache(FetchUnitInterface
, Elaboratable
, ICacheConfig
):
308 """64 bit direct mapped icache. All instructions are 4B aligned."""
309 def __init__(self
, pspec
):
310 FetchUnitInterface
.__init
__(self
, pspec
)
311 ICacheConfig
.__init
__(self
)
312 self
.i_in
= Fetch1ToICacheType(name
="i_in")
313 self
.i_out
= ICacheToDecode1Type(name
="i_out")
315 self
.m_in
= MMUToICacheType(name
="m_in")
317 self
.stall_in
= Signal()
318 self
.stall_out
= Signal()
319 self
.flush_in
= Signal()
320 self
.inval_in
= Signal()
322 # standard naming (wired to non-standard for compatibility)
323 self
.bus
= Interface(addr_width
=32,
330 self
.log_out
= Signal(54)
332 # use FetchUnitInterface, helps keep some unit tests running
333 self
.use_fetch_iface
= False
def use_fetch_interface(self):
    """Select the FetchUnitInterface wiring at elaborate time.

    Flips the flag that elaborate() checks when deciding whether to
    connect this icache through FetchUnitInterface (some unit tests
    rely on the default, non-FetchUnitInterface wiring).
    """
    self.use_fetch_iface = True
338 # Generate a cache RAM for each way
339 def rams(self
, m
, r
, cache_out_row
, use_previous
,
340 replace_way
, req_row
):
345 bus
, stall_in
= self
.bus
, self
.stall_in
347 # read condition (for every cache ram)
349 comb
+= do_read
.eq(~
(stall_in | use_previous
))
351 rd_addr
= Signal(self
.ROW_BITS
)
352 wr_addr
= Signal(self
.ROW_BITS
)
353 comb
+= rd_addr
.eq(req_row
)
354 comb
+= wr_addr
.eq(r
.store_row
)
356 # binary-to-unary converters: replace-way enabled by bus.ack,
357 # hit-way left permanently enabled
358 m
.submodules
.replace_way_e
= re
= Decoder(self
.NUM_WAYS
)
359 m
.submodules
.hit_way_e
= he
= Decoder(self
.NUM_WAYS
)
360 comb
+= re
.i
.eq(replace_way
)
361 comb
+= re
.n
.eq(~bus
.ack
)
362 comb
+= he
.i
.eq(r
.hit_way
)
364 for i
in range(self
.NUM_WAYS
):
365 do_write
= Signal(name
="do_wr_%d" % i
)
366 d_out
= Signal(self
.ROW_SIZE_BITS
, name
="d_out_%d" % i
)
367 wr_sel
= Signal(self
.ROW_SIZE
, name
="wr_sel_%d" % i
)
369 way
= CacheRam(self
.ROW_BITS
, self
.ROW_SIZE_BITS
,
370 TRACE
=True, ram_num
=i
)
371 m
.submodules
["cacheram_%d" % i
] = way
373 comb
+= way
.rd_en
.eq(do_read
)
374 comb
+= way
.rd_addr
.eq(rd_addr
)
375 comb
+= d_out
.eq(way
.rd_data_o
)
376 comb
+= way
.wr_sel
.eq(wr_sel
)
377 comb
+= way
.wr_addr
.eq(wr_addr
)
378 comb
+= way
.wr_data
.eq(bus
.dat_r
)
380 comb
+= do_write
.eq(re
.o
[i
])
383 sync
+= Display("cache write adr: %x data: %lx",
384 wr_addr
, way
.wr_data
)
387 comb
+= cache_out_row
.eq(d_out
)
389 sync
+= Display("cache read adr: %x data: %x",
392 comb
+= wr_sel
.eq(Repl(do_write
, self
.ROW_SIZE
))
395 def maybe_plrus(self
, m
, r
, plru_victim
):
398 if self
.NUM_WAYS
== 0:
402 m
.submodules
.plrus
= plru
= PLRUs(self
.NUM_LINES
, self
.WAY_BITS
)
403 comb
+= plru
.way
.eq(r
.hit_way
)
404 comb
+= plru
.valid
.eq(r
.hit_valid
)
405 comb
+= plru
.index
.eq(self
.get_index(r
.hit_nia
))
406 comb
+= plru
.isel
.eq(r
.store_index
) # select victim
407 comb
+= plru_victim
.eq(plru
.o_index
) # selected victim
409 # TLB hit detection and real address generation
410 def itlb_lookup(self
, m
, tlb_req_index
, itlb
, itlb_valid
,
411 real_addr
, ra_valid
, eaa_priv
,
412 priv_fault
, access_ok
):
418 # use an *asynchronous* Memory read port here (combinatorial)
419 m
.submodules
.rd_tlb
= rd_tlb
= self
.tlbmem
.read_port(domain
="comb")
420 tlb
= self
.TLBRecord("tlb_rdport")
421 pte
, ttag
= tlb
.pte
, tlb
.tag
423 comb
+= tlb_req_index
.eq(self
.hash_ea(i_in
.nia
))
424 comb
+= rd_tlb
.addr
.eq(tlb_req_index
)
425 comb
+= tlb
.eq(rd_tlb
.data
)
427 with m
.If(i_in
.virt_mode
):
428 comb
+= real_addr
.eq(Cat(i_in
.nia
[:self
.TLB_LG_PGSZ
],
429 pte
[self
.TLB_LG_PGSZ
:self
.REAL_ADDR_BITS
]))
431 with m
.If(ttag
== i_in
.nia
[self
.TLB_LG_PGSZ
+ self
.TL_BITS
:64]):
432 comb
+= ra_valid
.eq(itlb_valid
.q
.bit_select(tlb_req_index
, 1))
434 comb
+= eaa_priv
.eq(pte
[3])
437 comb
+= real_addr
.eq(i_in
.nia
[:self
.REAL_ADDR_BITS
])
438 comb
+= ra_valid
.eq(1)
439 comb
+= eaa_priv
.eq(1)
441 # No IAMR, so no KUEP support for now
442 comb
+= priv_fault
.eq(eaa_priv
& ~i_in
.priv_mode
)
443 comb
+= access_ok
.eq(ra_valid
& ~priv_fault
)
446 def itlb_update(self
, m
, itlb
, itlb_valid
):
452 wr_index
= Signal(self
.TL_BITS
)
453 wr_unary
= Signal(self
.TLB_SIZE
)
454 comb
+= wr_index
.eq(self
.hash_ea(m_in
.addr
))
455 comb
+= wr_unary
.eq(1<<wr_index
)
457 m
.submodules
.wr_tlb
= wr_tlb
= self
.tlbmem
.write_port()
458 sync
+= itlb_valid
.s
.eq(0)
459 sync
+= itlb_valid
.r
.eq(0)
461 with m
.If(m_in
.tlbie
& m_in
.doall
):
462 # Clear all valid bits
463 sync
+= itlb_valid
.r
.eq(-1)
465 with m
.Elif(m_in
.tlbie
):
466 # Clear entry regardless of hit or miss
467 sync
+= itlb_valid
.r
.eq(wr_unary
)
469 with m
.Elif(m_in
.tlbld
):
470 tlb
= self
.TLBRecord("tlb_wrport")
471 comb
+= tlb
.tag
.eq(m_in
.addr
[self
.TLB_LG_PGSZ
+ self
.TL_BITS
:64])
472 comb
+= tlb
.pte
.eq(m_in
.pte
)
473 comb
+= wr_tlb
.en
.eq(1)
474 comb
+= wr_tlb
.addr
.eq(wr_index
)
475 comb
+= wr_tlb
.data
.eq(tlb
)
476 sync
+= itlb_valid
.s
.eq(wr_unary
)
478 # Cache hit detection, output to fetch2 and other misc logic
479 def icache_comb(self
, m
, use_previous
, r
, req_index
, req_row
,
480 req_hit_way
, req_tag
, real_addr
, req_laddr
,
481 cache_valids
, access_ok
,
482 req_is_hit
, req_is_miss
, replace_way
,
483 plru_victim
, cache_out_row
):
486 m
.submodules
.rd_tag
= rd_tag
= self
.tagmem
.read_port(domain
="comb")
488 i_in
, i_out
, bus
= self
.i_in
, self
.i_out
, self
.bus
489 flush_in
, stall_out
= self
.flush_in
, self
.stall_out
492 hit_way
= Signal(self
.WAY_BITS
)
494 # i_in.sequential means that i_in.nia this cycle is 4 more than
495 # last cycle. If we read more than 32 bits at a time, had a
496 # cache hit last cycle, and we don't want the first 32-bit chunk
497 # then we can keep the data we read last cycle and just use that.
498 with m
.If(i_in
.nia
[2:self
.INSN_BITS
+2] != 0):
499 comb
+= use_previous
.eq(i_in
.sequential
& r
.hit_valid
)
501 # Extract line, row and tag from request
502 comb
+= req_index
.eq(self
.get_index(i_in
.nia
))
503 comb
+= req_row
.eq(self
.get_row(i_in
.nia
))
504 comb
+= req_tag
.eq(self
.get_tag(real_addr
))
506 # Calculate address of beginning of cache row, will be
507 # used for cache miss processing if needed
508 comb
+= req_laddr
.eq(Cat(
509 Const(0, self
.ROW_OFF_BITS
),
510 real_addr
[self
.ROW_OFF_BITS
:self
.REAL_ADDR_BITS
],
513 # Test if pending request is a hit on any way
515 comb
+= hitcond
.eq((r
.state
== State
.WAIT_ACK
)
516 & (req_index
== r
.store_index
)
517 & r
.rows_valid
[req_row
% self
.ROW_PER_LINE
]
519 # i_in.req asserts Decoder active
520 cvb
= Signal(self
.NUM_WAYS
)
521 ctag
= Signal(self
.TAG_RAM_WIDTH
)
522 comb
+= rd_tag
.addr
.eq(req_index
)
523 comb
+= ctag
.eq(rd_tag
.data
)
524 comb
+= cvb
.eq(cache_valids
.q
.word_select(req_index
, self
.NUM_WAYS
))
525 m
.submodules
.store_way_e
= se
= Decoder(self
.NUM_WAYS
)
526 comb
+= se
.i
.eq(r
.store_way
)
527 comb
+= se
.n
.eq(~i_in
.req
)
528 for i
in range(self
.NUM_WAYS
):
529 tagi
= Signal(self
.TAG_BITS
, name
="tag_i%d" % i
)
530 hit_test
= Signal(name
="hit_test%d" % i
)
531 is_tag_hit
= Signal(name
="is_tag_hit_%d" % i
)
532 comb
+= tagi
.eq(self
.read_tag(i
, ctag
))
533 comb
+= hit_test
.eq(se
.o
[i
])
534 comb
+= is_tag_hit
.eq((cvb
[i
] |
(hitcond
& hit_test
)) &
536 with m
.If(is_tag_hit
):
537 comb
+= hit_way
.eq(i
)
540 # Generate the "hit" and "miss" signals
541 # for the synchronous blocks
542 with m
.If(i_in
.req
& access_ok
& ~flush_in
):
543 comb
+= req_is_hit
.eq(is_hit
)
544 comb
+= req_is_miss
.eq(~is_hit
)
546 comb
+= req_hit_way
.eq(hit_way
)
548 # The way to replace on a miss
549 with m
.If(r
.state
== State
.CLR_TAG
):
550 comb
+= replace_way
.eq(plru_victim
)
552 comb
+= replace_way
.eq(r
.store_way
)
554 # Output instruction from current cache row
556 # Note: This is a mild violation of our design principle of
557 # having pipeline stages output from a clean latch. In this
558 # case we output the result of a mux. The alternative would
559 # be output an entire row which I prefer not to do just yet
560 # as it would force fetch2 to know about some of the cache
561 # geometry information.
562 comb
+= i_out
.insn
.eq(self
.read_insn_word(r
.hit_nia
, cache_out_row
))
563 comb
+= i_out
.valid
.eq(r
.hit_valid
)
564 comb
+= i_out
.nia
.eq(r
.hit_nia
)
565 comb
+= i_out
.stop_mark
.eq(r
.hit_smark
)
566 comb
+= i_out
.fetch_failed
.eq(r
.fetch_failed
)
568 # Stall fetch1 if we have a miss on cache or TLB
569 # or a protection fault
570 comb
+= stall_out
.eq(~
(is_hit
& access_ok
))
572 # Wishbone requests output (from the cache miss reload machine)
573 comb
+= bus
.we
.eq(r
.wb
.we
)
574 comb
+= bus
.adr
.eq(r
.wb
.adr
)
575 comb
+= bus
.sel
.eq(r
.wb
.sel
)
576 comb
+= bus
.stb
.eq(r
.wb
.stb
)
577 comb
+= bus
.dat_w
.eq(r
.wb
.dat
)
578 comb
+= bus
.cyc
.eq(r
.wb
.cyc
)
580 # Cache hit synchronous machine
581 def icache_hit(self
, m
, use_previous
, r
, req_is_hit
, req_hit_way
,
582 req_index
, req_tag
, real_addr
):
585 i_in
, stall_in
= self
.i_in
, self
.stall_in
586 flush_in
= self
.flush_in
588 # keep outputs to fetch2 unchanged on a stall
589 # except that flush or reset sets valid to 0
590 # If use_previous, keep the same data as last
591 # cycle and use the second half
592 with m
.If(stall_in | use_previous
):
594 sync
+= r
.hit_valid
.eq(0)
596 # On a hit, latch the request for the next cycle,
597 # when the BRAM data will be available on the
598 # cache_out output of the corresponding way
599 sync
+= r
.hit_valid
.eq(req_is_hit
)
601 with m
.If(req_is_hit
):
602 sync
+= r
.hit_way
.eq(req_hit_way
)
603 sync
+= Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
604 "way:%x RA:%x", i_in
.nia
, i_in
.virt_mode
,
605 i_in
.stop_mark
, req_index
, req_tag
,
606 req_hit_way
, real_addr
)
608 with m
.If(~stall_in
):
609 # Send stop marks and NIA down regardless of validity
610 sync
+= r
.hit_smark
.eq(i_in
.stop_mark
)
611 sync
+= r
.hit_nia
.eq(i_in
.nia
)
613 def icache_miss_idle(self
, m
, r
, req_is_miss
, req_laddr
,
614 req_index
, req_tag
, replace_way
, real_addr
):
620 # Reset per-row valid flags, only used in WAIT_ACK
621 for i
in range(self
.ROW_PER_LINE
):
622 sync
+= r
.rows_valid
[i
].eq(0)
624 # We need to read a cache line
625 with m
.If(req_is_miss
):
627 "cache miss nia:%x IR:%x SM:%x idx:%x "
628 " way:%x tag:%x RA:%x", i_in
.nia
,
629 i_in
.virt_mode
, i_in
.stop_mark
, req_index
,
630 replace_way
, req_tag
, real_addr
)
632 # Keep track of our index and way for subsequent stores
633 st_row
= Signal(self
.ROW_BITS
)
634 comb
+= st_row
.eq(self
.get_row(req_laddr
))
635 sync
+= r
.store_index
.eq(req_index
)
636 sync
+= r
.store_row
.eq(st_row
)
637 sync
+= r
.store_tag
.eq(req_tag
)
638 sync
+= r
.store_valid
.eq(1)
639 sync
+= r
.end_row_ix
.eq(self
.get_row_of_line(st_row
) - 1)
641 # Prep for first wishbone read. We calculate the address
642 # of the start of the cache line and start the WB cycle.
643 sync
+= r
.req_adr
.eq(req_laddr
)
644 sync
+= r
.wb
.cyc
.eq(1)
645 sync
+= r
.wb
.stb
.eq(1)
647 # Track that we had one request sent
648 sync
+= r
.state
.eq(State
.CLR_TAG
)
650 def icache_miss_clr_tag(self
, m
, r
, replace_way
,
655 m
.submodules
.wr_tag
= wr_tag
= self
.tagmem
.write_port(
656 granularity
=self
.TAG_BITS
)
658 # Get victim way from plru
659 sync
+= r
.store_way
.eq(replace_way
)
661 # Force misses on that way while reloading that line
662 idx
= req_index
*self
.NUM_WAYS
+ replace_way
# 2D index, 1st dim: self.NUM_WAYS
663 comb
+= cache_valids
.r
.eq(1<<idx
)
665 # use write-port "granularity" to select the tag to write to
666 # TODO: the Memory should be multipled-up (by NUM_TAGS)
667 tagset
= Signal(self
.TAG_RAM_WIDTH
)
668 comb
+= tagset
.eq(r
.store_tag
<< (replace_way
*self
.TAG_BITS
))
669 comb
+= wr_tag
.en
.eq(1<<replace_way
)
670 comb
+= wr_tag
.addr
.eq(r
.store_index
)
671 comb
+= wr_tag
.data
.eq(tagset
)
673 sync
+= r
.state
.eq(State
.WAIT_ACK
)
675 def icache_miss_wait_ack(self
, m
, r
, replace_way
, inval_in
,
676 cache_valids
, stbs_done
):
682 # Requests are all sent if stb is 0
684 comb
+= stbs_zero
.eq(r
.wb
.stb
== 0)
685 comb
+= stbs_done
.eq(stbs_zero
)
687 # If we are still sending requests, was one accepted?
688 with m
.If(~bus
.stall
& ~stbs_zero
):
689 # That was the last word? We are done sending.
690 # Clear stb and set stbs_done so we can handle
691 # an eventual last ack on the same cycle.
692 with m
.If(self
.is_last_row_addr(r
.req_adr
, r
.end_row_ix
)):
693 sync
+= Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
694 "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
695 "stbs_done:%x", r
.wb
.adr
, r
.end_row_ix
,
696 r
.wb
.stb
, stbs_zero
, stbs_done
)
697 sync
+= r
.wb
.stb
.eq(0)
698 comb
+= stbs_done
.eq(1)
700 # Calculate the next row address
701 rarange
= Signal(self
.LINE_OFF_BITS
- self
.ROW_OFF_BITS
)
702 comb
+= rarange
.eq(r
.req_adr
[self
.ROW_OFF_BITS
:
703 self
.LINE_OFF_BITS
] + 1)
704 sync
+= r
.req_adr
[self
.ROW_OFF_BITS
:self
.LINE_OFF_BITS
].eq(rarange
)
705 sync
+= Display("RARANGE r.req_adr:%x rarange:%x "
706 "stbs_zero:%x stbs_done:%x",
707 r
.req_adr
, rarange
, stbs_zero
, stbs_done
)
709 # Incoming acks processing
711 sync
+= Display("WB_IN_ACK data:%x stbs_zero:%x "
713 bus
.dat_r
, stbs_zero
, stbs_done
)
715 sync
+= r
.rows_valid
[r
.store_row
% self
.ROW_PER_LINE
].eq(1)
717 # Check for completion
718 with m
.If(stbs_done
& self
.is_last_row(r
.store_row
, r
.end_row_ix
)):
719 # Complete wishbone cycle
720 sync
+= r
.wb
.cyc
.eq(0)
721 # be nice, clear addr
722 sync
+= r
.req_adr
.eq(0)
724 # Cache line is now valid
725 idx
= r
.store_index
*self
.NUM_WAYS
+ replace_way
# 2D index again
726 valid
= r
.store_valid
& ~inval_in
727 comb
+= cache_valids
.s
.eq(1<<idx
)
728 sync
+= r
.state
.eq(State
.IDLE
)
730 # move on to next request in row
731 # Increment store row counter
732 sync
+= r
.store_row
.eq(self
.next_row(r
.store_row
))
734 # Cache miss/reload synchronous machine
735 def icache_miss(self
, m
, r
, req_is_miss
,
736 req_index
, req_laddr
, req_tag
, replace_way
,
737 cache_valids
, access_ok
, real_addr
):
741 i_in
, bus
, m_in
= self
.i_in
, self
.bus
, self
.m_in
742 stall_in
, flush_in
= self
.stall_in
, self
.flush_in
743 inval_in
= self
.inval_in
747 comb
+= r
.wb
.sel
.eq(-1)
748 comb
+= r
.wb
.adr
.eq(r
.req_adr
[3:])
750 # Process cache invalidations
752 comb
+= cache_valids
.r
.eq(-1)
753 sync
+= r
.store_valid
.eq(0)
756 with m
.Switch(r
.state
):
758 with m
.Case(State
.IDLE
):
759 self
.icache_miss_idle(m
, r
, req_is_miss
, req_laddr
,
760 req_index
, req_tag
, replace_way
,
763 with m
.Case(State
.CLR_TAG
, State
.WAIT_ACK
):
764 with m
.If(r
.state
== State
.CLR_TAG
):
765 self
.icache_miss_clr_tag(m
, r
, replace_way
,
769 self
.icache_miss_wait_ack(m
, r
, replace_way
, inval_in
,
770 cache_valids
, stbs_done
)
772 # TLB miss and protection fault processing
773 with m
.If(flush_in | m_in
.tlbld
):
774 sync
+= r
.fetch_failed
.eq(0)
775 with m
.Elif(i_in
.req
& ~access_ok
& ~stall_in
):
776 sync
+= r
.fetch_failed
.eq(1)
778 # icache_log: if LOG_LENGTH > 0 generate
779 def icache_log(self
, m
, req_hit_way
, ra_valid
, access_ok
,
780 req_is_miss
, req_is_hit
, lway
, wstate
, r
):
784 bus
, i_out
= self
.bus
, self
.i_out
785 log_out
, stall_out
= self
.log_out
, self
.stall_out
787 # Output data to logger
788 for i
in range(LOG_LENGTH
):
789 log_data
= Signal(54)
790 lway
= Signal(self
.WAY_BITS
)
793 sync
+= lway
.eq(req_hit_way
)
796 with m
.If(r
.state
!= State
.IDLE
):
799 sync
+= log_data
.eq(Cat(
800 ra_valid
, access_ok
, req_is_miss
, req_is_hit
,
801 lway
, wstate
, r
.hit_nia
[2:6], r
.fetch_failed
,
802 stall_out
, bus
.stall
, r
.wb
.cyc
, r
.wb
.stb
,
803 r
.real_addr
[3:6], bus
.ack
, i_out
.insn
, i_out
.valid
805 comb
+= log_out
.eq(log_data
)
807 def elaborate(self
, platform
):
812 # Cache-Ways "valid" indicators. this is a 2D Signal, by the
813 # number of ways and the number of lines.
814 vec
= SRLatch(sync
=True, llen
=self
.NUM_WAYS
*self
.NUM_LINES
,
816 m
.submodules
.cache_valids
= cache_valids
= vec
819 itlb
= self
.TLBArray()
820 vec
= SRLatch(sync
=False, llen
=self
.TLB_SIZE
, name
="tlbvalids")
821 m
.submodules
.itlb_valids
= itlb_valid
= vec
823 # TODO to be passed to nmigen as ram attributes
824 # attribute ram_style of itlb_tags : signal is "distributed";
825 # attribute ram_style of itlb_ptes : signal is "distributed";
827 # Privilege bit from PTE EAA field
830 r
= RegInternal(self
)
832 # Async signal on incoming request
833 req_index
= Signal(self
.INDEX_BITS
)
834 req_row
= Signal(self
.ROW_BITS
)
835 req_hit_way
= Signal(self
.WAY_BITS
)
836 req_tag
= Signal(self
.TAG_BITS
)
837 req_is_hit
= Signal()
838 req_is_miss
= Signal()
839 req_laddr
= Signal(64)
841 tlb_req_index
= Signal(self
.TL_BITS
)
842 real_addr
= Signal(self
.REAL_ADDR_BITS
)
844 priv_fault
= Signal()
846 use_previous
= Signal()
848 cache_out_row
= Signal(self
.ROW_SIZE_BITS
)
850 plru_victim
= Signal(self
.WAY_BITS
)
851 replace_way
= Signal(self
.WAY_BITS
)
853 self
.tlbmem
= Memory(depth
=self
.TLB_SIZE
,
854 width
=self
.TLB_EA_TAG_BITS
+self
.TLB_PTE_BITS
)
855 self
.tagmem
= Memory(depth
=self
.NUM_LINES
,
856 width
=self
.TAG_RAM_WIDTH
)
858 # call sub-functions putting everything together,
859 # using shared signals established above
860 self
.rams(m
, r
, cache_out_row
, use_previous
, replace_way
, req_row
)
861 self
.maybe_plrus(m
, r
, plru_victim
)
862 self
.itlb_lookup(m
, tlb_req_index
, itlb
, itlb_valid
, real_addr
,
863 ra_valid
, eaa_priv
, priv_fault
,
865 self
.itlb_update(m
, itlb
, itlb_valid
)
866 self
.icache_comb(m
, use_previous
, r
, req_index
, req_row
, req_hit_way
,
867 req_tag
, real_addr
, req_laddr
,
869 access_ok
, req_is_hit
, req_is_miss
,
870 replace_way
, plru_victim
, cache_out_row
)
871 self
.icache_hit(m
, use_previous
, r
, req_is_hit
, req_hit_way
,
872 req_index
, req_tag
, real_addr
)
873 self
.icache_miss(m
, r
, req_is_miss
, req_index
,
874 req_laddr
, req_tag
, replace_way
,
876 access_ok
, real_addr
)
877 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
878 # req_is_miss, req_is_hit, lway, wstate, r)
880 # don't connect up to FetchUnitInterface so that some unit tests
881 # can continue to operate
882 if not self
.use_fetch_iface
:
885 # connect to FetchUnitInterface. FetchUnitInterface is undocumented
886 # so needs checking and iterative revising
887 i_in
, bus
, i_out
= self
.i_in
, self
.bus
, self
.i_out
888 comb
+= i_in
.req
.eq(self
.a_i_valid
)
889 comb
+= i_in
.nia
.eq(self
.a_pc_i
)
890 comb
+= self
.stall_in
.eq(self
.a_stall_i
)
891 comb
+= self
.f_fetch_err_o
.eq(i_out
.fetch_failed
)
892 comb
+= self
.f_badaddr_o
.eq(i_out
.nia
)
893 comb
+= self
.f_instr_o
.eq(i_out
.insn
)
894 comb
+= self
.f_busy_o
.eq(~i_out
.valid
) # probably
896 # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
898 comb
+= ibus
.adr
.eq(self
.bus
.adr
)
899 comb
+= ibus
.dat_w
.eq(self
.bus
.dat_w
)
900 comb
+= ibus
.sel
.eq(self
.bus
.sel
)
901 comb
+= ibus
.cyc
.eq(self
.bus
.cyc
)
902 comb
+= ibus
.stb
.eq(self
.bus
.stb
)
903 comb
+= ibus
.we
.eq(self
.bus
.we
)
905 comb
+= self
.bus
.dat_r
.eq(ibus
.dat_r
)
906 comb
+= self
.bus
.ack
.eq(ibus
.ack
)
907 if hasattr(ibus
, "stall"):
908 comb
+= self
.bus
.stall
.eq(ibus
.stall
)
910 # fake-up the wishbone stall signal to comply with pipeline mode
911 # same thing is done in dcache.py
912 comb
+= self
.bus
.stall
.eq(self
.bus
.cyc
& ~self
.bus
.ack
)
922 yield i_in
.priv_mode
.eq(1)
925 yield i_in
.stop_mark
.eq(0)
926 yield m_out
.tlbld
.eq(0)
927 yield m_out
.tlbie
.eq(0)
928 yield m_out
.addr
.eq(0)
929 yield m_out
.pte
.eq(0)
935 # miss, stalls for a bit
937 yield i_in
.nia
.eq(Const(0x0000000000000004, 64))
939 valid
= yield i_out
.valid
942 valid
= yield i_out
.valid
945 insn
= yield i_out
.insn
946 nia
= yield i_out
.nia
947 assert insn
== 0x00000001, \
948 "insn @%x=%x expected 00000001" % (nia
, insn
)
954 yield i_in
.nia
.eq(Const(0x0000000000000008, 64))
956 valid
= yield i_out
.valid
959 valid
= yield i_out
.valid
962 nia
= yield i_out
.nia
963 insn
= yield i_out
.insn
965 assert insn
== 0x00000002, \
966 "insn @%x=%x expected 00000002" % (nia
, insn
)
970 yield i_in
.nia
.eq(Const(0x0000000000000040, 64))
972 valid
= yield i_out
.valid
975 valid
= yield i_out
.valid
979 insn
= yield i_out
.insn
980 assert insn
== 0x00000010, \
981 "insn @%x=%x expected 00000010" % (nia
, insn
)
983 # test something that aliases (this only works because
984 # the unit test SRAM is a depth of 512)
986 yield i_in
.nia
.eq(Const(0x0000000000000100, 64))
989 valid
= yield i_out
.valid
994 insn
= yield i_out
.insn
995 valid
= yield i_out
.valid
996 insn
= yield i_out
.insn
998 assert insn
== 0x00000040, \
999 "insn @%x=%x expected 00000040" % (nia
, insn
)
1000 yield i_in
.req
.eq(0)
1003 def test_icache(mem
):
1004 from soc
.config
.test
.test_loadstore
import TestMemPspec
1005 pspec
= TestMemPspec(addr_wid
=32,
1011 memory
= Memory(width
=64, depth
=512, init
=mem
)
1012 sram
= SRAM(memory
=memory
, granularity
=8)
1016 m
.submodules
.icache
= dut
1017 m
.submodules
.sram
= sram
1019 m
.d
.comb
+= sram
.bus
.cyc
.eq(dut
.bus
.cyc
)
1020 m
.d
.comb
+= sram
.bus
.stb
.eq(dut
.bus
.stb
)
1021 m
.d
.comb
+= sram
.bus
.we
.eq(dut
.bus
.we
)
1022 m
.d
.comb
+= sram
.bus
.sel
.eq(dut
.bus
.sel
)
1023 m
.d
.comb
+= sram
.bus
.adr
.eq(dut
.bus
.adr
)
1024 m
.d
.comb
+= sram
.bus
.dat_w
.eq(dut
.bus
.dat_w
)
1026 m
.d
.comb
+= dut
.bus
.ack
.eq(sram
.bus
.ack
)
1027 m
.d
.comb
+= dut
.bus
.dat_r
.eq(sram
.bus
.dat_r
)
1033 sim
.add_sync_process(wrap(icache_sim(dut
)))
1034 with sim
.write_vcd('test_icache.vcd'):
1038 if __name__
== '__main__':
1039 from soc
.config
.test
.test_loadstore
import TestMemPspec
1040 pspec
= TestMemPspec(addr_wid
=64,
1045 vl
= rtlil
.convert(dut
, ports
=[])
1046 with
open("test_icache.il", "w") as f
:
1049 # set up memory every 32-bits with incrementing values 0 1 2 ...
1051 for i
in range(512):
1052 mem
.append((i
*2) |
((i
*2+1)<<32))