1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display
from nmutil.latch import SRLatch

#from nmutil.plru import PLRU
from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface
from soc.minerva.units.fetch import FetchUnitInterface


# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0


SIM = 0
# Non-zero to enable log data collection
LOG_LENGTH = 0

class ICacheConfig:
    def __init__(self, LINE_SIZE = 64,
                       NUM_LINES = 64,     # Number of lines in a set
                       NUM_WAYS = 2,       # Number of ways
                       TLB_SIZE = 64,      # L1 ITLB number of entries
                       TLB_LG_PGSZ = 12):  # L1 ITLB log_2(page_size)
        self.LINE_SIZE = LINE_SIZE
        self.NUM_LINES = NUM_LINES
        self.NUM_WAYS = NUM_WAYS
        self.TLB_SIZE = TLB_SIZE
        self.TLB_LG_PGSZ = TLB_LG_PGSZ

        # BRAM organisation: We never access more than wishbone_data_bits
        # at a time so to save resources we make the array only that wide,
        # and use consecutive indices to make a cache "line"
        #
        # self.ROW_SIZE is the width in bytes of the BRAM
        # (based on WB, so 64-bits)
        self.ROW_SIZE = WB_DATA_BITS // 8
        # Number of real address bits that we store
        self.REAL_ADDR_BITS = 56

        self.ROW_SIZE_BITS = self.ROW_SIZE * 8
        # ROW_PER_LINE is the number of row (wishbone) transactions in a line
        self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
        # BRAM_ROWS is the number of rows in BRAM
        # needed to represent the full icache
        self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
        # INSN_PER_ROW is the number of 32bit instructions per BRAM row
        self.INSN_PER_ROW = self.ROW_SIZE_BITS // 32

        # Bit field counts in the address
        #
        # INSN_BITS is the number of bits to select an instruction in a row
        self.INSN_BITS = log2_int(self.INSN_PER_ROW)
        # ROW_BITS is the number of bits to select a row
        self.ROW_BITS = log2_int(self.BRAM_ROWS)
        # ROW_LINE_BITS is the number of bits to select a row within a line
        self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
        # LINE_OFF_BITS is the number of bits for the offset in a cache line
        self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
        # ROW_OFF_BITS is the number of bits for the offset in a row
        self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
        # INDEX_BITS is the number of bits to select a cache line
        self.INDEX_BITS = log2_int(self.NUM_LINES)
        # SET_SIZE_BITS is the log base 2 of the set size
        self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
        # TAG_BITS is the number of bits of the tag part of the address
        self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
        # TAG_WIDTH is the width in bits of each way of the tag RAM
        self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)

        # WAY_BITS is the number of bits to select a way
        self.WAY_BITS = log2_int(self.NUM_WAYS)
        self.TAG_RAM_WIDTH = self.TAG_BITS * self.NUM_WAYS

        # L1 ITLB
        self.TL_BITS = log2_int(self.TLB_SIZE)
        self.TLB_EA_TAG_BITS = 64 - (self.TLB_LG_PGSZ + self.TL_BITS)
        self.TLB_PTE_BITS = 64

        print("self.BRAM_ROWS =", self.BRAM_ROWS)
        print("self.INDEX_BITS =", self.INDEX_BITS)
        print("self.INSN_BITS =", self.INSN_BITS)
        print("self.INSN_PER_ROW =", self.INSN_PER_ROW)
        print("self.LINE_SIZE =", self.LINE_SIZE)
        print("self.LINE_OFF_BITS =", self.LINE_OFF_BITS)
        print("LOG_LENGTH =", LOG_LENGTH)
        print("self.NUM_LINES =", self.NUM_LINES)
        print("self.NUM_WAYS =", self.NUM_WAYS)
        print("self.REAL_ADDR_BITS =", self.REAL_ADDR_BITS)
        print("self.ROW_BITS =", self.ROW_BITS)
        print("self.ROW_OFF_BITS =", self.ROW_OFF_BITS)
        print("self.ROW_LINE_BITS =", self.ROW_LINE_BITS)
        print("self.ROW_PER_LINE =", self.ROW_PER_LINE)
        print("self.ROW_SIZE =", self.ROW_SIZE)
        print("self.ROW_SIZE_BITS =", self.ROW_SIZE_BITS)
        print("self.SET_SIZE_BITS =", self.SET_SIZE_BITS)
        print("SIM =", SIM)
        print("self.TAG_BITS =", self.TAG_BITS)
        print("self.TAG_RAM_WIDTH =", self.TAG_RAM_WIDTH)
        print("self.TAG_WIDTH =", self.TAG_WIDTH)
        print("self.TL_BITS =", self.TL_BITS)
        print("self.TLB_EA_TAG_BITS =", self.TLB_EA_TAG_BITS)
        print("self.TLB_LG_PGSZ =", self.TLB_LG_PGSZ)
        print("self.TLB_PTE_BITS =", self.TLB_PTE_BITS)
        print("self.TLB_SIZE =", self.TLB_SIZE)
        print("self.WAY_BITS =", self.WAY_BITS)

        assert self.LINE_SIZE % self.ROW_SIZE == 0
        assert ispow2(self.LINE_SIZE), "self.LINE_SIZE not power of 2"
        assert ispow2(self.NUM_LINES), "self.NUM_LINES not power of 2"
        assert ispow2(self.ROW_PER_LINE), "self.ROW_PER_LINE not power of 2"
        assert ispow2(self.INSN_PER_ROW), "self.INSN_PER_ROW not power of 2"
        assert (self.ROW_BITS == (self.INDEX_BITS + self.ROW_LINE_BITS)), \
            "geometry bits don't add up"
        assert (self.LINE_OFF_BITS ==
                (self.ROW_OFF_BITS + self.ROW_LINE_BITS)), \
            "geometry bits don't add up"
        assert (self.REAL_ADDR_BITS ==
                (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS)), \
            "geometry bits don't add up"
        assert (self.REAL_ADDR_BITS ==
                (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS)), \
            "geometry bits don't add up"

    # Example of layout for 32 lines of 64 bytes:
    #
    # ..  tag    |index|  line  |
    # ..         |   row   |    |
    # ..         |     |   | |00| zero               (2)
    # ..         |     |   |-|  | self.INSN_BITS     (1)
    # ..         |     |---|    | self.ROW_LINE_BITS (3)
    # ..         |     |--- - --| self.LINE_OFF_BITS (6)
    # ..         |         |- --| self.ROW_OFF_BITS  (3)
    # ..         |----- ---|    | self.ROW_BITS      (8)
    # ..         |-----|        | self.INDEX_BITS    (5)
    # .. --------|              | self.TAG_BITS      (53)

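    # Worked example, derived from the formulas above (default constructor
    # arguments, 64-bit wishbone data bus): LINE_SIZE=64, NUM_LINES=64,
    # NUM_WAYS=2 gives ROW_SIZE=8, ROW_PER_LINE=8, BRAM_ROWS=512,
    # INSN_PER_ROW=2, INSN_BITS=1, ROW_BITS=9, ROW_LINE_BITS=3,
    # LINE_OFF_BITS=6, ROW_OFF_BITS=3, INDEX_BITS=6, SET_SIZE_BITS=12,
    # TAG_BITS=44, WAY_BITS=1 and TAG_RAM_WIDTH=88.
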
    # The cache data BRAM organized as described above for each way
    #subtype cache_row_t is std_ulogic_vector(self.ROW_SIZE_BITS-1 downto 0);
    #
    def RowPerLineValidArray(self):
        return Array(Signal(name="rows_valid_%d" %x) \
                     for x in range(self.ROW_PER_LINE))

    # TODO to be passed to nmigen as ram attributes
    # attribute ram_style : string;
    # attribute ram_style of cache_tags : signal is "distributed";

    def TLBRecord(self, name):
        tlb_layout = [('tag', self.TLB_EA_TAG_BITS),
                      ('pte', self.TLB_PTE_BITS)
                     ]
        return Record(tlb_layout, name=name)

    def TLBArray(self):
        return Array(self.TLBRecord("tlb%d" % x) for x in range(self.TLB_SIZE))

    # PLRU output interface
    def PLRUOut(self):
        return Array(Signal(self.WAY_BITS, name="plru_out_%d" %x) \
                     for x in range(self.NUM_LINES))

    # Return the cache line index (tag index) for an address
    def get_index(self, addr):
        return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]

    # Return the cache row index (data memory) for an address
    def get_row(self, addr):
        return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]

    # Return the index of a row within a line
    def get_row_of_line(self, row):
        return row[:self.ROW_BITS][:self.ROW_LINE_BITS]

    # Returns whether this is the last row of a line
    def is_last_row_addr(self, addr, last):
        return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last

    # Returns whether this is the last row of a line
    def is_last_row(self, row, last):
        return self.get_row_of_line(row) == last

    # Return the next row in the current cache line.  We use a dedicated
    # function in order to limit the size of the generated adder to be
    # only the bits within a cache line (3 bits with default settings)
    def next_row(self, row):
        row_v = row[0:self.ROW_LINE_BITS] + 1
        return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
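
    # Illustrative example: with ROW_LINE_BITS=3, row 0b10_111 steps to
    # 0b10_000 -- the upper (line index) bits pass through unchanged, so
    # the increment wraps around within the same cache line.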

    # Read the instruction word for the given address
    # in the current cache row
    def read_insn_word(self, addr, data):
        word = addr[2:self.INSN_BITS+2]
        return data.word_select(word, 32)
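
    # Illustrative example: with the default 64-bit rows (INSN_PER_ROW=2,
    # INSN_BITS=1), addr bit 2 alone selects which of the two 32-bit
    # words of the row is returned.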

    # Get the tag value from the address
    def get_tag(self, addr):
        return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]

    # Read a tag from a tag memory row
    def read_tag(self, way, tagset):
        return tagset.word_select(way, self.TAG_BITS)

    # Write a tag to tag memory row
    def write_tag(self, way, tagset, tag):
        return self.read_tag(way, tagset).eq(tag)
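
    # Illustrative example: with the default TAG_BITS=44 and NUM_WAYS=2,
    # way 0 occupies tagset bits [0:44] and way 1 occupies bits [44:88].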

    # Simple hash for direct-mapped TLB index
    def hash_ea(self, addr):
        hsh = (addr[self.TLB_LG_PGSZ:self.TLB_LG_PGSZ + self.TL_BITS] ^
               addr[self.TLB_LG_PGSZ + self.TL_BITS:
                    self.TLB_LG_PGSZ + 2 * self.TL_BITS] ^
               addr[self.TLB_LG_PGSZ + 2 * self.TL_BITS:
                    self.TLB_LG_PGSZ + 3 * self.TL_BITS])
        return hsh
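
    # Illustrative example: with TLB_LG_PGSZ=12 and TL_BITS=6 this
    # XOR-folds addr[12:18], addr[18:24] and addr[24:30].  For
    # addr=0x3F000 those three fields are 0x3F, 0x00 and 0x00,
    # giving TLB index 0x3F.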


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self, cfg):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(cfg.WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(cfg.WAY_BITS)
        self.store_index = Signal(cfg.INDEX_BITS)
        self.store_row = Signal(cfg.ROW_BITS)
        self.store_tag = Signal(cfg.TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(cfg.ROW_LINE_BITS)
        self.rows_valid = cfg.RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(FetchUnitInterface, Elaboratable, ICacheConfig):
    """64 bit set associative icache.  All instructions are 4B aligned."""
    def __init__(self, pspec):
        FetchUnitInterface.__init__(self, pspec)
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             #alignment=0,
                             name="icache_wb")

        self.log_out = Signal(54)

        # use FetchUnitInterface, helps keep some unit tests running
        self.use_fetch_iface = False

        # test if microwatt compatibility is to be enabled
        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
                                 (pspec.microwatt_compat == True))

        if self.microwatt_compat:
            # reduce way sizes and num lines
            ICacheConfig.__init__(self, NUM_LINES = 4,
                                        NUM_WAYS = 1,
                                  )
        else:
            ICacheConfig.__init__(self)

    def use_fetch_interface(self):
        self.use_fetch_iface = True

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(self.ROW_BITS)
        wr_addr = Signal(self.ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(self.NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(self.NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(self.NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out = Signal(self.ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(self.ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(self.ROW_BITS, self.ROW_SIZE_BITS,
                           TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, self.ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if self.NUM_WAYS == 0:
            return

        m.submodules.plrus = plru = PLRUs(self.NUM_LINES, self.WAY_BITS)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(self.get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index)  # select victim
        comb += plru_victim.eq(plru.o_index)  # selected victim

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        # use an *asynchronous* Memory read port here (combinatorial)
        m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
        tlb = self.TLBRecord("tlb_rdport")
        pte, ttag = tlb.pte, tlb.tag

        comb += tlb_req_index.eq(self.hash_ea(i_in.nia))
        comb += rd_tlb.addr.eq(tlb_req_index)
        comb += tlb.eq(rd_tlb.data)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:self.TLB_LG_PGSZ],
                                     pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[self.TLB_LG_PGSZ + self.TL_BITS:64]):
                comb += ra_valid.eq(itlb_valid.q.bit_select(tlb_req_index, 1))

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:self.REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb, itlb_valid):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(self.TL_BITS)
        wr_unary = Signal(self.TLB_SIZE)
        comb += wr_index.eq(self.hash_ea(m_in.addr))
        comb += wr_unary.eq(1<<wr_index)

        m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()
        sync += itlb_valid.s.eq(0)
        sync += itlb_valid.r.eq(0)

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            sync += itlb_valid.r.eq(-1)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid.r.eq(wr_unary)

        with m.Elif(m_in.tlbld):
            tlb = self.TLBRecord("tlb_wrport")
            comb += tlb.tag.eq(m_in.addr[self.TLB_LG_PGSZ + self.TL_BITS:64])
            comb += tlb.pte.eq(m_in.pte)
            comb += wr_tlb.en.eq(1)
            comb += wr_tlb.addr.eq(wr_index)
            comb += wr_tlb.data.eq(tlb)
            sync += itlb_valid.s.eq(wr_unary)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valids, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb
        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(self.WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:self.INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(self.get_index(i_in.nia))
        comb += req_row.eq(self.get_row(i_in.nia))
        comb += req_tag.eq(self.get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, self.ROW_OFF_BITS),
                 real_addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS],
        ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % self.ROW_PER_LINE]
                          )
        # i_in.req asserts Decoder active
        cvb = Signal(self.NUM_WAYS)
        ctag = Signal(self.TAG_RAM_WIDTH)
        comb += rd_tag.addr.eq(req_index)
        comb += ctag.eq(rd_tag.data)
        comb += cvb.eq(cache_valids.q.word_select(req_index, self.NUM_WAYS))
        m.submodules.store_way_e = se = Decoder(self.NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(self.NUM_WAYS):
            tagi = Signal(self.TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(self.read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch.  In this
        # case we output the result of a mux.  The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(self.read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(self.ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                "cache miss nia:%x IR:%x SM:%x idx:%x "
                " way:%x tag:%x RA:%x", i_in.nia,
                i_in.virt_mode, i_in.stop_mark, req_index,
                replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(self.ROW_BITS)
            comb += st_row.eq(self.get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(self.get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            cache_valids):
        comb = m.d.comb
        sync = m.d.sync
        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
                                            granularity=self.TAG_BITS)

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        idx = req_index*self.NUM_WAYS + replace_way  # 2D index, 1st dim: NUM_WAYS
        comb += cache_valids.r.eq(1<<idx)

        # use write-port "granularity" to select the tag to write to
        # TODO: the Memory should be multiplied-up (by NUM_TAGS)
        tagset = Signal(self.TAG_RAM_WIDTH)
        comb += tagset.eq(r.store_tag << (replace_way*self.TAG_BITS))
        comb += wr_tag.en.eq(1<<replace_way)
        comb += wr_tag.addr.eq(r.store_index)
        comb += wr_tag.data.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_valids, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(self.is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(self.LINE_OFF_BITS - self.ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[self.ROW_OFF_BITS:
                                         self.LINE_OFF_BITS] + 1)
            sync += r.req_adr[self.ROW_OFF_BITS:self.LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % self.ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & self.is_last_row(r.store_row,
                                                   r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                idx = r.store_index*self.NUM_WAYS + replace_way  # 2D index again
                valid = r.store_valid & ~inval_in
                comb += cache_valids.s.eq(1<<idx)
                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(self.next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_valids, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            comb += cache_valids.r.eq(-1)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index,
                                             cache_valids)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_valids, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(self.WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                     ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Cache-Ways "valid" indicators.  This is a 2D Signal, indexed by
        # the number of ways and the number of lines.
        vec = SRLatch(sync=True, llen=self.NUM_WAYS*self.NUM_LINES,
                      name="cachevalids")
        m.submodules.cache_valids = cache_valids = vec
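
        # Illustrative example: the latch bit for (line, way) is
        # line*NUM_WAYS + way (see icache_miss_clr_tag above), so with
        # NUM_WAYS=2, line 3 / way 1 is bit 7.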

        # TLB Array
        itlb = self.TLBArray()
        vec = SRLatch(sync=False, llen=self.TLB_SIZE, name="tlbvalids")
        m.submodules.itlb_valids = itlb_valid = vec

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal(self)

        # Async signal on incoming request
        req_index = Signal(self.INDEX_BITS)
        req_row = Signal(self.ROW_BITS)
        req_hit_way = Signal(self.WAY_BITS)
        req_tag = Signal(self.TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(self.TL_BITS)
        real_addr = Signal(self.REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(self.ROW_SIZE_BITS)

        plru_victim = Signal(self.WAY_BITS)
        replace_way = Signal(self.WAY_BITS)

        self.tlbmem = Memory(depth=self.TLB_SIZE,
                             width=self.TLB_EA_TAG_BITS+self.TLB_PTE_BITS)
        self.tagmem = Memory(depth=self.NUM_LINES,
                             width=self.TAG_RAM_WIDTH)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb, itlb_valid)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_valids,
                         access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way,
                         cache_valids,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        # don't connect up to FetchUnitInterface so that some unit tests
        # can continue to operate
        if not self.use_fetch_iface:
            return m

        # connect to FetchUnitInterface.  FetchUnitInterface is undocumented
        # so needs checking and iterative revising
        i_in, bus, i_out = self.i_in, self.bus, self.i_out
        comb += i_in.req.eq(self.a_i_valid)
        comb += i_in.nia.eq(self.a_pc_i)
        comb += self.stall_in.eq(self.a_stall_i)
        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
        comb += self.f_badaddr_o.eq(i_out.nia)
        comb += self.f_instr_o.eq(i_out.insn)
        comb += self.f_busy_o.eq(~i_out.valid)  # probably

        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
        ibus = self.ibus
        comb += ibus.adr.eq(self.bus.adr)
        comb += ibus.dat_w.eq(self.bus.dat_w)
        comb += ibus.sel.eq(self.bus.sel)
        comb += ibus.cyc.eq(self.bus.cyc)
        comb += ibus.stb.eq(self.bus.stb)
        comb += ibus.we.eq(self.bus.we)

        comb += self.bus.dat_r.eq(ibus.dat_r)
        comb += self.bus.ack.eq(ibus.ack)
        if hasattr(ibus, "stall"):
            comb += self.bus.stall.eq(ibus.stall)
        else:
            # fake-up the wishbone stall signal to comply with pipeline mode
            # same thing is done in dcache.py
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_in.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    insn = yield i_out.insn
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=32,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=64,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))
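
    # Illustrative note: mem[0] = 0x0000_0001_0000_0000, so byte address 0
    # holds insn 0 and byte address 4 holds insn 1 -- matching the values
    # asserted in icache_sim() above.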

    test_icache(mem)