1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """
28
29 from enum import (Enum, unique)
30 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
31 Record)
32 from nmigen.cli import main, rtlil
33 from nmutil.iocontrol import RecordObject
34 from nmigen.utils import log2_int
35 from nmigen.lib.coding import Decoder
36 from nmutil.util import Display
37 from nmutil.latch import SRLatch
38
39 #from nmutil.plru import PLRU
40 from soc.experiment.plru import PLRU, PLRUs
41 from soc.experiment.cache_ram import CacheRam
42
43 from soc.experiment.mem_types import (Fetch1ToICacheType,
44 ICacheToDecode1Type,
45 MMUToICacheType)
46
47 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
48 WB_SEL_BITS, WBAddrType, WBDataType,
49 WBSelType, WBMasterOut, WBSlaveOut,
50 )
51
52 from nmigen_soc.wishbone.bus import Interface
53 from soc.minerva.units.fetch import FetchUnitInterface
54
55
56 # for test
57 from soc.bus.sram import SRAM
58 from nmigen import Memory
59 from nmutil.util import wrap
60 from nmigen.cli import main, rtlil
61
62 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
63 # Also, check out the cxxsim nmigen branch, and latest yosys from git
64 from nmutil.sim_tmp_alternative import Simulator, Settle
65
66 # from microwatt/utils.vhdl
67 def ispow2(n):
68 return n != 0 and (n & (n - 1)) == 0
69
70 SIM = 0
71 # Non-zero to enable log data collection
72 LOG_LENGTH = 0
73
74 class ICacheConfig:
75 def __init__(self, LINE_SIZE = 64,
76 NUM_LINES = 64, # Number of lines in a set
77 NUM_WAYS = 2, # Number of ways
78 TLB_SIZE = 64, # L1 ITLB number of entries
79 TLB_LG_PGSZ = 12): # L1 ITLB log_2(page_size)
80 self.LINE_SIZE = LINE_SIZE
81 self.NUM_LINES = NUM_LINES
82 self.NUM_WAYS = NUM_WAYS
83 self.TLB_SIZE = TLB_SIZE
84 self.TLB_LG_PGSZ = TLB_LG_PGSZ
85
86 # BRAM organisation: We never access more than wishbone_data_bits
87 # at a time so to save resources we make the array only that wide,
88 # and use consecutive indices for to make a cache "line"
89 #
90 # self.ROW_SIZE is the width in bytes of the BRAM
91 # (based on WB, so 64-bits)
92 self.ROW_SIZE = WB_DATA_BITS // 8
93 # Number of real address bits that we store
94 self.REAL_ADDR_BITS = 56
95
96 self.ROW_SIZE_BITS = self.ROW_SIZE * 8
97 # ROW_PER_LINE is the number of row (wishbone) transactions in a line
98 self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
99 # BRAM_ROWS is the number of rows in BRAM
100 # needed to represent the full icache
101 self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
102 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
103 self.INSN_PER_ROW = self.ROW_SIZE_BITS // 32
104
105 # Bit fields counts in the address
106 #
107 # INSN_BITS is the number of bits to select an instruction in a row
108 self.INSN_BITS = log2_int(self.INSN_PER_ROW)
109 # ROW_BITS is the number of bits to select a row
110 self.ROW_BITS = log2_int(self.BRAM_ROWS)
111 # ROW_LINE_BITS is the number of bits to select a row within a line
112 self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
113 # LINE_OFF_BITS is the number of bits for the offset in a cache line
114 self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
115 # ROW_OFF_BITS is the number of bits for the offset in a row
116 self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
117 # INDEX_BITS is the number of bits to select a cache line
118 self.INDEX_BITS = log2_int(self.NUM_LINES)
119 # SET_SIZE_BITS is the log base 2 of the set size
120 self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
121 # TAG_BITS is the number of bits of the tag part of the address
122 self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
123 # TAG_WIDTH is the width in bits of each way of the tag RAM
124 self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
125
126 # WAY_BITS is the number of bits to select a way
127 self.WAY_BITS = log2_int(self.NUM_WAYS)
128 self.TAG_RAM_WIDTH = self.TAG_BITS * self.NUM_WAYS
129
130 # L1 ITLB
131 self.TL_BITS = log2_int(self.TLB_SIZE)
132 self.TLB_EA_TAG_BITS = 64 - (self.TLB_LG_PGSZ + self.TL_BITS)
133 self.TLB_PTE_BITS = 64
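
        # Worked example of the geometry arithmetic above (a sketch,
        # assuming the default configuration: LINE_SIZE=64, NUM_LINES=64,
        # NUM_WAYS=2, TLB_SIZE=64, TLB_LG_PGSZ=12, WB_DATA_BITS=64):
        #   ROW_SIZE      = 64 // 8    = 8 bytes
        #   ROW_PER_LINE  = 64 // 8    = 8 rows per line
        #   BRAM_ROWS     = 64 * 8     = 512 rows
        #   INSN_PER_ROW  = 64 // 32   = 2 instructions
        #   INSN_BITS=1, ROW_BITS=9, ROW_LINE_BITS=3,
        #   LINE_OFF_BITS=6, ROW_OFF_BITS=3, INDEX_BITS=6
        #   SET_SIZE_BITS = 6 + 6      = 12
        #   TAG_BITS      = 56 - 12    = 44
        #   TAG_RAM_WIDTH = 44 * 2     = 88
        #   TL_BITS=6, TLB_EA_TAG_BITS = 64 - (12 + 6) = 46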
134
135 print("self.BRAM_ROWS =", self.BRAM_ROWS)
136 print("self.INDEX_BITS =", self.INDEX_BITS)
137 print("self.INSN_BITS =", self.INSN_BITS)
138 print("self.INSN_PER_ROW =", self.INSN_PER_ROW)
139 print("self.LINE_SIZE =", self.LINE_SIZE)
140 print("self.LINE_OFF_BITS =", self.LINE_OFF_BITS)
141 print("LOG_LENGTH =", LOG_LENGTH)
142 print("self.NUM_LINES =", self.NUM_LINES)
143 print("self.NUM_WAYS =", self.NUM_WAYS)
144 print("self.REAL_ADDR_BITS =", self.REAL_ADDR_BITS)
145 print("self.ROW_BITS =", self.ROW_BITS)
146 print("self.ROW_OFF_BITS =", self.ROW_OFF_BITS)
147 print("self.ROW_LINE_BITS =", self.ROW_LINE_BITS)
148 print("self.ROW_PER_LINE =", self.ROW_PER_LINE)
149 print("self.ROW_SIZE =", self.ROW_SIZE)
150 print("self.ROW_SIZE_BITS =", self.ROW_SIZE_BITS)
151 print("self.SET_SIZE_BITS =", self.SET_SIZE_BITS)
152 print("SIM =", SIM)
153 print("self.TAG_BITS =", self.TAG_BITS)
154 print("self.TAG_RAM_WIDTH =", self.TAG_RAM_WIDTH)
155 print("self.TAG_BITS =", self.TAG_BITS)
156 print("self.TL_BITS =", self.TL_BITS)
157 print("self.TLB_EA_TAG_BITS =", self.TLB_EA_TAG_BITS)
158 print("self.TLB_LG_PGSZ =", self.TLB_LG_PGSZ)
159 print("self.TLB_PTE_BITS =", self.TLB_PTE_BITS)
160 print("self.TLB_SIZE =", self.TLB_SIZE)
161 print("self.WAY_BITS =", self.WAY_BITS)
162
163 assert self.LINE_SIZE % self.ROW_SIZE == 0
164 assert ispow2(self.LINE_SIZE), "self.LINE_SIZE not power of 2"
165 assert ispow2(self.NUM_LINES), "self.NUM_LINES not power of 2"
166 assert ispow2(self.ROW_PER_LINE), "self.ROW_PER_LINE not power of 2"
167 assert ispow2(self.INSN_PER_ROW), "self.INSN_PER_ROW not power of 2"
168 assert (self.ROW_BITS == (self.INDEX_BITS + self.ROW_LINE_BITS)), \
169 "geometry bits don't add up"
170 assert (self.LINE_OFF_BITS ==
171 (self.ROW_OFF_BITS + self.ROW_LINE_BITS)), \
172 "geometry bits don't add up"
173 assert (self.REAL_ADDR_BITS ==
174 (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS)), \
175 "geometry bits don't add up"
176 assert (self.REAL_ADDR_BITS ==
177 (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS)), \
178 "geometry bits don't add up"
179
180 # Example of layout for 32 lines of 64 bytes:
181 #
182 # .. tag |index| line |
183 # .. | row | |
184 # .. | | | |00| zero (2)
185 # .. | | |-| | self.INSN_BITS (1)
186 # .. | |---| | self.ROW_LINE_BITS (3)
187 # .. | |--- - --| self.LINE_OFF_BITS (6)
188 # .. | |- --| self.ROW_OFF_BITS (3)
189 # .. |----- ---| | self.ROW_BITS (8)
190 # .. |-----| | self.INDEX_BITS (5)
191 # .. --------| | self.TAG_BITS (53)
192
193 # The cache data BRAM organized as described above for each way
194 #subtype cache_row_t is std_ulogic_vector(self.ROW_SIZE_BITS-1 downto 0);
195 #
196 def RowPerLineValidArray(self):
197 return Array(Signal(name="rows_valid_%d" %x) \
198 for x in range(self.ROW_PER_LINE))
199
200
201 # TODO to be passed to nigmen as ram attributes
202 # attribute ram_style : string;
203 # attribute ram_style of cache_tags : signal is "distributed";
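    # (a sketch of the nmigen equivalent: pass an attrs dict to the
    #  Memory constructor, e.g.
    #      Memory(depth=d, width=w, attrs={'ram_style': "distributed"})
    #  whether the attribute is honoured depends on the synthesis tool;
    #  the commented-out syn_ramstyle attrs on tlbmem/tagmem in
    #  elaborate() below show the same idea for Synplify-style tools)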

    def TLBRecord(self, name):
        tlb_layout = [('tag', self.TLB_EA_TAG_BITS),
                      ('pte', self.TLB_PTE_BITS)
                      ]
        return Record(tlb_layout, name=name)

    def TLBArray(self):
        return Array(self.TLBRecord("tlb%d" % x)
                     for x in range(self.TLB_SIZE))

    # PLRU output interface
    def PLRUOut(self):
        return Array(Signal(self.WAY_BITS, name="plru_out_%d" % x)
                     for x in range(self.NUM_LINES))

    # Return the cache line index (tag index) for an address
    def get_index(self, addr):
        return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]

    # Return the cache row index (data memory) for an address
    def get_row(self, addr):
        return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]

    # Return the index of a row within a line
    def get_row_of_line(self, row):
        return row[:self.ROW_BITS][:self.ROW_LINE_BITS]

    # Returns whether this is the last row of a line
    def is_last_row_addr(self, addr, last):
        return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last

    # Returns whether this is the last row of a line
    def is_last_row(self, row, last):
        return self.get_row_of_line(row) == last

    # Return the next row in the current cache line. We use a dedicated
    # function in order to limit the size of the generated adder to be
    # only the bits within a cache line (3 bits with default settings)
    def next_row(self, row):
        row_v = row[0:self.ROW_LINE_BITS] + 1
        return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
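        # (e.g. with the default ROW_LINE_BITS=3, row 0b000001_111
        #  advances to 0b000001_000: only the low 3 bits wrap, and the
        #  upper bits, which select the line, pass through unchanged)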

    # Read the instruction word for the given address
    # in the current cache row
    def read_insn_word(self, addr, data):
        word = addr[2:self.INSN_BITS+2]
        return data.word_select(word, 32)

    # Get the tag value from the address
    def get_tag(self, addr):
        return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]

    # Read a tag from a tag memory row
    def read_tag(self, way, tagset):
        return tagset.word_select(way, self.TAG_BITS)

    # Write a tag to tag memory row
    def write_tag(self, way, tagset, tag):
        return self.read_tag(way, tagset).eq(tag)

    # Simple hash for direct-mapped TLB index
    def hash_ea(self, addr):
        hsh = (addr[self.TLB_LG_PGSZ:self.TLB_LG_PGSZ + self.TL_BITS] ^
               addr[self.TLB_LG_PGSZ + self.TL_BITS:
                    self.TLB_LG_PGSZ + 2 * self.TL_BITS] ^
               addr[self.TLB_LG_PGSZ + 2 * self.TL_BITS:
                    self.TLB_LG_PGSZ + 3 * self.TL_BITS])
        return hsh
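        # (with the default TLB_LG_PGSZ=12 and TL_BITS=6 this XOR-folds
        #  addr[12:18] ^ addr[18:24] ^ addr[24:30] into a 6-bit TLB index)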


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self, cfg):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(cfg.WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(cfg.WAY_BITS)
        self.store_index = Signal(cfg.INDEX_BITS)
        self.store_row = Signal(cfg.ROW_BITS)
        self.store_tag = Signal(cfg.TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(cfg.ROW_LINE_BITS)
        self.rows_valid = cfg.RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(FetchUnitInterface, Elaboratable, ICacheConfig):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self, pspec):
        FetchUnitInterface.__init__(self, pspec)
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             #alignment=0,
                             name="icache_wb")

        self.log_out = Signal(54)

        # use FetchUnitInterface, helps keep some unit tests running
        self.use_fetch_iface = False

        # test if microwatt compatibility is to be enabled
        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
                                 (pspec.microwatt_compat == True))

        if self.microwatt_compat:
            # reduce way sizes and num lines
            ICacheConfig.__init__(self, NUM_LINES=4,
                                  NUM_WAYS=1,
                                  TLB_SIZE=16  # needs device-tree update
                                  )
        else:
            ICacheConfig.__init__(self)

    def use_fetch_interface(self):
        self.use_fetch_iface = True

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(self.ROW_BITS)
        wr_addr = Signal(self.ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(self.NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(self.NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(self.NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out = Signal(self.ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(self.ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(self.ROW_BITS, self.ROW_SIZE_BITS,
                           TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, self.ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if self.NUM_WAYS == 0:
            return

        m.submodules.plrus = plru = PLRUs(self.NUM_LINES, self.WAY_BITS)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(self.get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index)    # select victim
        comb += plru_victim.eq(plru.o_index)   # selected victim

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        # use an *asynchronous* Memory read port here (combinatorial)
        m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
        tlb = self.TLBRecord("tlb_rdport")
        pte, ttag = tlb.pte, tlb.tag

        comb += tlb_req_index.eq(self.hash_ea(i_in.nia))
        comb += rd_tlb.addr.eq(tlb_req_index)
        comb += tlb.eq(rd_tlb.data)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:self.TLB_LG_PGSZ],
                                     pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[self.TLB_LG_PGSZ + self.TL_BITS:64]):
                comb += ra_valid.eq(itlb_valid.q.bit_select(tlb_req_index, 1))

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:self.REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb, itlb_valid):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(self.TL_BITS)
        wr_unary = Signal(self.TLB_SIZE)
        comb += wr_index.eq(self.hash_ea(m_in.addr))
        comb += wr_unary.eq(1 << wr_index)

        m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()
        sync += itlb_valid.s.eq(0)
        sync += itlb_valid.r.eq(0)

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            sync += itlb_valid.r.eq(-1)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid.r.eq(wr_unary)

        with m.Elif(m_in.tlbld):
            tlb = self.TLBRecord("tlb_wrport")
            comb += tlb.tag.eq(m_in.addr[self.TLB_LG_PGSZ + self.TL_BITS:64])
            comb += tlb.pte.eq(m_in.pte)
            comb += wr_tlb.en.eq(1)
            comb += wr_tlb.addr.eq(wr_index)
            comb += wr_tlb.data.eq(tlb)
            sync += itlb_valid.s.eq(wr_unary)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valids, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb
        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(self.WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:self.INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)
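        # (a concrete case, assuming the default INSN_PER_ROW=2: a
        #  sequential fetch of the odd word, nia[2]=1, directly after a
        #  hit on the even word re-uses the row read last cycle)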

        # Extract line, row and tag from request
        comb += req_index.eq(self.get_index(i_in.nia))
        comb += req_row.eq(self.get_row(i_in.nia))
        comb += req_tag.eq(self.get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, self.ROW_OFF_BITS),
                 real_addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS],
                 ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        rowvalid = Signal()
        comb += rowvalid.eq(r.rows_valid[req_row % self.ROW_PER_LINE])
        comb += hitcond.eq((r.state == State.WAIT_ACK) &
                           (req_index == r.store_index) &
                           rowvalid)
        # i_in.req asserts Decoder active
        cvb = Signal(self.NUM_WAYS)
        ctag = Signal(self.TAG_RAM_WIDTH)
        comb += rd_tag.addr.eq(req_index)
        comb += ctag.eq(rd_tag.data)
        comb += cvb.eq(cache_valids.q.word_select(req_index, self.NUM_WAYS))
        m.submodules.store_way_e = se = Decoder(self.NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(self.NUM_WAYS):
            tagi = Signal(self.TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(self.read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(self.read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(self.ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                     "cache miss nia:%x IR:%x SM:%x idx:%x "
                     "way:%x tag:%x RA:%x", i_in.nia,
                     i_in.virt_mode, i_in.stop_mark, req_index,
                     replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(self.ROW_BITS)
            comb += st_row.eq(self.get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(self.get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            cache_valids):
        comb = m.d.comb
        sync = m.d.sync
        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
                                            granularity=self.TAG_BITS)

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        # (2D index, 1st dimension: self.NUM_WAYS)
        idx = req_index * self.NUM_WAYS + replace_way
        comb += cache_valids.r.eq(1 << idx)

        # use write-port "granularity" to select the tag to write to
        # TODO: the Memory should be multiplied-up (by NUM_TAGS)
        tagset = Signal(self.TAG_RAM_WIDTH)
        comb += tagset.eq(r.store_tag << (replace_way * self.TAG_BITS))
        comb += wr_tag.en.eq(1 << replace_way)
        comb += wr_tag.addr.eq(r.store_index)
        comb += wr_tag.data.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_valids, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(self.is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(self.LINE_OFF_BITS - self.ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[self.ROW_OFF_BITS:
                                         self.LINE_OFF_BITS] + 1)
            sync += r.req_adr[self.ROW_OFF_BITS:self.LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % self.ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & self.is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid (2D index again)
                idx = r.store_index * self.NUM_WAYS + replace_way
                valid = r.store_valid & ~inval_in
                comb += cache_valids.s.eq(1 << idx)
                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(self.next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_valids, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            comb += cache_valids.r.eq(-1)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index,
                                             cache_valids)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_valids, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(self.WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Cache-Ways "valid" indicators.  this is a 2D Signal, by the
        # number of ways and the number of lines.
        vec = SRLatch(sync=True, llen=self.NUM_WAYS*self.NUM_LINES,
                      name="cachevalids")
        m.submodules.cache_valids = cache_valids = vec

        # TLB Array
        itlb = self.TLBArray()
        vec = SRLatch(sync=False, llen=self.TLB_SIZE, name="tlbvalids")
        m.submodules.itlb_valids = itlb_valid = vec

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal(self)

        # Async signal on incoming request
        req_index = Signal(self.INDEX_BITS)
        req_row = Signal(self.ROW_BITS)
        req_hit_way = Signal(self.WAY_BITS)
        req_tag = Signal(self.TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(self.TL_BITS)
        real_addr = Signal(self.REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(self.ROW_SIZE_BITS)

        plru_victim = Signal(self.WAY_BITS)
        replace_way = Signal(self.WAY_BITS)

        self.tlbmem = Memory(depth=self.TLB_SIZE,
                             width=self.TLB_EA_TAG_BITS+self.TLB_PTE_BITS,
                             #attrs={'syn_ramstyle': "block_ram"}
                             )
        self.tagmem = Memory(depth=self.NUM_LINES,
                             width=self.TAG_RAM_WIDTH,
                             #attrs={'syn_ramstyle': "block_ram"}
                             )

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb, itlb_valid)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_valids,
                         access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way,
                         cache_valids,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        # don't connect up to FetchUnitInterface so that some unit tests
        # can continue to operate
        if not self.use_fetch_iface:
            return m

        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
        # so needs checking and iterative revising
        i_in, bus, i_out = self.i_in, self.bus, self.i_out
        comb += i_in.req.eq(self.a_i_valid)
        comb += i_in.nia.eq(self.a_pc_i)
        comb += self.stall_in.eq(self.a_stall_i)
        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
        comb += self.f_badaddr_o.eq(i_out.nia)
        comb += self.f_instr_o.eq(i_out.insn)
        comb += self.f_busy_o.eq(~i_out.valid)  # probably

        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
        ibus = self.ibus
        comb += ibus.adr.eq(self.bus.adr)
        comb += ibus.dat_w.eq(self.bus.dat_w)
        comb += ibus.sel.eq(self.bus.sel)
        comb += ibus.cyc.eq(self.bus.cyc)
        comb += ibus.stb.eq(self.bus.stb)
        comb += ibus.we.eq(self.bus.we)

        comb += self.bus.dat_r.eq(ibus.dat_r)
        comb += self.bus.ack.eq(ibus.ack)
        if hasattr(ibus, "stall"):
            comb += self.bus.stall.eq(ibus.stall)
        else:
            # fake-up the wishbone stall signal to comply with pipeline mode
            # same thing is done in dcache.py
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_in.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    insn = yield i_out.insn
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=32,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=64,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1) << 32))

    test_icache(mem)
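
# (usage sketch, path assumed from the repository layout: running
#      python3 src/soc/experiment/icache.py
#  executes the __main__ block above, writing test_icache.il and then
#  running the simulation, which produces test_icache.vcd)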