1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display
from nmutil.latch import SRLatch

#from nmutil.plru import PLRU
from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface
from soc.minerva.units.fetch import FetchUnitInterface


# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0
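
# quick check: ispow2(64) -> True, ispow2(48) -> False, ispow2(0) -> False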

SIM = 0
# Non-zero to enable log data collection
LOG_LENGTH = 0

class ICacheConfig:
    def __init__(self, LINE_SIZE   = 64,
                       NUM_LINE    = 16,  # Number of lines in a set
                       NUM_WAYS    = 1,   # Number of ways
                       TLB_SIZE    = 64,  # L1 ITLB number of entries
                       TLB_LG_PGSZ = 12): # L1 ITLB log_2(page_size)
        self.LINE_SIZE   = LINE_SIZE
        self.NUM_LINE    = NUM_LINE
        self.NUM_WAYS    = NUM_WAYS
        self.TLB_SIZE    = TLB_SIZE
        self.TLB_LG_PGSZ = TLB_LG_PGSZ

        # BRAM organisation: We never access more than wishbone_data_bits
        # at a time so to save resources we make the array only that wide,
        # and use consecutive indices to make a cache "line"
        #
        # self.ROW_SIZE is the width in bytes of the BRAM
        # (based on WB, so 64-bits)
        self.ROW_SIZE = WB_DATA_BITS // 8
        # Number of real address bits that we store
        self.REAL_ADDR_BITS = 56

        self.ROW_SIZE_BITS = self.ROW_SIZE * 8
        # ROW_PER_LINE is the number of row (wishbone) transactions in a line
        self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
        # BRAM_ROWS is the number of rows in BRAM
        # needed to represent the full icache
        self.BRAM_ROWS = self.NUM_LINE * self.ROW_PER_LINE
        # INSN_PER_ROW is the number of 32bit instructions per BRAM row
        self.INSN_PER_ROW = self.ROW_SIZE_BITS // 32

        # Bit fields counts in the address
        #
        # INSN_BITS is the number of bits to select an instruction in a row
        self.INSN_BITS = log2_int(self.INSN_PER_ROW)
        # ROW_BITS is the number of bits to select a row
        self.ROW_BITS = log2_int(self.BRAM_ROWS)
        # ROW_LINE_BITS is the number of bits to select a row within a line
        self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
        # LINE_OFF_BITS is the number of bits for the offset in a cache line
        self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
        # ROW_OFF_BITS is the number of bits for the offset in a row
        self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
        # INDEX_BITS is the number of bits to select a cache line
        self.INDEX_BITS = log2_int(self.NUM_LINE)
        # SET_SIZE_BITS is the log base 2 of the set size
        self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
        # TAG_BITS is the number of bits of the tag part of the address
        self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
        # TAG_WIDTH is the width in bits of each way of the tag RAM
        self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)

        # WAY_BITS is the number of bits to select a way
        self.WAY_BITS = log2_int(self.NUM_WAYS)
        self.TAG_RAM_WIDTH = self.TAG_BITS * self.NUM_WAYS

        # L1 ITLB
        self.TL_BITS = log2_int(self.TLB_SIZE)
        self.TLB_EA_TAG_BITS = 64 - (self.TLB_LG_PGSZ + self.TL_BITS)
        self.TLB_PTE_BITS = 64

        print("self.BRAM_ROWS =", self.BRAM_ROWS)
        print("self.INDEX_BITS =", self.INDEX_BITS)
        print("self.INSN_BITS =", self.INSN_BITS)
        print("self.INSN_PER_ROW =", self.INSN_PER_ROW)
        print("self.LINE_SIZE =", self.LINE_SIZE)
        print("self.LINE_OFF_BITS =", self.LINE_OFF_BITS)
        print("LOG_LENGTH =", LOG_LENGTH)
        print("self.NUM_LINE =", self.NUM_LINE)
        print("self.NUM_WAYS =", self.NUM_WAYS)
        print("self.REAL_ADDR_BITS =", self.REAL_ADDR_BITS)
        print("self.ROW_BITS =", self.ROW_BITS)
        print("self.ROW_OFF_BITS =", self.ROW_OFF_BITS)
        print("self.ROW_LINE_BITS =", self.ROW_LINE_BITS)
        print("self.ROW_PER_LINE =", self.ROW_PER_LINE)
        print("self.ROW_SIZE =", self.ROW_SIZE)
        print("self.ROW_SIZE_BITS =", self.ROW_SIZE_BITS)
        print("self.SET_SIZE_BITS =", self.SET_SIZE_BITS)
        print("SIM =", SIM)
        print("self.TAG_BITS =", self.TAG_BITS)
        print("self.TAG_RAM_WIDTH =", self.TAG_RAM_WIDTH)
        print("self.TAG_WIDTH =", self.TAG_WIDTH)
        print("self.TL_BITS =", self.TL_BITS)
        print("self.TLB_EA_TAG_BITS =", self.TLB_EA_TAG_BITS)
        print("self.TLB_LG_PGSZ =", self.TLB_LG_PGSZ)
        print("self.TLB_PTE_BITS =", self.TLB_PTE_BITS)
        print("self.TLB_SIZE =", self.TLB_SIZE)
        print("self.WAY_BITS =", self.WAY_BITS)

        assert self.LINE_SIZE % self.ROW_SIZE == 0
        assert ispow2(self.LINE_SIZE), "self.LINE_SIZE not power of 2"
        assert ispow2(self.NUM_LINE), "self.NUM_LINE not power of 2"
        assert ispow2(self.ROW_PER_LINE), "self.ROW_PER_LINE not power of 2"
        assert ispow2(self.INSN_PER_ROW), "self.INSN_PER_ROW not power of 2"
        assert self.ROW_BITS == (self.INDEX_BITS + self.ROW_LINE_BITS), \
            "geometry bits don't add up"
        assert self.LINE_OFF_BITS == \
            (self.ROW_OFF_BITS + self.ROW_LINE_BITS), \
            "geometry bits don't add up"
        assert self.REAL_ADDR_BITS == \
            (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS), \
            "geometry bits don't add up"
        assert self.REAL_ADDR_BITS == \
            (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS), \
            "geometry bits don't add up"

    # Example of layout for 32 lines of 64 bytes:
    #
    # ..  tag    |index|  line  |
    # ..         |   row   |    |
    # ..         |     |   | |00| zero           (2)
    # ..         |     |   |-|  |  INSN_BITS     (1)
    # ..         |     |---|    |  ROW_LINE_BITS (3)
    # ..         |     |--- - --|  LINE_OFF_BITS (6)
    # ..         |         |- --|  ROW_OFF_BITS  (3)
    # ..         |----- ---|    |  ROW_BITS      (8)
    # ..         |-----|        |  INDEX_BITS    (5)
    # .. --------|              |  TAG_BITS      (53)

    # The cache data BRAM is organized as described above for each way
    #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
    #
    def RowPerLineValidArray(self):
        return Array(Signal(name="rows_valid_%d" % x)
                     for x in range(self.ROW_PER_LINE))

    # TODO to be passed to nmigen as ram attributes
    # attribute ram_style : string;
    # attribute ram_style of cache_tags : signal is "distributed";

    def TLBRecord(self, name):
        tlb_layout = [('tag', self.TLB_EA_TAG_BITS),
                      ('pte', self.TLB_PTE_BITS)
                     ]
        return Record(tlb_layout, name=name)

    def TLBArray(self):
        return Array(self.TLBRecord("tlb%d" % x)
                     for x in range(self.TLB_SIZE))

    # PLRU output interface
    def PLRUOut(self):
        return Array(Signal(self.WAY_BITS, name="plru_out_%d" % x)
                     for x in range(self.NUM_LINE))

    # Return the cache line index (tag index) for an address
    def get_index(self, addr):
        return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]

    # Return the cache row index (data memory) for an address
    def get_row(self, addr):
        return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]

    # Return the index of a row within a line
    def get_row_of_line(self, row):
        return row[:self.ROW_BITS][:self.ROW_LINE_BITS]

    # Returns whether this is the last row of a line
    def is_last_row_addr(self, addr, last):
        return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last

    # Returns whether this is the last row of a line
    def is_last_row(self, row, last):
        return self.get_row_of_line(row) == last

    # Return the next row in the current cache line. We use a dedicated
    # function in order to limit the size of the generated adder to be
    # only the bits within a cache line (3 bits with default settings)
    def next_row(self, row):
        row_v = row[0:self.ROW_LINE_BITS] + 1
        return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
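
    # For illustration, with the default ROW_LINE_BITS=3: row 0b0010_111
    # (line 2, last row 7) becomes 0b0010_000 (line 2, row 0); only the
    # low 3 bits go through the adder, and the upper line-index bits are
    # passed through unchanged.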

    # Read the instruction word for the given address
    # in the current cache row
    def read_insn_word(self, addr, data):
        word = addr[2:self.INSN_BITS+2]
        return data.word_select(word, 32)

    # Get the tag value from the address
    def get_tag(self, addr):
        return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]

    # Read a tag from a tag memory row
    def read_tag(self, way, tagset):
        return tagset.word_select(way, self.TAG_BITS)

    # Write a tag to tag memory row
    def write_tag(self, way, tagset, tag):
        return self.read_tag(way, tagset).eq(tag)

    # Simple hash for direct-mapped TLB index
    def hash_ea(self, addr):
        hsh = (addr[self.TLB_LG_PGSZ:self.TLB_LG_PGSZ + self.TL_BITS] ^
               addr[self.TLB_LG_PGSZ + self.TL_BITS:
                    self.TLB_LG_PGSZ + 2 * self.TL_BITS] ^
               addr[self.TLB_LG_PGSZ + 2 * self.TL_BITS:
                    self.TLB_LG_PGSZ + 3 * self.TL_BITS])
        return hsh
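
    # Worked example (a sketch with the defaults TLB_LG_PGSZ=12,
    # TL_BITS=6): the TLB index is the XOR-fold of effective-address bits
    # [12:18], [18:24] and [24:30].  For ea=0x0000_0000_0040_3000 those
    # fields are 0x03, 0x10 and 0x00, giving index 0x03 ^ 0x10 = 0x13.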


# Cache reload state machine
@unique
class State(Enum):
    IDLE     = 0
    CLR_TAG  = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self, cfg):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way      = Signal(cfg.WAY_BITS)
        self.hit_nia      = Signal(64)
        self.hit_smark    = Signal()
        self.hit_valid    = Signal()

        # Cache miss state (reload state machine)
        self.state        = Signal(State, reset=State.IDLE)
        self.wb           = WBMasterOut("wb")
        self.req_adr      = Signal(64)
        self.store_way    = Signal(cfg.WAY_BITS)
        self.store_index  = Signal(cfg.INDEX_BITS)
        self.store_row    = Signal(cfg.ROW_BITS)
        self.store_tag    = Signal(cfg.TAG_BITS)
        self.store_valid  = Signal()
        self.end_row_ix   = Signal(cfg.ROW_LINE_BITS)
        self.rows_valid   = cfg.RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(FetchUnitInterface, Elaboratable, ICacheConfig):
    """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self, pspec):
        FetchUnitInterface.__init__(self, pspec)
        ICacheConfig.__init__(self)  # establish the (default) cache geometry
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             #alignment=0,
                             name="icache_wb")

        self.log_out = Signal(54)

        # use FetchUnitInterface, helps keep some unit tests running
        self.use_fetch_iface = False

    def use_fetch_interface(self):
        self.use_fetch_iface = True

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(self.ROW_BITS)
        wr_addr = Signal(self.ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(self.NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(self.NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(self.NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out = Signal(self.ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(self.ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(self.ROW_BITS, self.ROW_SIZE_BITS,
                           TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, self.ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if self.NUM_WAYS == 0:
            return

        m.submodules.plrus = plru = PLRUs(self.NUM_LINE, self.WAY_BITS)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(self.get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index) # select victim
        comb += plru_victim.eq(plru.o_index) # selected victim

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        # use an *asynchronous* Memory read port here (combinatorial)
        m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
        tlb = self.TLBRecord("tlb_rdport")
        pte, ttag = tlb.pte, tlb.tag

        comb += tlb_req_index.eq(self.hash_ea(i_in.nia))
        comb += rd_tlb.addr.eq(tlb_req_index)
        comb += tlb.eq(rd_tlb.data)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:self.TLB_LG_PGSZ],
                                 pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[self.TLB_LG_PGSZ + self.TL_BITS:64]):
                comb += ra_valid.eq(itlb_valid.q.bit_select(tlb_req_index, 1))

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:self.REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb, itlb_valid):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(self.TL_BITS)
        wr_unary = Signal(self.TLB_SIZE)
        comb += wr_index.eq(self.hash_ea(m_in.addr))
        comb += wr_unary.eq(1<<wr_index)

        m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()
        sync += itlb_valid.s.eq(0)
        sync += itlb_valid.r.eq(0)

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            sync += itlb_valid.r.eq(-1)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid.r.eq(wr_unary)

        with m.Elif(m_in.tlbld):
            tlb = self.TLBRecord("tlb_wrport")
            comb += tlb.tag.eq(m_in.addr[self.TLB_LG_PGSZ + self.TL_BITS:64])
            comb += tlb.pte.eq(m_in.pte)
            comb += wr_tlb.en.eq(1)
            comb += wr_tlb.addr.eq(wr_index)
            comb += wr_tlb.data.eq(tlb)
            sync += itlb_valid.s.eq(wr_unary)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valids, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb
        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(self.WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:self.INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)
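
        # Concretely (a sketch with the defaults, INSN_PER_ROW=2): each
        # 64-bit row holds two instructions, so a sequential fetch whose
        # nia selects the second word of a row that hit last cycle can
        # reuse the previously-read row instead of re-reading the BRAM.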

        # Extract line, row and tag from request
        comb += req_index.eq(self.get_index(i_in.nia))
        comb += req_row.eq(self.get_row(i_in.nia))
        comb += req_tag.eq(self.get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, self.ROW_OFF_BITS),
                 real_addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS],
                ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % self.ROW_PER_LINE]
                          )
        # i_in.req asserts Decoder active
        cvb = Signal(self.NUM_WAYS)
        ctag = Signal(self.TAG_RAM_WIDTH)
        comb += rd_tag.addr.eq(req_index)
        comb += ctag.eq(rd_tag.data)
        comb += cvb.eq(cache_valids.q.word_select(req_index, self.NUM_WAYS))
        m.submodules.store_way_e = se = Decoder(self.NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(self.NUM_WAYS):
            tagi = Signal(self.TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(self.read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(self.read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(self.ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                     "cache miss nia:%x IR:%x SM:%x idx:%x "
                     " way:%x tag:%x RA:%x", i_in.nia,
                     i_in.virt_mode, i_in.stop_mark, req_index,
                     replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(self.ROW_BITS)
            comb += st_row.eq(self.get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(self.get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            cache_valids):
        comb = m.d.comb
        sync = m.d.sync
        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
                                            granularity=self.TAG_BITS)

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        idx = req_index*self.NUM_WAYS + replace_way # 2D index, 1st dim: ways
        comb += cache_valids.r.eq(1<<idx)
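
        # For illustration (defaults NUM_WAYS=1, so idx == req_index): the
        # SRLatch valid vector is a flattened [NUM_LINE x NUM_WAYS] array;
        # with NUM_WAYS=2, line 5 way 1 would be bit 5*2+1 = 11.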

        # use write-port "granularity" to select the tag to write to
        # TODO: the Memory should be multiplied-up (by NUM_TAGS)
        tagset = Signal(self.TAG_RAM_WIDTH)
        comb += tagset.eq(r.store_tag << (replace_way*self.TAG_BITS))
        comb += wr_tag.en.eq(1<<replace_way)
        comb += wr_tag.addr.eq(r.store_index)
        comb += wr_tag.data.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_valids, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(self.is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(self.LINE_OFF_BITS - self.ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[self.ROW_OFF_BITS:
                                         self.LINE_OFF_BITS] + 1)
            sync += r.req_adr[self.ROW_OFF_BITS:
                              self.LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % self.ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & self.is_last_row(r.store_row,
                                                   r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                idx = r.store_index*self.NUM_WAYS + replace_way # 2D index
                valid = r.store_valid & ~inval_in
                comb += cache_valids.s.eq(1<<idx)
                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(self.next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_valids, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        # wishbone address is in 64-bit (8-byte) units: drop ROW_OFF_BITS
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            comb += cache_valids.r.eq(-1)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index,
                                             cache_valids)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_valids, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(self.WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                    ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Cache-Ways "valid" indicators.  this is a 2D Signal, by the
        # number of ways and the number of lines.
        vec = SRLatch(sync=True, llen=self.NUM_WAYS*self.NUM_LINE,
                      name="cachevalids")
        m.submodules.cache_valids = cache_valids = vec

        # TLB Array
        itlb = self.TLBArray()
        vec = SRLatch(sync=False, llen=self.TLB_SIZE, name="tlbvalids")
        m.submodules.itlb_valids = itlb_valid = vec

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal(self)

        # Async signal on incoming request
        req_index = Signal(self.INDEX_BITS)
        req_row = Signal(self.ROW_BITS)
        req_hit_way = Signal(self.WAY_BITS)
        req_tag = Signal(self.TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(self.TL_BITS)
        real_addr = Signal(self.REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(self.ROW_SIZE_BITS)

        plru_victim = Signal(self.WAY_BITS)
        replace_way = Signal(self.WAY_BITS)

        self.tlbmem = Memory(depth=self.TLB_SIZE,
                             width=self.TLB_EA_TAG_BITS+self.TLB_PTE_BITS)
        self.tagmem = Memory(depth=self.NUM_LINE,
                             width=self.TAG_RAM_WIDTH)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb, itlb_valid)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_valids,
                         access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way,
                         cache_valids,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        # don't connect up to FetchUnitInterface so that some unit tests
        # can continue to operate
        if not self.use_fetch_iface:
            return m

        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
        # so needs checking and iterative revising
        i_in, bus, i_out = self.i_in, self.bus, self.i_out
        comb += i_in.req.eq(self.a_i_valid)
        comb += i_in.nia.eq(self.a_pc_i)
        comb += self.stall_in.eq(self.a_stall_i)
        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
        comb += self.f_badaddr_o.eq(i_out.nia)
        comb += self.f_instr_o.eq(i_out.insn)
        comb += self.f_busy_o.eq(~i_out.valid) # probably

        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
        ibus = self.ibus
        comb += ibus.adr.eq(self.bus.adr)
        comb += ibus.dat_w.eq(self.bus.dat_w)
        comb += ibus.sel.eq(self.bus.sel)
        comb += ibus.cyc.eq(self.bus.cyc)
        comb += ibus.stb.eq(self.bus.stb)
        comb += ibus.we.eq(self.bus.we)

        comb += self.bus.dat_r.eq(ibus.dat_r)
        comb += self.bus.ack.eq(ibus.ack)
        if hasattr(ibus, "stall"):
            comb += self.bus.stall.eq(ibus.stall)
        else:
            # fake-up the wishbone stall signal to comply with pipeline mode
            # same thing is done in dcache.py
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_in.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert ~valid
    for i in range(30):
        yield
    yield
    insn = yield i_out.insn
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=32,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=64,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)