1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display
from nmutil.latch import SRLatch

#from nmutil.plru import PLRU
from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface
from soc.minerva.units.fetch import FetchUnitInterface


# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 64
# Number of ways
NUM_WAYS = 2
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit field counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
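
# Worked example with the defaults above (LINE_SIZE=64, ROW_SIZE=8,
# NUM_LINES=64, REAL_ADDR_BITS=56):
#   ROW_PER_LINE  = 64//8    = 8    BRAM_ROWS     = 64*8     = 512
#   INSN_PER_ROW  = 64//32   = 2    LINE_OFF_BITS = log2(64) = 6
#   INDEX_BITS    = log2(64) = 6    SET_SIZE_BITS = 6+6      = 12
#   TAG_BITS      = 56-12    = 44   TAG_WIDTH     = 44+7-(51 % 8) = 48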

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBValidArray():
    return Array(Signal(name="tlb_valid%d" % x)
                 for x in range(TLB_SIZE))

def TLBRecord(name):
    tlb_layout = [ ('tag', TLB_EA_TAG_BITS),
                   ('pte', TLB_PTE_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBArray():
    return Array(TLBRecord("tlb%d" % x) for x in range(TLB_SIZE))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
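
# e.g. with ROW_LINE_BITS=3, next_row(0b101_111) yields 0b101_000: only
# the low 3 bits increment (and wrap), the upper (index) bits never change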

# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
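
# e.g. with INSN_PER_ROW=2 (the default), addr bit 2 selects the low or
# high 32-bit word of the 64-bit BRAM row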

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
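
# e.g. with the default geometry: get_index reads addr[6:12], get_row
# reads addr[3:12], and get_tag reads addr[12:56]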

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
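
# e.g. with TLB_LG_PGSZ=12 and TLB_BITS=6 this XORs effective-address
# bits [12:18] ^ [18:24] ^ [24:30] to form the 6-bit TLB index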


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2
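
# Reload sequence: IDLE -(miss)-> CLR_TAG (invalidate the victim way,
# write the new tag) -> WAIT_ACK (read the line over wishbone) -> IDLE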


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(FetchUnitInterface, Elaboratable):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self, pspec):
        FetchUnitInterface.__init__(self, pspec)
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache_wb")

        self.log_out = Signal(54)

        # use FetchUnitInterface, helps keep some unit tests running
        self.use_fetch_iface = False

    def use_fetch_interface(self):
        self.use_fetch_iface = True

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
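
            # a reload writes an entire row at once, so wr_sel simply
            # replicates do_write across all ROW_SIZE byte lanes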

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        # with a single way there is no victim to select
        if NUM_WAYS <= 1:
            return

        m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index) # select victim
        comb += plru_victim.eq(plru.o_index) # selected victim

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        # use an *asynchronous* Memory read port here (combinatorial)
        m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
        tlb = TLBRecord("tlb_rdport")
        pte, ttag = tlb.pte, tlb.tag

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += rd_tlb.addr.eq(tlb_req_index)
        comb += tlb.eq(rd_tlb.data)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid[tlb_req_index])

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb, itlb_valid):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        # index into the TLB (hash_ea yields TLB_BITS bits, not TLB_SIZE)
        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            tlb = TLBRecord("tlb_wrport")
            comb += tlb.tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            comb += tlb.pte.eq(m_in.pte)
            comb += wr_tlb.en.eq(1)
            comb += wr_tlb.addr.eq(wr_index)
            comb += wr_tlb.data.eq(tlb)
            sync += itlb_valid[wr_index].eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valids, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb
        m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                 ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE]
                          )
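        # hitcond is true while a reload of this very line is in progress
        # (WAIT_ACK) and the requested row has already been fetched,
        # allowing hits on a partially loaded line
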
        # the store-way Decoder is enabled by i_in.req
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += rd_tag.addr.eq(req_index)
        comb += ctag.eq(rd_tag.data)
        comb += cvb.eq(cache_valids.q.word_select(req_index, NUM_WAYS))
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                     "cache miss nia:%x IR:%x SM:%x idx:%x "
                     " way:%x tag:%x RA:%x", i_in.nia,
                     i_in.virt_mode, i_in.stop_mark, req_index,
                     replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read. We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            cache_valids):
        comb = m.d.comb
        sync = m.d.sync
        m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
                                            granularity=TAG_BITS)

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        idx = req_index*NUM_WAYS + replace_way # 2D index, 1st dim: NUM_WAYS
        comb += cache_valids.r.eq(1<<idx)

        # use write-port "granularity" to select the tag to write to
        # TODO: the Memory should be multiplied-up (by NUM_TAGS)
        tagset = Signal(TAG_RAM_WIDTH)
        comb += tagset.eq(r.store_tag << (replace_way*TAG_BITS))
        comb += wr_tag.en.eq(1<<replace_way)
        comb += wr_tag.addr.eq(r.store_index)
        comb += wr_tag.data.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_valids, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word? We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid, unless an invalidation
                # arrived while the line was being reloaded
                idx = r.store_index*NUM_WAYS + replace_way # 2D index again
                with m.If(r.store_valid & ~inval_in):
                    comb += cache_valids.s.eq(1<<idx)
                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_valids, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            comb += cache_valids.r.eq(-1)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index,
                                             cache_valids)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_valids, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                     ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Cache-Ways "valid" indicators. This is a 2D Signal, by the
        # number of ways and the number of lines.
        vec = SRLatch(sync=True, llen=NUM_WAYS*NUM_LINES, name="cachevalids")
        m.submodules.cache_valids = cache_valids = vec
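        # the (line, way) valid bit lives at flattened bit index
        # line*NUM_WAYS + way: set via cache_valids.s when a reload
        # completes, cleared via cache_valids.r, read via cache_valids.q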

        # TLB Array
        itlb = TLBArray()
        itlb_valid = TLBValidArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        self.tlbmem = Memory(depth=TLB_SIZE, width=TLB_EA_TAG_BITS+TLB_PTE_BITS)
        self.tagmem = Memory(depth=NUM_LINES, width=TAG_RAM_WIDTH)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb, itlb_valid)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_valids,
                         access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way,
                         cache_valids,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        # don't connect up to FetchUnitInterface so that some unit tests
        # can continue to operate
        if not self.use_fetch_iface:
            return m

        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
        # so needs checking and iterative revising
        i_in, bus, i_out = self.i_in, self.bus, self.i_out
        comb += i_in.req.eq(self.a_i_valid)
        comb += i_in.nia.eq(self.a_pc_i)
        comb += self.stall_in.eq(self.a_stall_i)
        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
        comb += self.f_badaddr_o.eq(i_out.nia)
        comb += self.f_instr_o.eq(i_out.insn)
        comb += self.f_busy_o.eq(~i_out.valid) # probably

        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
        ibus = self.ibus
        comb += ibus.adr.eq(self.bus.adr)
        comb += ibus.dat_w.eq(self.bus.dat_w)
        comb += ibus.sel.eq(self.bus.sel)
        comb += ibus.cyc.eq(self.bus.cyc)
        comb += ibus.stb.eq(self.bus.stb)
        comb += ibus.we.eq(self.bus.we)

        comb += self.bus.dat_r.eq(ibus.dat_r)
        comb += self.bus.ack.eq(ibus.ack)
        if hasattr(ibus, "stall"):
            comb += self.bus.stall.eq(ibus.stall)
        else:
            # fake-up the wishbone stall signal to comply with pipeline mode
            # same thing is done in dcache.py
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    insn = yield i_out.insn
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=32,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=64,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
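    # (so the 32-bit word at byte address A holds the value A//4, e.g.
    # address 0x40 holds 0x10, matching the asserts in icache_sim)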
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)