1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There are a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allows for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """
28
29 from enum import (Enum, unique)
30 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
31 Record)
32 from nmigen.cli import main, rtlil
33 from nmutil.iocontrol import RecordObject
34 from nmigen.utils import log2_int
35 from nmigen.lib.coding import Decoder
36 from nmutil.util import Display
37
38 #from nmutil.plru import PLRU
39 from soc.experiment.plru import PLRU, PLRUs
40 from soc.experiment.cache_ram import CacheRam
41
42 from soc.experiment.mem_types import (Fetch1ToICacheType,
43 ICacheToDecode1Type,
44 MMUToICacheType)
45
46 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
47 WB_SEL_BITS, WBAddrType, WBDataType,
48 WBSelType, WBMasterOut, WBSlaveOut,
49 )
50
51 from nmigen_soc.wishbone.bus import Interface
52 from soc.minerva.units.fetch import FetchUnitInterface
53
54
55 # for test
56 from soc.bus.sram import SRAM
57 from nmigen import Memory
58 from nmutil.util import wrap
59 from nmigen.cli import main, rtlil
60
61 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
62 # Also, check out the cxxsim nmigen branch, and latest yosys from git
63 from nmutil.sim_tmp_alternative import Simulator, Settle
64
65
66 SIM = 0
67 LINE_SIZE = 64
68 # BRAM organisation: We never access more than wishbone_data_bits
69 # at a time so to save resources we make the array only that wide,
70 # and use consecutive indices to make a cache "line"
71 #
72 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
73 ROW_SIZE = WB_DATA_BITS // 8
74 # Number of lines in a set
75 NUM_LINES = 64
76 # Number of ways
77 NUM_WAYS = 2
78 # L1 ITLB number of entries (direct mapped)
79 TLB_SIZE = 64
80 # L1 ITLB log_2(page_size)
81 TLB_LG_PGSZ = 12
82 # Number of real address bits that we store
83 REAL_ADDR_BITS = 56
84 # Non-zero to enable log data collection
85 LOG_LENGTH = 0
86
87 ROW_SIZE_BITS = ROW_SIZE * 8
88 # ROW_PER_LINE is the number of row (wishbone) transactions in a line
89 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
90 # BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
91 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
92 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
93 INSN_PER_ROW = ROW_SIZE_BITS // 32
94
95 # Bit fields counts in the address
96 #
97 # INSN_BITS is the number of bits to select an instruction in a row
98 INSN_BITS = log2_int(INSN_PER_ROW)
99 # ROW_BITS is the number of bits to select a row
100 ROW_BITS = log2_int(BRAM_ROWS)
101 # ROW_LINE_BITS is the number of bits to select a row within a line
102 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
103 # LINE_OFF_BITS is the number of bits for the offset in a cache line
104 LINE_OFF_BITS = log2_int(LINE_SIZE)
105 # ROW_OFF_BITS is the number of bits for the offset in a row
106 ROW_OFF_BITS = log2_int(ROW_SIZE)
107 # INDEX_BITS is the number of bits to select a cache line
108 INDEX_BITS = log2_int(NUM_LINES)
109 # SET_SIZE_BITS is the log base 2 of the set size
110 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
111 # TAG_BITS is the number of bits of the tag part of the address
112 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
113 # TAG_WIDTH is the width in bits of each way of the tag RAM
114 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
115
116 # WAY_BITS is the number of bits to select a way
117 WAY_BITS = log2_int(NUM_WAYS)
118 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
119
120 # L1 ITLB
121 TLB_BITS = log2_int(TLB_SIZE)
122 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
123 TLB_PTE_BITS = 64
124
125 print("BRAM_ROWS =", BRAM_ROWS)
126 print("INDEX_BITS =", INDEX_BITS)
127 print("INSN_BITS =", INSN_BITS)
128 print("INSN_PER_ROW =", INSN_PER_ROW)
129 print("LINE_SIZE =", LINE_SIZE)
130 print("LINE_OFF_BITS =", LINE_OFF_BITS)
131 print("LOG_LENGTH =", LOG_LENGTH)
132 print("NUM_LINES =", NUM_LINES)
133 print("NUM_WAYS =", NUM_WAYS)
134 print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
135 print("ROW_BITS =", ROW_BITS)
136 print("ROW_OFF_BITS =", ROW_OFF_BITS)
137 print("ROW_LINE_BITS =", ROW_LINE_BITS)
138 print("ROW_PER_LINE =", ROW_PER_LINE)
139 print("ROW_SIZE =", ROW_SIZE)
140 print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
141 print("SET_SIZE_BITS =", SET_SIZE_BITS)
142 print("SIM =", SIM)
143 print("TAG_BITS =", TAG_BITS)
144 print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
145 print("TAG_BITS =", TAG_BITS)
146 print("TLB_BITS =", TLB_BITS)
147 print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
148 print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
149 print("TLB_PTE_BITS =", TLB_PTE_BITS)
150 print("TLB_SIZE =", TLB_SIZE)
151 print("WAY_BITS =", WAY_BITS)
152
153 # from microwatt/utils.vhdl
154 def ispow2(n):
155 return n != 0 and (n & (n - 1)) == 0
156
157 assert LINE_SIZE % ROW_SIZE == 0
158 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
159 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
160 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
161 assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
162 assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
163 "geometry bits don't add up"
164 assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
165 "geometry bits don't add up"
166 assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
167 "geometry bits don't add up"
168 assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
169 "geometry bits don't add up"
170
171 # Example of layout for 32 lines of 64 bytes:
172 #
173 # .. tag |index| line |
174 # .. | row | |
175 # .. | | | |00| zero (2)
176 # .. | | |-| | INSN_BITS (1)
177 # .. | |---| | ROW_LINE_BITS (3)
178 # .. | |--- - --| LINE_OFF_BITS (6)
179 # .. | |- --| ROW_OFF_BITS (3)
180 # .. |----- ---| | ROW_BITS (8)
181 # .. |-----| | INDEX_BITS (5)
182 # .. --------| | TAG_BITS (53)
183
184 # The cache data BRAM organized as described above for each way
185 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
186 #
187 # The cache tags LUTRAM has a row per set. Vivado is a pain and will
188 # not handle a clean (commented) definition of the cache tags as a 3d
189 # memory. For now, work around it by putting all the tags of a set into a single field
190 def CacheTagArray():
191 tag_layout = [('valid', NUM_WAYS),
192 ('tag', TAG_RAM_WIDTH),
193 ]
194 return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
195
196 def RowPerLineValidArray():
197 return Array(Signal(name="rows_valid_%d" %x) \
198 for x in range(ROW_PER_LINE))
199
200
201 # TODO to be passed to nmigen as ram attributes
202 # attribute ram_style : string;
203 # attribute ram_style of cache_tags : signal is "distributed";
204
205 def TLBValidArray():
206 return Array(Signal(name="tlb_valid%d" % x)
207 for x in range(TLB_SIZE))
208
209 def TLBRecord(name):
210 tlb_layout = [ ('tag', TLB_EA_TAG_BITS),
211 ('pte', TLB_PTE_BITS)
212 ]
213 return Record(tlb_layout, name=name)
214
215 def TLBArray():
216 return Array(TLBRecord("tlb%d" % x) for x in range(TLB_SIZE))
217
218 # PLRU output interface
219 def PLRUOut():
220 return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
221 for x in range(NUM_LINES))
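
# Hedged illustration only: with NUM_WAYS == 2, a pseudo-LRU per cache line
# reduces to a single bit naming the way to evict next.  This tiny Python
# model is an assumption about the policy (tree-PLRU); it is not the
# soc.experiment.plru implementation and is never instantiated here.
class _PLRU2WaySketch:
    def __init__(self):
        self.lru = 0            # way to victimise on the next miss
    def hit(self, way):
        self.lru = 1 - way      # the other way becomes least-recently-used
    def victim(self):
        return self.lru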
222
223 # Return the cache line index (tag index) for an address
224 def get_index(addr):
225 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
226
227 # Return the cache row index (data memory) for an address
228 def get_row(addr):
229 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
230
231 # Return the index of a row within a line
232 def get_row_of_line(row):
233 return row[:ROW_BITS][:ROW_LINE_BITS]
234
235 # Returns whether the given address is in the last row of a line
236 def is_last_row_addr(addr, last):
237 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
238
239 # Returns whether this is the last row of a line
240 def is_last_row(row, last):
241 return get_row_of_line(row) == last
242
243 # Return the next row in the current cache line. We use a dedicated
244 # function in order to limit the size of the generated adder to be
245 # only the bits within a cache line (3 bits with default settings)
246 def next_row(row):
247 row_v = row[0:ROW_LINE_BITS] + 1
248 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
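
# Hedged pure-Python model of next_row() above: only the ROW_LINE_BITS
# row-within-line bits are incremented (wrapping inside the line) while the
# line-select bits are passed through untouched, which is what keeps the
# generated adder small.
def _example_next_row(row):
    low  = (row + 1) & ((1 << ROW_LINE_BITS) - 1)    # wraps within the line
    high = row & ~((1 << ROW_LINE_BITS) - 1)         # line-select part kept
    return high | low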
249
250 # Read the instruction word for the given address
251 # in the current cache row
252 def read_insn_word(addr, data):
253 word = addr[2:INSN_BITS+2]
254 return data.word_select(word, 32)
255
256 # Get the tag value from the address
257 def get_tag(addr):
258 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
259
260 # Read a tag from a tag memory row
261 def read_tag(way, tagset):
262 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
263
264 # Write a tag to tag memory row
265 def write_tag(way, tagset, tag):
266 return read_tag(way, tagset).eq(tag)
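
# Hedged integer sketch of the tag-RAM packing used by read_tag()/write_tag():
# each way occupies a byte-rounded TAG_WIDTH slot within the TAG_RAM_WIDTH
# row, with the TAG_BITS tag value in the low bits of its slot.  Purely
# illustrative; the hardware versions above operate on nmigen values.
def _example_read_tag(way, tagset):
    return (tagset >> (way * TAG_WIDTH)) & ((1 << TAG_BITS) - 1)

def _example_write_tag(way, tagset, tag):
    mask = ((1 << TAG_BITS) - 1) << (way * TAG_WIDTH)
    return (tagset & ~mask) | ((tag & ((1 << TAG_BITS) - 1)) << (way * TAG_WIDTH))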
267
268 # Simple hash for direct-mapped TLB index
269 def hash_ea(addr):
270 hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
271 addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS ] ^
272 addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
273 return hsh
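
# Hedged pure-Python view of the hash above: three TLB_BITS-wide slices of
# the effective address, starting at the page-offset boundary, are XOR-folded
# into a direct-mapped TLB index.  Also sketched (mirroring itlb_lookup()
# below, as an illustration only) is how a real address is formed from the
# page offset and the PTE's real page number on a TLB hit.
def _example_hash_ea(addr):
    mask = (1 << TLB_BITS) - 1
    return ((addr >> TLB_LG_PGSZ) ^
            (addr >> (TLB_LG_PGSZ + TLB_BITS)) ^
            (addr >> (TLB_LG_PGSZ + 2 * TLB_BITS))) & mask

def _example_real_addr(ea, pte):
    page_offset = ea & ((1 << TLB_LG_PGSZ) - 1)
    rpn = (pte >> TLB_LG_PGSZ) & ((1 << (REAL_ADDR_BITS - TLB_LG_PGSZ)) - 1)
    return (rpn << TLB_LG_PGSZ) | page_offset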
274
275
276 # Cache reload state machine
277 @unique
278 class State(Enum):
279 IDLE = 0
280 CLR_TAG = 1
281 WAIT_ACK = 2
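
# Summary of the reload state machine (see the icache_miss_* methods below):
#   IDLE     - wait for a miss; latch index/way/tag, compute the line-aligned
#              address and raise wb.cyc/wb.stb, then go to CLR_TAG
#   CLR_TAG  - clear the victim way's valid bit and write its new tag,
#              then go to WAIT_ACK
#   WAIT_ACK - issue one wishbone read per row of the line, marking each row
#              valid as its ack arrives; after the last ack the line is
#              marked valid, wb.cyc drops and the machine returns to IDLE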
282
283
284 class RegInternal(RecordObject):
285 def __init__(self):
286 super().__init__()
287 # Cache hit state (Latches for 1 cycle BRAM access)
288 self.hit_way = Signal(WAY_BITS)
289 self.hit_nia = Signal(64)
290 self.hit_smark = Signal()
291 self.hit_valid = Signal()
292
293 # Cache miss state (reload state machine)
294 self.state = Signal(State, reset=State.IDLE)
295 self.wb = WBMasterOut("wb")
296 self.req_adr = Signal(64)
297 self.store_way = Signal(WAY_BITS)
298 self.store_index = Signal(INDEX_BITS)
299 self.store_row = Signal(ROW_BITS)
300 self.store_tag = Signal(TAG_BITS)
301 self.store_valid = Signal()
302 self.end_row_ix = Signal(ROW_LINE_BITS)
303 self.rows_valid = RowPerLineValidArray()
304
305 # TLB miss state
306 self.fetch_failed = Signal()
307
308
309 class ICache(FetchUnitInterface, Elaboratable):
310 """64 bit direct mapped icache. All instructions are 4B aligned."""
311 def __init__(self, pspec):
312 FetchUnitInterface.__init__(self, pspec)
313 self.i_in = Fetch1ToICacheType(name="i_in")
314 self.i_out = ICacheToDecode1Type(name="i_out")
315
316 self.m_in = MMUToICacheType(name="m_in")
317
318 self.stall_in = Signal()
319 self.stall_out = Signal()
320 self.flush_in = Signal()
321 self.inval_in = Signal()
322
323 # standard naming (wired to non-standard for compatibility)
324 self.bus = Interface(addr_width=32,
325 data_width=64,
326 granularity=8,
327 features={'stall'},
328 alignment=0,
329 name="icache_wb")
330
331 self.log_out = Signal(54)
332
333 # use FetchUnitInterface, helps keep some unit tests running
334 self.use_fetch_iface = False
335
336 def use_fetch_interface(self):
337 self.use_fetch_iface = True
338
339 # Generate a cache RAM for each way
340 def rams(self, m, r, cache_out_row, use_previous,
341 replace_way, req_row):
342
343 comb = m.d.comb
344 sync = m.d.sync
345
346 bus, stall_in = self.bus, self.stall_in
347
348 # read condition (for every cache ram)
349 do_read = Signal()
350 comb += do_read.eq(~(stall_in | use_previous))
351
352 rd_addr = Signal(ROW_BITS)
353 wr_addr = Signal(ROW_BITS)
354 comb += rd_addr.eq(req_row)
355 comb += wr_addr.eq(r.store_row)
356
357 # binary-to-unary converters: replace-way enabled by bus.ack,
358 # hit-way left permanently enabled
359 m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
360 m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
361 comb += re.i.eq(replace_way)
362 comb += re.n.eq(~bus.ack)
363 comb += he.i.eq(r.hit_way)
364
365 for i in range(NUM_WAYS):
366 do_write = Signal(name="do_wr_%d" % i)
367 d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
368 wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)
369
370 way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
371 m.submodules["cacheram_%d" % i] = way
372
373 comb += way.rd_en.eq(do_read)
374 comb += way.rd_addr.eq(rd_addr)
375 comb += d_out.eq(way.rd_data_o)
376 comb += way.wr_sel.eq(wr_sel)
377 comb += way.wr_addr.eq(wr_addr)
378 comb += way.wr_data.eq(bus.dat_r)
379
380 comb += do_write.eq(re.o[i])
381
382 with m.If(do_write):
383 sync += Display("cache write adr: %x data: %lx",
384 wr_addr, way.wr_data)
385
386 with m.If(he.o[i]):
387 comb += cache_out_row.eq(d_out)
388 with m.If(do_read):
389 sync += Display("cache read adr: %x data: %x",
390 req_row, d_out)
391
392 comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
393
394 # Generate PLRUs
395 def maybe_plrus(self, m, r, plru_victim):
396 comb = m.d.comb
397
398 if NUM_WAYS == 0:
399 return
400
401
402 m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
403 comb += plru.way.eq(r.hit_way)
404 comb += plru.valid.eq(r.hit_valid)
405 comb += plru.index.eq(get_index(r.hit_nia))
406 comb += plru.isel.eq(r.store_index) # select victim
407 comb += plru_victim.eq(plru.o_index) # selected victim
408
409 # TLB hit detection and real address generation
410 def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
411 real_addr, ra_valid, eaa_priv,
412 priv_fault, access_ok):
413
414 comb = m.d.comb
415
416 i_in = self.i_in
417
418 # use an *asynchronous* Memory read port here (combinatorial)
419 m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
420 tlb = TLBRecord("tlb_rdport")
421 pte, ttag = tlb.pte, tlb.tag
422
423 comb += tlb_req_index.eq(hash_ea(i_in.nia))
424 comb += rd_tlb.addr.eq(tlb_req_index)
425 comb += tlb.eq(rd_tlb.data)
426
427 with m.If(i_in.virt_mode):
428 comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
429 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
430
431 with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
432 comb += ra_valid.eq(itlb_valid[tlb_req_index])
433
434 comb += eaa_priv.eq(pte[3])
435
436 with m.Else():
437 comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
438 comb += ra_valid.eq(1)
439 comb += eaa_priv.eq(1)
440
441 # No IAMR, so no KUEP support for now
442 comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
443 comb += access_ok.eq(ra_valid & ~priv_fault)
444
445 # iTLB update
446 def itlb_update(self, m, itlb, itlb_valid):
447 comb = m.d.comb
448 sync = m.d.sync
449
450 m_in = self.m_in
451
452 wr_index = Signal(TLB_BITS)
453 comb += wr_index.eq(hash_ea(m_in.addr))
454
455 m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()
456
457 with m.If(m_in.tlbie & m_in.doall):
458 # Clear all valid bits
459 for i in range(TLB_SIZE):
460 sync += itlb_valid[i].eq(0)
461
462 with m.Elif(m_in.tlbie):
463 # Clear entry regardless of hit or miss
464 sync += itlb_valid[wr_index].eq(0)
465
466 with m.Elif(m_in.tlbld):
467 tlb = TLBRecord("tlb_wrport")
468 comb += tlb.tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
469 comb += tlb.pte.eq(m_in.pte)
470 comb += wr_tlb.en.eq(1)
471 comb += wr_tlb.addr.eq(wr_index)
472 comb += wr_tlb.data.eq(tlb)
473 sync += itlb_valid[wr_index].eq(1)
474
475 # Cache hit detection, output to fetch2 and other misc logic
476 def icache_comb(self, m, use_previous, r, req_index, req_row,
477 req_hit_way, req_tag, real_addr, req_laddr,
478 cache_tags, access_ok,
479 req_is_hit, req_is_miss, replace_way,
480 plru_victim, cache_out_row):
481
482 comb = m.d.comb
483
484 i_in, i_out, bus = self.i_in, self.i_out, self.bus
485 flush_in, stall_out = self.flush_in, self.stall_out
486
487 is_hit = Signal()
488 hit_way = Signal(WAY_BITS)
489
490 # i_in.sequential means that i_in.nia this cycle is 4 more than
491 # last cycle. If we read more than 32 bits at a time, had a
492 # cache hit last cycle, and we don't want the first 32-bit chunk
493 # then we can keep the data we read last cycle and just use that.
494 with m.If(i_in.nia[2:INSN_BITS+2] != 0):
495 comb += use_previous.eq(i_in.sequential & r.hit_valid)
496
497 # Extract line, row and tag from request
498 comb += req_index.eq(get_index(i_in.nia))
499 comb += req_row.eq(get_row(i_in.nia))
500 comb += req_tag.eq(get_tag(real_addr))
501
502 # Calculate address of beginning of cache row, will be
503 # used for cache miss processing if needed
504 comb += req_laddr.eq(Cat(
505 Const(0, ROW_OFF_BITS),
506 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
507 ))
508
509 # Test if pending request is a hit on any way
510 hitcond = Signal()
511 comb += hitcond.eq((r.state == State.WAIT_ACK)
512 & (req_index == r.store_index)
513 & r.rows_valid[req_row % ROW_PER_LINE]
514 )
515 # i_in.req asserts Decoder active
516 cvb = Signal(NUM_WAYS)
517 ctag = Signal(TAG_RAM_WIDTH)
518 comb += ctag.eq(cache_tags[req_index].tag)
519 comb += cvb.eq(cache_tags[req_index].valid)
520 m.submodules.store_way_e = se = Decoder(NUM_WAYS)
521 comb += se.i.eq(r.store_way)
522 comb += se.n.eq(~i_in.req)
523 for i in range(NUM_WAYS):
524 tagi = Signal(TAG_BITS, name="tag_i%d" % i)
525 hit_test = Signal(name="hit_test%d" % i)
526 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
527 comb += tagi.eq(read_tag(i, ctag))
528 comb += hit_test.eq(se.o[i])
529 comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
530 (tagi == req_tag))
531 with m.If(is_tag_hit):
532 comb += hit_way.eq(i)
533 comb += is_hit.eq(1)
534
535 # Generate the "hit" and "miss" signals
536 # for the synchronous blocks
537 with m.If(i_in.req & access_ok & ~flush_in):
538 comb += req_is_hit.eq(is_hit)
539 comb += req_is_miss.eq(~is_hit)
540
541 comb += req_hit_way.eq(hit_way)
542
543 # The way to replace on a miss
544 with m.If(r.state == State.CLR_TAG):
545 comb += replace_way.eq(plru_victim)
546 with m.Else():
547 comb += replace_way.eq(r.store_way)
548
549 # Output instruction from current cache row
550 #
551 # Note: This is a mild violation of our design principle of
552 # having pipeline stages output from a clean latch. In this
553 # case we output the result of a mux. The alternative would
554 # be to output an entire row which I prefer not to do just yet
555 # as it would force fetch2 to know about some of the cache
556 # geometry information.
557 comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
558 comb += i_out.valid.eq(r.hit_valid)
559 comb += i_out.nia.eq(r.hit_nia)
560 comb += i_out.stop_mark.eq(r.hit_smark)
561 comb += i_out.fetch_failed.eq(r.fetch_failed)
562
563 # Stall fetch1 if we have a miss on cache or TLB
564 # or a protection fault
565 comb += stall_out.eq(~(is_hit & access_ok))
566
567 # Wishbone requests output (from the cache miss reload machine)
568 comb += bus.we.eq(r.wb.we)
569 comb += bus.adr.eq(r.wb.adr)
570 comb += bus.sel.eq(r.wb.sel)
571 comb += bus.stb.eq(r.wb.stb)
572 comb += bus.dat_w.eq(r.wb.dat)
573 comb += bus.cyc.eq(r.wb.cyc)
574
575 # Cache hit synchronous machine
576 def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
577 req_index, req_tag, real_addr):
578 sync = m.d.sync
579
580 i_in, stall_in = self.i_in, self.stall_in
581 flush_in = self.flush_in
582
583 # keep outputs to fetch2 unchanged on a stall
584 # except that flush or reset sets valid to 0
585 # If use_previous, keep the same data as last
586 # cycle and use the second half
587 with m.If(stall_in | use_previous):
588 with m.If(flush_in):
589 sync += r.hit_valid.eq(0)
590 with m.Else():
591 # On a hit, latch the request for the next cycle,
592 # when the BRAM data will be available on the
593 # cache_out output of the corresponding way
594 sync += r.hit_valid.eq(req_is_hit)
595
596 with m.If(req_is_hit):
597 sync += r.hit_way.eq(req_hit_way)
598 sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
599 "way:%x RA:%x", i_in.nia, i_in.virt_mode,
600 i_in.stop_mark, req_index, req_tag,
601 req_hit_way, real_addr)
602
603 with m.If(~stall_in):
604 # Send stop marks and NIA down regardless of validity
605 sync += r.hit_smark.eq(i_in.stop_mark)
606 sync += r.hit_nia.eq(i_in.nia)
607
608 def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
609 req_index, req_tag, replace_way, real_addr):
610 comb = m.d.comb
611 sync = m.d.sync
612
613 i_in = self.i_in
614
615 # Reset per-row valid flags, only used in WAIT_ACK
616 for i in range(ROW_PER_LINE):
617 sync += r.rows_valid[i].eq(0)
618
619 # We need to read a cache line
620 with m.If(req_is_miss):
621 sync += Display(
622 "cache miss nia:%x IR:%x SM:%x idx:%x "
623 " way:%x tag:%x RA:%x", i_in.nia,
624 i_in.virt_mode, i_in.stop_mark, req_index,
625 replace_way, req_tag, real_addr)
626
627 # Keep track of our index and way for subsequent stores
628 st_row = Signal(ROW_BITS)
629 comb += st_row.eq(get_row(req_laddr))
630 sync += r.store_index.eq(req_index)
631 sync += r.store_row.eq(st_row)
632 sync += r.store_tag.eq(req_tag)
633 sync += r.store_valid.eq(1)
634 sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
635
636 # Prep for first wishbone read. We calculate the address
637 # of the start of the cache line and start the WB cycle.
638 sync += r.req_adr.eq(req_laddr)
639 sync += r.wb.cyc.eq(1)
640 sync += r.wb.stb.eq(1)
641
642 # Track that we had one request sent
643 sync += r.state.eq(State.CLR_TAG)
644
645 def icache_miss_clr_tag(self, m, r, replace_way,
646 req_index,
647 cache_tags):
648 comb = m.d.comb
649 sync = m.d.sync
650
651 # Get victim way from plru
652 sync += r.store_way.eq(replace_way)
653
654 # Force misses on that way while reloading that line
655 cv = Signal(NUM_WAYS)
656 comb += cv.eq(cache_tags[req_index].valid)
657 comb += cv.bit_select(replace_way, 1).eq(0)
658 sync += cache_tags[req_index].valid.eq(cv)
659
660 for i in range(NUM_WAYS):
661 with m.If(i == replace_way):
662 tagset = Signal(TAG_RAM_WIDTH)
663 comb += tagset.eq(cache_tags[r.store_index].tag)
664 comb += write_tag(i, tagset, r.store_tag)
665 sync += cache_tags[r.store_index].tag.eq(tagset)
666
667 sync += r.state.eq(State.WAIT_ACK)
668
669 def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
670 cache_tags, stbs_done):
671 comb = m.d.comb
672 sync = m.d.sync
673
674 bus = self.bus
675
676 # Requests are all sent if stb is 0
677 stbs_zero = Signal()
678 comb += stbs_zero.eq(r.wb.stb == 0)
679 comb += stbs_done.eq(stbs_zero)
680
681 # If we are still sending requests, was one accepted?
682 with m.If(~bus.stall & ~stbs_zero):
683 # That was the last word? We are done sending.
684 # Clear stb and set stbs_done so we can handle
685 # an eventual last ack on the same cycle.
686 with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
687 sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
688 "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
689 "stbs_done:%x", r.wb.adr, r.end_row_ix,
690 r.wb.stb, stbs_zero, stbs_done)
691 sync += r.wb.stb.eq(0)
692 comb += stbs_done.eq(1)
693
694 # Calculate the next row address
695 rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
696 comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
697 sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
698 sync += Display("RARANGE r.req_adr:%x rarange:%x "
699 "stbs_zero:%x stbs_done:%x",
700 r.req_adr, rarange, stbs_zero, stbs_done)
701
702 # Incoming acks processing
703 with m.If(bus.ack):
704 sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
705 "stbs_done:%x",
706 bus.dat_r, stbs_zero, stbs_done)
707
708 sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
709
710 # Check for completion
711 with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
712 # Complete wishbone cycle
713 sync += r.wb.cyc.eq(0)
714 # be nice, clear addr
715 sync += r.req_adr.eq(0)
716
717 # Cache line is now valid
718 cv = Signal(NUM_WAYS)
719 comb += cv.eq(cache_tags[r.store_index].valid)
720 comb += cv.bit_select(replace_way, 1).eq(
721 r.store_valid & ~inval_in)
722 sync += cache_tags[r.store_index].valid.eq(cv)
723
724 sync += r.state.eq(State.IDLE)
725
726 # move on to next request in row
727 # Increment store row counter
728 sync += r.store_row.eq(next_row(r.store_row))
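
# Illustrative timeline for one line reload with the default geometry
# (8 rows per line), derived from the logic above:
#   - IDLE latched the line-aligned address and raised cyc+stb
#   - while stb is high and bus.stall is low, req_adr advances one row per
#     accepted request; stb drops once the last row has been accepted
#   - every bus.ack sets rows_valid[] for store_row and bumps store_row;
#     the ack for the last row marks the whole line valid, clears cyc and
#     returns the machine to IDLE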
729
730 # Cache miss/reload synchronous machine
731 def icache_miss(self, m, r, req_is_miss,
732 req_index, req_laddr, req_tag, replace_way,
733 cache_tags, access_ok, real_addr):
734 comb = m.d.comb
735 sync = m.d.sync
736
737 i_in, bus, m_in = self.i_in, self.bus, self.m_in
738 stall_in, flush_in = self.stall_in, self.flush_in
739 inval_in = self.inval_in
740
741 stbs_done = Signal()
742
743 comb += r.wb.sel.eq(-1)
744 comb += r.wb.adr.eq(r.req_adr[3:])
745
746 # Process cache invalidations
747 with m.If(inval_in):
748 for i in range(NUM_LINES):
749 sync += cache_tags[i].valid.eq(0)
750 sync += r.store_valid.eq(0)
751
752 # Main state machine
753 with m.Switch(r.state):
754
755 with m.Case(State.IDLE):
756 self.icache_miss_idle(m, r, req_is_miss, req_laddr,
757 req_index, req_tag, replace_way,
758 real_addr)
759
760 with m.Case(State.CLR_TAG, State.WAIT_ACK):
761 with m.If(r.state == State.CLR_TAG):
762 self.icache_miss_clr_tag(m, r, replace_way,
763 req_index, cache_tags)
764
765 self.icache_miss_wait_ack(m, r, replace_way, inval_in,
766 cache_tags, stbs_done)
767
768 # TLB miss and protection fault processing
769 with m.If(flush_in | m_in.tlbld):
770 sync += r.fetch_failed.eq(0)
771 with m.Elif(i_in.req & ~access_ok & ~stall_in):
772 sync += r.fetch_failed.eq(1)
773
774 # icache_log: if LOG_LENGTH > 0 generate
775 def icache_log(self, m, req_hit_way, ra_valid, access_ok,
776 req_is_miss, req_is_hit, lway, wstate, r):
777 comb = m.d.comb
778 sync = m.d.sync
779
780 bus, i_out = self.bus, self.i_out
781 log_out, stall_out = self.log_out, self.stall_out
782
783 # Output data to logger
784 for i in range(LOG_LENGTH):
785 log_data = Signal(54)
786 lway = Signal(WAY_BITS)
787 wstate = Signal()
788
789 sync += lway.eq(req_hit_way)
790 sync += wstate.eq(0)
791
792 with m.If(r.state != State.IDLE):
793 sync += wstate.eq(1)
794
795 sync += log_data.eq(Cat(
796 ra_valid, access_ok, req_is_miss, req_is_hit,
797 lway, wstate, r.hit_nia[2:6], r.fetch_failed,
798 stall_out, bus.stall, r.wb.cyc, r.wb.stb,
799 r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
800 ))
801 comb += log_out.eq(log_data)
802
803 def elaborate(self, platform):
804
805 m = Module()
806 comb = m.d.comb
807
808 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
809 cache_tags = CacheTagArray()
810
811 # TLB Array
812 itlb = TLBArray()
813 itlb_valid = TLBValidArray()
814
815 # TODO to be passed to nmigen as ram attributes
816 # attribute ram_style of itlb_tags : signal is "distributed";
817 # attribute ram_style of itlb_ptes : signal is "distributed";
818
819 # Privilege bit from PTE EAA field
820 eaa_priv = Signal()
821
822 r = RegInternal()
823
824 # Async signal on incoming request
825 req_index = Signal(INDEX_BITS)
826 req_row = Signal(ROW_BITS)
827 req_hit_way = Signal(WAY_BITS)
828 req_tag = Signal(TAG_BITS)
829 req_is_hit = Signal()
830 req_is_miss = Signal()
831 req_laddr = Signal(64)
832
833 tlb_req_index = Signal(TLB_BITS)
834 real_addr = Signal(REAL_ADDR_BITS)
835 ra_valid = Signal()
836 priv_fault = Signal()
837 access_ok = Signal()
838 use_previous = Signal()
839
840 cache_out_row = Signal(ROW_SIZE_BITS)
841
842 plru_victim = Signal(WAY_BITS)
843 replace_way = Signal(WAY_BITS)
844
845 self.tlbmem = Memory(depth=TLB_SIZE, width=TLB_EA_TAG_BITS+TLB_PTE_BITS)
846
847 # call sub-functions putting everything together,
848 # using shared signals established above
849 self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
850 self.maybe_plrus(m, r, plru_victim)
851 self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
852 ra_valid, eaa_priv, priv_fault,
853 access_ok)
854 self.itlb_update(m, itlb, itlb_valid)
855 self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
856 req_tag, real_addr, req_laddr,
857 cache_tags, access_ok, req_is_hit, req_is_miss,
858 replace_way, plru_victim, cache_out_row)
859 self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
860 req_index, req_tag, real_addr)
861 self.icache_miss(m, r, req_is_miss, req_index,
862 req_laddr, req_tag, replace_way, cache_tags,
863 access_ok, real_addr)
864 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
865 # req_is_miss, req_is_hit, lway, wstate, r)
866
867 # don't connect up to FetchUnitInterface so that some unit tests
868 # can continue to operate
869 if not self.use_fetch_iface:
870 return m
871
872 # connect to FetchUnitInterface. FetchUnitInterface is undocumented
873 # so needs checking and iterative revising
874 i_in, bus, i_out = self.i_in, self.bus, self.i_out
875 comb += i_in.req.eq(self.a_i_valid)
876 comb += i_in.nia.eq(self.a_pc_i)
877 comb += self.stall_in.eq(self.a_stall_i)
878 comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
879 comb += self.f_badaddr_o.eq(i_out.nia)
880 comb += self.f_instr_o.eq(i_out.insn)
881 comb += self.f_busy_o.eq(~i_out.valid) # probably
882
883 # TODO, connect icache wb_in/wb_out to "standard" nmigen Wishbone bus
884 ibus = self.ibus
885 comb += ibus.adr.eq(self.bus.adr)
886 comb += ibus.dat_w.eq(self.bus.dat_w)
887 comb += ibus.sel.eq(self.bus.sel)
888 comb += ibus.cyc.eq(self.bus.cyc)
889 comb += ibus.stb.eq(self.bus.stb)
890 comb += ibus.we.eq(self.bus.we)
891
892 comb += self.bus.dat_r.eq(ibus.dat_r)
893 comb += self.bus.ack.eq(ibus.ack)
894 if hasattr(ibus, "stall"):
895 comb += self.bus.stall.eq(ibus.stall)
896 else:
897 # fake-up the wishbone stall signal to comply with pipeline mode
898 # same thing is done in dcache.py
899 comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
900
901 return m
902
903
904 def icache_sim(dut):
905 i_in = dut.i_in
906 i_out = dut.i_out
907 m_out = dut.m_in
908
909 yield i_in.priv_mode.eq(1)
910 yield i_in.req.eq(0)
911 yield i_in.nia.eq(0)
912 yield i_in.stop_mark.eq(0)
913 yield m_out.tlbld.eq(0)
914 yield m_out.tlbie.eq(0)
915 yield m_out.addr.eq(0)
916 yield m_out.pte.eq(0)
917 yield
918 yield
919 yield
920 yield
921
922 # miss, stalls for a bit
923 yield i_in.req.eq(1)
924 yield i_in.nia.eq(Const(0x0000000000000004, 64))
925 yield
926 valid = yield i_out.valid
927 while not valid:
928 yield
929 valid = yield i_out.valid
930 yield i_in.req.eq(0)
931
932 insn = yield i_out.insn
933 nia = yield i_out.nia
934 assert insn == 0x00000001, \
935 "insn @%x=%x expected 00000001" % (nia, insn)
936 yield i_in.req.eq(0)
937 yield
938
939 # hit
940 yield i_in.req.eq(1)
941 yield i_in.nia.eq(Const(0x0000000000000008, 64))
942 yield
943 valid = yield i_out.valid
944 while not valid:
945 yield
946 valid = yield i_out.valid
947 yield i_in.req.eq(0)
948
949 nia = yield i_out.nia
950 insn = yield i_out.insn
951 yield
952 assert insn == 0x00000002, \
953 "insn @%x=%x expected 00000002" % (nia, insn)
954
955 # another miss
956 yield i_in.req.eq(1)
957 yield i_in.nia.eq(Const(0x0000000000000040, 64))
958 yield
959 valid = yield i_out.valid
960 while not valid:
961 yield
962 valid = yield i_out.valid
963 yield i_in.req.eq(0)
964
965 nia = yield i_in.nia
966 insn = yield i_out.insn
967 assert insn == 0x00000010, \
968 "insn @%x=%x expected 00000010" % (nia, insn)
969
970 # test something that aliases (this only works because
971 # the unit test SRAM is a depth of 512)
972 yield i_in.req.eq(1)
973 yield i_in.nia.eq(Const(0x0000000000000100, 64))
974 yield
975 yield
976 valid = yield i_out.valid
977 assert not valid
978 for i in range(30):
979 yield
980 yield
981 insn = yield i_out.insn
982 valid = yield i_out.valid
983 insn = yield i_out.insn
984 assert valid
985 assert insn == 0x00000040, \
986 "insn @%x=%x expected 00000040" % (nia, insn)
987 yield i_in.req.eq(0)
988
989
990 def test_icache(mem):
991 from soc.config.test.test_loadstore import TestMemPspec
992 pspec = TestMemPspec(addr_wid=32,
993 mask_wid=8,
994 reg_wid=64,
995 )
996 dut = ICache(pspec)
997
998 memory = Memory(width=64, depth=512, init=mem)
999 sram = SRAM(memory=memory, granularity=8)
1000
1001 m = Module()
1002
1003 m.submodules.icache = dut
1004 m.submodules.sram = sram
1005
1006 m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
1007 m.d.comb += sram.bus.stb.eq(dut.bus.stb)
1008 m.d.comb += sram.bus.we.eq(dut.bus.we)
1009 m.d.comb += sram.bus.sel.eq(dut.bus.sel)
1010 m.d.comb += sram.bus.adr.eq(dut.bus.adr)
1011 m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
1012
1013 m.d.comb += dut.bus.ack.eq(sram.bus.ack)
1014 m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
1015
1016 # nmigen Simulation
1017 sim = Simulator(m)
1018 sim.add_clock(1e-6)
1019
1020 sim.add_sync_process(wrap(icache_sim(dut)))
1021 with sim.write_vcd('test_icache.vcd'):
1022 sim.run()
1023
1024
1025 if __name__ == '__main__':
1026 from soc.config.test.test_loadstore import TestMemPspec
1027 pspec = TestMemPspec(addr_wid=64,
1028 mask_wid=8,
1029 reg_wid=64,
1030 )
1031 dut = ICache(pspec)
1032 vl = rtlil.convert(dut, ports=[])
1033 with open("test_icache.il", "w") as f:
1034 f.write(vl)
1035
1036 # set up memory every 32-bits with incrementing values 0 1 2 ...
1037 mem = []
1038 for i in range(512):
1039 mem.append((i*2) | ((i*2+1)<<32))
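
# With this init pattern the 32-bit instruction at byte address A is simply
# A//4 (64-bit row i holds words 2*i and 2*i+1), which is what the asserts
# in icache_sim() above check: nia 0x4 -> 0x1, 0x8 -> 0x2, 0x40 -> 0x10,
# 0x100 -> 0x40.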
1040
1041 test_icache(mem)