"""ICache

based on Anton Blanchard microwatt icache.vhdl

Set associative icache

TODO (in no specific order):
* Add debug interface to inspect cache content
* Add snoop/invalidate path
* Add multi-hit error detection
* Pipelined bus interface (wb or axi)
* Maybe add parity? There are a few bits free in each BRAM row on Xilinx
* Add optimization: service hits on partially loaded lines
* Add optimization: (maybe) interrupt reload on flush/redirect
* Check if playing with the geometry of the cache tags allows for more
  efficient use of distributed RAM and less logic/muxes. Currently we
  write TAG_BITS width which may not match full ram blocks and might
  cause muxes to be inferred for "partial writes".
* Check if making the read size of PLRU a ROM helps utilization

Links:

* https://bugs.libre-soc.org/show_bug.cgi?id=485
* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
  (discussion about brams for ECP5)

"""
28
29 from enum import (Enum, unique)
30 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
31 Record)
32 from nmigen.cli import main, rtlil
33 from nmutil.iocontrol import RecordObject
34 from nmigen.utils import log2_int
35 from nmigen.lib.coding import Decoder
36 from nmutil.util import Display
37
38 #from nmutil.plru import PLRU
39 from soc.experiment.plru import PLRU, PLRUs
40 from soc.experiment.cache_ram import CacheRam
41
42 from soc.experiment.mem_types import (Fetch1ToICacheType,
43 ICacheToDecode1Type,
44 MMUToICacheType)
45
46 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
47 WB_SEL_BITS, WBAddrType, WBDataType,
48 WBSelType, WBMasterOut, WBSlaveOut,
49 )
50
51 from nmigen_soc.wishbone.bus import Interface
52 from soc.minerva.units.fetch import FetchUnitInterface
53
54
55 # for test
56 from soc.bus.sram import SRAM
57 from nmigen import Memory
58 from nmutil.util import wrap
59 from nmigen.cli import main, rtlil
60
61 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
62 # Also, check out the cxxsim nmigen branch, and latest yosys from git
63 from nmutil.sim_tmp_alternative import Simulator, Settle
64
65
66 SIM = 0
67 LINE_SIZE = 64
68 # BRAM organisation: We never access more than wishbone_data_bits
69 # at a time so to save resources we make the array only that wide,
70 # and use consecutive indices for to make a cache "line"
71 #
72 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
73 ROW_SIZE = WB_DATA_BITS // 8
74 # Number of lines in a set
75 NUM_LINES = 16
76 # Number of ways
77 NUM_WAYS = 4
78 # L1 ITLB number of entries (direct mapped)
79 TLB_SIZE = 64
80 # L1 ITLB log_2(page_size)
81 TLB_LG_PGSZ = 12
82 # Number of real address bits that we store
83 REAL_ADDR_BITS = 56
84 # Non-zero to enable log data collection
85 LOG_LENGTH = 0
86
87 ROW_SIZE_BITS = ROW_SIZE * 8
88 # ROW_PER_LINE is the number of row (wishbone) transactions in a line
89 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
90 # BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
91 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
92 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
93 INSN_PER_ROW = ROW_SIZE_BITS // 32
94
# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
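# i.e. TAG_BITS rounded up to the next multiple of 8 (46 -> 48 with the
# default geometry), so that each way's tag starts on a byte boundary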

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

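# With the default geometry actually configured above (LINE_SIZE=64,
# ROW_SIZE=8, NUM_LINES=16, NUM_WAYS=4, REAL_ADDR_BITS=56) the derived
# values work out as:
#
#   ROW_PER_LINE  = 8    INSN_PER_ROW = 2    BRAM_ROWS     = 128
#   INSN_BITS     = 1    ROW_BITS     = 7    ROW_LINE_BITS = 3
#   LINE_OFF_BITS = 6    ROW_OFF_BITS = 3    INDEX_BITS    = 4
#   SET_SIZE_BITS = 10   TAG_BITS     = 46   WAY_BITS      = 2
#
# so a 56-bit real address splits as | tag (46) | index (4) | offset (6) |
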
# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags
def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS),
                  ('tag', TAG_RAM_WIDTH),
                  ]
    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBArray():
    tlb_layout = [('valid', 1),
                  ('tag', TLB_EA_TAG_BITS),
                  ('pte', TLB_PTE_BITS)
                  ]
    return Array(Record(tlb_layout, name="tlb%d" % x) for x in range(TLB_SIZE))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]
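# (with the default geometry: index = addr[6:10] and row = addr[3:10],
#  i.e. the row is the line index with the row-within-line bits appended)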

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
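# (the adder wraps within the line: with ROW_LINE_BITS=3,
#  next_row(0b0101_111) == 0b0101_000 -- the upper row bits are untouched)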

# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
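# (with INSN_PER_ROW=2 only addr bit 2 matters: it selects the low or
#  high 32-bit word of the 64-bit BRAM row)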

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    # tags are packed TAG_BITS wide (TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS),
    # so select TAG_BITS per way, not the byte-rounded TAG_WIDTH
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
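# (with TLB_LG_PGSZ=12 and TLB_BITS=6 this folds three 6-bit fields of the
#  effective address together: index = EA[12:18] ^ EA[18:24] ^ EA[24:30])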


# Cache reload state machine
@unique
class State(Enum):
    IDLE     = 0
    CLR_TAG  = 1
    WAIT_ACK = 2
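# (reload sequence: IDLE accepts a miss and starts the wishbone cycle,
#  CLR_TAG invalidates the victim way and writes its new tag, WAIT_ACK
#  streams row data in until the last row acks, then back to IDLE)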


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(FetchUnitInterface, Elaboratable):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self, pspec):
        FetchUnitInterface.__init__(self, pspec)
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache_wb")

        self.log_out = Signal(54)

        # use FetchUnitInterface, helps keep some unit tests running
        self.use_fetch_iface = False

    def use_fetch_interface(self):
        self.use_fetch_iface = True

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if NUM_WAYS == 0:
            return

        m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index)  # select victim
        comb += plru_victim.eq(plru.o_index)  # selected victim

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb[tlb_req_index].pte)
        comb += ttag.eq(itlb[tlb_req_index].tag)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb[tlb_req_index].valid)

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)  # index is TLB_BITS wide, not TLB_SIZE
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb[i].valid.eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb[wr_index].valid.eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb[wr_index].pte.eq(m_in.pte)
            sync += itlb[wr_index].valid.eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(Const(0, ROW_OFF_BITS),
                                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE])
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += ctag.eq(cache_tags[req_index].tag)
        comb += cvb.eq(cache_tags[req_index].valid)
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch.  In this
        # case we output the result of a mux.  The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display("cache miss nia:%x IR:%x SM:%x idx:%x "
                            " way:%x tag:%x RA:%x", i_in.nia,
                            i_in.virt_mode, i_in.stop_mark, req_index,
                            replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            tagset, cache_tags):
        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        cv = Signal(NUM_WAYS)  # one valid bit per way
        comb += cv.eq(cache_tags[req_index].valid)
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_tags[req_index].valid.eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index].tag)
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].tag.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_tags, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                cv = Signal(NUM_WAYS)  # one valid bit per way
                comb += cv.eq(cache_tags[r.store_index].valid)
                comb += cv.bit_select(replace_way, 1).eq(
                    r.store_valid & ~inval_in)
                sync += cache_tags[r.store_index].valid.eq(cv)

                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_tags[i].valid.eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index, tagset, cache_tags)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_tags, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()

        # TLB Array
        itlb = TLBArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # fake-up the wishbone stall signal to comply with pipeline mode
        # same thing is done in dcache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        # don't connect up to FetchUnitInterface so that some unit tests
        # can continue to operate
        if not self.use_fetch_iface:
            return m

        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
        # so needs checking and iterative revising
        i_in, bus, i_out = self.i_in, self.bus, self.i_out
        comb += i_in.req.eq(self.a_i_valid)
        comb += i_in.nia.eq(self.a_pc_i)
        comb += self.stall_in.eq(self.a_stall_i)
        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
        comb += self.f_badaddr_o.eq(i_out.nia)
        comb += self.f_instr_o.eq(i_out.insn)
        comb += self.f_busy_o.eq(~i_out.valid)  # probably

        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
        ibus = self.ibus
        comb += ibus.adr.eq(self.bus.adr)
        comb += ibus.dat_w.eq(self.bus.dat_w)
        comb += ibus.sel.eq(self.bus.sel)
        comb += ibus.cyc.eq(self.bus.cyc)
        comb += ibus.stb.eq(self.bus.stb)
        comb += ibus.we.eq(self.bus.we)

        comb += self.bus.dat_r.eq(ibus.dat_r)
        comb += self.bus.ack.eq(ibus.ack)
        if hasattr(ibus, "stall"):
            comb += self.bus.stall.eq(ibus.stall)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_in.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid  # "~valid" on a Python int is always truthy
    for i in range(30):
        yield
    yield
    insn = yield i_out.insn
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=32,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=64,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

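    # (each 64-bit row holds words 2i and 2i+1, so the 32-bit word at byte
    #  address A has value A//4 -- exactly what icache_sim asserts above)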
    test_icache(mem)