1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20 """
21
from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface

# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM,
# i.e. TAG_BITS rounded up to the next byte boundary
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
# each tag RAM row holds one byte-aligned (TAG_WIDTH-wide) tag per way,
# matching the TAG_WIDTH stride used by read_tag/write_tag below
TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TAG_WIDTH =", TAG_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

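# Worked example of the geometry computed above, with the default
# constants (LINE_SIZE=64, NUM_LINES=16, NUM_WAYS=4, 64-bit wishbone):
#   ROW_SIZE=8, ROW_PER_LINE=8, BRAM_ROWS=128, INSN_PER_ROW=2
#   INSN_BITS=1, ROW_LINE_BITS=3, ROW_OFF_BITS=3, LINE_OFF_BITS=6
#   INDEX_BITS=4, ROW_BITS=7, SET_SIZE_BITS=10
#   TAG_BITS=46, TAG_WIDTH=48 (rounded up to a byte multiple), WAY_BITS=2
# i.e. a 56-bit real address splits as:
#   | 46-bit tag | 4-bit index | 6-bit line offset |
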
# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set.  Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory.  For now, work around it by putting all the tags of a set in
# a single row: one valid bit and one TAG_WIDTH-wide tag per way.
def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS),
                  ('tag', TAG_RAM_WIDTH),
                  ]
    return Array(Record(tag_layout, name="tag%d" % x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBArray():
    tlb_layout = [('valid', 1),
                  ('tag', TLB_EA_TAG_BITS),
                  ('pte', TLB_PTE_BITS)
                  ]
    return Array(Record(tlb_layout, name="tlb%d" % x) for x in range(TLB_SIZE))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line.  We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

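# Example for next_row above: with ROW_LINE_BITS=3, row 0b0101_111
# becomes 0b0101_000 - the 3-bit row-within-line counter wraps while
# the upper line-select bits pass through, so only a 3-bit adder is
# generated.
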
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

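# Example for read_insn_word above: with INSN_BITS=1 a 64-bit row holds
# two instructions, and address bit 2 picks word 0 (nia ...0xx) or
# word 1 (nia ...1xx) out of the row.
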
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh


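# Example for hash_ea above: with TLB_LG_PGSZ=12 and TLB_BITS=6 the
# index is ea[12:18] ^ ea[18:24] ^ ea[24:30], XOR-folding three 6-bit
# fields of the effective page number into one direct-mapped TLB index.
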
# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(Elaboratable):
    """64 bit set associative icache.  All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache")

        self.log_out = Signal(54)


    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

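        # The Decoders above follow nmigen one-hot semantics: while n is
        # low, o is the one-hot encoding of i (e.g. for NUM_WAYS=4, i=2
        # gives o=0b0100); asserting n forces o=0.  So re.o selects the
        # replace-way only while an ack is arriving, and he.o always
        # selects the hit-way.
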
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        # note: NUM_WAYS is an elaboration-time constant, so this
        # condition is fixed at build time (a "generate", in effect)
        with m.If(NUM_WAYS > 1):
            m.submodules.plru_e = e = Decoder(NUM_LINES)
            comb += e.i.eq(get_index(r.hit_nia))

            for i in range(NUM_LINES):
                plru = PLRU(WAY_BITS)
                m.submodules["plru_%d" % i] = plru

                # PLRU interface
                with m.If(e.o[i]):
                    comb += plru.acc_en.eq(r.hit_valid)

                comb += plru.acc_i.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru_o)

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb[tlb_req_index].pte)
        comb += ttag.eq(itlb[tlb_req_index].tag)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb[tlb_req_index].valid)

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

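    # (Summary of itlb_lookup above: in virt_mode the page offset,
    # nia[:TLB_LG_PGSZ], comes straight from the effective address and
    # the real page number comes from pte[TLB_LG_PGSZ:REAL_ADDR_BITS];
    # pte[3] is the EAA privilege bit.  In real mode the nia is used
    # as-is and the translation is always valid.)
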
    # iTLB update
    def itlb_update(self, m, itlb):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb[i].valid.eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb[wr_index].valid.eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb[wr_index].pte.eq(m_in.pte)
            sync += itlb[wr_index].valid.eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(Const(0, ROW_OFF_BITS),
                                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE])
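        # (hitcond above lets a fetch hit on the line currently being
        #  reloaded: while in WAIT_ACK, any row already flagged in
        #  r.rows_valid has arrived from wishbone and is safe to use.)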
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += ctag.eq(cache_tags[req_index].tag)
        comb += cvb.eq(cache_tags[req_index].valid)
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch.  In this
        # case we output the result of a mux.  The alternative would be
        # to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display("cache miss nia:%x IR:%x SM:%x idx:%x "
                            " way:%x tag:%x RA:%x", i_in.nia,
                            i_in.virt_mode, i_in.stop_mark, req_index,
                            replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            tagset, cache_tags):
        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line:
        # clear the victim way's valid bit (one bit per way)
        cv = Signal(NUM_WAYS)
        comb += cv.eq(cache_tags[req_index].valid)
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_tags[req_index].valid.eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index].tag)
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].tag.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_tags, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid: set the victim way's valid
                # bit (one bit per way), unless an invalidate arrived
                cv = Signal(NUM_WAYS)
                comb += cv.eq(cache_tags[r.store_index].valid)
                comb += cv.bit_select(replace_way, 1).eq(
                    r.store_valid & ~inval_in)
                sync += cache_tags[r.store_index].valid.eq(cv)

                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_tags[i].valid.eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index, tagset, cache_tags)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_tags, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
            ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage.  Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()

        # TLB Array
        itlb = TLBArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # fake-up the wishbone stall signal to comply with pipeline mode
        # same thing is done in dcache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

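        # (With stall tied to cyc & ~ack above, a request is only
        #  accepted in the cycle its ack arrives, so at most one access
        #  is in flight: pipelined-mode signalling, classic-cycle
        #  behaviour.)
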
        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_out.valid
    insn = yield i_out.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)