1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There are a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allows for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20 """
21
22 from enum import (Enum, unique)
23 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
24 Record)
25 from nmigen.cli import main, rtlil
26 from nmutil.iocontrol import RecordObject
27 from nmigen.utils import log2_int
28 from nmutil.util import Display
29
30 #from nmutil.plru import PLRU
31 from soc.experiment.cache_ram import CacheRam
32 from soc.experiment.plru import PLRU
33
34 from soc.experiment.mem_types import (Fetch1ToICacheType,
35 ICacheToDecode1Type,
36 MMUToICacheType)
37
38 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
39 WB_SEL_BITS, WBAddrType, WBDataType,
40 WBSelType, WBMasterOut, WBSlaveOut,
41 )
42
43 from nmigen_soc.wishbone.bus import Interface
44
45 # for test
46 from soc.bus.sram import SRAM
47 from nmigen import Memory
48 from nmutil.util import wrap
49 from nmigen.cli import main, rtlil
50
51 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
52 # Also, check out the cxxsim nmigen branch, and latest yosys from git
53 from nmutil.sim_tmp_alternative import Simulator, Settle
54
55
56 SIM = 0
57 LINE_SIZE = 64
58 # BRAM organisation: We never access more than wishbone_data_bits
59 # at a time so to save resources we make the array only that wide,
60 # and use consecutive indices to make a cache "line"
61 #
62 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
63 ROW_SIZE = WB_DATA_BITS // 8
64 # Number of lines in a set
65 NUM_LINES = 16
66 # Number of ways
67 NUM_WAYS = 4
68 # L1 ITLB number of entries (direct mapped)
69 TLB_SIZE = 64
70 # L1 ITLB log_2(page_size)
71 TLB_LG_PGSZ = 12
72 # Number of real address bits that we store
73 REAL_ADDR_BITS = 56
74 # Non-zero to enable log data collection
75 LOG_LENGTH = 0
76
77 ROW_SIZE_BITS = ROW_SIZE * 8
78 # ROW_PER_LINE is the number of row (wishbone) transactions in a line
79 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
80 # BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
81 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
82 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
83 INSN_PER_ROW = ROW_SIZE_BITS // 32
84
85 # Bit fields counts in the address
86 #
87 # INSN_BITS is the number of bits to select an instruction in a row
88 INSN_BITS = log2_int(INSN_PER_ROW)
89 # ROW_BITS is the number of bits to select a row
90 ROW_BITS = log2_int(BRAM_ROWS)
91 # ROW_LINE_BITS is the number of bits to select a row within a line
92 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
93 # LINE_OFF_BITS is the number of bits for the offset in a cache line
94 LINE_OFF_BITS = log2_int(LINE_SIZE)
95 # ROW_OFF_BITS is the number of bits for the offset in a row
96 ROW_OFF_BITS = log2_int(ROW_SIZE)
97 # INDEX_BITS is the number of bits to select a cache line
98 INDEX_BITS = log2_int(NUM_LINES)
99 # SET_SIZE_BITS is the log base 2 of the set size
100 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
101 # TAG_BITS is the number of bits of the tag part of the address
102 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
103 # TAG_WIDTH is the width in bits of each way of the tag RAM
104 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
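# (with the default geometry TAG_BITS = 46, so TAG_WIDTH rounds it up to
#  the next multiple of 8, i.e. 48)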
105
106 # WAY_BITS is the number of bits to select a way
107 WAY_BITS = log2_int(NUM_WAYS)
108 TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
109
110 # L1 ITLB
111 TLB_BITS = log2_int(TLB_SIZE)
112 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
113 TLB_PTE_BITS = 64
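# (with TLB_SIZE = 64 and TLB_LG_PGSZ = 12: TLB_BITS = 6, the direct-mapped
#  index is a 6-bit hash of the EA above the page offset -- see hash_ea()
#  below -- and the TLB_EA_TAG_BITS = 46 tag bits EA[63:18] are compared
#  on lookup)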
114
115 print("BRAM_ROWS =", BRAM_ROWS)
116 print("INDEX_BITS =", INDEX_BITS)
117 print("INSN_BITS =", INSN_BITS)
118 print("INSN_PER_ROW =", INSN_PER_ROW)
119 print("LINE_SIZE =", LINE_SIZE)
120 print("LINE_OFF_BITS =", LINE_OFF_BITS)
121 print("LOG_LENGTH =", LOG_LENGTH)
122 print("NUM_LINES =", NUM_LINES)
123 print("NUM_WAYS =", NUM_WAYS)
124 print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
125 print("ROW_BITS =", ROW_BITS)
126 print("ROW_OFF_BITS =", ROW_OFF_BITS)
127 print("ROW_LINE_BITS =", ROW_LINE_BITS)
128 print("ROW_PER_LINE =", ROW_PER_LINE)
129 print("ROW_SIZE =", ROW_SIZE)
130 print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
131 print("SET_SIZE_BITS =", SET_SIZE_BITS)
132 print("SIM =", SIM)
133 print("TAG_BITS =", TAG_BITS)
134 print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
135 print("TAG_BITS =", TAG_BITS)
136 print("TLB_BITS =", TLB_BITS)
137 print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
138 print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
139 print("TLB_PTE_BITS =", TLB_PTE_BITS)
140 print("TLB_SIZE =", TLB_SIZE)
141 print("WAY_BITS =", WAY_BITS)
142
143 # from microwatt/utils.vhdl
144 def ispow2(n):
145 return n != 0 and (n & (n - 1)) == 0
146
147 assert LINE_SIZE % ROW_SIZE == 0
148 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
149 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
150 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
151 assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
152 assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
153 "geometry bits don't add up"
154 assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
155 "geometry bits don't add up"
156 assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
157 "geometry bits don't add up"
158 assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
159 "geometry bits don't add up"
160
161 # Example of layout for 32 lines of 64 bytes:
162 #
163 # .. tag |index| line |
164 # .. | row | |
165 # .. | | | |00| zero (2)
166 # .. | | |-| | INSN_BITS (1)
167 # .. | |---| | ROW_LINE_BITS (3)
168 # .. | |--- - --| LINE_OFF_BITS (6)
169 # .. | |- --| ROW_OFF_BITS (3)
170 # .. |----- ---| | ROW_BITS (8)
171 # .. |-----| | INDEX_BITS (5)
172 # .. --------| | TAG_BITS (53)
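# For the default geometry in this file (LINE_SIZE=64, NUM_LINES=16,
# NUM_WAYS=4, 64-bit rows) the real address therefore splits as:
#
#   bits [1:0]   always zero (instructions are 4-byte aligned)
#   bit  [2]     instruction within a row    (INSN_BITS = 1)
#   bits [5:3]   row within the line         (ROW_LINE_BITS = 3)
#   bits [9:6]   cache line index            (INDEX_BITS = 4)
#   bits [55:10] tag                         (TAG_BITS = 46)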
173
174 # The cache data BRAM, organized as described above, one per way
175 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
176 #
177 # The cache tags LUTRAM has a row per set. Vivado is a pain and will
178 # not handle a clean (commented) definition of the cache tags as a 3d
179 # memory. For now, work around it by putting all the tags in one word
180 def CacheTagArray():
181 tag_layout = [('valid', NUM_WAYS), # one valid bit per way
182 ('tag', TAG_RAM_WIDTH),
183 ]
184 return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
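# Each entry of the array above describes one set (one cache line index):
# a valid bit per way plus all NUM_WAYS tags packed into a single
# TAG_RAM_WIDTH-bit word, accessed via read_tag()/write_tag() below.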
185
186 def RowPerLineValidArray():
187 return Array(Signal(name="rows_valid_%d" %x) \
188 for x in range(ROW_PER_LINE))
189
190
191 # TODO to be passed to nmigen as ram attributes
192 # attribute ram_style : string;
193 # attribute ram_style of cache_tags : signal is "distributed";
194
195 def TLBArray():
196 tlb_layout = [('valid', 1),
197 ('tag', TLB_EA_TAG_BITS),
198 ('pte', TLB_PTE_BITS)
199 ]
200 return Array(Record(tlb_layout, name="tlb%d" % x) for x in range(TLB_SIZE))
201
202 # Cache RAM interface
203 def CacheRamOut():
204 return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
205 for x in range(NUM_WAYS))
206
207 # PLRU output interface
208 def PLRUOut():
209 return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
210 for x in range(NUM_LINES))
211
212 # Return the cache line index (tag index) for an address
213 def get_index(addr):
214 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
215
216 # Return the cache row index (data memory) for an address
217 def get_row(addr):
218 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
219
220 # Return the index of a row within a line
221 def get_row_of_line(row):
222 return row[:ROW_BITS][:ROW_LINE_BITS]
223
224 # Returns whether the address is in the last row of a line
225 def is_last_row_addr(addr, last):
226 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
227
228 # Returns whether this is the last row of a line
229 def is_last_row(row, last):
230 return get_row_of_line(row) == last
231
232 # Return the next row in the current cache line. We use a dedicated
233 # function in order to limit the size of the generated adder to be
234 # only the bits within a cache line (3 bits with default settings)
235 def next_row(row):
236 row_v = row[0:ROW_LINE_BITS] + 1
237 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
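# (for instance with ROW_LINE_BITS = 3, row 0b0001_111 -- the last row of
#  line 1 -- wraps to 0b0001_000: only the low 3 bits are incremented and
#  the upper, line-index, bits pass through unchanged)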
238
239 # Read the instruction word for the given address
240 # in the current cache row
241 def read_insn_word(addr, data):
242 word = addr[2:INSN_BITS+2]
243 return data.word_select(word, 32)
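# (with INSN_PER_ROW = 2 the slice above is just address bit 2: it selects
#  the low (0) or high (1) 32-bit instruction of the 64-bit BRAM row)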
244
245 # Get the tag value from the address
246 def get_tag(addr):
247 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
248
249 # Read a tag from a tag memory row
250 def read_tag(way, tagset):
251 return tagset.word_select(way, TAG_BITS) # TAG_RAM_WIDTH is TAG_BITS * NUM_WAYS
252
253 # Write a tag to tag memory row
254 def write_tag(way, tagset, tag):
255 return read_tag(way, tagset).eq(tag)
256
257 # Simple hash for direct-mapped TLB index
258 def hash_ea(addr):
259 hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
260 TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
261 ] ^ addr[
262 TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
263 ]
264 return hsh
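
# For illustration only -- a plain-Python sketch, not used by the hardware
# above: the same fold expressed on an ordinary integer address.  The three
# TLB_BITS-wide fields immediately above the page offset are XORed together
# to form the direct-mapped TLB index.
def _hash_ea_int(addr):
    def fld(lo):
        return (addr >> lo) & (TLB_SIZE - 1)
    return (fld(TLB_LG_PGSZ) ^
            fld(TLB_LG_PGSZ + TLB_BITS) ^
            fld(TLB_LG_PGSZ + 2 * TLB_BITS))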
265
266
267 # Cache reload state machine
268 @unique
269 class State(Enum):
270 IDLE = 0
271 CLR_TAG = 1
272 WAIT_ACK = 2
273
274
275 class RegInternal(RecordObject):
276 def __init__(self):
277 super().__init__()
278 # Cache hit state (Latches for 1 cycle BRAM access)
279 self.hit_way = Signal(WAY_BITS)
280 self.hit_nia = Signal(64)
281 self.hit_smark = Signal()
282 self.hit_valid = Signal()
283
284 # Cache miss state (reload state machine)
285 self.state = Signal(State, reset=State.IDLE)
286 self.wb = WBMasterOut("wb")
287 self.req_adr = Signal(64)
288 self.store_way = Signal(WAY_BITS)
289 self.store_index = Signal(INDEX_BITS)
290 self.store_row = Signal(ROW_BITS)
291 self.store_tag = Signal(TAG_BITS)
292 self.store_valid = Signal()
293 self.end_row_ix = Signal(ROW_LINE_BITS)
294 self.rows_valid = RowPerLineValidArray()
295
296 # TLB miss state
297 self.fetch_failed = Signal()
298
299
300 class ICache(Elaboratable):
301 """64 bit direct mapped icache. All instructions are 4B aligned."""
302 def __init__(self):
303 self.i_in = Fetch1ToICacheType(name="i_in")
304 self.i_out = ICacheToDecode1Type(name="i_out")
305
306 self.m_in = MMUToICacheType(name="m_in")
307
308 self.stall_in = Signal()
309 self.stall_out = Signal()
310 self.flush_in = Signal()
311 self.inval_in = Signal()
312
313 # standard naming (wired to non-standard for compatibility)
314 self.bus = Interface(addr_width=32,
315 data_width=64,
316 granularity=8,
317 features={'stall'},
318 alignment=0,
319 name="icache")
320
321 self.log_out = Signal(54)
322
323
324 # Generate a cache RAM for each way
325 def rams(self, m, r, cache_out_row, use_previous,
326 replace_way, req_row):
327
328 comb = m.d.comb
329 sync = m.d.sync
330
331 bus, stall_in = self.bus, self.stall_in
332
333 for i in range(NUM_WAYS):
334 do_read = Signal(name="do_rd_%d" % i)
335 do_write = Signal(name="do_wr_%d" % i)
336 rd_addr = Signal(ROW_BITS)
337 wr_addr = Signal(ROW_BITS)
338 d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
339 wr_sel = Signal(ROW_SIZE)
340
341 way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True, ram_num=i)
342 setattr(m.submodules, "cacheram_%d" % i, way)
343
344 comb += way.rd_en.eq(do_read)
345 comb += way.rd_addr.eq(rd_addr)
346 comb += d_out.eq(way.rd_data_o)
347 comb += way.wr_sel.eq(wr_sel)
348 comb += way.wr_addr.eq(wr_addr)
349 comb += way.wr_data.eq(bus.dat_r)
350
351 comb += do_read.eq(~(stall_in | use_previous))
352 comb += do_write.eq(bus.ack & (replace_way == i))
353
354 with m.If(do_write):
355 sync += Display("cache write adr: %x data: %lx",
356 wr_addr, way.wr_data)
357
358 with m.If(r.hit_way == i):
359 comb += cache_out_row.eq(d_out)
360 with m.If(do_read):
361 sync += Display("cache read adr: %x data: %x",
362 req_row, d_out)
363
364 comb += rd_addr.eq(req_row)
365 comb += wr_addr.eq(r.store_row)
366 comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
367
368 # Generate PLRUs
369 def maybe_plrus(self, m, r, plru_victim):
370 comb = m.d.comb
371
372 with m.If(NUM_WAYS > 1):
373 for i in range(NUM_LINES):
374 plru_acc_i = Signal(WAY_BITS)
375 plru_acc_en = Signal()
376 plru = PLRU(WAY_BITS)
377 setattr(m.submodules, "plru_%d" % i, plru)
378
379 comb += plru.acc_i.eq(plru_acc_i)
380 comb += plru.acc_en.eq(plru_acc_en)
381
382 # PLRU interface
383 with m.If(get_index(r.hit_nia) == i):
384 comb += plru.acc_en.eq(r.hit_valid)
385
386 comb += plru.acc_i.eq(r.hit_way)
387 comb += plru_victim[i].eq(plru.lru_o)
388
389 # TLB hit detection and real address generation
390 def itlb_lookup(self, m, tlb_req_index, itlb,
391 real_addr, ra_valid, eaa_priv,
392 priv_fault, access_ok):
393
394 comb = m.d.comb
395
396 i_in = self.i_in
397
398 pte = Signal(TLB_PTE_BITS)
399 ttag = Signal(TLB_EA_TAG_BITS)
400
401 comb += tlb_req_index.eq(hash_ea(i_in.nia))
402 comb += pte.eq(itlb[tlb_req_index].pte)
403 comb += ttag.eq(itlb[tlb_req_index].tag)
404
405 with m.If(i_in.virt_mode):
406 comb += real_addr.eq(Cat(
407 i_in.nia[:TLB_LG_PGSZ],
408 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
409 ))
410
411 with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
412 comb += ra_valid.eq(itlb[tlb_req_index].valid)
413
414 comb += eaa_priv.eq(pte[3])
415
416 with m.Else():
417 comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
418 comb += ra_valid.eq(1)
419 comb += eaa_priv.eq(1)
420
421 # No IAMR, so no KUEP support for now
422 comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
423 comb += access_ok.eq(ra_valid & ~priv_fault)
424
425 # iTLB update
426 def itlb_update(self, m, itlb):
427 comb = m.d.comb
428 sync = m.d.sync
429
430 m_in = self.m_in
431
432 wr_index = Signal(TLB_BITS)
433 comb += wr_index.eq(hash_ea(m_in.addr))
434
435 with m.If(m_in.tlbie & m_in.doall):
436 # Clear all valid bits
437 for i in range(TLB_SIZE):
438 sync += itlb[i].valid.eq(0)
439
440 with m.Elif(m_in.tlbie):
441 # Clear entry regardless of hit or miss
442 sync += itlb[wr_index].valid.eq(0)
443
444 with m.Elif(m_in.tlbld):
445 sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
446 sync += itlb[wr_index].pte.eq(m_in.pte)
447 sync += itlb[wr_index].valid.eq(1)
448
449 # Cache hit detection, output to fetch2 and other misc logic
450 def icache_comb(self, m, use_previous, r, req_index, req_row,
451 req_hit_way, req_tag, real_addr, req_laddr,
452 cache_tags, access_ok,
453 req_is_hit, req_is_miss, replace_way,
454 plru_victim, cache_out_row):
455
456 comb = m.d.comb
457
458 i_in, i_out, bus = self.i_in, self.i_out, self.bus
459 flush_in, stall_out = self.flush_in, self.stall_out
460
461 is_hit = Signal()
462 hit_way = Signal(WAY_BITS)
463
464 # i_in.sequential means that i_in.nia this cycle is 4 more than
465 # last cycle. If we read more than 32 bits at a time, had a
466 # cache hit last cycle, and we don't want the first 32-bit chunk
467 # then we can keep the data we read last cycle and just use that.
468 with m.If(i_in.nia[2:INSN_BITS+2] != 0):
469 comb += use_previous.eq(i_in.sequential & r.hit_valid)
470
471 # Extract line, row and tag from request
472 comb += req_index.eq(get_index(i_in.nia))
473 comb += req_row.eq(get_row(i_in.nia))
474 comb += req_tag.eq(get_tag(real_addr))
475
476 # Calculate address of beginning of cache row, will be
477 # used for cache miss processing if needed
478 comb += req_laddr.eq(Cat(
479 Const(0, ROW_OFF_BITS),
480 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
481 ))
482
483 # Test if pending request is a hit on any way
484 hitcond = Signal()
485 comb += hitcond.eq((r.state == State.WAIT_ACK)
486 & (req_index == r.store_index)
487 & r.rows_valid[req_row % ROW_PER_LINE]
488 )
489 with m.If(i_in.req):
490 cvb = Signal(NUM_WAYS)
491 ctag = Signal(TAG_RAM_WIDTH)
492 comb += ctag.eq(cache_tags[req_index].tag)
493 comb += cvb.eq(cache_tags[req_index].valid)
494 for i in range(NUM_WAYS):
495 tagi = Signal(TAG_BITS, name="tag_i%d" % i)
496 comb += tagi.eq(read_tag(i, ctag))
497 hit_test = Signal(name="hit_test%d" % i)
498 comb += hit_test.eq(i == r.store_way)
499 with m.If((cvb[i] | (hitcond & hit_test)) & (tagi == req_tag)):
500 comb += hit_way.eq(i)
501 comb += is_hit.eq(1)
502
503 # Generate the "hit" and "miss" signals
504 # for the synchronous blocks
505 with m.If(i_in.req & access_ok & ~flush_in):
506 comb += req_is_hit.eq(is_hit)
507 comb += req_is_miss.eq(~is_hit)
508
509 comb += req_hit_way.eq(hit_way)
510
511 # The way to replace on a miss
512 with m.If(r.state == State.CLR_TAG):
513 comb += replace_way.eq(plru_victim[r.store_index])
514 with m.Else():
515 comb += replace_way.eq(r.store_way)
516
517 # Output instruction from current cache row
518 #
519 # Note: This is a mild violation of our design principle of
520 # having pipeline stages output from a clean latch. In this
521 # case we output the result of a mux. The alternative would
522 # be to output an entire row, which I prefer not to do just yet
523 # as it would force fetch2 to know about some of the cache
524 # geometry information.
525 comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
526 comb += i_out.valid.eq(r.hit_valid)
527 comb += i_out.nia.eq(r.hit_nia)
528 comb += i_out.stop_mark.eq(r.hit_smark)
529 comb += i_out.fetch_failed.eq(r.fetch_failed)
530
531 # Stall fetch1 if we have a miss on cache or TLB
532 # or a protection fault
533 comb += stall_out.eq(~(is_hit & access_ok))
534
535 # Wishbone requests output (from the cache miss reload machine)
536 comb += bus.we.eq(r.wb.we)
537 comb += bus.adr.eq(r.wb.adr)
538 comb += bus.sel.eq(r.wb.sel)
539 comb += bus.stb.eq(r.wb.stb)
540 comb += bus.dat_w.eq(r.wb.dat)
541 comb += bus.cyc.eq(r.wb.cyc)
542
543 # Cache hit synchronous machine
544 def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
545 req_index, req_tag, real_addr):
546 sync = m.d.sync
547
548 i_in, stall_in = self.i_in, self.stall_in
549 flush_in = self.flush_in
550
551 # keep outputs to fetch2 unchanged on a stall
552 # except that flush or reset sets valid to 0
553 # If use_previous, keep the same data as last
554 # cycle and use the second half
555 with m.If(stall_in | use_previous):
556 with m.If(flush_in):
557 sync += r.hit_valid.eq(0)
558 with m.Else():
559 # On a hit, latch the request for the next cycle,
560 # when the BRAM data will be available on the
561 # cache_out output of the corresponding way
562 sync += r.hit_valid.eq(req_is_hit)
563
564 with m.If(req_is_hit):
565 sync += r.hit_way.eq(req_hit_way)
566 sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
567 "way:%x RA:%x", i_in.nia, i_in.virt_mode,
568 i_in.stop_mark, req_index, req_tag,
569 req_hit_way, real_addr)
570
571 with m.If(~stall_in):
572 # Send stop marks and NIA down regardless of validity
573 sync += r.hit_smark.eq(i_in.stop_mark)
574 sync += r.hit_nia.eq(i_in.nia)
575
576 def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
577 req_index, req_tag, replace_way, real_addr):
578 comb = m.d.comb
579 sync = m.d.sync
580
581 i_in = self.i_in
582
583 # Reset per-row valid flags, only used in WAIT_ACK
584 for i in range(ROW_PER_LINE):
585 sync += r.rows_valid[i].eq(0)
586
587 # We need to read a cache line
588 with m.If(req_is_miss):
589 sync += Display(
590 "cache miss nia:%x IR:%x SM:%x idx:%x "
591 " way:%x tag:%x RA:%x", i_in.nia,
592 i_in.virt_mode, i_in.stop_mark, req_index,
593 replace_way, req_tag, real_addr)
594
595 # Keep track of our index and way for subsequent stores
596 st_row = Signal(ROW_BITS)
597 comb += st_row.eq(get_row(req_laddr))
598 sync += r.store_index.eq(req_index)
599 sync += r.store_row.eq(st_row)
600 sync += r.store_tag.eq(req_tag)
601 sync += r.store_valid.eq(1)
602 sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
603
604 # Prep for first wishbone read. We calculate the address
605 # of the start of the cache line and start the WB cycle.
606 sync += r.req_adr.eq(req_laddr)
607 sync += r.wb.cyc.eq(1)
608 sync += r.wb.stb.eq(1)
609
610 # Track that we had one request sent
611 sync += r.state.eq(State.CLR_TAG)
612
613 def icache_miss_clr_tag(self, m, r, replace_way,
614 req_index,
615 tagset, cache_tags):
616 comb = m.d.comb
617 sync = m.d.sync
618
619 # Get victim way from plru
620 sync += r.store_way.eq(replace_way)
621
622 # Force misses on that way while reloading that line
623 cv = Signal(NUM_WAYS)
624 comb += cv.eq(cache_tags[req_index].valid)
625 comb += cv.bit_select(replace_way, 1).eq(0)
626 sync += cache_tags[req_index].valid.eq(cv)
627
628 for i in range(NUM_WAYS):
629 with m.If(i == replace_way):
630 comb += tagset.eq(cache_tags[r.store_index].tag)
631 comb += write_tag(i, tagset, r.store_tag)
632 sync += cache_tags[r.store_index].tag.eq(tagset)
633
634 sync += r.state.eq(State.WAIT_ACK)
635
636 def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
637 cache_tags, stbs_done):
638 comb = m.d.comb
639 sync = m.d.sync
640
641 bus = self.bus
642
643 # Requests are all sent if stb is 0
644 stbs_zero = Signal()
645 comb += stbs_zero.eq(r.wb.stb == 0)
646 comb += stbs_done.eq(stbs_zero)
647
648 # If we are still sending requests, was one accepted?
649 with m.If(~bus.stall & ~stbs_zero):
650 # That was the last word? We are done sending.
651 # Clear stb and set stbs_done so we can handle
652 # an eventual last ack on the same cycle.
653 with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
654 sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
655 "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
656 "stbs_done:%x", r.wb.adr, r.end_row_ix,
657 r.wb.stb, stbs_zero, stbs_done)
658 sync += r.wb.stb.eq(0)
659 comb += stbs_done.eq(1)
660
661 # Calculate the next row address
662 rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
663 comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
664 sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
665 sync += Display("RARANGE r.req_adr:%x rarange:%x "
666 "stbs_zero:%x stbs_done:%x",
667 r.req_adr, rarange, stbs_zero, stbs_done)
668
669 # Incoming acks processing
670 with m.If(bus.ack):
671 sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
672 "stbs_done:%x",
673 bus.dat_r, stbs_zero, stbs_done)
674
675 sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
676
677 # Check for completion
678 with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
679 # Complete wishbone cycle
680 sync += r.wb.cyc.eq(0)
681 # be nice, clear addr
682 sync += r.req_adr.eq(0)
683
684 # Cache line is now valid
685 cv = Signal(NUM_WAYS)
686 comb += cv.eq(cache_tags[r.store_index].valid)
687 comb += cv.bit_select(replace_way, 1).eq(
688 r.store_valid & ~inval_in)
689 sync += cache_tags[r.store_index].valid.eq(cv)
690
691 sync += r.state.eq(State.IDLE)
692
693 # move on to next request in row
694 # Increment store row counter
695 sync += r.store_row.eq(next_row(r.store_row))
696
697 # Cache miss/reload synchronous machine
698 def icache_miss(self, m, r, req_is_miss,
699 req_index, req_laddr, req_tag, replace_way,
700 cache_tags, access_ok, real_addr):
701 comb = m.d.comb
702 sync = m.d.sync
703
704 i_in, bus, m_in = self.i_in, self.bus, self.m_in
705 stall_in, flush_in = self.stall_in, self.flush_in
706 inval_in = self.inval_in
707
708 tagset = Signal(TAG_RAM_WIDTH)
709 stbs_done = Signal()
710
711 comb += r.wb.sel.eq(-1)
712 comb += r.wb.adr.eq(r.req_adr[3:])
713
714 # Process cache invalidations
715 with m.If(inval_in):
716 for i in range(NUM_LINES):
717 sync += cache_tags[i].valid.eq(0)
718 sync += r.store_valid.eq(0)
719
720 # Main state machine
721 with m.Switch(r.state):
722
723 with m.Case(State.IDLE):
724 self.icache_miss_idle(m, r, req_is_miss, req_laddr,
725 req_index, req_tag, replace_way,
726 real_addr)
727
728 with m.Case(State.CLR_TAG, State.WAIT_ACK):
729 with m.If(r.state == State.CLR_TAG):
730 self.icache_miss_clr_tag(m, r, replace_way,
731 req_index, tagset, cache_tags)
732
733 self.icache_miss_wait_ack(m, r, replace_way, inval_in,
734 cache_tags, stbs_done)
735
736 # TLB miss and protection fault processing
737 with m.If(flush_in | m_in.tlbld):
738 sync += r.fetch_failed.eq(0)
739 with m.Elif(i_in.req & ~access_ok & ~stall_in):
740 sync += r.fetch_failed.eq(1)
741
742 # icache_log: if LOG_LENGTH > 0 generate
743 def icache_log(self, m, req_hit_way, ra_valid, access_ok,
744 req_is_miss, req_is_hit, lway, wstate, r):
745 comb = m.d.comb
746 sync = m.d.sync
747
748 bus, i_out = self.bus, self.i_out
749 log_out, stall_out = self.log_out, self.stall_out
750
751 # Output data to logger
752 for i in range(LOG_LENGTH):
753 log_data = Signal(54)
754 lway = Signal(WAY_BITS)
755 wstate = Signal()
756
757 sync += lway.eq(req_hit_way)
758 sync += wstate.eq(0)
759
760 with m.If(r.state != State.IDLE):
761 sync += wstate.eq(1)
762
763 sync += log_data.eq(Cat(
764 ra_valid, access_ok, req_is_miss, req_is_hit,
765 lway, wstate, r.hit_nia[2:6], r.fetch_failed,
766 stall_out, bus.stall, r.wb.cyc, r.wb.stb,
767 r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
768 ))
769 comb += log_out.eq(log_data)
770
771 def elaborate(self, platform):
772
773 m = Module()
774 comb = m.d.comb
775
776 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
777 cache_tags = CacheTagArray()
778
779 # TLB Array
780 itlb = TLBArray()
781
782 # TODO to be passed to nmigen as ram attributes
783 # attribute ram_style of itlb_tags : signal is "distributed";
784 # attribute ram_style of itlb_ptes : signal is "distributed";
785
786 # Privilege bit from PTE EAA field
787 eaa_priv = Signal()
788
789 r = RegInternal()
790
791 # Async signal on incoming request
792 req_index = Signal(INDEX_BITS)
793 req_row = Signal(ROW_BITS)
794 req_hit_way = Signal(WAY_BITS)
795 req_tag = Signal(TAG_BITS)
796 req_is_hit = Signal()
797 req_is_miss = Signal()
798 req_laddr = Signal(64)
799
800 tlb_req_index = Signal(TLB_BITS)
801 real_addr = Signal(REAL_ADDR_BITS)
802 ra_valid = Signal()
803 priv_fault = Signal()
804 access_ok = Signal()
805 use_previous = Signal()
806
807 cache_out_row = Signal(ROW_SIZE_BITS)
808
809 plru_victim = PLRUOut()
810 replace_way = Signal(WAY_BITS)
811
812 # fake-up the wishbone stall signal to comply with pipeline mode
813 # same thing is done in dcache.py
814 comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
815
816 # call sub-functions putting everything together,
817 # using shared signals established above
818 self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
819 self.maybe_plrus(m, r, plru_victim)
820 self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
821 ra_valid, eaa_priv, priv_fault,
822 access_ok)
823 self.itlb_update(m, itlb)
824 self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
825 req_tag, real_addr, req_laddr,
826 cache_tags, access_ok, req_is_hit, req_is_miss,
827 replace_way, plru_victim, cache_out_row)
828 self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
829 req_index, req_tag, real_addr)
830 self.icache_miss(m, r, req_is_miss, req_index,
831 req_laddr, req_tag, replace_way, cache_tags,
832 access_ok, real_addr)
833 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
834 # req_is_miss, req_is_hit, lway, wstate, r)
835
836 return m
837
838
839 def icache_sim(dut):
840 i_in = dut.i_in
841 i_out = dut.i_out
842 m_out = dut.m_in
843
844 yield i_in.priv_mode.eq(1)
845 yield i_in.req.eq(0)
846 yield i_in.nia.eq(0)
847 yield i_in.stop_mark.eq(0)
848 yield m_out.tlbld.eq(0)
849 yield m_out.tlbie.eq(0)
850 yield m_out.addr.eq(0)
851 yield m_out.pte.eq(0)
852 yield
853 yield
854 yield
855 yield
856
857 # miss, stalls for a bit
858 yield i_in.req.eq(1)
859 yield i_in.nia.eq(Const(0x0000000000000004, 64))
860 yield
861 valid = yield i_out.valid
862 while not valid:
863 yield
864 valid = yield i_out.valid
865 yield i_in.req.eq(0)
866
867 insn = yield i_out.insn
868 nia = yield i_out.nia
869 assert insn == 0x00000001, \
870 "insn @%x=%x expected 00000001" % (nia, insn)
871 yield i_in.req.eq(0)
872 yield
873
874 # hit
875 yield i_in.req.eq(1)
876 yield i_in.nia.eq(Const(0x0000000000000008, 64))
877 yield
878 valid = yield i_out.valid
879 while not valid:
880 yield
881 valid = yield i_out.valid
882 yield i_in.req.eq(0)
883
884 nia = yield i_out.nia
885 insn = yield i_out.insn
886 yield
887 assert insn == 0x00000002, \
888 "insn @%x=%x expected 00000002" % (nia, insn)
889
890 # another miss
891 yield i_in.req.eq(1)
892 yield i_in.nia.eq(Const(0x0000000000000040, 64))
893 yield
894 valid = yield i_out.valid
895 while not valid:
896 yield
897 valid = yield i_out.valid
898 yield i_in.req.eq(0)
899
900 nia = yield i_in.nia
901 insn = yield i_out.insn
902 assert insn == 0x00000010, \
903 "insn @%x=%x expected 00000010" % (nia, insn)
904
905 # test something that aliases (this only works because
906 # the unit test SRAM is a depth of 512)
907 yield i_in.req.eq(1)
908 yield i_in.nia.eq(Const(0x0000000000000100, 64))
909 yield
910 yield
911 valid = yield i_out.valid
912 assert not valid
913 for i in range(30):
914 yield
915 yield
916 insn = yield i_out.insn
917 valid = yield i_out.valid
918 insn = yield i_out.insn
919 assert valid
920 assert insn == 0x00000040, \
921 "insn @%x=%x expected 00000040" % (nia, insn)
922 yield i_in.req.eq(0)
923
924
925 def test_icache(mem):
926 dut = ICache()
927
928 memory = Memory(width=64, depth=512, init=mem)
929 sram = SRAM(memory=memory, granularity=8)
930
931 m = Module()
932
933 m.submodules.icache = dut
934 m.submodules.sram = sram
935
936 m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
937 m.d.comb += sram.bus.stb.eq(dut.bus.stb)
938 m.d.comb += sram.bus.we.eq(dut.bus.we)
939 m.d.comb += sram.bus.sel.eq(dut.bus.sel)
940 m.d.comb += sram.bus.adr.eq(dut.bus.adr)
941 m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
942
943 m.d.comb += dut.bus.ack.eq(sram.bus.ack)
944 m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
945
946 # nmigen Simulation
947 sim = Simulator(m)
948 sim.add_clock(1e-6)
949
950 sim.add_sync_process(wrap(icache_sim(dut)))
951 with sim.write_vcd('test_icache.vcd'):
952 sim.run()
953
954
955 if __name__ == '__main__':
956 dut = ICache()
957 vl = rtlil.convert(dut, ports=[])
958 with open("test_icache.il", "w") as f:
959 f.write(vl)
960
961 # set up memory every 32-bits with incrementing values 0 1 2 ...
962 mem = []
963 for i in range(512):
964 mem.append((i*2) | ((i*2+1)<<32))
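# Doubleword i of the test memory thus holds the two "instruction" words
# 2*i (low half) and 2*i+1 (high half), so the word at byte address A is
# simply A//4 -- e.g. address 0x40 reads 0x10, matching the asserts in
# icache_sim() above.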
965
966 test_icache(mem)