1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

# for test
from nmigen_soc.wishbone.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

if True:
    from nmigen.back.pysim import Simulator, Delay, Settle
else:
    from nmigen.sim.cxxsim import Simulator, Delay, Settle

SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row
# (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in
# BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit
# instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to
# select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM:
# TAG_BITS rounded up to the next multiple of 8
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

#-- L1 ITLB.
#constant TLB_BITS : natural := log2(TLB_SIZE);
#constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
#constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

128 print("BRAM_ROWS =", BRAM_ROWS)
129 print("INDEX_BITS =", INDEX_BITS)
130 print("INSN_BITS =", INSN_BITS)
131 print("INSN_PER_ROW =", INSN_PER_ROW)
132 print("LINE_SIZE =", LINE_SIZE)
133 print("LINE_OFF_BITS =", LINE_OFF_BITS)
134 print("LOG_LENGTH =", LOG_LENGTH)
135 print("NUM_LINES =", NUM_LINES)
136 print("NUM_WAYS =", NUM_WAYS)
137 print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
138 print("ROW_BITS =", ROW_BITS)
139 print("ROW_OFF_BITS =", ROW_OFF_BITS)
140 print("ROW_LINE_BITS =", ROW_LINE_BITS)
141 print("ROW_PER_LINE =", ROW_PER_LINE)
142 print("ROW_SIZE =", ROW_SIZE)
143 print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
144 print("SET_SIZE_BITS =", SET_SIZE_BITS)
145 print("SIM =", SIM)
146 print("TAG_BITS =", TAG_BITS)
147 print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
148 print("TAG_BITS =", TAG_BITS)
149 print("TLB_BITS =", TLB_BITS)
150 print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
151 print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
152 print("TLB_PTE_BITS =", TLB_PTE_BITS)
153 print("TLB_SIZE =", TLB_SIZE)
154 print("WAY_BITS =", WAY_BITS)
155
# from microwatt/utils.vhdl
def ispow2(n):
    """True iff n is a non-zero power of 2, e.g. 64 -> True, 48 -> False"""
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
        "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
        "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
        "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
        "geometry bits don't add up"

# architecture rtl of icache is
#constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
#-- ROW_PER_LINE is the number of row (wishbone
#-- transactions) in a line
#constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
#-- BRAM_ROWS is the number of rows in BRAM
#-- needed to represent the full
#-- icache
#constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
#-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
#constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
#-- Bit fields counts in the address
#
#-- INSN_BITS is the number of bits to select
#-- an instruction in a row
#constant INSN_BITS : natural := log2(INSN_PER_ROW);
#-- ROW_BITS is the number of bits to select a row
#constant ROW_BITS : natural := log2(BRAM_ROWS);
#-- ROW_LINE_BITS is the number of bits to
#-- select a row within a line
#constant ROW_LINE_BITS : natural := log2(ROW_PER_LINE);
#-- LINE_OFF_BITS is the number of bits for the offset
#-- in a cache line
#constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
#-- ROW_OFF_BITS is the number of bits for the offset in a row
#constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
#-- INDEX_BITS is the number of bits to select a cache line
#constant INDEX_BITS : natural := log2(NUM_LINES);
#-- SET_SIZE_BITS is the log base 2 of the set size
#constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
#-- TAG_BITS is the number of bits of the tag part of the address
#constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
#-- WAY_BITS is the number of bits to select a way
#constant WAY_BITS : natural := log2(NUM_WAYS);

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

#subtype row_t is integer range 0 to BRAM_ROWS-1;
#subtype index_t is integer range 0 to NUM_LINES-1;
#subtype way_t is integer range 0 to NUM_WAYS-1;
#subtype row_in_line_t is unsigned(ROW_LINE_BITS-1 downto 0);
#
#-- The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
#-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
#-- not handle a clean (commented) definition of the cache tags as a 3d
#-- memory. For now, work around it by putting all the tags
#subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
# type cache_tags_set_t is array(way_t) of cache_tag_t;
# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
#constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
#subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
#type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x)
                 for x in range(NUM_LINES))

#-- The cache valid bits
#subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
#type cache_valids_t is array(index_t) of cache_way_valids_t;
#type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x)
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x)
                 for x in range(ROW_PER_LINE))


# TODO: to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


#subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
#type tlb_valids_t is array(tlb_index_t) of std_ulogic;
#subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
#type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
#subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
#type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    return Array(Signal(name="tlbvalid_%d" % x)
                 for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" % x)
                 for x in range(TLB_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" % x)
                 for x in range(TLB_SIZE))


#-- Cache RAM interface
#type cache_ram_out_t is array(way_t) of cache_row_t;
# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x)
                 for x in range(NUM_WAYS))

#-- PLRU output interface
#type plru_out_t is array(index_t) of
#    std_ulogic_vector(WAY_BITS-1 downto 0);
# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x)
                 for x in range(NUM_LINES))

# -- Return the cache line index (tag index) for an address
# function get_index(addr: std_ulogic_vector(63 downto 0))
#     return index_t is
# begin
#     return to_integer(unsigned(
#         addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
#     ));
# end;
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# -- Return the cache row index (data memory) for an address
# function get_row(addr: std_ulogic_vector(63 downto 0))
#     return row_t is
# begin
#     return to_integer(unsigned(
#         addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
#     ));
# end;
# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# -- Return the index of a row within a line
# function get_row_of_line(row: row_t) return row_in_line_t is
#     variable row_v : unsigned(ROW_BITS-1 downto 0);
# begin
#     row_v := to_unsigned(row, ROW_BITS);
#     return row_v(ROW_LINE_BITS-1 downto 0);
# end;
# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# -- Returns whether this is the last row of a line
# function is_last_row_addr(addr: wishbone_addr_type;
#                           last: row_in_line_t)
#     return boolean is
# begin
#     return unsigned(
#         addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
#     ) = last;
# end;
# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# -- Returns whether this is the last row of a line
# function is_last_row(row: row_t;
#                      last: row_in_line_t) return boolean is
# begin
#     return get_row_of_line(row) = last;
# end;
# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# -- Return the next row in the current cache line. We use a dedicated
# -- function in order to limit the size of the generated adder to be
# -- only the bits within a cache line (3 bits with default settings)
# function next_row(row: row_t) return row_t is
#     variable row_v   : std_ulogic_vector(ROW_BITS-1 downto 0);
#     variable row_idx : std_ulogic_vector(ROW_LINE_BITS-1 downto 0);
#     variable result  : std_ulogic_vector(ROW_BITS-1 downto 0);
# begin
#     row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
#     row_idx := row_v(ROW_LINE_BITS-1 downto 0);
#     row_v(ROW_LINE_BITS-1 downto 0) :=
#         std_ulogic_vector(unsigned(row_idx) + 1);
#     return to_integer(unsigned(row_v));
# end;
# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
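
# Integer model of next_row (an illustrative sketch, not used by the
# hardware): only the low ROW_LINE_BITS bits increment, wrapping within
# the line, e.g. with ROW_LINE_BITS=3, 0b0101_111 -> 0b0101_000.
def _next_row_model(row):
    line_base = row & ~((1 << ROW_LINE_BITS) - 1)
    return line_base | ((row + 1) & ((1 << ROW_LINE_BITS) - 1))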

# -- Read the instruction word for the given address in the
# -- current cache row
# function read_insn_word(addr: std_ulogic_vector(63 downto 0);
#                         data: cache_row_t) return std_ulogic_vector is
#     variable word: integer range 0 to INSN_PER_ROW-1;
# begin
#     word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
#     return data(31+word*32 downto word*32);
# end;
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
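
# With 64-bit rows, INSN_PER_ROW=2 and INSN_BITS=1, so the word select
# is just addr[2]: 0 picks data[0:32] and 1 picks data[32:64].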

# -- Get the tag value from the address
# function get_tag(
#     addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
#     )
#     return cache_tag_t is
# begin
#     return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
# end;
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
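
# Integer models of the address-slicing helpers above (illustrative
# sketches of the geometry, not used by the hardware):
def _get_index_model(addr):
    return (addr >> LINE_OFF_BITS) & ((1 << INDEX_BITS) - 1)

def _get_row_model(addr):
    return (addr >> ROW_OFF_BITS) & ((1 << ROW_BITS) - 1)

def _get_tag_model(addr):
    return (addr >> SET_SIZE_BITS) & ((1 << TAG_BITS) - 1)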

# -- Read a tag from a tag memory row
# function read_tag(way: way_t; tagset: cache_tags_set_t)
#     return cache_tag_t is
# begin
#     return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
# end;
# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# -- Write a tag to tag memory row
# procedure write_tag(way: in way_t;
#     tagset: inout cache_tags_set_t; tag: cache_tag_t) is
# begin
#     tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
# end;
# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)
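
# Tag packing (an illustrative model, not used by the hardware): a
# tag-RAM row concatenates NUM_WAYS tags of TAG_BITS each, way 0 in the
# least significant bits, so e.g. way 1 occupies tagset[TAG_BITS:2*TAG_BITS].
def _read_tag_model(way, tagset):
    return (tagset >> (way * TAG_BITS)) & ((1 << TAG_BITS) - 1)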

# -- Simple hash for direct-mapped TLB index
# function hash_ea(addr: std_ulogic_vector(63 downto 0))
#     return tlb_index_t is
#     variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
# begin
#     hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
#             xor addr(TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
#                      TLB_LG_PGSZ + TLB_BITS)
#             xor addr(TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
#                      TLB_LG_PGSZ + 2 * TLB_BITS);
#     return to_integer(unsigned(hash));
# end;
# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
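
# Integer model of hash_ea (illustrative, not used by the hardware):
# XOR-fold three TLB_BITS-wide fields of the virtual page number into
# a single direct-mapped TLB index.
def _hash_ea_model(addr):
    mask = (1 << TLB_BITS) - 1
    vpn = addr >> TLB_LG_PGSZ
    return (vpn ^ (vpn >> TLB_BITS) ^ (vpn >> (2 * TLB_BITS))) & mask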


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2

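# Reload sequencing (sketch): IDLE -(miss)-> CLR_TAG (invalidate the
# victim way and write its new tag) -> WAIT_ACK (stream wishbone acks
# into the BRAM row by row) -(last row acked)-> IDLE.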

class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()

# -- 64 bit direct mapped icache. All instructions are 4B aligned.
#
# entity icache is
#     generic (
#         SIM : boolean := false;
#         -- Line size in bytes
#         LINE_SIZE : positive := 64;
#         -- BRAM organisation: We never access more
#         -- than wishbone_data_bits
#         -- at a time so to save resources we make the
#         -- array only that wide,
#         -- and use consecutive indices for to make a cache "line"
#         --
#         -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
#         -- so 64-bits)
#         ROW_SIZE : positive := wishbone_data_bits / 8;
#         -- Number of lines in a set
#         NUM_LINES : positive := 32;
#         -- Number of ways
#         NUM_WAYS : positive := 4;
#         -- L1 ITLB number of entries (direct mapped)
#         TLB_SIZE : positive := 64;
#         -- L1 ITLB log_2(page_size)
#         TLB_LG_PGSZ : positive := 12;
#         -- Number of real address bits that we store
#         REAL_ADDR_BITS : positive := 56;
#         -- Non-zero to enable log data collection
#         LOG_LENGTH : natural := 0
#     );
#     port (
#         clk          : in std_ulogic;
#         rst          : in std_ulogic;
#
#         i_in         : in Fetch1ToIcacheType;
#         i_out        : out IcacheToDecode1Type;
#
#         m_in         : in MmuToIcacheType;
#
#         stall_in     : in std_ulogic;
#         stall_out    : out std_ulogic;
#         flush_in     : in std_ulogic;
#         inval_in     : in std_ulogic;
#
#         wishbone_out : out wishbone_master_out;
#         wishbone_in  : in wishbone_slave_out;
#
#         log_out      : out std_ulogic_vector(53 downto 0)
#     );
# end entity icache;

class ICache(Elaboratable):
    """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        self.log_out = Signal(54)


    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        wb_in, stall_in = self.wb_in, self.stall_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(wb_in.ack & (replace_way == i))

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(r.hit_way == i):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        # elaborate-time condition, like the VHDL "generate"
        if NUM_WAYS > 1:
            for i in range(NUM_LINES):
                plru = PLRU(WAY_BITS)
                setattr(m.submodules, "plru_%d" % i, plru)

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru.acc_en.eq(r.hit_valid)

                comb += plru.acc_i.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru_o)

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                i_in.nia[:TLB_LG_PGSZ],
                pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
            ))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
            )
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valid_bits, cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
            Const(0, ROW_OFF_BITS),
            real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
        ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE])
        with m.If(i_in.req):
            cvb = Signal(NUM_WAYS)
            ctag = Signal(TAG_RAM_WIDTH)
            comb += ctag.eq(cache_tags[req_index])
            comb += cvb.eq(cache_valid_bits[req_index])
            for i in range(NUM_WAYS):
                tagi = Signal(TAG_BITS, name="tag_i%d" % i)
                comb += tagi.eq(read_tag(i, ctag))
                hit_test = Signal(name="hit_test%d" % i)
                comb += hit_test.eq(i == r.store_way)
                with m.If((cvb[i] | (hitcond & hit_test))
                          & (tagi == req_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display(
                    "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                    "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                    i_in.stop_mark, req_index, req_tag,
                    req_hit_way, real_addr
                )

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                "cache miss nia:%x IR:%x SM:%x idx:%x "
                " way:%x tag:%x RA:%x", i_in.nia,
                i_in.virt_mode, i_in.stop_mark, req_index,
                replace_way, req_tag, real_addr
            )

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            cache_valid_bits, req_index,
                            tagset, cache_tags):

        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)
        # Force misses on that way while reloading that line
        cv = Signal(NUM_WAYS)  # per-way valid bits for this index
        comb += cv.eq(cache_valid_bits[req_index])
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_valid_bits[req_index].eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index])
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             stbs_done, cache_valid_bits):
        comb = m.d.comb
        sync = m.d.sync

        wb_in = self.wb_in

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~wb_in.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display(
                    "IS_LAST_ROW_ADDR r.wb.addr:%x "
                    "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                    "stbs_done:%x", r.wb.adr, r.end_row_ix,
                    r.wb.stb, stbs_zero, stbs_done
                )
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(
                r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
            )
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
                rarange
            )
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(wb_in.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            wb_in.dat, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done &
                      is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                sync += r.req_adr.eq(0)  # be nice, clear addr

                # Cache line is now valid
                cv = Signal(NUM_WAYS)  # per-way valid bits for this index
                comb += cv.eq(cache_valid_bits[r.store_index])
                comb += cv.bit_select(replace_way, 1).eq(
                    r.store_valid & ~inval_in
                )
                sync += cache_valid_bits[r.store_index].eq(cv)

                sync += r.state.eq(State.IDLE)

            # not completed, move on to next request in row
            with m.Else():
                # Increment store row counter
                sync += r.store_row.eq(next_row(r.store_row))


    # Cache miss/reload synchronous machine
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        # variable tagset : cache_tags_set_t;
        # variable stbs_done : boolean;

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)  # all byte lanes
        comb += r.wb.adr.eq(r.req_adr[3:])

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(
                    m, r, req_is_miss, req_laddr,
                    req_index, req_tag, replace_way,
                    real_addr
                )

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(
                        m, r, replace_way,
                        cache_valid_bits, req_index,
                        tagset, cache_tags
                    )

                self.icache_miss_wait_ack(
                    m, r, replace_way, inval_in,
                    stbs_done, cache_valid_bits
                )

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # -- Output data to logger
        # signal log_data : std_ulogic_vector(53 downto 0);
        # begin
        #     data_log: process(clk)
        #         variable lway: way_t;
        #         variable wstate: std_ulogic;
        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(3)  # 3 bits in the log format, as in the VHDL
            wstate = Signal()

            # begin
            #     if rising_edge(clk) then
            #         lway := req_hit_way;
            #         wstate := '0';
            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            # if r.state /= IDLE then
            #     wstate := '1';
            # end if;
            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            # log_data <= i_out.valid &
            #             i_out.insn &
            #             wishbone_in.ack &
            #             r.wb.adr(5 downto 3) &
            #             r.wb.stb & r.wb.cyc &
            #             wishbone_in.stall &
            #             stall_out &
            #             r.fetch_failed &
            #             r.hit_nia(5 downto 2) &
            #             wstate &
            #             std_ulogic_vector(to_unsigned(lway, 3)) &
            #             req_is_hit & req_is_miss &
            #             access_ok &
            #             ra_valid;
            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
                r.wb.adr[3:6], wb_in.ack, i_out.insn, i_out.valid
            ))
            # end if;
            # end process;
            # log_out <= log_data;
            comb += log_out.eq(log_data)
        # end generate;
        # end;

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # signal itlb_valids : tlb_valids_t;
        # signal itlb_tags : tlb_tags_t;
        # signal itlb_ptes : tlb_ptes_t;
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPtesArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # -- Privilege bit from PTE EAA field
        # signal eaa_priv : std_ulogic;
        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # signal r : reg_internal_t;
        r = RegInternal()

        # -- Async signals on incoming request
        # signal req_index : index_t;
        # signal req_row : row_t;
        # signal req_hit_way : way_t;
        # signal req_tag : cache_tag_t;
        # signal req_is_hit : std_ulogic;
        # signal req_is_miss : std_ulogic;
        # signal req_laddr : std_ulogic_vector(63 downto 0);
        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # signal tlb_req_index : tlb_index_t;
        # signal real_addr : std_ulogic_vector(
        #     REAL_ADDR_BITS - 1 downto 0
        # );
        # signal ra_valid : std_ulogic;
        # signal priv_fault : std_ulogic;
        # signal access_ok : std_ulogic;
        # signal use_previous : std_ulogic;
        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # signal cache_out : cache_ram_out_t;
        cache_out_row = Signal(ROW_SIZE_BITS)

        # signal plru_victim : plru_out_t;
        # signal replace_way : way_t;
        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(
            m, r, cache_out_row, use_previous, replace_way, req_row
        )
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(
            m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
            itlb_valid_bits, ra_valid, eaa_priv, priv_fault, access_ok
        )
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(
            m, use_previous, r, req_index, req_row, req_hit_way,
            req_tag, real_addr, req_laddr, cache_valid_bits,
            cache_tags, access_ok, req_is_hit, req_is_miss,
            replace_way, plru_victim, cache_out_row
        )
        self.icache_hit(
            m, use_previous, r, req_is_hit, req_hit_way, req_index,
            req_tag, real_addr
        )
        self.icache_miss(
            m, cache_valid_bits, r, req_is_miss, req_index, req_laddr,
            req_tag, replace_way, cache_tags, access_ok, real_addr
        )
        #self.icache_log(
        #    m, log_out, req_hit_way, ra_valid, access_ok,
        #    req_is_miss, req_is_hit, lway, wstate, r
        #)

        return m


def icache_sim(dut):
    # testbench-eye view: i_out drives the DUT's fetch input,
    # i_in observes the DUT's decode output
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    yield i_in.valid.eq(0)
    yield i_out.priv_mode.eq(1)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # first fetch: a miss on the empty cache, wait for the reload
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    print(f"valid? {valid}")
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_in.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

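    # Fill memory with a recognisable pattern: 64-bit row i holds the
    # two 32-bit words 2*i and 2*i+1, so the instruction fetched at
    # address nia is nia//4 (this is what icache_sim asserts).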
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1) << 32))

    test_icache(mem)