1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const)
from nmigen.cli import main
from nmigen.cli import rtlil
from nmutil.iocontrol import RecordObject
from nmutil.byterev import byte_reverse
from nmutil.mask import Mask
from nmigen.utils import log2_int
from nmutil.util import Display

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

# for test
from nmigen_soc.wishbone.sram import SRAM
from nmigen import Memory
from nmigen.cli import rtlil
if True:
    from nmigen.back.pysim import Simulator, Delay, Settle
else:
    from nmigen.sim.cxxsim import Simulator, Delay, Settle
from nmutil.util import wrap


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 32
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row
# (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in
# BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit
# instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to
# select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
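
# With the defaults above (LINE_SIZE=64, NUM_LINES=32, NUM_WAYS=4,
# WB_DATA_BITS=64, REAL_ADDR_BITS=56) these work out to: ROW_SIZE=8,
# ROW_PER_LINE=8, BRAM_ROWS=256, INSN_PER_ROW=2, INSN_BITS=1,
# ROW_BITS=8, ROW_LINE_BITS=3, LINE_OFF_BITS=6, ROW_OFF_BITS=3,
# INDEX_BITS=5, SET_SIZE_BITS=11, TAG_BITS=45, WAY_BITS=2,
# TAG_RAM_WIDTH=180.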

# -- L1 ITLB.
# constant TLB_BITS : natural := log2(TLB_SIZE);
# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
# constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64
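# With the defaults: TLB_BITS = 6 and TLB_EA_TAG_BITS = 46 (64 - 12 - 6).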

# architecture rtl of icache is
#constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
#-- ROW_PER_LINE is the number of row (wishbone
#-- transactions) in a line
#constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
#-- BRAM_ROWS is the number of rows in BRAM
#-- needed to represent the full
#-- icache
#constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
#-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
#constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
#-- Bit fields counts in the address
#
#-- INSN_BITS is the number of bits to select
#-- an instruction in a row
#constant INSN_BITS : natural := log2(INSN_PER_ROW);
#-- ROW_BITS is the number of bits to select a row
#constant ROW_BITS : natural := log2(BRAM_ROWS);
#-- ROW_LINEBITS is the number of bits to
#-- select a row within a line
#constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
#-- LINE_OFF_BITS is the number of bits for the offset
#-- in a cache line
#constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
#-- ROW_OFF_BITS is the number of bits for the offset in a row
#constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
#-- INDEX_BITS is the number of bits to select a cache line
#constant INDEX_BITS : natural := log2(NUM_LINES);
#-- SET_SIZE_BITS is the log base 2 of the set size
#constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
#-- TAG_BITS is the number of bits of the tag part of the address
#constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
#-- WAY_BITS is the number of bits to select a way
#constant WAY_BITS : natural := log2(NUM_WAYS);

#-- Example of layout for 32 lines of 64 bytes:
#--
#-- ..  tag    |index|  line  |
#-- ..         |   row   |    |
#-- ..         |     |   | |00| zero          (2)
#-- ..         |     |   |-|  | INSN_BITS     (1)
#-- ..         |     |---|    | ROW_LINEBITS  (3)
#-- ..         |     |--- - --| LINE_OFF_BITS (6)
#-- ..         |         |- --| ROW_OFF_BITS  (3)
#-- ..         |----- ---|    | ROW_BITS      (8)
#-- ..         |-----|        | INDEX_BITS    (5)
#-- .. --------|              | TAG_BITS      (53)
# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINEBITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

#subtype row_t is integer range 0 to BRAM_ROWS-1;
#subtype index_t is integer range 0 to NUM_LINES-1;
#subtype way_t is integer range 0 to NUM_WAYS-1;
#subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
#
#-- The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
#-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
#-- not handle a clean (commented) definition of the cache tags as a 3d
#-- memory. For now, work around it by putting all the tags
#subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
# type cache_tags_set_t is array(way_t) of cache_tag_t;
# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
#constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
#subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
#type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

#-- The cache valid bits
#subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
#type cache_valids_t is array(index_t) of cache_way_valids_t;
#type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal() for x in range(ROW_PER_LINE))


#attribute ram_style : string;
#attribute ram_style of cache_tags : signal is "distributed";
# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


#subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
#type tlb_valids_t is array(tlb_index_t) of std_ulogic;
#subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
#type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
#subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
#type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    return Array(Signal() for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE))

def TLBPTEArray():
    return Array(Signal(TLB_PTE_BITS) for x in range(TLB_SIZE))


#-- Cache RAM interface
#type cache_ram_out_t is array(way_t) of cache_row_t;
# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS))

#-- PLRU output interface
#type plru_out_t is array(index_t) of
# std_ulogic_vector(WAY_BITS-1 downto 0);
# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# -- Return the cache line index (tag index) for an address
# function get_index(addr: std_ulogic_vector(63 downto 0))
#     return index_t is
# begin
#     return to_integer(unsigned(
#      addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
#     ));
# end;
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# -- Return the cache row index (data memory) for an address
# function get_row(addr: std_ulogic_vector(63 downto 0))
#     return row_t is
# begin
#     return to_integer(unsigned(
#      addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
#     ));
# end;
# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# -- Return the index of a row within a line
# function get_row_of_line(row: row_t) return row_in_line_t is
#     variable row_v : unsigned(ROW_BITS-1 downto 0);
# begin
#     row_v := to_unsigned(row, ROW_BITS);
#     return row_v(ROW_LINEBITS-1 downto 0);
# end;
# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# -- Returns whether this is the last row of a line
# function is_last_row_addr(addr: wishbone_addr_type;
#     last: row_in_line_t) return boolean is
# begin
#     return unsigned(
#      addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
#     ) = last;
# end;
# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# -- Returns whether this is the last row of a line
# function is_last_row(row: row_t;
#     last: row_in_line_t) return boolean is
# begin
#     return get_row_of_line(row) = last;
# end;
# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# -- Return the address of the next row in the current cache line
# function next_row_addr(addr: wishbone_addr_type)
#     return std_ulogic_vector is
#     variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
#     variable result  : wishbone_addr_type;
# begin
#     -- Is there no simpler way in VHDL to generate that 3 bits adder ?
#     row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
#     row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
#     result := addr;
#     result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
#     return result;
# end;
# Return the address of the next row in the current cache line
def next_row_addr(addr):
    # The successive VHDL ":=" variable assignments take effect
    # immediately, so the second one simply overwrites the first:
    # the net effect is to increment the row-index field of the
    # address.  Rebuild the address with bits
    # [ROW_OFF_BITS:LINE_OFF_BITS] incremented (wrapping within
    # the cache line).
    row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
    return Cat(addr[:ROW_OFF_BITS],
               row_idx[:ROW_LINE_BITS],
               addr[LINE_OFF_BITS:])

# -- Return the next row in the current cache line. We use a dedicated
# -- function in order to limit the size of the generated adder to be
# -- only the bits within a cache line (3 bits with default settings)
# function next_row(row: row_t) return row_t is
#     variable row_v   : std_ulogic_vector(ROW_BITS-1 downto 0);
#     variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
#     variable result  : std_ulogic_vector(ROW_BITS-1 downto 0);
# begin
#     row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
#     row_idx := row_v(ROW_LINEBITS-1 downto 0);
#     row_v(ROW_LINEBITS-1 downto 0) :=
#      std_ulogic_vector(unsigned(row_idx) + 1);
#     return to_integer(unsigned(row_v));
# end;
# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    # as in next_row_addr: only the row-within-line bits are
    # incremented, so the adder stays ROW_LINE_BITS wide
    row_idx = row[:ROW_LINE_BITS] + 1
    return Cat(row_idx[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

# -- Read the instruction word for the given address in the
# -- current cache row
# function read_insn_word(addr: std_ulogic_vector(63 downto 0);
#     data: cache_row_t) return std_ulogic_vector is
#     variable word: integer range 0 to INSN_PER_ROW-1;
# begin
#     word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
#     return data(31+word*32 downto word*32);
# end;
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    # addr(INSN_BITS+2-1 downto 2) in the VHDL, i.e. [2:INSN_BITS+2]
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
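
# (example: with the 64-bit rows used here, INSN_PER_ROW=2 and
# INSN_BITS=1, so addr[2:3] is a single bit choosing between the
# two 32-bit instruction slots of a row)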

# -- Get the tag value from the address
# function get_tag(
#     addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
# ) return cache_tag_t is
# begin
#     return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
# end;
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# -- Read a tag from a tag memory row
# function read_tag(way: way_t; tagset: cache_tags_set_t)
#     return cache_tag_t is
# begin
#     return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
# end;
# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset[way * TAG_BITS:(way + 1) * TAG_BITS]

# -- Write a tag to tag memory row
# procedure write_tag(way: in way_t;
#     tagset: inout cache_tags_set_t; tag: cache_tag_t) is
# begin
#     tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
# end;
# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    # return an assignment for the caller to add to a domain, e.g.
    # "comb += write_tag(way, tagset, tag)": python item-assignment
    # to a slice of a Signal is not supported by nmigen
    return read_tag(way, tagset).eq(tag)

# -- Simple hash for direct-mapped TLB index
# function hash_ea(addr: std_ulogic_vector(63 downto 0))
#     return tlb_index_t is
#     variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
# begin
#     hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
#             xor addr(TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
#                      TLB_LG_PGSZ + TLB_BITS)
#             xor addr(TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
#                      TLB_LG_PGSZ + 2 * TLB_BITS);
#     return to_integer(unsigned(hash));
# end;
# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
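
# (example: with TLB_LG_PGSZ=12 and TLB_BITS=6 this folds addr[12:18],
# addr[18:24] and addr[24:30] together with XOR to form the 6-bit
# TLB index)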

# begin
#
#     assert LINE_SIZE mod ROW_SIZE = 0;
#     assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
#         severity FAILURE;
#     assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
#         severity FAILURE;
#     assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
#         severity FAILURE;
#     assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
#         severity FAILURE;
#     assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
#         report "geometry bits don't add up" severity FAILURE;
#
#     sim_debug: if SIM generate
#     debug: process
#     begin
#         report "ROW_SIZE = " & natural'image(ROW_SIZE);
#         report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
#         report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
#         report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
#         report "INSN_BITS = " & natural'image(INSN_BITS);
#         report "ROW_BITS = " & natural'image(ROW_BITS);
#         report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
#         report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
#         report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
#         report "INDEX_BITS = " & natural'image(INDEX_BITS);
#         report "TAG_BITS = " & natural'image(TAG_BITS);
#         report "WAY_BITS = " & natural'image(WAY_BITS);
#         wait;
#     end process;
#     end generate;
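
# Sanity-check the geometry at import time, mirroring the VHDL asserts
# above (a minimal translation: the power-of-2 checks are implicit in
# log2_int, which raises on a non-power-of-2 argument).
assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ROW_BITS == INDEX_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + INDEX_BITS + LINE_OFF_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + ROW_BITS + ROW_OFF_BITS, \
    "geometry bits don't add up"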

# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2

# type reg_internal_t is record
#     -- Cache hit state (Latches for 1 cycle BRAM access)
#     hit_way   : way_t;
#     hit_nia   : std_ulogic_vector(63 downto 0);
#     hit_smark : std_ulogic;
#     hit_valid : std_ulogic;
#
#     -- Cache miss state (reload state machine)
#     state       : state_t;
#     wb          : wishbone_master_out;
#     store_way   : way_t;
#     store_index : index_t;
#     store_row   : row_t;
#     store_tag   : cache_tag_t;
#     store_valid : std_ulogic;
#     end_row_ix  : row_in_line_t;
#     rows_valid  : row_per_line_valid_t;
#
#     -- TLB miss state
#     fetch_failed : std_ulogic;
# end record;
class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        # way/index/row fields are indices, so they use the log2
        # widths of their VHDL subtypes (way_t, index_t, row_t)
        self.state = Signal(State)
        self.wb = WBMasterOut()
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()

# -- 64 bit direct mapped icache. All instructions are 4B aligned.
#
# entity icache is
#     generic (
#         SIM : boolean := false;
#         -- Line size in bytes
#         LINE_SIZE : positive := 64;
#         -- BRAM organisation: We never access more
#         -- than wishbone_data_bits
#         -- at a time so to save resources we make the
#         -- array only that wide,
#         -- and use consecutive indices for to make a cache "line"
#         --
#         -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
#         -- so 64-bits)
#         ROW_SIZE : positive := wishbone_data_bits / 8;
#         -- Number of lines in a set
#         NUM_LINES : positive := 32;
#         -- Number of ways
#         NUM_WAYS : positive := 4;
#         -- L1 ITLB number of entries (direct mapped)
#         TLB_SIZE : positive := 64;
#         -- L1 ITLB log_2(page_size)
#         TLB_LG_PGSZ : positive := 12;
#         -- Number of real address bits that we store
#         REAL_ADDR_BITS : positive := 56;
#         -- Non-zero to enable log data collection
#         LOG_LENGTH : natural := 0
#     );
#     port (
#         clk : in std_ulogic;
#         rst : in std_ulogic;
#
#         i_in  : in Fetch1ToIcacheType;
#         i_out : out IcacheToDecode1Type;
#
#         m_in  : in MmuToIcacheType;
#
#         stall_in  : in std_ulogic;
#         stall_out : out std_ulogic;
#         flush_in  : in std_ulogic;
#         inval_in  : in std_ulogic;
#
#         wishbone_out : out wishbone_master_out;
#         wishbone_in  : in wishbone_slave_out;
#
#         log_out : out std_ulogic_vector(53 downto 0)
#     );
# end entity icache;
# 64 bit direct mapped icache. All instructions are 4B aligned.
class ICache(Elaboratable):
    """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType()
        self.i_out = ICacheToDecode1Type()

        self.m_in = MMUToICacheType()

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(54)


    # -- Generate a cache RAM for each way
    # rams: for i in 0 to NUM_WAYS-1 generate
    #     signal do_read  : std_ulogic;
    #     signal do_write : std_ulogic;
    #     signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     signal dout     : cache_row_t;
    #     signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
    # begin
    #     way: entity work.cache_ram
    #         generic map (
    #             ROW_BITS => ROW_BITS,
    #             WIDTH => ROW_SIZE_BITS
    #         )
    #         port map (
    #             clk     => clk,
    #             rd_en   => do_read,
    #             rd_addr => rd_addr,
    #             rd_data => dout,
    #             wr_sel  => wr_sel,
    #             wr_addr => wr_addr,
    #             wr_data => wishbone_in.dat
    #         );
    #     process(all)
    #     begin
    #         do_read <= not (stall_in or use_previous);
    #         do_write <= '0';
    #         if wishbone_in.ack = '1' and replace_way = i then
    #             do_write <= '1';
    #         end if;
    #         cache_out(i) <= dout;
    #         rd_addr <=
    #          std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
    #         wr_addr <=
    #          std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
    #         for i in 0 to ROW_SIZE-1 loop
    #             wr_sel(i) <= do_write;
    #         end loop;
    #     end process;
    # end generate;
    def rams(self, m, r, cache_out, use_previous, replace_way, req_row):
        comb = m.d.comb

        wb_in, stall_in = self.wb_in, self.stall_in

        for i in range(NUM_WAYS):
            # one set of signals per way: the VHDL declares these
            # inside the generate loop, and rd_data is an output of
            # the RAM, driving _d_out (not the other way around)
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            _d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
            m.submodules += way
            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(0)

            with m.If(wb_in.ack & (replace_way == i)):
                comb += do_write.eq(1)

            comb += cache_out[i].eq(_d_out)
            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            for j in range(ROW_SIZE):
                comb += wr_sel[j].eq(do_write)

    # -- Generate PLRUs
    # maybe_plrus: if NUM_WAYS > 1 generate
    # begin
    #     plrus: for i in 0 to NUM_LINES-1 generate
    #         -- PLRU interface
    #         signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
    #         signal plru_acc_en : std_ulogic;
    #         signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
    #
    #     begin
    #         plru : entity work.plru
    #             generic map (
    #                 BITS => WAY_BITS
    #             )
    #             port map (
    #                 clk => clk,
    #                 rst => rst,
    #                 acc => plru_acc,
    #                 acc_en => plru_acc_en,
    #                 lru => plru_out
    #             );
    #
    #         process(all)
    #         begin
    #             -- PLRU interface
    #             if get_index(r.hit_nia) = i then
    #                 plru_acc_en <= r.hit_valid;
    #             else
    #                 plru_acc_en <= '0';
    #             end if;
    #             plru_acc <=
    #              std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
    #             plru_victim(i) <= plru_out;
    #         end process;
    #     end generate;
    # end generate;
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        # NUM_WAYS is an elaboration-time constant, so a plain
        # python "if" matches the VHDL generate-if
        if NUM_WAYS > 1:
            for i in range(NUM_LINES):
                plru = PLRU(WAY_BITS)
                m.submodules += plru

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru.acc_en.eq(r.hit_valid)
                with m.Else():
                    comb += plru.acc_en.eq(0)

                comb += plru.acc.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru_o)

    # -- TLB hit detection and real address generation
    # itlb_lookup : process(all)
    #     variable pte  : tlb_pte_t;
    #     variable ttag : tlb_tag_t;
    # begin
    #     tlb_req_index <= hash_ea(i_in.nia);
    #     pte := itlb_ptes(tlb_req_index);
    #     ttag := itlb_tags(tlb_req_index);
    #     if i_in.virt_mode = '1' then
    #         real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
    #                      i_in.nia(TLB_LG_PGSZ - 1 downto 0);
    #         if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
    #             ra_valid <= itlb_valids(tlb_req_index);
    #         else
    #             ra_valid <= '0';
    #         end if;
    #         eaa_priv <= pte(3);
    #     else
    #         real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
    #         ra_valid <= '1';
    #         eaa_priv <= '1';
    #     end if;
    #
    #     -- no IAMR, so no KUEP support for now
    #     priv_fault <= eaa_priv and not i_in.priv_mode;
    #     access_ok <= ra_valid and not priv_fault;
    # end process;
    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                    priv_fault, access_ok):
        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                     i_in.nia[:TLB_LG_PGSZ],
                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
                    ))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
            with m.Else():
                comb += ra_valid.eq(0)

            # privilege bit from the PTE EAA field
            # ("eaa_priv <= pte(3)" in the VHDL above)
            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # -- iTLB update
    # itlb_update: process(clk)
    #     variable wr_index : tlb_index_t;
    # begin
    #     if rising_edge(clk) then
    #         wr_index := hash_ea(m_in.addr);
    #         if rst = '1' or
    #             (m_in.tlbie = '1' and m_in.doall = '1') then
    #             -- clear all valid bits
    #             for i in tlb_index_t loop
    #                 itlb_valids(i) <= '0';
    #             end loop;
    #         elsif m_in.tlbie = '1' then
    #             -- clear entry regardless of hit or miss
    #             itlb_valids(wr_index) <= '0';
    #         elsif m_in.tlbld = '1' then
    #             itlb_tags(wr_index) <=
    #              m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
    #             itlb_ptes(wr_index) <= m_in.pte;
    #             itlb_valids(wr_index) <= '1';
    #         end if;
    #     end if;
    # end process;
    # iTLB update
    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        # (the rst term of the VHDL is covered by nmigen's synchronous
        # reset, which clears all the valid bits)
        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                     m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
                    )
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # -- Cache hit detection, output to fetch2 and other misc logic
    # icache_comb : process(all)
    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valid_bits, cache_tags, access_ok, req_is_hit,
                    req_is_miss, replace_way, plru_victim, cache_out):
        # variable is_hit  : std_ulogic;
        # variable hit_way : way_t;
        comb = m.d.comb

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        # begin
        # -- i_in.sequential means that i_in.nia this cycle
        # -- is 4 more than last cycle. If we read more
        # -- than 32 bits at a time, had a cache hit last
        # -- cycle, and we don't want the first 32-bit chunk
        # -- then we can keep the data we read last cycle
        # -- and just use that.
        # if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
        #     use_previous <= i_in.sequential and r.hit_valid;
        # else
        #     use_previous <= '0';
        # end if;
        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)
        with m.Else():
            comb += use_previous.eq(0)

        # -- Extract line, row and tag from request
        # req_index <= get_index(i_in.nia);
        # req_row <= get_row(i_in.nia);
        # req_tag <= get_tag(real_addr);
        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # -- Calculate address of beginning of cache row, will be
        # -- used for cache miss processing if needed
        # req_laddr <=
        #  (63 downto REAL_ADDR_BITS => '0') &
        #  real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
        #  (ROW_OFF_BITS-1 downto 0 => '0');
        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                 Const(0, 64 - REAL_ADDR_BITS)  # zero-pad to 64 bits
                ))

        # -- Test if pending request is a hit on any way
        # hit_way := 0;
        # is_hit := '0';
        # for i in way_t loop
        #     if i_in.req = '1' and
        #         (cache_valids(req_index)(i) = '1' or
        #          (r.state = WAIT_ACK and
        #           req_index = r.store_index and
        #           i = r.store_way and
        #           r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
        #         if read_tag(i, cache_tags(req_index)) = req_tag then
        #             hit_way := i;
        #             is_hit := '1';
        #         end if;
        #     end if;
        # end loop;
        # Test if pending request is a hit on any way
        for i in range(NUM_WAYS):
            with m.If(i_in.req &
                      (cache_valid_bits[req_index][i] |
                       ((r.state == State.WAIT_ACK)
                        & (req_index == r.store_index)
                        & (i == r.store_way)
                        & r.rows_valid[req_row % ROW_PER_LINE]))):
                with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # -- Generate the "hit" and "miss" signals
        # -- for the synchronous blocks
        # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
        #     and rst = '0' then
        #     req_is_hit  <= is_hit;
        #     req_is_miss <= not is_hit;
        # else
        #     req_is_hit  <= '0';
        #     req_is_miss <= '0';
        # end if;
        # req_hit_way <= hit_way;
        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)
        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        comb += req_hit_way.eq(hit_way)

        # -- The way to replace on a miss
        # if r.state = CLR_TAG then
        #     replace_way <=
        #         to_integer(unsigned(plru_victim(r.store_index)));
        # else
        #     replace_way <= r.store_way;
        # end if;
        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # -- Output instruction from current cache row
        # --
        # -- Note: This is a mild violation of our design principle of
        # -- having pipeline stages output from a clean latch. In this
        # -- case we output the result of a mux. The alternative would
        # -- be output an entire row which I prefer not to do just yet
        # -- as it would force fetch2 to know about some of the cache
        # -- geometry information.
        # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
        # i_out.valid <= r.hit_valid;
        # i_out.nia <= r.hit_nia;
        # i_out.stop_mark <= r.hit_smark;
        # i_out.fetch_failed <= r.fetch_failed;
        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(
                 read_insn_word(r.hit_nia, cache_out[r.hit_way])
                )
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # -- Stall fetch1 if we have a miss on cache or TLB
        # -- or a protection fault
        # stall_out <= not (is_hit and access_ok);
        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # -- Wishbone requests output (from the cache miss reload machine)
        # wishbone_out <= r.wb;
        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)
        # end process;

    # -- Cache hit synchronous machine
    # icache_hit : process(clk)
    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # begin
        # if rising_edge(clk) then
        #     -- keep outputs to fetch2 unchanged on a stall
        #     -- except that flush or reset sets valid to 0
        #     -- If use_previous, keep the same data as last
        #     -- cycle and use the second half
        #     if stall_in = '1' or use_previous = '1' then
        #         if rst = '1' or flush_in = '1' then
        #             r.hit_valid <= '0';
        #         end if;
        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        # else
        #     -- On a hit, latch the request for the next cycle,
        #     -- when the BRAM data will be available on the
        #     -- cache_out output of the corresponding way
        #     r.hit_valid <= req_is_hit;
        #     if req_is_hit = '1' then
        #         r.hit_way <= req_hit_way;
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)

                # report "cache hit nia:" & to_hstring(i_in.nia) &
                #  " IR:" & std_ulogic'image(i_in.virt_mode) &
                #  " SM:" & std_ulogic'image(i_in.stop_mark) &
                #  " idx:" & integer'image(req_index) &
                #  " tag:" & to_hstring(req_tag) &
                #  " way:" & integer'image(req_hit_way) &
                #  " RA:" & to_hstring(real_addr);
                # (Display reports at simulation time; a plain python
                # print here would fire once, at elaboration)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x "
                                "tag:%x way:%x RA:%x",
                                i_in.nia, i_in.virt_mode, i_in.stop_mark,
                                req_index, req_tag, req_hit_way, real_addr)
        #     end if;
        # end if;
        # if stall_in = '0' then
        #     -- Send stop marks and NIA down regardless of validity
        #     r.hit_smark <= i_in.stop_mark;
        #     r.hit_nia <= i_in.nia;
        # end if;
        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)
        # end if;
        # end process;

    # -- Cache miss/reload synchronous machine
    # icache_miss : process(clk)
    # Cache miss/reload synchronous machine
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        # variable tagset    : cache_tags_set_t;
        # variable stbs_done : boolean;

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        # begin
        # if rising_edge(clk) then
        #     -- On reset, clear all valid bits to force misses
        #     if rst = '1' then
        #         for i in index_t loop
        #             cache_valids(i) <= (others => '0');
        #         end loop;
        #         r.state <= IDLE;
        #         r.wb.cyc <= '0';
        #         r.wb.stb <= '0';
        #         -- We only ever do reads on wishbone
        #         r.wb.dat <= (others => '0');
        #         r.wb.sel <= "11111111";
        #         r.wb.we  <= '0';
        # (the rst branch is covered by nmigen's synchronous reset,
        # which clears the valid bits, r.state and the wb strobes)

        # We only ever do reads on wishbone
        comb += r.wb.sel.eq(~0) # set to all 1s

        # -- Not useful normally but helps avoiding
        # -- tons of sim warnings
        # r.wb.adr <= (others => '0');

        # else

        # -- Process cache invalidations
        # if inval_in = '1' then
        #     for i in index_t loop
        #         cache_valids(i) <= (others => '0');
        #     end loop;
        #     r.store_valid <= '0';
        # end if;
        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        # -- Main state machine
        # case r.state is
        # Main state machine
        with m.Switch(r.state):

            # when IDLE =>
            with m.Case(State.IDLE):
                # -- Reset per-row valid flags,
                # -- only used in WAIT_ACK
                # for i in 0 to ROW_PER_LINE - 1 loop
                #     r.rows_valid(i) <= '0';
                # end loop;
                # Reset per-row valid flags,
                # only used in WAIT_ACK
                for i in range(ROW_PER_LINE):
                    sync += r.rows_valid[i].eq(0)

                # -- We need to read a cache line
                # if req_is_miss = '1' then
                #     report "cache miss nia:" & to_hstring(i_in.nia) &
                #      " IR:" & std_ulogic'image(i_in.virt_mode) &
                #      " SM:" & std_ulogic'image(i_in.stop_mark) &
                #      " idx:" & integer'image(req_index) &
                #      " way:" & integer'image(replace_way) &
                #      " tag:" & to_hstring(req_tag) &
                #      " RA:" & to_hstring(real_addr);
                # We need to read a cache line
                with m.If(req_is_miss):
                    sync += Display("cache miss nia:%x IR:%x SM:%x "
                                    "idx:%x way:%x tag:%x RA:%x",
                                    i_in.nia, i_in.virt_mode,
                                    i_in.stop_mark, req_index,
                                    replace_way, req_tag, real_addr)

                    # -- Keep track of our index and way for
                    # -- subsequent stores
                    # r.store_index <= req_index;
                    # r.store_row <= get_row(req_laddr);
                    # r.store_tag <= req_tag;
                    # r.store_valid <= '1';
                    # r.end_row_ix <=
                    #     get_row_of_line(get_row(req_laddr)) - 1;
                    # Keep track of our index and way
                    # for subsequent stores
                    sync += r.store_index.eq(req_index)
                    sync += r.store_row.eq(get_row(req_laddr))
                    sync += r.store_tag.eq(req_tag)
                    sync += r.store_valid.eq(1)
                    sync += r.end_row_ix.eq(
                             get_row_of_line(get_row(req_laddr)) - 1
                            )

                    # -- Prep for first wishbone read. We calculate the
                    # -- address of the start of the cache line and
                    # -- start the WB cycle.
                    # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
                    # r.wb.cyc <= '1';
                    # r.wb.stb <= '1';
                    # Prep for first wishbone read. We calculate the
                    # address of the start of the cache line and
                    # start the WB cycle (req_laddr is wider than
                    # wb.adr: nmigen truncates to the target width)
                    sync += r.wb.adr.eq(req_laddr)
                    sync += r.wb.cyc.eq(1)
                    sync += r.wb.stb.eq(1)

                    # -- Track that we had one request sent
                    # r.state <= CLR_TAG;
                    # Track that we had one request sent
                    sync += r.state.eq(State.CLR_TAG)
                # end if;

            # when CLR_TAG | WAIT_ACK =>
            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                # if r.state = CLR_TAG then
                with m.If(r.state == State.CLR_TAG):
                    # -- Get victim way from plru
                    # r.store_way <= replace_way;
                    # Get victim way from plru
                    sync += r.store_way.eq(replace_way)

                    # -- Force misses on that way while
                    # -- reloading that line
                    # cache_valids(req_index)(replace_way) <= '0';
                    # Force misses on that way while
                    # reloading that line
                    # (a Signal cannot be indexed by another Signal,
                    # so clear the bit via bit_select on a copy)
                    cv_clr = Signal(NUM_WAYS)
                    comb += cv_clr.eq(cache_valid_bits[req_index])
                    comb += cv_clr.bit_select(replace_way, 1).eq(0)
                    sync += cache_valid_bits[req_index].eq(cv_clr)

                    # -- Store new tag in selected way
                    # for i in 0 to NUM_WAYS-1 loop
                    #     if i = replace_way then
                    #         tagset := cache_tags(r.store_index);
                    #         write_tag(i, tagset, r.store_tag);
                    #         cache_tags(r.store_index) <= tagset;
                    #     end if;
                    # end loop;
                    for i in range(NUM_WAYS):
                        with m.If(i == replace_way):
                            comb += tagset.eq(cache_tags[r.store_index])
                            comb += write_tag(i, tagset, r.store_tag)
                            sync += cache_tags[r.store_index].eq(tagset)

                    # r.state <= WAIT_ACK;
                    sync += r.state.eq(State.WAIT_ACK)
                # end if;

                # -- Requests are all sent if stb is 0
                # stbs_done := r.wb.stb = '0';
                # Requests are all sent if stb is 0
                comb += stbs_done.eq(r.wb.stb == 0)

                # -- If we are still sending requests,
                # -- was one accepted ?
                # if wishbone_in.stall = '0' and not stbs_done then
                # If we are still sending requests,
                # was one accepted?
                with m.If(~wb_in.stall & ~stbs_done):
                    # -- That was the last word ? We are done sending.
                    # -- Clear stb and set stbs_done so we can handle
                    # -- an eventual last ack on the same cycle.
                    # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
                    #     r.wb.stb <= '0';
                    #     stbs_done := true;
                    # end if;
                    # That was the last word? We are done sending.
                    # Clear stb and set stbs_done so we can handle
                    # an eventual last ack on the same cycle.
                    with m.If(is_last_row_addr(r.wb.adr, r.end_row_ix)):
                        sync += r.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                    # -- Calculate the next row address
                    # r.wb.adr <= next_row_addr(r.wb.adr);
                    # Calculate the next row address
                    sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
                # end if;

                # -- Incoming acks processing
                # if wishbone_in.ack = '1' then
                # Incoming acks processing
                with m.If(wb_in.ack):
                    # r.rows_valid(r.store_row mod ROW_PER_LINE)
                    #     <= '1';
                    sync += r.rows_valid[
                             r.store_row % ROW_PER_LINE
                            ].eq(1)

                    # -- Check for completion
                    # if stbs_done and
                    #     is_last_row(r.store_row, r.end_row_ix) then
                    # Check for completion
                    with m.If(stbs_done &
                              is_last_row(r.store_row, r.end_row_ix)):
                        # -- Complete wishbone cycle
                        # r.wb.cyc <= '0';
                        # Complete wishbone cycle
                        sync += r.wb.cyc.eq(0)

                        # -- Cache line is now valid
                        # cache_valids(r.store_index)(replace_way) <=
                        #     r.store_valid and not inval_in;
                        # Cache line is now valid
                        cv_set = Signal(NUM_WAYS)
                        comb += cv_set.eq(cache_valid_bits[r.store_index])
                        comb += cv_set.bit_select(replace_way, 1).eq(
                                 r.store_valid & ~inval_in
                                )
                        sync += cache_valid_bits[r.store_index].eq(cv_set)

                        # -- We are done
                        # r.state <= IDLE;
                        # We are done
                        sync += r.state.eq(State.IDLE)
                    # end if;

                    # -- Increment store row counter
                    # r.store_row <= next_row(r.store_row);
                    # Increment store row counter
                    sync += r.store_row.eq(next_row(r.store_row))
                # end if;
            # end case;
        # end if;

        # -- TLB miss and protection fault processing
        # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
        #     r.fetch_failed <= '0';
        # elsif i_in.req = '1' and access_ok = '0' and
        #     stall_in = '0' then
        #     r.fetch_failed <= '1';
        # end if;
        # TLB miss and protection fault processing
        # (the rst term is handled by nmigen's synchronous reset)
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)
        # end if;
        # end process;

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # -- Output data to logger
        # signal log_data : std_ulogic_vector(53 downto 0);
        # begin
        # data_log: process(clk)
        #     variable lway: way_t;
        #     variable wstate: std_ulogic;
        # Output data to logger
        # (LOG_LENGTH is an elaboration-time constant, so a plain
        # python "if" matches the VHDL generate-if)
        if LOG_LENGTH > 0:
            log_data = Signal(54)
            lway = Signal(3)  # packed as 3 bits in the log, as in the VHDL
            wstate = Signal()

            # begin
            # if rising_edge(clk) then
            #     lway := req_hit_way;
            #     wstate := '0';
            comb += lway.eq(req_hit_way)
            comb += wstate.eq(0)

            # if r.state /= IDLE then
            #     wstate := '1';
            # end if;
            # (wstate is a VHDL variable, so both assignments belong
            # to the comb domain)
            with m.If(r.state != State.IDLE):
                comb += wstate.eq(1)

            # log_data <= i_out.valid &
            #             i_out.insn &
            #             wishbone_in.ack &
            #             r.wb.adr(5 downto 3) &
            #             r.wb.stb & r.wb.cyc &
            #             wishbone_in.stall &
            #             stall_out &
            #             r.fetch_failed &
            #             r.hit_nia(5 downto 2) &
            #             wstate &
            #             std_ulogic_vector(to_unsigned(lway, 3)) &
            #             req_is_hit & req_is_miss &
            #             access_ok &
            #             ra_valid;
            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6],
                     r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
                     r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
                     i_out.valid
                    ))
            # end if;
            # end process;
            # log_out <= log_data;
            comb += log_out.eq(log_data)
        # end generate;
    # end;

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # signal itlb_valids : tlb_valids_t;
        # signal itlb_tags : tlb_tags_t;
        # signal itlb_ptes : tlb_ptes_t;
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPTEArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # -- Privilege bit from PTE EAA field
        # signal eaa_priv : std_ulogic;
        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # signal r : reg_internal_t;
        r = RegInternal()

        # -- Async signals on incoming request
        # signal req_index : index_t;
        # signal req_row : row_t;
        # signal req_hit_way : way_t;
        # signal req_tag : cache_tag_t;
        # signal req_is_hit : std_ulogic;
        # signal req_is_miss : std_ulogic;
        # signal req_laddr : std_ulogic_vector(63 downto 0);
        # Async signals on incoming request
        # (indices use the log2 widths of their VHDL subtypes)
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # signal tlb_req_index : tlb_index_t;
        # signal real_addr : std_ulogic_vector(
        #  REAL_ADDR_BITS - 1 downto 0
        # );
        # signal ra_valid : std_ulogic;
        # signal priv_fault : std_ulogic;
        # signal access_ok : std_ulogic;
        # signal use_previous : std_ulogic;
        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # signal cache_out : cache_ram_out_t;
        cache_out = CacheRamOut()

        # signal plru_victim : plru_out_t;
        # signal replace_way : way_t;
        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.rams(m, r, cache_out, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
                         real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                         priv_fault, access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row,
                         req_hit_way, req_tag, real_addr, req_laddr,
                         cache_valid_bits, cache_tags, access_ok,
                         req_is_hit, req_is_miss, replace_way,
                         plru_victim, cache_out)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


# icache_tb.vhdl
#
# library ieee;
# use ieee.std_logic_1164.all;
#
# library work;
# use work.common.all;
# use work.wishbone_types.all;
#
# entity icache_tb is
# end icache_tb;
#
# architecture behave of icache_tb is
#     signal clk : std_ulogic;
#     signal rst : std_ulogic;
#
#     signal i_out : Fetch1ToIcacheType;
#     signal i_in  : IcacheToDecode1Type;
#
#     signal m_out : MmuToIcacheType;
#
#     signal wb_bram_in  : wishbone_master_out;
#     signal wb_bram_out : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     icache0: entity work.icache
#         generic map(
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#         )
#         port map(
#             clk => clk,
#             rst => rst,
#             i_in => i_out,
#             i_out => i_in,
#             m_in => m_out,
#             stall_in => '0',
#             flush_in => '0',
#             inval_in => '0',
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#         );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#         )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#         );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         i_out.req <= '0';
#         i_out.nia <= (others => '0');
#         i_out.stop_mark <= '0';
#
#         m_out.tlbld <= '0';
#         m_out.tlbie <= '0';
#         m_out.addr <= (others => '0');
#         m_out.pte <= (others => '0');
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000004";
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000001"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000001"
#             severity failure;
#
#         i_out.req <= '0';
#
#         wait until rising_edge(clk);
#
#         -- hit
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000008";
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000002"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000002"
#             severity failure;
#         wait until rising_edge(clk);
#
#         -- another miss
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000040";
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000010"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000010"
#             severity failure;
#
#         -- test something that aliases
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000100";
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         assert i_in.valid = '0' severity failure;
#         wait until rising_edge(clk);
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000040"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000040"
#             severity failure;
#
#         i_out.req <= '0';
#
#         std.env.finish;
#     end process;
# end;
def icache_sim(dut):
    # testbench-side names, matching the VHDL testbench above:
    # i_out drives the cache's fetch input (dut.i_in), i_in observes
    # its decode output (dut.i_out), m_out drives the MMU input
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    # signal values must be read back with "yield sig" in a
    # generator-based simulation
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases: should miss first
    # (the VHDL checks i_in.valid = '0' here)
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)


def test_icache():
    dut = ICache()

    # The VHDL testbench attaches a BRAM slave initialised from
    # "icache_test.bin"; here we attach a wishbone SRAM filled so that
    # the 32-bit word at byte address A reads back as A>>2, which is
    # what icache_sim expects (@0x04 = 1, @0x08 = 2, @0x40 = 0x10, ...)
    mem = []
    for i in range(512):
        mem.append((2*i + 1) << 32 | (2*i))
    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()
    m.submodules.icache = dut
    m.submodules.sram = sram

    # wire the icache master port to the sram slave (wb_out.adr is a
    # byte address, and the nmigen-soc SRAM is word-addressed, hence
    # the [3:] slice)
    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr[3:])
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    test_icache()