3 based on Anton Blanchard microwatt icache.vhdl
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
22 from enum
import Enum
, unique
23 from nmigen
import (Module
, Signal
, Elaboratable
, Cat
, Array
, Const
)
24 from nmigen
.cli
import main
, rtlil
25 from nmutil
.iocontrol
import RecordObject
26 from nmigen
.utils
import log2_int
27 from nmutil
.util
import Display
29 #from nmutil.plru import PLRU
30 from soc
.experiment
.cache_ram
import CacheRam
31 from soc
.experiment
.plru
import PLRU
33 from soc
.experiment
.mem_types
import (Fetch1ToICacheType
,
37 from soc
.experiment
.wb_types
import (WB_ADDR_BITS
, WB_DATA_BITS
,
38 WB_SEL_BITS
, WBAddrType
, WBDataType
,
39 WBSelType
, WBMasterOut
, WBSlaveOut
,
40 WBMasterOutVector
, WBSlaveOutVector
,
41 WBIOMasterOut
, WBIOSlaveOut
)
44 from nmigen_soc
.wishbone
.sram
import SRAM
45 from nmigen
import Memory
46 from nmutil
.util
import wrap
47 from nmigen
.cli
import main
, rtlil
49 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
51 from nmigen
.sim
.cxxsim
import Simulator
, Delay
, Settle
56 # BRAM organisation: We never access more than wishbone_data_bits
57 # at a time so to save resources we make the array only that wide,
58 # and use consecutive indices to make a cache "line"
60 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
61 ROW_SIZE
= WB_DATA_BITS
// 8
62 # Number of lines in a set
66 # L1 ITLB number of entries (direct mapped)
68 # L1 ITLB log_2(page_size)
70 # Number of real address bits that we store
72 # Non-zero to enable log data collection
# Width of one BRAM row, expressed in bits instead of bytes
ROW_SIZE_BITS = 8 * ROW_SIZE
# Number of rows (i.e. wishbone transactions)
# that together form one cache line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# Number of BRAM rows required to
# hold the complete icache
BRAM_ROWS = ROW_PER_LINE * NUM_LINES
# Number of 32bit instructions
# stored in a single BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Dump the derived geometry when the module is loaded
print("ROW_SIZE", ROW_SIZE)
print("ROW_SIZE_BITS", ROW_SIZE_BITS)
print("ROW_PER_LINE", ROW_PER_LINE)
print("BRAM_ROWS", BRAM_ROWS)
print("INSN_PER_ROW", INSN_PER_ROW)
# Bit fields counts in the address

# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to
# select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to the next multiple of 8)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
# Width of one tag RAM row: the tags of all ways side by side
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# constant TLB_BITS : natural := log2(TLB_SIZE);
# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
# constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
# Value taken from the VHDL constant quoted above; it is printed below
# and used to size the PTE signals, so it has to be defined here.
TLB_PTE_BITS = 64

print("INSN_BITS", INSN_BITS)
print("ROW_BITS", ROW_BITS)
print("ROW_LINE_BITS", ROW_LINE_BITS)
print("LINE_OFF_BITS", LINE_OFF_BITS)
print("ROW_OFF_BITS", ROW_OFF_BITS)
print("INDEX_BITS", INDEX_BITS)
print("SET_SIZE_BITS", SET_SIZE_BITS)
print("TAG_BITS", TAG_BITS)
print("WAY_BITS", WAY_BITS)
print("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print("TLB_BITS", TLB_BITS)
print("TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
print("TLB_PTE_BITS", TLB_PTE_BITS)
)
152 # architecture rtl of icache is
153 #constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
154 #-- ROW_PER_LINE is the number of row (wishbone
155 #-- transactions) in a line
156 #constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
157 #-- BRAM_ROWS is the number of rows in BRAM
158 #-- needed to represent the full
160 #constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
161 #-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
162 #constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
163 #-- Bit fields counts in the address
165 #-- INSN_BITS is the number of bits to select
166 #-- an instruction in a row
167 #constant INSN_BITS : natural := log2(INSN_PER_ROW);
168 #-- ROW_BITS is the number of bits to select a row
169 #constant ROW_BITS : natural := log2(BRAM_ROWS);
170 #-- ROW_LINEBITS is the number of bits to
171 #-- select a row within a line
172 #constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
173 #-- LINE_OFF_BITS is the number of bits for the offset
175 #constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
176 #-- ROW_OFF_BITS is the number of bits for the offset in a row
177 #constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
178 #-- INDEX_BITS is the number of bits to select a cache line
179 #constant INDEX_BITS : natural := log2(NUM_LINES);
180 #-- SET_SIZE_BITS is the log base 2 of the set size
181 #constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
182 #-- TAG_BITS is the number of bits of the tag part of the address
183 #constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
184 #-- WAY_BITS is the number of bits to select a way
185 #constant WAY_BITS : natural := log2(NUM_WAYS);
187 #-- Example of layout for 32 lines of 64 bytes:
189 #-- .. tag |index| line |
191 #-- .. | | | |00| zero (2)
192 #-- .. | | |-| | INSN_BITS (1)
193 #-- .. | |---| | ROW_LINEBITS (3)
194 #-- .. | |--- - --| LINE_OFF_BITS (6)
195 #-- .. | |- --| ROW_OFF_BITS (3)
196 #-- .. |----- ---| | ROW_BITS (8)
197 #-- .. |-----| | INDEX_BITS (5)
198 #-- .. --------| | TAG_BITS (53)
199 # Example of layout for 32 lines of 64 bytes:
201 # .. tag |index| line |
203 # .. | | | |00| zero (2)
204 # .. | | |-| | INSN_BITS (1)
205 # .. | |---| | ROW_LINEBITS (3)
206 # .. | |--- - --| LINE_OFF_BITS (6)
207 # .. | |- --| ROW_OFF_BITS (3)
208 # .. |----- ---| | ROW_BITS (8)
209 # .. |-----| | INDEX_BITS (5)
210 # .. --------| | TAG_BITS (53)
212 #subtype row_t is integer range 0 to BRAM_ROWS-1;
213 #subtype index_t is integer range 0 to NUM_LINES-1;
214 #subtype way_t is integer range 0 to NUM_WAYS-1;
215 #subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
217 #-- The cache data BRAM organized as described above for each way
218 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
220 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
221 #-- not handle a clean (commented) definition of the cache tags as a 3d
222 #-- memory. For now, work around it by putting all the tags
223 #subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
224 # type cache_tags_set_t is array(way_t) of cache_tag_t;
225 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
226 #constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
227 #subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
228 #type cache_tags_array_t is array(index_t) of cache_tags_set_t;
230 return Array(Signal(TAG_RAM_WIDTH
, name
="cachetag_%d" %x) \
231 for x
in range(NUM_LINES
))
233 #-- The cache valid bits
234 #subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
235 #type cache_valids_t is array(index_t) of cache_way_valids_t;
236 #type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    """Create the cache valid bits.

    Returns an Array with one entry per cache line (NUM_LINES entries);
    each entry is a NUM_WAYS-bit Signal named "cachevalid_<line>".
    """
    valids = []
    for line in range(NUM_LINES):
        valids.append(Signal(NUM_WAYS, name="cachevalid_%d" % line))
    return Array(valids)
def RowPerLineValidArray():
    """Create the per-row valid flags of a single cache line.

    Returns an Array of ROW_PER_LINE single-bit Signals named
    "rows_valid_<row>".
    """
    flags = []
    for row in range(ROW_PER_LINE):
        flags.append(Signal(name="rows_valid_%d" % row))
    return Array(flags)
246 #attribute ram_style : string;
247 #attribute ram_style of cache_tags : signal is "distributed";
248 # TODO to be passed to nmigen as ram attributes
249 # attribute ram_style : string;
250 # attribute ram_style of cache_tags : signal is "distributed";
253 #subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
254 #type tlb_valids_t is array(tlb_index_t) of std_ulogic;
255 #subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
256 #type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
257 #subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
258 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    """Create the TLB valid bits.

    Returns an Array of TLB_SIZE single-bit Signals named
    "tlbvalid_<entry>".
    """
    valids = []
    for entry in range(TLB_SIZE):
        valids.append(Signal(name="tlbvalid_%d" % entry))
    return Array(valids)
264 return Array(Signal(TLB_EA_TAG_BITS
, name
="tlbtag_%d" %x) \
265 for x
in range(TLB_SIZE
))
268 return Array(Signal(TLB_PTE_BITS
, name
="tlbptes_%d" %x) \
269 for x
in range(TLB_SIZE
))
272 #-- Cache RAM interface
273 #type cache_ram_out_t is array(way_t) of cache_row_t;
274 # Cache RAM interface
276 return Array(Signal(ROW_SIZE_BITS
, name
="cache_out_%d" %x) \
277 for x
in range(NUM_WAYS
))
279 #-- PLRU output interface
280 #type plru_out_t is array(index_t) of
281 # std_ulogic_vector(WAY_BITS-1 downto 0);
282 # PLRU output interface
284 return Array(Signal(WAY_BITS
, name
="plru_out_%d" %x) \
285 for x
in range(NUM_LINES
))
287 # -- Return the cache line index (tag index) for an address
288 # function get_index(addr: std_ulogic_vector(63 downto 0))
291 # return to_integer(unsigned(
292 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
# Return the cache line index (tag index) for an address
def get_index(addr):
    """Return the cache line index (tag index) for an address.

    addr: address value; bits LINE_OFF_BITS up to (but excluding)
          SET_SIZE_BITS are returned as the index.
    """
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]
299 # -- Return the cache row index (data memory) for an address
300 # function get_row(addr: std_ulogic_vector(63 downto 0))
303 # return to_integer(unsigned(
304 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
# Return the cache row index (data memory) for an address
def get_row(addr):
    """Return the cache row index (data memory) for an address.

    addr: address value; bits ROW_OFF_BITS up to (but excluding)
          SET_SIZE_BITS are returned as the row number.
    """
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]
311 # -- Return the index of a row within a line
312 # function get_row_of_line(row: row_t) return row_in_line_t is
313 # variable row_v : unsigned(ROW_BITS-1 downto 0);
315 # row_v := to_unsigned(row, ROW_BITS);
316 # return row_v(ROW_LINEBITS-1 downto 0);
318 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of a row within its cache line.

    row: row number; its low ROW_LINE_BITS bits are returned.
    """
    row_in_line = row[0:ROW_LINE_BITS]
    return row_in_line
322 # -- Returns whether this is the last row of a line
323 # function is_last_row_addr(addr: wishbone_addr_type;
324 # last: row_in_line_t
329 # addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
332 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """Tell whether an address points at the last row of a line.

    addr: address whose row-in-line field (bits ROW_OFF_BITS up to,
          but excluding, LINE_OFF_BITS) is examined
    last: row-in-line index to compare against
    Returns the result of comparing the two with ==.
    """
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
336 # -- Returns whether this is the last row of a line
337 # function is_last_row(row: row_t;
338 # last: row_in_line_t) return boolean is
340 # return get_row_of_line(row) = last;
342 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """Tell whether a row number is the last row of its line.

    row:  row number
    last: row-in-line index to compare against
    Returns the result of comparing get_row_of_line(row) with last.
    """
    row_in_line = get_row_of_line(row)
    return row_in_line == last
346 # -- Return the next row in the current cache line. We use a dedicated
347 # -- function in order to limit the size of the generated adder to be
348 # -- only the bits within a cache line (3 bits with default settings)
349 # function next_row(row: row_t) return row_t is
350 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
351 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
352 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
354 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
355 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
356 # row_v(ROW_LINEBITS-1 downto 0) :=
357 # std_ulogic_vector(unsigned(row_idx) + 1);
358 # return to_integer(unsigned(row_v));
# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    """Return the next row number, wrapping inside the current line.

    Only the low ROW_LINE_BITS bits of `row` are incremented; the
    remaining upper bits are passed through unchanged, so the result
    never leaves the cache line that `row` belongs to.
    """
    row_v = row[0:ROW_LINE_BITS] + 1
    # Slicing row_v again discards the carry bit produced by the addition
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
366 # -- Read the instruction word for the given address in the
367 # -- current cache row
368 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
369 # data: cache_row_t) return std_ulogic_vector is
370 # variable word: integer range 0 to INSN_PER_ROW-1;
372 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
373 # return data(31+word*32 downto word*32);
375 # Read the instruction word for the given address
376 # in the current cache row
def read_insn_word(addr, data):
    """Read the instruction word for the given address
    out of the current cache row.

    addr: address; bits 2 up to (but excluding) INSN_BITS+2 select
          the word inside the row
    data: contents of the cache row
    Returns the selected 32-bit slice of data.
    """
    word_index = addr[2:INSN_BITS + 2]
    insn = data.word_select(word_index, 32)
    return insn
381 # -- Get the tag value from the address
383 # addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
385 # return cache_tag_t is
387 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
# Get the tag value from the address
def get_tag(addr):
    """Get the tag value from the address.

    addr: real address; bits SET_SIZE_BITS up to (but excluding)
          REAL_ADDR_BITS are returned as the tag.
    """
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
393 # -- Read a tag from a tag memory row
394 # function read_tag(way: way_t; tagset: cache_tags_set_t)
395 # return cache_tag_t is
397 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
399 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Read a tag from a tag memory row.

    way:    which way's tag to extract
    tagset: tag memory row holding the tags of all ways
    Returns the TAG_BITS-wide slice of tagset belonging to `way`.
    """
    way_tag = tagset.word_select(way, TAG_BITS)
    return way_tag
403 # -- Write a tag to tag memory row
404 # procedure write_tag(way: in way_t;
405 # tagset: inout cache_tags_set_t; tag: cache_tag_t) is
407 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
409 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Write a tag to a tag memory row.

    way:    which way's tag to overwrite
    tagset: tag memory row holding the tags of all ways
    tag:    new tag value
    Returns the assignment produced by .eq(); the caller is expected
    to add it to a domain.
    """
    target = read_tag(way, tagset)
    return target.eq(tag)
413 # -- Simple hash for direct-mapped TLB index
414 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
415 # return tlb_index_t is
416 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
418 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
420 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
421 # TLB_LG_PGSZ + TLB_BITS
424 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
425 # TLB_LG_PGSZ + 2 * TLB_BITS
427 # return to_integer(unsigned(hash));
# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    """Simple hash for direct-mapped TLB index.

    XORs together three consecutive TLB_BITS-wide fields of `addr`,
    the first one starting at bit TLB_LG_PGSZ (i.e. just above the
    page offset), and returns the result.
    """
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS]
           ^ addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS]
           ^ addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
440 # assert LINE_SIZE mod ROW_SIZE = 0;
441 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
443 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
445 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
447 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
449 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
450 # report "geometry bits don't add up" severity FAILURE;
451 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
452 # report "geometry bits don't add up" severity FAILURE;
453 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
454 # report "geometry bits don't add up" severity FAILURE;
455 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
456 # report "geometry bits don't add up" severity FAILURE;
458 # sim_debug: if SIM generate
461 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
462 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
463 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
464 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
465 # report "INSN_BITS = " & natural'image(INSN_BITS);
466 # report "ROW_BITS = " & natural'image(ROW_BITS);
467 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
468 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
469 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
470 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
471 # report "TAG_BITS = " & natural'image(TAG_BITS);
472 # report "WAY_BITS = " & natural'image(WAY_BITS);
477 # Cache reload state machine
484 # type reg_internal_t is record
485 # -- Cache hit state (Latches for 1 cycle BRAM access)
487 # hit_nia : std_ulogic_vector(63 downto 0);
488 # hit_smark : std_ulogic;
489 # hit_valid : std_ulogic;
491 # -- Cache miss state (reload state machine)
493 # wb : wishbone_master_out;
495 # store_index : index_t;
497 # store_tag : cache_tag_t;
498 # store_valid : std_ulogic;
499 # end_row_ix : row_in_line_t;
500 # rows_valid : row_per_line_valid_t;
503 # fetch_failed : std_ulogic;
class RegInternal(RecordObject):
    """Registered internal state of the icache.

    Mirrors the VHDL `reg_internal_t` record: the latched cache-hit
    information plus the state of the cache-miss reload machine.

    Signal() takes a width in bits, so the fields that hold a way
    number, a line index or a row number are sized with WAY_BITS,
    INDEX_BITS and ROW_BITS (the VHDL types are integer ranges
    0..NUM_WAYS-1, 0..NUM_LINES-1 and 0..BRAM_ROWS-1).
    """
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way      = Signal(WAY_BITS)
        self.hit_nia      = Signal(64)
        self.hit_smark    = Signal()
        self.hit_valid    = Signal()

        # Cache miss state (reload state machine)
        self.state        = Signal(State, reset=State.IDLE)
        self.wb           = WBMasterOut("wb")
        self.req_adr      = Signal(64)
        self.store_way    = Signal(WAY_BITS)
        self.store_index  = Signal(INDEX_BITS)
        self.store_row    = Signal(ROW_BITS)
        self.store_tag    = Signal(TAG_BITS)
        self.store_valid  = Signal()
        self.end_row_ix   = Signal(ROW_LINE_BITS)
        self.rows_valid   = RowPerLineValidArray()

        # Fetch failure flag (fetch_failed in the VHDL record)
        self.fetch_failed = Signal()
529 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
533 # SIM : boolean := false;
534 # -- Line size in bytes
535 # LINE_SIZE : positive := 64;
536 # -- BRAM organisation: We never access more
537 # -- than wishbone_data_bits
538 # -- at a time so to save resources we make the
539 # -- array only that wide,
540 # -- and use consecutive indices for to make a cache "line"
542 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
544 # ROW_SIZE : positive := wishbone_data_bits / 8;
545 # -- Number of lines in a set
546 # NUM_LINES : positive := 32;
548 # NUM_WAYS : positive := 4;
549 # -- L1 ITLB number of entries (direct mapped)
550 # TLB_SIZE : positive := 64;
551 # -- L1 ITLB log_2(page_size)
552 # TLB_LG_PGSZ : positive := 12;
553 # -- Number of real address bits that we store
554 # REAL_ADDR_BITS : positive := 56;
555 # -- Non-zero to enable log data collection
556 # LOG_LENGTH : natural := 0
559 # clk : in std_ulogic;
560 # rst : in std_ulogic;
562 # i_in : in Fetch1ToIcacheType;
563 # i_out : out IcacheToDecode1Type;
565 # m_in : in MmuToIcacheType;
567 # stall_in : in std_ulogic;
568 # stall_out : out std_ulogic;
569 # flush_in : in std_ulogic;
570 # inval_in : in std_ulogic;
572 # wishbone_out : out wishbone_master_out;
573 # wishbone_in : in wishbone_slave_out;
575 # log_out : out std_ulogic_vector(53 downto 0)
578 # 64 bit direct mapped icache. All instructions are 4B aligned.
579 class ICache(Elaboratable
):
580 """64 bit direct mapped icache. All instructions are 4B aligned."""
582 self
.i_in
= Fetch1ToICacheType(name
="i_in")
583 self
.i_out
= ICacheToDecode1Type(name
="i_out")
585 self
.m_in
= MMUToICacheType(name
="m_in")
587 self
.stall_in
= Signal()
588 self
.stall_out
= Signal()
589 self
.flush_in
= Signal()
590 self
.inval_in
= Signal()
592 self
.wb_out
= WBMasterOut(name
="wb_out")
593 self
.wb_in
= WBSlaveOut(name
="wb_in")
595 self
.log_out
= Signal(54)
598 # -- Generate a cache RAM for each way
599 # rams: for i in 0 to NUM_WAYS-1 generate
600 # signal do_read : std_ulogic;
601 # signal do_write : std_ulogic;
602 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
603 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
604 # signal dout : cache_row_t;
605 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
607 # way: entity work.cache_ram
609 # ROW_BITS => ROW_BITS,
610 # WIDTH => ROW_SIZE_BITS
615 # rd_addr => rd_addr,
618 # wr_addr => wr_addr,
619 # wr_data => wishbone_in.dat
623 # do_read <= not (stall_in or use_previous);
625 # if wishbone_in.ack = '1' and replace_way = i then
628 # cache_out(i) <= dout;
630 # std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
632 # std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
633 # for i in 0 to ROW_SIZE-1 loop
634 # wr_sel(i) <= do_write;
def rams(self, m, r, cache_out_row, use_previous, replace_way, req_row):
    """Generate a cache RAM for each way.

    m:             Module the RAMs and their glue logic are added to
    r:             internal register record (hit_way, store_row are read)
    cache_out_row: driven with the read data of the way equal to r.hit_way
    use_previous:  when set, no new read is started
    replace_way:   way that receives wishbone data when wb_in.ack is set
    req_row:       row number used as read address for every way
    """
    comb = m.d.comb

    wb_in, stall_in = self.wb_in, self.stall_in

    for i in range(NUM_WAYS):
        do_read  = Signal(name="do_rd_%d" % i)
        do_write = Signal(name="do_wr_%d" % i)
        rd_addr  = Signal(ROW_BITS)
        wr_addr  = Signal(ROW_BITS)
        d_out    = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
        wr_sel   = Signal(ROW_SIZE)

        # NOTE(review): the meaning of the third argument (True) is
        # defined by CacheRam and is not visible from this file.
        way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
        setattr(m.submodules, "cacheram_%d" % i, way)

        # Hook the local control signals up to the RAM ports
        comb += way.rd_en.eq(do_read)
        comb += way.rd_addr.eq(rd_addr)
        comb += d_out.eq(way.rd_data_o)
        comb += way.wr_sel.eq(wr_sel)
        comb += way.wr_addr.eq(wr_addr)
        comb += way.wr_data.eq(wb_in.dat)

        # Read unless stalled or re-using last cycle's row
        comb += do_read.eq(~(stall_in | use_previous))

        # Only the way being replaced takes the incoming wishbone data
        with m.If(wb_in.ack & (replace_way == i)):
            comb += do_write.eq(1)

        # Forward the data of the way that hit
        with m.If(r.hit_way == i):
            comb += cache_out_row.eq(d_out)
        # NOTE(review): addresses and write select are taken to be driven
        # for every way regardless of the hit way, as in the VHDL
        # reference - confirm against the unmangled source.
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)
        for j in range(ROW_SIZE):
            comb += wr_sel[j].eq(do_write)
675 # maybe_plrus: if NUM_WAYS > 1 generate
677 # plrus: for i in 0 to NUM_LINES-1 generate
679 # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
680 # signal plru_acc_en : std_ulogic;
681 # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
684 # plru : entity work.plru
692 # acc_en => plru_acc_en,
699 # if get_index(r.hit_nia) = i then
700 # plru_acc_en <= r.hit_valid;
702 # plru_acc_en <= '0';
705 # std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
706 # plru_victim(i) <= plru_out;
def maybe_plrus(self, m, r, plru_victim):
    """Generate one PLRU per cache line when there is more than one way.

    m:           Module the PLRU submodules and their wiring are added to
    r:           internal register record (hit_nia, hit_valid, hit_way
                 are read)
    plru_victim: array indexed by line; entry i is driven with the
                 lru_o output of the PLRU for line i
    """
    comb = m.d.comb

    # NUM_WAYS is a Python constant, so this is an elaboration-time
    # decision (the VHDL uses "if NUM_WAYS > 1 generate"): with a single
    # way no PLRU hardware is created at all.
    if NUM_WAYS <= 1:
        return

    for i in range(NUM_LINES):
        plru = PLRU(WAY_BITS)
        setattr(m.submodules, "plru_%d" % i, plru)

        # Only the PLRU of the line that was hit is updated; acc_en is
        # left undriven otherwise.
        # NOTE(review): assumes PLRU.acc_en resets to 0 - confirm in plru.py
        with m.If(get_index(r.hit_nia) == i):
            comb += plru.acc_en.eq(r.hit_valid)

        comb += plru.acc_i.eq(r.hit_way)
        comb += plru_victim[i].eq(plru.lru_o)
730 # -- TLB hit detection and real address generation
731 # itlb_lookup : process(all)
732 # variable pte : tlb_pte_t;
733 # variable ttag : tlb_tag_t;
735 # tlb_req_index <= hash_ea(i_in.nia);
736 # pte := itlb_ptes(tlb_req_index);
737 # ttag := itlb_tags(tlb_req_index);
738 # if i_in.virt_mode = '1' then
739 # real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
740 # i_in.nia(TLB_LG_PGSZ - 1 downto 0);
741 # if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
742 # ra_valid <= itlb_valids(tlb_req_index);
746 # eaa_priv <= pte(3);
748 # real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
753 # -- no IAMR, so no KUEP support for now
754 # priv_fault <= eaa_priv and not i_in.priv_mode;
755 # access_ok <= ra_valid and not priv_fault;
757 # TLB hit detection and real address generation
def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                priv_fault, access_ok):
    """TLB hit detection and real address generation.

    m:               Module the logic is added to
    tlb_req_index:   driven with hash_ea(i_in.nia)
    itlb_ptes:       array of PTEs, indexed by tlb_req_index
    itlb_tags:       array of TLB tags, indexed by tlb_req_index
    real_addr:       driven with the translated (or pass-through) address
    itlb_valid_bits: array of TLB valid bits, indexed by tlb_req_index
    ra_valid:        driven high when real_addr is valid
    eaa_priv:        driven with PTE bit 3, or 1 when not in virtual mode
    priv_fault:      driven with eaa_priv & ~i_in.priv_mode
    access_ok:       driven with ra_valid & ~priv_fault
    """
    comb = m.d.comb
    i_in = self.i_in

    pte  = Signal(TLB_PTE_BITS)
    ttag = Signal(TLB_EA_TAG_BITS)

    comb += tlb_req_index.eq(hash_ea(i_in.nia))
    comb += pte.eq(itlb_ptes[tlb_req_index])
    comb += ttag.eq(itlb_tags[tlb_req_index])

    with m.If(i_in.virt_mode):
        # Page offset comes from the request, the upper bits from the PTE
        comb += real_addr.eq(Cat(
                 i_in.nia[:TLB_LG_PGSZ],
                 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
                ))

        # ra_valid stays at its default (0) when the tag does not match
        with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
            comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

        comb += eaa_priv.eq(pte[3])

    with m.Else():
        comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
        comb += ra_valid.eq(1)
        comb += eaa_priv.eq(1)

    # No IAMR, so no KUEP support for now
    comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
    comb += access_ok.eq(ra_valid & ~priv_fault)
793 # itlb_update: process(clk)
794 # variable wr_index : tlb_index_t;
796 # if rising_edge(clk) then
797 # wr_index := hash_ea(m_in.addr);
799 # (m_in.tlbie = '1' and m_in.doall = '1') then
800 # -- clear all valid bits
801 # for i in tlb_index_t loop
802 # itlb_valids(i) <= '0';
804 # elsif m_in.tlbie = '1' then
805 # -- clear entry regardless of hit or miss
806 # itlb_valids(wr_index) <= '0';
807 # elsif m_in.tlbld = '1' then
808 # itlb_tags(wr_index) <=
809 # m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
810 # itlb_ptes(wr_index) <= m_in.pte;
811 # itlb_valids(wr_index) <= '1';
def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
    """iTLB update: invalidate or load entries on request of m_in.

    m:               Module the logic is added to
    itlb_valid_bits: array of TLB valid bits (written)
    itlb_tags:       array of TLB tags (written on m_in.tlbld)
    itlb_ptes:       array of PTEs (written on m_in.tlbld)
    """
    comb = m.d.comb
    sync = m.d.sync

    m_in = self.m_in

    # The VHDL reference computes wr_index as a process variable, i.e.
    # combinationally from m_in.addr in the same cycle as tlbie/tlbld.
    # Registering it would apply the request to the entry selected by
    # the previous cycle's address.
    wr_index = Signal(TLB_BITS)
    comb += wr_index.eq(hash_ea(m_in.addr))

    with m.If(m_in.tlbie & m_in.doall):
        # Clear all valid bits
        for i in range(TLB_SIZE):
            sync += itlb_valid_bits[i].eq(0)

    with m.Elif(m_in.tlbie):
        # Clear entry regardless of hit or miss
        sync += itlb_valid_bits[wr_index].eq(0)

    with m.Elif(m_in.tlbld):
        sync += itlb_tags[wr_index].eq(
                 m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
                )
        sync += itlb_ptes[wr_index].eq(m_in.pte)
        sync += itlb_valid_bits[wr_index].eq(1)
841 # -- Cache hit detection, output to fetch2 and other misc logic
842 # icache_comb : process(all)
843 # Cache hit detection, output to fetch2 and other misc logic
844 def icache_comb(self
, m
, use_previous
, r
, req_index
, req_row
,
845 req_tag
, real_addr
, req_laddr
, cache_valid_bits
,
846 cache_tags
, access_ok
, req_is_hit
,
847 req_is_miss
, replace_way
, plru_victim
, cache_out_row
):
848 # variable is_hit : std_ulogic;
849 # variable hit_way : way_t;
852 #comb += Display("ENTER icache_comb - use_previous:%x req_index:%x " \
853 # "req_row:%x req_tag:%x real_addr:%x req_laddr:%x " \
854 # "access_ok:%x req_is_hit:%x req_is_miss:%x " \
855 # "replace_way:%x", use_previous, req_index, req_row, \
856 # req_tag, real_addr, req_laddr, access_ok, \
857 # req_is_hit, req_is_miss, replace_way)
859 i_in
, i_out
, wb_out
= self
.i_in
, self
.i_out
, self
.wb_out
860 flush_in
, stall_out
= self
.flush_in
, self
.stall_out
863 hit_way
= Signal(NUM_WAYS
)
865 # -- i_in.sequential means that i_in.nia this cycle
866 # -- is 4 more than last cycle. If we read more
867 # -- than 32 bits at a time, had a cache hit last
868 # -- cycle, and we don't want the first 32-bit chunk
869 # -- then we can keep the data we read last cycle
870 # -- and just use that.
871 # if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
872 # use_previous <= i_in.sequential and r.hit_valid;
874 # use_previous <= '0';
876 # i_in.sequential means that i_in.nia this cycle is 4 more than
877 # last cycle. If we read more than 32 bits at a time, had a
878 # cache hit last cycle, and we don't want the first 32-bit chunk
879 # then we can keep the data we read last cycle and just use that.
880 with m
.If(i_in
.nia
[2:INSN_BITS
+2] != 0):
881 comb
+= use_previous
.eq(i_in
.sequential
& r
.hit_valid
)
883 # -- Extract line, row and tag from request
884 # req_index <= get_index(i_in.nia);
885 # req_row <= get_row(i_in.nia);
886 # req_tag <= get_tag(real_addr);
887 # Extract line, row and tag from request
888 comb
+= req_index
.eq(get_index(i_in
.nia
))
889 comb
+= req_row
.eq(get_row(i_in
.nia
))
890 comb
+= req_tag
.eq(get_tag(real_addr
))
892 # -- Calculate address of beginning of cache row, will be
893 # -- used for cache miss processing if needed
895 # (63 downto REAL_ADDR_BITS => '0') &
896 # real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
897 # (ROW_OFF_BITS-1 downto 0 => '0');
898 # Calculate address of beginning of cache row, will be
899 # used for cache miss processing if needed
900 comb
+= req_laddr
.eq(Cat(
901 Const(0b0, ROW_OFF_BITS
),
902 real_addr
[ROW_OFF_BITS
:REAL_ADDR_BITS
],
906 # -- Test if pending request is a hit on any way
909 # for i in way_t loop
910 # if i_in.req = '1' and
911 # (cache_valids(req_index)(i) = '1' or
912 # (r.state = WAIT_ACK and
913 # req_index = r.store_index and
914 # i = r.store_way and
915 # r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
916 # if read_tag(i, cache_tags(req_index)) = req_tag then
922 # Test if pending request is a hit on any way
924 comb
+= hitcond
.eq((r
.state
== State
.WAIT_ACK
)
925 & (req_index
== r
.store_index
)
926 & r
.rows_valid
[req_row
% ROW_PER_LINE
])
928 cvb
= Signal(NUM_WAYS
)
929 ctag
= Signal(TAG_RAM_WIDTH
)
930 comb
+= ctag
.eq(cache_tags
[req_index
])
931 comb
+= cvb
.eq(cache_valid_bits
[req_index
])
932 for i
in range(NUM_WAYS
):
933 tagi
= Signal(TAG_BITS
, name
="ti%d" % i
)
934 comb
+= tagi
.eq(read_tag(i
, ctag
))
935 hit_test
= Signal(name
="hit_test%d" % i
)
936 comb
+= hit_test
.eq(i
== r
.store_way
)
937 with m
.If((cvb
[i
] |
(hitcond
& hit_test
)) & (tagi
== req_tag
)):
938 comb
+= hit_way
.eq(i
)
941 # -- Generate the "hit" and "miss" signals
942 # -- for the synchronous blocks
943 # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
945 # req_is_hit <= is_hit;
946 # req_is_miss <= not is_hit;
949 # req_is_miss <= '0';
951 # req_hit_way <= hit_way;
952 # Generate the "hit" and "miss" signals
953 # for the synchronous blocks
954 with m
.If(i_in
.req
& access_ok
& ~flush_in
):
955 comb
+= req_is_hit
.eq(is_hit
)
956 comb
+= req_is_miss
.eq(~is_hit
)
959 comb
+= req_is_hit
.eq(0)
960 comb
+= req_is_miss
.eq(0)
962 # -- The way to replace on a miss
963 # if r.state = CLR_TAG then
965 # to_integer(unsigned(plru_victim(r.store_index)));
967 # replace_way <= r.store_way;
969 # The way to replace on a miss
970 with m
.If(r
.state
== State
.CLR_TAG
):
971 comb
+= replace_way
.eq(plru_victim
[r
.store_index
])
974 comb
+= replace_way
.eq(r
.store_way
)
976 # -- Output instruction from current cache row
978 # -- Note: This is a mild violation of our design principle of
979 # -- having pipeline stages output from a clean latch. In this
980 # -- case we output the result of a mux. The alternative would
981 # -- be output an entire row which I prefer not to do just yet
982 # -- as it would force fetch2 to know about some of the cache
983 # -- geometry information.
984 # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
985 # i_out.valid <= r.hit_valid;
986 # i_out.nia <= r.hit_nia;
987 # i_out.stop_mark <= r.hit_smark;
988 # i_out.fetch_failed <= r.fetch_failed;
989 # Output instruction from current cache row
991 # Note: This is a mild violation of our design principle of
992 # having pipeline stages output from a clean latch. In this
993 # case we output the result of a mux. The alternative would
994 # be output an entire row which I prefer not to do just yet
995 # as it would force fetch2 to know about some of the cache
996 # geometry information.
997 #comb += Display("BEFORE read_insn_word - r.hit_nia:%x " \
998 # "r.hit_way:%x, cache_out[r.hit_way]:%x", r.hit_nia, \
999 # r.hit_way, cache_out[r.hit_way])
1000 comb
+= i_out
.insn
.eq(read_insn_word(r
.hit_nia
, cache_out_row
))
1001 comb
+= i_out
.valid
.eq(r
.hit_valid
)
1002 comb
+= i_out
.nia
.eq(r
.hit_nia
)
1003 comb
+= i_out
.stop_mark
.eq(r
.hit_smark
)
1004 comb
+= i_out
.fetch_failed
.eq(r
.fetch_failed
)
1006 # -- Stall fetch1 if we have a miss on cache or TLB
1007 # -- or a protection fault
1008 # stall_out <= not (is_hit and access_ok);
1009 # Stall fetch1 if we have a miss on cache or TLB
1010 # or a protection fault
1011 comb
+= stall_out
.eq(~
(is_hit
& access_ok
))
1013 # -- Wishbone requests output (from the cache miss reload machine)
1014 # wishbone_out <= r.wb;
1015 # Wishbone requests output (from the cache miss reload machine)
1016 comb
+= wb_out
.eq(r
.wb
)
1019 # -- Cache hit synchronous machine
1020 # icache_hit : process(clk)
1021 # Cache hit synchronous machine
def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
               req_index, req_tag, real_addr):
    """Cache-hit synchronous machine (mirrors the VHDL `icache_hit` process).

    Registers, in the sync domain of `m`, the hit information that is
    presented to the next stage one cycle later:

    * `r.hit_valid` / `r.hit_way` are latched from `req_is_hit` /
      `req_hit_way`, unless the outputs are being held.
    * `r.hit_smark` / `r.hit_nia` follow `i_in.stop_mark` / `i_in.nia`
      whenever `stall_in` is low, regardless of hit validity.

    `req_index`, `req_tag` and `real_addr` are only used for the debug
    Display message.
    """
    sync = m.d.sync

    i_in = self.i_in
    stall_in = self.stall_in
    flush_in = self.flush_in

    # Outputs to fetch2 are held on a stall.  With use_previous the same
    # data as last cycle is kept (its second half gets used).  The only
    # thing allowed to change while holding is a flush dropping `valid`.
    hold_outputs = stall_in | use_previous

    with m.If(hold_outputs):
        with m.If(flush_in):
            sync += r.hit_valid.eq(0)

    with m.Else():
        # On a hit, latch the request for the next cycle, when the BRAM
        # data will be available on the cache output of that way.
        sync += r.hit_valid.eq(req_is_hit)

        with m.If(req_is_hit):
            sync += r.hit_way.eq(req_hit_way)
            sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x "
                            "tag:%x way:%x RA:%x",
                            i_in.nia, i_in.virt_mode, i_in.stop_mark,
                            req_index, req_tag, req_hit_way, real_addr)

    # Stop marks and NIA are sent down regardless of validity, but only
    # when the downstream stage is not stalling.
    with m.If(~stall_in):
        sync += r.hit_smark.eq(i_in.stop_mark)
        sync += r.hit_nia.eq(i_in.nia)
1090 # -- Cache miss/reload synchronous machine
1091 # icache_miss : process(clk)
1092 # Cache miss/reload synchronous machine
def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                req_index, req_laddr, req_tag, replace_way,
                cache_tags, access_ok, real_addr):
    """Cache miss / line reload synchronous machine.

    Mirrors the VHDL `icache_miss` process.  Drives, through `r`:

    * the wishbone request registers (`r.wb.cyc`, `r.wb.stb`, plus the
      combinatorial `r.wb.sel` / `r.wb.adr`),
    * the reload bookkeeping (`r.store_*`, `r.end_row_ix`, `r.req_adr`,
      `r.rows_valid`, `r.state`),
    * the per-line valid bits in `cache_valid_bits` and the tag RAM
      `cache_tags`,
    * `r.fetch_failed` for TLB miss / protection fault reporting.

    States: IDLE waits for `req_is_miss`; CLR_TAG invalidates the victim
    way and writes the new tag; WAIT_ACK (which shares its request/ack
    handling with CLR_TAG) issues the remaining row reads and collects
    acks until the last row of the line has been stored.
    """
    comb = m.d.comb
    sync = m.d.sync

    i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
    stall_in, flush_in = self.stall_in, self.flush_in
    inval_in = self.inval_in

    tagset = Signal(TAG_RAM_WIDTH)
    stbs_done = Signal()

    # We only ever do reads on wishbone: select every byte lane and
    # present the request address without its three low-order bits.
    comb += r.wb.sel.eq(-1)
    comb += r.wb.adr.eq(r.req_adr[3:])

    # Process cache invalidations: drop every line and make sure an
    # in-flight reload does not get marked valid when it completes.
    with m.If(inval_in):
        for i in range(NUM_LINES):
            sync += cache_valid_bits[i].eq(0)
        sync += r.store_valid.eq(0)

    # Main state machine
    with m.Switch(r.state):

        with m.Case(State.IDLE):
            # Reset per-row valid flags, only used in WAIT_ACK
            for i in range(ROW_PER_LINE):
                sync += r.rows_valid[i].eq(0)

            # We need to read a cache line
            with m.If(req_is_miss):
                sync += Display(
                    "cache miss nia:%x IR:%x SM:%x idx:%x "
                    " way:%x tag:%x RA:%x", i_in.nia,
                    i_in.virt_mode, i_in.stop_mark, req_index,
                    replace_way, req_tag, real_addr)

                # Keep track of our index and way for subsequent stores
                sync += r.store_index.eq(req_index)
                sync += r.store_row.eq(get_row(req_laddr))
                sync += r.store_tag.eq(req_tag)
                sync += r.store_valid.eq(1)
                sync += r.end_row_ix.eq(
                    get_row_of_line(get_row(req_laddr)) - 1)

                # Prep for first wishbone read.  We calculate the
                # address of the start of the cache line and
                # start the WB cycle.
                sync += r.req_adr.eq(req_laddr)
                sync += r.wb.cyc.eq(1)
                sync += r.wb.stb.eq(1)

                # Track that we had one request sent
                sync += r.state.eq(State.CLR_TAG)

        with m.Case(State.CLR_TAG, State.WAIT_ACK):
            with m.If(r.state == State.CLR_TAG):
                # Get victim way from plru
                sync += r.store_way.eq(replace_way)

                # Force misses on that way while reloading that line.
                # The scratch copy holds one valid bit per way (it is
                # indexed by replace_way), so it must be NUM_WAYS wide.
                cv_clr = Signal(NUM_WAYS)
                comb += cv_clr.eq(cache_valid_bits[req_index])
                comb += cv_clr.bit_select(replace_way, 1).eq(0)
                sync += cache_valid_bits[req_index].eq(cv_clr)

                # Store new tag in selected way
                for i in range(NUM_WAYS):
                    with m.If(i == replace_way):
                        comb += tagset.eq(cache_tags[r.store_index])
                        comb += write_tag(i, tagset, r.store_tag)
                        sync += cache_tags[r.store_index].eq(tagset)

                sync += r.state.eq(State.WAIT_ACK)

            # Requests are all sent if stb is 0
            stbs_zero = Signal()
            comb += stbs_zero.eq(r.wb.stb == 0)
            comb += stbs_done.eq(stbs_zero)

            # If we are still sending requests, was one accepted?
            with m.If(~wb_in.stall & ~stbs_zero):
                # That was the last word?  We are done sending.
                # Clear stb and set stbs_done so we can handle
                # an eventual last ack on the same cycle.
                with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                    sync += Display("IS_LAST_ROW_ADDR "
                                    "r.wb.addr:%x r.end_row_ix:%x "
                                    "r.wb.stb:%x stbs_zero:%x "
                                    "stbs_done:%x", r.wb.adr,
                                    r.end_row_ix, r.wb.stb,
                                    stbs_zero, stbs_done)
                    sync += r.wb.stb.eq(0)
                    comb += stbs_done.eq(1)

                # Calculate the next row address: only the row-in-line
                # field of the request address is incremented.
                rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
                comb += rarange.eq(
                    r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
                sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
                sync += Display("RARANGE r.wb.adr:%x stbs_zero:%x "
                                "stbs_done:%x", rarange, stbs_zero,
                                stbs_done)

            # Incoming acks processing
            with m.If(wb_in.ack):
                # NOTE(review): the second fragment of this format string
                # was reconstructed from the two-entry argument list;
                # confirm the exact text.
                sync += Display("WB_IN_ACK stbs_zero:%x "
                                "stbs_done:%x",
                                stbs_zero, stbs_done)

                sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

                # Check for completion
                with m.If(stbs_done &
                          is_last_row(r.store_row, r.end_row_ix)):
                    # Complete wishbone cycle
                    sync += r.wb.cyc.eq(0)

                    # Cache line is now valid, unless an invalidation
                    # arrived during (or right at the end of) the reload.
                    cv_set = Signal(NUM_WAYS)
                    comb += cv_set.eq(cache_valid_bits[r.store_index])
                    comb += cv_set.bit_select(replace_way, 1).eq(
                        r.store_valid & ~inval_in)
                    sync += cache_valid_bits[r.store_index].eq(cv_set)

                    # We are done
                    sync += r.state.eq(State.IDLE)

                # Increment store row counter
                sync += r.store_row.eq(next_row(r.store_row))

    # TLB miss and protection fault processing
    with m.If(flush_in | m_in.tlbld):
        sync += r.fetch_failed.eq(0)

    with m.Elif(i_in.req & ~access_ok & ~stall_in):
        sync += r.fetch_failed.eq(1)
1373 # icache_log: if LOG_LENGTH > 0 generate
def icache_log(self, m, req_hit_way, ra_valid, access_ok,
               req_is_miss, req_is_hit, lway, wstate, r):
    """Build the 54-bit debug log word and drive `self.log_out` with it.

    Mirrors the VHDL `icache_log: if LOG_LENGTH > 0 generate` block: the
    logger is instantiated once when LOG_LENGTH is positive and not at
    all otherwise.

    `wstate` is registered here: 1 while `r.state` is not IDLE, else 0.
    `lway` is accepted for interface compatibility; the way number is
    registered in a local 3-bit signal instead.
    """
    comb = m.d.comb
    sync = m.d.sync

    wb_in, i_out = self.wb_in, self.i_out
    log_out, stall_out = self.log_out, self.stall_out

    # Conditional generate, not a loop: one logger or none.
    if LOG_LENGTH <= 0:
        return

    # Output data to logger
    log_data = Signal(54)

    # The reference packs the way as to_unsigned(lway, 3).  Keeping it
    # 3 bits wide makes the fields below add up to exactly 54 bits, so
    # the top field (i_out.valid) is not truncated away.
    lway_bits = 3
    lway = Signal(lway_bits)

    sync += lway.eq(req_hit_way)
    sync += wstate.eq(0)

    with m.If(r.state != State.IDLE):
        sync += wstate.eq(1)

    # Cat() is LSB first, so this list is the reverse of the VHDL
    # concatenation (which starts with i_out.valid as the top bit).
    sync += log_data.eq(Cat(
        ra_valid, access_ok, req_is_miss, req_is_hit,
        lway, wstate, r.hit_nia[2:6],
        r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
        r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
        i_out.valid))

    comb += log_out.eq(log_data)
def elaborate(self, platform):
    """Create the module: allocate shared state, then wire the sub-blocks.

    Every signal created here is shared between the helper methods
    (`rams`, `maybe_plrus`, `itlb_lookup`, `itlb_update`, `icache_comb`,
    `icache_hit`, `icache_miss`), each of which adds its logic to `m`.
    """
    m = Module()

    # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
    cache_tags = CacheTagArray()
    cache_valid_bits = CacheValidBitsArray()

    # Instruction TLB storage.
    # TODO to be passed to nmigen as ram attributes
    # attribute ram_style of itlb_tags : signal is "distributed";
    # attribute ram_style of itlb_ptes : signal is "distributed";
    itlb_valid_bits = TLBValidBitsArray()
    itlb_tags = TLBTagArray()
    itlb_ptes = TLBPtesArray()

    # Privilege bit from PTE EAA field
    eaa_priv = Signal()

    # Registered internal state (hit latches, reload state machine)
    r = RegInternal()

    # Async signals on incoming request
    req_index = Signal(NUM_LINES)
    req_row = Signal(BRAM_ROWS)
    # NOTE(review): req_hit_way is only handed to icache_hit below, which
    # reads it; confirm that something actually drives it.
    req_hit_way = Signal(NUM_WAYS)
    req_tag = Signal(TAG_BITS)
    req_is_hit = Signal()
    req_is_miss = Signal()
    req_laddr = Signal(64)

    # Address translation results
    tlb_req_index = Signal(TLB_SIZE)
    real_addr = Signal(REAL_ADDR_BITS)
    ra_valid = Signal()
    priv_fault = Signal()
    access_ok = Signal()
    use_previous = Signal()

    # Row read out of the cache RAM
    cache_out_row = Signal(ROW_SIZE_BITS)

    # Replacement policy outputs
    plru_victim = PLRUOut()
    replace_way = Signal(NUM_WAYS)

    # call sub-functions putting everything together, using shared
    # signals established above
    self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
    self.maybe_plrus(m, r, plru_victim)
    self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
                     real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                     priv_fault, access_ok)
    self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
    self.icache_comb(m, use_previous, r, req_index, req_row,
                     req_tag, real_addr, req_laddr, cache_valid_bits,
                     cache_tags, access_ok, req_is_hit, req_is_miss,
                     replace_way, plru_victim, cache_out_row)
    self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                    req_index, req_tag, real_addr)
    self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                     req_laddr, req_tag, replace_way, cache_tags,
                     access_ok, real_addr)
    #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
    #                req_is_miss, req_is_hit, lway, wstate, r)

    return m
1531 # use ieee.std_logic_1164.all;
1534 # use work.common.all;
1535 # use work.wishbone_types.all;
1537 # entity icache_tb is
1540 # architecture behave of icache_tb is
1541 # signal clk : std_ulogic;
1542 # signal rst : std_ulogic;
1544 # signal i_out : Fetch1ToIcacheType;
1545 # signal i_in : IcacheToDecode1Type;
1547 # signal m_out : MmuToIcacheType;
1549 # signal wb_bram_in : wishbone_master_out;
1550 # signal wb_bram_out : wishbone_slave_out;
1552 # constant clk_period : time := 10 ns;
1554 # icache0: entity work.icache
1568 # wishbone_out => wb_bram_in,
1569 # wishbone_in => wb_bram_out
1572 # -- BRAM Memory slave
1573 # bram0: entity work.wishbone_bram_wrapper
1575 # MEMORY_SIZE => 1024,
1576 # RAM_INIT_FILE => "icache_test.bin"
1581 # wishbone_in => wb_bram_in,
1582 # wishbone_out => wb_bram_out
1585 # clk_process: process
1588 # wait for clk_period/2;
1590 # wait for clk_period/2;
1593 # rst_process: process
1596 # wait for 2*clk_period;
1604 # i_out.nia <= (others => '0');
1605 # i_out.stop_mark <= '0';
1607 # m_out.tlbld <= '0';
1608 # m_out.tlbie <= '0';
1609 # m_out.addr <= (others => '0');
1610 # m_out.pte <= (others => '0');
1612 # wait until rising_edge(clk);
1613 # wait until rising_edge(clk);
1614 # wait until rising_edge(clk);
1615 # wait until rising_edge(clk);
1618 # i_out.nia <= x"0000000000000004";
1620 # wait for 30*clk_period;
1621 # wait until rising_edge(clk);
1623 # assert i_in.valid = '1' severity failure;
1624 # assert i_in.insn = x"00000001"
1625 # report "insn @" & to_hstring(i_out.nia) &
1626 # "=" & to_hstring(i_in.insn) &
1627 # " expected 00000001"
1632 # wait until rising_edge(clk);
1636 # i_out.nia <= x"0000000000000008";
1637 # wait until rising_edge(clk);
1638 # wait until rising_edge(clk);
1639 # assert i_in.valid = '1' severity failure;
1640 # assert i_in.insn = x"00000002"
1641 # report "insn @" & to_hstring(i_out.nia) &
1642 # "=" & to_hstring(i_in.insn) &
1643 # " expected 00000002"
1645 # wait until rising_edge(clk);
1649 # i_out.nia <= x"0000000000000040";
1651 # wait for 30*clk_period;
1652 # wait until rising_edge(clk);
1654 # assert i_in.valid = '1' severity failure;
1655 # assert i_in.insn = x"00000010"
1656 # report "insn @" & to_hstring(i_out.nia) &
1657 # "=" & to_hstring(i_in.insn) &
1658 # " expected 00000010"
1661 # -- test something that aliases
1663 # i_out.nia <= x"0000000000000100";
1664 # wait until rising_edge(clk);
1665 # wait until rising_edge(clk);
1666 # assert i_in.valid = '0' severity failure;
1667 # wait until rising_edge(clk);
1669 # wait for 30*clk_period;
1670 # wait until rising_edge(clk);
1672 # assert i_in.valid = '1' severity failure;
1673 # assert i_in.insn = x"00000040"
1674 # report "insn @" & to_hstring(i_out.nia) &
1675 # "=" & to_hstring(i_in.insn) &
1676 # " expected 00000040"
def icache_sim(dut):
    """Simulation process exercising miss, hit and aliasing fetches.

    Follows the VHDL `icache_tb` stimulus: a first fetch at 0x4 (miss,
    then reload), a fetch at 0x8 (hit in the line just loaded), a fetch
    at 0x40 (another miss) and finally 0x100, which aliases and must miss
    before it becomes valid.

    Names are from the testbench's point of view: `i_out` is what fetch1
    sends to the cache (`dut.i_in`), `i_in` is what the cache returns
    (`dut.i_out`) and `m_out` is the MMU side (`dut.m_in`).
    """
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    def wait(cycles):
        # Let `cycles` clock edges go by.
        for _ in range(cycles):
            yield

    def fetch(addr):
        # Present a fetch request for `addr`.
        yield i_out.req.eq(1)
        yield i_out.nia.eq(Const(addr, 64))

    def expect_insn(expected, verbose=False):
        # Check that the cache currently returns a valid `expected` word.
        # The address reported on failure is always re-read from the
        # request side, so the message matches the fetch under test.
        valid = yield i_in.valid
        nia = yield i_out.nia
        insn = yield i_in.insn
        if verbose:
            print(f"valid? {valid}")
        assert valid
        assert insn == expected, \
            "insn @%x=%x expected %08x" % (nia, insn, expected)

    # Quiescent inputs
    yield i_in.valid.eq(0)
    yield i_out.priv_mode.eq(1)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield from wait(4)

    # miss, followed by the line reload
    yield from fetch(0x0000000000000004)
    yield from wait(31)
    yield from expect_insn(0x00000001, verbose=True)
    yield i_out.req.eq(0)
    yield

    # hit
    yield from fetch(0x0000000000000008)
    yield from wait(2)
    yield from expect_insn(0x00000002)
    yield

    # another miss
    yield from fetch(0x0000000000000040)
    yield from wait(31)
    yield from expect_insn(0x00000010)

    # test something that aliases
    yield from fetch(0x0000000000000100)
    yield from wait(2)
    valid = yield i_in.valid
    # The reference testbench requires valid = '0' here.  Use `not`, not
    # `~`: on a Python int `~` never yields a falsy value.
    assert not valid
    yield from wait(31)
    yield from expect_insn(0x00000040)
    yield i_out.req.eq(0)
def test_icache(mem):
    """Simulate the icache against a wishbone SRAM preloaded with `mem`.

    `mem` is the initial content of a 64-bit wide, 16*64 deep Memory that
    backs the SRAM slave.  The run is driven by `icache_sim` and dumps a
    VCD trace to 'test_icache.vcd'.
    """
    dut = ICache()

    memory = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()
    m.submodules.icache = dut
    m.submodules.sram = sram

    # Master -> slave: (SRAM bus field, icache wishbone output field).
    # Only the write-data field is named differently on the two sides.
    request_wiring = (
        ("cyc", "cyc"),
        ("stb", "stb"),
        ("we", "we"),
        ("sel", "sel"),
        ("adr", "adr"),
        ("dat_w", "dat"),
    )
    for bus_field, wb_field in request_wiring:
        m.d.comb += getattr(sram.bus, bus_field).eq(
            getattr(dut.wb_out, wb_field))

    # Slave -> master
    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    # NOTE(review): clock period assumed to be 1e-6; confirm.
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()
1792 if __name__
== '__main__':
1794 vl
= rtlil
.convert(dut
, ports
=[])
1795 with
open("test_icache.il", "w") as f
:
1799 for i
in range(512):
1800 mem
.append((i
*2)|
((i
*2+1)<<32))