1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const)
from nmigen.cli import main
from nmigen.cli import rtlil
from nmutil.iocontrol import RecordObject
from nmutil.byterev import byte_reverse
from nmutil.mask import Mask
from nmigen.utils import log2_int
from nmutil.util import Display

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

# for test
from nmigen_soc.wishbone.sram import SRAM
from nmigen import Memory
# select the python simulator (set to False to use cxxsim)
if True:
    from nmigen.back.pysim import Simulator, Delay, Settle
else:
    from nmigen.sim.cxxsim import Simulator, Delay, Settle
from nmutil.util import wrap


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 32
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of rows
# (wishbone transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in
# BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit
# instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to
# select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# -- L1 ITLB.
# constant TLB_BITS : natural := log2(TLB_SIZE);
# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
# constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

# architecture rtl of icache is

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero           (2)
# ..         |     |   |-|  | INSN_BITS      (1)
# ..         |     |---|    | ROW_LINE_BITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS  (6)
# ..         |         |- --| ROW_OFF_BITS   (3)
# ..         |----- ---|    | ROW_BITS       (8)
# ..         |-----|        | INDEX_BITS     (5)
# .. --------|              | TAG_BITS       (53)

#subtype row_t is integer range 0 to BRAM_ROWS-1;
#subtype index_t is integer range 0 to NUM_LINES-1;
#subtype way_t is integer range 0 to NUM_WAYS-1;
#subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
#
#-- The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
#-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
#-- not handle a clean (commented) definition of the cache tags as a 3d
#-- memory. For now, work around it by putting all the tags
#subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
# type cache_tags_set_t is array(way_t) of cache_tag_t;
# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
#constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
#subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
#type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x)
                 for x in range(NUM_LINES))

#-- The cache valid bits
#subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
#type cache_valids_t is array(index_t) of cache_way_valids_t;
#type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x)
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x)
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


#subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
#type tlb_valids_t is array(tlb_index_t) of std_ulogic;
#subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
#type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
#subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
#type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    return Array(Signal(name="tlbvalid_%d" % x)
                 for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" % x)
                 for x in range(TLB_SIZE))

def TLBPTEArray():
    return Array(Signal(TLB_PTE_BITS, name="tlbpte_%d" % x)
                 for x in range(TLB_SIZE))


#-- Cache RAM interface
#type cache_ram_out_t is array(way_t) of cache_row_t;
# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x)
                 for x in range(NUM_WAYS))

#-- PLRU output interface
#type plru_out_t is array(index_t) of
#    std_ulogic_vector(WAY_BITS-1 downto 0);
# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x)
                 for x in range(NUM_LINES))

# -- Return the cache line index (tag index) for an address
# function get_index(addr: std_ulogic_vector(63 downto 0))
#     return index_t is
# begin
#     return to_integer(unsigned(
#      addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
#     ));
# end;
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# -- Return the cache row index (data memory) for an address
# function get_row(addr: std_ulogic_vector(63 downto 0))
#     return row_t is
# begin
#     return to_integer(unsigned(
#      addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
#     ));
# end;
# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# -- Return the index of a row within a line
# function get_row_of_line(row: row_t) return row_in_line_t is
#     variable row_v : unsigned(ROW_BITS-1 downto 0);
# begin
#     row_v := to_unsigned(row, ROW_BITS);
#     return row_v(ROW_LINEBITS-1 downto 0);
# end;
# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# -- Returns whether this is the last row of a line
# function is_last_row_addr(addr: wishbone_addr_type;
#                           last: row_in_line_t)
#     return boolean is
# begin
#     return unsigned(
#      addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
#     ) = last;
# end;
# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# -- Returns whether this is the last row of a line
# function is_last_row(row: row_t;
#                      last: row_in_line_t) return boolean is
# begin
#     return get_row_of_line(row) = last;
# end;
# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# -- Return the address of the next row in the current cache line
# function next_row_addr(addr: wishbone_addr_type)
#     return std_ulogic_vector is
#     variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
#     variable result  : wishbone_addr_type;
# begin
#     -- Is there no simpler way in VHDL to generate that 3 bits adder ?
#     row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
#     row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
#     result := addr;
#     result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
#     return result;
# end;
# Return the address of the next row in the current cache line:
# increment the row-within-line field, leaving the rest of the address
# unchanged (wraps within the line, as in the VHDL)
def next_row_addr(addr):
    row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
    return Cat(addr[:ROW_OFF_BITS],
               row_idx[:ROW_LINE_BITS],
               addr[LINE_OFF_BITS:])

# -- Return the next row in the current cache line. We use a dedicated
# -- function in order to limit the size of the generated adder to be
# -- only the bits within a cache line (3 bits with default settings)
# function next_row(row: row_t) return row_t is
#     variable row_v   : std_ulogic_vector(ROW_BITS-1 downto 0);
#     variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
#     variable result  : std_ulogic_vector(ROW_BITS-1 downto 0);
# begin
#     row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
#     row_idx := row_v(ROW_LINEBITS-1 downto 0);
#     row_v(ROW_LINEBITS-1 downto 0) :=
#      std_ulogic_vector(unsigned(row_idx) + 1);
#     return to_integer(unsigned(row_v));
# end;
# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_idx = row[:ROW_LINE_BITS] + 1
    return Cat(row_idx[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

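# A quick pure-python mirror of next_row's wrap-around behaviour (a
# sketch for checking by hand; _next_row_int is a hypothetical helper,
# not used by the hardware above). With the default ROW_PER_LINE = 8,
# the row-within-line field counts 0..7 and wraps, while the upper
# (line-select) bits of the row number stay put:
def _next_row_int(row):
    line = row & ~(ROW_PER_LINE - 1)           # upper bits: which line
    return line | ((row + 1) % ROW_PER_LINE)   # low bits wrap in-line

assert _next_row_int(0b10110) == 0b10111   # mid-line: plain increment
assert _next_row_int(0b10111) == 0b10000   # last row wraps to first
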
# -- Read the instruction word for the given address in the
# -- current cache row
# function read_insn_word(addr: std_ulogic_vector(63 downto 0);
#             data: cache_row_t) return std_ulogic_vector is
#     variable word: integer range 0 to INSN_PER_ROW-1;
# begin
#     word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
#     return data(31+word*32 downto word*32);
# end;
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

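# read_insn_word picks one of the INSN_PER_ROW 32-bit words out of a
# BRAM row: addr bits [2:INSN_BITS+2] select the word. A pure-integer
# sketch (_read_insn_word_int is a hypothetical helper, illustration
# only, assuming the default 64-bit rows with two instructions each):
def _read_insn_word_int(addr, data):
    word = (addr >> 2) & (INSN_PER_ROW - 1)
    return (data >> (32 * word)) & 0xffffffff

assert _read_insn_word_int(0x0, 0x1111222233334444) == 0x33334444
assert _read_insn_word_int(0x4, 0x1111222233334444) == 0x11112222
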
# -- Get the tag value from the address
# function get_tag(
#     addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
# )
#     return cache_tag_t is
# begin
#     return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
# end;
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

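# Worked example of the three field extractors above, as a sketch in
# plain integers (the nmigen slices in get_index, get_row and get_tag
# correspond to these shifts and masks; _decode_addr_int is a
# hypothetical helper, not used by the hardware):
def _decode_addr_int(addr):
    index = (addr >> LINE_OFF_BITS) & (NUM_LINES - 1)  # as get_index
    row = (addr >> ROW_OFF_BITS) & (BRAM_ROWS - 1)     # as get_row
    tag = addr >> SET_SIZE_BITS                        # as get_tag
    return index, row, tag

# with the default geometry: index = bits [6:11], row = bits [3:11],
# tag = bits [11:56]; note row == index * ROW_PER_LINE + row-in-line
assert _decode_addr_int(0x12345678) == (0x19, 0xcf, 0x2468a)
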
# -- Read a tag from a tag memory row
# function read_tag(way: way_t; tagset: cache_tags_set_t)
#     return cache_tag_t is
# begin
#     return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
# end;
# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset[way * TAG_BITS:(way + 1) * TAG_BITS]

# -- Write a tag to tag memory row
# procedure write_tag(way: in way_t;
#     tagset: inout cache_tags_set_t; tag: cache_tag_t) is
# begin
#     tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
# end;
# Write a tag to tag memory row: returns an assignment statement
# (to be added to comb or sync) rather than mutating a python object
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# -- Simple hash for direct-mapped TLB index
# function hash_ea(addr: std_ulogic_vector(63 downto 0))
#     return tlb_index_t is
#     variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
# begin
#     hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
#             xor addr(
#              TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
#              TLB_LG_PGSZ + TLB_BITS
#             )
#             xor addr(
#              TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
#              TLB_LG_PGSZ + 2 * TLB_BITS
#             );
#     return to_integer(unsigned(hash));
# end;
# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh

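# The hash above XOR-folds the three TLB_BITS-wide fields of the
# effective address sitting just above the page offset, so that large
# power-of-2 strides do not all land on the same direct-mapped TLB
# entry. A pure-integer sketch (_hash_ea_int is a hypothetical helper
# for illustration only):
def _hash_ea_int(addr):
    fld = lambda n: (addr >> (TLB_LG_PGSZ + n*TLB_BITS)) & (TLB_SIZE-1)
    return fld(0) ^ fld(1) ^ fld(2)

assert _hash_ea_int(0) == 0
assert _hash_ea_int(1 << TLB_LG_PGSZ) == 1          # lowest folded bit
assert _hash_ea_int(0x3f << TLB_LG_PGSZ) == 0x3f    # one whole field
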
# begin
#
#     assert LINE_SIZE mod ROW_SIZE = 0;
#     assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
#         severity FAILURE;
#     assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
#         severity FAILURE;
#     assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
#         severity FAILURE;
#     assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
#         severity FAILURE;
#     assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
#         report "geometry bits don't add up" severity FAILURE;
#
#     sim_debug: if SIM generate
#     debug: process
#     begin
#         report "ROW_SIZE = " & natural'image(ROW_SIZE);
#         report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
#         report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
#         report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
#         report "INSN_BITS = " & natural'image(INSN_BITS);
#         report "ROW_BITS = " & natural'image(ROW_BITS);
#         report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
#         report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
#         report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
#         report "INDEX_BITS = " & natural'image(INDEX_BITS);
#         report "TAG_BITS = " & natural'image(TAG_BITS);
#         report "WAY_BITS = " & natural'image(WAY_BITS);
#         wait;
#     end process;
#     end generate;

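# The VHDL assertions above are left untranslated in the original; a
# minimal python equivalent (a sketch, checked once at import time)
# verifies the same geometry invariants. ispow2 is a small local
# helper, not an nmigen/nmutil function:
def ispow2(x):
    return x != 0 and (x & (x - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert ROW_BITS == INDEX_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + INDEX_BITS + LINE_OFF_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + ROW_BITS + ROW_OFF_BITS, \
    "geometry bits don't add up"
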
# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2

# type reg_internal_t is record
#     -- Cache hit state (Latches for 1 cycle BRAM access)
#     hit_way   : way_t;
#     hit_nia   : std_ulogic_vector(63 downto 0);
#     hit_smark : std_ulogic;
#     hit_valid : std_ulogic;
#
#     -- Cache miss state (reload state machine)
#     state       : state_t;
#     wb          : wishbone_master_out;
#     store_way   : way_t;
#     store_index : index_t;
#     store_row   : row_t;
#     store_tag   : cache_tag_t;
#     store_valid : std_ulogic;
#     end_row_ix  : row_in_line_t;
#     rows_valid  : row_per_line_valid_t;
#
#     -- TLB miss state
#     fetch_failed : std_ulogic;
# end record;
class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.wb = WBMasterOut("wb")
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()

# -- 64 bit direct mapped icache. All instructions are 4B aligned.
#
# entity icache is
#     generic (
#         SIM : boolean := false;
#         -- Line size in bytes
#         LINE_SIZE : positive := 64;
#         -- BRAM organisation: We never access more
#         -- than wishbone_data_bits
#         -- at a time so to save resources we make the
#         -- array only that wide,
#         -- and use consecutive indices for to make a cache "line"
#         --
#         -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
#         -- so 64-bits)
#         ROW_SIZE : positive := wishbone_data_bits / 8;
#         -- Number of lines in a set
#         NUM_LINES : positive := 32;
#         -- Number of ways
#         NUM_WAYS : positive := 4;
#         -- L1 ITLB number of entries (direct mapped)
#         TLB_SIZE : positive := 64;
#         -- L1 ITLB log_2(page_size)
#         TLB_LG_PGSZ : positive := 12;
#         -- Number of real address bits that we store
#         REAL_ADDR_BITS : positive := 56;
#         -- Non-zero to enable log data collection
#         LOG_LENGTH : natural := 0
#     );
#     port (
#         clk : in std_ulogic;
#         rst : in std_ulogic;
#
#         i_in  : in Fetch1ToIcacheType;
#         i_out : out IcacheToDecode1Type;
#
#         m_in  : in MmuToIcacheType;
#
#         stall_in  : in std_ulogic;
#         stall_out : out std_ulogic;
#         flush_in  : in std_ulogic;
#         inval_in  : in std_ulogic;
#
#         wishbone_out : out wishbone_master_out;
#         wishbone_in  : in wishbone_slave_out;
#
#         log_out : out std_ulogic_vector(53 downto 0)
#     );
# end entity icache;
# 64 bit direct mapped icache. All instructions are 4B aligned.
class ICache(Elaboratable):
    """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        self.log_out = Signal(54)


    # -- Generate a cache RAM for each way
    # rams: for i in 0 to NUM_WAYS-1 generate
    #     signal do_read  : std_ulogic;
    #     signal do_write : std_ulogic;
    #     signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     signal dout     : cache_row_t;
    #     signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
    # begin
    #     way: entity work.cache_ram
    #         generic map (
    #             ROW_BITS => ROW_BITS,
    #             WIDTH => ROW_SIZE_BITS
    #         )
    #         port map (
    #             clk     => clk,
    #             rd_en   => do_read,
    #             rd_addr => rd_addr,
    #             rd_data => dout,
    #             wr_sel  => wr_sel,
    #             wr_addr => wr_addr,
    #             wr_data => wishbone_in.dat
    #         );
    #     process(all)
    #     begin
    #         do_read <= not (stall_in or use_previous);
    #         do_write <= '0';
    #         if wishbone_in.ack = '1' and replace_way = i then
    #             do_write <= '1';
    #         end if;
    #         cache_out(i) <= dout;
    #         rd_addr <=
    #          std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
    #         wr_addr <=
    #          std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
    #         for i in 0 to ROW_SIZE-1 loop
    #             wr_sel(i) <= do_write;
    #         end loop;
    #     end process;
    # end generate;
    def rams(self, m, r, cache_out, use_previous, replace_way, req_row):
        comb = m.d.comb

        wb_in, stall_in = self.wb_in, self.stall_in

        for i in range(NUM_WAYS):
            # the VHDL declares these inside the generate loop, so
            # each way must get its own copy of the control signals
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS, name="rd_addr_%d" % i)
            wr_addr = Signal(ROW_BITS, name="wr_addr_%d" % i)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)  # read *from* the RAM output
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))

            with m.If(wb_in.ack & (replace_way == i)):
                comb += do_write.eq(1)

            comb += cache_out[i].eq(d_out)
            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            for j in range(ROW_SIZE):
                comb += wr_sel[j].eq(do_write)

    # -- Generate PLRUs
    # maybe_plrus: if NUM_WAYS > 1 generate
    # begin
    #     plrus: for i in 0 to NUM_LINES-1 generate
    #         -- PLRU interface
    #         signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
    #         signal plru_acc_en : std_ulogic;
    #         signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
    #
    #     begin
    #         plru : entity work.plru
    #             generic map (
    #                 BITS => WAY_BITS
    #             )
    #             port map (
    #                 clk    => clk,
    #                 rst    => rst,
    #                 acc    => plru_acc,
    #                 acc_en => plru_acc_en,
    #                 lru    => plru_out
    #             );
    #
    #         process(all)
    #         begin
    #             -- PLRU interface
    #             if get_index(r.hit_nia) = i then
    #                 plru_acc_en <= r.hit_valid;
    #             else
    #                 plru_acc_en <= '0';
    #             end if;
    #             plru_acc <=
    #              std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
    #             plru_victim(i) <= plru_out;
    #         end process;
    #     end generate;
    # end generate;
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        # NUM_WAYS is an elaboration-time constant, so this is a
        # plain python "if", not an m.If
        if NUM_WAYS > 1:
            for i in range(NUM_LINES):
                plru = PLRU(WAY_BITS)
                setattr(m.submodules, "plru_%d" % i, plru)

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru.acc_en.eq(r.hit_valid)

                comb += plru.acc_i.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru_o)

    # -- TLB hit detection and real address generation
    # itlb_lookup : process(all)
    #     variable pte  : tlb_pte_t;
    #     variable ttag : tlb_tag_t;
    # begin
    #     tlb_req_index <= hash_ea(i_in.nia);
    #     pte := itlb_ptes(tlb_req_index);
    #     ttag := itlb_tags(tlb_req_index);
    #     if i_in.virt_mode = '1' then
    #         real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
    #                      i_in.nia(TLB_LG_PGSZ - 1 downto 0);
    #         if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
    #             ra_valid <= itlb_valids(tlb_req_index);
    #         else
    #             ra_valid <= '0';
    #         end if;
    #         eaa_priv <= pte(3);
    #     else
    #         real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
    #         ra_valid <= '1';
    #         eaa_priv <= '1';
    #     end if;
    #
    #     -- no IAMR, so no KUEP support for now
    #     priv_fault <= eaa_priv and not i_in.priv_mode;
    #     access_ok <= ra_valid and not priv_fault;
    # end process;
    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                    priv_fault, access_ok):
        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                     i_in.nia[:TLB_LG_PGSZ],
                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
                    ))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
            # (the VHDL "else ra_valid <= '0'" case is the nmigen
            # comb default of zero)

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # -- iTLB update
    # itlb_update: process(clk)
    #     variable wr_index : tlb_index_t;
    # begin
    #     if rising_edge(clk) then
    #         wr_index := hash_ea(m_in.addr);
    #         if rst = '1' or
    #          (m_in.tlbie = '1' and m_in.doall = '1') then
    #             -- clear all valid bits
    #             for i in tlb_index_t loop
    #                 itlb_valids(i) <= '0';
    #             end loop;
    #         elsif m_in.tlbie = '1' then
    #             -- clear entry regardless of hit or miss
    #             itlb_valids(wr_index) <= '0';
    #         elsif m_in.tlbld = '1' then
    #             itlb_tags(wr_index) <=
    #              m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
    #             itlb_ptes(wr_index) <= m_in.pte;
    #             itlb_valids(wr_index) <= '1';
    #         end if;
    #     end if;
    # end process;
    # iTLB update (the VHDL rst branch is covered by nmigen's
    # synchronous reset of the valid bits)
    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                     m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
                    )
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # -- Cache hit detection, output to fetch2 and other misc logic
    # icache_comb : process(all)
    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valid_bits, cache_tags, access_ok, req_is_hit,
                    req_is_miss, replace_way, plru_victim, cache_out):
        # variable is_hit  : std_ulogic;
        # variable hit_way : way_t;
        comb = m.d.comb

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # begin
        #     -- i_in.sequential means that i_in.nia this cycle
        #     -- is 4 more than last cycle. If we read more
        #     -- than 32 bits at a time, had a cache hit last
        #     -- cycle, and we don't want the first 32-bit chunk
        #     -- then we can keep the data we read last cycle
        #     -- and just use that.
        #     if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
        #         use_previous <= i_in.sequential and r.hit_valid;
        #     else
        #         use_previous <= '0';
        #     end if;
        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # -- Extract line, row and tag from request
        # req_index <= get_index(i_in.nia);
        # req_row <= get_row(i_in.nia);
        # req_tag <= get_tag(real_addr);
        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # -- Calculate address of beginning of cache row, will be
        # -- used for cache miss processing if needed
        # req_laddr <=
        #  (63 downto REAL_ADDR_BITS => '0') &
        #  real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
        #  (ROW_OFF_BITS-1 downto 0 => '0');
        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0b0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]
                ))

        # -- Test if pending request is a hit on any way
        # hit_way := 0;
        # is_hit := '0';
        # for i in way_t loop
        #     if i_in.req = '1' and
        #         (cache_valids(req_index)(i) = '1' or
        #          (r.state = WAIT_ACK and
        #           req_index = r.store_index and
        #           i = r.store_way and
        #           r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
        #         if read_tag(i, cache_tags(req_index)) = req_tag then
        #             hit_way := i;
        #             is_hit := '1';
        #         end if;
        #     end if;
        # end loop;
        # Test if pending request is a hit on any way
        for i in range(NUM_WAYS):
            with m.If(i_in.req &
                      (cache_valid_bits[req_index][i] |
                       ((r.state == State.WAIT_ACK)
                        & (req_index == r.store_index)
                        & (i == r.store_way)
                        & r.rows_valid[req_row % ROW_PER_LINE]))):
                with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # -- Generate the "hit" and "miss" signals
        # -- for the synchronous blocks
        # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
        #  and rst = '0' then
        #     req_is_hit  <= is_hit;
        #     req_is_miss <= not is_hit;
        # else
        #     req_is_hit  <= '0';
        #     req_is_miss <= '0';
        # end if;
        # req_hit_way <= hit_way;
        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        comb += req_hit_way.eq(hit_way)

        # -- The way to replace on a miss
        # if r.state = CLR_TAG then
        #     replace_way <=
        #      to_integer(unsigned(plru_victim(r.store_index)));
        # else
        #     replace_way <= r.store_way;
        # end if;
        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])

        with m.Else():
            comb += replace_way.eq(r.store_way)

        # -- Output instruction from current cache row
        # --
        # -- Note: This is a mild violation of our design principle of
        # -- having pipeline stages output from a clean latch. In this
        # -- case we output the result of a mux. The alternative would
        # -- be output an entire row which I prefer not to do just yet
        # -- as it would force fetch2 to know about some of the cache
        # -- geometry information.
        # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
        # i_out.valid <= r.hit_valid;
        # i_out.nia <= r.hit_nia;
        # i_out.stop_mark <= r.hit_smark;
        # i_out.fetch_failed <= r.fetch_failed;
        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia,
                                             cache_out[r.hit_way]))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # -- Stall fetch1 if we have a miss on cache or TLB
        # -- or a protection fault
        # stall_out <= not (is_hit and access_ok);
        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # -- Wishbone requests output (from the cache miss reload machine)
        # wishbone_out <= r.wb;
        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)
        # end process;

    # -- Cache hit synchronous machine
    # icache_hit : process(clk)
    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # begin
        #     if rising_edge(clk) then
        #         -- keep outputs to fetch2 unchanged on a stall
        #         -- except that flush or reset sets valid to 0
        #         -- If use_previous, keep the same data as last
        #         -- cycle and use the second half
        #         if stall_in = '1' or use_previous = '1' then
        #             if rst = '1' or flush_in = '1' then
        #                 r.hit_valid <= '0';
        #             end if;
        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        #         else
        #             -- On a hit, latch the request for the next cycle,
        #             -- when the BRAM data will be available on the
        #             -- cache_out output of the corresponding way
        #             r.hit_valid <= req_is_hit;
        #             if req_is_hit = '1' then
        #                 r.hit_way <= req_hit_way;
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)

                # report "cache hit nia:" & to_hstring(i_in.nia) &
                #     " IR:" & std_ulogic'image(i_in.virt_mode) &
                #     " SM:" & std_ulogic'image(i_in.stop_mark) &
                #     " idx:" & integer'image(req_index) &
                #     " tag:" & to_hstring(req_tag) &
                #     " way:" & integer'image(req_hit_way) &
                #     " RA:" & to_hstring(real_addr);
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x "
                                "tag:%x way:%x RA:%x",
                                i_in.nia, i_in.virt_mode, i_in.stop_mark,
                                req_index, req_tag, req_hit_way,
                                real_addr)
        #         end if;
        #         if stall_in = '0' then
        #             -- Send stop marks and NIA down regardless of validity
        #             r.hit_smark <= i_in.stop_mark;
        #             r.hit_nia <= i_in.nia;
        #         end if;
        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)
        #     end if;
        # end process;

    # -- Cache miss/reload synchronous machine
    # icache_miss : process(clk)
    # Cache miss/reload synchronous machine
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        # variable tagset    : cache_tags_set_t;
        # variable stbs_done : boolean;

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        # begin
        #     if rising_edge(clk) then
        #         -- On reset, clear all valid bits to force misses
        #         if rst = '1' then
        #             for i in index_t loop
        #                 cache_valids(i) <= (others => '0');
        #             end loop;
        #             r.state <= IDLE;
        #             r.wb.cyc <= '0';
        #             r.wb.stb <= '0';
        #             -- We only ever do reads on wishbone
        #             r.wb.dat <= (others => '0');
        #             r.wb.sel <= "11111111";
        #             r.wb.we  <= '0';
        #             -- Not useful normally but helps avoiding
        #             -- tons of sim warnings
        #             r.wb.adr <= (others => '0');
        # (on reset, nmigen clears the valid bits, state and wishbone
        # strobes to their default of zero for us)

        # We only ever do reads on wishbone
        comb += r.wb.sel.eq(~0)  # set to all 1s

        #         else
        #             -- Process cache invalidations
        #             if inval_in = '1' then
        #                 for i in index_t loop
        #                     cache_valids(i) <= (others => '0');
        #                 end loop;
        #                 r.store_valid <= '0';
        #             end if;
        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        # -- Main state machine
        # case r.state is
        # Main state machine
        with m.Switch(r.state):

            # when IDLE =>
            with m.Case(State.IDLE):
                # -- Reset per-row valid flags,
                # -- only used in WAIT_ACK
                # for i in 0 to ROW_PER_LINE - 1 loop
                #     r.rows_valid(i) <= '0';
                # end loop;
                # Reset per-row valid flags,
                # only used in WAIT_ACK
                for i in range(ROW_PER_LINE):
                    sync += r.rows_valid[i].eq(0)

                # -- We need to read a cache line
                # if req_is_miss = '1' then
                #     report "cache miss nia:" & to_hstring(i_in.nia) &
                #         " IR:" & std_ulogic'image(i_in.virt_mode) &
                #         " SM:" & std_ulogic'image(i_in.stop_mark) &
                #         " idx:" & integer'image(req_index) &
                #         " way:" & integer'image(replace_way) &
                #         " tag:" & to_hstring(req_tag) &
                #         " RA:" & to_hstring(real_addr);
                # We need to read a cache line
                with m.If(req_is_miss):
                    sync += Display("cache miss nia:%x IR:%x SM:%x "
                                    "idx:%x way:%x tag:%x RA:%x",
                                    i_in.nia, i_in.virt_mode,
                                    i_in.stop_mark, req_index,
                                    replace_way, req_tag, real_addr)

                    # -- Keep track of our index and way for
                    # -- subsequent stores
                    # r.store_index <= req_index;
                    # r.store_row <= get_row(req_laddr);
                    # r.store_tag <= req_tag;
                    # r.store_valid <= '1';
                    # r.end_row_ix <=
                    #  get_row_of_line(get_row(req_laddr)) - 1;
                    # Keep track of our index and way
                    # for subsequent stores
                    sync += r.store_index.eq(req_index)
                    sync += r.store_row.eq(get_row(req_laddr))
                    sync += r.store_tag.eq(req_tag)
                    sync += r.store_valid.eq(1)
                    sync += r.end_row_ix.eq(
                             get_row_of_line(get_row(req_laddr)) - 1
                            )

                    # -- Prep for first wishbone read. We calculate the
                    # -- address of the start of the cache line and
                    # -- start the WB cycle.
                    # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
                    # r.wb.cyc <= '1';
                    # r.wb.stb <= '1';
                    # Prep for first wishbone read.
                    # We calculate the address of the start of the
                    # cache line and start the WB cycle.
                    sync += r.wb.adr.eq(req_laddr)
                    sync += r.wb.cyc.eq(1)
                    sync += r.wb.stb.eq(1)

                    # -- Track that we had one request sent
                    # r.state <= CLR_TAG;
                    # Track that we had one request sent
                    sync += r.state.eq(State.CLR_TAG)
                # end if;

            # when CLR_TAG | WAIT_ACK =>
            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                # if r.state = CLR_TAG then
                with m.If(r.state == State.CLR_TAG):
                    # -- Get victim way from plru
                    # r.store_way <= replace_way;
                    # Get victim way from plru
                    sync += r.store_way.eq(replace_way)

                    # -- Force misses on that way while
                    # -- reloading that line
                    # cache_valids(req_index)(replace_way) <= '0';
                    # Force misses on that way while
                    # reloading that line
                    sync += cache_valid_bits[req_index][replace_way].eq(0)

                    # -- Store new tag in selected way
                    # for i in 0 to NUM_WAYS-1 loop
                    #     if i = replace_way then
                    #         tagset := cache_tags(r.store_index);
                    #         write_tag(i, tagset, r.store_tag);
                    #         cache_tags(r.store_index) <= tagset;
                    #     end if;
                    # end loop;
                    # Store new tag in selected way
                    for i in range(NUM_WAYS):
                        with m.If(i == replace_way):
                            comb += tagset.eq(cache_tags[r.store_index])
                            comb += write_tag(i, tagset, r.store_tag)
                            sync += cache_tags[r.store_index].eq(tagset)

                    # r.state <= WAIT_ACK;
                    sync += r.state.eq(State.WAIT_ACK)
                # end if;

                # -- Requests are all sent if stb is 0
                # stbs_done := r.wb.stb = '0';
                # Requests are all sent if stb is 0
                comb += stbs_done.eq(r.wb.stb == 0)

                # -- If we are still sending requests,
                # -- was one accepted ?
                # if wishbone_in.stall = '0' and not stbs_done then
                # If we are still sending requests,
                # was one accepted?
                with m.If(~wb_in.stall & ~stbs_done):
                    # -- That was the last word ? We are done sending.
                    # -- Clear stb and set stbs_done so we can handle
                    # -- an eventual last ack on the same cycle.
                    # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
                    #     r.wb.stb <= '0';
                    #     stbs_done := true;
                    # end if;
                    # That was the last word? We are done sending.
                    # Clear stb and set stbs_done so we can handle
                    # an eventual last ack on the same cycle.
                    with m.If(is_last_row_addr(r.wb.adr, r.end_row_ix)):
                        sync += r.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                    # -- Calculate the next row address
                    # r.wb.adr <= next_row_addr(r.wb.adr);
                    # Calculate the next row address
                    sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
                # end if;

                # -- Incoming acks processing
                # if wishbone_in.ack = '1' then
                # Incoming acks processing
                with m.If(wb_in.ack):
                    # r.rows_valid(r.store_row mod ROW_PER_LINE)
                    #  <= '1';
                    sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

                    # -- Check for completion
                    # if stbs_done and
                    #  is_last_row(r.store_row, r.end_row_ix) then
                    # Check for completion
                    with m.If(stbs_done &
                              is_last_row(r.store_row, r.end_row_ix)):
                        # -- Complete wishbone cycle
                        # r.wb.cyc <= '0';
                        # Complete wishbone cycle
                        sync += r.wb.cyc.eq(0)

                        # -- Cache line is now valid
                        # cache_valids(r.store_index)(replace_way) <=
                        #  r.store_valid and not inval_in;
                        # Cache line is now valid
                        sync += cache_valid_bits[r.store_index][
                                 replace_way].eq(r.store_valid & ~inval_in)

                        # -- We are done
                        # r.state <= IDLE;
                        # We are done
                        sync += r.state.eq(State.IDLE)
                    # end if;

                    # -- Increment store row counter
                    # r.store_row <= next_row(r.store_row);
                    # Increment store row counter
                    sync += r.store_row.eq(next_row(r.store_row))
                # end if;
        # end case;
        # end if;

        # -- TLB miss and protection fault processing
        # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
        #     r.fetch_failed <= '0';
        # elsif i_in.req = '1' and access_ok = '0' and
        #  stall_in = '0' then
        #     r.fetch_failed <= '1';
        # end if;
        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)
        # end process;

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # -- Output data to logger
        # signal log_data : std_ulogic_vector(53 downto 0);
        # begin
        #     data_log: process(clk)
        #         variable lway: way_t;
        #         variable wstate: std_ulogic;
        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(3)  # 3 bits, to match the 54-bit log layout
            wstate = Signal()

            # begin
            #     if rising_edge(clk) then
            #         lway := req_hit_way;
            #         wstate := '0';
            # lway and wstate are VHDL variables: combinatorial here
            comb += lway.eq(req_hit_way)
            comb += wstate.eq(0)

            # if r.state /= IDLE then
            #     wstate := '1';
            # end if;
            with m.If(r.state != State.IDLE):
                comb += wstate.eq(1)

            # log_data <= i_out.valid &
            #             i_out.insn &
            #             wishbone_in.ack &
            #             r.wb.adr(5 downto 3) &
            #             r.wb.stb & r.wb.cyc &
            #             wishbone_in.stall &
            #             stall_out &
            #             r.fetch_failed &
            #             r.hit_nia(5 downto 2) &
            #             wstate &
            #             std_ulogic_vector(to_unsigned(lway, 3)) &
            #             req_is_hit & req_is_miss &
            #             access_ok &
            #             ra_valid;
            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6],
                     r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
                     r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
                     i_out.valid
                    ))
            # end if;
            # end process;
            # log_out <= log_data;
            comb += log_out.eq(log_data)
        # end generate;
        # end;

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # signal itlb_valids : tlb_valids_t;
        # signal itlb_tags : tlb_tags_t;
        # signal itlb_ptes : tlb_ptes_t;
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPTEArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # -- Privilege bit from PTE EAA field
        # signal eaa_priv  : std_ulogic;
        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # signal r : reg_internal_t;
        r = RegInternal()

        # -- Async signals on incoming request
        # signal req_index   : index_t;
        # signal req_row     : row_t;
        # signal req_hit_way : way_t;
        # signal req_tag     : cache_tag_t;
        # signal req_is_hit  : std_ulogic;
        # signal req_is_miss : std_ulogic;
        # signal req_laddr   : std_ulogic_vector(63 downto 0);
        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # signal tlb_req_index : tlb_index_t;
        # signal real_addr : std_ulogic_vector(
        #  REAL_ADDR_BITS - 1 downto 0
        # );
        # signal ra_valid : std_ulogic;
        # signal priv_fault : std_ulogic;
        # signal access_ok : std_ulogic;
        # signal use_previous : std_ulogic;
        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # signal cache_out : cache_ram_out_t;
        cache_out = CacheRamOut()

        # signal plru_victim : plru_out_t;
        # signal replace_way : way_t;
        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.rams(m, r, cache_out, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
                         real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                         priv_fault, access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row,
                         req_hit_way, req_tag, real_addr, req_laddr,
                         cache_valid_bits, cache_tags, access_ok,
                         req_is_hit, req_is_miss, replace_way,
                         plru_victim, cache_out)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


# icache_tb.vhdl
#
# library ieee;
# use ieee.std_logic_1164.all;
#
# library work;
# use work.common.all;
# use work.wishbone_types.all;
#
# entity icache_tb is
# end icache_tb;
#
# architecture behave of icache_tb is
#     signal clk : std_ulogic;
#     signal rst : std_ulogic;
#
#     signal i_out : Fetch1ToIcacheType;
#     signal i_in  : IcacheToDecode1Type;
#
#     signal m_out : MmuToIcacheType;
#
#     signal wb_bram_in  : wishbone_master_out;
#     signal wb_bram_out : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     icache0: entity work.icache
#         generic map(
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#         )
#         port map(
#             clk => clk,
#             rst => rst,
#             i_in => i_out,
#             i_out => i_in,
#             m_in => m_out,
#             stall_in => '0',
#             flush_in => '0',
#             inval_in => '0',
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#         );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#         )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#         );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         i_out.req <= '0';
#         i_out.nia <= (others => '0');
#         i_out.stop_mark <= '0';
#
#         m_out.tlbld <= '0';
#         m_out.tlbie <= '0';
#         m_out.addr <= (others => '0');
#         m_out.pte <= (others => '0');
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000004";
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000001"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000001"
#             severity failure;
#
#         i_out.req <= '0';
#
#         wait until rising_edge(clk);
#
#         -- hit
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000008";
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000002"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000002"
#             severity failure;
#         wait until rising_edge(clk);
#
#         -- another miss
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000040";
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000010"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000010"
#             severity failure;
#
#         -- test something that aliases
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000100";
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         assert i_in.valid = '0' severity failure;
#         wait until rising_edge(clk);
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000040"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000040"
#             severity failure;
#
#         i_out.req <= '0';
#
#         std.env.finish;
#     end process;
# end;
def icache_sim(dut):
    # testbench-side names mirror the VHDL testbench above: i_out is
    # the request driven *into* the icache (dut.i_in) and i_in is the
    # result read *out* of it (dut.i_out); m_out drives the MMU port
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (the VHDL expects a miss first)
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)


def test_icache():
    dut = ICache()

    m = Module()
    m.submodules.icache = dut

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    test_icache()