# use word_select
# [soc.git] / src / soc / experiment / icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
* Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
22 from enum import Enum, unique
23 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const)
24 from nmigen.cli import main
25 from nmigen.cli import rtlil
26 from nmutil.iocontrol import RecordObject
27 from nmutil.byterev import byte_reverse
28 from nmutil.mask import Mask
29 from nmigen.utils import log2_int
30 from nmutil.util import Display
31
32 from soc.experiment.mem_types import (Fetch1ToICacheType,
33 ICacheToDecode1Type,
34 MMUToICacheType)
35
36 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
37 WB_SEL_BITS, WBAddrType, WBDataType,
38 WBSelType, WBMasterOut, WBSlaveOut,
39 WBMasterOutVector, WBSlaveOutVector,
40 WBIOMasterOut, WBIOSlaveOut)
41
42 from soc.experiment.cache_ram import CacheRam
43 from soc.experiment.plru import PLRU
44
45 # for test
46 from nmigen_soc.wishbone.sram import SRAM
47 from nmigen import Memory
48 from nmigen.cli import rtlil
49 if True:
50 from nmigen.back.pysim import Simulator, Delay, Settle
51 else:
52 from nmigen.sim.cxxsim import Simulator, Delay, Settle
53 from nmutil.util import wrap
54
55
56
57 SIM = 0
58 LINE_SIZE = 64
59 # BRAM organisation: We never access more than wishbone_data_bits
60 # at a time so to save resources we make the array only that wide,
61 # and use consecutive indices for to make a cache "line"
62 #
63 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
64 ROW_SIZE = WB_DATA_BITS // 8
65 # Number of lines in a set
66 NUM_LINES = 32
67 # Number of ways
68 NUM_WAYS = 4
69 # L1 ITLB number of entries (direct mapped)
70 TLB_SIZE = 64
71 # L1 ITLB log_2(page_size)
72 TLB_LG_PGSZ = 12
73 # Number of real address bits that we store
74 REAL_ADDR_BITS = 56
75 # Non-zero to enable log data collection
76 LOG_LENGTH = 0
77
78 ROW_SIZE_BITS = ROW_SIZE * 8
79 # ROW_PER_LINE is the number of row
80 # (wishbone) transactions in a line
81 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
82 # BRAM_ROWS is the number of rows in
83 # BRAM needed to represent the full icache
84 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
85 # INSN_PER_ROW is the number of 32bit
86 # instructions per BRAM row
87 INSN_PER_ROW = ROW_SIZE_BITS // 32
88
89 # Bit fields counts in the address
90 #
91 # INSN_BITS is the number of bits to
92 # select an instruction in a row
93 INSN_BITS = log2_int(INSN_PER_ROW)
94 # ROW_BITS is the number of bits to
95 # select a row
96 ROW_BITS = log2_int(BRAM_ROWS)
97 # ROW_LINEBITS is the number of bits to
98 # select a row within a line
99 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
100 # LINE_OFF_BITS is the number of bits for
101 # the offset in a cache line
102 LINE_OFF_BITS = log2_int(LINE_SIZE)
103 # ROW_OFF_BITS is the number of bits for
104 # the offset in a row
105 ROW_OFF_BITS = log2_int(ROW_SIZE)
106 # INDEX_BITS is the number of bits to
107 # select a cache line
108 INDEX_BITS = log2_int(NUM_LINES)
109 # SET_SIZE_BITS is the log base 2 of
110 # the set size
111 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
112 # TAG_BITS is the number of bits of
113 # the tag part of the address
114 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
115 # WAY_BITS is the number of bits to
116 # select a way
117 WAY_BITS = log2_int(NUM_WAYS)
118 TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
119
120 # -- L1 ITLB.
121 # constant TLB_BITS : natural := log2(TLB_SIZE);
122 # constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
123 # constant TLB_PTE_BITS : natural := 64;
124 TLB_BITS = log2_int(TLB_SIZE)
125 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
126 TLB_PTE_BITS = 64
127
128 # architecture rtl of icache is
129 #constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
130 #-- ROW_PER_LINE is the number of row (wishbone
131 #-- transactions) in a line
132 #constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
133 #-- BRAM_ROWS is the number of rows in BRAM
134 #-- needed to represent the full
135 #-- icache
136 #constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
137 #-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
138 #constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
139 #-- Bit fields counts in the address
140 #
141 #-- INSN_BITS is the number of bits to select
142 #-- an instruction in a row
143 #constant INSN_BITS : natural := log2(INSN_PER_ROW);
144 #-- ROW_BITS is the number of bits to select a row
145 #constant ROW_BITS : natural := log2(BRAM_ROWS);
146 #-- ROW_LINEBITS is the number of bits to
147 #-- select a row within a line
148 #constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
149 #-- LINE_OFF_BITS is the number of bits for the offset
150 #-- in a cache line
151 #constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
152 #-- ROW_OFF_BITS is the number of bits for the offset in a row
153 #constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
154 #-- INDEX_BITS is the number of bits to select a cache line
155 #constant INDEX_BITS : natural := log2(NUM_LINES);
156 #-- SET_SIZE_BITS is the log base 2 of the set size
157 #constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
158 #-- TAG_BITS is the number of bits of the tag part of the address
159 #constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
160 #-- WAY_BITS is the number of bits to select a way
161 #constant WAY_BITS : natural := log2(NUM_WAYS);
162
163 #-- Example of layout for 32 lines of 64 bytes:
164 #--
165 #-- .. tag |index| line |
166 #-- .. | row | |
167 #-- .. | | | |00| zero (2)
168 #-- .. | | |-| | INSN_BITS (1)
169 #-- .. | |---| | ROW_LINEBITS (3)
170 #-- .. | |--- - --| LINE_OFF_BITS (6)
171 #-- .. | |- --| ROW_OFF_BITS (3)
172 #-- .. |----- ---| | ROW_BITS (8)
173 #-- .. |-----| | INDEX_BITS (5)
174 #-- .. --------| | TAG_BITS (53)
175 # Example of layout for 32 lines of 64 bytes:
176 #
177 # .. tag |index| line |
178 # .. | row | |
179 # .. | | | |00| zero (2)
180 # .. | | |-| | INSN_BITS (1)
181 # .. | |---| | ROW_LINEBITS (3)
182 # .. | |--- - --| LINE_OFF_BITS (6)
183 # .. | |- --| ROW_OFF_BITS (3)
184 # .. |----- ---| | ROW_BITS (8)
185 # .. |-----| | INDEX_BITS (5)
186 # .. --------| | TAG_BITS (53)
187
188 #subtype row_t is integer range 0 to BRAM_ROWS-1;
189 #subtype index_t is integer range 0 to NUM_LINES-1;
190 #subtype way_t is integer range 0 to NUM_WAYS-1;
191 #subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
192 #
193 #-- The cache data BRAM organized as described above for each way
194 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
195 #
196 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
197 #-- not handle a clean (commented) definition of the cache tags as a 3d
198 #-- memory. For now, work around it by putting all the tags
199 #subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
200 # type cache_tags_set_t is array(way_t) of cache_tag_t;
201 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
202 #constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
203 #subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
204 #type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    """One tag-RAM row per cache line, all ways' tags concatenated."""
    return Array([Signal(TAG_RAM_WIDTH) for idx in range(NUM_LINES)])
207
208 #-- The cache valid bits
209 #subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
210 #type cache_valids_t is array(index_t) of cache_way_valids_t;
211 #type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    """Per-line cache valid bits, one bit per way."""
    return Array([Signal(NUM_WAYS) for idx in range(NUM_LINES)])
214
def RowPerLineValidArray():
    """One valid bit per (wishbone) row within a cache line."""
    return Array([Signal() for idx in range(ROW_PER_LINE)])
217
218
219 #attribute ram_style : string;
220 #attribute ram_style of cache_tags : signal is "distributed";
221 # TODO to be passed to nigmen as ram attributes
222 # attribute ram_style : string;
223 # attribute ram_style of cache_tags : signal is "distributed";
224
225
226 #subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
227 #type tlb_valids_t is array(tlb_index_t) of std_ulogic;
228 #subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
229 #type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
230 #subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
231 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    """One valid bit per direct-mapped iTLB entry."""
    return Array([Signal() for idx in range(TLB_SIZE)])
234
def TLBTagArray():
    """One effective-address tag per iTLB entry."""
    return Array([Signal(TLB_EA_TAG_BITS) for idx in range(TLB_SIZE)])
237
def TLBPTEArray():
    """One page-table entry per iTLB entry."""
    return Array([Signal(TLB_PTE_BITS) for idx in range(TLB_SIZE)])
240
241
242 #-- Cache RAM interface
243 #type cache_ram_out_t is array(way_t) of cache_row_t;
244 # Cache RAM interface
# Cache RAM interface
def CacheRamOut():
    """Cache RAM read data: one full BRAM row per way."""
    return Array([Signal(ROW_SIZE_BITS) for idx in range(NUM_WAYS)])
247
248 #-- PLRU output interface
249 #type plru_out_t is array(index_t) of
250 # std_ulogic_vector(WAY_BITS-1 downto 0);
251 # PLRU output interface
# PLRU output interface
def PLRUOut():
    """PLRU victim-way selection, one WAY_BITS value per cache line."""
    return Array([Signal(WAY_BITS) for idx in range(NUM_LINES)])
254
255 # -- Return the cache line index (tag index) for an address
256 # function get_index(addr: std_ulogic_vector(63 downto 0))
257 # return index_t is
258 # begin
259 # return to_integer(unsigned(
260 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
261 # ));
262 # end;
263 # Return the cache line index (tag index) for an address
def get_index(addr):
    """Return the cache line index (tag index) field of *addr*."""
    line_index = addr[LINE_OFF_BITS:SET_SIZE_BITS]
    return line_index
266
267 # -- Return the cache row index (data memory) for an address
268 # function get_row(addr: std_ulogic_vector(63 downto 0))
269 # return row_t is
270 # begin
271 # return to_integer(unsigned(
272 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
273 # ));
274 # end;
275 # Return the cache row index (data memory) for an address
def get_row(addr):
    """Return the cache row index (BRAM data index) field of *addr*."""
    row_index = addr[ROW_OFF_BITS:SET_SIZE_BITS]
    return row_index
278
279 # -- Return the index of a row within a line
280 # function get_row_of_line(row: row_t) return row_in_line_t is
281 # variable row_v : unsigned(ROW_BITS-1 downto 0);
282 # begin
283 # row_v := to_unsigned(row, ROW_BITS);
284 # return row_v(ROW_LINEBITS-1 downto 0);
285 # end;
286 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of a row within its cache line.

    Bug fix: the original computed the slice but never returned it,
    so every caller (e.g. is_last_row) received None.
    """
    return row[:ROW_LINE_BITS]
289
290 # -- Returns whether this is the last row of a line
291 # function is_last_row_addr(addr: wishbone_addr_type;
292 # last: row_in_line_t
293 # )
294 # return boolean is
295 # begin
296 # return unsigned(
297 # addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
298 # ) = last;
299 # end;
300 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """True when *addr* points at the last row of its cache line."""
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
303
304 # -- Returns whether this is the last row of a line
305 # function is_last_row(row: row_t;
306 # last: row_in_line_t) return boolean is
307 # begin
308 # return get_row_of_line(row) = last;
309 # end;
310 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """True when *row* is the last row of its cache line."""
    row_in_line = get_row_of_line(row)
    return row_in_line == last
313
314 # -- Return the address of the next row in the current cache line
315 # function next_row_addr(addr: wishbone_addr_type)
316 # return std_ulogic_vector is
317 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
318 # variable result : wishbone_addr_type;
319 # begin
320 # -- Is there no simpler way in VHDL to generate that 3 bits adder ?
321 # row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
322 # row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
323 # result := addr;
324 # result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
325 # return result;
326 # end;
327 # Return the address of the next row in the current cache line
def next_row_addr(addr):
    """Return *addr* advanced to the next row of the same cache line.

    Implements the VHDL stub the original left as ``pass``: only the
    row-within-line field (ROW_OFF_BITS..LINE_OFF_BITS) is incremented,
    wrapping within the line, which keeps the generated adder to
    ROW_LINE_BITS wide.  The double assignment in the VHDL is just
    "copy addr, then overwrite the row-index field"; here that is a
    single Cat() of the unchanged low bits, the incremented row index,
    and the unchanged high bits.
    """
    row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
    return Cat(addr[:ROW_OFF_BITS], row_idx[:ROW_LINE_BITS],
               addr[LINE_OFF_BITS:])
332
333 # -- Return the next row in the current cache line. We use a dedicated
334 # -- function in order to limit the size of the generated adder to be
335 # -- only the bits within a cache line (3 bits with default settings)
336 # function next_row(row: row_t) return row_t is
337 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
338 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
339 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
340 # begin
341 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
342 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
343 # row_v(ROW_LINEBITS-1 downto 0) :=
344 # std_ulogic_vector(unsigned(row_idx) + 1);
345 # return to_integer(unsigned(row_v));
346 # end;
347 # Return the next row in the current cache line. We use a dedicated
348 # function in order to limit the size of the generated adder to be
349 # only the bits within a cache line (3 bits with default settings)
# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    """Return the next row index within the current cache line.

    Implements the VHDL stub the original left as ``pass``: increment
    only the low ROW_LINE_BITS of the row number (wrapping within the
    line) and keep the upper bits, so the synthesized adder is only
    ROW_LINE_BITS wide.
    """
    row_idx = row[:ROW_LINE_BITS] + 1
    return Cat(row_idx[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
354
355 # -- Read the instruction word for the given address in the
356 # -- current cache row
357 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
358 # data: cache_row_t) return std_ulogic_vector is
359 # variable word: integer range 0 to INSN_PER_ROW-1;
360 # begin
361 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
362 # return data(31+word*32 downto word*32);
363 # end;
364 # Read the instruction word for the given address
365 # in the current cache row
def read_insn_word(addr, data):
    """Return the 32-bit instruction word selected by *addr* from
    cache row *data*.

    Bug fix: the word-select field is addr[2:INSN_BITS+2] (INSN_BITS
    wide, matching VHDL ``addr(INSN_BITS+2-1 downto 2)``).  The
    original sliced one bit too many (INSN_BITS+3), so word_select
    could index past the end of the row.
    """
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
369
370 # -- Get the tag value from the address
371 # function get_tag(
372 # addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
373 # )
374 # return cache_tag_t is
375 # begin
376 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
377 # end;
378 # Get the tag value from the address
def get_tag(addr):
    """Extract the tag field from a real address."""
    tag = addr[SET_SIZE_BITS:REAL_ADDR_BITS]
    return tag
381
382 # -- Read a tag from a tag memory row
383 # function read_tag(way: way_t; tagset: cache_tags_set_t)
384 # return cache_tag_t is
385 # begin
386 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
387 # end;
388 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Return *way*'s TAG_BITS-wide tag from tag-RAM row *tagset*."""
    lo = way * TAG_BITS
    return tagset[lo:lo + TAG_BITS]
391
392 # -- Write a tag to tag memory row
393 # procedure write_tag(way: in way_t;
394 # tagset: inout cache_tags_set_t; tag: cache_tag_t) is
395 # begin
396 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
397 # end;
398 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Return an assignment that writes *tag* into *way*'s slot of
    tag-RAM row *tagset*.

    Bug fix: nmigen Values do not support item assignment, so the
    original ``tagset[...] = tag`` raised TypeError at elaboration.
    The .eq() statement is returned instead; the caller must add it
    to a comb/sync domain.
    """
    return tagset[way * TAG_BITS:(way + 1) * TAG_BITS].eq(tag)
401
402 # -- Simple hash for direct-mapped TLB index
403 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
404 # return tlb_index_t is
405 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
406 # begin
407 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
408 # xor addr(
409 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
410 # TLB_LG_PGSZ + TLB_BITS
411 # )
412 # xor addr(
413 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
414 # TLB_LG_PGSZ + 2 * TLB_BITS
415 # );
416 # return to_integer(unsigned(hash));
417 # end;
418 # Simple hash for direct-mapped TLB index
def hash_ea(addr):
    """XOR-fold three TLB_BITS-wide fields of *addr* (above the page
    offset) into a direct-mapped iTLB index."""
    fold0 = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS]
    fold1 = addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS]
    fold2 = addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS]
    return fold0 ^ fold1 ^ fold2
426
427 # begin
428 #
429 # assert LINE_SIZE mod ROW_SIZE = 0;
430 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
431 # severity FAILURE;
432 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
433 # severity FAILURE;
434 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
435 # severity FAILURE;
436 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
437 # severity FAILURE;
438 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
439 # report "geometry bits don't add up" severity FAILURE;
440 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
441 # report "geometry bits don't add up" severity FAILURE;
442 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
443 # report "geometry bits don't add up" severity FAILURE;
444 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
445 # report "geometry bits don't add up" severity FAILURE;
446 #
447 # sim_debug: if SIM generate
448 # debug: process
449 # begin
450 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
451 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
452 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
453 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
454 # report "INSN_BITS = " & natural'image(INSN_BITS);
455 # report "ROW_BITS = " & natural'image(ROW_BITS);
456 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
457 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
458 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
459 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
460 # report "TAG_BITS = " & natural'image(TAG_BITS);
461 # report "WAY_BITS = " & natural'image(WAY_BITS);
462 # wait;
463 # end process;
464 # end generate;
465
466 # Cache reload state machine
@unique
class State(Enum):
    """Cache reload state machine states."""
    IDLE = 0      # no reload in progress
    CLR_TAG = 1   # victim way being selected (see replace_way mux)
    WAIT_ACK = 2  # refilling a line; waiting for wishbone acks
472
473 # type reg_internal_t is record
474 # -- Cache hit state (Latches for 1 cycle BRAM access)
475 # hit_way : way_t;
476 # hit_nia : std_ulogic_vector(63 downto 0);
477 # hit_smark : std_ulogic;
478 # hit_valid : std_ulogic;
479 #
480 # -- Cache miss state (reload state machine)
481 # state : state_t;
482 # wb : wishbone_master_out;
483 # store_way : way_t;
484 # store_index : index_t;
485 # store_row : row_t;
486 # store_tag : cache_tag_t;
487 # store_valid : std_ulogic;
488 # end_row_ix : row_in_line_t;
489 # rows_valid : row_per_line_valid_t;
490 #
491 # -- TLB miss state
492 # fetch_failed : std_ulogic;
493 # end record;
class RegInternal(RecordObject):
    """Internal latched state (VHDL reg_internal_t): cache-hit outputs,
    the reload state machine, and TLB-miss status.

    Field declaration order defines the RecordObject layout, so it is
    kept exactly as in the VHDL record.
    """
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        # NOTE(review): hit_way/store_way/store_index/store_row widths
        # are the entry *counts* (NUM_WAYS/NUM_LINES/BRAM_ROWS) rather
        # than index widths (WAY_BITS/INDEX_BITS/ROW_BITS) -- wider
        # than needed but functional; confirm intended.
        self.hit_way = Signal(NUM_WAYS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State)          # State enum: IDLE/CLR_TAG/WAIT_ACK
        self.wb = WBMasterOut()             # wishbone request being issued
        self.store_way = Signal(NUM_WAYS)
        self.store_index = Signal(NUM_LINES)
        self.store_row = Signal(BRAM_ROWS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)   # last row of line to fetch
        self.rows_valid = RowPerLineValidArray()  # rows of line already loaded

        # TLB miss state
        self.fetch_failed = Signal()
516
517 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
518 #
519 # entity icache is
520 # generic (
521 # SIM : boolean := false;
522 # -- Line size in bytes
523 # LINE_SIZE : positive := 64;
524 # -- BRAM organisation: We never access more
525 # -- than wishbone_data_bits
526 # -- at a time so to save resources we make the
527 # -- array only that wide,
528 # -- and use consecutive indices for to make a cache "line"
529 # --
530 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
531 # -- so 64-bits)
532 # ROW_SIZE : positive := wishbone_data_bits / 8;
533 # -- Number of lines in a set
534 # NUM_LINES : positive := 32;
535 # -- Number of ways
536 # NUM_WAYS : positive := 4;
537 # -- L1 ITLB number of entries (direct mapped)
538 # TLB_SIZE : positive := 64;
539 # -- L1 ITLB log_2(page_size)
540 # TLB_LG_PGSZ : positive := 12;
541 # -- Number of real address bits that we store
542 # REAL_ADDR_BITS : positive := 56;
543 # -- Non-zero to enable log data collection
544 # LOG_LENGTH : natural := 0
545 # );
546 # port (
547 # clk : in std_ulogic;
548 # rst : in std_ulogic;
549 #
550 # i_in : in Fetch1ToIcacheType;
551 # i_out : out IcacheToDecode1Type;
552 #
553 # m_in : in MmuToIcacheType;
554 #
555 # stall_in : in std_ulogic;
556 # stall_out : out std_ulogic;
557 # flush_in : in std_ulogic;
558 # inval_in : in std_ulogic;
559 #
560 # wishbone_out : out wishbone_master_out;
561 # wishbone_in : in wishbone_slave_out;
562 #
563 # log_out : out std_ulogic_vector(53 downto 0)
564 # );
565 # end entity icache;
566 # 64 bit direct mapped icache. All instructions are 4B aligned.
567 class ICache(Elaboratable):
568 """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        """Declare the icache's external ports (mirrors the VHDL
        ``entity icache`` port list)."""
        self.i_in = Fetch1ToICacheType()     # fetch request from fetch1
        self.i_out = ICacheToDecode1Type()   # instruction out to decode1

        self.m_in = MMUToICacheType()        # TLB load/invalidate from MMU

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut()          # wishbone master (line refill)
        self.wb_in = WBSlaveOut()            # wishbone response

        self.log_out = Signal(54)            # debug log data (LOG_LENGTH)
584
585
586 # -- Generate a cache RAM for each way
587 # rams: for i in 0 to NUM_WAYS-1 generate
588 # signal do_read : std_ulogic;
589 # signal do_write : std_ulogic;
590 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
591 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
592 # signal dout : cache_row_t;
593 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
594 # begin
595 # way: entity work.cache_ram
596 # generic map (
597 # ROW_BITS => ROW_BITS,
598 # WIDTH => ROW_SIZE_BITS
599 # )
600 # port map (
601 # clk => clk,
602 # rd_en => do_read,
603 # rd_addr => rd_addr,
604 # rd_data => dout,
605 # wr_sel => wr_sel,
606 # wr_addr => wr_addr,
607 # wr_data => wishbone_in.dat
608 # );
609 # process(all)
610 # begin
611 # do_read <= not (stall_in or use_previous);
612 # do_write <= '0';
613 # if wishbone_in.ack = '1' and replace_way = i then
614 # do_write <= '1';
615 # end if;
616 # cache_out(i) <= dout;
617 # rd_addr <=
618 # std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
619 # wr_addr <=
620 # std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
621 # for i in 0 to ROW_SIZE-1 loop
622 # wr_sel(i) <= do_write;
623 # end loop;
624 # end process;
625 # end generate;
626 def rams(self, m, r, cache_out, use_previous, replace_way, req_row):
627 comb = m.d.comb
628
629 wb_in, stall_in = self.wb_in, self.stall_in
630
631 do_read = Signal()
632 do_write = Signal()
633 rd_addr = Signal(ROW_BITS)
634 wr_addr = Signal(ROW_BITS)
635 _d_out = Signal(ROW_SIZE_BITS)
636 wr_sel = Signal(ROW_SIZE)
637
638 for i in range(NUM_WAYS):
639 way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
640 comb += way.rd_en.eq(do_read)
641 comb += way.rd_addr.eq(rd_addr)
642 comb += way.rd_data_o.eq(_d_out)
643 comb += way.wr_sel.eq(wr_sel)
644 comb += way.wr_addr.eq(wr_addr)
645 comb += way.wr_data.eq(wb_in.dat)
646
647 comb += do_read.eq(~(stall_in | use_previous))
648 comb += do_write.eq(0)
649
650 with m.If(wb_in.ack & (replace_way == i)):
651 comb += do_write.eq(1)
652
653 comb += cache_out[i].eq(_d_out)
654 comb += rd_addr.eq(req_row)
655 comb += wr_addr.eq(r.store_row)
656 for j in range(ROW_SIZE):
657 comb += wr_sel[j].eq(do_write)
658
659 # -- Generate PLRUs
660 # maybe_plrus: if NUM_WAYS > 1 generate
661 # begin
662 # plrus: for i in 0 to NUM_LINES-1 generate
663 # -- PLRU interface
664 # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
665 # signal plru_acc_en : std_ulogic;
666 # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
667 #
668 # begin
669 # plru : entity work.plru
670 # generic map (
671 # BITS => WAY_BITS
672 # )
673 # port map (
674 # clk => clk,
675 # rst => rst,
676 # acc => plru_acc,
677 # acc_en => plru_acc_en,
678 # lru => plru_out
679 # );
680 #
681 # process(all)
682 # begin
683 # -- PLRU interface
684 # if get_index(r.hit_nia) = i then
685 # plru_acc_en <= r.hit_valid;
686 # else
687 # plru_acc_en <= '0';
688 # end if;
689 # plru_acc <=
690 # std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
691 # plru_victim(i) <= plru_out;
692 # end process;
693 # end generate;
694 # end generate;
695 def maybe_plrus(self, m, r, plru_victim):
696 comb = m.d.comb
697
698 with m.If(NUM_WAYS > 1):
699 for i in range(NUM_LINES):
700 plru_acc = Signal(WAY_BITS)
701 plru_acc_en = Signal()
702 plru_out = Signal(WAY_BITS)
703 plru = PLRU(WAY_BITS)
704 comb += plru.acc.eq(plru_acc)
705 comb += plru.acc_en.eq(plru_acc_en)
706 comb += plru.lru_o.eq(plru_out)
707
708 # PLRU interface
709 with m.If(get_index(r.hit_nia) == i):
710 comb += plru.acc_en.eq(r.hit_valid)
711
712 with m.Else():
713 comb += plru.acc_en.eq(0)
714
715 comb += plru.acc.eq(r.hit_way)
716 comb += plru_victim[i].eq(plru.lru_o)
717
718 # -- TLB hit detection and real address generation
719 # itlb_lookup : process(all)
720 # variable pte : tlb_pte_t;
721 # variable ttag : tlb_tag_t;
722 # begin
723 # tlb_req_index <= hash_ea(i_in.nia);
724 # pte := itlb_ptes(tlb_req_index);
725 # ttag := itlb_tags(tlb_req_index);
726 # if i_in.virt_mode = '1' then
727 # real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
728 # i_in.nia(TLB_LG_PGSZ - 1 downto 0);
729 # if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
730 # ra_valid <= itlb_valids(tlb_req_index);
731 # else
732 # ra_valid <= '0';
733 # end if;
734 # eaa_priv <= pte(3);
735 # else
736 # real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
737 # ra_valid <= '1';
738 # eaa_priv <= '1';
739 # end if;
740 #
741 # -- no IAMR, so no KUEP support for now
742 # priv_fault <= eaa_priv and not i_in.priv_mode;
743 # access_ok <= ra_valid and not priv_fault;
744 # end process;
745 # TLB hit detection and real address generation
746 def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
747 real_addr, itlb_valid_bits, ra_valid, eaa_priv,
748 priv_fault, access_ok):
749 comb = m.d.comb
750
751 i_in = self.i_in
752
753 pte = Signal(TLB_PTE_BITS)
754 ttag = Signal(TLB_EA_TAG_BITS)
755
756 comb += tlb_req_index.eq(hash_ea(i_in.nia))
757 comb += pte.eq(itlb_ptes[tlb_req_index])
758 comb += ttag.eq(itlb_tags[tlb_req_index])
759
760 with m.If(i_in.virt_mode):
761 comb += real_addr.eq(Cat(
762 i_in.nia[:TLB_LG_PGSZ],
763 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
764 ))
765
766 with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
767 comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
768
769 with m.Else():
770 comb += ra_valid.eq(0)
771
772 with m.Else():
773 comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
774 comb += ra_valid.eq(1)
775 comb += eaa_priv.eq(1)
776
777 # No IAMR, so no KUEP support for now
778 comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
779 comb += access_ok.eq(ra_valid & ~priv_fault)
780
781 # -- iTLB update
782 # itlb_update: process(clk)
783 # variable wr_index : tlb_index_t;
784 # begin
785 # if rising_edge(clk) then
786 # wr_index := hash_ea(m_in.addr);
787 # if rst = '1' or
788 # (m_in.tlbie = '1' and m_in.doall = '1') then
789 # -- clear all valid bits
790 # for i in tlb_index_t loop
791 # itlb_valids(i) <= '0';
792 # end loop;
793 # elsif m_in.tlbie = '1' then
794 # -- clear entry regardless of hit or miss
795 # itlb_valids(wr_index) <= '0';
796 # elsif m_in.tlbld = '1' then
797 # itlb_tags(wr_index) <=
798 # m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
799 # itlb_ptes(wr_index) <= m_in.pte;
800 # itlb_valids(wr_index) <= '1';
801 # end if;
802 # end if;
803 # end process;
804 # iTLB update
805 def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
806 comb = m.d.comb
807 sync = m.d.sync
808
809 m_in = self.m_in
810
811 wr_index = Signal(TLB_SIZE)
812 comb += wr_index.eq(hash_ea(m_in.addr))
813
814 with m.If(m_in.tlbie & m_in.doall):
815 # Clear all valid bits
816 for i in range(TLB_SIZE):
817 sync += itlb_valid_bits[i].eq(0)
818
819 with m.Elif(m_in.tlbie):
820 # Clear entry regardless of hit or miss
821 sync += itlb_valid_bits[wr_index].eq(0)
822
823 with m.Elif(m_in.tlbld):
824 sync += itlb_tags[wr_index].eq(
825 m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
826 )
827 sync += itlb_ptes[wr_index].eq(m_in.pte)
828 sync += itlb_valid_bits[wr_index].eq(1)
829
830 # -- Cache hit detection, output to fetch2 and other misc logic
831 # icache_comb : process(all)
832 # Cache hit detection, output to fetch2 and other misc logic
833 def icache_comb(self, m, use_previous, r, req_index, req_row,
834 req_tag, real_addr, req_laddr, cache_valid_bits,
835 cache_tags, access_ok, req_is_hit,
836 req_is_miss, replace_way, plru_victim, cache_out):
837 # variable is_hit : std_ulogic;
838 # variable hit_way : way_t;
839 comb = m.d.comb
840
841 i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
842 flush_in, stall_out = self.flush_in, self.stall_out
843
844 is_hit = Signal()
845 hit_way = Signal(NUM_WAYS)
846 # begin
847 # -- i_in.sequential means that i_in.nia this cycle
848 # -- is 4 more than last cycle. If we read more
849 # -- than 32 bits at a time, had a cache hit last
850 # -- cycle, and we don't want the first 32-bit chunk
851 # -- then we can keep the data we read last cycle
852 # -- and just use that.
853 # if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
854 # use_previous <= i_in.sequential and r.hit_valid;
855 # else
856 # use_previous <= '0';
857 # end if;
858 # i_in.sequential means that i_in.nia this cycle is 4 more than
859 # last cycle. If we read more than 32 bits at a time, had a
860 # cache hit last cycle, and we don't want the first 32-bit chunk
861 # then we can keep the data we read last cycle and just use that.
862 with m.If(i_in.nia[2:INSN_BITS+2] != 0):
863 comb += use_previous.eq(i_in.sequential & r.hit_valid)
864
865 with m.Else():
866 comb += use_previous.eq(0)
867
868 # -- Extract line, row and tag from request
869 # req_index <= get_index(i_in.nia);
870 # req_row <= get_row(i_in.nia);
871 # req_tag <= get_tag(real_addr);
872 # Extract line, row and tag from request
873 comb += req_index.eq(get_index(i_in.nia))
874 comb += req_row.eq(get_row(i_in.nia))
875 comb += req_tag.eq(get_tag(real_addr))
876
877 # -- Calculate address of beginning of cache row, will be
878 # -- used for cache miss processing if needed
879 # req_laddr <=
880 # (63 downto REAL_ADDR_BITS => '0') &
881 # real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
882 # (ROW_OFF_BITS-1 downto 0 => '0');
883 # Calculate address of beginning of cache row, will be
884 # used for cache miss processing if needed
885 comb += req_laddr.eq(Cat(
886 Const(0b0, ROW_OFF_BITS),
887 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
888 Const(0, REAL_ADDR_BITS)
889 ))
890
891 # -- Test if pending request is a hit on any way
892 # hit_way := 0;
893 # is_hit := '0';
894 # for i in way_t loop
895 # if i_in.req = '1' and
896 # (cache_valids(req_index)(i) = '1' or
897 # (r.state = WAIT_ACK and
898 # req_index = r.store_index and
899 # i = r.store_way and
900 # r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
901 # if read_tag(i, cache_tags(req_index)) = req_tag then
902 # hit_way := i;
903 # is_hit := '1';
904 # end if;
905 # end if;
906 # end loop;
907 # Test if pending request is a hit on any way
908 for i in range(NUM_WAYS):
909 with m.If(i_in.req &
910 (cache_valid_bits[req_index][i] |
911 ((r.state == State.WAIT_ACK)
912 & (req_index == r.store_index)
913 & (i == r.store_way)
914 & r.rows_valid[req_row % ROW_PER_LINE]))):
915 with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
916 comb += hit_way.eq(i)
917 comb += is_hit.eq(1)
918
919 # -- Generate the "hit" and "miss" signals
920 # -- for the synchronous blocks
921 # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
922 # and rst = '0' then
923 # req_is_hit <= is_hit;
924 # req_is_miss <= not is_hit;
925 # else
926 # req_is_hit <= '0';
927 # req_is_miss <= '0';
928 # end if;
929 # req_hit_way <= hit_way;
930 # Generate the "hit" and "miss" signals
931 # for the synchronous blocks
932 with m.If(i_in.req & access_ok & ~flush_in):
933 comb += req_is_hit.eq(is_hit)
934 comb += req_is_miss.eq(~is_hit)
935
936 with m.Else():
937 comb += req_is_hit.eq(0)
938 comb += req_is_miss.eq(0)
939
940 # -- The way to replace on a miss
941 # if r.state = CLR_TAG then
942 # replace_way <=
943 # to_integer(unsigned(plru_victim(r.store_index)));
944 # else
945 # replace_way <= r.store_way;
946 # end if;
947 # The way to replace on a miss
948 with m.If(r.state == State.CLR_TAG):
949 comb += replace_way.eq(plru_victim[r.store_index])
950
951 with m.Else():
952 comb += replace_way.eq(r.store_way)
953
954 # -- Output instruction from current cache row
955 # --
956 # -- Note: This is a mild violation of our design principle of
957 # -- having pipeline stages output from a clean latch. In this
958 # -- case we output the result of a mux. The alternative would
959 # -- be output an entire row which I prefer not to do just yet
960 # -- as it would force fetch2 to know about some of the cache
961 # -- geometry information.
962 # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
963 # i_out.valid <= r.hit_valid;
964 # i_out.nia <= r.hit_nia;
965 # i_out.stop_mark <= r.hit_smark;
966 # i_out.fetch_failed <= r.fetch_failed;
967 # Output instruction from current cache row
968 #
969 # Note: This is a mild violation of our design principle of
970 # having pipeline stages output from a clean latch. In this
971 # case we output the result of a mux. The alternative would
972 # be output an entire row which I prefer not to do just yet
973 # as it would force fetch2 to know about some of the cache
974 # geometry information.
975 comb += i_out.insn.eq(
976 read_insn_word(r.hit_nia, cache_out[r.hit_way])
977 )
978 comb += i_out.valid.eq(r.hit_valid)
979 comb += i_out.nia.eq(r.hit_nia)
980 comb += i_out.stop_mark.eq(r.hit_smark)
981 comb += i_out.fetch_failed.eq(r.fetch_failed)
982
983 # -- Stall fetch1 if we have a miss on cache or TLB
984 # -- or a protection fault
985 # stall_out <= not (is_hit and access_ok);
986 # Stall fetch1 if we have a miss on cache or TLB
987 # or a protection fault
988 comb += stall_out.eq(~(is_hit & access_ok))
989
990 # -- Wishbone requests output (from the cache miss reload machine)
991 # wishbone_out <= r.wb;
992 # Wishbone requests output (from the cache miss reload machine)
993 comb += wb_out.eq(r.wb)
994 # end process;
995
996 # -- Cache hit synchronous machine
997 # icache_hit : process(clk)
998 # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        """Cache hit synchronous machine.

        Translation of the microwatt icache.vhdl ``icache_hit`` process.
        Latches the combinational hit decode into ``r`` so that on the
        next cycle, when the BRAM data appears on the corresponding
        way's ``cache_out``, fetch2 can consume it.

        :param m:            nmigen Module being elaborated
        :param use_previous: hold last cycle's output (second half-word)
        :param r:            internal register record (sync domain)
        :param req_is_hit:   combinational "this request hits" flag
        :param req_hit_way:  way number that hit
        :param req_index:    cache line index of the request
        :param req_tag:      tag of the request
        :param real_addr:    translated real address (debug print only)
        """
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # VHDL:
        #   if stall_in = '1' or use_previous = '1' then
        #       if rst = '1' or flush_in = '1' then
        #           r.hit_valid <= '0';
        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        # NOTE(review): the VHDL also clears hit_valid on rst; here only
        # flush_in does -- this relies on the nmigen sync domain reset
        # zeroing r.hit_valid. Confirm RegInternal gives it reset=0.
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        # On a hit, latch the request for the next cycle,
        # when the BRAM data will be available on the
        # cache_out output of the corresponding way
        with m.Else():
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)

                # NOTE(review): this print runs once at *elaboration*
                # time and shows Signal reprs, not simulated values.
                # The VHDL "report" was a simulation-time message;
                # consider nmutil Display instead -- TODO confirm.
                print(f"cache hit nia:{i_in.nia}, " \
                      f"IR:{i_in.virt_mode}, " \
                      f"SM:{i_in.stop_mark}, idx:{req_index}, " \
                      f"tag:{req_tag}, way:{req_hit_way}, " \
                      f"RA:{real_addr}")

        # Send stop marks and NIA down regardless of validity
        # (VHDL: if stall_in = '0' then ... end if)
        with m.If(~stall_in):
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)
1064
1065 # -- Cache miss/reload synchronous machine
1066 # icache_miss : process(clk)
1067 # Cache miss/reload synchronous machine
1068 def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
1069 req_index, req_laddr, req_tag, replace_way,
1070 cache_tags, access_ok):
1071 comb = m.d.comb
1072 sync = m.d.sync
1073
1074 i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
1075 stall_in, flush_in = self.stall_in, self.flush_in
1076 inval_in = self.inval_in
1077
1078 # variable tagset : cache_tags_set_t;
1079 # variable stbs_done : boolean;
1080
1081 tagset = Signal(TAG_RAM_WIDTH)
1082 stbs_done = Signal()
1083
1084 # begin
1085 # if rising_edge(clk) then
1086 # -- On reset, clear all valid bits to force misses
1087 # if rst = '1' then
1088 # On reset, clear all valid bits to force misses
1089 with m.If('''TODO rst nmigen'''):
1090 # for i in index_t loop
1091 # cache_valids(i) <= (others => '0');
1092 # end loop;
1093 for i in Signal(NUM_LINES):
1094 sync += cache_valid_bits[i].eq(~1)
1095
1096 # r.state <= IDLE;
1097 # r.wb.cyc <= '0';
1098 # r.wb.stb <= '0';
1099 sync += r.state.eq(State.IDLE)
1100 sync += r.wb.cyc.eq(0)
1101 sync += r.wb.stb.eq(0)
1102
1103 # -- We only ever do reads on wishbone
1104 # r.wb.dat <= (others => '0');
1105 # r.wb.sel <= "11111111";
1106 # r.wb.we <= '0';
1107 # We only ever do reads on wishbone
1108 sync += r.wb.dat.eq(~1)
1109 sync += r.wb.sel.eq(Const(0b11111111, 8))
1110 sync += r.wb.we.eq(0)
1111
1112 # -- Not useful normally but helps avoiding
1113 # -- tons of sim warnings
1114 # r.wb.adr <= (others => '0');
1115 # Not useful normally but helps avoiding tons of sim warnings
1116 sync += r.wb.adr.eq(~1)
1117
1118 # else
1119 with m.Else():
1120 # -- Process cache invalidations
1121 # if inval_in = '1' then
1122 # for i in index_t loop
1123 # cache_valids(i) <= (others => '0');
1124 # end loop;
1125 # r.store_valid <= '0';
1126 # end if;
1127 # Process cache invalidations
1128 with m.If(inval_in):
1129 for i in range(NUM_LINES):
1130 sync += cache_valid_bits[i].eq(~1)
1131
1132 sync += r.store_valid.eq(0)
1133
1134 # -- Main state machine
1135 # case r.state is
1136 # Main state machine
1137 with m.Switch(r.state):
1138
1139 # when IDLE =>
1140 with m.Case(State.IDLE):
1141 # -- Reset per-row valid flags,
1142 # -- only used in WAIT_ACK
1143 # for i in 0 to ROW_PER_LINE - 1 loop
1144 # r.rows_valid(i) <= '0';
1145 # end loop;
1146 # Reset per-row valid flags,
1147 # only used in WAIT_ACK
1148 for i in range(ROW_PER_LINE):
1149 sync += r.rows_valid[i].eq(0)
1150
1151 # -- We need to read a cache line
1152 # if req_is_miss = '1' then
1153 # report "cache miss nia:" & to_hstring(i_in.nia) &
1154 # " IR:" & std_ulogic'image(i_in.virt_mode) &
1155 # " SM:" & std_ulogic'image(i_in.stop_mark) &
1156 # " idx:" & integer'image(req_index) &
1157 # " way:" & integer'image(replace_way) &
1158 # " tag:" & to_hstring(req_tag) &
1159 # " RA:" & to_hstring(real_addr);
1160 # We need to read a cache line
1161 with m.If(req_is_miss):
1162 print(f"cache miss nia:{i_in.nia} " \
1163 f"IR:{i_in.virt_mode} " \
1164 f"SM:{i_in.stop_mark} " \
1165 F"idx:{req_index} " \
1166 f"way:{replace_way} tag:{req_tag} " \
1167 f"RA:{real_addr}")
1168
1169 # -- Keep track of our index and way for
1170 # -- subsequent stores
1171 # r.store_index <= req_index;
1172 # r.store_row <= get_row(req_laddr);
1173 # r.store_tag <= req_tag;
1174 # r.store_valid <= '1';
1175 # r.end_row_ix <=
1176 # get_row_of_line(get_row(req_laddr)) - 1;
1177 # Keep track of our index and way
1178 # for subsequent stores
1179 sync += r.store_index.eq(req_index)
1180 sync += r.store_row.eq(get_row(req_laddr))
1181 sync += r.store_tag.eq(req_tag)
1182 sync += r.store_valid.eq(1)
1183 sync += r.end_row_ix.eq(
1184 get_row_of_line(
1185 get_row(req_laddr)
1186 ) - 1
1187 )
1188
1189 # -- Prep for first wishbone read. We calculate the
1190 # -- address of the start of the cache line and
1191 # -- start the WB cycle.
1192 # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
1193 # r.wb.cyc <= '1';
1194 # r.wb.stb <= '1';
1195 # Prep for first wishbone read.
1196 # We calculate the
1197 # address of the start of the cache line and
1198 # start the WB cycle.
1199 sync += r.wb.adr.eq(
1200 req_laddr[:r.wb.adr]
1201 )
1202
1203 # -- Track that we had one request sent
1204 # r.state <= CLR_TAG;
1205 # Track that we had one request sent
1206 sync += r.state.eq(State.CLR_TAG)
1207 # end if;
1208
1209 # when CLR_TAG | WAIT_ACK =>
1210 with m.Case(State.CLR_TAG, State.WAIT_ACK):
1211 # if r.state = CLR_TAG then
1212 with m.If(r.state == State.CLR_TAG):
1213 # -- Get victim way from plru
1214 # r.store_way <= replace_way;
1215 # Get victim way from plru
1216 sync += r.store_way.eq(replace_way)
1217 #
1218 # -- Force misses on that way while
1219 # -- reloading that line
1220 # cache_valids(req_index)(replace_way) <= '0';
1221 # Force misses on that way while
1222 # realoading that line
1223 sync += cache_valid_bits[
1224 req_index
1225 ][replace_way].eq(0)
1226
1227 # -- Store new tag in selected way
1228 # for i in 0 to NUM_WAYS-1 loop
1229 # if i = replace_way then
1230 # tagset := cache_tags(r.store_index);
1231 # write_tag(i, tagset, r.store_tag);
1232 # cache_tags(r.store_index) <= tagset;
1233 # end if;
1234 # end loop;
1235 for i in range(NUM_WAYS):
1236 with m.If(i == replace_way):
1237 comb += tagset.eq(
1238 cache_tags[r.store_index]
1239 )
1240 sync += write_tag(
1241 i, tagset, r.store_tag
1242 )
1243 sync += cache_tags[r.store_index].eq(
1244 tagset
1245 )
1246
1247 # r.state <= WAIT_ACK;
1248 sync += r.state.eq(State.WAIT_ACK)
1249 # end if;
1250
1251 # -- Requests are all sent if stb is 0
1252 # stbs_done := r.wb.stb = '0';
1253 # Requests are all sent if stb is 0
1254 comb += stbs_done.eq(r.wb.stb == 0)
1255
1256 # -- If we are still sending requests,
1257 # -- was one accepted ?
1258 # if wishbone_in.stall = '0' and not stbs_done then
1259 # If we are still sending requests,
1260 # was one accepted?
1261 with m.If(~wb_in.stall & ~stbs_done):
1262 # -- That was the last word ? We are done sending.
1263 # -- Clear stb and set stbs_done so we can handle
1264 # -- an eventual last ack on the same cycle.
1265 # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
1266 # r.wb.stb <= '0';
1267 # stbs_done := true;
1268 # end if;
1269 # That was the last word ?
1270 # We are done sending.
1271 # Clear stb and set stbs_done
1272 # so we can handle
1273 # an eventual last ack on
1274 # the same cycle.
1275 with m.If(is_last_row_addr(
1276 r.wb.adr, r.end_row_ix)):
1277 sync += r.wb.stb.eq(0)
1278 stbs_done.eq(1)
1279
1280 # -- Calculate the next row address
1281 # r.wb.adr <= next_row_addr(r.wb.adr);
1282 # Calculate the next row address
1283 sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
1284 # end if;
1285
1286 # -- Incoming acks processing
1287 # if wishbone_in.ack = '1' then
1288 # Incoming acks processing
1289 with m.If(wb_in.ack):
1290 # r.rows_valid(r.store_row mod ROW_PER_LINE)
1291 # <= '1';
1292 sync += r.rows_valid[
1293 r.store_row & ROW_PER_LINE
1294 ].eq(1)
1295
1296 # -- Check for completion
1297 # if stbs_done and
1298 # is_last_row(r.store_row, r.end_row_ix) then
1299 # Check for completion
1300 with m.If(stbs_done & is_last_row(
1301 r.store_row, r.end_row_ix)):
1302 # -- Complete wishbone cycle
1303 # r.wb.cyc <= '0';
1304 # Complete wishbone cycle
1305 sync += r.wb.cyc.eq(0)
1306
1307 # -- Cache line is now valid
1308 # cache_valids(r.store_index)(replace_way) <=
1309 # r.store_valid and not inval_in;
1310 # Cache line is now valid
1311 sync += cache_valid_bits[
1312 r.store_index
1313 ][relace_way].eq(
1314 r.store_valid & ~inval_in
1315 )
1316
1317 # -- We are done
1318 # r.state <= IDLE;
1319 # We are done
1320 sync += r.state.eq(State.IDLE)
1321 # end if;
1322
1323 # -- Increment store row counter
1324 # r.store_row <= next_row(r.store_row);
1325 # Increment store row counter
1326 sync += store_row.eq(next_row(r.store_row))
1327 # end if;
1328 # end case;
1329 # end if;
1330 #
1331 # -- TLB miss and protection fault processing
1332 # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
1333 # r.fetch_failed <= '0';
1334 # elsif i_in.req = '1' and access_ok = '0' and
1335 # stall_in = '0' then
1336 # r.fetch_failed <= '1';
1337 # end if;
1338 # TLB miss and protection fault processing
1339 with m.If('''TODO nmigen rst''' | flush_in | m_in.tlbld):
1340 sync += r.fetch_failed.eq(0)
1341
1342 with m.Elif(i_in.req & ~access_ok & ~stall_in):
1343 sync += r.fetch_failed.eq(1)
1344 # end if;
1345 # end process;
1346
1347 # icache_log: if LOG_LENGTH > 0 generate
1348 def icache_log(self, m, req_hit_way, ra_valid, access_ok,
1349 req_is_miss, req_is_hit, lway, wstate, r):
1350 comb = m.d.comb
1351 sync = m.d.sync
1352
1353 wb_in, i_out = self.wb_in, self.i_out
1354 log_out, stall_out = self.log_out, self.stall_out
1355
1356 # -- Output data to logger
1357 # signal log_data : std_ulogic_vector(53 downto 0);
1358 # begin
1359 # data_log: process(clk)
1360 # variable lway: way_t;
1361 # variable wstate: std_ulogic;
1362 # Output data to logger
1363 for i in range(LOG_LENGTH):
1364 # Output data to logger
1365 log_data = Signal(54)
1366 lway = Signal(NUM_WAYS)
1367 wstate = Signal()
1368
1369 # begin
1370 # if rising_edge(clk) then
1371 # lway := req_hit_way;
1372 # wstate := '0';
1373 comb += lway.eq(req_hit_way)
1374 comb += wstate.eq(0)
1375
1376 # if r.state /= IDLE then
1377 # wstate := '1';
1378 # end if;
1379 with m.If(r.state != State.IDLE):
1380 sync += wstate.eq(1)
1381
1382 # log_data <= i_out.valid &
1383 # i_out.insn &
1384 # wishbone_in.ack &
1385 # r.wb.adr(5 downto 3) &
1386 # r.wb.stb & r.wb.cyc &
1387 # wishbone_in.stall &
1388 # stall_out &
1389 # r.fetch_failed &
1390 # r.hit_nia(5 downto 2) &
1391 # wstate &
1392 # std_ulogic_vector(to_unsigned(lway, 3)) &
1393 # req_is_hit & req_is_miss &
1394 # access_ok &
1395 # ra_valid;
1396 sync += log_data.eq(Cat(
1397 ra_valid, access_ok, req_is_miss, req_is_hit,
1398 lway, wstate, r.hit_nia[2:6],
1399 r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
1400 r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
1401 i_out.valid
1402 ))
1403 # end if;
1404 # end process;
1405 # log_out <= log_data;
1406 comb += log_out.eq(log_data)
1407 # end generate;
1408 # end;
1409
    def elaborate(self, platform):
        """Build the icache: declare the shared storage and request
        decode signals, then wire the sub-processes (RAMs, PLRUs, TLB
        lookup/update, combinational decode, hit and miss machines)
        together.  Mirrors the microwatt icache.vhdl architecture body.
        """
        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # ITLB storage (VHDL: itlb_valids/itlb_tags/itlb_ptes)
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPTEArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # All synchronous state of the cache (VHDL: signal r)
        r = RegInternal()

        # Async signal on incoming request.
        # NOTE(review): the VHDL types are integer *indices*
        # (index_t, row_t, way_t); Signal(NUM_LINES) etc. allocate
        # NUM_LINES *bits* where log2_int(NUM_LINES) would represent an
        # index -- widths are over-sized but comparisons still work;
        # confirm and consider log2_int here.
        req_index = Signal(NUM_LINES)
        req_row = Signal(BRAM_ROWS)
        req_hit_way = Signal(NUM_WAYS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # TLB lookup results (same width caveat as above for
        # tlb_req_index)
        tlb_req_index = Signal(TLB_SIZE)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # Per-way BRAM read data (VHDL: cache_ram_out_t)
        cache_out = CacheRamOut()

        # PLRU victim per line, and the way chosen for replacement
        plru_victim = PLRUOut()
        replace_way = Signal(NUM_WAYS)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.rams(m, r, cache_out, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
                         real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                         priv_fault, access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row,
                         req_tag, real_addr, req_laddr, cache_valid_bits,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok)
        # logger left unwired: lway/wstate/log_out arguments here do
        # not exist as locals yet
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m
1500
1501
1502 # icache_tb.vhdl
1503 #
1504 # library ieee;
1505 # use ieee.std_logic_1164.all;
1506 #
1507 # library work;
1508 # use work.common.all;
1509 # use work.wishbone_types.all;
1510 #
1511 # entity icache_tb is
1512 # end icache_tb;
1513 #
1514 # architecture behave of icache_tb is
1515 # signal clk : std_ulogic;
1516 # signal rst : std_ulogic;
1517 #
1518 # signal i_out : Fetch1ToIcacheType;
1519 # signal i_in : IcacheToDecode1Type;
1520 #
1521 # signal m_out : MmuToIcacheType;
1522 #
1523 # signal wb_bram_in : wishbone_master_out;
1524 # signal wb_bram_out : wishbone_slave_out;
1525 #
1526 # constant clk_period : time := 10 ns;
1527 # begin
1528 # icache0: entity work.icache
1529 # generic map(
1530 # LINE_SIZE => 64,
1531 # NUM_LINES => 4
1532 # )
1533 # port map(
1534 # clk => clk,
1535 # rst => rst,
1536 # i_in => i_out,
1537 # i_out => i_in,
1538 # m_in => m_out,
1539 # stall_in => '0',
1540 # flush_in => '0',
1541 # inval_in => '0',
1542 # wishbone_out => wb_bram_in,
1543 # wishbone_in => wb_bram_out
1544 # );
1545 #
1546 # -- BRAM Memory slave
1547 # bram0: entity work.wishbone_bram_wrapper
1548 # generic map(
1549 # MEMORY_SIZE => 1024,
1550 # RAM_INIT_FILE => "icache_test.bin"
1551 # )
1552 # port map(
1553 # clk => clk,
1554 # rst => rst,
1555 # wishbone_in => wb_bram_in,
1556 # wishbone_out => wb_bram_out
1557 # );
1558 #
1559 # clk_process: process
1560 # begin
1561 # clk <= '0';
1562 # wait for clk_period/2;
1563 # clk <= '1';
1564 # wait for clk_period/2;
1565 # end process;
1566 #
1567 # rst_process: process
1568 # begin
1569 # rst <= '1';
1570 # wait for 2*clk_period;
1571 # rst <= '0';
1572 # wait;
1573 # end process;
1574 #
1575 # stim: process
1576 # begin
1577 # i_out.req <= '0';
1578 # i_out.nia <= (others => '0');
1579 # i_out.stop_mark <= '0';
1580 #
1581 # m_out.tlbld <= '0';
1582 # m_out.tlbie <= '0';
1583 # m_out.addr <= (others => '0');
1584 # m_out.pte <= (others => '0');
1585 #
1586 # wait until rising_edge(clk);
1587 # wait until rising_edge(clk);
1588 # wait until rising_edge(clk);
1589 # wait until rising_edge(clk);
1590 #
1591 # i_out.req <= '1';
1592 # i_out.nia <= x"0000000000000004";
1593 #
1594 # wait for 30*clk_period;
1595 # wait until rising_edge(clk);
1596 #
1597 # assert i_in.valid = '1' severity failure;
1598 # assert i_in.insn = x"00000001"
1599 # report "insn @" & to_hstring(i_out.nia) &
1600 # "=" & to_hstring(i_in.insn) &
1601 # " expected 00000001"
1602 # severity failure;
1603 #
1604 # i_out.req <= '0';
1605 #
1606 # wait until rising_edge(clk);
1607 #
1608 # -- hit
1609 # i_out.req <= '1';
1610 # i_out.nia <= x"0000000000000008";
1611 # wait until rising_edge(clk);
1612 # wait until rising_edge(clk);
1613 # assert i_in.valid = '1' severity failure;
1614 # assert i_in.insn = x"00000002"
1615 # report "insn @" & to_hstring(i_out.nia) &
1616 # "=" & to_hstring(i_in.insn) &
1617 # " expected 00000002"
1618 # severity failure;
1619 # wait until rising_edge(clk);
1620 #
1621 # -- another miss
1622 # i_out.req <= '1';
1623 # i_out.nia <= x"0000000000000040";
1624 #
1625 # wait for 30*clk_period;
1626 # wait until rising_edge(clk);
1627 #
1628 # assert i_in.valid = '1' severity failure;
1629 # assert i_in.insn = x"00000010"
1630 # report "insn @" & to_hstring(i_out.nia) &
1631 # "=" & to_hstring(i_in.insn) &
1632 # " expected 00000010"
1633 # severity failure;
1634 #
1635 # -- test something that aliases
1636 # i_out.req <= '1';
1637 # i_out.nia <= x"0000000000000100";
1638 # wait until rising_edge(clk);
1639 # wait until rising_edge(clk);
1640 # assert i_in.valid = '0' severity failure;
1641 # wait until rising_edge(clk);
1642 #
1643 # wait for 30*clk_period;
1644 # wait until rising_edge(clk);
1645 #
1646 # assert i_in.valid = '1' severity failure;
1647 # assert i_in.insn = x"00000040"
1648 # report "insn @" & to_hstring(i_out.nia) &
1649 # "=" & to_hstring(i_in.insn) &
1650 # " expected 00000040"
1651 # severity failure;
1652 #
1653 # i_out.req <= '0';
1654 #
1655 # std.env.finish;
1656 # end process;
1657 # end;
def icache_sim(dut):
    """Stimulus process for test_icache().

    Mirrors the VHDL icache_tb stim process: a cold miss, a hit on the
    same line, a miss on another line, then an access that aliases the
    first line.  DUT outputs are read with "yield sig" so the asserts
    compare simulated values, not Signal objects (nmigen Values raise
    on bool()).
    """
    i_out, i_in, m_out, m_in = dut.i_out, dut.i_in, dut.m_out, dut.m_in

    # idle the fetch request and MMU inputs
    # (VHDL: (others => '0'), i.e. zero -- "~1" set all bits but bit 0)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # cold miss on 0x4: wait for the line reload to complete
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit on the freshly loaded line
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss (was "yield i_out.req(1)": missing .eq)
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases: must initially be a miss
    # (VHDL: assert i_in.valid = '0' -- the assert was inverted)
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)
1716
1717
def test_icache():
    """Instantiate the ICache, attach the stimulus generator and run
    the simulation, dumping waveforms to test_icache.vcd."""
    top = Module()
    dut = ICache()
    top.submodules.icache = dut

    # nmigen simulation: 1us clock, stimulus from icache_sim()
    sim = Simulator(top)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(icache_sim(dut)))

    with sim.write_vcd('test_icache.vcd'):
        sim.run()
1731
if __name__ == '__main__':
    # emit RTLIL for inspection, then run the simulation testbench
    icache = ICache()
    il = rtlil.convert(icache, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(il)

    test_icache()