1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const,
                    ResetSignal)
from nmigen.cli import main
from nmigen.cli import rtlil
from nmutil.iocontrol import RecordObject
from nmutil.byterev import byte_reverse
from nmutil.mask import Mask
from nmigen.utils import log2_int
from nmutil.util import Display

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

# cache RAM and PLRU helpers, needed by rams() and maybe_plrus() below
# (assumption: same modules the sibling dcache code uses)
from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 32
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row
# (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in
# BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit
# instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to
# select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

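# With the defaults above (and WB_DATA_BITS = 64, per the "based on
# WB, so 64-bits" comment) the derived geometry works out to:
# ROW_SIZE=8, ROW_SIZE_BITS=64, ROW_PER_LINE=8, BRAM_ROWS=256,
# INSN_PER_ROW=2, INSN_BITS=1, ROW_BITS=8, ROW_LINE_BITS=3,
# LINE_OFF_BITS=6, ROW_OFF_BITS=3, INDEX_BITS=5, SET_SIZE_BITS=11,
# TAG_BITS=45, WAY_BITS=2, TAG_RAM_WIDTH=180.
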
# -- L1 ITLB.
# constant TLB_BITS : natural := log2(TLB_SIZE);
# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
# constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

# architecture rtl of icache is
#constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
#-- ROW_PER_LINE is the number of row (wishbone
#-- transactions) in a line
#constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
#-- BRAM_ROWS is the number of rows in BRAM
#-- needed to represent the full
#-- icache
#constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
#-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
#constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
#-- Bit fields counts in the address
#
#-- INSN_BITS is the number of bits to select
#-- an instruction in a row
#constant INSN_BITS : natural := log2(INSN_PER_ROW);
#-- ROW_BITS is the number of bits to select a row
#constant ROW_BITS : natural := log2(BRAM_ROWS);
#-- ROW_LINEBITS is the number of bits to
#-- select a row within a line
#constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
#-- LINE_OFF_BITS is the number of bits for the offset
#-- in a cache line
#constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
#-- ROW_OFF_BITS is the number of bits for the offset in a row
#constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
#-- INDEX_BITS is the number of bits to select a cache line
#constant INDEX_BITS : natural := log2(NUM_LINES);
#-- SET_SIZE_BITS is the log base 2 of the set size
#constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
#-- TAG_BITS is the number of bits of the tag part of the address
#constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
#-- WAY_BITS is the number of bits to select a way
#constant WAY_BITS : natural := log2(NUM_WAYS);

# Example of layout for 32 lines of 64 bytes:
#
# .. tag |index| line |
# .. | row | |
# .. | | | |00| zero (2)
# .. | | |-| | INSN_BITS (1)
# .. | |---| | ROW_LINEBITS (3)
# .. | |--- - --| LINE_OFF_BITS (6)
# .. | |- --| ROW_OFF_BITS (3)
# .. |----- ---| | ROW_BITS (8)
# .. |-----| | INDEX_BITS (5)
# .. --------| | TAG_BITS (53)

#subtype row_t is integer range 0 to BRAM_ROWS-1;
#subtype index_t is integer range 0 to NUM_LINES-1;
#subtype way_t is integer range 0 to NUM_WAYS-1;
#subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
#
#-- The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
#-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
#-- not handle a clean (commented) definition of the cache tags as a 3d
#-- memory. For now, work around it by putting all the tags
#subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
# type cache_tags_set_t is array(way_t) of cache_tag_t;
# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
#constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
#subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
#type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

#-- The cache valid bits
#subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
#type cache_valids_t is array(index_t) of cache_way_valids_t;
#type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    # per the VHDL: one NUM_WAYS-wide valid bitmask per line (index_t)
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal() for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


#subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
#type tlb_valids_t is array(tlb_index_t) of std_ulogic;
#subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
#type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
#subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
#type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    return Array(Signal() for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE))

def TLBPTEArray():
    return Array(Signal(TLB_PTE_BITS) for x in range(TLB_SIZE))


#-- Cache RAM interface
#type cache_ram_out_t is array(way_t) of cache_row_t;
# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS))

#-- PLRU output interface
#type plru_out_t is array(index_t) of
# std_ulogic_vector(WAY_BITS-1 downto 0);
# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# begin
#
# assert LINE_SIZE mod ROW_SIZE = 0;
# assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
# severity FAILURE;
# assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
# severity FAILURE;
# assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
# severity FAILURE;
# assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
# severity FAILURE;
# assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
# report "geometry bits don't add up" severity FAILURE;
# assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
# report "geometry bits don't add up" severity FAILURE;
# assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
# report "geometry bits don't add up" severity FAILURE;
# assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
# report "geometry bits don't add up" severity FAILURE;
#
# sim_debug: if SIM generate
# debug: process
# begin
# report "ROW_SIZE = " & natural'image(ROW_SIZE);
# report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
# report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
# report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
# report "INSN_BITS = " & natural'image(INSN_BITS);
# report "ROW_BITS = " & natural'image(ROW_BITS);
# report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
# report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
# report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
# report "INDEX_BITS = " & natural'image(INDEX_BITS);
# report "TAG_BITS = " & natural'image(TAG_BITS);
# report "WAY_BITS = " & natural'image(WAY_BITS);
# wait;
# end process;
# end generate;
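
# The VHDL geometry assertions above translate directly to Python and
# can run at import time; a minimal sketch (ispow2 is a small local
# helper, not part of the original file):
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert ROW_BITS == INDEX_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + INDEX_BITS + LINE_OFF_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + ROW_BITS + ROW_OFF_BITS, \
    "geometry bits don't add up"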

# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2

# type reg_internal_t is record
# -- Cache hit state (Latches for 1 cycle BRAM access)
# hit_way : way_t;
# hit_nia : std_ulogic_vector(63 downto 0);
# hit_smark : std_ulogic;
# hit_valid : std_ulogic;
#
# -- Cache miss state (reload state machine)
# state : state_t;
# wb : wishbone_master_out;
# store_way : way_t;
# store_index : index_t;
# store_row : row_t;
# store_tag : cache_tag_t;
# store_valid : std_ulogic;
# end_row_ix : row_in_line_t;
# rows_valid : row_per_line_valid_t;
#
# -- TLB miss state
# fetch_failed : std_ulogic;
# end record;
class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        # way_t/index_t/row_t are indices, hence log2-sized signals
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.wb = WBMasterOut()
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()

# -- 64 bit direct mapped icache. All instructions are 4B aligned.
#
# entity icache is
# generic (
# SIM : boolean := false;
# -- Line size in bytes
# LINE_SIZE : positive := 64;
# -- BRAM organisation: We never access more
# -- than wishbone_data_bits
# -- at a time so to save resources we make the
# -- array only that wide,
# -- and use consecutive indices for to make a cache "line"
# --
# -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
# -- so 64-bits)
# ROW_SIZE : positive := wishbone_data_bits / 8;
# -- Number of lines in a set
# NUM_LINES : positive := 32;
# -- Number of ways
# NUM_WAYS : positive := 4;
# -- L1 ITLB number of entries (direct mapped)
# TLB_SIZE : positive := 64;
# -- L1 ITLB log_2(page_size)
# TLB_LG_PGSZ : positive := 12;
# -- Number of real address bits that we store
# REAL_ADDR_BITS : positive := 56;
# -- Non-zero to enable log data collection
# LOG_LENGTH : natural := 0
# );
# port (
# clk : in std_ulogic;
# rst : in std_ulogic;
#
# i_in : in Fetch1ToIcacheType;
# i_out : out IcacheToDecode1Type;
#
# m_in : in MmuToIcacheType;
#
# stall_in : in std_ulogic;
# stall_out : out std_ulogic;
# flush_in : in std_ulogic;
# inval_in : in std_ulogic;
#
# wishbone_out : out wishbone_master_out;
# wishbone_in : in wishbone_slave_out;
#
# log_out : out std_ulogic_vector(53 downto 0)
# );
# end entity icache;
# 64 bit direct mapped icache. All instructions are 4B aligned.
class ICache(Elaboratable):
    """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType()
        self.i_out = ICacheToDecode1Type()

        self.m_in = MMUToICacheType()

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(54)

    # -- Return the cache line index (tag index) for an address
    # function get_index(addr: std_ulogic_vector(63 downto 0))
    # return index_t is
    # begin
    # return to_integer(unsigned(
    # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
    # ));
    # end;
    # Return the cache line index (tag index) for an address
    def get_index(addr):
        return addr[LINE_OFF_BITS:SET_SIZE_BITS]

    # -- Return the cache row index (data memory) for an address
    # function get_row(addr: std_ulogic_vector(63 downto 0))
    # return row_t is
    # begin
    # return to_integer(unsigned(
    # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
    # ));
    # end;
    # Return the cache row index (data memory) for an address
    def get_row(addr):
        return addr[ROW_OFF_BITS:SET_SIZE_BITS]

    # -- Return the index of a row within a line
    # function get_row_of_line(row: row_t) return row_in_line_t is
    # variable row_v : unsigned(ROW_BITS-1 downto 0);
    # begin
    # row_v := to_unsigned(row, ROW_BITS);
    # return row_v(ROW_LINEBITS-1 downto 0);
    # end;
    # Return the index of a row within a line
    def get_row_of_line(row):
        return row[:ROW_LINE_BITS]

    # -- Returns whether this is the last row of a line
    # function is_last_row_addr(addr: wishbone_addr_type;
    # last: row_in_line_t
    # )
    # return boolean is
    # begin
    # return unsigned(
    # addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
    # ) = last;
    # end;
    # Returns whether this is the last row of a line
    def is_last_row_addr(addr, last):
        return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

    # -- Returns whether this is the last row of a line
    # function is_last_row(row: row_t;
    # last: row_in_line_t) return boolean is
    # begin
    # return get_row_of_line(row) = last;
    # end;
    # Returns whether this is the last row of a line
    def is_last_row(row, last):
        return get_row_of_line(row) == last

    # -- Return the address of the next row in the current cache line
    # function next_row_addr(addr: wishbone_addr_type)
    # return std_ulogic_vector is
    # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
    # variable result : wishbone_addr_type;
    # begin
    # -- Is there no simpler way in VHDL to generate that 3 bits adder ?
    # row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
    # row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
    # result := addr;
    # result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
    # return result;
    # end;
    # Return the address of the next row in the current cache line
    def next_row_addr(addr):
        # a sketch of the VHDL above: the "double assignment" is just
        # an increment of the row-within-line field of the address,
        # with the rest passed through unchanged.  Slicing the sum
        # keeps the adder ROW_LINE_BITS wide and drops the carry,
        # exactly like the VHDL variable assignments.
        row_idx = (addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)[:ROW_LINE_BITS]
        return Cat(addr[:ROW_OFF_BITS], row_idx, addr[LINE_OFF_BITS:])

    # -- Return the next row in the current cache line. We use a dedicated
    # -- function in order to limit the size of the generated adder to be
    # -- only the bits within a cache line (3 bits with default settings)
    # function next_row(row: row_t) return row_t is
    # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
    # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
    # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
    # begin
    # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
    # row_idx := row_v(ROW_LINEBITS-1 downto 0);
    # row_v(ROW_LINEBITS-1 downto 0) :=
    # std_ulogic_vector(unsigned(row_idx) + 1);
    # return to_integer(unsigned(row_v));
    # end;
    # Return the next row in the current cache line. We use a dedicated
    # function in order to limit the size of the generated adder to be
    # only the bits within a cache line (3 bits with default settings)
    def next_row(row):
        # same increment-in-place trick as next_row_addr, applied to a
        # row number instead of a byte address
        row_idx = (row[:ROW_LINE_BITS] + 1)[:ROW_LINE_BITS]
        return Cat(row_idx, row[ROW_LINE_BITS:])

    # -- Read the instruction word for the given address in the
    # -- current cache row
    # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
    # data: cache_row_t) return std_ulogic_vector is
    # variable word: integer range 0 to INSN_PER_ROW-1;
    # begin
    # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
    # return data(31+word*32 downto word*32);
    # end;
    # Read the instruction word for the given address
    # in the current cache row
    def read_insn_word(addr, data):
        word = addr[2:INSN_BITS+2]
        # dynamic selection of 32-bit word number "word" from the row
        return data.word_select(word, 32)

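    # Worked example with the defaults above: ROW_SIZE_BITS=64 gives
    # INSN_PER_ROW=2 and INSN_BITS=1, so only addr bit 2 is inspected
    # and word_select(word, 32) picks data[0:32] or data[32:64], i.e.
    # the first or second instruction of the BRAM row.
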
    # -- Get the tag value from the address
    # function get_tag(
    # addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
    # )
    # return cache_tag_t is
    # begin
    # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
    # end;
    # Get the tag value from the address
    def get_tag(addr):
        return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

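    # Worked example with the defaults above (LINE_OFF_BITS=6,
    # SET_SIZE_BITS=11, REAL_ADDR_BITS=56): addr[0:6] is the byte
    # offset within the line, get_index() returns addr[6:11] (the
    # 5-bit line index) and get_tag() returns addr[11:56] (the
    # remaining 45 tag bits).
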
    # -- Read a tag from a tag memory row
    # function read_tag(way: way_t; tagset: cache_tags_set_t)
    # return cache_tag_t is
    # begin
    # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
    # end;
    # Read a tag from a tag memory row
    def read_tag(way, tagset):
        return tagset[way * TAG_BITS:(way + 1) * TAG_BITS]

    # -- Write a tag to tag memory row
    # procedure write_tag(way: in way_t;
    # tagset: inout cache_tags_set_t; tag: cache_tag_t) is
    # begin
    # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
    # end;
    # Write a tag to tag memory row
    def write_tag(way, tagset, tag):
        # nmigen signals are assigned with .eq(), not Python slice
        # assignment: return the statement for the caller to add to
        # a domain
        return tagset[way * TAG_BITS:(way + 1) * TAG_BITS].eq(tag)

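    # Usage sketch: write_tag() returns an Assign statement, so a
    # caller adds it to a domain, e.g.
    #     comb += write_tag(way, tagset, tag)
    # which updates only bits [way*TAG_BITS : (way+1)*TAG_BITS] of the
    # tagset row, leaving the other ways' tags untouched.
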
    # -- Simple hash for direct-mapped TLB index
    # function hash_ea(addr: std_ulogic_vector(63 downto 0))
    # return tlb_index_t is
    # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
    # begin
    # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
    # xor addr(
    # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
    # TLB_LG_PGSZ + TLB_BITS
    # )
    # xor addr(
    # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
    # TLB_LG_PGSZ + 2 * TLB_BITS
    # );
    # return to_integer(unsigned(hash));
    # end;
    # Simple hash for direct-mapped TLB index
    def hash_ea(addr):
        hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
              TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
              ] ^ addr[
              TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
              ]
        return hsh

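    # Worked example with the defaults (TLB_LG_PGSZ=12, TLB_BITS=6):
    # the hash XOR-folds effective-address bits [12:18], [18:24] and
    # [24:30] into a 6-bit TLB index, so e.g. ea=0x3000 (page 3, all
    # upper address bits zero) maps to TLB entry 3.
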
    # -- Generate a cache RAM for each way
    # rams: for i in 0 to NUM_WAYS-1 generate
    # signal do_read : std_ulogic;
    # signal do_write : std_ulogic;
    # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
    # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
    # signal dout : cache_row_t;
    # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
    # begin
    # way: entity work.cache_ram
    # generic map (
    # ROW_BITS => ROW_BITS,
    # WIDTH => ROW_SIZE_BITS
    # )
    # port map (
    # clk => clk,
    # rd_en => do_read,
    # rd_addr => rd_addr,
    # rd_data => dout,
    # wr_sel => wr_sel,
    # wr_addr => wr_addr,
    # wr_data => wishbone_in.dat
    # );
    # process(all)
    # begin
    # do_read <= not (stall_in or use_previous);
    # do_write <= '0';
    # if wishbone_in.ack = '1' and replace_way = i then
    # do_write <= '1';
    # end if;
    # cache_out(i) <= dout;
    # rd_addr <=
    # std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
    # wr_addr <=
    # std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
    # for i in 0 to ROW_SIZE-1 loop
    # wr_sel(i) <= do_write;
    # end loop;
    # end process;
    # end generate;
    def rams(self, m):
        comb = m.d.comb

        do_read = Signal()
        do_write = Signal()
        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        _d_out = Signal(ROW_SIZE_BITS)
        wr_sel = Signal(ROW_SIZE)

        for i in range(NUM_WAYS):
            way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
            m.submodules += way
            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            # rd_data is an output of the RAM: copy it out of the
            # way, not into it
            comb += _d_out.eq(way.rd_data)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(0)

            with m.If(wb_in.ack & (replace_way == i)):
                comb += do_write.eq(1)

            comb += cache_out[i].eq(_d_out)
            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            for j in range(ROW_SIZE):
                comb += wr_sel[j].eq(do_write)

    # -- Generate PLRUs
    # maybe_plrus: if NUM_WAYS > 1 generate
    # begin
    # plrus: for i in 0 to NUM_LINES-1 generate
    # -- PLRU interface
    # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
    # signal plru_acc_en : std_ulogic;
    # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
    #
    # begin
    # plru : entity work.plru
    # generic map (
    # BITS => WAY_BITS
    # )
    # port map (
    # clk => clk,
    # rst => rst,
    # acc => plru_acc,
    # acc_en => plru_acc_en,
    # lru => plru_out
    # );
    #
    # process(all)
    # begin
    # -- PLRU interface
    # if get_index(r.hit_nia) = i then
    # plru_acc_en <= r.hit_valid;
    # else
    # plru_acc_en <= '0';
    # end if;
    # plru_acc <=
    # std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
    # plru_victim(i) <= plru_out;
    # end process;
    # end generate;
    # end generate;
    def maybe_plrus(self, m):
        comb = m.d.comb

        # a VHDL generate: resolved at elaboration time, hence a
        # plain Python "if", not m.If
        if NUM_WAYS > 1:
            for i in range(NUM_LINES):
                plru_acc = Signal(WAY_BITS)
                plru_acc_en = Signal()
                plru_out = Signal(WAY_BITS)
                plru = PLRU(WAY_BITS)
                m.submodules += plru
                comb += plru.acc.eq(plru_acc)
                comb += plru.acc_en.eq(plru_acc_en)
                # lru is an output of the PLRU: copy it out
                comb += plru_out.eq(plru.lru)

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru_acc_en.eq(r.hit_valid)

                with m.Else():
                    comb += plru_acc_en.eq(0)

                comb += plru_acc.eq(r.hit_way)
                comb += plru_victim[i].eq(plru_out)

    # -- TLB hit detection and real address generation
    # itlb_lookup : process(all)
    # variable pte : tlb_pte_t;
    # variable ttag : tlb_tag_t;
    # begin
    # tlb_req_index <= hash_ea(i_in.nia);
    # pte := itlb_ptes(tlb_req_index);
    # ttag := itlb_tags(tlb_req_index);
    # if i_in.virt_mode = '1' then
    # real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
    # i_in.nia(TLB_LG_PGSZ - 1 downto 0);
    # if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
    # ra_valid <= itlb_valids(tlb_req_index);
    # else
    # ra_valid <= '0';
    # end if;
    # eaa_priv <= pte(3);
    # else
    # real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
    # ra_valid <= '1';
    # eaa_priv <= '1';
    # end if;
    #
    # -- no IAMR, so no KUEP support for now
    # priv_fault <= eaa_priv and not i_in.priv_mode;
    # access_ok <= ra_valid and not priv_fault;
    # end process;
    # TLB hit detection and real address generation
    def itlb_lookup(self, m):
        comb = m.d.comb

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                     i_in.nia[:TLB_LG_PGSZ],
                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
                    ))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            with m.Else():
                comb += ra_valid.eq(0)

            # the VHDL also takes the privilege bit from the PTE
            # EAA field here
            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # -- iTLB update
    # itlb_update: process(clk)
    # variable wr_index : tlb_index_t;
    # begin
    # if rising_edge(clk) then
    # wr_index := hash_ea(m_in.addr);
    # if rst = '1' or
    # (m_in.tlbie = '1' and m_in.doall = '1') then
    # -- clear all valid bits
    # for i in tlb_index_t loop
    # itlb_valids(i) <= '0';
    # end loop;
    # elsif m_in.tlbie = '1' then
    # -- clear entry regardless of hit or miss
    # itlb_valids(wr_index) <= '0';
    # elsif m_in.tlbld = '1' then
    # itlb_tags(wr_index) <=
    # m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
    # itlb_ptes(wr_index) <= m_in.pte;
    # itlb_valids(wr_index) <= '1';
    # end if;
    # end if;
    # end process;
    # iTLB update
    def itlb_update(self, m):
        comb = m.d.comb
        sync = m.d.sync

        wr_index = Signal(TLB_BITS)
        # a VHDL process "variable": combinational, not registered
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(ResetSignal() | (m_in.tlbie & m_in.doall)):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
            )
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # -- Cache hit detection, output to fetch2 and other misc logic
    # icache_comb : process(all)
    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m):
        # variable is_hit : std_ulogic;
        # variable hit_way : way_t;
        comb = m.d.comb

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        # begin
        # -- i_in.sequential means that i_in.nia this cycle
        # -- is 4 more than last cycle. If we read more
        # -- than 32 bits at a time, had a cache hit last
        # -- cycle, and we don't want the first 32-bit chunk
        # -- then we can keep the data we read last cycle
        # -- and just use that.
        # if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
        # use_previous <= i_in.sequential and r.hit_valid;
        # else
        # use_previous <= '0';
        # end if;
        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        with m.Else():
            comb += use_previous.eq(0)

        # -- Extract line, row and tag from request
        # req_index <= get_index(i_in.nia);
        # req_row <= get_row(i_in.nia);
        # req_tag <= get_tag(real_addr);
        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # -- Calculate address of beginning of cache row, will be
        # -- used for cache miss processing if needed
        # req_laddr <=
        # (63 downto REAL_ADDR_BITS => '0') &
        # real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
        # (ROW_OFF_BITS-1 downto 0 => '0');
        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0b0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                 Const(0, 64 - REAL_ADDR_BITS)  # zero-pad to 64 bits
                ))

        # -- Test if pending request is a hit on any way
        # hit_way := 0;
        # is_hit := '0';
        # for i in way_t loop
        # if i_in.req = '1' and
        # (cache_valids(req_index)(i) = '1' or
        # (r.state = WAIT_ACK and
        # req_index = r.store_index and
        # i = r.store_way and
        # r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
        # if read_tag(i, cache_tags(req_index)) = req_tag then
        # hit_way := i;
        # is_hit := '1';
        # end if;
        # end if;
        # end loop;
        # Test if pending request is a hit on any way
        for i in range(NUM_WAYS):
            with m.If(i_in.req &
                      (cache_valid_bits[req_index][i] |
                       ((r.state == State.WAIT_ACK)
                        & (req_index == r.store_index)
                        & (i == r.store_way)
                        & r.rows_valid[req_row % ROW_PER_LINE]))):
                with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # -- Generate the "hit" and "miss" signals
        # -- for the synchronous blocks
        # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
        # and rst = '0' then
        # req_is_hit <= is_hit;
        # req_is_miss <= not is_hit;
        # else
        # req_is_hit <= '0';
        # req_is_miss <= '0';
        # end if;
        # req_hit_way <= hit_way;
        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        comb += req_hit_way.eq(hit_way)

        # -- The way to replace on a miss
        # if r.state = CLR_TAG then
        # replace_way <=
        # to_integer(unsigned(plru_victim(r.store_index)));
        # else
        # replace_way <= r.store_way;
        # end if;
        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])

        with m.Else():
            comb += replace_way.eq(r.store_way)

        # -- Output instruction from current cache row
        # --
        # -- Note: This is a mild violation of our design principle of
        # -- having pipeline stages output from a clean latch. In this
        # -- case we output the result of a mux. The alternative would
        # -- be output an entire row which I prefer not to do just yet
        # -- as it would force fetch2 to know about some of the cache
        # -- geometry information.
        # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
        # i_out.valid <= r.hit_valid;
        # i_out.nia <= r.hit_nia;
        # i_out.stop_mark <= r.hit_smark;
        # i_out.fetch_failed <= r.fetch_failed;
        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(
            read_insn_word(r.hit_nia, cache_out[r.hit_way])
        )
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # -- Stall fetch1 if we have a miss on cache or TLB
        # -- or a protection fault
        # stall_out <= not (is_hit and access_ok);
        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # -- Wishbone requests output (from the cache miss reload machine)
        # wishbone_out <= r.wb;
        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)
        # end process;

    # -- Cache hit synchronous machine
    # icache_hit : process(clk)
    # Cache hit synchronous machine
    def icache_hit(self, m):
        sync = m.d.sync
        # begin
        # if rising_edge(clk) then
        # -- keep outputs to fetch2 unchanged on a stall
        # -- except that flush or reset sets valid to 0
        # -- If use_previous, keep the same data as last
        # -- cycle and use the second half
        # if stall_in = '1' or use_previous = '1' then
        # if rst = '1' or flush_in = '1' then
        # r.hit_valid <= '0';
        # end if;
        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(ResetSignal() | flush_in):
                sync += r.hit_valid.eq(0)
        # else
        # -- On a hit, latch the request for the next cycle,
        # -- when the BRAM data will be available on the
        # -- cache_out output of the corresponding way
        # r.hit_valid <= req_is_hit;
        # if req_is_hit = '1' then
        # r.hit_way <= req_hit_way;
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)

                # report "cache hit nia:" & to_hstring(i_in.nia) &
                # " IR:" & std_ulogic'image(i_in.virt_mode) &
                # " SM:" & std_ulogic'image(i_in.stop_mark) &
                # " idx:" & integer'image(req_index) &
                # " tag:" & to_hstring(req_tag) &
                # " way:" & integer'image(req_hit_way) &
                # " RA:" & to_hstring(real_addr);
                # Display fires at simulation time; a plain print()
                # would fire once at elaboration instead
                sync += Display(
                    "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                    "way:%x RA:%x",
                    i_in.nia, i_in.virt_mode, i_in.stop_mark,
                    req_index, req_tag, req_hit_way, real_addr)
        # end if;
        # end if;
        # if stall_in = '0' then
        # -- Send stop marks and NIA down regardless of validity
        # r.hit_smark <= i_in.stop_mark;
        # r.hit_nia <= i_in.nia;
        # end if;
        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)
        # end if;
        # end process;

    # -- Cache miss/reload synchronous machine
    # icache_miss : process(clk)
    # Cache miss/reload synchronous machine
    def icache_miss(self, m):
        comb = m.d.comb
        sync = m.d.sync

        # variable tagset : cache_tags_set_t;
        # variable stbs_done : boolean;

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        # begin
        # if rising_edge(clk) then
        # -- On reset, clear all valid bits to force misses
        # if rst = '1' then
        # On reset, clear all valid bits to force misses
        with m.If(ResetSignal()):
            # for i in index_t loop
            # cache_valids(i) <= (others => '0');
            # end loop;
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)

            # r.state <= IDLE;
            # r.wb.cyc <= '0';
            # r.wb.stb <= '0';
            sync += r.state.eq(State.IDLE)
            sync += r.wb.cyc.eq(0)
            sync += r.wb.stb.eq(0)

            # -- We only ever do reads on wishbone
            # r.wb.dat <= (others => '0');
            # r.wb.sel <= "11111111";
            # r.wb.we <= '0';
            # We only ever do reads on wishbone
            sync += r.wb.dat.eq(0)
            sync += r.wb.sel.eq(Const(0b11111111, 8))
            sync += r.wb.we.eq(0)

            # -- Not useful normally but helps avoiding
            # -- tons of sim warnings
            # r.wb.adr <= (others => '0');
            # Not useful normally but helps avoiding tons of sim warnings
            sync += r.wb.adr.eq(0)

        # else
        with m.Else():
            # -- Process cache invalidations
            # if inval_in = '1' then
            # for i in index_t loop
            # cache_valids(i) <= (others => '0');
            # end loop;
            # r.store_valid <= '0';
            # end if;
            # Process cache invalidations
            with m.If(inval_in):
                for i in range(NUM_LINES):
                    sync += cache_valid_bits[i].eq(0)

                sync += r.store_valid.eq(0)

            # -- Main state machine
            # case r.state is
            # Main state machine
            with m.Switch(r.state):

                # when IDLE =>
                with m.Case(State.IDLE):
                    # -- Reset per-row valid flags,
                    # -- only used in WAIT_ACK
                    # for i in 0 to ROW_PER_LINE - 1 loop
                    # r.rows_valid(i) <= '0';
                    # end loop;
                    # Reset per-row valid flags,
                    # only used in WAIT_ACK
                    for i in range(ROW_PER_LINE):
                        sync += r.rows_valid[i].eq(0)

                    # -- We need to read a cache line
                    # if req_is_miss = '1' then
                    # report "cache miss nia:" & to_hstring(i_in.nia) &
                    # " IR:" & std_ulogic'image(i_in.virt_mode) &
                    # " SM:" & std_ulogic'image(i_in.stop_mark) &
                    # " idx:" & integer'image(req_index) &
                    # " way:" & integer'image(replace_way) &
                    # " tag:" & to_hstring(req_tag) &
                    # " RA:" & to_hstring(real_addr);
                    # We need to read a cache line
                    with m.If(req_is_miss):
                        sync += Display(
                            "cache miss nia:%x IR:%x SM:%x idx:%x "
                            "way:%x tag:%x RA:%x",
                            i_in.nia, i_in.virt_mode, i_in.stop_mark,
                            req_index, replace_way, req_tag, real_addr)

                        # -- Keep track of our index and way for
                        # -- subsequent stores
                        # r.store_index <= req_index;
                        # r.store_row <= get_row(req_laddr);
                        # r.store_tag <= req_tag;
                        # r.store_valid <= '1';
                        # r.end_row_ix <=
                        # get_row_of_line(get_row(req_laddr)) - 1;
                        # Keep track of our index and way
                        # for subsequent stores
                        sync += r.store_index.eq(req_index)
                        sync += r.store_row.eq(get_row(req_laddr))
                        sync += r.store_tag.eq(req_tag)
                        sync += r.store_valid.eq(1)
                        sync += r.end_row_ix.eq(
                            get_row_of_line(
                                get_row(req_laddr)
                            ) - 1
                        )

                        # -- Prep for first wishbone read. We calculate the
                        # -- address of the start of the cache line and
                        # -- start the WB cycle.
                        # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
                        # r.wb.cyc <= '1';
                        # r.wb.stb <= '1';
                        # Prep for first wishbone read. We calculate the
                        # address of the start of the cache line and
                        # start the WB cycle.
                        sync += r.wb.adr.eq(
                            req_laddr[:r.wb.adr.width]
                        )
                        sync += r.wb.cyc.eq(1)
                        sync += r.wb.stb.eq(1)

                        # -- Track that we had one request sent
                        # r.state <= CLR_TAG;
                        # Track that we had one request sent
                        sync += r.state.eq(State.CLR_TAG)
                    # end if;

                # when CLR_TAG | WAIT_ACK =>
                with m.Case(State.CLR_TAG, State.WAIT_ACK):
                    # if r.state = CLR_TAG then
                    with m.If(r.state == State.CLR_TAG):
                        # -- Get victim way from plru
                        # r.store_way <= replace_way;
                        # Get victim way from plru
                        sync += r.store_way.eq(replace_way)
                        #
                        # -- Force misses on that way while
                        # -- reloading that line
                        # cache_valids(req_index)(replace_way) <= '0';
                        # Force misses on that way while
                        # reloading that line
                        # (bit_select, as replace_way is a Signal)
                        sync += cache_valid_bits[req_index].bit_select(
                            replace_way, 1
                        ).eq(0)

                        # -- Store new tag in selected way
                        # for i in 0 to NUM_WAYS-1 loop
                        # if i = replace_way then
                        # tagset := cache_tags(r.store_index);
                        # write_tag(i, tagset, r.store_tag);
                        # cache_tags(r.store_index) <= tagset;
                        # end if;
                        # end loop;
                        for i in range(NUM_WAYS):
                            with m.If(i == replace_way):
                                # tagset is a VHDL "variable":
                                # read-modify-written combinationally
                                comb += tagset.eq(
                                    cache_tags[r.store_index]
                                )
                                comb += write_tag(
                                    i, tagset, r.store_tag
                                )
                                sync += cache_tags[r.store_index].eq(
                                    tagset
                                )

                        # r.state <= WAIT_ACK;
                        sync += r.state.eq(State.WAIT_ACK)
                    # end if;

                    # -- Requests are all sent if stb is 0
                    # stbs_done := r.wb.stb = '0';
                    # Requests are all sent if stb is 0
                    comb += stbs_done.eq(r.wb.stb == 0)

                    # -- If we are still sending requests,
                    # -- was one accepted ?
                    # if wishbone_in.stall = '0' and not stbs_done then
                    # If we are still sending requests,
                    # was one accepted?
                    with m.If(~wb_in.stall & ~stbs_done):
                        # -- That was the last word ? We are done sending.
                        # -- Clear stb and set stbs_done so we can handle
                        # -- an eventual last ack on the same cycle.
                        # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
                        # r.wb.stb <= '0';
                        # stbs_done := true;
                        # end if;
                        # That was the last word? We are done sending.
                        # Clear stb and set stbs_done so we can handle
                        # an eventual last ack on the same cycle.
                        with m.If(is_last_row_addr(
                                  r.wb.adr, r.end_row_ix)):
                            sync += r.wb.stb.eq(0)
                            comb += stbs_done.eq(1)

                        # -- Calculate the next row address
                        # r.wb.adr <= next_row_addr(r.wb.adr);
                        # Calculate the next row address
                        sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
                    # end if;

                    # -- Incoming acks processing
                    # if wishbone_in.ack = '1' then
                    # Incoming acks processing
                    with m.If(wb_in.ack):
                        # r.rows_valid(r.store_row mod ROW_PER_LINE)
                        # <= '1';
                        sync += r.rows_valid[
                            r.store_row % ROW_PER_LINE
                        ].eq(1)

                        # -- Check for completion
                        # if stbs_done and
                        # is_last_row(r.store_row, r.end_row_ix) then
                        # Check for completion
                        with m.If(stbs_done & is_last_row(
                                  r.store_row, r.end_row_ix)):
                            # -- Complete wishbone cycle
                            # r.wb.cyc <= '0';
                            # Complete wishbone cycle
                            sync += r.wb.cyc.eq(0)

                            # -- Cache line is now valid
                            # cache_valids(r.store_index)(replace_way) <=
                            # r.store_valid and not inval_in;
                            # Cache line is now valid
                            sync += cache_valid_bits[
                                r.store_index
                            ].bit_select(replace_way, 1).eq(
                                r.store_valid & ~inval_in
                            )

                            # -- We are done
                            # r.state <= IDLE;
                            # We are done
                            sync += r.state.eq(State.IDLE)
                        # end if;

                        # -- Increment store row counter
                        # r.store_row <= next_row(r.store_row);
                        # Increment store row counter
                        sync += r.store_row.eq(next_row(r.store_row))
                    # end if;
            # end case;
        # end if;
        #
        # -- TLB miss and protection fault processing
        # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
        # r.fetch_failed <= '0';
        # elsif i_in.req = '1' and access_ok = '0' and
        # stall_in = '0' then
        # r.fetch_failed <= '1';
        # end if;
        # TLB miss and protection fault processing
        with m.If(ResetSignal() | flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)

        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)
        # end if;
        # end process;

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, log_out):
        comb = m.d.comb
        sync = m.d.sync

        # -- Output data to logger
        # signal log_data : std_ulogic_vector(53 downto 0);
        # begin
        # data_log: process(clk)
        # variable lway: way_t;
        # variable wstate: std_ulogic;
        # Output data to logger
        if LOG_LENGTH > 0:
            # a conditional VHDL generate: elaborate the logging
            # logic only when LOG_LENGTH is non-zero
            log_data = Signal(54)
            lway = Signal(3)     # 3 bits in the 54-bit log layout
            wstate = Signal()

            # begin
            # if rising_edge(clk) then
            # lway := req_hit_way;
            # wstate := '0';
            comb += lway.eq(req_hit_way)
            comb += wstate.eq(0)

            # if r.state /= IDLE then
            # wstate := '1';
            # end if;
            with m.If(r.state != State.IDLE):
                comb += wstate.eq(1)

            # log_data <= i_out.valid &
            # i_out.insn &
            # wishbone_in.ack &
            # r.wb.adr(5 downto 3) &
            # r.wb.stb & r.wb.cyc &
            # wishbone_in.stall &
            # stall_out &
            # r.fetch_failed &
            # r.hit_nia(5 downto 2) &
            # wstate &
            # std_ulogic_vector(to_unsigned(lway, 3)) &
            # req_is_hit & req_is_miss &
            # access_ok &
            # ra_valid;
            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6],
                     r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
                     r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
                     i_out.valid
                    ))
            # end if;
            # end process;
            # log_out <= log_data;
            comb += log_out.eq(log_data)
    # end generate;
    # end;

    def elaborate(self, platform):
        m = Module()

        comb = m.d.comb
        sync = m.d.sync

        # -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        # signal cache_tags : cache_tags_array_t;
        # signal cache_valids : cache_valids_t;
        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # signal itlb_valids : tlb_valids_t;
        # signal itlb_tags : tlb_tags_t;
        # signal itlb_ptes : tlb_ptes_t;
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPTEArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # -- Privilege bit from PTE EAA field
        # signal eaa_priv : std_ulogic;
        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # signal r : reg_internal_t;
        r = RegInternal()

        # -- Async signals on incoming request
        # signal req_index : index_t;
        # signal req_row : row_t;
        # signal req_hit_way : way_t;
        # signal req_tag : cache_tag_t;
        # signal req_is_hit : std_ulogic;
        # signal req_is_miss : std_ulogic;
        # signal req_laddr : std_ulogic_vector(63 downto 0);
        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # signal tlb_req_index : tlb_index_t;
        # signal real_addr : std_ulogic_vector(
        # REAL_ADDR_BITS - 1 downto 0
        # );
        # signal ra_valid : std_ulogic;
        # signal priv_fault : std_ulogic;
        # signal access_ok : std_ulogic;
        # signal use_previous : std_ulogic;
        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # signal cache_out : cache_ram_out_t;
        cache_out = CacheRamOut()

        # signal plru_victim : plru_out_t;
        # signal replace_way : way_t;
        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # TODO: the helper processes above (rams, maybe_plrus,
        # itlb_lookup, itlb_update, icache_comb, icache_hit,
        # icache_miss, icache_log) still need to be invoked here
        # and handed these signals

        return m


# icache_tb.vhdl
#
# library ieee;
# use ieee.std_logic_1164.all;
#
# library work;
# use work.common.all;
# use work.wishbone_types.all;
#
# entity icache_tb is
# end icache_tb;
#
# architecture behave of icache_tb is
# signal clk : std_ulogic;
# signal rst : std_ulogic;
#
# signal i_out : Fetch1ToIcacheType;
# signal i_in : IcacheToDecode1Type;
#
# signal m_out : MmuToIcacheType;
#
# signal wb_bram_in : wishbone_master_out;
# signal wb_bram_out : wishbone_slave_out;
#
# constant clk_period : time := 10 ns;
# begin
# icache0: entity work.icache
# generic map(
# LINE_SIZE => 64,
# NUM_LINES => 4
# )
# port map(
# clk => clk,
# rst => rst,
# i_in => i_out,
# i_out => i_in,
# m_in => m_out,
# stall_in => '0',
# flush_in => '0',
# inval_in => '0',
# wishbone_out => wb_bram_in,
# wishbone_in => wb_bram_out
# );
#
# -- BRAM Memory slave
# bram0: entity work.wishbone_bram_wrapper
# generic map(
# MEMORY_SIZE => 1024,
# RAM_INIT_FILE => "icache_test.bin"
# )
# port map(
# clk => clk,
# rst => rst,
# wishbone_in => wb_bram_in,
# wishbone_out => wb_bram_out
# );
#
# clk_process: process
# begin
# clk <= '0';
# wait for clk_period/2;
# clk <= '1';
# wait for clk_period/2;
# end process;
#
# rst_process: process
# begin
# rst <= '1';
# wait for 2*clk_period;
# rst <= '0';
# wait;
# end process;
#
# stim: process
# begin
# i_out.req <= '0';
# i_out.nia <= (others => '0');
# i_out.stop_mark <= '0';
#
# m_out.tlbld <= '0';
# m_out.tlbie <= '0';
# m_out.addr <= (others => '0');
# m_out.pte <= (others => '0');
#
# wait until rising_edge(clk);
# wait until rising_edge(clk);
# wait until rising_edge(clk);
# wait until rising_edge(clk);
#
# i_out.req <= '1';
# i_out.nia <= x"0000000000000004";
#
# wait for 30*clk_period;
# wait until rising_edge(clk);
#
# assert i_in.valid = '1' severity failure;
# assert i_in.insn = x"00000001"
# report "insn @" & to_hstring(i_out.nia) &
# "=" & to_hstring(i_in.insn) &
# " expected 00000001"
# severity failure;
#
# i_out.req <= '0';
#
# wait until rising_edge(clk);
#
# -- hit
# i_out.req <= '1';
# i_out.nia <= x"0000000000000008";
# wait until rising_edge(clk);
# wait until rising_edge(clk);
# assert i_in.valid = '1' severity failure;
# assert i_in.insn = x"00000002"
# report "insn @" & to_hstring(i_out.nia) &
# "=" & to_hstring(i_in.insn) &
# " expected 00000002"
# severity failure;
# wait until rising_edge(clk);
#
# -- another miss
# i_out.req <= '1';
# i_out.nia <= x"0000000000000040";
#
# wait for 30*clk_period;
# wait until rising_edge(clk);
#
# assert i_in.valid = '1' severity failure;
# assert i_in.insn = x"00000010"
# report "insn @" & to_hstring(i_out.nia) &
# "=" & to_hstring(i_in.insn) &
# " expected 00000010"
# severity failure;
#
# -- test something that aliases
# i_out.req <= '1';
# i_out.nia <= x"0000000000000100";
# wait until rising_edge(clk);
# wait until rising_edge(clk);
# assert i_in.valid = '0' severity failure;
# wait until rising_edge(clk);
#
# wait for 30*clk_period;
# wait until rising_edge(clk);
#
# assert i_in.valid = '1' severity failure;
# assert i_in.insn = x"00000040"
# report "insn @" & to_hstring(i_out.nia) &
# "=" & to_hstring(i_in.insn) &
# " expected 00000040"
# severity failure;
#
# i_out.req <= '0';
#
# std.env.finish;
# end process;
# end;
def icache_sim(dut):
    # the testbench's i_out drives the cache's i_in, and its i_in
    # observes the cache's i_out (same naming flip as the VHDL tb)
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    # the aliasing line must miss first (VHDL: i_in.valid = '0')
    valid = yield i_in.valid
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    nia = yield i_out.nia
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)


def test_icache():
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    #run_simulation(dut, icache_sim(dut), vcd_name='test_icache.vcd')

if __name__ == '__main__':
    test_icache()