1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const,
                    ResetSignal)
from nmigen.cli import main
from nmigen.cli import rtlil
from nmutil.iocontrol import RecordObject
from nmutil.byterev import byte_reverse
from nmutil.mask import Mask
from nmigen.utils import log2_int
from nmutil.util import Display

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

# for test
from nmigen_soc.wishbone.sram import SRAM
from nmigen import Memory
if True:
    from nmigen.back.pysim import Simulator, Delay, Settle
else:
    from nmigen.sim.cxxsim import Simulator, Delay, Settle
from nmutil.util import wrap


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 32
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row
# (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in
# BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit
# instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to
# select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# -- L1 ITLB.
# constant TLB_BITS : natural := log2(TLB_SIZE);
# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
# constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

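# Worked values for the default geometry above (added for reference;
# assumes WB_DATA_BITS = 64, as the BRAM organisation comment states):
#   ROW_SIZE = 8, ROW_PER_LINE = 8, BRAM_ROWS = 256, INSN_PER_ROW = 2,
#   INSN_BITS = 1, ROW_BITS = 8, ROW_LINE_BITS = 3, LINE_OFF_BITS = 6,
#   ROW_OFF_BITS = 3, INDEX_BITS = 5, SET_SIZE_BITS = 11, TAG_BITS = 45,
#   WAY_BITS = 2, TAG_RAM_WIDTH = 180, TLB_BITS = 6, TLB_EA_TAG_BITS = 46
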
# architecture rtl of icache is
#constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
#-- ROW_PER_LINE is the number of row (wishbone
#-- transactions) in a line
#constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
#-- BRAM_ROWS is the number of rows in BRAM
#-- needed to represent the full
#-- icache
#constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
#-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
#constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
#-- Bit fields counts in the address
#
#-- INSN_BITS is the number of bits to select
#-- an instruction in a row
#constant INSN_BITS : natural := log2(INSN_PER_ROW);
#-- ROW_BITS is the number of bits to select a row
#constant ROW_BITS : natural := log2(BRAM_ROWS);
#-- ROW_LINEBITS is the number of bits to
#-- select a row within a line
#constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
#-- LINE_OFF_BITS is the number of bits for the offset
#-- in a cache line
#constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
#-- ROW_OFF_BITS is the number of bits for the offset in a row
#constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
#-- INDEX_BITS is the number of bits to select a cache line
#constant INDEX_BITS : natural := log2(NUM_LINES);
#-- SET_SIZE_BITS is the log base 2 of the set size
#constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
#-- TAG_BITS is the number of bits of the tag part of the address
#constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
#-- WAY_BITS is the number of bits to select a way
#constant WAY_BITS : natural := log2(NUM_WAYS);

#-- Example of layout for 32 lines of 64 bytes:
#--
#--  ..  tag    |index|  line  |
#--  ..         |   row   |    |
#--  ..         |     |   | |00| zero          (2)
#--  ..         |     |   |-|  | INSN_BITS     (1)
#--  ..         |     |---|    | ROW_LINEBITS  (3)
#--  ..         |     |--- - --| LINE_OFF_BITS (6)
#--  ..         |         |- --| ROW_OFF_BITS  (3)
#--  ..         |----- ---|    | ROW_BITS      (8)
#--  ..         |-----|        | INDEX_BITS    (5)
#--  .. --------|              | TAG_BITS      (53)
# Example of layout for 32 lines of 64 bytes:
#
#  ..  tag    |index|  line  |
#  ..         |   row   |    |
#  ..         |     |   | |00| zero          (2)
#  ..         |     |   |-|  | INSN_BITS     (1)
#  ..         |     |---|    | ROW_LINEBITS  (3)
#  ..         |     |--- - --| LINE_OFF_BITS (6)
#  ..         |         |- --| ROW_OFF_BITS  (3)
#  ..         |----- ---|    | ROW_BITS      (8)
#  ..         |-----|        | INDEX_BITS    (5)
#  .. --------|              | TAG_BITS      (53)

#subtype row_t is integer range 0 to BRAM_ROWS-1;
#subtype index_t is integer range 0 to NUM_LINES-1;
#subtype way_t is integer range 0 to NUM_WAYS-1;
#subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
#
#-- The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
#-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
#-- not handle a clean (commented) definition of the cache tags as a 3d
#-- memory. For now, work around it by putting all the tags
#subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
# type cache_tags_set_t is array(way_t) of cache_tag_t;
# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
#constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
#subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
#type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

#-- The cache valid bits
#subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
#type cache_valids_t is array(index_t) of cache_way_valids_t;
#type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    # one NUM_WAYS-wide valid bitmask per cache line, per the VHDL
    # cache_valids_t above (the draft mistakenly sized this like
    # RowPerLineValidArray)
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal() for x in range(ROW_PER_LINE))


#attribute ram_style : string;
#attribute ram_style of cache_tags : signal is "distributed";
# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


#subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
#type tlb_valids_t is array(tlb_index_t) of std_ulogic;
#subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
#type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
#subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
#type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    return Array(Signal() for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE))

def TLBPTEArray():
    return Array(Signal(TLB_PTE_BITS) for x in range(TLB_SIZE))


#-- Cache RAM interface
#type cache_ram_out_t is array(way_t) of cache_row_t;
# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS))

#-- PLRU output interface
#type plru_out_t is array(index_t) of
# std_ulogic_vector(WAY_BITS-1 downto 0);
# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# begin
#
#     assert LINE_SIZE mod ROW_SIZE = 0;
#     assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
#         severity FAILURE;
#     assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
#         severity FAILURE;
#     assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
#         severity FAILURE;
#     assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
#         severity FAILURE;
#     assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
#         report "geometry bits don't add up" severity FAILURE;
#     assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
#         report "geometry bits don't add up" severity FAILURE;
#
#     sim_debug: if SIM generate
#     debug: process
#     begin
#         report "ROW_SIZE = " & natural'image(ROW_SIZE);
#         report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
#         report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
#         report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
#         report "INSN_BITS = " & natural'image(INSN_BITS);
#         report "ROW_BITS = " & natural'image(ROW_BITS);
#         report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
#         report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
#         report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
#         report "INDEX_BITS = " & natural'image(INDEX_BITS);
#         report "TAG_BITS = " & natural'image(TAG_BITS);
#         report "WAY_BITS = " & natural'image(WAY_BITS);
#     wait;
#     end process;
#     end generate;
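
# Python equivalents of the VHDL geometry assertions quoted above,
# checked once at import time (a sanity-check sketch added here; it is
# not part of the original translation)
assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert LINE_SIZE & (LINE_SIZE - 1) == 0, "LINE_SIZE not power of 2"
assert NUM_LINES & (NUM_LINES - 1) == 0, "NUM_LINES not power of 2"
assert ROW_PER_LINE & (ROW_PER_LINE - 1) == 0, \
    "ROW_PER_LINE not power of 2"
assert INSN_PER_ROW & (INSN_PER_ROW - 1) == 0, \
    "INSN_PER_ROW not power of 2"
assert ROW_BITS == INDEX_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + INDEX_BITS + LINE_OFF_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + ROW_BITS + ROW_OFF_BITS, \
    "geometry bits don't add up"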

# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2

# type reg_internal_t is record
#     -- Cache hit state (Latches for 1 cycle BRAM access)
#     hit_way   : way_t;
#     hit_nia   : std_ulogic_vector(63 downto 0);
#     hit_smark : std_ulogic;
#     hit_valid : std_ulogic;
#
#     -- Cache miss state (reload state machine)
#     state        : state_t;
#     wb           : wishbone_master_out;
#     store_way    : way_t;
#     store_index  : index_t;
#     store_row    : row_t;
#     store_tag    : cache_tag_t;
#     store_valid  : std_ulogic;
#     end_row_ix   : row_in_line_t;
#     rows_valid   : row_per_line_valid_t;
#
#     -- TLB miss state
#     fetch_failed : std_ulogic;
# end record;
class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.wb = WBMasterOut()
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()

# -- 64 bit direct mapped icache. All instructions are 4B aligned.
#
# entity icache is
#     generic (
#         SIM : boolean := false;
#         -- Line size in bytes
#         LINE_SIZE : positive := 64;
#         -- BRAM organisation: We never access more
#         -- than wishbone_data_bits
#         -- at a time so to save resources we make the
#         -- array only that wide,
#         -- and use consecutive indices for to make a cache "line"
#         --
#         -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
#         -- so 64-bits)
#         ROW_SIZE : positive := wishbone_data_bits / 8;
#         -- Number of lines in a set
#         NUM_LINES : positive := 32;
#         -- Number of ways
#         NUM_WAYS : positive := 4;
#         -- L1 ITLB number of entries (direct mapped)
#         TLB_SIZE : positive := 64;
#         -- L1 ITLB log_2(page_size)
#         TLB_LG_PGSZ : positive := 12;
#         -- Number of real address bits that we store
#         REAL_ADDR_BITS : positive := 56;
#         -- Non-zero to enable log data collection
#         LOG_LENGTH : natural := 0
#     );
#     port (
#         clk : in std_ulogic;
#         rst : in std_ulogic;
#
#         i_in  : in Fetch1ToIcacheType;
#         i_out : out IcacheToDecode1Type;
#
#         m_in  : in MmuToIcacheType;
#
#         stall_in  : in std_ulogic;
#         stall_out : out std_ulogic;
#         flush_in  : in std_ulogic;
#         inval_in  : in std_ulogic;
#
#         wishbone_out : out wishbone_master_out;
#         wishbone_in  : in wishbone_slave_out;
#
#         log_out : out std_ulogic_vector(53 downto 0)
#     );
# end entity icache;
# 64 bit direct mapped icache. All instructions are 4B aligned.
class ICache(Elaboratable):
    """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType()
        self.i_out = ICacheToDecode1Type()

        self.m_in = MMUToICacheType()

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(54)

    # -- Return the cache line index (tag index) for an address
    # function get_index(addr: std_ulogic_vector(63 downto 0))
    #     return index_t is
    # begin
    #     return to_integer(unsigned(
    #      addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
    #     ));
    # end;
    # Return the cache line index (tag index) for an address
    def get_index(addr):
        return addr[LINE_OFF_BITS:SET_SIZE_BITS]

    # -- Return the cache row index (data memory) for an address
    # function get_row(addr: std_ulogic_vector(63 downto 0))
    #     return row_t is
    # begin
    #     return to_integer(unsigned(
    #      addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
    #     ));
    # end;
    # Return the cache row index (data memory) for an address
    def get_row(addr):
        return addr[ROW_OFF_BITS:SET_SIZE_BITS]

    # -- Return the index of a row within a line
    # function get_row_of_line(row: row_t) return row_in_line_t is
    #     variable row_v : unsigned(ROW_BITS-1 downto 0);
    # begin
    #     row_v := to_unsigned(row, ROW_BITS);
    #     return row_v(ROW_LINEBITS-1 downto 0);
    # end;
    # Return the index of a row within a line
    def get_row_of_line(row):
        return row[:ROW_LINE_BITS]

    # -- Returns whether this is the last row of a line
    # function is_last_row_addr(addr: wishbone_addr_type;
    #     last: row_in_line_t
    #     )
    #     return boolean is
    # begin
    #     return unsigned(
    #      addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
    #     ) = last;
    # end;
    # Returns whether this is the last row of a line
    def is_last_row_addr(addr, last):
        return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

    # -- Returns whether this is the last row of a line
    # function is_last_row(row: row_t;
    #     last: row_in_line_t) return boolean is
    # begin
    #     return get_row_of_line(row) = last;
    # end;
    # Returns whether this is the last row of a line
    def is_last_row(row, last):
        return get_row_of_line(row) == last

    # -- Return the address of the next row in the current cache line
    # function next_row_addr(addr: wishbone_addr_type)
    #     return std_ulogic_vector is
    #     variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
    #     variable result  : wishbone_addr_type;
    # begin
    #     -- Is there no simpler way in VHDL to generate that 3 bits adder ?
    #     row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
    #     row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
    #     result := addr;
    #     result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
    #     return result;
    # end;
    # Return the address of the next row in the current cache line
    def next_row_addr(addr):
        # The VHDL's pair of variable assignments extracts the
        # row-within-line field, increments it (wrapping inside the
        # line), and splices it back into the address.  Re-concatenating
        # the three fields does the same thing here.
        row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
        return Cat(addr[:ROW_OFF_BITS], row_idx[:ROW_LINE_BITS],
                   addr[LINE_OFF_BITS:])

    # -- Return the next row in the current cache line. We use a dedicated
    # -- function in order to limit the size of the generated adder to be
    # -- only the bits within a cache line (3 bits with default settings)
    # function next_row(row: row_t) return row_t is
    #     variable row_v   : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
    #     variable result  : std_ulogic_vector(ROW_BITS-1 downto 0);
    # begin
    #     row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
    #     row_idx := row_v(ROW_LINEBITS-1 downto 0);
    #     row_v(ROW_LINEBITS-1 downto 0) :=
    #      std_ulogic_vector(unsigned(row_idx) + 1);
    #     return to_integer(unsigned(row_v));
    # end;
    # Return the next row in the current cache line. We use a dedicated
    # function in order to limit the size of the generated adder to be
    # only the bits within a cache line (3 bits with default settings)
    def next_row(row):
        # same trick as next_row_addr: only the low ROW_LINE_BITS bits
        # of the row number are incremented
        row_idx = row[:ROW_LINE_BITS] + 1
        return Cat(row_idx[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

    # -- Read the instruction word for the given address in the
    # -- current cache row
    # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
    #     data: cache_row_t) return std_ulogic_vector is
    #     variable word: integer range 0 to INSN_PER_ROW-1;
    # begin
    #     word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
    #     return data(31+word*32 downto word*32);
    # end;
    # Read the instruction word for the given address
    # in the current cache row
    def read_insn_word(addr, data):
        word = addr[2:INSN_BITS+2]
        # variable-index slice: use word_select to mux the 32-bit word
        return data.word_select(word, 32)

    # -- Get the tag value from the address
    # function get_tag(
    #     addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
    #     )
    #     return cache_tag_t is
    # begin
    #     return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
    # end;
    # Get the tag value from the address
    def get_tag(addr):
        return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

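    # Worked example (added for illustration, not in the original):
    # with the default geometry, LINE_OFF_BITS=6, SET_SIZE_BITS=11 and
    # REAL_ADDR_BITS=56, so for the real address 0x1FC4:
    #   get_index -> addr[6:11]  = 0b11111    (line 31)
    #   get_row   -> addr[3:11]  = 0b11111000 (BRAM row 248)
    #   get_tag   -> addr[11:56] = 0x3
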
    # -- Read a tag from a tag memory row
    # function read_tag(way: way_t; tagset: cache_tags_set_t)
    #     return cache_tag_t is
    # begin
    #     return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
    # end;
    # Read a tag from a tag memory row
    def read_tag(way, tagset):
        return tagset[way * TAG_BITS:(way + 1) * TAG_BITS]

    # -- Write a tag to tag memory row
    # procedure write_tag(way: in way_t;
    #     tagset: inout cache_tags_set_t; tag: cache_tag_t) is
    # begin
    #     tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
    # end;
    # Write a tag to tag memory row
    def write_tag(way, tagset, tag):
        # return the assignment so the caller can add it to a domain
        # (plain Python slice-assignment does not work on nmigen Values)
        return tagset[way * TAG_BITS:(way + 1) * TAG_BITS].eq(tag)

    # -- Simple hash for direct-mapped TLB index
    # function hash_ea(addr: std_ulogic_vector(63 downto 0))
    #     return tlb_index_t is
    #     variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
    # begin
    #     hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
    #      xor addr(
    #       TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
    #       TLB_LG_PGSZ + TLB_BITS
    #      )
    #      xor addr(
    #       TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
    #       TLB_LG_PGSZ + 2 * TLB_BITS
    #      );
    #     return to_integer(unsigned(hash));
    # end;
    # Simple hash for direct-mapped TLB index
    def hash_ea(addr):
        hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
            TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
        ] ^ addr[
            TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
        ]
        return hsh

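    # Reference model of hash_ea for plain Python integers (an
    # illustrative sketch added here, not in the original file):
    # handy for checking TLB index computations in unit tests.
    def hash_ea_int(addr):
        mask = (1 << TLB_BITS) - 1
        return (((addr >> TLB_LG_PGSZ) ^
                 (addr >> (TLB_LG_PGSZ + TLB_BITS)) ^
                 (addr >> (TLB_LG_PGSZ + 2 * TLB_BITS))) & mask)
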
    # -- Generate a cache RAM for each way
    # rams: for i in 0 to NUM_WAYS-1 generate
    #     signal do_read  : std_ulogic;
    #     signal do_write : std_ulogic;
    #     signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     signal dout     : cache_row_t;
    #     signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
    # begin
    #     way: entity work.cache_ram
    #         generic map (
    #             ROW_BITS => ROW_BITS,
    #             WIDTH => ROW_SIZE_BITS
    #         )
    #         port map (
    #             clk => clk,
    #             rd_en => do_read,
    #             rd_addr => rd_addr,
    #             rd_data => dout,
    #             wr_sel => wr_sel,
    #             wr_addr => wr_addr,
    #             wr_data => wishbone_in.dat
    #         );
    #     process(all)
    #     begin
    #         do_read <= not (stall_in or use_previous);
    #         do_write <= '0';
    #         if wishbone_in.ack = '1' and replace_way = i then
    #             do_write <= '1';
    #         end if;
    #         cache_out(i) <= dout;
    #         rd_addr <=
    #          std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
    #         wr_addr <=
    #          std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
    #         for i in 0 to ROW_SIZE-1 loop
    #             wr_sel(i) <= do_write;
    #         end loop;
    #     end process;
    # end generate;
    def rams(self, m):
        comb = m.d.comb

        do_read = Signal()
        do_write = Signal()
        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        _d_out = Signal(ROW_SIZE_BITS)
        wr_sel = Signal(ROW_SIZE)

        for i in range(NUM_WAYS):
            way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
            m.submodules += way
            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            # rd_data is an output of the RAM: read it, don't drive it
            comb += _d_out.eq(way.rd_data)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(0)

            with m.If(wb_in.ack & (replace_way == i)):
                comb += do_write.eq(1)

            comb += cache_out[i].eq(_d_out)
            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            for j in range(ROW_SIZE):
                comb += wr_sel[j].eq(do_write)

    # -- Generate PLRUs
    # maybe_plrus: if NUM_WAYS > 1 generate
    # begin
    #     plrus: for i in 0 to NUM_LINES-1 generate
    #         -- PLRU interface
    #         signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
    #         signal plru_acc_en : std_ulogic;
    #         signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
    #
    #     begin
    #         plru : entity work.plru
    #             generic map (
    #                 BITS => WAY_BITS
    #             )
    #             port map (
    #                 clk => clk,
    #                 rst => rst,
    #                 acc => plru_acc,
    #                 acc_en => plru_acc_en,
    #                 lru => plru_out
    #             );
    #
    #         process(all)
    #         begin
    #             -- PLRU interface
    #             if get_index(r.hit_nia) = i then
    #                 plru_acc_en <= r.hit_valid;
    #             else
    #                 plru_acc_en <= '0';
    #             end if;
    #             plru_acc <=
    #              std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
    #             plru_victim(i) <= plru_out;
    #         end process;
    #     end generate;
    # end generate;
    def maybe_plrus(self, m):
        comb = m.d.comb

        # elaboration-time condition, like the VHDL "generate"
        if NUM_WAYS > 1:
            for i in range(NUM_LINES):
                plru_acc = Signal(WAY_BITS)
                plru_acc_en = Signal()
                plru_out = Signal(WAY_BITS)
                plru = PLRU(WAY_BITS)
                m.submodules += plru
                comb += plru.acc.eq(plru_acc)
                comb += plru.acc_en.eq(plru_acc_en)
                # lru is an output of the PLRU
                comb += plru_out.eq(plru.lru)

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru.acc_en.eq(r.hit_valid)

                with m.Else():
                    comb += plru.acc_en.eq(0)

                comb += plru.acc.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru)

    # -- TLB hit detection and real address generation
    # itlb_lookup : process(all)
    #     variable pte : tlb_pte_t;
    #     variable ttag : tlb_tag_t;
    # begin
    #     tlb_req_index <= hash_ea(i_in.nia);
    #     pte := itlb_ptes(tlb_req_index);
    #     ttag := itlb_tags(tlb_req_index);
    #     if i_in.virt_mode = '1' then
    #         real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
    #                      i_in.nia(TLB_LG_PGSZ - 1 downto 0);
    #         if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
    #             ra_valid <= itlb_valids(tlb_req_index);
    #         else
    #             ra_valid <= '0';
    #         end if;
    #         eaa_priv <= pte(3);
    #     else
    #         real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
    #         ra_valid <= '1';
    #         eaa_priv <= '1';
    #     end if;
    #
    #     -- no IAMR, so no KUEP support for now
    #     priv_fault <= eaa_priv and not i_in.priv_mode;
    #     access_ok <= ra_valid and not priv_fault;
    # end process;
    # TLB hit detection and real address generation
    def itlb_lookup(self, m):
        comb = m.d.comb

        # VHDL process variables
        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                i_in.nia[:TLB_LG_PGSZ],
                pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
            ))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            with m.Else():
                comb += ra_valid.eq(0)

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # -- iTLB update
    # itlb_update: process(clk)
    #     variable wr_index : tlb_index_t;
    # begin
    #     if rising_edge(clk) then
    #         wr_index := hash_ea(m_in.addr);
    #         if rst = '1' or
    #             (m_in.tlbie = '1' and m_in.doall = '1') then
    #             -- clear all valid bits
    #             for i in tlb_index_t loop
    #                 itlb_valids(i) <= '0';
    #             end loop;
    #         elsif m_in.tlbie = '1' then
    #             -- clear entry regardless of hit or miss
    #             itlb_valids(wr_index) <= '0';
    #         elsif m_in.tlbld = '1' then
    #             itlb_tags(wr_index) <=
    #                 m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
    #             itlb_ptes(wr_index) <= m_in.pte;
    #             itlb_valids(wr_index) <= '1';
    #         end if;
    #     end if;
    # end process;
    # iTLB update
    def itlb_update(self, m):
        comb = m.d.comb
        sync = m.d.sync

        # a VHDL process variable: combinatorial, not registered
        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        # (ResetSignal() stands in for the VHDL rst port)
        with m.If(ResetSignal() | (m_in.tlbie & m_in.doall)):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
            )
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # -- Cache hit detection, output to fetch2 and other misc logic
    # icache_comb : process(all)
    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m):
        # variable is_hit  : std_ulogic;
        # variable hit_way : way_t;
        comb = m.d.comb

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        # begin
        #     -- i_in.sequential means that i_in.nia this cycle
        #     -- is 4 more than last cycle. If we read more
        #     -- than 32 bits at a time, had a cache hit last
        #     -- cycle, and we don't want the first 32-bit chunk
        #     -- then we can keep the data we read last cycle
        #     -- and just use that.
        #     if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
        #         use_previous <= i_in.sequential and r.hit_valid;
        #     else
        #         use_previous <= '0';
        #     end if;
        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        with m.Else():
            comb += use_previous.eq(0)

        #     -- Extract line, row and tag from request
        #     req_index <= get_index(i_in.nia);
        #     req_row <= get_row(i_in.nia);
        #     req_tag <= get_tag(real_addr);
        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        #     -- Calculate address of beginning of cache row, will be
        #     -- used for cache miss processing if needed
        #     req_laddr <=
        #      (63 downto REAL_ADDR_BITS => '0') &
        #      real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
        #      (ROW_OFF_BITS-1 downto 0 => '0');
        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
            Const(0, ROW_OFF_BITS),
            real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
            Const(0, 64 - REAL_ADDR_BITS)  # zero-pad up to 64 bits
        ))

        #     -- Test if pending request is a hit on any way
        #     hit_way := 0;
        #     is_hit := '0';
        #     for i in way_t loop
        #         if i_in.req = '1' and
        #             (cache_valids(req_index)(i) = '1' or
        #              (r.state = WAIT_ACK and
        #               req_index = r.store_index and
        #               i = r.store_way and
        #               r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
        #             if read_tag(i, cache_tags(req_index)) = req_tag then
        #                 hit_way := i;
        #                 is_hit := '1';
        #             end if;
        #         end if;
        #     end loop;
        # Test if pending request is a hit on any way
        for i in range(NUM_WAYS):
            with m.If(i_in.req &
                      (cache_valid_bits[req_index][i] |
                       ((r.state == State.WAIT_ACK)
                        & (req_index == r.store_index)
                        & (i == r.store_way)
                        # req_row mod ROW_PER_LINE: the low bits
                        & r.rows_valid[req_row[:ROW_LINE_BITS]]))):
                with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        #     -- Generate the "hit" and "miss" signals
        #     -- for the synchronous blocks
        #     if i_in.req = '1' and access_ok = '1' and flush_in = '0'
        #         and rst = '0' then
        #         req_is_hit <= is_hit;
        #         req_is_miss <= not is_hit;
        #     else
        #         req_is_hit <= '0';
        #         req_is_miss <= '0';
        #     end if;
        #     req_hit_way <= hit_way;
        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        comb += req_hit_way.eq(hit_way)

        #     -- The way to replace on a miss
        #     if r.state = CLR_TAG then
        #         replace_way <=
        #             to_integer(unsigned(plru_victim(r.store_index)));
        #     else
        #         replace_way <= r.store_way;
        #     end if;
        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])

        with m.Else():
            comb += replace_way.eq(r.store_way)

        #     -- Output instruction from current cache row
        #     --
        #     -- Note: This is a mild violation of our design principle of
        #     -- having pipeline stages output from a clean latch. In this
        #     -- case we output the result of a mux. The alternative would
        #     -- be output an entire row which I prefer not to do just yet
        #     -- as it would force fetch2 to know about some of the cache
        #     -- geometry information.
        #     i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
        #     i_out.valid <= r.hit_valid;
        #     i_out.nia <= r.hit_nia;
        #     i_out.stop_mark <= r.hit_smark;
        #     i_out.fetch_failed <= r.fetch_failed;
        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(
            read_insn_word(r.hit_nia, cache_out[r.hit_way])
        )
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        #     -- Stall fetch1 if we have a miss on cache or TLB
        #     -- or a protection fault
        #     stall_out <= not (is_hit and access_ok);
        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        #     -- Wishbone requests output (from the cache miss reload machine)
        #     wishbone_out <= r.wb;
        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)
        # end process;

    # -- Cache hit synchronous machine
    # icache_hit : process(clk)
    # Cache hit synchronous machine
    def icache_hit(self, m):
        sync = m.d.sync
        # begin
        # if rising_edge(clk) then
        #     -- keep outputs to fetch2 unchanged on a stall
        #     -- except that flush or reset sets valid to 0
        #     -- If use_previous, keep the same data as last
        #     -- cycle and use the second half
        #     if stall_in = '1' or use_previous = '1' then
        #         if rst = '1' or flush_in = '1' then
        #             r.hit_valid <= '0';
        #         end if;
        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            # (ResetSignal() stands in for the VHDL rst port)
            with m.If(ResetSignal() | flush_in):
                sync += r.hit_valid.eq(0)
        # else
        #     -- On a hit, latch the request for the next cycle,
        #     -- when the BRAM data will be available on the
        #     -- cache_out output of the corresponding way
        #     r.hit_valid <= req_is_hit;
        #     if req_is_hit = '1' then
        #         r.hit_way <= req_hit_way;
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)

                # report "cache hit nia:" & to_hstring(i_in.nia) &
                #  " IR:" & std_ulogic'image(i_in.virt_mode) &
                #  " SM:" & std_ulogic'image(i_in.stop_mark) &
                #  " idx:" & integer'image(req_index) &
                #  " tag:" & to_hstring(req_tag) &
                #  " way:" & integer'image(req_hit_way) &
                #  " RA:" & to_hstring(real_addr);
                # simulation-time report (a Python print here would only
                # print Signal reprs once, at elaboration time)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x "
                                "tag:%x way:%x RA:%x",
                                i_in.nia, i_in.virt_mode, i_in.stop_mark,
                                req_index, req_tag, req_hit_way,
                                real_addr)
        # end if;
        # end if;
        # if stall_in = '0' then
        #     -- Send stop marks and NIA down regardless of validity
        #     r.hit_smark <= i_in.stop_mark;
        #     r.hit_nia <= i_in.nia;
        # end if;
        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)
        # end if;
        # end process;

    # -- Cache miss/reload synchronous machine
    # icache_miss : process(clk)
    # Cache miss/reload synchronous machine
    def icache_miss(self, m):
        comb = m.d.comb
        sync = m.d.sync

        # variable tagset : cache_tags_set_t;
        # variable stbs_done : boolean;

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        # begin
        # if rising_edge(clk) then
        #     -- On reset, clear all valid bits to force misses
        #     if rst = '1' then
        # On reset, clear all valid bits to force misses
        # (ResetSignal() stands in for the VHDL rst port; nmigen's
        # sync domain also applies these resets implicitly)
        with m.If(ResetSignal()):
            # for i in index_t loop
            #     cache_valids(i) <= (others => '0');
            # end loop;
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)

            # r.state <= IDLE;
            # r.wb.cyc <= '0';
            # r.wb.stb <= '0';
            sync += r.state.eq(State.IDLE)
            sync += r.wb.cyc.eq(0)
            sync += r.wb.stb.eq(0)

            # -- We only ever do reads on wishbone
            # r.wb.dat <= (others => '0');
            # r.wb.sel <= "11111111";
            # r.wb.we  <= '0';
            # We only ever do reads on wishbone
            sync += r.wb.dat.eq(0)
            sync += r.wb.sel.eq(Const(0b11111111, 8))
            sync += r.wb.we.eq(0)

            # -- Not useful normally but helps avoiding
            # -- tons of sim warnings
            # r.wb.adr <= (others => '0');
            # Not useful normally but helps avoiding tons of sim warnings
            sync += r.wb.adr.eq(0)

        # else
        with m.Else():
            # -- Process cache invalidations
            # if inval_in = '1' then
            #     for i in index_t loop
            #         cache_valids(i) <= (others => '0');
            #     end loop;
            #     r.store_valid <= '0';
            # end if;
            # Process cache invalidations
            with m.If(inval_in):
                for i in range(NUM_LINES):
                    sync += cache_valid_bits[i].eq(0)

                sync += r.store_valid.eq(0)

            # -- Main state machine
            # case r.state is
            # Main state machine
            with m.Switch(r.state):

                # when IDLE =>
                with m.Case(State.IDLE):
                    # -- Reset per-row valid flags,
                    # -- only used in WAIT_ACK
                    # for i in 0 to ROW_PER_LINE - 1 loop
                    #     r.rows_valid(i) <= '0';
                    # end loop;
                    # Reset per-row valid flags,
                    # only used in WAIT_ACK
                    for i in range(ROW_PER_LINE):
                        sync += r.rows_valid[i].eq(0)

                    # -- We need to read a cache line
                    # if req_is_miss = '1' then
                    #     report "cache miss nia:" & to_hstring(i_in.nia) &
                    #      " IR:" & std_ulogic'image(i_in.virt_mode) &
                    #      " SM:" & std_ulogic'image(i_in.stop_mark) &
                    #      " idx:" & integer'image(req_index) &
                    #      " way:" & integer'image(replace_way) &
                    #      " tag:" & to_hstring(req_tag) &
                    #      " RA:" & to_hstring(real_addr);
                    # We need to read a cache line
                    with m.If(req_is_miss):
                        # simulation-time report, as in icache_hit
                        sync += Display(
                            "cache miss nia:%x IR:%x SM:%x idx:%x "
                            "way:%x tag:%x RA:%x",
                            i_in.nia, i_in.virt_mode, i_in.stop_mark,
                            req_index, replace_way, req_tag, real_addr)

                        # -- Keep track of our index and way for
                        # -- subsequent stores
                        # r.store_index <= req_index;
                        # r.store_row <= get_row(req_laddr);
                        # r.store_tag <= req_tag;
                        # r.store_valid <= '1';
                        # r.end_row_ix <=
                        #  get_row_of_line(get_row(req_laddr)) - 1;
                        # Keep track of our index and way
                        # for subsequent stores
                        sync += r.store_index.eq(req_index)
                        sync += r.store_row.eq(get_row(req_laddr))
                        sync += r.store_tag.eq(req_tag)
                        sync += r.store_valid.eq(1)
                        sync += r.end_row_ix.eq(
                            get_row_of_line(get_row(req_laddr)) - 1
                        )

                        # -- Prep for first wishbone read. We calculate the
                        # -- address of the start of the cache line and
                        # -- start the WB cycle.
                        # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
                        # r.wb.cyc <= '1';
                        # r.wb.stb <= '1';
                        # Prep for first wishbone read.
                        # We calculate the address of the start of the
                        # cache line and start the WB cycle.
                        sync += r.wb.adr.eq(req_laddr[:r.wb.adr.width])
                        sync += r.wb.cyc.eq(1)
                        sync += r.wb.stb.eq(1)

                        # -- Track that we had one request sent
                        # r.state <= CLR_TAG;
                        # Track that we had one request sent
                        sync += r.state.eq(State.CLR_TAG)
                    # end if;

                # when CLR_TAG | WAIT_ACK =>
                with m.Case(State.CLR_TAG, State.WAIT_ACK):
                    # if r.state = CLR_TAG then
                    with m.If(r.state == State.CLR_TAG):
                        # -- Get victim way from plru
                        # r.store_way <= replace_way;
                        # Get victim way from plru
                        sync += r.store_way.eq(replace_way)

                        # -- Force misses on that way while
                        # -- reloading that line
                        # cache_valids(req_index)(replace_way) <= '0';
                        # Force misses on that way while
                        # reloading that line
                        sync += cache_valid_bits[req_index].bit_select(
                            replace_way, 1).eq(0)

                        # -- Store new tag in selected way
                        # for i in 0 to NUM_WAYS-1 loop
                        #     if i = replace_way then
                        #         tagset := cache_tags(r.store_index);
                        #         write_tag(i, tagset, r.store_tag);
                        #         cache_tags(r.store_index) <= tagset;
                        #     end if;
                        # end loop;
                        for i in range(NUM_WAYS):
                            with m.If(i == replace_way):
                                comb += tagset.eq(
                                    cache_tags[r.store_index]
                                )
                                comb += write_tag(
                                    i, tagset, r.store_tag
                                )
                                sync += cache_tags[r.store_index].eq(
                                    tagset
                                )

                        # r.state <= WAIT_ACK;
                        sync += r.state.eq(State.WAIT_ACK)
                    # end if;

                    # -- Requests are all sent if stb is 0
                    # stbs_done := r.wb.stb = '0';
                    # Requests are all sent if stb is 0
                    comb += stbs_done.eq(r.wb.stb == 0)

                    # -- If we are still sending requests,
                    # -- was one accepted ?
                    # if wishbone_in.stall = '0' and not stbs_done then
                    # If we are still sending requests,
                    # was one accepted?
                    with m.If(~wb_in.stall & ~stbs_done):
                        # -- That was the last word ? We are done sending.
                        # -- Clear stb and set stbs_done so we can handle
                        # -- an eventual last ack on the same cycle.
                        # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
                        #     r.wb.stb <= '0';
                        #     stbs_done := true;
                        # end if;
                        # That was the last word? We are done sending.
                        # Clear stb and set stbs_done so we can handle
                        # an eventual last ack on the same cycle.
                        with m.If(is_last_row_addr(
                                  r.wb.adr, r.end_row_ix)):
                            sync += r.wb.stb.eq(0)
                            comb += stbs_done.eq(1)

                        # -- Calculate the next row address
                        # r.wb.adr <= next_row_addr(r.wb.adr);
                        # Calculate the next row address
                        sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
                    # end if;

                    # -- Incoming acks processing
                    # if wishbone_in.ack = '1' then
                    # Incoming acks processing
                    with m.If(wb_in.ack):
                        # r.rows_valid(r.store_row mod ROW_PER_LINE)
                        #  <= '1';
                        # store_row mod ROW_PER_LINE is just the low
                        # ROW_LINE_BITS bits
                        sync += r.rows_valid[
                            r.store_row[:ROW_LINE_BITS]
                        ].eq(1)

                        # -- Check for completion
                        # if stbs_done and
                        #     is_last_row(r.store_row, r.end_row_ix) then
                        # Check for completion
                        with m.If(stbs_done & is_last_row(
                                  r.store_row, r.end_row_ix)):
                            # -- Complete wishbone cycle
                            # r.wb.cyc <= '0';
                            # Complete wishbone cycle
                            sync += r.wb.cyc.eq(0)

                            # -- Cache line is now valid
                            # cache_valids(r.store_index)(replace_way) <=
                            #     r.store_valid and not inval_in;
                            # Cache line is now valid
                            sync += cache_valid_bits[
                                r.store_index
                            ].bit_select(replace_way, 1).eq(
                                r.store_valid & ~inval_in
                            )

                            # -- We are done
                            # r.state <= IDLE;
                            # We are done
                            sync += r.state.eq(State.IDLE)
                        # end if;

                        # -- Increment store row counter
                        # r.store_row <= next_row(r.store_row);
                        # Increment store row counter
                        sync += r.store_row.eq(next_row(r.store_row))
                    # end if;
            # end case;
        # end if;
        #
        # -- TLB miss and protection fault processing
        # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
        #     r.fetch_failed <= '0';
        # elsif i_in.req = '1' and access_ok = '0' and
        #     stall_in = '0' then
        #     r.fetch_failed <= '1';
        # end if;
        # TLB miss and protection fault processing
        with m.If(ResetSignal() | flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)

        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)
        # end if;
        # end process;

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, log_out):
        comb = m.d.comb
        sync = m.d.sync

        # elaboration-time guard, like the VHDL "generate"
        if LOG_LENGTH == 0:
            return

        # -- Output data to logger
        # signal log_data : std_ulogic_vector(53 downto 0);
        # begin
        # data_log: process(clk)
        #     variable lway: way_t;
        #     variable wstate: std_ulogic;
        # Output data to logger
        log_data = Signal(54)
        lway = Signal(3)        # logged as 3 bits, as in the VHDL
        wstate = Signal()

        # begin
        # if rising_edge(clk) then
        #     lway := req_hit_way;
        #     wstate := '0';
        comb += lway.eq(req_hit_way)
        comb += wstate.eq(0)

        # if r.state /= IDLE then
        #     wstate := '1';
        # end if;
        with m.If(r.state != State.IDLE):
            comb += wstate.eq(1)

        # log_data <= i_out.valid &
        #  i_out.insn &
        #  wishbone_in.ack &
        #  r.wb.adr(5 downto 3) &
        #  r.wb.stb & r.wb.cyc &
        #  wishbone_in.stall &
        #  stall_out &
        #  r.fetch_failed &
        #  r.hit_nia(5 downto 2) &
        #  wstate &
        #  std_ulogic_vector(to_unsigned(lway, 3)) &
        #  req_is_hit & req_is_miss &
        #  access_ok &
        #  ra_valid;
        sync += log_data.eq(Cat(
            ra_valid, access_ok, req_is_miss, req_is_hit,
            lway, wstate, r.hit_nia[2:6],
            r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
            r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
            i_out.valid
        ))
        # end if;
        # end process;
        # log_out <= log_data;
        comb += log_out.eq(log_data)
        # end generate;

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # signal itlb_valids : tlb_valids_t;
        # signal itlb_tags : tlb_tags_t;
        # signal itlb_ptes : tlb_ptes_t;
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPTEArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # -- Privilege bit from PTE EAA field
        # signal eaa_priv : std_ulogic;
        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # signal r : reg_internal_t;
        r = RegInternal()

        # -- Async signals on incoming request
        # signal req_index : index_t;
        # signal req_row : row_t;
        # signal req_hit_way : way_t;
        # signal req_tag : cache_tag_t;
        # signal req_is_hit : std_ulogic;
        # signal req_is_miss : std_ulogic;
        # signal req_laddr : std_ulogic_vector(63 downto 0);
        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # signal tlb_req_index : tlb_index_t;
        # signal real_addr : std_ulogic_vector(
        #  REAL_ADDR_BITS - 1 downto 0
        # );
        # signal ra_valid : std_ulogic;
        # signal priv_fault : std_ulogic;
        # signal access_ok : std_ulogic;
        # signal use_previous : std_ulogic;
        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # signal cache_out : cache_ram_out_t;
        cache_out = CacheRamOut()

        # signal plru_victim : plru_out_t;
        # signal replace_way : way_t;
        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # TODO: call the methods above (rams, maybe_plrus, itlb_lookup,
        # itlb_update, icache_comb, icache_hit, icache_miss) and plumb
        # the local signals created here through to them
        return m


# icache_tb.vhdl
#
# library ieee;
# use ieee.std_logic_1164.all;
#
# library work;
# use work.common.all;
# use work.wishbone_types.all;
#
# entity icache_tb is
# end icache_tb;
#
# architecture behave of icache_tb is
#     signal clk : std_ulogic;
#     signal rst : std_ulogic;
#
#     signal i_out : Fetch1ToIcacheType;
#     signal i_in  : IcacheToDecode1Type;
#
#     signal m_out : MmuToIcacheType;
#
#     signal wb_bram_in  : wishbone_master_out;
#     signal wb_bram_out : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     icache0: entity work.icache
#         generic map(
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#         )
#         port map(
#             clk => clk,
#             rst => rst,
#             i_in => i_out,
#             i_out => i_in,
#             m_in => m_out,
#             stall_in => '0',
#             flush_in => '0',
#             inval_in => '0',
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#         );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#         )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#         );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         i_out.req <= '0';
#         i_out.nia <= (others => '0');
#         i_out.stop_mark <= '0';
#
#         m_out.tlbld <= '0';
#         m_out.tlbie <= '0';
#         m_out.addr <= (others => '0');
#         m_out.pte <= (others => '0');
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000004";
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000001"
#             report "insn @" & to_hstring(i_out.nia) &
#             "=" & to_hstring(i_in.insn) &
#             " expected 00000001"
#             severity failure;
#
#         i_out.req <= '0';
#
#         wait until rising_edge(clk);
#
#         -- hit
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000008";
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000002"
#             report "insn @" & to_hstring(i_out.nia) &
#             "=" & to_hstring(i_in.insn) &
#             " expected 00000002"
#             severity failure;
#         wait until rising_edge(clk);
#
#         -- another miss
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000040";
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000010"
#             report "insn @" & to_hstring(i_out.nia) &
#             "=" & to_hstring(i_in.insn) &
#             " expected 00000010"
#             severity failure;
#
#         -- test something that aliases
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000100";
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         assert i_in.valid = '0' severity failure;
#         wait until rising_edge(clk);
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000040"
#             report "insn @" & to_hstring(i_out.nia) &
#             "=" & to_hstring(i_in.insn) &
#             " expected 00000040"
#             severity failure;
#
#         i_out.req <= '0';
#
#         std.env.finish;
#     end process;
# end;
def icache_sim(dut):
    # the testbench drives the DUT's inputs and observes its outputs,
    # mirroring the VHDL testbench port map above (i_in => i_out, etc.)
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    # mid-reload: the aliasing access must miss (valid = '0' in the
    # VHDL testbench)
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)


def test_icache():
    dut = ICache()

    m = Module()
    m.submodules.icache = dut

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    test_icache()