# icache.py: fixed numerous bugs as specified by lkcl on bugzilla
# [soc.git] / src / soc / experiment / icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

# for test
from nmigen_soc.wishbone.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap
if True:
    from nmigen.back.pysim import Simulator, Delay, Settle
else:
    from nmigen.sim.cxxsim import Simulator, Delay, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 32
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row
# (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in
# BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit
# instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

print("ROW_SIZE", ROW_SIZE)
print("ROW_SIZE_BITS", ROW_SIZE_BITS)
print("ROW_PER_LINE", ROW_PER_LINE)
print("BRAM_ROWS", BRAM_ROWS)
print("INSN_PER_ROW", INSN_PER_ROW)

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to
# select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to a whole number of bytes)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
# TAG_RAM_WIDTH uses the byte-aligned TAG_WIDTH stride per way, which
# must match the stride used by read_tag/write_tag below
TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

# -- L1 ITLB.
# constant TLB_BITS : natural := log2(TLB_SIZE);
# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
# constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64


print("INSN_BITS", INSN_BITS)
print("ROW_BITS", ROW_BITS)
print("ROW_LINE_BITS", ROW_LINE_BITS)
print("LINE_OFF_BITS", LINE_OFF_BITS)
print("ROW_OFF_BITS", ROW_OFF_BITS)
print("INDEX_BITS", INDEX_BITS)
print("SET_SIZE_BITS", SET_SIZE_BITS)
print("TAG_BITS", TAG_BITS)
print("WAY_BITS", WAY_BITS)
print("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print("TLB_BITS", TLB_BITS)
print("TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
print("TLB_PTE_BITS", TLB_PTE_BITS)


# architecture rtl of icache is
#constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
#-- ROW_PER_LINE is the number of row (wishbone
#-- transactions) in a line
#constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
#-- BRAM_ROWS is the number of rows in BRAM
#-- needed to represent the full
#-- icache
#constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
#-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
#constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
#-- Bit fields counts in the address
#
#-- INSN_BITS is the number of bits to select
#-- an instruction in a row
#constant INSN_BITS : natural := log2(INSN_PER_ROW);
#-- ROW_BITS is the number of bits to select a row
#constant ROW_BITS : natural := log2(BRAM_ROWS);
#-- ROW_LINEBITS is the number of bits to
#-- select a row within a line
#constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
#-- LINE_OFF_BITS is the number of bits for the offset
#-- in a cache line
#constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
#-- ROW_OFF_BITS is the number of bits for the offset in a row
#constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
#-- INDEX_BITS is the number of bits to select a cache line
#constant INDEX_BITS : natural := log2(NUM_LINES);
#-- SET_SIZE_BITS is the log base 2 of the set size
#constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
#-- TAG_BITS is the number of bits of the tag part of the address
#constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
#-- WAY_BITS is the number of bits to select a way
#constant WAY_BITS : natural := log2(NUM_WAYS);

#-- Example of layout for 32 lines of 64 bytes:
#--
#-- ..  tag    |index|  line  |
#-- ..         |   row   |    |
#-- ..         |     |   |00| | zero          (2)
#-- ..         |     |   |-|  | INSN_BITS     (1)
#-- ..         |     |---|    | ROW_LINEBITS  (3)
#-- ..         |     |--- - --| LINE_OFF_BITS (6)
#-- ..         |         |- --| ROW_OFF_BITS  (3)
#-- ..         |----- ---|    | ROW_BITS      (8)
#-- ..         |-----|        | INDEX_BITS    (5)
#-- ..  --------|             | TAG_BITS      (53)
# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   |00| | zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINEBITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# ..  --------|             | TAG_BITS      (53)

#subtype row_t is integer range 0 to BRAM_ROWS-1;
#subtype index_t is integer range 0 to NUM_LINES-1;
#subtype way_t is integer range 0 to NUM_WAYS-1;
#subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
#
#-- The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
#-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
#-- not handle a clean (commented) definition of the cache tags as a 3d
#-- memory. For now, work around it by putting all the tags
#subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
# type cache_tags_set_t is array(way_t) of cache_tag_t;
# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
#constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
#subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
#type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" %x) \
                 for x in range(NUM_LINES))

#-- The cache valid bits
#subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
#type cache_valids_t is array(index_t) of cache_way_valids_t;
#type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" %x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" %x) \
                 for x in range(ROW_PER_LINE))


#attribute ram_style : string;
#attribute ram_style of cache_tags : signal is "distributed";
# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


#subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
#type tlb_valids_t is array(tlb_index_t) of std_ulogic;
#subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
#type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
#subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
#type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    return Array(Signal(name="tlbvalid_%d" %x) \
                 for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" %x) \
                 for x in range(TLB_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" %x) \
                 for x in range(TLB_SIZE))


#-- Cache RAM interface
#type cache_ram_out_t is array(way_t) of cache_row_t;
# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
                 for x in range(NUM_WAYS))

#-- PLRU output interface
#type plru_out_t is array(index_t) of
# std_ulogic_vector(WAY_BITS-1 downto 0);
# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
                 for x in range(NUM_LINES))

# -- Return the cache line index (tag index) for an address
# function get_index(addr: std_ulogic_vector(63 downto 0))
#  return index_t is
# begin
#     return to_integer(unsigned(
#      addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
#     ));
# end;
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# -- Return the cache row index (data memory) for an address
# function get_row(addr: std_ulogic_vector(63 downto 0))
#  return row_t is
# begin
#     return to_integer(unsigned(
#      addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
#     ));
# end;
# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# -- Return the index of a row within a line
# function get_row_of_line(row: row_t) return row_in_line_t is
#  variable row_v : unsigned(ROW_BITS-1 downto 0);
# begin
#     row_v := to_unsigned(row, ROW_BITS);
#     return row_v(ROW_LINEBITS-1 downto 0);
# end;
# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# -- Returns whether this is the last row of a line
# function is_last_row_addr(addr: wishbone_addr_type;
#  last: row_in_line_t
#  )
#  return boolean is
# begin
#     return unsigned(
#      addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
#     ) = last;
# end;
# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# -- Returns whether this is the last row of a line
# function is_last_row(row: row_t;
#  last: row_in_line_t) return boolean is
# begin
#     return get_row_of_line(row) = last;
# end;
# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# -- Return the next row in the current cache line. We use a dedicated
# -- function in order to limit the size of the generated adder to be
# -- only the bits within a cache line (3 bits with default settings)
# function next_row(row: row_t) return row_t is
#  variable row_v   : std_ulogic_vector(ROW_BITS-1 downto 0);
#  variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
#  variable result  : std_ulogic_vector(ROW_BITS-1 downto 0);
# begin
#     row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
#     row_idx := row_v(ROW_LINEBITS-1 downto 0);
#     row_v(ROW_LINEBITS-1 downto 0) :=
#      std_ulogic_vector(unsigned(row_idx) + 1);
#     return to_integer(unsigned(row_v));
# end;
# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
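
# Illustrative software model of next_row (a sketch for clarity only;
# the helper name _next_row_model is ours and is not used by the
# hardware above): only the ROW_LINE_BITS LSBs of the row number
# increment and wrap, the upper (line number) bits are untouched,
# e.g. with ROW_LINE_BITS=3, 0b00000111 -> 0b00000000, not 0b00001000.
def _next_row_model(row):
    line = row >> ROW_LINE_BITS         # upper bits: which cache line
    idx = (row + 1) % ROW_PER_LINE      # lower bits wrap within the line
    return (line << ROW_LINE_BITS) | idx
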
# -- Read the instruction word for the given address in the
# -- current cache row
# function read_insn_word(addr: std_ulogic_vector(63 downto 0);
#  data: cache_row_t) return std_ulogic_vector is
#  variable word: integer range 0 to INSN_PER_ROW-1;
# begin
#     word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
#     return data(31+word*32 downto word*32);
# end;
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

# -- Get the tag value from the address
# function get_tag(
#  addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
#  )
#  return cache_tag_t is
# begin
#     return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
# end;
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# -- Read a tag from a tag memory row
# function read_tag(way: way_t; tagset: cache_tags_set_t)
#  return cache_tag_t is
# begin
#     return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
# end;
# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# -- Write a tag to tag memory row
# procedure write_tag(way: in way_t;
#  tagset: inout cache_tags_set_t; tag: cache_tag_t) is
# begin
#     tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
# end;
# Write a tag to tag memory row
# (use the same byte-aligned TAG_WIDTH stride as read_tag, otherwise
# tags written for ways > 0 would never match on read-back)
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)
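
# Illustrative sketch of the tag-RAM packing (the helper name
# _read_tag_model is ours, not part of the original code): each way's
# tag occupies a byte-aligned TAG_WIDTH slot inside the TAG_RAM_WIDTH
# row, with only the low TAG_BITS of each slot significant.
def _read_tag_model(way, tagset_int):
    return (tagset_int >> (way * TAG_WIDTH)) & ((1 << TAG_BITS) - 1)
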

# -- Simple hash for direct-mapped TLB index
# function hash_ea(addr: std_ulogic_vector(63 downto 0))
#  return tlb_index_t is
#  variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
# begin
#     hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
#             xor addr(
#              TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
#              TLB_LG_PGSZ + TLB_BITS
#             )
#             xor addr(
#              TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
#              TLB_LG_PGSZ + 2 * TLB_BITS
#             );
#     return to_integer(unsigned(hash));
# end;
# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
           TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
          ] ^ addr[
           TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
          ]
    return hsh
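
# Illustrative software model of hash_ea (the helper name
# _hash_ea_model is ours, not part of the original code): the three
# TLB_BITS-wide address fields directly above the page offset are
# XOR-folded into a direct-mapped TLB index.
def _hash_ea_model(addr):
    field = lambda n: (addr >> (TLB_LG_PGSZ + n * TLB_BITS)) & (TLB_SIZE - 1)
    return field(0) ^ field(1) ^ field(2)
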

# begin
#
#     assert LINE_SIZE mod ROW_SIZE = 0;
#     assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
#      severity FAILURE;
#     assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
#      severity FAILURE;
#     assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
#      severity FAILURE;
#     assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
#      severity FAILURE;
#     assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
#      report "geometry bits don't add up" severity FAILURE;
#     assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
#      report "geometry bits don't add up" severity FAILURE;
#     assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
#      report "geometry bits don't add up" severity FAILURE;
#     assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
#      report "geometry bits don't add up" severity FAILURE;
#
#     sim_debug: if SIM generate
#     debug: process
#     begin
#         report "ROW_SIZE      = " & natural'image(ROW_SIZE);
#         report "ROW_PER_LINE  = " & natural'image(ROW_PER_LINE);
#         report "BRAM_ROWS     = " & natural'image(BRAM_ROWS);
#         report "INSN_PER_ROW  = " & natural'image(INSN_PER_ROW);
#         report "INSN_BITS     = " & natural'image(INSN_BITS);
#         report "ROW_BITS      = " & natural'image(ROW_BITS);
#         report "ROW_LINEBITS  = " & natural'image(ROW_LINEBITS);
#         report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
#         report "ROW_OFF_BITS  = " & natural'image(ROW_OFF_BITS);
#         report "INDEX_BITS    = " & natural'image(INDEX_BITS);
#         report "TAG_BITS      = " & natural'image(TAG_BITS);
#         report "WAY_BITS      = " & natural'image(WAY_BITS);
#         wait;
#     end process;
#     end generate;
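
# Python port of the VHDL geometry assertions above (a sketch: checked
# once at import time rather than at elaboration)
assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert LINE_SIZE & (LINE_SIZE - 1) == 0, "LINE_SIZE not power of 2"
assert NUM_LINES & (NUM_LINES - 1) == 0, "NUM_LINES not power of 2"
assert ROW_PER_LINE & (ROW_PER_LINE - 1) == 0, "ROW_PER_LINE not power of 2"
assert INSN_PER_ROW & (INSN_PER_ROW - 1) == 0, "INSN_PER_ROW not power of 2"
assert ROW_BITS == INDEX_BITS + ROW_LINE_BITS, "geometry bits don't add up"
assert LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS, \
       "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + INDEX_BITS + LINE_OFF_BITS, \
       "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + ROW_BITS + ROW_OFF_BITS, \
       "geometry bits don't add up"
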

# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2

# type reg_internal_t is record
#     -- Cache hit state (Latches for 1 cycle BRAM access)
#     hit_way   : way_t;
#     hit_nia   : std_ulogic_vector(63 downto 0);
#     hit_smark : std_ulogic;
#     hit_valid : std_ulogic;
#
#     -- Cache miss state (reload state machine)
#     state       : state_t;
#     wb          : wishbone_master_out;
#     store_way   : way_t;
#     store_index : index_t;
#     store_row   : row_t;
#     store_tag   : cache_tag_t;
#     store_valid : std_ulogic;
#     end_row_ix  : row_in_line_t;
#     rows_valid  : row_per_line_valid_t;
#
#     -- TLB miss state
#     fetch_failed : std_ulogic;
# end record;
class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        # widths follow the VHDL subtypes: way_t, index_t and row_t
        # are log2-sized indices, not one-hot vectors
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()

# -- 64 bit direct mapped icache. All instructions are 4B aligned.
#
# entity icache is
#     generic (
#         SIM : boolean := false;
#         -- Line size in bytes
#         LINE_SIZE : positive := 64;
#         -- BRAM organisation: We never access more
#         -- than wishbone_data_bits
#         -- at a time so to save resources we make the
#         -- array only that wide,
#         -- and use consecutive indices for to make a cache "line"
#         --
#         -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
#         -- so 64-bits)
#         ROW_SIZE : positive := wishbone_data_bits / 8;
#         -- Number of lines in a set
#         NUM_LINES : positive := 32;
#         -- Number of ways
#         NUM_WAYS : positive := 4;
#         -- L1 ITLB number of entries (direct mapped)
#         TLB_SIZE : positive := 64;
#         -- L1 ITLB log_2(page_size)
#         TLB_LG_PGSZ : positive := 12;
#         -- Number of real address bits that we store
#         REAL_ADDR_BITS : positive := 56;
#         -- Non-zero to enable log data collection
#         LOG_LENGTH : natural := 0
#     );
#     port (
#         clk : in std_ulogic;
#         rst : in std_ulogic;
#
#         i_in  : in Fetch1ToIcacheType;
#         i_out : out IcacheToDecode1Type;
#
#         m_in  : in MmuToIcacheType;
#
#         stall_in  : in std_ulogic;
#         stall_out : out std_ulogic;
#         flush_in  : in std_ulogic;
#         inval_in  : in std_ulogic;
#
#         wishbone_out : out wishbone_master_out;
#         wishbone_in  : in wishbone_slave_out;
#
#         log_out : out std_ulogic_vector(53 downto 0)
#     );
# end entity icache;
# 64 bit direct mapped icache. All instructions are 4B aligned.
class ICache(Elaboratable):
    """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        self.log_out = Signal(54)


    # -- Generate a cache RAM for each way
    # rams: for i in 0 to NUM_WAYS-1 generate
    #     signal do_read  : std_ulogic;
    #     signal do_write : std_ulogic;
    #     signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    #     signal dout     : cache_row_t;
    #     signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
    # begin
    #     way: entity work.cache_ram
    #     generic map (
    #         ROW_BITS => ROW_BITS,
    #         WIDTH => ROW_SIZE_BITS
    #     )
    #     port map (
    #         clk     => clk,
    #         rd_en   => do_read,
    #         rd_addr => rd_addr,
    #         rd_data => dout,
    #         wr_sel  => wr_sel,
    #         wr_addr => wr_addr,
    #         wr_data => wishbone_in.dat
    #     );
    #     process(all)
    #     begin
    #         do_read <= not (stall_in or use_previous);
    #         do_write <= '0';
    #         if wishbone_in.ack = '1' and replace_way = i then
    #             do_write <= '1';
    #         end if;
    #         cache_out(i) <= dout;
    #         rd_addr <=
    #          std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
    #         wr_addr <=
    #          std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
    #         for i in 0 to ROW_SIZE-1 loop
    #             wr_sel(i) <= do_write;
    #         end loop;
    #     end process;
    # end generate;
    def rams(self, m, r, cache_out, use_previous, replace_way, req_row):
        comb = m.d.comb

        wb_in, stall_in = self.wb_in, self.stall_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))

            with m.If(wb_in.ack & (replace_way == i)):
                comb += do_write.eq(1)

            comb += cache_out[i].eq(d_out)
            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            for j in range(ROW_SIZE):
                comb += wr_sel[j].eq(do_write)

    # -- Generate PLRUs
    # maybe_plrus: if NUM_WAYS > 1 generate
    # begin
    #     plrus: for i in 0 to NUM_LINES-1 generate
    #         -- PLRU interface
    #         signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
    #         signal plru_acc_en : std_ulogic;
    #         signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
    #
    #     begin
    #         plru : entity work.plru
    #         generic map (
    #             BITS => WAY_BITS
    #         )
    #         port map (
    #             clk => clk,
    #             rst => rst,
    #             acc => plru_acc,
    #             acc_en => plru_acc_en,
    #             lru => plru_out
    #         );
    #
    #         process(all)
    #         begin
    #             -- PLRU interface
    #             if get_index(r.hit_nia) = i then
    #                 plru_acc_en <= r.hit_valid;
    #             else
    #                 plru_acc_en <= '0';
    #             end if;
    #             plru_acc <=
    #              std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
    #             plru_victim(i) <= plru_out;
    #         end process;
    #     end generate;
    # end generate;
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        # NUM_WAYS is an elaboration-time constant, so plain python "if"
        if NUM_WAYS > 1:
            for i in range(NUM_LINES):
                plru_acc_i = Signal(WAY_BITS)
                plru_acc_en = Signal()
                plru = PLRU(WAY_BITS)
                # each PLRU must be registered as a submodule, otherwise
                # it is never elaborated
                setattr(m.submodules, "plru_%d" % i, plru)
                comb += plru.acc_i.eq(plru_acc_i)
                comb += plru.acc_en.eq(plru_acc_en)

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru_acc_en.eq(r.hit_valid)

                comb += plru_acc_i.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru_o)

    # -- TLB hit detection and real address generation
    # itlb_lookup : process(all)
    #     variable pte  : tlb_pte_t;
    #     variable ttag : tlb_tag_t;
    # begin
    #     tlb_req_index <= hash_ea(i_in.nia);
    #     pte := itlb_ptes(tlb_req_index);
    #     ttag := itlb_tags(tlb_req_index);
    #     if i_in.virt_mode = '1' then
    #         real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
    #                      i_in.nia(TLB_LG_PGSZ - 1 downto 0);
    #         if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
    #             ra_valid <= itlb_valids(tlb_req_index);
    #         else
    #             ra_valid <= '0';
    #         end if;
    #         eaa_priv <= pte(3);
    #     else
    #         real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
    #         ra_valid <= '1';
    #         eaa_priv <= '1';
    #     end if;
    #
    #     -- no IAMR, so no KUEP support for now
    #     priv_fault <= eaa_priv and not i_in.priv_mode;
    #     access_ok <= ra_valid and not priv_fault;
    # end process;
    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                    priv_fault, access_ok):
        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                     i_in.nia[:TLB_LG_PGSZ],
                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
                    ))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # -- iTLB update
    # itlb_update: process(clk)
    #     variable wr_index : tlb_index_t;
    # begin
    #     if rising_edge(clk) then
    #         wr_index := hash_ea(m_in.addr);
    #         if rst = '1' or
    #          (m_in.tlbie = '1' and m_in.doall = '1') then
    #             -- clear all valid bits
    #             for i in tlb_index_t loop
    #                 itlb_valids(i) <= '0';
    #             end loop;
    #         elsif m_in.tlbie = '1' then
    #             -- clear entry regardless of hit or miss
    #             itlb_valids(wr_index) <= '0';
    #         elsif m_in.tlbld = '1' then
    #             itlb_tags(wr_index) <=
    #              m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
    #             itlb_ptes(wr_index) <= m_in.pte;
    #             itlb_valids(wr_index) <= '1';
    #         end if;
    #     end if;
    # end process;
    # iTLB update
    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        # wr_index is a VHDL *variable*: it must be valid in the same
        # cycle it is used, hence comb, not sync (sync delayed the
        # index by one clock, invalidating/loading the wrong entry)
        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                     m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
                    )
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # -- Cache hit detection, output to fetch2 and other misc logic
    # icache_comb : process(all)
    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valid_bits, cache_tags, access_ok, req_is_hit,
                    req_is_miss, replace_way, plru_victim, cache_out):
        # variable is_hit  : std_ulogic;
        # variable hit_way : way_t;
        comb = m.d.comb

        #comb += Display("ENTER icache_comb - use_previous:%x req_index:%x " \
        #                "req_row:%x req_tag:%x real_addr:%x req_laddr:%x " \
        #                "access_ok:%x req_is_hit:%x req_is_miss:%x " \
        #                "replace_way:%x", use_previous, req_index, req_row, \
        #                req_tag, real_addr, req_laddr, access_ok, \
        #                req_is_hit, req_is_miss, replace_way)

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        # begin
        #     -- i_in.sequential means that i_in.nia this cycle
        #     -- is 4 more than last cycle. If we read more
        #     -- than 32 bits at a time, had a cache hit last
        #     -- cycle, and we don't want the first 32-bit chunk
        #     -- then we can keep the data we read last cycle
        #     -- and just use that.
        #     if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
        #         use_previous <= i_in.sequential and r.hit_valid;
        #     else
        #         use_previous <= '0';
        #     end if;
        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        #     -- Extract line, row and tag from request
        #     req_index <= get_index(i_in.nia);
        #     req_row <= get_row(i_in.nia);
        #     req_tag <= get_tag(real_addr);
        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        #     -- Calculate address of beginning of cache row, will be
        #     -- used for cache miss processing if needed
        #     req_laddr <=
        #      (63 downto REAL_ADDR_BITS => '0') &
        #      real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
        #      (ROW_OFF_BITS-1 downto 0 => '0');
        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0b0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                 Const(0b0, 64 - REAL_ADDR_BITS)
                ))

        #     -- Test if pending request is a hit on any way
        #     hit_way := 0;
        #     is_hit := '0';
        #     for i in way_t loop
        #         if i_in.req = '1' and
        #             (cache_valids(req_index)(i) = '1' or
        #              (r.state = WAIT_ACK and
        #               req_index = r.store_index and
        #               i = r.store_way and
        #               r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
        #             if read_tag(i, cache_tags(req_index)) = req_tag then
        #                 hit_way := i;
        #                 is_hit := '1';
        #             end if;
        #         end if;
        #     end loop;
        # Test if pending request is a hit on any way
        for i in range(NUM_WAYS):
            with m.If(i_in.req &
                      (cache_valid_bits[req_index][i] |
                       ((r.state == State.WAIT_ACK)
                        & (req_index == r.store_index)
                        & (i == r.store_way)
                        & r.rows_valid[req_row % ROW_PER_LINE]))):
                with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        #     -- Generate the "hit" and "miss" signals
        #     -- for the synchronous blocks
        #     if i_in.req = '1' and access_ok = '1' and flush_in = '0'
        #      and rst = '0' then
        #         req_is_hit  <= is_hit;
        #         req_is_miss <= not is_hit;
        #     else
        #         req_is_hit  <= '0';
        #         req_is_miss <= '0';
        #     end if;
        #     req_hit_way <= hit_way;
        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        # req_hit_way was previously never driven: without this the way
        # that actually hit is lost and r.hit_way always latches way 0
        comb += req_hit_way.eq(hit_way)

        #     -- The way to replace on a miss
        #     if r.state = CLR_TAG then
        #         replace_way <=
        #          to_integer(unsigned(plru_victim(r.store_index)));
        #     else
        #         replace_way <= r.store_way;
        #     end if;
        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])

        with m.Else():
            comb += replace_way.eq(r.store_way)

        #     -- Output instruction from current cache row
        #     --
        #     -- Note: This is a mild violation of our design principle of
        #     -- having pipeline stages output from a clean latch. In this
        #     -- case we output the result of a mux. The alternative would
        #     -- be output an entire row which I prefer not to do just yet
        #     -- as it would force fetch2 to know about some of the cache
        #     -- geometry information.
        #     i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
        #     i_out.valid <= r.hit_valid;
        #     i_out.nia <= r.hit_nia;
        #     i_out.stop_mark <= r.hit_smark;
        #     i_out.fetch_failed <= r.fetch_failed;
        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        #comb += Display("BEFORE read_insn_word - r.hit_nia:%x " \
        #                "r.hit_way:%x, cache_out[r.hit_way]:%x", r.hit_nia, \
        #                r.hit_way, cache_out[r.hit_way])
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out[r.hit_way]))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        #     -- Stall fetch1 if we have a miss on cache or TLB
        #     -- or a protection fault
        #     stall_out <= not (is_hit and access_ok);
        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        #     -- Wishbone requests output (from the cache miss reload machine)
        #     wishbone_out <= r.wb;
        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)
        # end process;

    # -- Cache hit synchronous machine
    # icache_hit : process(clk)
    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # begin
        #     if rising_edge(clk) then
        #         -- keep outputs to fetch2 unchanged on a stall
        #         -- except that flush or reset sets valid to 0
        #         -- If use_previous, keep the same data as last
        #         -- cycle and use the second half
        #         if stall_in = '1' or use_previous = '1' then
        #             if rst = '1' or flush_in = '1' then
        #                 r.hit_valid <= '0';
        #             end if;
        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        # else
        #     -- On a hit, latch the request for the next cycle,
        #     -- when the BRAM data will be available on the
        #     -- cache_out output of the corresponding way
        #     r.hit_valid <= req_is_hit;
        #     if req_is_hit = '1' then
        #         r.hit_way <= req_hit_way;
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)

                # report "cache hit nia:" & to_hstring(i_in.nia) &
                #  " IR:" & std_ulogic'image(i_in.virt_mode) &
                #  " SM:" & std_ulogic'image(i_in.stop_mark) &
                #  " idx:" & integer'image(req_index) &
                #  " tag:" & to_hstring(req_tag) &
                #  " way:" & integer'image(req_hit_way) &
                #  " RA:" & to_hstring(real_addr);
                sync += Display("cache hit nia:%x, IR:%x, SM:%x, idx:%x, " \
                                "tag:%x, way:%x, RA:%x", i_in.nia, \
                                i_in.virt_mode, i_in.stop_mark, req_index, \
                                req_tag, req_hit_way, real_addr)

        # end if;
        # end if;
        # if stall_in = '0' then
        #     -- Send stop marks and NIA down regardless of validity
        #     r.hit_smark <= i_in.stop_mark;
        #     r.hit_nia <= i_in.nia;
        # end if;
        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)
        # end if;
        # end process;

    # -- Cache miss/reload synchronous machine
    # icache_miss : process(clk)
    # Cache miss/reload synchronous machine
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        # variable tagset    : cache_tags_set_t;
        # variable stbs_done : boolean;

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        # begin
        #     if rising_edge(clk) then
        #         -- On reset, clear all valid bits to force misses
        #         if rst = '1' then
        #             for i in index_t loop
        #                 cache_valids(i) <= (others => '0');
        #             end loop;
        #             r.state <= IDLE;
        #             r.wb.cyc <= '0';
        #             r.wb.stb <= '0';
        #             -- We only ever do reads on wishbone
        #             r.wb.dat <= (others => '0');
        #             r.wb.sel <= "11111111";
        #             r.wb.we  <= '0';
        #
        #             -- Not useful normally but helps avoiding
        #             -- tons of sim warnings
        #             r.wb.adr <= (others => '0');
        #
        #         else
        #
        #             -- Process cache invalidations
        #             if inval_in = '1' then
        #                 for i in index_t loop
        #                     cache_valids(i) <= (others => '0');
        #                 end loop;
        #                 r.store_valid <= '0';
        #             end if;
        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        #             -- Main state machine
        #             case r.state is
        # Main state machine
        with m.Switch(r.state):

            # when IDLE =>
            with m.Case(State.IDLE):
                # -- Reset per-row valid flags,
                # -- only used in WAIT_ACK
                # for i in 0 to ROW_PER_LINE - 1 loop
                #     r.rows_valid(i) <= '0';
                # end loop;
                # Reset per-row valid flags,
                # only used in WAIT_ACK
                for i in range(ROW_PER_LINE):
                    sync += r.rows_valid[i].eq(0)

                # -- We need to read a cache line
                # if req_is_miss = '1' then
                #     report "cache miss nia:" & to_hstring(i_in.nia) &
                #      " IR:" & std_ulogic'image(i_in.virt_mode) &
                #      " SM:" & std_ulogic'image(i_in.stop_mark) &
                #      " idx:" & integer'image(req_index) &
                #      " way:" & integer'image(replace_way) &
                #      " tag:" & to_hstring(req_tag) &
                #      " RA:" & to_hstring(real_addr);
                # We need to read a cache line
                with m.If(req_is_miss):
                    sync += Display(
                             "cache miss nia:%x IR:%x SM:%x idx:%x way:%x " \
                             "tag:%x RA:%x", i_in.nia, i_in.virt_mode, \
                             i_in.stop_mark, req_index, replace_way, \
                             req_tag, real_addr)

                    # -- Keep track of our index and way for
                    # -- subsequent stores
                    # r.store_index <= req_index;
                    # r.store_row <= get_row(req_laddr);
                    # r.store_tag <= req_tag;
                    # r.store_valid <= '1';
                    # r.end_row_ix <=
                    #  get_row_of_line(get_row(req_laddr)) - 1;
                    # Keep track of our index and way
                    # for subsequent stores
                    sync += r.store_index.eq(req_index)
                    sync += r.store_row.eq(get_row(req_laddr))
                    sync += r.store_tag.eq(req_tag)
                    sync += r.store_valid.eq(1)
                    sync += r.end_row_ix.eq(
                             get_row_of_line(
                              get_row(req_laddr)
                             ) - 1
                            )

                    # -- Prep for first wishbone read. We calculate the
                    # -- address of the start of the cache line and
                    # -- start the WB cycle.
                    # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
                    # r.wb.cyc <= '1';
                    # r.wb.stb <= '1';
                    # Prep for first wishbone read. We calculate the
                    # address of the start of the cache line and
                    # start the WB cycle.
                    sync += r.wb.adr.eq(req_laddr)
                    sync += r.wb.cyc.eq(1)
                    sync += r.wb.stb.eq(1)

                    # -- Track that we had one request sent
                    # r.state <= CLR_TAG;
                    # Track that we had one request sent
                    sync += r.state.eq(State.CLR_TAG)
                # end if;

            # when CLR_TAG | WAIT_ACK =>
            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                # if r.state = CLR_TAG then
                with m.If(r.state == State.CLR_TAG):
                    # -- Get victim way from plru
                    # r.store_way <= replace_way;
                    # Get victim way from plru
                    sync += r.store_way.eq(replace_way)

                    # -- Force misses on that way while
                    # -- reloading that line
                    # cache_valids(req_index)(replace_way) <= '0';
                    # Force misses on that way while
                    # reloading that line
                    cv = Signal(NUM_WAYS)
                    comb += cv.eq(cache_valid_bits[req_index])
                    comb += cv.bit_select(replace_way, 1).eq(0)
                    sync += cache_valid_bits[req_index].eq(cv)

                    # -- Store new tag in selected way
                    # for i in 0 to NUM_WAYS-1 loop
                    #     if i = replace_way then
                    #         tagset := cache_tags(r.store_index);
                    #         write_tag(i, tagset, r.store_tag);
                    #         cache_tags(r.store_index) <= tagset;
                    #     end if;
                    # end loop;
                    for i in range(NUM_WAYS):
                        with m.If(i == replace_way):
                            comb += tagset.eq(cache_tags[r.store_index])
                            comb += write_tag(i, tagset, r.store_tag)
                            sync += cache_tags[r.store_index].eq(tagset)

                    # r.state <= WAIT_ACK;
                    sync += r.state.eq(State.WAIT_ACK)
                # end if;

                # -- Requests are all sent if stb is 0
                # stbs_done := r.wb.stb = '0';
                # Requests are all sent if stb is 0.
                # stbs_zero holds the VHDL variable's value *before* any
                # update below; deriving it from stbs_done itself (as the
                # previous code did) creates a combinatorial loop
                stbs_zero = Signal()
                comb += stbs_zero.eq(r.wb.stb == 0)
                comb += stbs_done.eq(stbs_zero)

                # -- If we are still sending requests,
                # -- was one accepted ?
                # if wishbone_in.stall = '0' and not stbs_done then
                # If we are still sending requests,
                # was one accepted?
                with m.If(~wb_in.stall & ~stbs_zero):
                    # -- That was the last word ? We are done sending.
                    # -- Clear stb and set stbs_done so we can handle
                    # -- an eventual last ack on the same cycle.
                    # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
                    #     r.wb.stb <= '0';
                    #     stbs_done := true;
                    # end if;
                    # That was the last word? We are done sending.
                    # Clear stb and set stbs_done so we can handle
                    # an eventual last ack on the same cycle.
                    with m.If(is_last_row_addr(r.wb.adr, r.end_row_ix)):
                        sync += r.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                    # -- Calculate the next row address
                    # r.wb.adr <= next_row_addr(r.wb.adr);
                    # Calculate the next row address: increment only the
                    # row bits of the address, leaving the rest untouched
                    # (the previous code overwrote the whole address with
                    # the 3-bit row count, zeroing the upper bits)
                    rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
                    comb += rarange.eq(
                             r.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
                            )
                    sync += r.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
                # end if;

                # -- Incoming acks processing
                # if wishbone_in.ack = '1' then
                # Incoming acks processing
                with m.If(wb_in.ack):
                    # r.rows_valid(r.store_row mod ROW_PER_LINE)
                    #  <= '1';
                    sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

                    # -- Check for completion
                    # if stbs_done and
                    #  is_last_row(r.store_row, r.end_row_ix) then
                    # Check for completion
                    with m.If(stbs_done &
                              is_last_row(r.store_row, r.end_row_ix)):
                        # -- Complete wishbone cycle
                        # r.wb.cyc <= '0';
                        # Complete wishbone cycle
                        sync += r.wb.cyc.eq(0)

                        # -- Cache line is now valid
                        # cache_valids(r.store_index)(replace_way) <=
                        #  r.store_valid and not inval_in;
                        # Cache line is now valid
                        cv = Signal(NUM_WAYS)
                        comb += cv.eq(cache_valid_bits[r.store_index])
                        comb += cv.bit_select(replace_way, 1).eq(
                                 r.store_valid & ~inval_in
                                )
                        sync += cache_valid_bits[r.store_index].eq(cv)

                        # -- We are done
                        # r.state <= IDLE;
                        # We are done
                        sync += r.state.eq(State.IDLE)
                    # end if;

                    # -- Increment store row counter
                    # r.store_row <= next_row(r.store_row);
                    # Increment store row counter
                    sync += r.store_row.eq(next_row(r.store_row))
                # end if;
            # end case;
        # end if;
        #
        # -- TLB miss and protection fault processing
        # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
        #     r.fetch_failed <= '0';
        # elsif i_in.req = '1' and access_ok = '0' and
        #  stall_in = '0' then
        #     r.fetch_failed <= '1';
        # end if;
        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)

        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)
        # end if;
        # end process;

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # -- Output data to logger
        # signal log_data : std_ulogic_vector(53 downto 0);
        # begin
        #     data_log: process(clk)
        #         variable lway: way_t;
        #         variable wstate: std_ulogic;
        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            # the VHDL packs lway as 3 bits, so use the same width
            # here to keep log_data's 54-bit layout
            lway = Signal(3)
            wstate = Signal()

            # begin
            #     if rising_edge(clk) then
            #         lway := req_hit_way;
            #         wstate := '0';
            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            # if r.state /= IDLE then
            #     wstate := '1';
            # end if;
            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            # log_data <= i_out.valid &
            #             i_out.insn &
            #             wishbone_in.ack &
            #             r.wb.adr(5 downto 3) &
            #             r.wb.stb & r.wb.cyc &
            #             wishbone_in.stall &
            #             stall_out &
            #             r.fetch_failed &
            #             r.hit_nia(5 downto 2) &
            #             wstate &
            #             std_ulogic_vector(to_unsigned(lway, 3)) &
            #             req_is_hit & req_is_miss &
            #             access_ok &
            #             ra_valid;
            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6],
                     r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
                     r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
                     i_out.valid
                    ))
            # end if;
            # end process;
            # log_out <= log_data;
            comb += log_out.eq(log_data)
        # end generate;
        # end;

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # signal itlb_valids : tlb_valids_t;
        # signal itlb_tags : tlb_tags_t;
        # signal itlb_ptes : tlb_ptes_t;
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPtesArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # -- Privilege bit from PTE EAA field
        # signal eaa_priv : std_ulogic;
        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # signal r : reg_internal_t;
        r = RegInternal()

        # -- Async signals on incoming request
        # signal req_index   : index_t;
        # signal req_row     : row_t;
        # signal req_hit_way : way_t;
        # signal req_tag     : cache_tag_t;
        # signal req_is_hit  : std_ulogic;
        # signal req_is_miss : std_ulogic;
        # signal req_laddr   : std_ulogic_vector(63 downto 0);
        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # signal tlb_req_index : tlb_index_t;
        # signal real_addr     : std_ulogic_vector(
        #                         REAL_ADDR_BITS - 1 downto 0
        #                        );
        # signal ra_valid      : std_ulogic;
        # signal priv_fault    : std_ulogic;
        # signal access_ok     : std_ulogic;
        # signal use_previous  : std_ulogic;
        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # signal cache_out : cache_ram_out_t;
        cache_out = CacheRamOut()

        # signal plru_victim : plru_out_t;
        # signal replace_way : way_t;
        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.rams(m, r, cache_out, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
                         real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                         priv_fault, access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row,
                         req_hit_way, req_tag, real_addr, req_laddr,
                         cache_valid_bits, cache_tags, access_ok,
                         req_is_hit, req_is_miss, replace_way,
                         plru_victim, cache_out)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


# icache_tb.vhdl
#
# library ieee;
# use ieee.std_logic_1164.all;
#
# library work;
# use work.common.all;
# use work.wishbone_types.all;
#
# entity icache_tb is
# end icache_tb;
#
# architecture behave of icache_tb is
#     signal clk : std_ulogic;
#     signal rst : std_ulogic;
#
#     signal i_out : Fetch1ToIcacheType;
#     signal i_in  : IcacheToDecode1Type;
#
#     signal m_out : MmuToIcacheType;
#
#     signal wb_bram_in  : wishbone_master_out;
#     signal wb_bram_out : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     icache0: entity work.icache
#         generic map(
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#         )
#         port map(
#             clk => clk,
#             rst => rst,
#             i_in => i_out,
#             i_out => i_in,
#             m_in => m_out,
#             stall_in => '0',
#             flush_in => '0',
#             inval_in => '0',
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#         );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#         )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#         );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         i_out.req <= '0';
#         i_out.nia <= (others => '0');
#         i_out.stop_mark <= '0';
#
#         m_out.tlbld <= '0';
#         m_out.tlbie <= '0';
#         m_out.addr <= (others => '0');
#         m_out.pte <= (others => '0');
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000004";
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000001"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000001"
#             severity failure;
#
#         i_out.req <= '0';
#
#         wait until rising_edge(clk);
#
#         -- hit
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000008";
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000002"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000002"
#             severity failure;
#         wait until rising_edge(clk);
#
#         -- another miss
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000040";
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000010"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000010"
#             severity failure;
#
#         -- test something that aliases
#         i_out.req <= '1';
#         i_out.nia <= x"0000000000000100";
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         assert i_in.valid = '0' severity failure;
#         wait until rising_edge(clk);
#
#         wait for 30*clk_period;
#         wait until rising_edge(clk);
#
#         assert i_in.valid = '1' severity failure;
#         assert i_in.insn = x"00000040"
#             report "insn @" & to_hstring(i_out.nia) &
#                 "=" & to_hstring(i_in.insn) &
#                 " expected 00000040"
#             severity failure;
#
#         i_out.req <= '0';
#
#         std.env.finish;
#     end process;
# end;
def icache_sim(dut):
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    yield i_in.valid.eq(0)
    yield i_out.priv_mode.eq(1)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    print(f"valid? {valid}")
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_in.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    # bitwise ~ on a python int is always truthy, so use "not"
    assert not valid
    for i in range(30):
        yield
    yield
    nia = yield i_out.nia
    valid = yield i_in.valid
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)



def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr[3:])
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    mem = []
    for i in range(0, 512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)