remove reviewed comments
[soc.git] / src / soc / experiment / icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
22 from enum import Enum, unique
23 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
24 from nmigen.cli import main, rtlil
25 from nmutil.iocontrol import RecordObject
26 from nmigen.utils import log2_int
27 from nmutil.util import Display
28
29 #from nmutil.plru import PLRU
30 from soc.experiment.cache_ram import CacheRam
31 from soc.experiment.plru import PLRU
32
33 from soc.experiment.mem_types import (Fetch1ToICacheType,
34 ICacheToDecode1Type,
35 MMUToICacheType)
36
37 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
38 WB_SEL_BITS, WBAddrType, WBDataType,
39 WBSelType, WBMasterOut, WBSlaveOut,
40 WBMasterOutVector, WBSlaveOutVector,
41 WBIOMasterOut, WBIOSlaveOut)
42
43 # for test
44 from nmigen_soc.wishbone.sram import SRAM
45 from nmigen import Memory
46 from nmutil.util import wrap
47 from nmigen.cli import main, rtlil
48 if True:
49 from nmigen.back.pysim import Simulator, Delay, Settle
50 else:
51 from nmigen.sim.cxxsim import Simulator, Delay, Settle
52
53
# Simulation flag (counterpart of the SIM generic in the VHDL original)
SIM = 0
# Cache line size in bytes
LINE_SIZE = 64
# BRAM organisation: no access is ever wider than the wishbone data
# bus, so to save resources each BRAM row is only that wide and a
# cache "line" is assembled from consecutive rows.
#
# ROW_SIZE is the width in bytes of a BRAM row (derived from the
# wishbone data width)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

# Width of one BRAM row in bits
ROW_SIZE_BITS = 8 * ROW_SIZE
# Number of rows (wishbone transactions) making up one line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# Number of BRAM rows needed to hold the full icache
BRAM_ROWS = ROW_PER_LINE * NUM_LINES
# Number of 32-bit instructions held by one BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Dump the derived geometry at import time (same order as before)
for _label, _value in (
        ("ROW_SIZE", ROW_SIZE),
        ("ROW_SIZE_BITS", ROW_SIZE_BITS),
        ("ROW_PER_LINE", ROW_PER_LINE),
        ("BRAM_ROWS", BRAM_ROWS),
        ("INSN_PER_ROW", INSN_PER_ROW),
):
    print(_label, _value)
del _label, _value
91
# Bit field widths within an address
#
# Bits needed to select an instruction within a row
INSN_BITS = log2_int(INSN_PER_ROW)
# Bits needed to select a BRAM row
ROW_BITS = log2_int(BRAM_ROWS)
# Bits needed to select a row within a line
ROW_LINEBITS = log2_int(ROW_PER_LINE)
# Bits of byte offset within a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# Bits of byte offset within a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# Bits needed to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# log2 of the set size
SET_SIZE_BITS = INDEX_BITS + LINE_OFF_BITS
# Bits in the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# Width in bits of each way of the tag RAM: TAG_BITS rounded up to a
# whole number of bytes
TAG_WIDTH = 8 * ((TAG_BITS + 7) // 8)

# Bits needed to select a way
WAY_BITS = log2_int(NUM_WAYS)
# Width of one tag RAM row: the tags of all ways side by side
TAG_RAM_WIDTH = NUM_WAYS * TAG_BITS

# L1 ITLB geometry
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - TLB_LG_PGSZ - TLB_BITS
TLB_PTE_BITS = 64


# Dump the derived bit widths at import time (same order as before)
for _label, _value in (
        ("INSN_BITS", INSN_BITS),
        ("ROW_BITS", ROW_BITS),
        ("ROW_LINEBITS", ROW_LINEBITS),
        ("LINE_OFF_BITS", LINE_OFF_BITS),
        ("ROW_OFF_BITS", ROW_OFF_BITS),
        ("INDEX_BITS", INDEX_BITS),
        ("SET_SIZE_BITS", SET_SIZE_BITS),
        ("TAG_BITS", TAG_BITS),
        ("WAY_BITS", WAY_BITS),
        ("TAG_RAM_WIDTH", TAG_RAM_WIDTH),
        ("TLB_BITS", TLB_BITS),
        ("TLB_EA_TAG_BITS", TLB_EA_TAG_BITS),
        ("TLB_PTE_BITS", TLB_PTE_BITS),
):
    print(_label, _value)
del _label, _value
148
149
150
151
152 # architecture rtl of icache is
153 #constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
154 #-- ROW_PER_LINE is the number of row (wishbone
155 #-- transactions) in a line
156 #constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
157 #-- BRAM_ROWS is the number of rows in BRAM
158 #-- needed to represent the full
159 #-- icache
160 #constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
161 #-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
162 #constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
163 #-- Bit fields counts in the address
164 #
165 #-- INSN_BITS is the number of bits to select
166 #-- an instruction in a row
167 #constant INSN_BITS : natural := log2(INSN_PER_ROW);
168 #-- ROW_BITS is the number of bits to select a row
169 #constant ROW_BITS : natural := log2(BRAM_ROWS);
170 #-- ROW_LINEBITS is the number of bits to
171 #-- select a row within a line
172 #constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
173 #-- LINE_OFF_BITS is the number of bits for the offset
174 #-- in a cache line
175 #constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
176 #-- ROW_OFF_BITS is the number of bits for the offset in a row
177 #constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
178 #-- INDEX_BITS is the number of bits to select a cache line
179 #constant INDEX_BITS : natural := log2(NUM_LINES);
180 #-- SET_SIZE_BITS is the log base 2 of the set size
181 #constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
182 #-- TAG_BITS is the number of bits of the tag part of the address
183 #constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
184 #-- WAY_BITS is the number of bits to select a way
185 #constant WAY_BITS : natural := log2(NUM_WAYS);
186
187 #-- Example of layout for 32 lines of 64 bytes:
188 #--
189 #-- .. tag |index| line |
190 #-- .. | row | |
191 #-- .. | | | |00| zero (2)
192 #-- .. | | |-| | INSN_BITS (1)
193 #-- .. | |---| | ROW_LINEBITS (3)
194 #-- .. | |--- - --| LINE_OFF_BITS (6)
195 #-- .. | |- --| ROW_OFF_BITS (3)
196 #-- .. |----- ---| | ROW_BITS (8)
197 #-- .. |-----| | INDEX_BITS (5)
198 #-- .. --------| | TAG_BITS (53)
199 # Example of layout for 32 lines of 64 bytes:
200 #
201 # .. tag |index| line |
202 # .. | row | |
203 # .. | | | |00| zero (2)
204 # .. | | |-| | INSN_BITS (1)
205 # .. | |---| | ROW_LINEBITS (3)
206 # .. | |--- - --| LINE_OFF_BITS (6)
207 # .. | |- --| ROW_OFF_BITS (3)
208 # .. |----- ---| | ROW_BITS (8)
209 # .. |-----| | INDEX_BITS (5)
210 # .. --------| | TAG_BITS (53)
211
212 #subtype row_t is integer range 0 to BRAM_ROWS-1;
213 #subtype index_t is integer range 0 to NUM_LINES-1;
214 #subtype way_t is integer range 0 to NUM_WAYS-1;
215 #subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
216 #
217 #-- The cache data BRAM organized as described above for each way
218 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
219 #
220 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
221 #-- not handle a clean (commented) definition of the cache tags as a 3d
222 #-- memory. For now, work around it by putting all the tags
223 #subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
224 # type cache_tags_set_t is array(way_t) of cache_tag_t;
225 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
226 #constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
227 #subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
228 #type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    """Build the cache tag store.

    Returns an Array with one TAG_RAM_WIDTH-bit Signal per cache line
    (NUM_LINES entries), named "cachetag_<line>".
    """
    tags = []
    for line in range(NUM_LINES):
        tags.append(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % line))
    return Array(tags)
232
233 #-- The cache valid bits
234 #subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
235 #type cache_valids_t is array(index_t) of cache_way_valids_t;
236 #type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    """Build the cache valid-bit store.

    Returns an Array with one NUM_WAYS-bit Signal per cache line
    (NUM_LINES entries), named "cachevalid_<line>".
    """
    valids = []
    for line in range(NUM_LINES):
        valids.append(Signal(NUM_WAYS, name="cachevalid_%d" % line))
    return Array(valids)
240
def RowPerLineValidArray():
    """Build the per-row valid flags for one cache line.

    Returns an Array of ROW_PER_LINE single-bit Signals named
    "rows_valid_<row>".
    """
    flags = []
    for row in range(ROW_PER_LINE):
        flags.append(Signal(name="rows_valid_%d" % row))
    return Array(flags)
244
245
246 #attribute ram_style : string;
247 #attribute ram_style of cache_tags : signal is "distributed";
248 # TODO to be passed to nmigen as ram attributes
249 # attribute ram_style : string;
250 # attribute ram_style of cache_tags : signal is "distributed";
251
252
253 #subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
254 #type tlb_valids_t is array(tlb_index_t) of std_ulogic;
255 #subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
256 #type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
257 #subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
258 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    """Build the ITLB valid bits.

    Returns an Array of TLB_SIZE single-bit Signals named
    "tlbvalid_<entry>".
    """
    valids = []
    for entry in range(TLB_SIZE):
        valids.append(Signal(name="tlbvalid_%d" % entry))
    return Array(valids)
262
def TLBTagArray():
    """Build the ITLB tag store.

    Returns an Array of TLB_SIZE Signals, each TLB_EA_TAG_BITS wide,
    named "tlbtag_<entry>".
    """
    tags = []
    for entry in range(TLB_SIZE):
        tags.append(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" % entry))
    return Array(tags)
266
def TLBPtesArray():
    """Build the ITLB PTE store.

    Returns an Array of TLB_SIZE Signals, each TLB_PTE_BITS wide,
    named "tlbptes_<entry>".
    """
    ptes = []
    for entry in range(TLB_SIZE):
        ptes.append(Signal(TLB_PTE_BITS, name="tlbptes_%d" % entry))
    return Array(ptes)
270
271
272 #-- Cache RAM interface
273 #type cache_ram_out_t is array(way_t) of cache_row_t;
274 # Cache RAM interface
def CacheRamOut():
    """Build the cache RAM read-data interface.

    Returns an Array with one ROW_SIZE_BITS-wide Signal per way
    (NUM_WAYS entries), named "cache_out_<way>".
    """
    outputs = []
    for way in range(NUM_WAYS):
        outputs.append(Signal(ROW_SIZE_BITS, name="cache_out_%d" % way))
    return Array(outputs)
278
279 #-- PLRU output interface
280 #type plru_out_t is array(index_t) of
281 # std_ulogic_vector(WAY_BITS-1 downto 0);
282 # PLRU output interface
def PLRUOut():
    """Build the PLRU output interface.

    Returns an Array with one WAY_BITS-wide Signal per cache line
    (NUM_LINES entries), named "plru_out_<line>".
    """
    victims = []
    for line in range(NUM_LINES):
        victims.append(Signal(WAY_BITS, name="plru_out_%d" % line))
    return Array(victims)
286
287 # -- Return the cache line index (tag index) for an address
288 # function get_index(addr: std_ulogic_vector(63 downto 0))
289 # return index_t is
290 # begin
291 # return to_integer(unsigned(
292 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
293 # ));
294 # end;
295 # Return the cache line index (tag index) for an address
def get_index(addr):
    """Return the cache line index (tag index) field of `addr`.

    This is the slice of bits from LINE_OFF_BITS up to (but not
    including) SET_SIZE_BITS.
    """
    index_field = slice(LINE_OFF_BITS, SET_SIZE_BITS)
    return addr[index_field]
298
299 # -- Return the cache row index (data memory) for an address
300 # function get_row(addr: std_ulogic_vector(63 downto 0))
301 # return row_t is
302 # begin
303 # return to_integer(unsigned(
304 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
305 # ));
306 # end;
307 # Return the cache row index (data memory) for an address
def get_row(addr):
    """Return the cache row index (data memory) field of `addr`.

    This is the slice of bits from ROW_OFF_BITS up to (but not
    including) SET_SIZE_BITS.
    """
    row_field = slice(ROW_OFF_BITS, SET_SIZE_BITS)
    return addr[row_field]
310
311 # -- Return the index of a row within a line
312 # function get_row_of_line(row: row_t) return row_in_line_t is
313 # variable row_v : unsigned(ROW_BITS-1 downto 0);
314 # begin
315 # row_v := to_unsigned(row, ROW_BITS);
316 # return row_v(ROW_LINEBITS-1 downto 0);
317 # end;
318 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of a row within its line.

    That is the low ROW_LINEBITS bits of the row number `row`.
    """
    return row[0:ROW_LINEBITS]
321
322 # -- Returns whether this is the last row of a line
323 # function is_last_row_addr(addr: wishbone_addr_type;
324 # last: row_in_line_t
325 # )
326 # return boolean is
327 # begin
328 # return unsigned(
329 # addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
330 # ) = last;
331 # end;
332 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """Return whether `addr` points at the last row of a line.

    Compares the row-within-line field of `addr` (bits ROW_OFF_BITS up
    to LINE_OFF_BITS) against `last`.
    """
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
335
336 # -- Returns whether this is the last row of a line
337 # function is_last_row(row: row_t;
338 # last: row_in_line_t) return boolean is
339 # begin
340 # return get_row_of_line(row) = last;
341 # end;
342 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """Return whether row number `row` is the last row of a line.

    Compares the row-within-line part of `row` against `last`.
    """
    row_in_line = get_row_of_line(row)
    return row_in_line == last
345
346 # -- Return the next row in the current cache line. We use a dedicated
347 # -- function in order to limit the size of the generated adder to be
348 # -- only the bits within a cache line (3 bits with default settings)
349 # function next_row(row: row_t) return row_t is
350 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
351 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
352 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
353 # begin
354 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
355 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
356 # row_v(ROW_LINEBITS-1 downto 0) :=
357 # std_ulogic_vector(unsigned(row_idx) + 1);
358 # return to_integer(unsigned(row_v));
359 # end;
360 # Return the next row in the current cache line. We use a dedicated
361 # function in order to limit the size of the generated adder to be
362 # only the bits within a cache line (3 bits with default settings)
def next_row(row):
    """Return the next row in the current cache line.

    Only the low ROW_LINEBITS bits are incremented (wrapping within the
    line); the upper bits of `row` pass through unchanged. Keeping the
    addition to those bits limits the size of the generated adder.
    """
    within_line = row[:ROW_LINEBITS]
    line_part = row[ROW_LINEBITS:]
    # The sum is one bit wider than its operand: drop the carry
    bumped = (within_line + 1)[:ROW_LINEBITS]
    return Cat(bumped, line_part)
366 # -- Read the instruction word for the given address in the
367 # -- current cache row
368 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
369 # data: cache_row_t) return std_ulogic_vector is
370 # variable word: integer range 0 to INSN_PER_ROW-1;
371 # begin
372 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
373 # return data(31+word*32 downto word*32);
374 # end;
375 # Read the instruction word for the given address
376 # in the current cache row
def read_insn_word(addr, data):
    """Read the instruction word for `addr` from the cache row `data`.

    Address bits 2 .. INSN_BITS+1 select which 32-bit word of the row
    is returned.
    """
    return data.word_select(addr[2:INSN_BITS + 2], 32)
380
381 # -- Get the tag value from the address
382 # function get_tag(
383 # addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
384 # )
385 # return cache_tag_t is
386 # begin
387 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
388 # end;
389 # Get the tag value from the address
def get_tag(addr):
    """Return the tag field of `addr`.

    This is the slice of bits from SET_SIZE_BITS up to (but not
    including) REAL_ADDR_BITS.
    """
    tag_field = slice(SET_SIZE_BITS, REAL_ADDR_BITS)
    return addr[tag_field]
392
393 # -- Read a tag from a tag memory row
394 # function read_tag(way: way_t; tagset: cache_tags_set_t)
395 # return cache_tag_t is
396 # begin
397 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
398 # end;
399 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Read the tag of `way` from the tag memory row `tagset`.

    The row holds one TAG_BITS-wide tag per way, so this selects the
    `way`-th TAG_BITS-wide word.
    """
    tag = tagset.word_select(way, TAG_BITS)
    return tag
402
403 # -- Write a tag to tag memory row
404 # procedure write_tag(way: in way_t;
405 # tagset: inout cache_tags_set_t; tag: cache_tag_t) is
406 # begin
407 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
408 # end;
409 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Return an assignment writing `tag` into the slot of `way` in `tagset`.

    The returned statement still has to be added to a domain by the
    caller; nothing is assigned here.
    """
    slot = read_tag(way, tagset)
    return slot.eq(tag)
412
413 # -- Simple hash for direct-mapped TLB index
414 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
415 # return tlb_index_t is
416 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
417 # begin
418 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
419 # xor addr(
420 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
421 # TLB_LG_PGSZ + TLB_BITS
422 # )
423 # xor addr(
424 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
425 # TLB_LG_PGSZ + 2 * TLB_BITS
426 # );
427 # return to_integer(unsigned(hash));
428 # end;
429 # Simple hash for direct-mapped TLB index
def hash_ea(addr):
    """Simple hash producing the direct-mapped TLB index for `addr`.

    XORs together the three consecutive TLB_BITS-wide fields of `addr`
    that start at bit TLB_LG_PGSZ.
    """
    base = TLB_LG_PGSZ
    first = addr[base:base + TLB_BITS]
    second = addr[base + TLB_BITS:base + 2 * TLB_BITS]
    third = addr[base + 2 * TLB_BITS:base + 3 * TLB_BITS]
    return first ^ second ^ third
437
438 # begin
439 #
440 # XXX put these assert statements in - as python asserts
441 #
442 # assert LINE_SIZE mod ROW_SIZE = 0;
443 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
444 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
445 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
446 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
447 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
448 # report "geometry bits don't add up"
449 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
450 # report "geometry bits don't add up"
451 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
452 # report "geometry bits don't add up"
453 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
454 # report "geometry bits don't add up"
455 #
456 # sim_debug: if SIM generate
457 # debug: process
458 # begin
459 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
460 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
461 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
462 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
463 # report "INSN_BITS = " & natural'image(INSN_BITS);
464 # report "ROW_BITS = " & natural'image(ROW_BITS);
465 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
466 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
467 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
468 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
469 # report "TAG_BITS = " & natural'image(TAG_BITS);
470 # report "WAY_BITS = " & natural'image(WAY_BITS);
471 # wait;
472 # end process;
473 # end generate;
474
475 # Cache reload state machine
@unique
class State(Enum):
    """States of the cache reload state machine.

    IDLE:     no reload in flight; a miss starts one.
    CLR_TAG:  first cycle of a reload, in which the victim way is
              chosen and its tag replaced.
    WAIT_ACK: remainder of the reload, while the line is being fetched
              over wishbone.
    """
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2
481
482
class RegInternal(RecordObject):
    """Internal registered state of the icache.

    Groups the latched hit information, the reload (miss) state machine
    registers and the TLB-miss flag.

    Index-like fields are sized by the number of bits needed to encode
    the index (WAY_BITS, INDEX_BITS, ROW_BITS), matching the integer
    subtypes way_t / index_t / row_t of the VHDL original, rather than
    by the number of elements being indexed.
    """
    def __init__(self):
        super().__init__()
        # Cache hit state (latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)       # way number that hit
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)     # way being reloaded
        self.store_index = Signal(INDEX_BITS)  # line being reloaded
        self.store_row = Signal(ROW_BITS)     # BRAM row being written
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINEBITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()
506
507 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
508 #
509 # entity icache is
510 # generic (
511 # SIM : boolean := false;
512 # -- Line size in bytes
513 # LINE_SIZE : positive := 64;
514 # -- BRAM organisation: We never access more
515 # -- than wishbone_data_bits
516 # -- at a time so to save resources we make the
517 # -- array only that wide,
518 # -- and use consecutive indices for to make a cache "line"
519 # --
520 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
521 # -- so 64-bits)
522 # ROW_SIZE : positive := wishbone_data_bits / 8;
523 # -- Number of lines in a set
524 # NUM_LINES : positive := 32;
525 # -- Number of ways
526 # NUM_WAYS : positive := 4;
527 # -- L1 ITLB number of entries (direct mapped)
528 # TLB_SIZE : positive := 64;
529 # -- L1 ITLB log_2(page_size)
530 # TLB_LG_PGSZ : positive := 12;
531 # -- Number of real address bits that we store
532 # REAL_ADDR_BITS : positive := 56;
533 # -- Non-zero to enable log data collection
534 # LOG_LENGTH : natural := 0
535 # );
536 # port (
537 # clk : in std_ulogic;
538 # rst : in std_ulogic;
539 #
540 # i_in : in Fetch1ToIcacheType;
541 # i_out : out IcacheToDecode1Type;
542 #
543 # m_in : in MmuToIcacheType;
544 #
545 # stall_in : in std_ulogic;
546 # stall_out : out std_ulogic;
547 # flush_in : in std_ulogic;
548 # inval_in : in std_ulogic;
549 #
550 # wishbone_out : out wishbone_master_out;
551 # wishbone_in : in wishbone_slave_out;
552 #
553 # log_out : out std_ulogic_vector(53 downto 0)
554 # );
555 # end entity icache;
556 # 64 bit direct mapped icache. All instructions are 4B aligned.
557 class ICache(Elaboratable):
558 """64 bit direct mapped icache. All instructions are 4B aligned."""
def __init__(self):
    """Create the icache's ports."""
    # Fetch request in, fetched instruction out
    self.i_in = Fetch1ToICacheType(name="i_in")
    self.i_out = ICacheToDecode1Type(name="i_out")

    # Input from the MMU
    self.m_in = MMUToICacheType(name="m_in")

    # Pipeline control
    self.stall_in = Signal(name="stall_in")
    self.stall_out = Signal(name="stall_out")
    self.flush_in = Signal(name="flush_in")
    self.inval_in = Signal(name="inval_in")

    # Wishbone master interface used for line reloads
    self.wb_out = WBMasterOut(name="wb_out")
    self.wb_in = WBSlaveOut(name="wb_in")

    # Log data output
    self.log_out = Signal(54, name="log_out")
574
575
576 # Generate a cache RAM for each way
def rams(self, m, r, cache_out_row, use_previous, replace_way, req_row):
    """Generate a cache RAM for each way and wire up its ports.

    m:             Module being elaborated; each RAM is added as
                   submodule "cacheram_<way>".
    r:             RegInternal state (hit_way and store_row are read).
    cache_out_row: driven with the read data of the way equal to
                   r.hit_way.
    use_previous:  when set (or on stall_in) the RAMs are not read.
    replace_way:   way written when wishbone acks data.
    req_row:       row address used for reads.
    """
    comb = m.d.comb
    wb_in = self.wb_in
    stall_in = self.stall_in

    for way_no in range(NUM_WAYS):
        do_read = Signal(name="do_rd_%d" % way_no)
        do_write = Signal(name="do_wr_%d" % way_no)
        rd_addr = Signal(ROW_BITS, name="rd_addr")
        wr_addr = Signal(ROW_BITS, name="wr_addr")
        d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % way_no)
        wr_sel = Signal(ROW_SIZE, name="wr_sel")

        way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
        setattr(m.submodules, "cacheram_%d" % way_no, way)

        # Hook the RAM ports up to the local control signals; write
        # data always comes straight from the wishbone bus.
        comb += [
            way.rd_en.eq(do_read),
            way.rd_addr.eq(rd_addr),
            d_out.eq(way.rd_data_o),
            way.wr_sel.eq(wr_sel),
            way.wr_addr.eq(wr_addr),
            way.wr_data.eq(wb_in.dat),
        ]

        # Read unless stalled or re-using last cycle's row; write only
        # the way under replacement, when wishbone acks.
        comb += [
            do_read.eq(~(stall_in | use_previous)),
            do_write.eq(wb_in.ack & (replace_way == way_no)),
        ]

        # Only the way that hit drives the output row
        with m.If(r.hit_way == way_no):
            comb += cache_out_row.eq(d_out)

        comb += [
            rd_addr.eq(req_row),
            wr_addr.eq(r.store_row),
            # All byte lanes are written together
            wr_sel.eq(Repl(do_write, ROW_SIZE)),
        ]
609
610 # -- Generate PLRUs
def maybe_plrus(self, m, r, plru_victim):
    """Generate one PLRU per cache line, if the cache has several ways.

    m:           Module being elaborated; each PLRU is added as
                 submodule "plru_<line>".
    r:           RegInternal state (hit_nia, hit_valid and hit_way are
                 read).
    plru_victim: per-line array driven with each PLRU's lru_o output.

    The NUM_WAYS test is made in Python at elaboration time (the
    counterpart of the VHDL "if ... generate"): wrapping a constant in
    m.If() would still instantiate the PLRU submodules for a 1-way
    cache.
    """
    if NUM_WAYS <= 1:
        return

    comb = m.d.comb

    for i in range(NUM_LINES):
        plru = PLRU(WAY_BITS)
        setattr(m.submodules, "plru_%d" % i, plru)

        # PLRU interface: an access is only signalled for a valid hit
        # on this line; acc_en stays at its reset value otherwise.
        with m.If(get_index(r.hit_nia) == i):
            comb += plru.acc_en.eq(r.hit_valid)

        comb += plru.acc_i.eq(r.hit_way)
        comb += plru_victim[i].eq(plru.lru_o)
630
631 # TLB hit detection and real address generation
def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                priv_fault, access_ok):
    """TLB hit detection and real address generation.

    Driven combinatorially: tlb_req_index, real_addr, ra_valid,
    eaa_priv, priv_fault and access_ok.
    Read: itlb_ptes, itlb_tags, itlb_valid_bits and the fetch request
    self.i_in (nia, virt_mode, priv_mode).
    """
    comb = m.d.comb
    i_in = self.i_in
    nia = i_in.nia

    pte = Signal(TLB_PTE_BITS, name="pte")
    ttag = Signal(TLB_EA_TAG_BITS, name="ttag")

    # Fetch the TLB entry selected by the hashed effective address
    comb += [
        tlb_req_index.eq(hash_ea(nia)),
        pte.eq(itlb_ptes[tlb_req_index]),
        ttag.eq(itlb_tags[tlb_req_index]),
    ]

    with m.If(i_in.virt_mode):
        # Translated: page offset from the EA, upper bits from the PTE
        page_offset = nia[:TLB_LG_PGSZ]
        page_frame = pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
        comb += real_addr.eq(Cat(page_offset, page_frame))

        # Valid only when the stored tag matches the EA tag
        ea_tag = nia[TLB_LG_PGSZ + TLB_BITS:64]
        with m.If(ttag == ea_tag):
            comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

        comb += eaa_priv.eq(pte[3])

    with m.Else():
        # Untranslated: the address is used as-is and always valid
        comb += [
            real_addr.eq(nia[:REAL_ADDR_BITS]),
            ra_valid.eq(1),
            eaa_priv.eq(1),
        ]

    # No IAMR, so no KUEP support for now
    comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
    comb += access_ok.eq(ra_valid & ~priv_fault)
665
666 # iTLB update
def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
    """iTLB update: handle tlbie / tlbld requests from the MMU.

    m:               Module being elaborated.
    itlb_valid_bits: per-entry valid bits (cleared / set here).
    itlb_tags:       per-entry EA tags (written on tlbld).
    itlb_ptes:       per-entry PTEs (written on tlbld).

    All array updates are made in the sync domain.
    """
    comb = m.d.comb
    sync = m.d.sync

    m_in = self.m_in

    # Entry selected by the MMU address. The index only needs TLB_BITS
    # bits (hash_ea returns TLB_BITS bits), not one bit per TLB entry.
    wr_index = Signal(TLB_BITS)
    comb += wr_index.eq(hash_ea(m_in.addr))

    with m.If(m_in.tlbie & m_in.doall):
        # Clear all valid bits
        for i in range(TLB_SIZE):
            sync += itlb_valid_bits[i].eq(0)

    with m.Elif(m_in.tlbie):
        # Clear entry regardless of hit or miss
        sync += itlb_valid_bits[wr_index].eq(0)

    with m.Elif(m_in.tlbld):
        # Load a new translation into the selected entry
        sync += itlb_tags[wr_index].eq(
            m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
        )
        sync += itlb_ptes[wr_index].eq(m_in.pte)
        sync += itlb_valid_bits[wr_index].eq(1)
691
692 # Cache hit detection, output to fetch2 and other misc logic
def icache_comb(self, m, use_previous, r, req_index, req_row,
                req_tag, real_addr, req_laddr, cache_valid_bits,
                cache_tags, access_ok, req_is_hit,
                req_is_miss, replace_way, plru_victim, cache_out_row,
                req_hit_way=None):
    """Cache hit detection, output to fetch2 and other misc logic.

    Driven combinatorially: use_previous, req_index, req_row, req_tag,
    req_laddr, req_is_hit, req_is_miss, replace_way, the i_out record,
    stall_out and wb_out.

    req_hit_way: optional signal driven with the way that hit. The
        VHDL original assigns "req_hit_way <= hit_way" here and
        icache_hit() latches req_hit_way into r.hit_way; without this
        connection the hit way computed below goes nowhere. The
        parameter defaults to None so existing callers keep working.
        NOTE(review): callers should pass the same signal they hand to
        icache_hit().
    """
    comb = m.d.comb

    i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
    flush_in, stall_out = self.flush_in, self.stall_out

    is_hit = Signal()
    hit_way = Signal(NUM_WAYS)

    # i_in.sequential means that i_in.nia this cycle is 4 more than
    # last cycle. If we read more than 32 bits at a time, had a
    # cache hit last cycle, and we don't want the first 32-bit chunk
    # then we can keep the data we read last cycle and just use that.
    with m.If(i_in.nia[2:INSN_BITS+2] != 0):
        comb += use_previous.eq(i_in.sequential & r.hit_valid)

    # Extract line, row and tag from request
    comb += req_index.eq(get_index(i_in.nia))
    comb += req_row.eq(get_row(i_in.nia))
    comb += req_tag.eq(get_tag(real_addr))

    # Calculate address of beginning of cache row, will be
    # used for cache miss processing if needed
    comb += req_laddr.eq(Cat(
        Const(0, ROW_OFF_BITS),
        real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
    ))

    # Test if pending request is a hit on any way. hitcond covers a
    # row that has already been loaded into the line currently under
    # reload.
    hitcond = Signal()
    comb += hitcond.eq((r.state == State.WAIT_ACK)
                       & (req_index == r.store_index)
                       & r.rows_valid[req_row % ROW_PER_LINE])
    with m.If(i_in.req):
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += ctag.eq(cache_tags[req_index])
        comb += cvb.eq(cache_valid_bits[req_index])
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="ti%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            hit_test = Signal(name="hit_test%d" % i)
            comb += hit_test.eq(i == r.store_way)
            with m.If((cvb[i] | (hitcond & hit_test)) & (tagi == req_tag)):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

    # Hand the hit way to the synchronous hit logic (see docstring)
    if req_hit_way is not None:
        comb += req_hit_way.eq(hit_way)

    # Generate the "hit" and "miss" signals
    # for the synchronous blocks
    with m.If(i_in.req & access_ok & ~flush_in):
        comb += req_is_hit.eq(is_hit)
        comb += req_is_miss.eq(~is_hit)

    # The way to replace on a miss
    with m.If(r.state == State.CLR_TAG):
        comb += replace_way.eq(plru_victim[r.store_index])
    with m.Else():
        comb += replace_way.eq(r.store_way)

    # Output instruction from current cache row
    #
    # Note: This is a mild violation of our design principle of
    # having pipeline stages output from a clean latch. In this
    # case we output the result of a mux. The alternative would
    # be output an entire row which I prefer not to do just yet
    # as it would force fetch2 to know about some of the cache
    # geometry information.
    comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
    comb += i_out.valid.eq(r.hit_valid)
    comb += i_out.nia.eq(r.hit_nia)
    comb += i_out.stop_mark.eq(r.hit_smark)
    comb += i_out.fetch_failed.eq(r.fetch_failed)

    # Stall fetch1 if we have a miss on cache or TLB
    # or a protection fault
    comb += stall_out.eq(~(is_hit & access_ok))

    # Wishbone requests output (from the cache miss reload machine)
    comb += wb_out.eq(r.wb)
785
786 # Cache hit synchronous machine
def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
               req_index, req_tag, real_addr):
    """Cache hit synchronous machine.

    Updates r.hit_valid, r.hit_way, r.hit_smark and r.hit_nia in the
    sync domain. req_index, req_tag and real_addr are only used in the
    debug Display message.
    """
    sync = m.d.sync

    i_in = self.i_in
    stall_in = self.stall_in
    flush_in = self.flush_in

    # Outputs to fetch2 are held on a stall (or when last cycle's row
    # is being re-used for its second half), except that a flush
    # clears valid.
    hold_outputs = stall_in | use_previous

    with m.If(hold_outputs):
        with m.If(flush_in):
            sync += r.hit_valid.eq(0)
    with m.Else():
        # On a hit, latch the request for the next cycle, when the
        # BRAM data will be available on the cache_out output of the
        # corresponding way
        sync += r.hit_valid.eq(req_is_hit)

        with m.If(req_is_hit):
            sync += r.hit_way.eq(req_hit_way)
            sync += Display(
                "cache hit nia:%x IR:%x SM:%x idx:%x "
                "tag:%x way:%x RA:%x",
                i_in.nia, i_in.virt_mode, i_in.stop_mark,
                req_index, req_tag, req_hit_way, real_addr)

    with m.If(~stall_in):
        # Send stop marks and NIA down regardless of validity
        sync += [
            r.hit_smark.eq(i_in.stop_mark),
            r.hit_nia.eq(i_in.nia),
        ]
820
821 # Cache miss/reload synchronous machine
822 def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
823 req_index, req_laddr, req_tag, replace_way,
824 cache_tags, access_ok, real_addr):
825 comb = m.d.comb
826 sync = m.d.sync
827
828 i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
829 stall_in, flush_in = self.stall_in, self.flush_in
830 inval_in = self.inval_in
831
832 # variable tagset : cache_tags_set_t;
833 # variable stbs_done : boolean;
834
835 tagset = Signal(TAG_RAM_WIDTH)
836 stbs_done = Signal()
837
838 comb += r.wb.sel.eq(-1)
839 comb += r.wb.adr.eq(r.req_adr[3:])
840
841 # Process cache invalidations
842 with m.If(inval_in):
843 for i in range(NUM_LINES):
844 sync += cache_valid_bits[i].eq(0)
845 sync += r.store_valid.eq(0)
846
847 # Main state machine
848 with m.Switch(r.state):
849
850 with m.Case(State.IDLE):
851 # Reset per-row valid flags,
852 # only used in WAIT_ACK
853 for i in range(ROW_PER_LINE):
854 sync += r.rows_valid[i].eq(0)
855
856 # We need to read a cache line
857 with m.If(req_is_miss):
858 sync += Display("cache miss nia:%x IR:%x SM:%x idx:%x "
859 " way:%x tag:%x RA:%x", i_in.nia,
860 i_in.virt_mode, i_in.stop_mark, req_index,
861 replace_way, req_tag, real_addr)
862
863 # Keep track of our index and way
864 # for subsequent stores
865 sync += r.store_index.eq(req_index)
866 sync += r.store_row.eq(get_row(req_laddr))
867 sync += r.store_tag.eq(req_tag)
868 sync += r.store_valid.eq(1)
869 sync += r.end_row_ix.eq(
870 get_row_of_line(
871 get_row(req_laddr)
872 ) - 1
873 )
874
875 # -- Prep for first wishbone read. We calculate the
876 # -- address of the start of the cache line and
877 # -- start the WB cycle.
878 # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
879 # r.wb.cyc <= '1';
880 # r.wb.stb <= '1';
881 # Prep for first wishbone read.
882 # We calculate the
883 # address of the start of the cache line and
884 # start the WB cycle.
885 sync += r.req_adr.eq(req_laddr)
886 sync += r.wb.cyc.eq(1)
887 sync += r.wb.stb.eq(1)
888
889 # -- Track that we had one request sent
890 # r.state <= CLR_TAG;
891 # Track that we had one request sent
892 sync += r.state.eq(State.CLR_TAG)
893 # end if;
894
895 # when CLR_TAG | WAIT_ACK =>
896 with m.Case(State.CLR_TAG, State.WAIT_ACK):
897 # if r.state = CLR_TAG then
898 with m.If(r.state == State.CLR_TAG):
899 # -- Get victim way from plru
900 # r.store_way <= replace_way;
901 # Get victim way from plru
902 sync += r.store_way.eq(replace_way)
903 #
904 # -- Force misses on that way while
905 # -- reloading that line
906 # cache_valids(req_index)(replace_way) <= '0';
907 # Force misses on that way while
908 # realoading that line
909 cv = Signal(INDEX_BITS)
910 comb += cv.eq(cache_valid_bits[req_index])
911 comb += cv.bit_select(replace_way, 1).eq(0)
912 sync += cache_valid_bits[req_index].eq(cv)
913
914 # -- Store new tag in selected way
915 # for i in 0 to NUM_WAYS-1 loop
916 # if i = replace_way then
917 # tagset := cache_tags(r.store_index);
918 # write_tag(i, tagset, r.store_tag);
919 # cache_tags(r.store_index) <= tagset;
920 # end if;
921 # end loop;
922 for i in range(NUM_WAYS):
923 with m.If(i == replace_way):
924 comb += tagset.eq(cache_tags[r.store_index])
925 comb += write_tag(i, tagset, r.store_tag)
926 sync += cache_tags[r.store_index].eq(tagset)
927
928 # r.state <= WAIT_ACK;
929 sync += r.state.eq(State.WAIT_ACK)
930 # end if;
931
932 # -- Requests are all sent if stb is 0
933 # stbs_done := r.wb.stb = '0';
934 # Requests are all sent if stb is 0
935 stbs_zero = Signal()
936 comb += stbs_zero.eq(r.wb.stb == 0)
937 comb += stbs_done.eq(stbs_zero)
938
939 # -- If we are still sending requests,
940 # -- was one accepted ?
941 # if wishbone_in.stall = '0' and not stbs_done then
942 # If we are still sending requests,
943 # was one accepted?
944 with m.If(~wb_in.stall & ~stbs_zero):
945 # -- That was the last word ? We are done sending.
946 # -- Clear stb and set stbs_done so we can handle
947 # -- an eventual last ack on the same cycle.
948 # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
949 # r.wb.stb <= '0';
950 # stbs_done := true;
951 # end if;
952 # That was the last word ?
953 # We are done sending.
954 # Clear stb and set stbs_done
955 # so we can handle
956 # an eventual last ack on
957 # the same cycle.
958 with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
959 sync += Display("IS_LAST_ROW_ADDR " \
960 "r.wb.addr:%x r.end_row_ix:%x " \
961 "r.wb.stb:%x stbs_zero:%x " \
962 "stbs_done:%x", r.wb.adr, \
963 r.end_row_ix, r.wb.stb, \
964 stbs_zero, stbs_done)
965 sync += r.wb.stb.eq(0)
966 comb += stbs_done.eq(1)
967
968 # -- Calculate the next row address
969 # r.wb.adr <= next_row_addr(r.wb.adr);
970 # Calculate the next row address
971 rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
972 comb += rarange.eq(
973 r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
974 )
975 sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
976 rarange
977 )
978 sync += Display("RARANGE r.req_adr:%x rarange:%x "
979 "stbs_zero:%x stbs_done:%x",
980 r.req_adr, rarange, stbs_zero, stbs_done)
981 # end if;
982
983 # -- Incoming acks processing
984 # if wishbone_in.ack = '1' then
985 # Incoming acks processing
986 with m.If(wb_in.ack):
987 # r.rows_valid(r.store_row mod ROW_PER_LINE)
988 # <= '1';
989 sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
990 "stbs_done:%x",
991 wb_in.dat, stbs_zero, stbs_done)
992
993 sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
994
995 # -- Check for completion
996 # if stbs_done and
997 # is_last_row(r.store_row, r.end_row_ix) then
998 # Check for completion
999 with m.If(stbs_done &
1000 is_last_row(r.store_row, r.end_row_ix)):
1001 # -- Complete wishbone cycle
1002 # r.wb.cyc <= '0';
1003 # Complete wishbone cycle
1004 sync += r.wb.cyc.eq(0)
1005
1006 # -- Cache line is now valid
1007 # cache_valids(r.store_index)(replace_way) <=
1008 # r.store_valid and not inval_in;
1009 # Cache line is now valid
1010 cv = Signal(INDEX_BITS)
1011 comb += cv.eq(cache_valid_bits[r.store_index])
1012 comb += cv.bit_select(replace_way, 1).eq(
1013 r.store_valid & ~inval_in
1014 )
1015 sync += cache_valid_bits[r.store_index].eq(cv)
1016
1017 # -- We are done
1018 # r.state <= IDLE;
1019 # We are done
1020 sync += r.state.eq(State.IDLE)
1021 # end if;
1022
1023 # -- Increment store row counter
1024 # r.store_row <= next_row(r.store_row);
1025 # Increment store row counter
1026 sync += r.store_row.eq(next_row(r.store_row))
1027 # end if;
1028 # end case;
1029 # end if;
1030 #
1031 # -- TLB miss and protection fault processing
1032 # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
1033 # r.fetch_failed <= '0';
1034 # elsif i_in.req = '1' and access_ok = '0' and
1035 # stall_in = '0' then
1036 # r.fetch_failed <= '1';
1037 # end if;
1038 # TLB miss and protection fault processing
1039 with m.If(flush_in | m_in.tlbld):
1040 sync += r.fetch_failed.eq(0)
1041
1042 with m.Elif(i_in.req & ~access_ok & ~stall_in):
1043 sync += r.fetch_failed.eq(1)
1044 # end if;
1045 # end process;
1046
1047 # icache_log: if LOG_LENGTH > 0 generate
def icache_log(self, m, req_hit_way, ra_valid, access_ok,
               req_is_miss, req_is_hit, lway, wstate, r):
    """Drive ``self.log_out`` with a 54-bit snapshot of the icache state.

    Translation of the ``icache_log: if LOG_LENGTH > 0 generate`` block
    of microwatt's icache.vhdl.  Nothing is added to ``m`` when
    ``LOG_LENGTH`` is not positive; otherwise exactly one logger is
    generated (it is a conditional generate, not a per-entry loop).

    Parameters:
        m            -- Module that receives the logging statements
        req_hit_way  -- way that hit for the current request
        ra_valid, access_ok, req_is_miss, req_is_hit
                     -- single-bit status signals copied into the record
        lway, wstate -- ignored; kept only so the signature stays
                        unchanged.  Fresh signals with these names are
                        created below (as the original code also did).
        r            -- internal register record (state, wb, hit_nia,
                        fetch_failed are read)
    """
    if LOG_LENGTH <= 0:
        return

    comb = m.d.comb
    sync = m.d.sync

    wb_in, i_out = self.wb_in, self.i_out
    log_out, stall_out = self.log_out, self.stall_out

    log_data = Signal(54)
    # The record reserves exactly 3 bits for the way number
    # (VHDL: to_unsigned(lway, 3)); a different width would shift or
    # truncate every field above it.
    lway = Signal(3)
    wstate = Signal()

    # lway and wstate are VHDL *variables* inside the clocked process,
    # i.e. they take effect immediately, so they are combinational here.
    # Registering them would make them lag the other logged fields by
    # one clock.
    comb += lway.eq(req_hit_way)
    comb += wstate.eq(r.state != State.IDLE)

    # Field layout, LSB first (Cat order is the reverse of the VHDL "&"
    # concatenation):
    #   [0]     ra_valid          [1]     access_ok
    #   [2]     req_is_miss       [3]     req_is_hit
    #   [4:7]   lway              [7]     wstate
    #   [8:12]  r.hit_nia[2:6]    [12]    r.fetch_failed
    #   [13]    stall_out         [14]    wb_in.stall
    #   [15]    r.wb.cyc          [16]    r.wb.stb
    #   [17:20] r.wb.adr[3:6]     [20]    wb_in.ack
    #   [21:53] i_out.insn        [53]    i_out.valid
    sync += log_data.eq(Cat(
        ra_valid, access_ok, req_is_miss, req_is_hit,
        lway, wstate, r.hit_nia[2:6],
        r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
        r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
        i_out.valid
    ))
    comb += log_out.eq(log_data)
1109
def elaborate(self, platform):
    """Build and return the nmigen Module implementing the icache.

    Creates the storage arrays and the signals shared between the
    helper methods, then calls each helper in turn; every helper
    receives ``m`` together with the shared objects it needs.

    ``platform`` is not used in this method.
    """

    m = Module()
    # not used directly in this method
    comb = m.d.comb

    # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
    cache_tags = CacheTagArray()
    cache_valid_bits = CacheValidBitsArray()

    # ITLB storage: valid bits, tags and PTEs
    itlb_valid_bits = TLBValidBitsArray()
    itlb_tags = TLBTagArray()
    itlb_ptes = TLBPtesArray()
    # TODO to be passed to nmigen as ram attributes
    # attribute ram_style of itlb_tags : signal is "distributed";
    # attribute ram_style of itlb_ptes : signal is "distributed";

    # Privilege bit from PTE EAA field
    eaa_priv = Signal()

    # Internal register record (VHDL: signal r : reg_internal_t)
    r = RegInternal()

    # Async signals on incoming request
    # NOTE(review): NUM_LINES, BRAM_ROWS and NUM_WAYS read like element
    # counts rather than bit widths, so these signals may be wider than
    # the VHDL index_t / row_t / way_t they correspond to -- confirm
    # against the constant definitions.
    req_index = Signal(NUM_LINES)
    req_row = Signal(BRAM_ROWS)
    req_hit_way = Signal(NUM_WAYS)
    req_tag = Signal(TAG_BITS)
    req_is_hit = Signal()
    req_is_miss = Signal()
    req_laddr = Signal(64)

    # TLB lookup results
    # NOTE(review): same width question as above for TLB_SIZE -- confirm
    tlb_req_index = Signal(TLB_SIZE)
    real_addr = Signal(REAL_ADDR_BITS)
    ra_valid = Signal()
    priv_fault = Signal()
    access_ok = Signal()
    use_previous = Signal()

    # Cache RAM output (VHDL: signal cache_out : cache_ram_out_t)
    cache_out_row = Signal(ROW_SIZE_BITS)

    # PLRU victim selection and the way chosen for replacement
    plru_victim = PLRUOut()
    replace_way = Signal(NUM_WAYS)

    # call sub-functions putting everything together, using shared
    # signals established above
    self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
    self.maybe_plrus(m, r, plru_victim)
    self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
                     real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                     priv_fault, access_ok)
    self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
    self.icache_comb(m, use_previous, r, req_index, req_row,
                     req_tag, real_addr, req_laddr, cache_valid_bits,
                     cache_tags, access_ok, req_is_hit, req_is_miss,
                     replace_way, plru_victim, cache_out_row)
    self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                    req_index, req_tag, real_addr)
    self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                     req_laddr, req_tag, replace_way, cache_tags,
                     access_ok, real_addr)
    # Logging is disabled.  Note that the argument list below passes
    # log_out (and lway/wstate, which are not defined here), which does
    # not match icache_log's signature -- fix before re-enabling.
    #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
    #                req_is_miss, req_is_hit, lway, wstate, r)

    return m
1200
1201
1202 # icache_tb.vhdl
1203 #
1204 # library ieee;
1205 # use ieee.std_logic_1164.all;
1206 #
1207 # library work;
1208 # use work.common.all;
1209 # use work.wishbone_types.all;
1210 #
1211 # entity icache_tb is
1212 # end icache_tb;
1213 #
1214 # architecture behave of icache_tb is
1215 # signal clk : std_ulogic;
1216 # signal rst : std_ulogic;
1217 #
1218 # signal i_out : Fetch1ToIcacheType;
1219 # signal i_in : IcacheToDecode1Type;
1220 #
1221 # signal m_out : MmuToIcacheType;
1222 #
1223 # signal wb_bram_in : wishbone_master_out;
1224 # signal wb_bram_out : wishbone_slave_out;
1225 #
1226 # constant clk_period : time := 10 ns;
1227 # begin
1228 # icache0: entity work.icache
1229 # generic map(
1230 # LINE_SIZE => 64,
1231 # NUM_LINES => 4
1232 # )
1233 # port map(
1234 # clk => clk,
1235 # rst => rst,
1236 # i_in => i_out,
1237 # i_out => i_in,
1238 # m_in => m_out,
1239 # stall_in => '0',
1240 # flush_in => '0',
1241 # inval_in => '0',
1242 # wishbone_out => wb_bram_in,
1243 # wishbone_in => wb_bram_out
1244 # );
1245 #
1246 # -- BRAM Memory slave
1247 # bram0: entity work.wishbone_bram_wrapper
1248 # generic map(
1249 # MEMORY_SIZE => 1024,
1250 # RAM_INIT_FILE => "icache_test.bin"
1251 # )
1252 # port map(
1253 # clk => clk,
1254 # rst => rst,
1255 # wishbone_in => wb_bram_in,
1256 # wishbone_out => wb_bram_out
1257 # );
1258 #
1259 # clk_process: process
1260 # begin
1261 # clk <= '0';
1262 # wait for clk_period/2;
1263 # clk <= '1';
1264 # wait for clk_period/2;
1265 # end process;
1266 #
1267 # rst_process: process
1268 # begin
1269 # rst <= '1';
1270 # wait for 2*clk_period;
1271 # rst <= '0';
1272 # wait;
1273 # end process;
1274 #
1275 # stim: process
1276 # begin
1277 # i_out.req <= '0';
1278 # i_out.nia <= (others => '0');
1279 # i_out.stop_mark <= '0';
1280 #
1281 # m_out.tlbld <= '0';
1282 # m_out.tlbie <= '0';
1283 # m_out.addr <= (others => '0');
1284 # m_out.pte <= (others => '0');
1285 #
1286 # wait until rising_edge(clk);
1287 # wait until rising_edge(clk);
1288 # wait until rising_edge(clk);
1289 # wait until rising_edge(clk);
1290 #
1291 # i_out.req <= '1';
1292 # i_out.nia <= x"0000000000000004";
1293 #
1294 # wait for 30*clk_period;
1295 # wait until rising_edge(clk);
1296 #
1297 # assert i_in.valid = '1' severity failure;
1298 # assert i_in.insn = x"00000001"
1299 # report "insn @" & to_hstring(i_out.nia) &
1300 # "=" & to_hstring(i_in.insn) &
1301 # " expected 00000001"
1302 # severity failure;
1303 #
1304 # i_out.req <= '0';
1305 #
1306 # wait until rising_edge(clk);
1307 #
1308 # -- hit
1309 # i_out.req <= '1';
1310 # i_out.nia <= x"0000000000000008";
1311 # wait until rising_edge(clk);
1312 # wait until rising_edge(clk);
1313 # assert i_in.valid = '1' severity failure;
1314 # assert i_in.insn = x"00000002"
1315 # report "insn @" & to_hstring(i_out.nia) &
1316 # "=" & to_hstring(i_in.insn) &
1317 # " expected 00000002"
1318 # severity failure;
1319 # wait until rising_edge(clk);
1320 #
1321 # -- another miss
1322 # i_out.req <= '1';
1323 # i_out.nia <= x"0000000000000040";
1324 #
1325 # wait for 30*clk_period;
1326 # wait until rising_edge(clk);
1327 #
1328 # assert i_in.valid = '1' severity failure;
1329 # assert i_in.insn = x"00000010"
1330 # report "insn @" & to_hstring(i_out.nia) &
1331 # "=" & to_hstring(i_in.insn) &
1332 # " expected 00000010"
1333 # severity failure;
1334 #
1335 # -- test something that aliases
1336 # i_out.req <= '1';
1337 # i_out.nia <= x"0000000000000100";
1338 # wait until rising_edge(clk);
1339 # wait until rising_edge(clk);
1340 # assert i_in.valid = '0' severity failure;
1341 # wait until rising_edge(clk);
1342 #
1343 # wait for 30*clk_period;
1344 # wait until rising_edge(clk);
1345 #
1346 # assert i_in.valid = '1' severity failure;
1347 # assert i_in.insn = x"00000040"
1348 # report "insn @" & to_hstring(i_out.nia) &
1349 # "=" & to_hstring(i_in.insn) &
1350 # " expected 00000040"
1351 # severity failure;
1352 #
1353 # i_out.req <= '0';
1354 #
1355 # std.env.finish;
1356 # end process;
1357 # end;
def _wait_cycles(count):
    """Generator: let ``count`` clock cycles pass (one bare yield each)."""
    for _ in range(count):
        yield


def _expect_insn(fetch, decode, expected):
    """Generator: assert the icache presents ``expected`` as a valid insn.

    ``fetch`` is the request record driven by the testbench (its ``nia``
    is reported in the failure message); ``decode`` is the icache output
    record whose ``valid`` and ``insn`` fields are checked.
    """
    valid = yield decode.valid
    nia = yield fetch.nia
    insn = yield decode.insn
    assert valid
    assert insn == expected, \
        "insn @%x=%x expected %08x" % (nia, insn, expected)


def icache_sim(dut):
    """Simulation stimulus for the icache (port of icache_tb.vhdl "stim").

    Runs as a sync process: a bare ``yield`` advances one clock,
    ``yield sig.eq(v)`` drives a signal and ``yield sig`` reads one.
    The sequence is: miss at 0x4, hit at 0x8, miss at 0x40, then an
    aliasing access at 0x100 that must first miss and later return data.

    Raises AssertionError when the icache output does not match.
    """
    # Names follow the VHDL testbench, i.e. the testbench's point of view:
    # i_out is what we drive *into* the icache, i_in is what comes back.
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    # NOTE(review): this drives an icache *output* from the testbench and
    # has no counterpart in icache_tb.vhdl; kept as in the original.
    yield i_in.valid.eq(0)
    yield i_out.priv_mode.eq(1)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield from _wait_cycles(4)

    # first access: miss, wait for the line to be loaded
    yield i_out.req.eq(1)
    yield i_out.nia.eq(0x0000000000000004)
    yield from _wait_cycles(31)
    valid = yield i_in.valid
    print(f"valid? {valid}")
    yield from _expect_insn(i_out, i_in, 0x00000001)
    yield i_out.req.eq(0)
    yield

    # hit
    yield i_out.req.eq(1)
    yield i_out.nia.eq(0x0000000000000008)
    yield from _wait_cycles(2)
    yield from _expect_insn(i_out, i_in, 0x00000002)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(0x0000000000000040)
    yield from _wait_cycles(31)
    yield from _expect_insn(i_out, i_in, 0x00000010)

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(0x0000000000000100)
    yield from _wait_cycles(2)
    valid = yield i_in.valid
    # "valid" is a plain int here: "~valid" would be -1 or -2, both
    # truthy, so the check must use "not".
    assert not valid
    yield from _wait_cycles(31)
    yield from _expect_insn(i_out, i_in, 0x00000040)
    yield i_out.req.eq(0)
1434
1435
1436
def test_icache(mem):
    """Simulate an ICache wired to a wishbone SRAM initialised from ``mem``.

    ``mem`` is the list of 64-bit words used as the SRAM contents.  The
    stimulus comes from ``icache_sim`` and the waveform is written to
    ``test_icache.vcd``.
    """
    dut = ICache()

    # backing store: 64-bit wide memory behind a wishbone SRAM wrapper
    memory = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()
    m.submodules.icache = dut
    m.submodules.sram = sram

    comb = m.d.comb

    # icache (wishbone master) -> SRAM (wishbone slave)
    for field in ("cyc", "stb", "we", "sel", "adr"):
        comb += getattr(sram.bus, field).eq(getattr(dut.wb_out, field))
    comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    # SRAM -> icache
    comb += dut.wb_in.ack.eq(sram.bus.ack)
    comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(icache_sim(dut)))

    with sim.write_vcd('test_icache.vcd'):
        sim.run()
1465
if __name__ == '__main__':
    # Dump the design as RTLIL first
    icache = ICache()
    rtlil_text = rtlil.convert(icache, ports=[])
    with open("test_icache.il", "w") as il_file:
        il_file.write(rtlil_text)

    # 512 64-bit words: word k carries 2*k in its low 32 bits and
    # 2*k+1 in its high 32 bits
    mem = [(word * 2) | ((word * 2 + 1) << 32) for word in range(512)]

    test_icache(mem)
1477