# [soc.git] src/soc/experiment/icache.py
# (gitweb header removed; last commit: "remove more reviewed comments")
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
* Add optimization: (maybe) interrupt reload on flush/redirect
* Check if playing with the geometry of the cache tags allows for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
22 from enum import Enum, unique
23 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
24 from nmigen.cli import main, rtlil
25 from nmutil.iocontrol import RecordObject
26 from nmigen.utils import log2_int
27 from nmutil.util import Display
28
29 #from nmutil.plru import PLRU
30 from soc.experiment.cache_ram import CacheRam
31 from soc.experiment.plru import PLRU
32
33 from soc.experiment.mem_types import (Fetch1ToICacheType,
34 ICacheToDecode1Type,
35 MMUToICacheType)
36
37 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
38 WB_SEL_BITS, WBAddrType, WBDataType,
39 WBSelType, WBMasterOut, WBSlaveOut,
40 WBMasterOutVector, WBSlaveOutVector,
41 WBIOMasterOut, WBIOSlaveOut)
42
43 # for test
44 from nmigen_soc.wishbone.sram import SRAM
45 from nmigen import Memory
46 from nmutil.util import wrap
47 from nmigen.cli import main, rtlil
48 if True:
49 from nmigen.back.pysim import Simulator, Delay, Settle
50 else:
51 from nmigen.sim.cxxsim import Simulator, Delay, Settle
52
53
SIM = 0
# Cache line size in bytes
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent
# the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

print("ROW_SIZE", ROW_SIZE)
print("ROW_SIZE_BITS", ROW_SIZE_BITS)
print("ROW_PER_LINE", ROW_PER_LINE)
print("BRAM_ROWS", BRAM_ROWS)
print("INSN_PER_ROW", INSN_PER_ROW)

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINEBITS is the number of bits to select a row within a line
ROW_LINEBITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to the next multiple of 8)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
# one tag-RAM row holds the tags of all ways, concatenated
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

# Geometry sanity checks: python equivalents of the asserts in the
# microwatt VHDL original.  log2_int() itself raises on any value that
# is not a power of 2, which covers the VHDL ispow2() checks.
assert LINE_SIZE % ROW_SIZE == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ROW_BITS == INDEX_BITS + ROW_LINEBITS, \
    "geometry bits don't add up"
assert LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINEBITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + INDEX_BITS + LINE_OFF_BITS, \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == TAG_BITS + ROW_BITS + ROW_OFF_BITS, \
    "geometry bits don't add up"

print("INSN_BITS", INSN_BITS)
print("ROW_BITS", ROW_BITS)
print("ROW_LINEBITS", ROW_LINEBITS)
print("LINE_OFF_BITS", LINE_OFF_BITS)
print("ROW_OFF_BITS", ROW_OFF_BITS)
print("INDEX_BITS", INDEX_BITS)
print("SET_SIZE_BITS", SET_SIZE_BITS)
print("TAG_BITS", TAG_BITS)
print("WAY_BITS", WAY_BITS)
print("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print("TLB_BITS", TLB_BITS)
print("TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
print("TLB_PTE_BITS", TLB_PTE_BITS)
149
150
151
152 # architecture rtl of icache is
153 #constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
154 #-- ROW_PER_LINE is the number of row (wishbone
155 #-- transactions) in a line
156 #constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
157 #-- BRAM_ROWS is the number of rows in BRAM
158 #-- needed to represent the full
159 #-- icache
160 #constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
161 #-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
162 #constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
163 #-- Bit fields counts in the address
164 #
165 #-- INSN_BITS is the number of bits to select
166 #-- an instruction in a row
167 #constant INSN_BITS : natural := log2(INSN_PER_ROW);
168 #-- ROW_BITS is the number of bits to select a row
169 #constant ROW_BITS : natural := log2(BRAM_ROWS);
170 #-- ROW_LINEBITS is the number of bits to
171 #-- select a row within a line
172 #constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
173 #-- LINE_OFF_BITS is the number of bits for the offset
174 #-- in a cache line
175 #constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
176 #-- ROW_OFF_BITS is the number of bits for the offset in a row
177 #constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
178 #-- INDEX_BITS is the number of bits to select a cache line
179 #constant INDEX_BITS : natural := log2(NUM_LINES);
180 #-- SET_SIZE_BITS is the log base 2 of the set size
181 #constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
182 #-- TAG_BITS is the number of bits of the tag part of the address
183 #constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
184 #-- WAY_BITS is the number of bits to select a way
185 #constant WAY_BITS : natural := log2(NUM_WAYS);
186
187 #-- Example of layout for 32 lines of 64 bytes:
188 #--
189 #-- .. tag |index| line |
190 #-- .. | row | |
191 #-- .. | | | |00| zero (2)
192 #-- .. | | |-| | INSN_BITS (1)
193 #-- .. | |---| | ROW_LINEBITS (3)
194 #-- .. | |--- - --| LINE_OFF_BITS (6)
195 #-- .. | |- --| ROW_OFF_BITS (3)
196 #-- .. |----- ---| | ROW_BITS (8)
197 #-- .. |-----| | INDEX_BITS (5)
198 #-- .. --------| | TAG_BITS (53)
199 # Example of layout for 32 lines of 64 bytes:
200 #
201 # .. tag |index| line |
202 # .. | row | |
203 # .. | | | |00| zero (2)
204 # .. | | |-| | INSN_BITS (1)
205 # .. | |---| | ROW_LINEBITS (3)
206 # .. | |--- - --| LINE_OFF_BITS (6)
207 # .. | |- --| ROW_OFF_BITS (3)
208 # .. |----- ---| | ROW_BITS (8)
209 # .. |-----| | INDEX_BITS (5)
210 # .. --------| | TAG_BITS (53)
211
212 #subtype row_t is integer range 0 to BRAM_ROWS-1;
213 #subtype index_t is integer range 0 to NUM_LINES-1;
214 #subtype way_t is integer range 0 to NUM_WAYS-1;
215 #subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
216 #
217 #-- The cache data BRAM organized as described above for each way
218 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
219 #
220 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
221 #-- not handle a clean (commented) definition of the cache tags as a 3d
222 #-- memory. For now, work around it by putting all the tags
223 #subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
224 # type cache_tags_set_t is array(way_t) of cache_tag_t;
225 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
226 #constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
227 #subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
228 #type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    """One tag-RAM row per cache line; each row is the concatenation
    of all ways' tags (TAG_RAM_WIDTH bits)."""
    rows = []
    for idx in range(NUM_LINES):
        rows.append(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % idx))
    return Array(rows)
232
233 #-- The cache valid bits
234 #subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
235 #type cache_valids_t is array(index_t) of cache_way_valids_t;
236 #type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    """Per-line valid bits: one NUM_WAYS-wide signal per cache line."""
    rows = []
    for idx in range(NUM_LINES):
        rows.append(Signal(NUM_WAYS, name="cachevalid_%d" % idx))
    return Array(rows)
240
def RowPerLineValidArray():
    """One valid bit per row of the line currently being reloaded."""
    bits = []
    for idx in range(ROW_PER_LINE):
        bits.append(Signal(name="rows_valid_%d" % idx))
    return Array(bits)
244
245
246 #attribute ram_style : string;
247 #attribute ram_style of cache_tags : signal is "distributed";
248 # TODO to be passed to nigmen as ram attributes
249 # attribute ram_style : string;
250 # attribute ram_style of cache_tags : signal is "distributed";
251
252
253 #subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
254 #type tlb_valids_t is array(tlb_index_t) of std_ulogic;
255 #subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
256 #type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
257 #subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
258 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    """One valid bit per direct-mapped ITLB entry."""
    bits = []
    for idx in range(TLB_SIZE):
        bits.append(Signal(name="tlbvalid_%d" % idx))
    return Array(bits)
262
def TLBTagArray():
    """EA tag (bits above page offset + index) per ITLB entry."""
    tags = []
    for idx in range(TLB_SIZE):
        tags.append(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" % idx))
    return Array(tags)
266
def TLBPtesArray():
    """Cached 64-bit PTE per ITLB entry."""
    ptes = []
    for idx in range(TLB_SIZE):
        ptes.append(Signal(TLB_PTE_BITS, name="tlbptes_%d" % idx))
    return Array(ptes)
270
271
272 #-- Cache RAM interface
273 #type cache_ram_out_t is array(way_t) of cache_row_t;
274 # Cache RAM interface
def CacheRamOut():
    """Cache RAM read data: one row-wide signal per way."""
    outs = []
    for idx in range(NUM_WAYS):
        outs.append(Signal(ROW_SIZE_BITS, name="cache_out_%d" % idx))
    return Array(outs)
278
279 #-- PLRU output interface
280 #type plru_out_t is array(index_t) of
281 # std_ulogic_vector(WAY_BITS-1 downto 0);
282 # PLRU output interface
def PLRUOut():
    """PLRU victim-way output: one WAY_BITS-wide signal per line."""
    outs = []
    for idx in range(NUM_LINES):
        outs.append(Signal(WAY_BITS, name="plru_out_%d" % idx))
    return Array(outs)
286
def get_index(addr):
    """Return the cache line index (tag index) for an address:
    bits [LINE_OFF_BITS:SET_SIZE_BITS]."""
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]
298
def get_row(addr):
    """Return the cache row index (data memory) for an address:
    bits [ROW_OFF_BITS:SET_SIZE_BITS]."""
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]
310
def get_row_of_line(row):
    """Return the index of a row within its line: the low
    ROW_LINEBITS bits of the row number."""
    return row[:ROW_LINEBITS]
321
def is_last_row_addr(addr, last):
    """True when the row-in-line field of addr equals `last`
    (i.e. this address is in the last row of its line)."""
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
335
def is_last_row(row, last):
    """True when `row` is the last row of its line, i.e. its
    row-in-line index equals `last`."""
    return get_row_of_line(row) == last
345
def next_row(row):
    """Return the next row in the current cache line.  Only the low
    ROW_LINEBITS bits are incremented (wrapping within the line) so the
    generated adder is limited to the bits within a cache line
    (3 bits with default settings)."""
    row_v = row[0:ROW_LINEBITS] + 1
    return Cat(row_v[:ROW_LINEBITS], row[ROW_LINEBITS:])

# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    """Select the 32-bit instruction at word offset
    addr[2:INSN_BITS+2] within the row `data`."""
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
380
def get_tag(addr):
    """Return the tag field of a real address:
    bits [SET_SIZE_BITS:REAL_ADDR_BITS]."""
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
392
def read_tag(way, tagset):
    """Read one way's TAG_BITS-wide tag out of a tag-RAM row."""
    return tagset.word_select(way, TAG_BITS)
402
def write_tag(way, tagset, tag):
    """Return an assignment writing `tag` into one way's slot of a
    tag-RAM row (for use with comb/sync +=)."""
    return read_tag(way, tagset).eq(tag)
412
def hash_ea(addr):
    """Simple hash for the direct-mapped TLB index: XOR-fold three
    consecutive TLB_BITS-wide fields of the EA above the page offset."""
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
437
438 # begin
439 #
440 # XXX put these assert statements in - as python asserts
441 #
442 # assert LINE_SIZE mod ROW_SIZE = 0;
443 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
444 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
445 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
446 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
447 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
448 # report "geometry bits don't add up"
449 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
450 # report "geometry bits don't add up"
451 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
452 # report "geometry bits don't add up"
453 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
454 # report "geometry bits don't add up"
455 #
456 # sim_debug: if SIM generate
457 # debug: process
458 # begin
459 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
460 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
461 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
462 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
463 # report "INSN_BITS = " & natural'image(INSN_BITS);
464 # report "ROW_BITS = " & natural'image(ROW_BITS);
465 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
466 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
467 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
468 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
469 # report "TAG_BITS = " & natural'image(TAG_BITS);
470 # report "WAY_BITS = " & natural'image(WAY_BITS);
471 # wait;
472 # end process;
473 # end generate;
474
# Cache reload state machine
@unique
class State(Enum):
    """Reload FSM states: IDLE waits for a miss; CLR_TAG clears the
    victim way's valid bit and writes the new tag; WAIT_ACK streams
    the line in over wishbone until the last row acks."""
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2
481
482
class RegInternal(RecordObject):
    """All per-cycle icache state: the one-cycle hit latches (for the
    BRAM read delay), the miss/reload state machine registers, and the
    TLB fetch-failed flag."""
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(NUM_WAYS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        # NOTE(review): store_way/store_index/store_row (and hit_way
        # above) are sized by the element *count* (NUM_WAYS, NUM_LINES,
        # BRAM_ROWS) rather than the bit width (WAY_BITS, INDEX_BITS,
        # ROW_BITS) -- wider than necessary.  Harmless as used, but
        # confirm all comparison sites before narrowing.
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(NUM_WAYS)
        self.store_index = Signal(NUM_LINES)
        self.store_row = Signal(BRAM_ROWS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINEBITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()
506
507 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
508 #
509 # entity icache is
510 # generic (
511 # SIM : boolean := false;
512 # -- Line size in bytes
513 # LINE_SIZE : positive := 64;
514 # -- BRAM organisation: We never access more
515 # -- than wishbone_data_bits
516 # -- at a time so to save resources we make the
517 # -- array only that wide,
518 # -- and use consecutive indices for to make a cache "line"
519 # --
520 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
521 # -- so 64-bits)
522 # ROW_SIZE : positive := wishbone_data_bits / 8;
523 # -- Number of lines in a set
524 # NUM_LINES : positive := 32;
525 # -- Number of ways
526 # NUM_WAYS : positive := 4;
527 # -- L1 ITLB number of entries (direct mapped)
528 # TLB_SIZE : positive := 64;
529 # -- L1 ITLB log_2(page_size)
530 # TLB_LG_PGSZ : positive := 12;
531 # -- Number of real address bits that we store
532 # REAL_ADDR_BITS : positive := 56;
533 # -- Non-zero to enable log data collection
534 # LOG_LENGTH : natural := 0
535 # );
536 # port (
537 # clk : in std_ulogic;
538 # rst : in std_ulogic;
539 #
540 # i_in : in Fetch1ToIcacheType;
541 # i_out : out IcacheToDecode1Type;
542 #
543 # m_in : in MmuToIcacheType;
544 #
545 # stall_in : in std_ulogic;
546 # stall_out : out std_ulogic;
547 # flush_in : in std_ulogic;
548 # inval_in : in std_ulogic;
549 #
550 # wishbone_out : out wishbone_master_out;
551 # wishbone_in : in wishbone_slave_out;
552 #
553 # log_out : out std_ulogic_vector(53 downto 0)
554 # );
555 # end entity icache;
556 # 64 bit direct mapped icache. All instructions are 4B aligned.
557 class ICache(Elaboratable):
558 """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        # request from fetch1 in, decoded instruction out to decode1
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        # MMU interface: TLB loads and invalidations
        self.m_in = MMUToICacheType(name="m_in")

        # pipeline control: stall/flush in, stall out, cache inval in
        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # wishbone master used for cache line reloads
        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        # debug log output (log collection gated by LOG_LENGTH)
        self.log_out = Signal(54)
574
575
    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous, replace_way, req_row):
        """Instantiate one CacheRam per way: read port follows the
        requested row, write port stores incoming wishbone reload data
        into the way being replaced."""
        comb = m.d.comb

        wb_in, stall_in = self.wb_in, self.stall_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            # hold the read (keep last cycle's row) on a stall or when
            # re-using the previously fetched row
            comb += do_read.eq(~(stall_in | use_previous))
            # write a reload beat only into the way being replaced
            comb += do_write.eq(wb_in.ack & (replace_way == i))

            # mux the hit way's read data out to the instruction path
            with m.If(r.hit_way == i):
                comb += cache_out_row.eq(d_out)
            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            # whole-row writes: replicate do_write across all byte lanes
            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
609
610 # -- Generate PLRUs
611 def maybe_plrus(self, m, r, plru_victim):
612 comb = m.d.comb
613
614 with m.If(NUM_WAYS > 1):
615 for i in range(NUM_LINES):
616 plru_acc_i = Signal(WAY_BITS)
617 plru_acc_en = Signal()
618 plru = PLRU(WAY_BITS)
619 setattr(m.submodules, "plru_%d" % i, plru)
620
621 comb += plru.acc_i.eq(plru_acc_i)
622 comb += plru.acc_en.eq(plru_acc_en)
623
624 # PLRU interface
625 with m.If(get_index(r.hit_nia) == i):
626 comb += plru.acc_en.eq(r.hit_valid)
627
628 comb += plru.acc_i.eq(r.hit_way)
629 comb += plru_victim[i].eq(plru.lru_o)
630
    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                    priv_fault, access_ok):
        """Look the fetch EA up in the direct-mapped ITLB and form the
        real address.  In real mode the EA passes straight through and
        access is always permitted."""
        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            # real address = page offset from the EA, upper bits from
            # the cached PTE
            comb += real_addr.eq(Cat(
                     i_in.nia[:TLB_LG_PGSZ],
                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
                    ))

            # translation is valid only if the stored EA tag matches
            # and the entry's valid bit is set
            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            # PTE bit 3 is used as the privileged-access (EAA) bit
            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)
665
    # iTLB update
    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
        """Handle MMU-initiated TLB operations: tlbie+doall clears every
        valid bit, tlbie clears the indexed entry, tlbld installs a new
        tag/PTE pair."""
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        # NOTE(review): wr_index only needs TLB_BITS bits; Signal(TLB_SIZE)
        # makes it 64 bits wide -- oversized but harmless as used.
        wr_index = Signal(TLB_SIZE)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            # install new entry: EA tag above the index bits, plus PTE
            sync += itlb_tags[wr_index].eq(
                     m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
                    )
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)
691
    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_tag, real_addr, req_laddr, cache_valid_bits,
                    cache_tags, access_ok, req_is_hit,
                    req_is_miss, replace_way, plru_victim, cache_out_row):
        """Decode the fetch request, detect a hit on any way, drive the
        hit/miss strobes and the fetch2 outputs, and stall fetch1 on a
        miss or access fault."""
        comb = m.d.comb

        #comb += Display("ENTER icache_comb - use_previous:%x req_index:%x "
        #                "req_row:%x req_tag:%x real_addr:%x req_laddr:%x "
        #                "access_ok:%x req_is_hit:%x req_is_miss:%x "
        #                "replace_way:%x", use_previous, req_index, req_row,
        #                req_tag, real_addr, req_laddr, access_ok,
        #                req_is_hit, req_is_miss, replace_way)

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(NUM_WAYS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                ))

        # Test if pending request is a hit on any way.  hitcond allows
        # a hit on the line currently being reloaded, provided the row
        # being requested has already arrived (r.rows_valid).
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                 & (req_index == r.store_index)
                 & r.rows_valid[req_row % ROW_PER_LINE])
        with m.If(i_in.req):
            cvb = Signal(NUM_WAYS)
            ctag = Signal(TAG_RAM_WIDTH)
            comb += ctag.eq(cache_tags[req_index])
            comb += cvb.eq(cache_valid_bits[req_index])
            for i in range(NUM_WAYS):
                tagi = Signal(TAG_BITS, name="ti%d" % i)
                comb += tagi.eq(read_tag(i, ctag))
                hit_test = Signal(name="hit_test%d" % i)
                comb += hit_test.eq(i == r.store_way)
                # a way hits if it is valid (or is the partially-loaded
                # way under hitcond) and its tag matches the request
                with m.If((cvb[i] | (hitcond & hit_test)) &
                          (tagi == req_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        #comb += Display("BEFORE read_insn_word - r.hit_nia:%x " \
        #                "r.hit_way:%x, cache_out[r.hit_way]:%x", r.hit_nia, \
        #                r.hit_way, cache_out[r.hit_way])
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)
785
    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        """Latch hit results for the next cycle (when the BRAM read
        data becomes available) and keep fetch2 outputs stable across
        stalls."""
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x " \
                                "tag:%x way:%x RA:%x", i_in.nia, \
                                i_in.virt_mode, i_in.stop_mark, req_index, \
                                req_tag, req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)
820
821 # Cache miss/reload synchronous machine
822 def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
823 req_index, req_laddr, req_tag, replace_way,
824 cache_tags, access_ok, real_addr):
825 comb = m.d.comb
826 sync = m.d.sync
827
828 i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
829 stall_in, flush_in = self.stall_in, self.flush_in
830 inval_in = self.inval_in
831
832 # variable tagset : cache_tags_set_t;
833 # variable stbs_done : boolean;
834
835 tagset = Signal(TAG_RAM_WIDTH)
836 stbs_done = Signal()
837
838 comb += r.wb.sel.eq(-1)
839 comb += r.wb.adr.eq(r.req_adr[3:])
840
841 # Process cache invalidations
842 with m.If(inval_in):
843 for i in range(NUM_LINES):
844 sync += cache_valid_bits[i].eq(0)
845 sync += r.store_valid.eq(0)
846
847 # Main state machine
848 with m.Switch(r.state):
849
850 with m.Case(State.IDLE):
851 # Reset per-row valid flags,
852 # only used in WAIT_ACK
853 for i in range(ROW_PER_LINE):
854 sync += r.rows_valid[i].eq(0)
855
856 # We need to read a cache line
857 with m.If(req_is_miss):
858 sync += Display("cache miss nia:%x IR:%x SM:%x idx:%x "
859 " way:%x tag:%x RA:%x", i_in.nia,
860 i_in.virt_mode, i_in.stop_mark, req_index,
861 replace_way, req_tag, real_addr)
862
863 # Keep track of our index and way
864 # for subsequent stores
865 st_row = Signal(BRAM_ROWS)
866 comb += st_row.eq(get_row(req_laddr))
867 sync += r.store_index.eq(req_index)
868 sync += r.store_row.eq(st_row)
869 sync += r.store_tag.eq(req_tag)
870 sync += r.store_valid.eq(1)
871 sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
872
873 # Prep for first wishbone read. We calculate the
874 # address of the start of the cache line and
875 # start the WB cycle.
876 sync += r.req_adr.eq(req_laddr)
877 sync += r.wb.cyc.eq(1)
878 sync += r.wb.stb.eq(1)
879
880 # Track that we had one request sent
881 sync += r.state.eq(State.CLR_TAG)
882
883 with m.Case(State.CLR_TAG, State.WAIT_ACK):
884 with m.If(r.state == State.CLR_TAG):
885 # Get victim way from plru
886 sync += r.store_way.eq(replace_way)
887 # Force misses on that way while reloading that line
888 cv = Signal(INDEX_BITS)
889 comb += cv.eq(cache_valid_bits[req_index])
890 comb += cv.bit_select(replace_way, 1).eq(0)
891 sync += cache_valid_bits[req_index].eq(cv)
892
893 for i in range(NUM_WAYS):
894 with m.If(i == replace_way):
895 comb += tagset.eq(cache_tags[r.store_index])
896 comb += write_tag(i, tagset, r.store_tag)
897 sync += cache_tags[r.store_index].eq(tagset)
898
899 sync += r.state.eq(State.WAIT_ACK)
900
901 # Requests are all sent if stb is 0
902 stbs_zero = Signal()
903 comb += stbs_zero.eq(r.wb.stb == 0)
904 comb += stbs_done.eq(stbs_zero)
905
906 # If we are still sending requests, was one accepted?
907 with m.If(~wb_in.stall & ~stbs_zero):
908 # That was the last word ? # We are done sending.
909 # Clear stb and set stbs_done # so we can handle
910 # an eventual last ack on # the same cycle.
911 with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
912 sync += Display("IS_LAST_ROW_ADDR " \
913 "r.wb.addr:%x r.end_row_ix:%x " \
914 "r.wb.stb:%x stbs_zero:%x " \
915 "stbs_done:%x", r.wb.adr, \
916 r.end_row_ix, r.wb.stb, \
917 stbs_zero, stbs_done)
918 sync += r.wb.stb.eq(0)
919 comb += stbs_done.eq(1)
920
921 # Calculate the next row address
922 rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
923 comb += rarange.eq(
924 r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
925 )
926 sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
927 rarange
928 )
929 sync += Display("RARANGE r.req_adr:%x rarange:%x "
930 "stbs_zero:%x stbs_done:%x",
931 r.req_adr, rarange, stbs_zero, stbs_done)
932
933 # Incoming acks processing
934 with m.If(wb_in.ack):
935 sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
936 "stbs_done:%x",
937 wb_in.dat, stbs_zero, stbs_done)
938
939 sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
940
941 # Check for completion
942 with m.If(stbs_done &
943 is_last_row(r.store_row, r.end_row_ix)):
944 # Complete wishbone cycle
945 sync += r.wb.cyc.eq(0)
946
947 # Cache line is now valid
948 cv = Signal(INDEX_BITS)
949 comb += cv.eq(cache_valid_bits[r.store_index])
950 comb += cv.bit_select(replace_way, 1).eq(
951 r.store_valid & ~inval_in
952 )
953 sync += cache_valid_bits[r.store_index].eq(cv)
954
955 sync += r.state.eq(State.IDLE)
956
957 # Increment store row counter
958 sync += r.store_row.eq(next_row(r.store_row))
959
960 # TLB miss and protection fault processing
961 with m.If(flush_in | m_in.tlbld):
962 sync += r.fetch_failed.eq(0)
963 with m.Elif(i_in.req & ~access_ok & ~stall_in):
964 sync += r.fetch_failed.eq(1)
965
    # debug logger, equivalent of microwatt "if LOG_LENGTH > 0 generate"
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        """Pack a snapshot of icache state into a 54-bit log word.

        Registers hit/miss flags, wishbone handshake bits, fetch-failure
        and FSM-busy status into ``log_data`` (one cycle delayed) and
        presents it combinatorially on ``self.log_out``.  Only active
        when LOG_LENGTH > 0.
        """
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            # NOTE(review): these locals shadow the lway/wstate
            # parameters, and with LOG_LENGTH > 1 each iteration would
            # also drive log_out -- confirm LOG_LENGTH is only 0 or 1.
            log_data = Signal(54)
            lway = Signal(NUM_WAYS)
            wstate = Signal()

            # capture which way hit; wstate=1 whenever the FSM is busy
            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            # Cat() order is LSB-first: ra_valid is bit 0,
            # i_out.valid lands in the top bit (53)
            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6],
                     r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
                     r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
                     i_out.valid
                ))
            comb += log_out.eq(log_data)
1028
    def elaborate(self, platform):
        """Construct the icache: declare the tag/valid/TLB storage and
        the per-request signals, then wire the RAM, PLRU, TLB lookup,
        hit and miss sub-processes together through those signals.
        """

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # instruction-TLB storage (valids, tags, PTEs)
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPtesArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # current state of the cache (FSM state, wishbone request, etc.)
        r = RegInternal()

        # Async signals on incoming request.
        # NOTE(review): widths here are entry *counts* (NUM_LINES,
        # BRAM_ROWS, NUM_WAYS, TLB_SIZE) rather than log2 of them --
        # wider than a binary index needs; confirm whether log2_int
        # widths were intended.
        req_index = Signal(NUM_LINES)
        req_row = Signal(BRAM_ROWS)
        req_hit_way = Signal(NUM_WAYS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # TLB lookup result: translated real address, validity,
        # and the access-permission outcome
        tlb_req_index = Signal(TLB_SIZE)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # row of data read out of the cache BRAM this cycle
        cache_out_row = Signal(ROW_SIZE_BITS)

        # PLRU replacement choice per index, and the way selected
        plru_victim = PLRUOut()
        replace_way = Signal(NUM_WAYS)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
                         real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                         priv_fault, access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row,
                         req_tag, real_addr, req_laddr, cache_valid_bits,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m
1119
1120
1121 # icache_tb.vhdl
1122 #
1123 # library ieee;
1124 # use ieee.std_logic_1164.all;
1125 #
1126 # library work;
1127 # use work.common.all;
1128 # use work.wishbone_types.all;
1129 #
1130 # entity icache_tb is
1131 # end icache_tb;
1132 #
1133 # architecture behave of icache_tb is
1134 # signal clk : std_ulogic;
1135 # signal rst : std_ulogic;
1136 #
1137 # signal i_out : Fetch1ToIcacheType;
1138 # signal i_in : IcacheToDecode1Type;
1139 #
1140 # signal m_out : MmuToIcacheType;
1141 #
1142 # signal wb_bram_in : wishbone_master_out;
1143 # signal wb_bram_out : wishbone_slave_out;
1144 #
1145 # constant clk_period : time := 10 ns;
1146 # begin
1147 # icache0: entity work.icache
1148 # generic map(
1149 # LINE_SIZE => 64,
1150 # NUM_LINES => 4
1151 # )
1152 # port map(
1153 # clk => clk,
1154 # rst => rst,
1155 # i_in => i_out,
1156 # i_out => i_in,
1157 # m_in => m_out,
1158 # stall_in => '0',
1159 # flush_in => '0',
1160 # inval_in => '0',
1161 # wishbone_out => wb_bram_in,
1162 # wishbone_in => wb_bram_out
1163 # );
1164 #
1165 # -- BRAM Memory slave
1166 # bram0: entity work.wishbone_bram_wrapper
1167 # generic map(
1168 # MEMORY_SIZE => 1024,
1169 # RAM_INIT_FILE => "icache_test.bin"
1170 # )
1171 # port map(
1172 # clk => clk,
1173 # rst => rst,
1174 # wishbone_in => wb_bram_in,
1175 # wishbone_out => wb_bram_out
1176 # );
1177 #
1178 # clk_process: process
1179 # begin
1180 # clk <= '0';
1181 # wait for clk_period/2;
1182 # clk <= '1';
1183 # wait for clk_period/2;
1184 # end process;
1185 #
1186 # rst_process: process
1187 # begin
1188 # rst <= '1';
1189 # wait for 2*clk_period;
1190 # rst <= '0';
1191 # wait;
1192 # end process;
1193 #
1194 # stim: process
1195 # begin
1196 # i_out.req <= '0';
1197 # i_out.nia <= (others => '0');
1198 # i_out.stop_mark <= '0';
1199 #
1200 # m_out.tlbld <= '0';
1201 # m_out.tlbie <= '0';
1202 # m_out.addr <= (others => '0');
1203 # m_out.pte <= (others => '0');
1204 #
1205 # wait until rising_edge(clk);
1206 # wait until rising_edge(clk);
1207 # wait until rising_edge(clk);
1208 # wait until rising_edge(clk);
1209 #
1210 # i_out.req <= '1';
1211 # i_out.nia <= x"0000000000000004";
1212 #
1213 # wait for 30*clk_period;
1214 # wait until rising_edge(clk);
1215 #
1216 # assert i_in.valid = '1' severity failure;
1217 # assert i_in.insn = x"00000001"
1218 # report "insn @" & to_hstring(i_out.nia) &
1219 # "=" & to_hstring(i_in.insn) &
1220 # " expected 00000001"
1221 # severity failure;
1222 #
1223 # i_out.req <= '0';
1224 #
1225 # wait until rising_edge(clk);
1226 #
1227 # -- hit
1228 # i_out.req <= '1';
1229 # i_out.nia <= x"0000000000000008";
1230 # wait until rising_edge(clk);
1231 # wait until rising_edge(clk);
1232 # assert i_in.valid = '1' severity failure;
1233 # assert i_in.insn = x"00000002"
1234 # report "insn @" & to_hstring(i_out.nia) &
1235 # "=" & to_hstring(i_in.insn) &
1236 # " expected 00000002"
1237 # severity failure;
1238 # wait until rising_edge(clk);
1239 #
1240 # -- another miss
1241 # i_out.req <= '1';
1242 # i_out.nia <= x"0000000000000040";
1243 #
1244 # wait for 30*clk_period;
1245 # wait until rising_edge(clk);
1246 #
1247 # assert i_in.valid = '1' severity failure;
1248 # assert i_in.insn = x"00000010"
1249 # report "insn @" & to_hstring(i_out.nia) &
1250 # "=" & to_hstring(i_in.insn) &
1251 # " expected 00000010"
1252 # severity failure;
1253 #
1254 # -- test something that aliases
1255 # i_out.req <= '1';
1256 # i_out.nia <= x"0000000000000100";
1257 # wait until rising_edge(clk);
1258 # wait until rising_edge(clk);
1259 # assert i_in.valid = '0' severity failure;
1260 # wait until rising_edge(clk);
1261 #
1262 # wait for 30*clk_period;
1263 # wait until rising_edge(clk);
1264 #
1265 # assert i_in.valid = '1' severity failure;
1266 # assert i_in.insn = x"00000040"
1267 # report "insn @" & to_hstring(i_out.nia) &
1268 # "=" & to_hstring(i_in.insn) &
1269 # " expected 00000040"
1270 # severity failure;
1271 #
1272 # i_out.req <= '0';
1273 #
1274 # std.env.finish;
1275 # end process;
1276 # end;
def icache_sim(dut):
    """Stimulus process for the ICache DUT.

    Issues instruction fetches and checks the returned words against
    the pattern written by test_icache(): a cold miss, a hit in the
    same line, a second miss, and finally an address that aliases into
    the same cache index (which must miss first, then hit after the
    reload completes).
    """
    i_out = dut.i_in    # fetch request port (into the cache)
    i_in = dut.i_out    # fetch response port (out of the cache)
    m_out = dut.m_in    # MMU-to-icache port

    # reset all request inputs, then let the design settle
    yield i_in.valid.eq(0)
    yield i_out.priv_mode.eq(1)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    for _ in range(4):
        yield

    # first fetch: a cold miss, allow 30 cycles for the line reload
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    print(f"valid? {valid}")
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit: same cache line, valid two cycles later
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases: must NOT be valid immediately...
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    # BUGFIX: was "assert ~valid" -- on a Python int, ~0 == -1 and
    # ~1 == -2 are both truthy, so the check could never fail.
    assert not valid
    # ...but must become valid once the aliasing line is reloaded
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)
1353
1354
1355
def test_icache(mem):
    """Simulate the ICache against a wishbone SRAM initialised from
    *mem*, driving the icache_sim() stimulus process and writing a
    VCD trace of the run.
    """
    dut = ICache()

    bram = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=bram, granularity=8)

    m = Module()
    m.submodules.icache = dut
    m.submodules.sram = sram

    # wire the cache's wishbone master out to the SRAM slave...
    m.d.comb += [
        sram.bus.cyc.eq(dut.wb_out.cyc),
        sram.bus.stb.eq(dut.wb_out.stb),
        sram.bus.we.eq(dut.wb_out.we),
        sram.bus.sel.eq(dut.wb_out.sel),
        sram.bus.adr.eq(dut.wb_out.adr),
        sram.bus.dat_w.eq(dut.wb_out.dat),
        # ...and the slave's response back in
        dut.wb_in.ack.eq(sram.bus.ack),
        dut.wb_in.dat.eq(sram.bus.dat_r),
    ]

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()
1384
if __name__ == '__main__':
    # emit the RTLIL netlist for standalone inspection
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # each 64-bit memory word holds two consecutive 32-bit
    # "instructions": (2*i) in the low half, (2*i + 1) in the high half
    mem = [(i * 2) | ((i * 2 + 1) << 32) for i in range(512)]

    test_icache(mem)
1396