comb on wr_index not sync
[soc.git] / src / soc / experiment / icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
22 from enum import Enum, unique
23 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
24 from nmigen.cli import main, rtlil
25 from nmutil.iocontrol import RecordObject
26 from nmigen.utils import log2_int
27 from nmutil.util import Display
28
29 #from nmutil.plru import PLRU
30 from soc.experiment.cache_ram import CacheRam
31 from soc.experiment.plru import PLRU
32
33 from soc.experiment.mem_types import (Fetch1ToICacheType,
34 ICacheToDecode1Type,
35 MMUToICacheType)
36
37 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
38 WB_SEL_BITS, WBAddrType, WBDataType,
39 WBSelType, WBMasterOut, WBSlaveOut,
40 WBMasterOutVector, WBSlaveOutVector,
41 WBIOMasterOut, WBIOSlaveOut)
42
43 # for test
44 from nmigen_soc.wishbone.sram import SRAM
45 from nmigen import Memory
46 from nmutil.util import wrap
47 from nmigen.cli import main, rtlil
48 if True:
49 from nmigen.back.pysim import Simulator, Delay, Settle
50 else:
51 from nmigen.sim.cxxsim import Simulator, Delay, Settle
52
53
# Simulation flag (currently unused in this file)
SIM = 0
# Cache line size in bytes
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

# Width in bits of one BRAM row (one wishbone transfer)
ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row
# (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in
# BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit
# instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

print("ROW_SIZE", ROW_SIZE)
print("ROW_SIZE_BITS", ROW_SIZE_BITS)
print("ROW_PER_LINE", ROW_PER_LINE)
print("BRAM_ROWS", BRAM_ROWS)
print("INSN_PER_ROW", INSN_PER_ROW)

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINEBITS is the number of bits to
# select a row within a line
ROW_LINEBITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to the next multiple of 8)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
# one packed tag-RAM row holds the tags of every way for one line
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# -- L1 ITLB.
# constant TLB_BITS : natural := log2(TLB_SIZE);
# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
# constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64


print("INSN_BITS", INSN_BITS)
print("ROW_BITS", ROW_BITS)
print("ROW_LINEBITS", ROW_LINEBITS)
print("LINE_OFF_BITS", LINE_OFF_BITS)
print("ROW_OFF_BITS", ROW_OFF_BITS)
print("INDEX_BITS", INDEX_BITS)
print("SET_SIZE_BITS", SET_SIZE_BITS)
print("TAG_BITS", TAG_BITS)
print("WAY_BITS", WAY_BITS)
print("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print("TLB_BITS", TLB_BITS)
print("TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
print("TLB_PTE_BITS", TLB_PTE_BITS)
148
149
150
151
152 # architecture rtl of icache is
153 #constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
154 #-- ROW_PER_LINE is the number of row (wishbone
155 #-- transactions) in a line
156 #constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
157 #-- BRAM_ROWS is the number of rows in BRAM
158 #-- needed to represent the full
159 #-- icache
160 #constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
161 #-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
162 #constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
163 #-- Bit fields counts in the address
164 #
165 #-- INSN_BITS is the number of bits to select
166 #-- an instruction in a row
167 #constant INSN_BITS : natural := log2(INSN_PER_ROW);
168 #-- ROW_BITS is the number of bits to select a row
169 #constant ROW_BITS : natural := log2(BRAM_ROWS);
170 #-- ROW_LINEBITS is the number of bits to
171 #-- select a row within a line
172 #constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
173 #-- LINE_OFF_BITS is the number of bits for the offset
174 #-- in a cache line
175 #constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
176 #-- ROW_OFF_BITS is the number of bits for the offset in a row
177 #constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
178 #-- INDEX_BITS is the number of bits to select a cache line
179 #constant INDEX_BITS : natural := log2(NUM_LINES);
180 #-- SET_SIZE_BITS is the log base 2 of the set size
181 #constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
182 #-- TAG_BITS is the number of bits of the tag part of the address
183 #constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
184 #-- WAY_BITS is the number of bits to select a way
185 #constant WAY_BITS : natural := log2(NUM_WAYS);
186
187 #-- Example of layout for 32 lines of 64 bytes:
188 #--
189 #-- .. tag |index| line |
190 #-- .. | row | |
191 #-- .. | | | |00| zero (2)
192 #-- .. | | |-| | INSN_BITS (1)
193 #-- .. | |---| | ROW_LINEBITS (3)
194 #-- .. | |--- - --| LINE_OFF_BITS (6)
195 #-- .. | |- --| ROW_OFF_BITS (3)
196 #-- .. |----- ---| | ROW_BITS (8)
197 #-- .. |-----| | INDEX_BITS (5)
198 #-- .. --------| | TAG_BITS (53)
199 # Example of layout for 32 lines of 64 bytes:
200 #
201 # .. tag |index| line |
202 # .. | row | |
203 # .. | | | |00| zero (2)
204 # .. | | |-| | INSN_BITS (1)
205 # .. | |---| | ROW_LINEBITS (3)
206 # .. | |--- - --| LINE_OFF_BITS (6)
207 # .. | |- --| ROW_OFF_BITS (3)
208 # .. |----- ---| | ROW_BITS (8)
209 # .. |-----| | INDEX_BITS (5)
210 # .. --------| | TAG_BITS (53)
211
212 #subtype row_t is integer range 0 to BRAM_ROWS-1;
213 #subtype index_t is integer range 0 to NUM_LINES-1;
214 #subtype way_t is integer range 0 to NUM_WAYS-1;
215 #subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
216 #
217 #-- The cache data BRAM organized as described above for each way
218 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
219 #
220 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
221 #-- not handle a clean (commented) definition of the cache tags as a 3d
222 #-- memory. For now, work around it by putting all the tags
223 #subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
224 # type cache_tags_set_t is array(way_t) of cache_tag_t;
225 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
226 #constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
227 #subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
228 #type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    """One packed tag-RAM row (all ways' tags concatenated) per line."""
    rows = [Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x)
            for x in range(NUM_LINES)]
    return Array(rows)
232
233 #-- The cache valid bits
234 #subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
235 #type cache_valids_t is array(index_t) of cache_way_valids_t;
236 #type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    """Per-line valid bits, one bit per way."""
    bits = [Signal(NUM_WAYS, name="cachevalid_%d" % x)
            for x in range(NUM_LINES)]
    return Array(bits)
240
def RowPerLineValidArray():
    """Per-row valid flags for the cache line currently being reloaded."""
    flags = [Signal(name="rows_valid_%d" % x)
             for x in range(ROW_PER_LINE)]
    return Array(flags)
244
245
246 #attribute ram_style : string;
247 #attribute ram_style of cache_tags : signal is "distributed";
248 # TODO to be passed to nigmen as ram attributes
249 # attribute ram_style : string;
250 # attribute ram_style of cache_tags : signal is "distributed";
251
252
253 #subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
254 #type tlb_valids_t is array(tlb_index_t) of std_ulogic;
255 #subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
256 #type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
257 #subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
258 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    """One valid bit per (direct-mapped) iTLB entry."""
    bits = [Signal(name="tlbvalid_%d" % x)
            for x in range(TLB_SIZE)]
    return Array(bits)
262
def TLBTagArray():
    """Effective-address tag per iTLB entry."""
    tags = [Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" % x)
            for x in range(TLB_SIZE)]
    return Array(tags)
266
def TLBPtesArray():
    """Page-table entry per iTLB entry."""
    ptes = [Signal(TLB_PTE_BITS, name="tlbptes_%d" % x)
            for x in range(TLB_SIZE)]
    return Array(ptes)
270
271
272 #-- Cache RAM interface
273 #type cache_ram_out_t is array(way_t) of cache_row_t;
274 # Cache RAM interface
# Cache RAM interface: one row-wide read-data bus per way
def CacheRamOut():
    """Per-way cache-RAM read-data outputs."""
    outs = [Signal(ROW_SIZE_BITS, name="cache_out_%d" % x)
            for x in range(NUM_WAYS)]
    return Array(outs)
278
279 #-- PLRU output interface
280 #type plru_out_t is array(index_t) of
281 # std_ulogic_vector(WAY_BITS-1 downto 0);
282 # PLRU output interface
# PLRU output interface: victim way per line
def PLRUOut():
    """Per-line PLRU victim-way outputs."""
    outs = [Signal(WAY_BITS, name="plru_out_%d" % x)
            for x in range(NUM_LINES)]
    return Array(outs)
286
287 # -- Return the cache line index (tag index) for an address
288 # function get_index(addr: std_ulogic_vector(63 downto 0))
289 # return index_t is
290 # begin
291 # return to_integer(unsigned(
292 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
293 # ));
294 # end;
295 # Return the cache line index (tag index) for an address
# Return the cache line index (tag index) for an address
def get_index(addr):
    """Cache line (tag-RAM index) selected by *addr*."""
    index = addr[LINE_OFF_BITS:SET_SIZE_BITS]
    return index
298
299 # -- Return the cache row index (data memory) for an address
300 # function get_row(addr: std_ulogic_vector(63 downto 0))
301 # return row_t is
302 # begin
303 # return to_integer(unsigned(
304 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
305 # ));
306 # end;
307 # Return the cache row index (data memory) for an address
# Return the cache row index (data memory) for an address
def get_row(addr):
    """BRAM (data-memory) row selected by *addr*."""
    row = addr[ROW_OFF_BITS:SET_SIZE_BITS]
    return row
310
311 # -- Return the index of a row within a line
312 # function get_row_of_line(row: row_t) return row_in_line_t is
313 # variable row_v : unsigned(ROW_BITS-1 downto 0);
314 # begin
315 # row_v := to_unsigned(row, ROW_BITS);
316 # return row_v(ROW_LINEBITS-1 downto 0);
317 # end;
318 # Return the index of a row within a line
# Return the index of a row within a line
def get_row_of_line(row):
    """Position of BRAM row *row* within its cache line."""
    return row[0:ROW_LINEBITS]
321
322 # -- Returns whether this is the last row of a line
323 # function is_last_row_addr(addr: wishbone_addr_type;
324 # last: row_in_line_t
325 # )
326 # return boolean is
327 # begin
328 # return unsigned(
329 # addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
330 # ) = last;
331 # end;
332 # Returns whether this is the last row of a line
# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """True when *addr* falls in the final (*last*) row of its line."""
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
335
336 # -- Returns whether this is the last row of a line
337 # function is_last_row(row: row_t;
338 # last: row_in_line_t) return boolean is
339 # begin
340 # return get_row_of_line(row) = last;
341 # end;
342 # Returns whether this is the last row of a line
# Returns whether this is the last row of a line
def is_last_row(row, last):
    """True when BRAM row *row* is the final (*last*) row of its line."""
    row_in_line = get_row_of_line(row)
    return row_in_line == last
345
346 # -- Return the next row in the current cache line. We use a dedicated
347 # -- function in order to limit the size of the generated adder to be
348 # -- only the bits within a cache line (3 bits with default settings)
349 # function next_row(row: row_t) return row_t is
350 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
351 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
352 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
353 # begin
354 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
355 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
356 # row_v(ROW_LINEBITS-1 downto 0) :=
357 # std_ulogic_vector(unsigned(row_idx) + 1);
358 # return to_integer(unsigned(row_v));
359 # end;
360 # Return the next row in the current cache line. We use a dedicated
361 # function in order to limit the size of the generated adder to be
362 # only the bits within a cache line (3 bits with default settings)
# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    """Next row in the same cache line (wraps inside the line)."""
    in_line = row[:ROW_LINEBITS]        # small adder: line-offset bits only
    upper = row[ROW_LINEBITS:]          # line-select bits pass unchanged
    bumped = in_line + 1
    return Cat(bumped[:ROW_LINEBITS], upper)
366 # -- Read the instruction word for the given address in the
367 # -- current cache row
368 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
369 # data: cache_row_t) return std_ulogic_vector is
370 # variable word: integer range 0 to INSN_PER_ROW-1;
371 # begin
372 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
373 # return data(31+word*32 downto word*32);
374 # end;
375 # Read the instruction word for the given address
376 # in the current cache row
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    """Select the 32-bit instruction addressed by *addr* from row *data*."""
    word_idx = addr[2:INSN_BITS+2]
    return data.word_select(word_idx, 32)
380
381 # -- Get the tag value from the address
382 # function get_tag(
383 # addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
384 # )
385 # return cache_tag_t is
386 # begin
387 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
388 # end;
389 # Get the tag value from the address
# Get the tag value from the address
def get_tag(addr):
    """Tag portion (bits above the set index) of real address *addr*."""
    tag = addr[SET_SIZE_BITS:REAL_ADDR_BITS]
    return tag
392
393 # -- Read a tag from a tag memory row
394 # function read_tag(way: way_t; tagset: cache_tags_set_t)
395 # return cache_tag_t is
396 # begin
397 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
398 # end;
399 # Read a tag from a tag memory row
# Read a tag from a tag memory row
def read_tag(way, tagset):
    """TAG_BITS-wide slice for *way* out of packed tag-RAM row *tagset*."""
    tag = tagset.word_select(way, TAG_BITS)
    return tag
402
403 # -- Write a tag to tag memory row
404 # procedure write_tag(way: in way_t;
405 # tagset: inout cache_tags_set_t; tag: cache_tag_t) is
406 # begin
407 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
408 # end;
409 # Write a tag to tag memory row
# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Assignment storing *tag* into *way*'s slice of tag-RAM row *tagset*."""
    return tagset.word_select(way, TAG_BITS).eq(tag)
412
413 # -- Simple hash for direct-mapped TLB index
414 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
415 # return tlb_index_t is
416 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
417 # begin
418 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
419 # xor addr(
420 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
421 # TLB_LG_PGSZ + TLB_BITS
422 # )
423 # xor addr(
424 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
425 # TLB_LG_PGSZ + 2 * TLB_BITS
426 # );
427 # return to_integer(unsigned(hash));
428 # end;
429 # Simple hash for direct-mapped TLB index
# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    """XOR-fold three TLB_BITS-wide slices of *addr* (above the page
    offset) into a direct-mapped TLB index."""
    slices = [addr[TLB_LG_PGSZ + i * TLB_BITS:
                   TLB_LG_PGSZ + (i + 1) * TLB_BITS]
              for i in range(3)]
    return slices[0] ^ slices[1] ^ slices[2]
437
438 # begin
439 #
440 # XXX put these assert statements in - as python asserts
441 #
442 # assert LINE_SIZE mod ROW_SIZE = 0;
443 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
444 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
445 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
446 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
447 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
448 # report "geometry bits don't add up"
449 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
450 # report "geometry bits don't add up"
451 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
452 # report "geometry bits don't add up"
453 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
454 # report "geometry bits don't add up"
455 #
456 # sim_debug: if SIM generate
457 # debug: process
458 # begin
459 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
460 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
461 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
462 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
463 # report "INSN_BITS = " & natural'image(INSN_BITS);
464 # report "ROW_BITS = " & natural'image(ROW_BITS);
465 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
466 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
467 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
468 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
469 # report "TAG_BITS = " & natural'image(TAG_BITS);
470 # report "WAY_BITS = " & natural'image(WAY_BITS);
471 # wait;
472 # end process;
473 # end generate;
474
475 # Cache reload state machine
# Cache reload state machine
@unique
class State(Enum):
    # no reload in progress; ready to accept a miss
    IDLE = 0
    # miss accepted: victim way being selected (see replace_way mux
    # in icache_comb, which reads the PLRU output in this state)
    CLR_TAG = 1
    # wishbone reload burst in flight; partially-loaded lines may
    # already hit (see hitcond / rows_valid in icache_comb)
    WAIT_ACK = 2
481
482
class RegInternal(RecordObject):
    """Clocked internal state of the icache.

    Fields that hold an *index* (way, line, row) are declared with the
    log2 bit-counts (WAY_BITS / INDEX_BITS / ROW_BITS).  The previous
    code used the element counts (NUM_WAYS / NUM_LINES / BRAM_ROWS) as
    widths, which stored e.g. a 0..3 way number in a 4-bit register and
    a 0..127 row number in a 128-bit register — functionally harmless
    (all values fit) but wasteful in registers and mux decode width.
    """
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)       # way number that hit
        self.hit_nia = Signal(64)             # NIA of the latched hit
        self.hit_smark = Signal()             # latched stop mark
        self.hit_valid = Signal()             # hit data valid this cycle

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")           # registered wishbone request
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)     # way being reloaded
        self.store_index = Signal(INDEX_BITS) # line being reloaded
        self.store_row = Signal(ROW_BITS)     # row currently being filled
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINEBITS)  # last row of the burst
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()
506
507 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
508 #
509 # entity icache is
510 # generic (
511 # SIM : boolean := false;
512 # -- Line size in bytes
513 # LINE_SIZE : positive := 64;
514 # -- BRAM organisation: We never access more
515 # -- than wishbone_data_bits
516 # -- at a time so to save resources we make the
517 # -- array only that wide,
518 # -- and use consecutive indices for to make a cache "line"
519 # --
520 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
521 # -- so 64-bits)
522 # ROW_SIZE : positive := wishbone_data_bits / 8;
523 # -- Number of lines in a set
524 # NUM_LINES : positive := 32;
525 # -- Number of ways
526 # NUM_WAYS : positive := 4;
527 # -- L1 ITLB number of entries (direct mapped)
528 # TLB_SIZE : positive := 64;
529 # -- L1 ITLB log_2(page_size)
530 # TLB_LG_PGSZ : positive := 12;
531 # -- Number of real address bits that we store
532 # REAL_ADDR_BITS : positive := 56;
533 # -- Non-zero to enable log data collection
534 # LOG_LENGTH : natural := 0
535 # );
536 # port (
537 # clk : in std_ulogic;
538 # rst : in std_ulogic;
539 #
540 # i_in : in Fetch1ToIcacheType;
541 # i_out : out IcacheToDecode1Type;
542 #
543 # m_in : in MmuToIcacheType;
544 #
545 # stall_in : in std_ulogic;
546 # stall_out : out std_ulogic;
547 # flush_in : in std_ulogic;
548 # inval_in : in std_ulogic;
549 #
550 # wishbone_out : out wishbone_master_out;
551 # wishbone_in : in wishbone_slave_out;
552 #
553 # log_out : out std_ulogic_vector(53 downto 0)
554 # );
555 # end entity icache;
556 # 64 bit direct mapped icache. All instructions are 4B aligned.
557 class ICache(Elaboratable):
558 """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        """Declare the icache's external ports (no logic here)."""
        # fetch1 request in, decode1 instruction out
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        # MMU interface (iTLB loads and invalidations)
        self.m_in = MMUToICacheType(name="m_in")

        # pipeline control
        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # wishbone master interface used for cache reloads
        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        # log/debug output (used when LOG_LENGTH is non-zero)
        self.log_out = Signal(54)
574
575
576 # Generate a cache RAM for each way
    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous, replace_way, req_row):
        """Instantiate one CacheRam per way and wire its ports.

        All ways read req_row in parallel; only the way matching
        r.hit_way forwards its data onto cache_out_row.  Writes go to
        r.store_row and are enabled per-way when a wishbone ack arrives
        for the way being replaced.
        """
        comb = m.d.comb

        wb_in, stall_in = self.wb_in, self.stall_in


        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            # reload data comes straight off the wishbone data bus
            comb += way.wr_data.eq(wb_in.dat)

            # read unless stalled or re-using last cycle's row data
            comb += do_read.eq(~(stall_in | use_previous))
            # write this way only when it is the replacement target
            # and the wishbone transfer has been acknowledged
            comb += do_write.eq(wb_in.ack & (replace_way == i))

            # mux: only the hit way drives the instruction-output row
            with m.If(r.hit_way == i):
                comb += cache_out_row.eq(d_out)
            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            # all bytes of the row are written together
            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
609
610 # -- Generate PLRUs
611 def maybe_plrus(self, m, r, plru_victim):
612 comb = m.d.comb
613
614 with m.If(NUM_WAYS > 1):
615 for i in range(NUM_LINES):
616 plru_acc_i = Signal(WAY_BITS)
617 plru_acc_en = Signal()
618 plru = PLRU(WAY_BITS)
619 setattr(m.submodules, "plru_%d" % i, plru)
620
621 comb += plru.acc_i.eq(plru_acc_i)
622 comb += plru.acc_en.eq(plru_acc_en)
623
624 # PLRU interface
625 with m.If(get_index(r.hit_nia) == i):
626 comb += plru.acc_en.eq(r.hit_valid)
627
628 comb += plru.acc_i.eq(r.hit_way)
629 comb += plru_victim[i].eq(plru.lru_o)
630
631 # TLB hit detection and real address generation
    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                    priv_fault, access_ok):
        """Translate i_in.nia through the direct-mapped iTLB.

        Produces the real address, whether the translation is valid
        (ra_valid), and the access-permission outcome (priv_fault,
        access_ok).  With translation off, nia is passed through as the
        real address and always valid.
        """
        comb = m.d.comb

        i_in = self.i_in

        # PTE and effective-address tag read from the indexed TLB entry
        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            # real address = page offset from nia + RPN from the PTE
            comb += real_addr.eq(Cat(
                     i_in.nia[:TLB_LG_PGSZ],
                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
                    ))

            # translation valid only on a tag match against the upper
            # effective-address bits and a set valid bit
            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            # PTE bit 3 gates privileged access (EAA) —
            # NOTE(review): assumed; confirm against the PTE format
            comb += eaa_priv.eq(pte[3])

        with m.Else():
            # translation off: nia is already a real address
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)
665
666 # iTLB update
667 def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
668 comb = m.d.comb
669 sync = m.d.sync
670
671 m_in = self.m_in
672
673 wr_index = Signal(TLB_SIZE)
674 comb += wr_index.eq(hash_ea(m_in.addr))
675
676 with m.If(m_in.tlbie & m_in.doall):
677 # Clear all valid bits
678 for i in range(TLB_SIZE):
679 sync += itlb_valid_bits[i].eq(0)
680
681 with m.Elif(m_in.tlbie):
682 # Clear entry regardless of hit or miss
683 sync += itlb_valid_bits[wr_index].eq(0)
684
685 with m.Elif(m_in.tlbld):
686 sync += itlb_tags[wr_index].eq(
687 m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
688 )
689 sync += itlb_ptes[wr_index].eq(m_in.pte)
690 sync += itlb_valid_bits[wr_index].eq(1)
691
692 # -- Cache hit detection, output to fetch2 and other misc logic
693 # icache_comb : process(all)
694 # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_tag, real_addr, req_laddr, cache_valid_bits,
                    cache_tags, access_ok, req_is_hit,
                    req_is_miss, replace_way, plru_victim, cache_out_row):
        """Combinatorial hit/miss detection and fetch2 output wiring.

        Decodes the fetch request address, tests all ways for a hit
        (including partially-reloaded lines), drives the hit/miss
        strobes for the synchronous machines, selects the replacement
        way, and muxes the latched hit row out to decode1.
        """
        # variable is_hit : std_ulogic;
        # variable hit_way : way_t;
        comb = m.d.comb

        #comb += Display("ENTER icache_comb - use_previous:%x req_index:%x " \
        #                "req_row:%x req_tag:%x real_addr:%x req_laddr:%x " \
        #                "access_ok:%x req_is_hit:%x req_is_miss:%x " \
        #                "replace_way:%x", use_previous, req_index, req_row, \
        #                req_tag, real_addr, req_laddr, access_ok, \
        #                req_is_hit, req_is_miss, replace_way)

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(NUM_WAYS)
        # NOTE(review): the VHDL original also drives req_hit_way <=
        # hit_way here; hit_way is local to this method and no
        # req_hit_way port exists in this signature — confirm it is
        # connected in elaborate(), otherwise icache_hit latches an
        # undriven way.
        # begin
        # -- i_in.sequential means that i_in.nia this cycle
        # -- is 4 more than last cycle. If we read more
        # -- than 32 bits at a time, had a cache hit last
        # -- cycle, and we don't want the first 32-bit chunk
        # -- then we can keep the data we read last cycle
        # -- and just use that.
        # if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
        #     use_previous <= i_in.sequential and r.hit_valid;
        # else
        #     use_previous <= '0';
        # end if;
        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        # (comb default gives use_previous = 0 in the 'else' case)
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # -- Extract line, row and tag from request
        # req_index <= get_index(i_in.nia);
        # req_row <= get_row(i_in.nia);
        # req_tag <= get_tag(real_addr);
        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # -- Calculate address of beginning of cache row, will be
        # -- used for cache miss processing if needed
        # req_laddr <=
        #     (63 downto REAL_ADDR_BITS => '0') &
        #     real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
        #     (ROW_OFF_BITS-1 downto 0 => '0');
        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        # (low zeroes = row alignment; high 8 zeroes pad 56 -> 64 bits)
        comb += req_laddr.eq(Cat(
                 Const(0b0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                 Const(0b0, 8)
                ))

        # -- Test if pending request is a hit on any way
        # hit_way := 0;
        # is_hit := '0';
        # for i in way_t loop
        #     if i_in.req = '1' and
        #         (cache_valids(req_index)(i) = '1' or
        #          (r.state = WAIT_ACK and
        #           req_index = r.store_index and
        #           i = r.store_way and
        #           r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
        #         if read_tag(i, cache_tags(req_index)) = req_tag then
        #             hit_way := i;
        #             is_hit := '1';
        #         end if;
        #     end if;
        # end loop;
        # Test if pending request is a hit on any way
        # hitcond: a line still being reloaded may hit, provided the
        # requested row of it has already arrived (rows_valid)
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                 & (req_index == r.store_index)
                 & r.rows_valid[req_row % ROW_PER_LINE])
        with m.If(i_in.req):
            cvb = Signal(NUM_WAYS)
            ctag = Signal(TAG_RAM_WIDTH)
            comb += ctag.eq(cache_tags[req_index])
            comb += cvb.eq(cache_valid_bits[req_index])
            for i in range(NUM_WAYS):
                tagi = Signal(TAG_BITS, name="ti%d" % i)
                comb += tagi.eq(read_tag(i, ctag))
                hit_test = Signal(name="hit_test%d" % i)
                comb += hit_test.eq(i == r.store_way)
                # hit if way is valid (or is the partially-loaded way)
                # and its stored tag matches the request tag
                with m.If((cvb[i] | (hitcond & hit_test)) & (tagi == req_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # -- Generate the "hit" and "miss" signals
        # -- for the synchronous blocks
        # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
        #     and rst = '0' then
        #     req_is_hit <= is_hit;
        #     req_is_miss <= not is_hit;
        # else
        #     req_is_hit <= '0';
        #     req_is_miss <= '0';
        # end if;
        # req_hit_way <= hit_way;
        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        # -- The way to replace on a miss
        # if r.state = CLR_TAG then
        #     replace_way <=
        #         to_integer(unsigned(plru_victim(r.store_index)));
        # else
        #     replace_way <= r.store_way;
        # end if;
        # The way to replace on a miss: PLRU victim while choosing,
        # latched store_way once the reload is under way
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])

        with m.Else():
            comb += replace_way.eq(r.store_way)

        # -- Output instruction from current cache row
        # --
        # -- Note: This is a mild violation of our design principle of
        # -- having pipeline stages output from a clean latch. In this
        # -- case we output the result of a mux. The alternative would
        # -- be output an entire row which I prefer not to do just yet
        # -- as it would force fetch2 to know about some of the cache
        # -- geometry information.
        # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
        # i_out.valid <= r.hit_valid;
        # i_out.nia <= r.hit_nia;
        # i_out.stop_mark <= r.hit_smark;
        # i_out.fetch_failed <= r.fetch_failed;
        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        #comb += Display("BEFORE read_insn_word - r.hit_nia:%x " \
        #    "r.hit_way:%x, cache_out[r.hit_way]:%x", r.hit_nia, \
        #    r.hit_way, cache_out[r.hit_way])
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # -- Stall fetch1 if we have a miss on cache or TLB
        # -- or a protection fault
        # stall_out <= not (is_hit and access_ok);
        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # -- Wishbone requests output (from the cache miss reload machine)
        # wishbone_out <= r.wb;
        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)
868 # end process;
869
870 # -- Cache hit synchronous machine
871 # icache_hit : process(clk)
872 # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        """Clocked hit path: latch hit state for the 1-cycle BRAM read.

        On a stall (or when re-using last cycle's row) the fetch2
        outputs are held, except that flush clears hit_valid; otherwise
        the hit/way of the current request is latched for next cycle.
        """
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # begin
        # if rising_edge(clk) then
        #     -- keep outputs to fetch2 unchanged on a stall
        #     -- except that flush or reset sets valid to 0
        #     -- If use_previous, keep the same data as last
        #     -- cycle and use the second half
        #     if stall_in = '1' or use_previous = '1' then
        #         if rst = '1' or flush_in = '1' then
        #             r.hit_valid <= '0';
        #     end if;
        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        # else
        #     -- On a hit, latch the request for the next cycle,
        #     -- when the BRAM data will be available on the
        #     -- cache_out output of the corresponding way
        #     r.hit_valid <= req_is_hit;
        #     if req_is_hit = '1' then
        #         r.hit_way <= req_hit_way;
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)

                # report "cache hit nia:" & to_hstring(i_in.nia) &
                #     " IR:" & std_ulogic'image(i_in.virt_mode) &
                #     " SM:" & std_ulogic'image(i_in.stop_mark) &
                #     " idx:" & integer'image(req_index) &
                #     " tag:" & to_hstring(req_tag) &
                #     " way:" & integer'image(req_hit_way) &
                #     " RA:" & to_hstring(real_addr);
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x " \
                         "tag:%x way:%x RA:%x", i_in.nia, \
                         i_in.virt_mode, i_in.stop_mark, req_index, \
                         req_tag, req_hit_way, real_addr)



        # end if;
        # end if;
        # if stall_in = '0' then
        #     -- Send stop marks and NIA down regardless of validity
        #     r.hit_smark <= i_in.stop_mark;
        #     r.hit_nia <= i_in.nia;
        # end if;
        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)
938 # end if;
939 # end process;
940
941 # -- Cache miss/reload synchronous machine
942 # icache_miss : process(clk)
943 # Cache miss/reload synchronous machine
944 def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
945 req_index, req_laddr, req_tag, replace_way,
946 cache_tags, access_ok, real_addr):
947 comb = m.d.comb
948 sync = m.d.sync
949
950 i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
951 stall_in, flush_in = self.stall_in, self.flush_in
952 inval_in = self.inval_in
953
954 # variable tagset : cache_tags_set_t;
955 # variable stbs_done : boolean;
956
957 tagset = Signal(TAG_RAM_WIDTH)
958 stbs_done = Signal()
959
960 # begin
961 # if rising_edge(clk) then
962 # -- On reset, clear all valid bits to force misses
963 # if rst = '1' then
964 # On reset, clear all valid bits to force misses
965 # for i in index_t loop
966 # cache_valids(i) <= (others => '0');
967 # end loop;
968 # r.state <= IDLE;
969 # r.wb.cyc <= '0';
970 # r.wb.stb <= '0';
971 # -- We only ever do reads on wishbone
972 # r.wb.dat <= (others => '0');
973 # r.wb.sel <= "11111111";
974 # r.wb.we <= '0';
975
976 # -- Not useful normally but helps avoiding
977 # -- tons of sim warnings
978 # r.wb.adr <= (others => '0');
979
980 # else
981
982 # -- Process cache invalidations
983 # if inval_in = '1' then
984 # for i in index_t loop
985 # cache_valids(i) <= (others => '0');
986 # end loop;
987 # r.store_valid <= '0';
988 # end if;
989 comb += r.wb.sel.eq(-1)
990 comb += r.wb.adr.eq(r.req_adr[3:])
991
992 # Process cache invalidations
993 with m.If(inval_in):
994 for i in range(NUM_LINES):
995 sync += cache_valid_bits[i].eq(0)
996 sync += r.store_valid.eq(0)
997
998 # -- Main state machine
999 # case r.state is
1000 # Main state machine
1001 with m.Switch(r.state):
1002
1003 # when IDLE =>
1004 with m.Case(State.IDLE):
1005 # -- Reset per-row valid flags,
1006 # -- only used in WAIT_ACK
1007 # for i in 0 to ROW_PER_LINE - 1 loop
1008 # r.rows_valid(i) <= '0';
1009 # end loop;
1010 # Reset per-row valid flags,
1011 # only used in WAIT_ACK
1012 for i in range(ROW_PER_LINE):
1013 sync += r.rows_valid[i].eq(0)
1014
1015 # -- We need to read a cache line
1016 # if req_is_miss = '1' then
1017 # report "cache miss nia:" & to_hstring(i_in.nia) &
1018 # " IR:" & std_ulogic'image(i_in.virt_mode) &
1019 # " SM:" & std_ulogic'image(i_in.stop_mark) &
1020 # " idx:" & integer'image(req_index) &
1021 # " way:" & integer'image(replace_way) &
1022 # " tag:" & to_hstring(req_tag) &
1023 # " RA:" & to_hstring(real_addr);
1024 # We need to read a cache line
1025 with m.If(req_is_miss):
1026 sync += Display(
1027 "cache miss nia:%x IR:%x SM:%x idx:%x " \
1028 " way:%x tag:%x RA:%x", i_in.nia, \
1029 i_in.virt_mode, i_in.stop_mark, req_index, \
1030 replace_way, req_tag, real_addr)
1031
1032 # -- Keep track of our index and way for
1033 # -- subsequent stores
1034 # r.store_index <= req_index;
1035 # r.store_row <= get_row(req_laddr);
1036 # r.store_tag <= req_tag;
1037 # r.store_valid <= '1';
1038 # r.end_row_ix <=
1039 # get_row_of_line(get_row(req_laddr)) - 1;
1040 # Keep track of our index and way
1041 # for subsequent stores
1042 sync += r.store_index.eq(req_index)
1043 sync += r.store_row.eq(get_row(req_laddr))
1044 sync += r.store_tag.eq(req_tag)
1045 sync += r.store_valid.eq(1)
1046 sync += r.end_row_ix.eq(
1047 get_row_of_line(
1048 get_row(req_laddr)
1049 ) - 1
1050 )
1051
1052 # -- Prep for first wishbone read. We calculate the
1053 # -- address of the start of the cache line and
1054 # -- start the WB cycle.
1055 # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
1056 # r.wb.cyc <= '1';
1057 # r.wb.stb <= '1';
1058 # Prep for first wishbone read.
1059 # We calculate the
1060 # address of the start of the cache line and
1061 # start the WB cycle.
1062 sync += r.req_adr.eq(req_laddr)
1063 sync += r.wb.cyc.eq(1)
1064 sync += r.wb.stb.eq(1)
1065
1066 # -- Track that we had one request sent
1067 # r.state <= CLR_TAG;
1068 # Track that we had one request sent
1069 sync += r.state.eq(State.CLR_TAG)
1070 # end if;
1071
1072 # when CLR_TAG | WAIT_ACK =>
1073 with m.Case(State.CLR_TAG, State.WAIT_ACK):
1074 # if r.state = CLR_TAG then
1075 with m.If(r.state == State.CLR_TAG):
1076 # -- Get victim way from plru
1077 # r.store_way <= replace_way;
1078 # Get victim way from plru
1079 sync += r.store_way.eq(replace_way)
1080 #
1081 # -- Force misses on that way while
1082 # -- reloading that line
1083 # cache_valids(req_index)(replace_way) <= '0';
1084 # Force misses on that way while
1085 # realoading that line
1086 cv = Signal(INDEX_BITS)
1087 comb += cv.eq(cache_valid_bits[req_index])
1088 comb += cv.bit_select(replace_way, 1).eq(0)
1089 sync += cache_valid_bits[req_index].eq(cv)
1090
1091 # -- Store new tag in selected way
1092 # for i in 0 to NUM_WAYS-1 loop
1093 # if i = replace_way then
1094 # tagset := cache_tags(r.store_index);
1095 # write_tag(i, tagset, r.store_tag);
1096 # cache_tags(r.store_index) <= tagset;
1097 # end if;
1098 # end loop;
1099 for i in range(NUM_WAYS):
1100 with m.If(i == replace_way):
1101 comb += tagset.eq(cache_tags[r.store_index])
1102 comb += write_tag(i, tagset, r.store_tag)
1103 sync += cache_tags[r.store_index].eq(tagset)
1104
1105 # r.state <= WAIT_ACK;
1106 sync += r.state.eq(State.WAIT_ACK)
1107 # end if;
1108
1109 # -- Requests are all sent if stb is 0
1110 # stbs_done := r.wb.stb = '0';
1111 # Requests are all sent if stb is 0
1112 stbs_zero = Signal()
1113 comb += stbs_zero.eq(r.wb.stb == 0)
1114 comb += stbs_done.eq(stbs_zero)
1115
1116 # -- If we are still sending requests,
1117 # -- was one accepted ?
1118 # if wishbone_in.stall = '0' and not stbs_done then
1119 # If we are still sending requests,
1120 # was one accepted?
1121 with m.If(~wb_in.stall & ~stbs_zero):
1122 # -- That was the last word ? We are done sending.
1123 # -- Clear stb and set stbs_done so we can handle
1124 # -- an eventual last ack on the same cycle.
1125 # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
1126 # r.wb.stb <= '0';
1127 # stbs_done := true;
1128 # end if;
1129 # That was the last word ?
1130 # We are done sending.
1131 # Clear stb and set stbs_done
1132 # so we can handle
1133 # an eventual last ack on
1134 # the same cycle.
1135 with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
1136 sync += Display("IS_LAST_ROW_ADDR " \
1137 "r.wb.addr:%x r.end_row_ix:%x " \
1138 "r.wb.stb:%x stbs_zero:%x " \
1139 "stbs_done:%x", r.wb.adr, \
1140 r.end_row_ix, r.wb.stb, \
1141 stbs_zero, stbs_done)
1142 sync += r.wb.stb.eq(0)
1143 comb += stbs_done.eq(1)
1144
1145 # -- Calculate the next row address
1146 # r.wb.adr <= next_row_addr(r.wb.adr);
1147 # Calculate the next row address
1148 rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
1149 comb += rarange.eq(
1150 r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
1151 )
1152 sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(
1153 rarange
1154 )
1155 sync += Display("RARANGE r.wb.adr:%x stbs_zero:%x " \
1156 "stbs_done:%x", rarange, stbs_zero, \
1157 stbs_done)
1158 # end if;
1159
1160 # -- Incoming acks processing
1161 # if wishbone_in.ack = '1' then
1162 # Incoming acks processing
1163 with m.If(wb_in.ack):
1164 # r.rows_valid(r.store_row mod ROW_PER_LINE)
1165 # <= '1';
1166 sync += Display("WB_IN_ACK stbs_zero:%x " \
1167 "stbs_done:%x", \
1168 stbs_zero, stbs_done)
1169
1170 sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
1171
1172 # -- Check for completion
1173 # if stbs_done and
1174 # is_last_row(r.store_row, r.end_row_ix) then
1175 # Check for completion
1176 with m.If(stbs_done &
1177 is_last_row(r.store_row, r.end_row_ix)):
1178 # -- Complete wishbone cycle
1179 # r.wb.cyc <= '0';
1180 # Complete wishbone cycle
1181 sync += r.wb.cyc.eq(0)
1182
1183 # -- Cache line is now valid
1184 # cache_valids(r.store_index)(replace_way) <=
1185 # r.store_valid and not inval_in;
1186 # Cache line is now valid
1187 cv = Signal(INDEX_BITS)
1188 comb += cv.eq(cache_valid_bits[r.store_index])
1189 comb += cv.bit_select(replace_way, 1).eq(
1190 r.store_valid & ~inval_in
1191 )
1192 sync += cache_valid_bits[r.store_index].eq(cv)
1193
1194 # -- We are done
1195 # r.state <= IDLE;
1196 # We are done
1197 sync += r.state.eq(State.IDLE)
1198 # end if;
1199
1200 # -- Increment store row counter
1201 # r.store_row <= next_row(r.store_row);
1202 # Increment store row counter
1203 sync += r.store_row.eq(next_row(r.store_row))
1204 # end if;
1205 # end case;
1206 # end if;
1207 #
1208 # -- TLB miss and protection fault processing
1209 # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
1210 # r.fetch_failed <= '0';
1211 # elsif i_in.req = '1' and access_ok = '0' and
1212 # stall_in = '0' then
1213 # r.fetch_failed <= '1';
1214 # end if;
1215 # TLB miss and protection fault processing
1216 with m.If(flush_in | m_in.tlbld):
1217 sync += r.fetch_failed.eq(0)
1218
1219 with m.Elif(i_in.req & ~access_ok & ~stall_in):
1220 sync += r.fetch_failed.eq(1)
1221 # end if;
1222 # end process;
1223
1224 # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        """Pack a 54-bit snapshot of cache status into log_out.

        Equivalent of the VHDL "icache_log: if LOG_LENGTH > 0 generate"
        block; emits nothing when LOG_LENGTH is 0.
        """
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            # NOTE(review): the lway/wstate *parameters* are shadowed by
            # these local Signals (they model the VHDL process
            # variables) -- confirm the parameters are intentionally
            # unused by this port
            log_data = Signal(54)
            lway = Signal(NUM_WAYS)
            wstate = Signal()

            # latch the hit way, and whether the reload state machine
            # is busy (any state other than IDLE)
            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            # pack status bits LSB-first: Cat() order is the reverse of
            # the VHDL "&" concatenation in the original
            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6],
                r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
                r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
                i_out.valid
            ))
            # drive the log port combinatorially from the latched data
            comb += log_out.eq(log_data)
1284 # end generate;
1285 # end;
1286
    def elaborate(self, platform):
        """Build the icache: storage arrays, shared async signals, and
        the sub-processes (rams, plru, tlb, comb, hit, miss) wired
        together into one Module."""

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # instruction TLB arrays (VHDL marks the tag/pte rams
        # ram_style "distributed")
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPtesArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # internal registers: state machine, latched hit info,
        # wishbone request
        r = RegInternal()

        # Async signals on incoming request.
        # NOTE(review): several widths look over-wide -- the VHDL types
        # are *indices*, so req_index should need log2_int(NUM_LINES)
        # bits rather than NUM_LINES, and similarly for req_row,
        # req_hit_way and tlb_req_index below.  Harmless (upper bits
        # stay zero) but worth confirming and fixing.
        req_index = Signal(NUM_LINES)
        req_row = Signal(BRAM_ROWS)
        req_hit_way = Signal(NUM_WAYS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # TLB lookup results for the current request
        tlb_req_index = Signal(TLB_SIZE)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # one full BRAM row of instruction data
        cache_out_row = Signal(ROW_SIZE_BITS)

        # PLRU victim per line, and the way chosen for reload
        plru_victim = PLRUOut()
        replace_way = Signal(NUM_WAYS)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
                         real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                         priv_fault, access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row,
                         req_tag, real_addr, req_laddr, cache_valid_bits,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m
1377
1378
1379 # icache_tb.vhdl
1380 #
1381 # library ieee;
1382 # use ieee.std_logic_1164.all;
1383 #
1384 # library work;
1385 # use work.common.all;
1386 # use work.wishbone_types.all;
1387 #
1388 # entity icache_tb is
1389 # end icache_tb;
1390 #
1391 # architecture behave of icache_tb is
1392 # signal clk : std_ulogic;
1393 # signal rst : std_ulogic;
1394 #
1395 # signal i_out : Fetch1ToIcacheType;
1396 # signal i_in : IcacheToDecode1Type;
1397 #
1398 # signal m_out : MmuToIcacheType;
1399 #
1400 # signal wb_bram_in : wishbone_master_out;
1401 # signal wb_bram_out : wishbone_slave_out;
1402 #
1403 # constant clk_period : time := 10 ns;
1404 # begin
1405 # icache0: entity work.icache
1406 # generic map(
1407 # LINE_SIZE => 64,
1408 # NUM_LINES => 4
1409 # )
1410 # port map(
1411 # clk => clk,
1412 # rst => rst,
1413 # i_in => i_out,
1414 # i_out => i_in,
1415 # m_in => m_out,
1416 # stall_in => '0',
1417 # flush_in => '0',
1418 # inval_in => '0',
1419 # wishbone_out => wb_bram_in,
1420 # wishbone_in => wb_bram_out
1421 # );
1422 #
1423 # -- BRAM Memory slave
1424 # bram0: entity work.wishbone_bram_wrapper
1425 # generic map(
1426 # MEMORY_SIZE => 1024,
1427 # RAM_INIT_FILE => "icache_test.bin"
1428 # )
1429 # port map(
1430 # clk => clk,
1431 # rst => rst,
1432 # wishbone_in => wb_bram_in,
1433 # wishbone_out => wb_bram_out
1434 # );
1435 #
1436 # clk_process: process
1437 # begin
1438 # clk <= '0';
1439 # wait for clk_period/2;
1440 # clk <= '1';
1441 # wait for clk_period/2;
1442 # end process;
1443 #
1444 # rst_process: process
1445 # begin
1446 # rst <= '1';
1447 # wait for 2*clk_period;
1448 # rst <= '0';
1449 # wait;
1450 # end process;
1451 #
1452 # stim: process
1453 # begin
1454 # i_out.req <= '0';
1455 # i_out.nia <= (others => '0');
1456 # i_out.stop_mark <= '0';
1457 #
1458 # m_out.tlbld <= '0';
1459 # m_out.tlbie <= '0';
1460 # m_out.addr <= (others => '0');
1461 # m_out.pte <= (others => '0');
1462 #
1463 # wait until rising_edge(clk);
1464 # wait until rising_edge(clk);
1465 # wait until rising_edge(clk);
1466 # wait until rising_edge(clk);
1467 #
1468 # i_out.req <= '1';
1469 # i_out.nia <= x"0000000000000004";
1470 #
1471 # wait for 30*clk_period;
1472 # wait until rising_edge(clk);
1473 #
1474 # assert i_in.valid = '1' severity failure;
1475 # assert i_in.insn = x"00000001"
1476 # report "insn @" & to_hstring(i_out.nia) &
1477 # "=" & to_hstring(i_in.insn) &
1478 # " expected 00000001"
1479 # severity failure;
1480 #
1481 # i_out.req <= '0';
1482 #
1483 # wait until rising_edge(clk);
1484 #
1485 # -- hit
1486 # i_out.req <= '1';
1487 # i_out.nia <= x"0000000000000008";
1488 # wait until rising_edge(clk);
1489 # wait until rising_edge(clk);
1490 # assert i_in.valid = '1' severity failure;
1491 # assert i_in.insn = x"00000002"
1492 # report "insn @" & to_hstring(i_out.nia) &
1493 # "=" & to_hstring(i_in.insn) &
1494 # " expected 00000002"
1495 # severity failure;
1496 # wait until rising_edge(clk);
1497 #
1498 # -- another miss
1499 # i_out.req <= '1';
1500 # i_out.nia <= x"0000000000000040";
1501 #
1502 # wait for 30*clk_period;
1503 # wait until rising_edge(clk);
1504 #
1505 # assert i_in.valid = '1' severity failure;
1506 # assert i_in.insn = x"00000010"
1507 # report "insn @" & to_hstring(i_out.nia) &
1508 # "=" & to_hstring(i_in.insn) &
1509 # " expected 00000010"
1510 # severity failure;
1511 #
1512 # -- test something that aliases
1513 # i_out.req <= '1';
1514 # i_out.nia <= x"0000000000000100";
1515 # wait until rising_edge(clk);
1516 # wait until rising_edge(clk);
1517 # assert i_in.valid = '0' severity failure;
1518 # wait until rising_edge(clk);
1519 #
1520 # wait for 30*clk_period;
1521 # wait until rising_edge(clk);
1522 #
1523 # assert i_in.valid = '1' severity failure;
1524 # assert i_in.insn = x"00000040"
1525 # report "insn @" & to_hstring(i_out.nia) &
1526 # "=" & to_hstring(i_in.insn) &
1527 # " expected 00000040"
1528 # severity failure;
1529 #
1530 # i_out.req <= '0';
1531 #
1532 # std.env.finish;
1533 # end process;
1534 # end;
def icache_sim(dut):
    """Testbench stimulus: a miss, a hit, a second miss, then an access
    that aliases a reloading line, asserting each fetched insn word.

    Port naming follows the VHDL testbench: i_out is the *request*
    side (dut.i_in) and i_in is the *response* side (dut.i_out).
    """
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    # idle all inputs for a few cycles
    yield i_in.valid.eq(0)
    yield i_out.priv_mode.eq(1)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss: fetch @4, allow 30 cycles for the wishbone line reload
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    print(f"valid? {valid}")
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit: same line, next word, available after two cycles
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_in.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases: must NOT report a hit while the
    # line is still reloading
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    # BUGFIX: was "assert ~valid" -- on a plain Python int, ~0 == -1
    # and ~1 == -2 are both truthy, so that assertion could never
    # fire.  "not valid" checks what the VHDL testbench meant
    # (assert i_in.valid = '0').
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)
1611
1612
1613
def test_icache(mem):
    """Wire an ICache to a wishbone SRAM model initialised from *mem*
    and run the icache_sim stimulus under the nmigen simulator."""
    dut = ICache()

    sram_mem = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=sram_mem, granularity=8)

    m = Module()
    m.submodules.icache = dut
    m.submodules.sram = sram

    # icache wishbone master out -> SRAM slave in
    for field in ('cyc', 'stb', 'we', 'sel', 'adr'):
        m.d.comb += getattr(sram.bus, field).eq(getattr(dut.wb_out, field))
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    # SRAM responses back to the icache
    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()
1642
if __name__ == '__main__':
    # emit ilang for inspection, then run the simulation testbench
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # 512 64-bit words, each packing two 32-bit "instructions":
    # word i holds 2i in the low half and 2i+1 in the high half
    mem = [(i * 2) | ((i * 2 + 1) << 32) for i in range(512)]

    test_icache(mem)
1654