ignore /abc.history
[microwatt.git] / icache.vhdl
1 --
2 -- Set associative icache
3 --
4 -- TODO (in no specific order):
5 --
6 -- * Add debug interface to inspect cache content
7 -- * Add multi-hit error detection
8 -- * Maybe add parity ? There's a few bits free in each BRAM row on Xilinx
9 -- * Add optimization: service hits on partially loaded lines
10 -- * Add optimization: (maybe) interrupt reload on flush/redirect
11 -- * Check if playing with the geometry of the cache tags allow for more
12 -- efficient use of distributed RAM and less logic/muxes. Currently we
13 -- write TAG_BITS width which may not match full ram blocks and might
14 -- cause muxes to be inferred for "partial writes".
15 --
16 library ieee;
17 use ieee.std_logic_1164.all;
18 use ieee.numeric_std.all;
19
20 library work;
21 use work.utils.all;
22 use work.common.all;
23 use work.decode_types.all;
24 use work.wishbone_types.all;
25
26 -- 64 bit set associative icache. All instructions are 4B aligned.
27
entity icache is
    generic (
        -- When true, the cache geometry is reported once at simulation start
        SIM         : boolean  := false;
        -- Passed straight through to the instruction predecoder
        HAS_FPU     : boolean  := true;
        -- Size of a cache line, in bytes
        LINE_SIZE   : positive := 64;
        -- BRAM organisation: no access is ever wider than
        -- wishbone_data_bits, so to save resources the data array is made
        -- only that wide and a cache "line" is built from consecutive rows.
        --
        -- ROW_SIZE is the BRAM width in bytes (derived from the wishbone
        -- data width, so 64 bits)
        ROW_SIZE    : positive := wishbone_data_bits / 8;
        -- Number of cache lines held by each way
        NUM_LINES   : positive := 32;
        -- Associativity (number of ways)
        NUM_WAYS    : positive := 4;
        -- Number of entries in the direct-mapped L1 ITLB
        TLB_SIZE    : positive := 64;
        -- log2 of the page size handled by the L1 ITLB
        TLB_LG_PGSZ : positive := 12;
        -- Set to a non-zero value to enable log data collection
        LOG_LENGTH  : natural  := 0
        );
    port (
        clk          : in std_ulogic;
        rst          : in std_ulogic;

        -- Fetch request in, fetched instruction out
        i_in         : in Fetch1ToIcacheType;
        i_out        : out IcacheToDecode1Type;

        -- ITLB load / invalidate commands from the MMU
        m_in         : in MmuToIcacheType;

        stall_in     : in std_ulogic;
        stall_out    : out std_ulogic;
        flush_in     : in std_ulogic;
        inval_in     : in std_ulogic;

        -- Wishbone master port used for line reloads (reads only)
        wishbone_out : out wishbone_master_out;
        wishbone_in  : in wishbone_slave_out;

        -- Wishbone traffic watched for writes that hit cached lines
        wb_snoop_in  : in wishbone_master_out := wishbone_master_out_init;

        events       : out IcacheEventType;
        log_out      : out std_ulogic_vector(57 downto 0)
        );
end entity icache;
74
75 architecture rtl of icache is
-- Width of one BRAM row, in bits
constant ROW_SIZE_BITS : natural := 8 * ROW_SIZE;
-- Number of rows (i.e. wishbone transfers) that make up one cache line
constant ROW_PER_LINE  : natural := LINE_SIZE / ROW_SIZE;
-- Number of rows in the BRAM of one way
constant BRAM_ROWS     : natural := NUM_LINES * ROW_PER_LINE;
-- Number of 32-bit instructions held in one BRAM row
constant INSN_PER_ROW  : natural := ROW_SIZE_BITS / 32;

-- Widths of the various address bit fields

-- Bits needed to pick an instruction within a row
constant INSN_BITS     : natural := log2(INSN_PER_ROW);
-- Bits needed to pick a row within the BRAM
constant ROW_BITS      : natural := log2(BRAM_ROWS);
-- Bits needed to pick a row within a line
constant ROW_LINEBITS  : natural := log2(ROW_PER_LINE);
-- Bits of byte offset within a cache line
constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
-- Bits of byte offset within a row
constant ROW_OFF_BITS  : natural := log2(ROW_SIZE);
-- Bits needed to pick a cache line
constant INDEX_BITS    : natural := log2(NUM_LINES);
-- log2 of the number of bytes covered by one way
constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
-- Width of a stored tag: the address bits above the set, plus one extra
-- bit which records the endianness the line was loaded with
constant TAG_BITS      : natural := REAL_ADDR_BITS - SET_SIZE_BITS + 1;
-- Bits needed to pick a way; forced to at least 1 so that no
-- zero-element vectors are ever declared
constant WAY_BITS      : natural := maximum(log2(NUM_WAYS), 1);

-- Example of layout for 32 lines of 64 bytes:
--
-- ..  tag    |index|  line  |
-- ..         |   row   |    |
-- ..         |     |   | |00| zero          (2)
-- ..         |     |   |-|  | INSN_BITS     (1)
-- ..         |     |---|    | ROW_LINEBITS  (3)
-- ..         |     |--- - --| LINE_OFF_BITS (6)
-- ..         |         |- --| ROW_OFF_BITS  (3)
-- ..         |----- ---|    | ROW_BITS      (8)
-- ..         |-----|        | INDEX_BITS    (5)
-- .. --------|              | TAG_BITS      (53)

-- Index types: the "_sig_t" flavours are vectors suitable for signals,
-- the plain ones are integer ranges used to index arrays
subtype row_t         is unsigned(ROW_BITS-1 downto 0);
subtype index_t       is integer range 0 to NUM_LINES-1;
subtype index_sig_t   is unsigned(INDEX_BITS-1 downto 0);
subtype way_t         is integer range 0 to NUM_WAYS-1;
subtype way_sig_t     is unsigned(WAY_BITS-1 downto 0);
subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
126
-- Each instruction is stored as a pre-decoded 10-bit insn_code followed
-- by the bottom 26 bits of the instruction word: 36 bits in total, which
-- fits neatly into the block RAMs available on FPGAs.
-- For illegal instructions the top 4 bits are ones and the bottom 6 bits
-- of the code hold the primary opcode, so the whole instruction word can
-- still be recovered (e.g. to put in HEIR). For all other instructions
-- the primary opcode is not stored, but it could be determined from the
-- insn_code.
constant PREDECODE_BITS  : natural := 10;
constant INSN_IMAGE_BITS : natural := 26;
constant ICWORDLEN       : natural := PREDECODE_BITS + INSN_IMAGE_BITS;
constant ROW_WIDTH       : natural := INSN_PER_ROW * ICWORDLEN;

-- One row of the per-way cache data BRAM, organised as described above
subtype cache_row_t is std_ulogic_vector(ROW_WIDTH-1 downto 0);

-- The tag LUTRAM has one row per line index. Vivado will not handle the
-- clean definition of the tags as a 3d memory (kept commented out below),
-- so as a workaround the tags of all ways at one index are packed side by
-- side into a single wide vector.
subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
-- type cache_tags_set_t is array(way_t) of cache_tag_t;
-- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
type cache_tags_array_t is array(index_t) of cache_tags_set_t;

-- Valid bits: one per way for each line index, plus the per-row flags
-- that track which rows of the line being reloaded have arrived
subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
type cache_valids_t is array(index_t) of cache_way_valids_t;
type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
156
-- Tag and valid storage. The data rows live in the per-way cache_ram
-- instances (hopefully BRAM); everything declared here is meant to end up
-- in LUTs.
signal cache_tags   : cache_tags_array_t;
signal cache_valids : cache_valids_t;

attribute ram_style : string;
attribute ram_style of cache_tags : signal is "distributed";

-- L1 ITLB geometry
constant TLB_BITS        : natural := log2(TLB_SIZE);
constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
constant TLB_PTE_BITS    : natural := 64;

-- L1 ITLB storage types: a valid bit, an EA tag and a PTE per entry
subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
type tlb_valids_t is array(tlb_index_t) of std_ulogic;
subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;

signal itlb_valids : tlb_valids_t;
signal itlb_tags   : tlb_tags_t;
signal itlb_ptes   : tlb_ptes_t;
attribute ram_style of itlb_tags : signal is "distributed";
attribute ram_style of itlb_ptes : signal is "distributed";

-- Privilege bit taken from the EAA field of the PTE
signal eaa_priv : std_ulogic;
184
-- States of the line reload machine
type state_t is (IDLE, STOP_RELOAD, CLR_TAG, WAIT_ACK);

-- All registered state of the cache controller
type reg_internal_t is record
    -- Hit pipeline: the request is latched for one cycle while the BRAM
    -- read takes place
    hit_way     : way_sig_t;
    hit_nia     : std_ulogic_vector(63 downto 0);
    hit_smark   : std_ulogic;
    hit_valid   : std_ulogic;
    big_endian  : std_ulogic;
    predicted   : std_ulogic;
    pred_ntaken : std_ulogic;

    -- Miss handling: state of the reload machine
    state       : state_t;
    wb          : wishbone_master_out;
    store_way   : way_sig_t;
    store_index : index_sig_t;
    recv_row    : row_t;
    recv_valid  : std_ulogic;
    store_row   : row_t;
    store_tag   : cache_tag_t;
    store_valid : std_ulogic;
    end_row_ix  : row_in_line_t;
    rows_valid  : row_per_line_valid_t;

    -- Set when a fetch fails on a TLB miss or protection fault
    fetch_failed : std_ulogic;
end record;

signal r : reg_internal_t;

signal ev : IcacheEventType;

-- Combinational values derived from the incoming request
signal req_index   : index_sig_t;
signal req_row     : row_t;
signal req_hit_way : way_sig_t;
signal req_tag     : cache_tag_t;
signal req_is_hit  : std_ulogic;
signal req_is_miss : std_ulogic;
signal req_raddr   : real_addr_t;

-- Address translation results
signal real_addr  : real_addr_t;
signal ra_valid   : std_ulogic;
signal priv_fault : std_ulogic;
signal access_ok  : std_ulogic;

-- Interface to the per-way cache RAMs
type cache_ram_out_t is array(way_t) of cache_row_t;
signal cache_out     : cache_ram_out_t;
signal cache_wr_data : std_ulogic_vector(ROW_WIDTH - 1 downto 0);
signal wb_rd_data    : std_ulogic_vector(ROW_SIZE_BITS - 1 downto 0);

-- Victim way suggested by the PLRU logic
signal plru_victim : way_sig_t;

-- Registered results of snooping writes on the wishbone
signal snoop_valid : std_ulogic;
signal snoop_index : index_sig_t;
signal snoop_hits  : cache_way_valids_t;

-- Stored instruction word being output, for the logger
signal log_insn : std_ulogic_vector(35 downto 0);
246
247 signal log_insn : std_ulogic_vector(35 downto 0);
248
-- Extract the cache line index (which is also the tag RAM index) from an
-- address: the INDEX_BITS bits just above the line offset
function get_index(addr: std_ulogic_vector) return index_sig_t is
    variable line_ix : index_sig_t;
begin
    line_ix := unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS));
    return line_ix;
end;
254
-- Extract the data RAM row number from an address: line index together
-- with the row-within-line bits
function get_row(addr: std_ulogic_vector) return row_t is
    variable row_no : row_t;
begin
    row_no := unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS));
    return row_no;
end;
260
-- Reduce a row number to its position within the cache line, i.e. its
-- low ROW_LINEBITS bits
function get_row_of_line(row: row_t) return row_in_line_t is
    variable low_bits : row_in_line_t;
begin
    low_bits := row(ROW_LINEBITS-1 downto 0);
    return low_bits;
end;
266
-- True when the row-within-line field of a wishbone address equals "last",
-- i.e. when this address is the final row of the current reload
function is_last_row_wb_addr(wb_addr: wishbone_addr_type; last: row_in_line_t) return boolean is
    variable row_field : unsigned(LINE_OFF_BITS - ROW_OFF_BITS - 1 downto 0);
begin
    row_field := unsigned(wb_addr(LINE_OFF_BITS - ROW_OFF_BITS - 1 downto 0));
    return row_field = last;
end;
272
-- True when the given row sits at position "last" within its cache line
function is_last_row(row: row_t; last: row_in_line_t) return boolean is
begin
    return row(ROW_LINEBITS-1 downto 0) = last;
end;
278
-- Advance a wishbone address to the next row of the same cache line.
-- Only the low ROW_LINEBITS bits are incremented, so the address wraps
-- around within the line and the upper bits are never disturbed.
function next_row_wb_addr(wb_addr: wishbone_addr_type)
    return std_ulogic_vector is
    variable bumped : wishbone_addr_type;
begin
    bumped := wb_addr;
    bumped(ROW_LINEBITS - 1 downto 0) :=
        std_ulogic_vector(unsigned(wb_addr(ROW_LINEBITS - 1 downto 0)) + 1);
    return bumped;
end;
292
-- Return the next row in the current cache line. A dedicated function is
-- used so that the generated adder only spans the row-within-line bits
-- (3 bits with the default settings); the line index bits above them are
-- passed through unchanged, so the row number wraps around inside the
-- line.
function next_row(row: row_t) return row_t is
    variable result : row_t;
begin
    result := row;
    -- row_t is already unsigned, so the low field can be incremented in place
    result(ROW_LINEBITS-1 downto 0) := row(ROW_LINEBITS-1 downto 0) + 1;
    return result;
end;
307
-- Pick the stored (predecoded) instruction word addressed by "addr" out of
-- a cache row. Address bits 2 and up select the word within the row; each
-- word occupies ICWORDLEN bits. Fails if addr contains metavalues.
function read_insn_word(addr: std_ulogic_vector(63 downto 0);
                        data: cache_row_t) return std_ulogic_vector is
    variable slot : integer range 0 to INSN_PER_ROW-1;
    variable base : natural;
begin
    assert not is_X(addr) severity failure;
    slot := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
    base := slot * ICWORDLEN;
    return data(base + ICWORDLEN - 1 downto base);
end;
317
-- Build the tag for a real address: the address bits above the set, with
-- the endian flag prepended as the most significant tag bit
function get_tag(addr: real_addr_t; endian: std_ulogic) return cache_tag_t is
    variable tag_v : cache_tag_t;
begin
    tag_v := endian & addr(addr'left downto SET_SIZE_BITS);
    return tag_v;
end;
323
-- Extract the tag belonging to one way from a packed tag RAM row; way 0
-- occupies the least significant TAG_BITS bits
function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
    constant lsb : natural := way * TAG_BITS;
begin
    return tagset(lsb + TAG_BITS - 1 downto lsb);
end;
329
-- Overwrite the tag of one way inside a packed tag RAM row, leaving the
-- tags of the other ways untouched
procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
                    tag: cache_tag_t) is
    constant lsb : natural := way * TAG_BITS;
begin
    tagset(lsb + TAG_BITS - 1 downto lsb) := tag;
end;
336
-- Hash an effective address down to an index into the direct-mapped TLB
-- by XORing together the three TLB_BITS-wide fields that sit directly
-- above the page offset
function hash_ea(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
    variable field0 : std_ulogic_vector(TLB_BITS - 1 downto 0);
    variable field1 : std_ulogic_vector(TLB_BITS - 1 downto 0);
    variable field2 : std_ulogic_vector(TLB_BITS - 1 downto 0);
    variable hash   : std_ulogic_vector(TLB_BITS - 1 downto 0);
begin
    field0 := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ);
    field1 := addr(TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto TLB_LG_PGSZ + TLB_BITS);
    field2 := addr(TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto TLB_LG_PGSZ + 2 * TLB_BITS);
    hash := field0 xor field1 xor field2;
    return hash;
end;
346
347 begin
348
-- Present the wishbone read data to the predecoder. When the line being
-- reloaded is tagged big-endian (top bit of the stored tag), the bytes of
-- every 32-bit word are reversed; otherwise the data passes straight through.
process(all)
    variable src : integer;
begin
    if r.store_tag(TAG_BITS - 1) /= '0' then
        for dst in 0 to (wishbone_in.dat'length / 8) - 1 loop
            -- mirror the byte position within its 4-byte group
            src := dst + 3 - 2 * (dst mod 4);
            wb_rd_data(dst * 8 + 7 downto dst * 8) <= wishbone_in.dat(src * 8 + 7 downto src * 8);
        end loop;
    else
        wb_rd_data <= wishbone_in.dat;
    end if;
end process;
362
-- Predecoder: takes one row of (possibly byte-swapped) wishbone read data
-- holding INSN_PER_ROW instructions and produces the ROW_WIDTH-bit row
-- that gets written into the cache RAMs.
-- NOTE(review): the write into the RAM happens the cycle after the ack
-- (via r.recv_valid), so the predecoder presumably registers its output --
-- confirm against work.predecoder.
predecoder_0: entity work.predecoder
    generic map (
        HAS_FPU   => HAS_FPU,
        WIDTH     => INSN_PER_ROW,
        ICODE_LEN => PREDECODE_BITS,
        IMAGE_LEN => INSN_IMAGE_BITS
        )
    port map (
        clk        => clk,
        valid_in   => wishbone_in.ack,
        insns_in   => wb_rd_data,
        icodes_out => cache_wr_data
        );
376
-- Sanity checks on the cache geometry. All of them are fatal: the address
-- slicing helpers silently produce nonsense if any of these do not hold.
assert LINE_SIZE mod ROW_SIZE = 0
    report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
assert ispow2(LINE_SIZE)    report "LINE_SIZE not power of 2" severity FAILURE;
assert ispow2(NUM_LINES)    report "NUM_LINES not power of 2" severity FAILURE;
assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE;
assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2" severity FAILURE;
-- The bit-field widths must tile the address consistently
assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
    report "geometry bits don't add up" severity FAILURE;
assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
    report "geometry bits don't add up" severity FAILURE;
-- The "+ 1" accounts for the endian flag stored in the tag
assert (REAL_ADDR_BITS + 1 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
    report "geometry bits don't add up" severity FAILURE;
assert (REAL_ADDR_BITS + 1 = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
    report "geometry bits don't add up" severity FAILURE;
390
-- Simulation only: print the derived cache geometry once at start-up
sim_debug: if SIM generate
    debug: process
        -- Emit one "<label><value>" report line
        procedure show(label_text : string; value : natural) is
        begin
            report label_text & natural'image(value);
        end procedure;
    begin
        show("ROW_SIZE = ", ROW_SIZE);
        show("ROW_PER_LINE = ", ROW_PER_LINE);
        show("BRAM_ROWS = ", BRAM_ROWS);
        show("INSN_PER_ROW = ", INSN_PER_ROW);
        show("INSN_BITS = ", INSN_BITS);
        show("ROW_BITS = ", ROW_BITS);
        show("ROW_LINEBITS = ", ROW_LINEBITS);
        show("LINE_OFF_BITS = ", LINE_OFF_BITS);
        show("ROW_OFF_BITS = ", ROW_OFF_BITS);
        show("INDEX_BITS = ", INDEX_BITS);
        show("TAG_BITS = ", TAG_BITS);
        show("WAY_BITS = ", WAY_BITS);
        -- nothing more to do: suspend forever
        wait;
    end process;
end generate;
409
-- One data RAM per way. All ways are read in parallel at the requested
-- row; only the way currently being reloaded is written.
rams: for i in 0 to NUM_WAYS-1 generate
    signal do_read  : std_ulogic;
    signal do_write : std_ulogic;
    signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    signal dout     : cache_row_t;
    signal wr_sel   : std_ulogic_vector(0 downto 0);
begin
    -- The row is written as a single unit, hence BYTEWID = WIDTH and a
    -- one-bit write select
    way: entity work.cache_ram
        generic map (
            ROW_BITS => ROW_BITS,
            WIDTH    => ROW_WIDTH,
            BYTEWID  => ROW_WIDTH
            )
        port map (
            clk     => clk,
            rd_en   => do_read,
            rd_addr => rd_addr,
            rd_data => dout,
            wr_sel  => wr_sel,
            wr_addr => wr_addr,
            wr_data => cache_wr_data
            );

    -- Read side: hold the RAM output while the pipeline is stalled
    do_read      <= not stall_in;
    rd_addr      <= std_ulogic_vector(req_row);
    cache_out(i) <= dout;

    -- Write side: store a received row if this is the way being reloaded
    do_write  <= '1' when r.recv_valid = '1' and r.store_way = to_unsigned(i, WAY_BITS)
                 else '0';
    wr_addr   <= std_ulogic_vector(r.store_row);
    wr_sel(0) <= do_write;
end generate;
447
-- Pseudo-LRU replacement state, only needed when there is more than one
-- way. One PLRU tree (NUM_WAYS - 1 bits) is kept per line index; it is
-- updated on every hit and supplies the victim way for reloads.
maybe_plrus: if NUM_WAYS > 1 generate
    type plru_array is array(index_t) of std_ulogic_vector(NUM_WAYS - 2 downto 0);
    signal plru_ram : plru_array;
    signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0);
    signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0);
    signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
    signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin
    plru : entity work.plrufn
        generic map (
            BITS => WAY_BITS
            )
        port map (
            acc      => plru_acc,
            tree_in  => plru_cur,
            tree_out => plru_upd,
            lru      => plru_out
            );

    process(all)
    begin
        -- Wiring between the PLRU function and the rest of the cache
        plru_acc    <= std_ulogic_vector(r.hit_way);
        plru_victim <= unsigned(plru_out);

        -- Fetch the tree for the line addressed by the latched NIA;
        -- propagate X rather than indexing with a metavalue
        plru_cur <= (others => 'X');
        if not is_X(r.hit_nia) then
            plru_cur <= plru_ram(to_integer(get_index(r.hit_nia)));
        end if;
    end process;

    -- Write the updated tree back on the clock edge following a hit
    process(clk)
    begin
        if rising_edge(clk) then
            if r.hit_valid = '1' then
                assert not is_X(r.hit_nia) severity failure;
                plru_ram(to_integer(get_index(r.hit_nia))) <= plru_upd;
            end if;
        end if;
    end process;
end generate;
493
-- ITLB lookup: produce the real address for the request together with its
-- validity and the privilege check result
itlb_lookup : process(all)
    variable slot_bits : std_ulogic_vector(TLB_BITS - 1 downto 0);
    variable slot_pte  : tlb_pte_t;
    variable slot_tag  : tlb_tag_t;
begin
    -- Read the TLB entry selected by the hashed EA (X in, X out)
    slot_bits := hash_ea(i_in.nia);
    if not is_X(slot_bits) then
        slot_pte := itlb_ptes(to_integer(unsigned(slot_bits)));
        slot_tag := itlb_tags(to_integer(unsigned(slot_bits)));
    else
        slot_pte := (others => 'X');
        slot_tag := (others => 'X');
    end if;

    if i_in.virt_mode /= '1' then
        -- Real mode: no translation, always valid, treated as privileged
        real_addr <= addr_to_real(i_in.nia);
        ra_valid  <= '1';
        eaa_priv  <= '1';
    else
        -- Virtual mode: page frame from the PTE, page offset from the EA
        real_addr <= slot_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
                     i_in.nia(TLB_LG_PGSZ - 1 downto 0);
        eaa_priv  <= slot_pte(3);
        if slot_tag /= i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
            -- EA tag mismatch: TLB miss
            ra_valid <= '0';
        elsif is_X(slot_bits) then
            ra_valid <= 'X';
        else
            ra_valid <= itlb_valids(to_integer(unsigned(slot_bits)));
        end if;
    end if;

    -- no IAMR, so no KUEP support for now
    priv_fault <= eaa_priv and not i_in.priv_mode;
    access_ok  <= ra_valid and not priv_fault;
end process;
531
-- ITLB maintenance, driven by the MMU: invalidate everything, invalidate
-- one entry, or load a new translation
itlb_update: process(clk)
    variable slot_bits : std_ulogic_vector(TLB_BITS - 1 downto 0);
begin
    if rising_edge(clk) then
        slot_bits := hash_ea(m_in.addr);
        if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then
            -- Reset or invalidate-all: drop every entry
            itlb_valids <= (others => '0');
        elsif m_in.tlbie = '1' then
            assert not is_X(slot_bits) report "icache index invalid on write" severity FAILURE;
            -- Single-entry invalidate: clear the slot the address hashes
            -- to, whether or not it actually holds that address
            itlb_valids(to_integer(unsigned(slot_bits))) <= '0';
        elsif m_in.tlbld = '1' then
            assert not is_X(slot_bits) report "icache index invalid on write" severity FAILURE;
            -- Load: install EA tag and PTE, then mark the slot valid
            itlb_tags(to_integer(unsigned(slot_bits))) <= m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
            itlb_ptes(to_integer(unsigned(slot_bits))) <= m_in.pte;
            itlb_valids(to_integer(unsigned(slot_bits))) <= '1';
        end if;
        ev.itlb_miss_resolved <= m_in.tlbld and not rst;
    end if;
end process;
556
-- Combinational part of the cache: hit detection for the incoming request,
-- instruction output for the previously latched hit, and stall generation
icache_comb : process(all)
    variable hit_found : std_ulogic;
    variable way_found : way_sig_t;
    variable word      : std_ulogic_vector(ICWORDLEN - 1 downto 0);
    variable code      : insn_code;
begin
    -- Split the request into line index, data row and tag
    req_index <= get_index(i_in.nia);
    req_row   <= get_row(i_in.nia);
    req_tag   <= get_tag(real_addr, i_in.big_endian);

    -- Real address of the start of the requested row; this is where a
    -- reload begins if the request turns out to be a miss
    req_raddr <= real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
                 (ROW_OFF_BITS-1 downto 0 => '0');

    -- Look for a matching way. A way qualifies if its line is valid, or
    -- if it is the line currently being reloaded and the requested row
    -- has already been received.
    way_found := to_unsigned(0, WAY_BITS);
    hit_found := '0';
    if i_in.req = '1' then
        assert not is_X(req_index) and not is_X(req_row) severity failure;
        for w in way_t loop
            if cache_valids(to_integer(req_index))(w) = '1' or
                (r.state = WAIT_ACK and
                 req_index = r.store_index and
                 to_unsigned(w, WAY_BITS) = r.store_way and
                 r.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) = '1') then
                if read_tag(w, cache_tags(to_integer(req_index))) = req_tag then
                    way_found := to_unsigned(w, WAY_BITS);
                    hit_found := '1';
                end if;
            end if;
        end loop;
    end if;

    -- Hit / miss indications for the clocked processes; both are
    -- suppressed when the access is not permitted or is being flushed
    req_is_hit  <= '0';
    req_is_miss <= '0';
    if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' then
        req_is_hit  <= hit_found;
        req_is_miss <= not hit_found;
    end if;
    req_hit_way <= way_found;

    -- Output the instruction from the row read for the latched hit.
    --
    -- Note: This is a mild violation of our design principle of having
    -- pipeline stages output from a clean latch, since what is output
    -- here is the result of a mux. The alternative would be to output an
    -- entire row, which would force the next stage to know about some of
    -- the cache geometry.
    word := (others => '0');
    code := INSN_illegal;
    if r.hit_valid = '1' then
        assert not is_X(r.hit_way) severity failure;
        word := read_insn_word(r.hit_nia, cache_out(to_integer(r.hit_way)));
        -- Only the top bit is used to flag illegal instructions, since
        -- insn_codes are known to fit into 9 bits.
        if is_X(word) then
            word := (others => '0');
        elsif word(ICWORDLEN - 1) = '0' then
            code := insn_code'val(to_integer(unsigned(word(ICWORDLEN-1 downto INSN_IMAGE_BITS))));
        end if;
    end if;
    log_insn    <= word;
    i_out.insn  <= word(31 downto 0);
    i_out.icode <= code;

    -- The remaining outputs come straight from the hit registers
    i_out.valid            <= r.hit_valid;
    i_out.nia              <= r.hit_nia;
    i_out.stop_mark        <= r.hit_smark;
    i_out.fetch_failed     <= r.fetch_failed;
    i_out.big_endian       <= r.big_endian;
    i_out.next_predicted   <= r.predicted;
    i_out.next_pred_ntaken <= r.pred_ntaken;

    -- Stall fetch1 on a cache miss, a TLB miss or a protection fault
    stall_out <= not (hit_found and access_ok);

    -- The wishbone master outputs are driven by the reload machine
    wishbone_out <= r.wb;
end process;
643
-- Clocked half of the hit path: latches the request while the BRAM read
-- is in flight
icache_hit : process(clk)
begin
    if rising_edge(clk) then
        if stall_in /= '1' then
            -- Not stalled: remember whether this request hit, so that next
            -- cycle the data on cache_out of the hit way can be output
            r.hit_valid <= req_is_hit;
            if req_is_hit = '1' then
                r.hit_way <= req_hit_way;
                -- this is a bit fragile but better than propagating bad values
                assert not is_X(i_in.nia) report "metavalue in NIA" severity FAILURE;

                report "cache hit nia:" & to_hstring(i_in.nia) &
                    " IR:" & std_ulogic'image(i_in.virt_mode) &
                    " SM:" & std_ulogic'image(i_in.stop_mark) &
                    " idx:" & to_hstring(req_index) &
                    " tag:" & to_hstring(req_tag) &
                    " way:" & to_hstring(req_hit_way) &
                    " RA:" & to_hstring(real_addr);
            end if;
        elsif rst = '1' or flush_in = '1' then
            -- Stalled: outputs are held, except that a flush or reset
            -- drops the valid flag
            r.hit_valid <= '0';
        end if;

        if stall_in = '0' then
            -- Stop mark, NIA and friends travel down the pipe whether or
            -- not the access hit
            r.hit_smark   <= i_in.stop_mark;
            r.hit_nia     <= i_in.nia;
            r.big_endian  <= i_in.big_endian;
            r.predicted   <= i_in.predicted;
            r.pred_ntaken <= i_in.pred_ntaken;
        end if;

        -- A valid output must never carry metavalues
        if i_out.valid = '1' then
            assert not is_X(i_out.insn) severity failure;
        end if;
    end if;
end process;
686
-- Cache miss/reload synchronous machine.
--
-- Besides the reload state machine this process also owns the valid bits
-- (reset, explicit invalidation and snoop invalidation) and the
-- fetch_failed flag.
--
-- Reload states:
--   IDLE        wait for a miss, latch the target and start the wishbone cycle
--   CLR_TAG     pick the victim way, invalidate it and write the new tag
--               (requests and acks are already being handled in this state)
--   WAIT_ACK    issue the remaining row reads and store the rows as they
--               arrive; return to IDLE once the last row has been stored
--   STOP_RELOAD entered on inval_in; drain outstanding acks, then IDLE
icache_miss : process(clk)
    variable tagset : cache_tags_set_t;
    variable tag : cache_tag_t;
    variable snoop_addr : real_addr_t;
    variable snoop_tag : cache_tag_t;
    variable snoop_cache_tags : cache_tags_set_t;
    variable snoop_wr : std_ulogic;
    variable replace_way : way_sig_t;
begin
    if rising_edge(clk) then
        ev.icache_miss <= '0';
        r.recv_valid <= '0';
        -- On reset, clear all valid bits to force misses
        if rst = '1' then
            for i in index_t loop
                cache_valids(i) <= (others => '0');
            end loop;
            r.state <= IDLE;
            r.wb.cyc <= '0';
            r.wb.stb <= '0';

            -- We only ever do reads on wishbone
            r.wb.dat <= (others => '0');
            r.wb.sel <= "11111111";
            r.wb.we <= '0';

            -- Not useful normally but helps avoiding tons of sim warnings
            r.wb.adr <= (others => '0');

            snoop_valid <= '0';
            snoop_index <= to_unsigned(0, INDEX_BITS);
            snoop_hits <= (others => '0');
        else
            -- Detect snooped writes and decode address into index and tag
            -- Since we never write, any write should be snooped.
            --
            -- The tag comparison must be qualified by the write strobe of
            -- the SAME cycle as the address it is applied to. The
            -- registered snoop_valid reflects the previous cycle, so the
            -- current-cycle strobe is kept in a variable for that purpose.
            snoop_wr := wb_snoop_in.cyc and wb_snoop_in.stb and wb_snoop_in.we;
            snoop_valid <= snoop_wr;
            snoop_addr := addr_to_real(wb_to_addr(wb_snoop_in.adr));
            snoop_index <= get_index(snoop_addr);
            snoop_tag := get_tag(snoop_addr, '0');
            snoop_hits <= (others => '0');
            if snoop_wr = '1' then
                if is_X(snoop_addr) then
                    report "metavalue in snoop_addr" severity FAILURE;
                end if;
                snoop_cache_tags := cache_tags(to_integer(get_index(snoop_addr)));
                for i in way_t loop
                    tag := read_tag(i, snoop_cache_tags);
                    -- Ignore endian bit in comparison
                    tag(TAG_BITS - 1) := '0';
                    if tag = snoop_tag then
                        snoop_hits(i) <= '1';
                    end if;
                end loop;
            end if;

            -- Process cache invalidations
            if inval_in = '1' then
                for i in index_t loop
                    cache_valids(i) <= (others => '0');
                end loop;
                r.store_valid <= '0';
            else
                -- Do invalidations from snooped stores to memory, one
                -- cycle after the address appears on wb_snoop_in.
                for i in way_t loop
                    if snoop_hits(i) = '1' then
                        assert not is_X(snoop_index) severity failure;
                        cache_valids(to_integer(snoop_index))(i) <= '0';
                    end if;
                end loop;
            end if;

            -- Main state machine
            case r.state is
                when IDLE =>
                    -- Reset per-row valid flags, only used in WAIT_ACK
                    for i in 0 to ROW_PER_LINE - 1 loop
                        r.rows_valid(i) <= '0';
                    end loop;

                    -- We need to read a cache line
                    if req_is_miss = '1' then
                        report "cache miss nia:" & to_hstring(i_in.nia) &
                            " IR:" & std_ulogic'image(i_in.virt_mode) &
                            " SM:" & std_ulogic'image(i_in.stop_mark) &
                            " idx:" & to_hstring(req_index) &
                            " tag:" & to_hstring(req_tag) &
                            " RA:" & to_hstring(real_addr);
                        ev.icache_miss <= '1';

                        -- Keep track of our index and way for subsequent stores
                        r.store_index <= req_index;
                        r.recv_row <= get_row(req_raddr);
                        r.store_row <= get_row(req_raddr);
                        r.store_tag <= req_tag;
                        r.store_valid <= '1';
                        -- The reload starts at the requested row and wraps
                        -- around within the line, so the last row fetched
                        -- is the one just before the requested row.
                        r.end_row_ix <= get_row_of_line(get_row(req_raddr)) - 1;

                        -- Prep for first wishbone read: start the WB cycle
                        -- at the row containing the requested address.
                        r.wb.adr <= addr_to_wb(req_raddr);
                        r.wb.cyc <= '1';
                        r.wb.stb <= '1';

                        -- The first request is now on the bus; the tag
                        -- update is done next
                        r.state <= CLR_TAG;
                    end if;

                when CLR_TAG | WAIT_ACK =>
                    assert not is_X(r.store_index) severity failure;
                    assert not is_X(r.store_row) severity failure;
                    assert not is_X(r.recv_row) severity failure;
                    if r.state = CLR_TAG then
                        replace_way := to_unsigned(0, WAY_BITS);
                        if NUM_WAYS > 1 then
                            -- Get victim way from plru
                            replace_way := plru_victim;
                        end if;
                        r.store_way <= replace_way;

                        -- Force misses on that way while reloading that line
                        assert not is_X(replace_way) severity failure;
                        cache_valids(to_integer(r.store_index))(to_integer(replace_way)) <= '0';

                        -- Store new tag in selected way
                        for i in 0 to NUM_WAYS-1 loop
                            if to_unsigned(i, WAY_BITS) = replace_way then
                                tagset := cache_tags(to_integer(r.store_index));
                                write_tag(i, tagset, r.store_tag);
                                cache_tags(to_integer(r.store_index)) <= tagset;
                            end if;
                        end loop;

                        r.state <= WAIT_ACK;
                    end if;

                    -- If we are writing in this cycle, mark row valid and see if we are done
                    if r.recv_valid = '1' then
                        r.rows_valid(to_integer(r.store_row(ROW_LINEBITS-1 downto 0))) <= not inval_in;
                        if is_last_row(r.store_row, r.end_row_ix) then
                            -- Cache line is now valid
                            cache_valids(to_integer(r.store_index))(to_integer(r.store_way)) <=
                                r.store_valid and not inval_in;
                            -- We are done
                            r.state <= IDLE;
                        end if;
                        -- Advance the store row: recv_row was already
                        -- incremented when this row's ack was received
                        r.store_row <= r.recv_row;
                    end if;

                    -- If we are still sending requests, was one accepted ?
                    if wishbone_in.stall = '0' and r.wb.stb = '1' then
                        -- That was the last word ? We are done sending. Clear stb.
                        --
                        if is_last_row_wb_addr(r.wb.adr, r.end_row_ix) then
                            r.wb.stb <= '0';
                        end if;

                        -- Calculate the next row address
                        r.wb.adr <= next_row_wb_addr(r.wb.adr);
                    end if;

                    -- Abort reload if we get an invalidation
                    if inval_in = '1' then
                        r.wb.stb <= '0';
                        r.state <= STOP_RELOAD;
                    end if;

                    -- Incoming acks processing
                    if wishbone_in.ack = '1' then
                        -- Check for completion
                        if is_last_row(r.recv_row, r.end_row_ix) then
                            -- Complete wishbone cycle
                            r.wb.cyc <= '0';
                        end if;
                        r.recv_valid <= '1';

                        -- Increment receive row counter
                        r.recv_row <= next_row(r.recv_row);
                    end if;

                when STOP_RELOAD =>
                    -- Wait for all outstanding requests to be satisfied, then
                    -- go to IDLE state. The reload is drained once the
                    -- receive row has caught up with the next row that
                    -- would have been requested.
                    if get_row_of_line(r.recv_row) = get_row_of_line(get_row(wb_to_addr(r.wb.adr))) then
                        r.wb.cyc <= '0';
                        r.state <= IDLE;
                    end if;
                    if wishbone_in.ack = '1' then
                        -- Increment receive row counter
                        r.recv_row <= next_row(r.recv_row);
                    end if;
            end case;
        end if;

        -- TLB miss and protection fault processing
        if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
            r.fetch_failed <= '0';
        elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
            r.fetch_failed <= '1';
        end if;
    end if;
end process;
891
-- Optional logging: when enabled, a 58-bit snapshot of the cache state is
-- registered every cycle and driven onto log_out
icache_log: if LOG_LENGTH > 0 generate
    signal log_data : std_ulogic_vector(57 downto 0);
begin
    data_log: process(clk)
        variable way_v : way_sig_t;
        variable busy  : std_ulogic;
    begin
        if rising_edge(clk) then
            way_v := req_hit_way;
            -- single bit telling whether the reload machine is active
            if r.state = IDLE then
                busy := '0';
            else
                busy := '1';
            end if;
            log_data <= i_out.valid &
                        log_insn &
                        wishbone_in.ack &
                        r.wb.adr(2 downto 0) &
                        r.wb.stb & r.wb.cyc &
                        wishbone_in.stall &
                        stall_out &
                        r.fetch_failed &
                        r.hit_nia(5 downto 2) &
                        busy &
                        std_ulogic_vector(resize(way_v, 3)) &
                        req_is_hit & req_is_miss &
                        access_ok &
                        ra_valid;
        end if;
    end process;
    log_out <= log_data;
end generate;

-- Performance events gathered by the clocked processes
events <= ev;
926
927 end;