Merge remote-tracking branch 'to-be-merged/merge-3d-game'
[microwatt.git] / icache.vhdl
1 --
2 -- Set associative icache
3 --
4 -- TODO (in no specific order):
5 --
6 -- * Add debug interface to inspect cache content
7 -- * Add snoop/invalidate path
8 -- * Add multi-hit error detection
9 -- * Pipelined bus interface (wb or axi)
10 -- * Maybe add parity ? There's a few bits free in each BRAM row on Xilinx
11 -- * Add optimization: service hits on partially loaded lines
-- * Add optimization: (maybe) interrupt reload on flush/redirect
13 -- * Check if playing with the geometry of the cache tags allow for more
14 -- efficient use of distributed RAM and less logic/muxes. Currently we
15 -- write TAG_BITS width which may not match full ram blocks and might
16 -- cause muxes to be inferred for "partial writes".
17 -- * Check if making the read size of PLRU a ROM helps utilization
18 --
19 library ieee;
20 use ieee.std_logic_1164.all;
21 use ieee.numeric_std.all;
22
23 library work;
24 use work.utils.all;
25 use work.common.all;
26 use work.wishbone_types.all;
27
-- 64 bit set associative icache (NUM_WAYS ways). All instructions are 4B aligned.
29
-- Set associative instruction cache with a small direct-mapped L1 ITLB.
--
-- The generics fix the cache geometry and the ITLB size; the ports connect
-- the cache to the fetch pipeline, the MMU and the wishbone bus.
entity icache is
    generic (
        -- When true, the derived geometry constants are reported once at
        -- the start of simulation
        SIM : boolean := false;
        -- Line size in bytes
        LINE_SIZE : positive := 64;
        -- BRAM organisation: We never access more than wishbone_data_bits at
        -- a time so to save resources we make the array only that wide, and
        -- use consecutive indices to make a cache "line"
        --
        -- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
        ROW_SIZE : positive := wishbone_data_bits / 8;
        -- Number of lines in a set (i.e. per way: each way's data RAM holds
        -- NUM_LINES * ROW_PER_LINE rows)
        NUM_LINES : positive := 32;
        -- Number of ways
        NUM_WAYS : positive := 4;
        -- L1 ITLB number of entries (direct mapped)
        TLB_SIZE : positive := 64;
        -- L1 ITLB log_2(page_size)
        TLB_LG_PGSZ : positive := 12;
        -- Non-zero to enable log data collection
        LOG_LENGTH : natural := 0
        );
    port (
        clk : in std_ulogic;
        rst : in std_ulogic;

        -- Fetch request in (nia, req, virt_mode, priv_mode, big_endian, ...)
        -- and instruction/status out to the next pipeline stage
        i_in : in Fetch1ToIcacheType;
        i_out : out IcacheToDecode1Type;

        -- ITLB maintenance from the MMU (tlbie, doall, tlbld, addr, pte)
        m_in : in MmuToIcacheType;

        -- stall_in freezes the hit outputs; stall_out is raised whenever the
        -- current request is not a hit with a permitted translation
        stall_in : in std_ulogic;
        stall_out : out std_ulogic;
        -- flush_in cancels the pending hit/miss and clears fetch_failed
        flush_in : in std_ulogic;
        -- inval_in clears every cache valid bit and aborts a reload
        inval_in : in std_ulogic;

        -- Wishbone master used for line reloads (reads only)
        wishbone_out : out wishbone_master_out;
        wishbone_in : in wishbone_slave_out;

        -- Observed wishbone traffic; writes seen here invalidate matching lines
        wb_snoop_in : in wishbone_master_out := wishbone_master_out_init;

        events : out IcacheEventType;
        -- Only driven when LOG_LENGTH > 0
        log_out : out std_ulogic_vector(53 downto 0)
        );
end entity icache;
75
76 architecture rtl of icache is
-- Derived geometry constants. All of them follow from the LINE_SIZE,
-- ROW_SIZE, NUM_LINES and NUM_WAYS generics; their consistency is checked
-- by the assertions at the start of the architecture body.

-- ROW_SIZE_BITS is the width in bits of one BRAM row
constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
-- ROW_PER_LINE is the number of row (wishbone transactions) in a line
constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
-- BRAM_ROWS is the number of rows in BRAM needed to represent the full
-- icache (per way: one cache_ram of this depth is instantiated for each way)
constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
-- Bit fields counts in the address

-- INSN_BITS is the number of bits to select an instruction in a row
constant INSN_BITS : natural := log2(INSN_PER_ROW);
-- ROW_BITS is the number of bits to select a row
constant ROW_BITS : natural := log2(BRAM_ROWS);
-- ROW_LINEBITS is the number of bits to select a row within a line
constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
-- LINE_OFF_BITS is the number of bits for the offset in a cache line
constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
-- ROW_OFF_BITS is the number of bits for the offset in a row
constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
-- INDEX_BITS is the number of bits to select a cache line
constant INDEX_BITS : natural := log2(NUM_LINES);
-- SET_SIZE_BITS is the log base 2 of the set size
-- (line offset bits plus index bits, i.e. the bytes covered by one way)
constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
-- TAG_BITS is the number of bits of the tag part of the address
-- the +1 is to allow the endianness to be stored in the tag
constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS + 1;
-- WAY_BITS is the number of bits to select a way
constant WAY_BITS : natural := log2(NUM_WAYS);
106
-- Example of layout for 32 lines of 64 bytes:
--
-- ..  tag    |index|  line  |
-- ..         |   row   |    |
-- ..         |     |   | |00| zero          (2)
-- ..         |     |   |-|  | INSN_BITS     (1)
-- ..         |     |---|    | ROW_LINEBITS  (3)
-- ..         |     |--- - --| LINE_OFF_BITS (6)
-- ..         |         |- --| ROW_OFF_BITS  (3)
-- ..         |----- ---|    | ROW_BITS      (8)
-- ..         |-----|        | INDEX_BITS    (5)
-- .. --------|              | TAG_BITS      (53)
119
-- Integer index types: a row of a way's data RAM, a cache line (also the
-- index into the tag and valid arrays), and a way.
subtype row_t is integer range 0 to BRAM_ROWS-1;
subtype index_t is integer range 0 to NUM_LINES-1;
subtype way_t is integer range 0 to NUM_WAYS-1;
-- Position of a row inside its cache line
subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);

-- The cache data BRAM organized as described above for each way
subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);

-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-- not handle a clean (commented) definition of the cache tags as a 3d
-- memory. For now, work around it by putting all the tags of one index
-- side by side in a single TAG_RAM_WIDTH-bit vector; way N occupies bits
-- (N+1)*TAG_BITS-1 downto N*TAG_BITS (see read_tag / write_tag).
subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
-- type cache_tags_set_t is array(way_t) of cache_tag_t;
-- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
type cache_tags_array_t is array(index_t) of cache_tags_set_t;

-- The cache valid bits: one bit per way for every line index
subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
type cache_valids_t is array(index_t) of cache_way_valids_t;
-- One flag per row of the line currently being reloaded
type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;

-- Storage. The instruction data itself lives in the per-way cache_ram
-- instances (hopefully BRAM); the tags and valid bits declared here are
-- meant to end up in LUTs.
signal cache_tags : cache_tags_array_t;
signal cache_valids : cache_valids_t;

-- Ask the synthesis tool for distributed (LUT) RAM for the tags
attribute ram_style : string;
attribute ram_style of cache_tags : signal is "distributed";
149
-- L1 ITLB.
-- Direct mapped, TLB_SIZE entries, indexed by hash_ea() of the effective
-- address. Each entry holds an EA tag, a PTE and a valid bit.
constant TLB_BITS : natural := log2(TLB_SIZE);
-- EA bits above the page offset and the TLB_BITS field, stored as the tag
constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
constant TLB_PTE_BITS : natural := 64;

subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
type tlb_valids_t is array(tlb_index_t) of std_ulogic;
subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;

-- ITLB storage; tags and PTEs are requested as distributed (LUT) RAM
signal itlb_valids : tlb_valids_t;
signal itlb_tags : tlb_tags_t;
signal itlb_ptes : tlb_ptes_t;
attribute ram_style of itlb_tags : signal is "distributed";
attribute ram_style of itlb_ptes : signal is "distributed";

-- Privilege bit from PTE EAA field
-- (PTE bit 3 in virtual mode, forced to '1' in real mode; see itlb_lookup)
signal eaa_priv : std_ulogic;
170
-- Cache reload state machine
--   IDLE        : no reload in progress
--   CLR_TAG     : first reload cycle; victim way chosen, tag rewritten
--   WAIT_ACK    : requests being issued / acks being collected
--   STOP_RELOAD : reload aborted by inval_in, draining outstanding acks
type state_t is (IDLE, STOP_RELOAD, CLR_TAG, WAIT_ACK);

-- All registered state of the cache, written by the icache_hit and
-- icache_miss processes.
type reg_internal_t is record
    -- Cache hit state (Latches for 1 cycle BRAM access)
    hit_way : way_t;
    hit_nia : std_ulogic_vector(63 downto 0);
    hit_smark : std_ulogic;
    hit_valid : std_ulogic;
    big_endian: std_ulogic;

    -- Cache miss state (reload state machine)
    state : state_t;
    -- Registered wishbone master outputs
    wb : wishbone_master_out;
    -- Way, line index, next row to be written and tag of the line being reloaded
    store_way : way_t;
    store_index : index_t;
    store_row : row_t;
    store_tag : cache_tag_t;
    -- Cleared by inval_in so a line whose reload was overtaken by an
    -- invalidation is not marked valid
    store_valid : std_ulogic;
    -- Row-in-line index of the last row of the reload
    end_row_ix : row_in_line_t;
    -- Rows of the line being reloaded that have already been received
    rows_valid : row_per_line_valid_t;

    -- TLB miss state
    fetch_failed : std_ulogic;
end record;

signal r : reg_internal_t;
198
-- Event strobes (icache_miss, itlb_miss_resolved) set by the clocked processes
signal ev : IcacheEventType;

-- Async signals on incoming request
signal req_index : index_t;
signal req_row : row_t;
signal req_hit_way : way_t;
signal req_tag : cache_tag_t;
signal req_is_hit : std_ulogic;
signal req_is_miss : std_ulogic;
-- Real address of the request, rounded down to the start of its row
signal req_raddr : real_addr_t;

-- ITLB lookup results for the incoming request
signal tlb_req_index : tlb_index_t;
signal real_addr : real_addr_t;
signal ra_valid : std_ulogic;
signal priv_fault : std_ulogic;
-- Translation valid and no privilege fault
signal access_ok : std_ulogic;

-- Cache RAM interface
-- Read data of every way for the row addressed in the previous cycle
type cache_ram_out_t is array(way_t) of cache_row_t;
signal cache_out : cache_ram_out_t;

-- PLRU output interface
-- plru_victim holds the victim way proposed for each line index;
-- replace_way is the way being (or about to be) reloaded
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_victim : plru_out_t;
signal replace_way : way_t;

-- Memory write snoop signals
-- Registered in icache_miss; used one cycle later to invalidate matching ways
signal snoop_valid : std_ulogic;
signal snoop_index : index_t;
signal snoop_hits : cache_way_valids_t;
229
-- Extract the cache line index from an address. The index is the field
-- addr(SET_SIZE_BITS-1 downto LINE_OFF_BITS) and is also used to address
-- the tag and valid arrays.
function get_index(addr: std_ulogic_vector) return index_t is
    variable index_field : unsigned(SET_SIZE_BITS - LINE_OFF_BITS - 1 downto 0);
begin
    index_field := unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS));
    return to_integer(index_field);
end;
235
-- Extract the data RAM row number from an address: the field
-- addr(SET_SIZE_BITS-1 downto ROW_OFF_BITS), i.e. line index followed by
-- the row-in-line bits.
function get_row(addr: std_ulogic_vector) return row_t is
    variable row_field : unsigned(SET_SIZE_BITS - ROW_OFF_BITS - 1 downto 0);
begin
    row_field := unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS));
    return to_integer(row_field);
end;
241
-- Reduce a data RAM row number to its position inside the cache line,
-- i.e. the low ROW_LINEBITS bits of the row number.
function get_row_of_line(row: row_t) return row_in_line_t is
    constant row_as_bits : unsigned(ROW_BITS-1 downto 0) := to_unsigned(row, ROW_BITS);
begin
    return row_as_bits(ROW_LINEBITS-1 downto 0);
end;
249
-- Tell whether a wishbone address points at the row-in-line index `last`,
-- i.e. whether it is the final row to request for the current reload.
-- Only the low (LINE_OFF_BITS - ROW_OFF_BITS) bits of wb_addr are compared.
function is_last_row_wb_addr(wb_addr: wishbone_addr_type; last: row_in_line_t) return boolean is
    variable addr_row : unsigned(LINE_OFF_BITS - ROW_OFF_BITS - 1 downto 0);
begin
    addr_row := unsigned(wb_addr(LINE_OFF_BITS - ROW_OFF_BITS - 1 downto 0));
    return addr_row = last;
end;
255
-- Tell whether data RAM row `row` sits at row-in-line index `last`,
-- i.e. whether it is the final row to be stored for the current reload.
function is_last_row(row: row_t; last: row_in_line_t) return boolean is
    variable row_in_line : row_in_line_t;
begin
    row_in_line := get_row_of_line(row);
    return row_in_line = last;
end;
261
-- Advance a wishbone address to the next row of the same cache line.
-- Only the low ROW_LINEBITS bits are incremented (with wrap-around); all
-- higher address bits are passed through unchanged.
function next_row_wb_addr(wb_addr: wishbone_addr_type)
    return std_ulogic_vector is
    variable next_addr : wishbone_addr_type;
    variable row_count : unsigned(ROW_LINEBITS-1 downto 0);
begin
    next_addr := wb_addr;
    row_count := unsigned(wb_addr(ROW_LINEBITS - 1 downto 0)) + 1;
    next_addr(ROW_LINEBITS - 1 downto 0) := std_ulogic_vector(row_count);
    return next_addr;
end;
275
-- Return the next row in the current cache line. We use a dedicated
-- function in order to limit the size of the generated adder to be
-- only the bits within a cache line (3 bits with default settings).
--
-- The low ROW_LINEBITS bits of the row number are incremented with
-- wrap-around; the line index bits above them are left untouched, so the
-- result always stays inside the same cache line.
function next_row(row: row_t) return row_t is
    variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
    variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
begin
    row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
    row_idx := row_v(ROW_LINEBITS-1 downto 0);
    row_v(ROW_LINEBITS-1 downto 0) := std_ulogic_vector(unsigned(row_idx) + 1);
    return to_integer(unsigned(row_v));
end;
290
-- Pick the 32-bit instruction addressed by `addr` out of a cache row.
-- Address bits INSN_BITS+1 downto 2 select the word; word 0 occupies the
-- least significant 32 bits of the row.
function read_insn_word(addr: std_ulogic_vector(63 downto 0);
                        data: cache_row_t) return std_ulogic_vector is
    variable slot : integer range 0 to INSN_PER_ROW-1;
    variable lsb : natural;
begin
    slot := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
    lsb := slot * 32;
    return data(lsb + 31 downto lsb);
end;
299
-- Build the tag for a real address: the endian flag goes in the top bit
-- (TAG_BITS-1) and the address bits above SET_SIZE_BITS fill the rest.
function get_tag(addr: real_addr_t; endian: std_ulogic) return cache_tag_t is
    variable tag : cache_tag_t;
begin
    tag := endian & addr(addr'left downto SET_SIZE_BITS);
    return tag;
end;
305
-- Extract the tag of one way from a packed tag row. Way N lives in bits
-- (N+1)*TAG_BITS-1 downto N*TAG_BITS of the row.
function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
    constant base : natural := way * TAG_BITS;
begin
    return tagset(base + TAG_BITS - 1 downto base);
end;
311
-- Overwrite the tag of one way inside a packed tag row, leaving the tags
-- of the other ways untouched. Uses the same bit layout as read_tag.
procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
                    tag: cache_tag_t) is
    constant base : natural := way * TAG_BITS;
begin
    tagset(base + TAG_BITS - 1 downto base) := tag;
end;
318
-- Simple hash for direct-mapped TLB index: XOR together the three
-- consecutive TLB_BITS-wide fields of the address that start just above
-- the page offset (bit TLB_LG_PGSZ).
function hash_ea(addr: std_ulogic_vector(63 downto 0)) return tlb_index_t is
    variable folded : std_ulogic_vector(TLB_BITS - 1 downto 0);
begin
    folded := (others => '0');
    for field in 0 to 2 loop
        folded := folded xor
                  addr(TLB_LG_PGSZ + (field + 1) * TLB_BITS - 1 downto
                       TLB_LG_PGSZ + field * TLB_BITS);
    end loop;
    return to_integer(unsigned(folded));
end;
328 begin
329
-- Elaboration-time sanity checks on the geometry generics and the
-- constants derived from them. Every check is fatal: the address slicing
-- helpers above are only correct when all of these hold.
assert LINE_SIZE mod ROW_SIZE = 0
    report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE;
assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2" severity FAILURE;
assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
    report "geometry bits don't add up" severity FAILURE;
assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
    report "geometry bits don't add up" severity FAILURE;
-- The +1 accounts for the endian bit stored in the tag
assert (REAL_ADDR_BITS + 1 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
    report "geometry bits don't add up" severity FAILURE;
assert (REAL_ADDR_BITS + 1 = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
    report "geometry bits don't add up" severity FAILURE;
343
-- Simulation aid: when SIM is set, print every derived geometry constant
-- once at time zero and then suspend forever.
sim_debug: if SIM generate
    debug: process
        -- Emit one "<name> = <value>" report line
        procedure show(prefix : string; value : natural) is
        begin
            report prefix & natural'image(value);
        end procedure;
    begin
        show("ROW_SIZE = ", ROW_SIZE);
        show("ROW_PER_LINE = ", ROW_PER_LINE);
        show("BRAM_ROWS = ", BRAM_ROWS);
        show("INSN_PER_ROW = ", INSN_PER_ROW);
        show("INSN_BITS = ", INSN_BITS);
        show("ROW_BITS = ", ROW_BITS);
        show("ROW_LINEBITS = ", ROW_LINEBITS);
        show("LINE_OFF_BITS = ", LINE_OFF_BITS);
        show("ROW_OFF_BITS = ", ROW_OFF_BITS);
        show("INDEX_BITS = ", INDEX_BITS);
        show("TAG_BITS = ", TAG_BITS);
        show("WAY_BITS = ", WAY_BITS);
        wait;
    end process;
end generate;
362
-- One data RAM per way. Every way is read at the row of the incoming
-- request; only the way selected by replace_way is written, with the
-- wishbone read data, when an ack arrives.
rams: for i in 0 to NUM_WAYS-1 generate
    signal do_read : std_ulogic;
    signal do_write : std_ulogic;
    signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
    signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
    signal dout : cache_row_t;
    signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
    signal wr_dat : std_ulogic_vector(wishbone_in.dat'left downto 0);
begin
    way: entity work.cache_ram
        generic map (
            ROW_BITS => ROW_BITS,
            WIDTH => ROW_SIZE_BITS
            )
        port map (
            clk => clk,
            rd_en => do_read,
            rd_addr => rd_addr,
            rd_data => dout,
            wr_sel => wr_sel,
            wr_addr => wr_addr,
            wr_data => wr_dat
            );

    process(all)
        variable src_byte : integer;
    begin
        -- Read side: follow the incoming request unless stalled
        rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
        do_read <= not stall_in;
        cache_out(i) <= dout;

        -- Write side: store the acked row into the way being reloaded
        wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
        if wishbone_in.ack = '1' and replace_way = i then
            do_write <= '1';
        else
            do_write <= '0';
        end if;
        -- All byte enables follow do_write (whole-row writes only)
        wr_sel <= (others => do_write);

        -- Write data: the top tag bit is the endian flag. When it is not
        -- '0' (big endian), reverse the bytes within each 32-bit word.
        if r.store_tag(TAG_BITS - 1) /= '0' then
            for dst_byte in 0 to (wishbone_in.dat'length / 8) - 1 loop
                src_byte := (dst_byte - (dst_byte mod 4)) + (3 - (dst_byte mod 4));
                wr_dat(dst_byte * 8 + 7 downto dst_byte * 8) <=
                    wishbone_in.dat(src_byte * 8 + 7 downto src_byte * 8);
            end loop;
        else
            wr_dat <= wishbone_in.dat;
        end if;
    end process;
end generate;
412
-- Replacement policy: one PLRU instance per line index, generated only
-- when there is more than one way. A PLRU is updated with the way that
-- hit whenever the registered hit (r.hit_*) falls on its index, and its
-- current victim is published in plru_victim.
maybe_plrus: if NUM_WAYS > 1 generate
begin
    plrus: for i in 0 to NUM_LINES-1 generate
        -- PLRU interface
        signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
        signal plru_acc_en : std_ulogic;
        signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);

    begin
        plru : entity work.plru
            generic map (
                BITS => WAY_BITS
                )
            port map (
                clk => clk,
                rst => rst,
                acc => plru_acc,
                acc_en => plru_acc_en,
                lru => plru_out
                );

        process(all)
        begin
            -- Victim proposed for this index
            plru_victim(i) <= plru_out;

            -- Way touched by the registered hit
            plru_acc <= std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));

            -- Only the PLRU of the index that was hit records the access
            plru_acc_en <= '0';
            if get_index(r.hit_nia) = i then
                plru_acc_en <= r.hit_valid;
            end if;
        end process;
    end generate;
end generate;
448
-- ITLB lookup and real address generation (combinational).
--
-- In virtual mode the real address is the PTE's page number joined with
-- the page offset of the NIA, valid only if the entry's tag matches and
-- its valid bit is set. In real mode the NIA is passed through
-- addr_to_real and is always valid. A privilege fault is raised when the
-- privilege bit is set and the request is not in privileged mode.
itlb_lookup : process(all)
    variable entry_pte : tlb_pte_t;
    variable entry_tag : tlb_tag_t;
    variable tag_match : boolean;
begin
    tlb_req_index <= hash_ea(i_in.nia);
    entry_pte := itlb_ptes(tlb_req_index);
    entry_tag := itlb_tags(tlb_req_index);
    tag_match := entry_tag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS);

    if i_in.virt_mode /= '1' then
        -- Real mode: no translation, treated as privileged-only
        real_addr <= addr_to_real(i_in.nia);
        ra_valid <= '1';
        eaa_priv <= '1';
    else
        real_addr <= entry_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
                     i_in.nia(TLB_LG_PGSZ - 1 downto 0);
        eaa_priv <= entry_pte(3);
        if tag_match then
            ra_valid <= itlb_valids(tlb_req_index);
        else
            ra_valid <= '0';
        end if;
    end if;

    -- no IAMR, so no KUEP support for now
    priv_fault <= eaa_priv and not i_in.priv_mode;
    access_ok <= ra_valid and not priv_fault;
end process;
476
-- ITLB maintenance (clocked), driven by the MMU interface.
--   * reset, or tlbie with doall: every entry is invalidated
--   * tlbie alone: the entry selected by the hashed address is invalidated,
--     whether or not its tag matches
--   * tlbld: the selected entry is loaded with the new tag and PTE
-- Also produces the itlb_miss_resolved event on every tlbld outside reset.
itlb_update: process(clk)
    variable slot : tlb_index_t;
    variable flush_all : boolean;
begin
    if rising_edge(clk) then
        slot := hash_ea(m_in.addr);
        flush_all := rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1');

        if flush_all then
            itlb_valids <= (others => '0');
        elsif m_in.tlbie = '1' then
            itlb_valids(slot) <= '0';
        elsif m_in.tlbld = '1' then
            itlb_valids(slot) <= '1';
            itlb_ptes(slot) <= m_in.pte;
            itlb_tags(slot) <= m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
        end if;

        ev.itlb_miss_resolved <= m_in.tlbld and not rst;
    end if;
end process;
499
-- Cache hit detection, output to fetch2 and other misc logic
--
-- Combinational process. From the incoming request it derives the line
-- index, row and tag, checks every way for a hit, produces the hit/miss
-- strobes for the clocked processes, selects the way to replace, and
-- drives the output ports (i_out, stall_out, wishbone_out, events).
icache_comb : process(all)
    variable is_hit : std_ulogic;
    variable hit_way : way_t;
begin
    -- Extract line, row and tag from request
    -- (index and row come from the effective address, the tag from the
    -- translated real address)
    req_index <= get_index(i_in.nia);
    req_row <= get_row(i_in.nia);
    req_tag <= get_tag(real_addr, i_in.big_endian);

    -- Calculate address of beginning of cache row, will be
    -- used for cache miss processing if needed
    --
    req_raddr <= real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
                 (ROW_OFF_BITS-1 downto 0 => '0');

    -- Test if pending request is a hit on any way
    -- A way counts if it is marked valid, or if it is the line currently
    -- being reloaded and the requested row has already been received.
    -- If several ways match, the highest-numbered one wins.
    hit_way := 0;
    is_hit := '0';
    for i in way_t loop
        if i_in.req = '1' and
            (cache_valids(req_index)(i) = '1' or
             (r.state = WAIT_ACK and
              req_index = r.store_index and
              i = r.store_way and
              r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
            if read_tag(i, cache_tags(req_index)) = req_tag then
                hit_way := i;
                is_hit := '1';
            end if;
        end if;
    end loop;

    -- Generate the "hit" and "miss" signals for the synchronous blocks
    if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' then
        req_is_hit <= is_hit;
        req_is_miss <= not is_hit;
    else
        req_is_hit <= '0';
        req_is_miss <= '0';
    end if;
    req_hit_way <= hit_way;

    -- The way to replace on a miss
    -- In CLR_TAG r.store_way is not latched yet, so take the PLRU victim
    -- directly; afterwards use the latched way.
    if r.state = CLR_TAG then
        replace_way <= to_integer(unsigned(plru_victim(r.store_index)));
    else
        replace_way <= r.store_way;
    end if;

    -- Output instruction from current cache row
    --
    -- Note: This is a mild violation of our design principle of having pipeline
    -- stages output from a clean latch. In this case we output the result
    -- of a mux. The alternative would be output an entire row which
    -- I prefer not to do just yet as it would force fetch2 to know about
    -- some of the cache geometry information.
    --
    i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
    i_out.valid <= r.hit_valid;
    i_out.nia <= r.hit_nia;
    i_out.stop_mark <= r.hit_smark;
    i_out.fetch_failed <= r.fetch_failed;
    i_out.big_endian <= r.big_endian;
    i_out.next_predicted <= i_in.predicted;
    i_out.next_pred_ntaken <= i_in.pred_ntaken;

    -- Stall fetch1 if we have a miss on cache or TLB or a protection fault
    stall_out <= not (is_hit and access_ok);

    -- Wishbone requests output (from the cache miss reload machine)
    wishbone_out <= r.wb;

    -- Export the event strobes collected in ev by the clocked processes;
    -- without this the events output port has no driver.
    events <= ev;
end process;
573
-- Cache hit synchronous machine.
--
-- Registers the outcome of the hit detection so that it lines up with
-- the data RAM output one cycle later. While stalled the registers hold
-- their values, except that reset or flush drops hit_valid.
icache_hit : process(clk)
begin
    if rising_edge(clk) then
        if stall_in /= '1' then
            -- On a hit, latch the request for the next cycle, when the BRAM data
            -- will be available on the cache_out output of the corresponding way
            r.hit_valid <= req_is_hit;
            if req_is_hit = '1' then
                r.hit_way <= req_hit_way;

                report "cache hit nia:" & to_hstring(i_in.nia) &
                    " IR:" & std_ulogic'image(i_in.virt_mode) &
                    " SM:" & std_ulogic'image(i_in.stop_mark) &
                    " idx:" & integer'image(req_index) &
                    " tag:" & to_hstring(req_tag) &
                    " way:" & integer'image(req_hit_way) &
                    " RA:" & to_hstring(real_addr);
            end if;

            if stall_in = '0' then
                -- Send stop marks and NIA down regardless of validity
                r.hit_smark <= i_in.stop_mark;
                r.hit_nia <= i_in.nia;
                r.big_endian <= i_in.big_endian;
            end if;
        elsif rst = '1' or flush_in = '1' then
            -- Stalled: keep outputs to fetch2 unchanged, but a flush or
            -- reset still invalidates the latched hit
            r.hit_valid <= '0';
        end if;
    end if;
end process;
609
-- Cache miss/reload synchronous machine
--
-- Clocked process that owns the cache valid bits, the tag writes, the
-- registered wishbone master outputs (r.wb) and the reload bookkeeping
-- (r.state, r.store_*, r.rows_valid). Outside reset it does, every cycle:
--   * register the snooped wishbone address and compare it with the tags
--     of the addressed line; matching ways are invalidated a cycle later
--   * clear all valid bits when inval_in is asserted
--   * run the reload state machine:
--       IDLE        on req_is_miss, latch index/row/tag and start the
--                   wishbone cycle
--       CLR_TAG     latch the victim way, clear its valid bit and write
--                   the new tag, then handle requests/acks like WAIT_ACK
--       WAIT_ACK    issue the remaining row requests and store the acked
--                   rows; mark the line valid on the last ack
--       STOP_RELOAD reload aborted by inval_in; wait until the store row
--                   has caught up with the request address, then go IDLE
--   * maintain r.fetch_failed
icache_miss : process(clk)
    variable tagset : cache_tags_set_t;
    variable tag : cache_tag_t;
    variable snoop_addr : real_addr_t;
    variable snoop_tag : cache_tag_t;
    variable snoop_cache_tags : cache_tags_set_t;
begin
    if rising_edge(clk) then
        -- Event strobe defaults to inactive; set for one cycle on a miss
        ev.icache_miss <= '0';
        -- On reset, clear all valid bits to force misses
        if rst = '1' then
            for i in index_t loop
                cache_valids(i) <= (others => '0');
            end loop;
            r.state <= IDLE;
            r.wb.cyc <= '0';
            r.wb.stb <= '0';

            -- We only ever do reads on wishbone
            r.wb.dat <= (others => '0');
            r.wb.sel <= "11111111";
            r.wb.we <= '0';

            -- Not useful normally but helps avoiding tons of sim warnings
            r.wb.adr <= (others => '0');

            snoop_valid <= '0';
            snoop_index <= 0;
            snoop_hits <= (others => '0');
        else
            -- Detect snooped writes and decode address into index and tag
            -- Since we never write, any write should be snooped
            snoop_valid <= wb_snoop_in.cyc and wb_snoop_in.stb and wb_snoop_in.we;
            snoop_addr := addr_to_real(wb_to_addr(wb_snoop_in.adr));
            snoop_index <= get_index(snoop_addr);
            snoop_cache_tags := cache_tags(get_index(snoop_addr));
            snoop_tag := get_tag(snoop_addr, '0');
            snoop_hits <= (others => '0');
            for i in way_t loop
                tag := read_tag(i, snoop_cache_tags);
                -- Ignore endian bit in comparison
                tag(TAG_BITS - 1) := '0';
                if tag = snoop_tag then
                    snoop_hits(i) <= '1';
                end if;
            end loop;

            -- Process cache invalidations
            if inval_in = '1' then
                for i in index_t loop
                    cache_valids(i) <= (others => '0');
                end loop;
                -- Prevent a reload that is in flight from validating its line
                r.store_valid <= '0';
            else
                -- Do invalidations from snooped stores to memory, one
                -- cycle after the address appears on wb_snoop_in.
                for i in way_t loop
                    if snoop_valid = '1' and snoop_hits(i) = '1' then
                        cache_valids(snoop_index)(i) <= '0';
                    end if;
                end loop;
            end if;

            -- Main state machine
            case r.state is
                when IDLE =>
                    -- Reset per-row valid flags, only used in WAIT_ACK
                    for i in 0 to ROW_PER_LINE - 1 loop
                        r.rows_valid(i) <= '0';
                    end loop;

                    -- We need to read a cache line
                    if req_is_miss = '1' then
                        report "cache miss nia:" & to_hstring(i_in.nia) &
                            " IR:" & std_ulogic'image(i_in.virt_mode) &
                            " SM:" & std_ulogic'image(i_in.stop_mark) &
                            " idx:" & integer'image(req_index) &
                            " way:" & integer'image(replace_way) &
                            " tag:" & to_hstring(req_tag) &
                            " RA:" & to_hstring(real_addr);
                        ev.icache_miss <= '1';

                        -- Keep track of our index and way for subsequent stores
                        r.store_index <= req_index;
                        r.store_row <= get_row(req_raddr);
                        r.store_tag <= req_tag;
                        r.store_valid <= '1';
                        -- The reload starts at the requested row and wraps
                        -- around the line, so the last row is the one just
                        -- before the first
                        r.end_row_ix <= get_row_of_line(get_row(req_raddr)) - 1;

                        -- Prep for first wishbone read. We calculate the address of
                        -- the start of the cache row and start the WB cycle.
                        --
                        r.wb.adr <= addr_to_wb(req_raddr);
                        r.wb.cyc <= '1';
                        r.wb.stb <= '1';

                        -- Track that we had one request sent
                        r.state <= CLR_TAG;
                    end if;

                when CLR_TAG | WAIT_ACK =>
                    if r.state = CLR_TAG then
                        -- Get victim way from plru
                        r.store_way <= replace_way;

                        -- Force misses on that way while reloading that line.
                        -- Use the index latched in IDLE: req_index follows
                        -- i_in.nia combinationally and is not guaranteed to
                        -- still address the line being reloaded, whereas the
                        -- tag write below and the final validation both use
                        -- r.store_index.
                        cache_valids(r.store_index)(replace_way) <= '0';

                        -- Store new tag in selected way
                        for i in 0 to NUM_WAYS-1 loop
                            if i = replace_way then
                                tagset := cache_tags(r.store_index);
                                write_tag(i, tagset, r.store_tag);
                                cache_tags(r.store_index) <= tagset;
                            end if;
                        end loop;

                        r.state <= WAIT_ACK;
                    end if;

                    -- If we are still sending requests, was one accepted ?
                    if wishbone_in.stall = '0' and r.wb.stb = '1' then
                        -- That was the last word ? We are done sending. Clear stb.
                        --
                        if is_last_row_wb_addr(r.wb.adr, r.end_row_ix) then
                            r.wb.stb <= '0';
                        end if;

                        -- Calculate the next row address
                        r.wb.adr <= next_row_wb_addr(r.wb.adr);
                    end if;

                    -- Abort reload if we get an invalidation
                    -- (placed after the state assignments above so that it
                    -- takes precedence over them)
                    if inval_in = '1' then
                        r.wb.stb <= '0';
                        r.state <= STOP_RELOAD;
                    end if;

                    -- Incoming acks processing
                    if wishbone_in.ack = '1' then
                        r.rows_valid(r.store_row mod ROW_PER_LINE) <= not inval_in;
                        -- Check for completion
                        if is_last_row(r.store_row, r.end_row_ix) then
                            -- Complete wishbone cycle
                            r.wb.cyc <= '0';

                            -- Cache line is now valid
                            cache_valids(r.store_index)(replace_way) <= r.store_valid and not inval_in;

                            -- We are done
                            r.state <= IDLE;
                        end if;

                        -- Increment store row counter
                        r.store_row <= next_row(r.store_row);
                    end if;

                when STOP_RELOAD =>
                    -- Wait for all outstanding requests to be satisfied, then
                    -- go to IDLE state.
                    if get_row_of_line(r.store_row) = get_row_of_line(get_row(wb_to_addr(r.wb.adr))) then
                        r.wb.cyc <= '0';
                        r.state <= IDLE;
                    end if;
                    if wishbone_in.ack = '1' then
                        -- Increment store row counter
                        r.store_row <= next_row(r.store_row);
                    end if;
            end case;
        end if;

        -- TLB miss and protection fault processing
        -- fetch_failed is set when an unstalled request has no usable
        -- translation and stays set until reset, flush or a TLB load
        if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
            r.fetch_failed <= '0';
        elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
            r.fetch_failed <= '1';
        end if;
    end if;
end process;
790
-- Optional logging: when LOG_LENGTH is non-zero, a 54-bit snapshot of the
-- main cache signals is registered every clock and presented on log_out.
icache_log: if LOG_LENGTH > 0 generate
    -- Output data to logger
    signal log_data : std_ulogic_vector(53 downto 0);
begin
    data_log: process(clk)
        variable way_now : way_t;
        variable reload_busy : std_ulogic;
    begin
        if rising_edge(clk) then
            way_now := req_hit_way;

            -- '1' whenever the reload state machine is not idle
            if r.state = IDLE then
                reload_busy := '0';
            else
                reload_busy := '1';
            end if;

            log_data <= i_out.valid &
                        i_out.insn &
                        wishbone_in.ack &
                        r.wb.adr(2 downto 0) &
                        r.wb.stb & r.wb.cyc &
                        wishbone_in.stall &
                        stall_out &
                        r.fetch_failed &
                        r.hit_nia(5 downto 2) &
                        reload_busy &
                        std_ulogic_vector(to_unsigned(way_now, 3)) &
                        req_is_hit & req_is_miss &
                        access_ok &
                        ra_valid;
        end if;
    end process;
    log_out <= log_data;
end generate;
823 end;