Fix a ghdlsynth issue in icache
[microwatt.git] / icache.vhdl
--
-- Set associative icache
--
-- TODO (in no specific order):
--
-- * Add debug interface to inspect cache content
-- * Add snoop/invalidate path
-- * Add multi-hit error detection
-- * Pipelined bus interface (wb or axi)
-- * Maybe add parity ? There are a few bits free in each BRAM row on Xilinx
-- * Add optimization: service hits on partially loaded lines
-- * Add optimization: (maybe) interrupt reload on flush/redirect
-- * Check if playing with the geometry of the cache tags allows for more
--   efficient use of distributed RAM and less logic/muxes. Currently we
--   write TAG_BITS width which may not match full ram blocks and might
--   cause muxes to be inferred for "partial writes".
-- * Check if making the read size of PLRU a ROM helps utilization
--
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.utils.all;
use work.common.all;
use work.wishbone_types.all;

-- 64 bit set associative icache. All instructions are 4B aligned.

entity icache is
    generic (
        SIM : boolean := false;
        -- Line size in bytes
        LINE_SIZE : positive := 64;
        -- Number of lines in a set
        NUM_LINES : positive := 32;
        -- Number of ways
        NUM_WAYS : positive := 4
        );
    port (
        clk          : in std_ulogic;
        rst          : in std_ulogic;

        i_in         : in Fetch1ToIcacheType;
        i_out        : out IcacheToFetch2Type;

        stall_out    : out std_ulogic;
        flush_in     : in std_ulogic;

        wishbone_out : out wishbone_master_out;
        wishbone_in  : in wishbone_slave_out
        );
end entity icache;
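
-- Note: with the default generics above, the total cache capacity is
-- NUM_LINES * NUM_WAYS * LINE_SIZE = 32 * 4 * 64 bytes = 8 kB.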

architecture rtl of icache is
    -- BRAM organisation: We never access more than wishbone_data_bits at
    -- a time so to save resources we make the array only that wide, and
    -- use consecutive indices to make a cache "line"
    --
    -- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
    constant ROW_SIZE      : natural := wishbone_data_bits / 8;
    -- ROW_PER_LINE is the number of rows (wishbone transactions) in a line
    constant ROW_PER_LINE  : natural := LINE_SIZE / ROW_SIZE;
    -- BRAM_ROWS is the number of rows in BRAM needed to represent the full
    -- icache
    constant BRAM_ROWS     : natural := NUM_LINES * ROW_PER_LINE;
    -- INSN_PER_ROW is the number of 32bit instructions per BRAM row
    constant INSN_PER_ROW  : natural := wishbone_data_bits / 32;
    -- Bit field counts in the address

    -- INSN_BITS is the number of bits to select an instruction in a row
    constant INSN_BITS     : natural := log2(INSN_PER_ROW);
    -- ROW_BITS is the number of bits to select a row
    constant ROW_BITS      : natural := log2(BRAM_ROWS);
    -- ROW_LINEBITS is the number of bits to select a row within a line
    constant ROW_LINEBITS  : natural := log2(ROW_PER_LINE);
    -- LINE_OFF_BITS is the number of bits for the offset in a cache line
    constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
    -- ROW_OFF_BITS is the number of bits for the offset in a row
    constant ROW_OFF_BITS  : natural := log2(ROW_SIZE);
    -- INDEX_BITS is the number of bits to select a cache line
    constant INDEX_BITS    : natural := log2(NUM_LINES);
    -- TAG_BITS is the number of bits of the tag part of the address
    constant TAG_BITS      : natural := 64 - LINE_OFF_BITS - INDEX_BITS;
    -- WAY_BITS is the number of bits to select a way
    constant WAY_BITS      : natural := log2(NUM_WAYS);

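    -- For example, with the default generics (LINE_SIZE = 64, NUM_LINES = 32,
    -- NUM_WAYS = 4) and a 64-bit wishbone, these work out to:
    -- ROW_SIZE = 8, ROW_PER_LINE = 8, BRAM_ROWS = 256, INSN_PER_ROW = 2,
    -- INSN_BITS = 1, ROW_BITS = 8, ROW_LINEBITS = 3, LINE_OFF_BITS = 6,
    -- ROW_OFF_BITS = 3, INDEX_BITS = 5, TAG_BITS = 53, WAY_BITS = 2.
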
    -- Example of layout for 32 lines of 64 bytes:
    --
    -- ..  tag    |index|  line  |
    -- ..         |   row   |    |
    -- ..         |     |   | |00| zero          (2)
    -- ..         |     |   |-|  | INSN_BITS     (1)
    -- ..         |     |---|    | ROW_LINEBITS  (3)
    -- ..         |     |--- - --| LINE_OFF_BITS (6)
    -- ..         |         |- --| ROW_OFF_BITS  (3)
    -- ..         |----- ---|    | ROW_BITS      (8)
    -- ..         |-----|        | INDEX_BITS    (5)
    -- .. --------|              | TAG_BITS      (53)

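    -- As a worked example with the geometry above, the fetch address
    -- x"0000_0000_0000_1234" decodes as:
    --   tag   = bits 63 downto 11 = 2
    --   index = bits 10 downto 6  = 8   (the tag set / cache line)
    --   row   = bits 10 downto 3  = 70  (= index * ROW_PER_LINE + 6)
    --   word  = bit 2             = 1   (second instruction in the row)
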
    subtype row_t is integer range 0 to BRAM_ROWS-1;
    subtype index_t is integer range 0 to NUM_LINES-1;
    subtype way_t is integer range 0 to NUM_WAYS-1;

    -- The cache data BRAM organized as described above for each way
    subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0);

    -- The cache tags LUTRAM has a row per set. Vivado is a pain and will
    -- not handle a clean (commented) definition of the cache tags as a 3d
    -- memory. For now, work around it by putting all the tags of a set
    -- in a single flat vector.
    subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
    -- type cache_tags_set_t is array(way_t) of cache_tag_t;
    -- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
    constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
    subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
    type cache_tags_array_t is array(index_t) of cache_tags_set_t;
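    -- With the default geometry each set therefore packs NUM_WAYS * TAG_BITS
    -- = 4 * 53 = 212 tag bits into one cache_tags_set_t entry.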

    -- The cache valid bits
    subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
    type cache_valids_t is array(index_t) of cache_way_valids_t;

    -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
    signal cache_tags   : cache_tags_array_t;
    signal cache_valids : cache_valids_t;

    attribute ram_style : string;
    attribute ram_style of cache_tags : signal is "distributed";

    -- Cache reload state machine
    type state_t is (IDLE, WAIT_ACK);
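    -- IDLE waits for a miss, then captures the victim way/index, starts a
    -- wishbone cycle for the whole line and moves to WAIT_ACK. WAIT_ACK
    -- issues one read per row and writes each acked row into the selected
    -- way's BRAM, returning to IDLE once the last row has been received.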

    type reg_internal_t is record
        -- Cache hit state (Latches for 1 cycle BRAM access)
        hit_way   : way_t;
        hit_nia   : std_ulogic_vector(63 downto 0);
        hit_smark : std_ulogic;
        hit_valid : std_ulogic;

        -- Cache miss state (reload state machine)
        state       : state_t;
        wb          : wishbone_master_out;
        store_way   : way_t;
        store_index : index_t;
        store_row   : row_t;
    end record;

    signal r : reg_internal_t;

    -- Async signals on incoming request
    signal req_index   : index_t;
    signal req_row     : row_t;
    signal req_hit_way : way_t;
    signal req_tag     : cache_tag_t;
    signal req_is_hit  : std_ulogic;
    signal req_is_miss : std_ulogic;
    signal req_laddr   : std_ulogic_vector(63 downto 0);

    -- Cache RAM interface
    type cache_ram_out_t is array(way_t) of cache_row_t;
    signal cache_out : cache_ram_out_t;

    -- PLRU output interface
    type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
    signal plru_victim : plru_out_t;
    signal replace_way : way_t;

    -- Return the cache line index (tag index) for an address
    function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is
    begin
        return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS)));
    end;

    -- Return the cache row index (data memory) for an address
    function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is
    begin
        return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS)));
    end;

    -- Returns whether this is the last row of a line
    function is_last_row_addr(addr: wishbone_addr_type) return boolean is
        constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
    begin
        return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones;
    end;

    -- Returns whether this is the last row of a line
    function is_last_row(row: row_t) return boolean is
        variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
        constant ones  : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
    begin
        row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
        return row_v(ROW_LINEBITS-1 downto 0) = ones;
    end;
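    -- With ROW_LINEBITS = 3 a row is the last of its line when its low
    -- three bits are all ones, e.g. rows 7, 15, 23, ...; row 71 in the
    -- worked example above is the last row of line (index) 8.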

    -- Return the address of the next row in the current cache line
    function next_row_addr(addr: wishbone_addr_type)
        return std_ulogic_vector is
        variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
        variable result  : wishbone_addr_type;
    begin
        -- Is there no simpler way in VHDL to generate that 3-bit adder ?
        row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
        row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
        result := addr;
        result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
        return result;
    end;

    -- Return the next row in the current cache line. We use a dedicated
    -- function in order to limit the size of the generated adder to be
    -- only the bits within a cache line (3 bits with default settings)
    --
    function next_row(row: row_t) return row_t is
        variable row_v   : std_ulogic_vector(ROW_BITS-1 downto 0);
        variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
        variable result  : std_ulogic_vector(ROW_BITS-1 downto 0);
    begin
        row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
        row_idx := row_v(ROW_LINEBITS-1 downto 0);
        row_v(ROW_LINEBITS-1 downto 0) := std_ulogic_vector(unsigned(row_idx) + 1);
        return to_integer(unsigned(row_v));
    end;
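    -- Only the low ROW_LINEBITS bits are incremented, so the row wraps
    -- within its own line: with the default geometry next_row(70) = 71
    -- and next_row(71) = 64, the first row of the same line.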

    -- Read the instruction word for the given address in the current cache row
    function read_insn_word(addr: std_ulogic_vector(63 downto 0);
                            data: cache_row_t) return std_ulogic_vector is
        variable word: integer range 0 to INSN_PER_ROW-1;
    begin
        word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
        return data(31+word*32 downto word*32);
    end;
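    -- With INSN_PER_ROW = 2 this is just address bit 2 selecting between
    -- data(31 downto 0) (word 0) and data(63 downto 32) (word 1).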

    -- Get the tag value from the address
    function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is
    begin
        return addr(63 downto 64-TAG_BITS);
    end;

    -- Read a tag from a tag memory row
    function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
    begin
        return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
    end;

    -- Write a tag to tag memory row
    procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
                        tag: cache_tag_t) is
    begin
        tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
    end;
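    -- Each way's tag occupies a fixed TAG_BITS slice of the set vector;
    -- with TAG_BITS = 53, way 0 is tagset(52 downto 0), way 1 is
    -- tagset(105 downto 53), and so on.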

begin

    assert LINE_SIZE mod ROW_SIZE = 0;
    assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
    assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
    assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE;
    assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2" severity FAILURE;
    assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
        report "geometry bits don't add up" severity FAILURE;
    assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
        report "geometry bits don't add up" severity FAILURE;
    assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
        report "geometry bits don't add up" severity FAILURE;
    assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
        report "geometry bits don't add up" severity FAILURE;

    sim_debug: if SIM generate
        debug: process
        begin
            report "ROW_SIZE = " & natural'image(ROW_SIZE);
            report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
            report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
            report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
            report "INSN_BITS = " & natural'image(INSN_BITS);
            report "ROW_BITS = " & natural'image(ROW_BITS);
            report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
            report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
            report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
            report "INDEX_BITS = " & natural'image(INDEX_BITS);
            report "TAG_BITS = " & natural'image(TAG_BITS);
            report "WAY_BITS = " & natural'image(WAY_BITS);
            wait;
        end process;
    end generate;

    -- Generate a cache RAM for each way
    rams: for i in 0 to NUM_WAYS-1 generate
        signal do_read  : std_ulogic;
        signal do_write : std_ulogic;
        signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
        signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
        signal dout     : cache_row_t;
    begin
        way: entity work.cache_ram
            generic map (
                ROW_BITS => ROW_BITS,
                WIDTH => wishbone_data_bits
                )
            port map (
                clk     => clk,
                rd_en   => do_read,
                rd_addr => rd_addr,
                rd_data => dout,
                wr_en   => do_write,
                wr_sel  => (others => '1'),
                wr_addr => wr_addr,
                wr_data => wishbone_in.dat
                );
        process(all)
        begin
            do_read <= '1';
            do_write <= '0';
            if wishbone_in.ack = '1' and r.store_way = i then
                do_write <= '1';
            end if;
            cache_out(i) <= dout;
            rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
            wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
        end process;
    end generate;

    -- Generate PLRUs
    maybe_plrus: if NUM_WAYS > 1 generate
    begin
        plrus: for i in 0 to NUM_LINES-1 generate
            -- PLRU interface
            signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
            signal plru_acc_en : std_ulogic;
            signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);

        begin
            plru : entity work.plru
                generic map (
                    BITS => WAY_BITS
                    )
                port map (
                    clk => clk,
                    rst => rst,
                    acc => plru_acc,
                    acc_en => plru_acc_en,
                    lru => plru_out
                    );

            process(req_index, req_is_hit, req_hit_way, plru_out)
            begin
                -- PLRU interface
                if req_is_hit = '1' and req_index = i then
                    plru_acc_en <= req_is_hit;
                else
                    plru_acc_en <= '0';
                end if;
                plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS));
                plru_victim(i) <= plru_out;
            end process;
        end generate;
    end generate;

    -- Cache hit detection, output to fetch2 and other misc logic
    icache_comb : process(all)
        variable is_hit  : std_ulogic;
        variable hit_way : way_t;
    begin
        -- Extract line, row and tag from request
        req_index <= get_index(i_in.nia);
        req_row <= get_row(i_in.nia);
        req_tag <= get_tag(i_in.nia);

        -- Calculate address of beginning of cache line, will be
        -- used for cache miss processing if needed
        --
        req_laddr <= i_in.nia(63 downto LINE_OFF_BITS) &
                     (LINE_OFF_BITS-1 downto 0 => '0');

        -- Test if pending request is a hit on any way
        hit_way := 0;
        is_hit := '0';
        for i in way_t loop
            if i_in.req = '1' and cache_valids(req_index)(i) = '1' then
                if read_tag(i, cache_tags(req_index)) = req_tag then
                    hit_way := i;
                    is_hit := '1';
                end if;
            end if;
        end loop;

        -- Generate the "hit" and "miss" signals for the synchronous blocks
        req_is_hit <= i_in.req and is_hit and not flush_in;
        req_is_miss <= i_in.req and not is_hit and not flush_in;
        req_hit_way <= hit_way;

        -- The way to replace on a miss
        replace_way <= to_integer(unsigned(plru_victim(req_index)));

        -- Output instruction from current cache row
        --
        -- Note: This is a mild violation of our design principle of having pipeline
        --       stages output from a clean latch. In this case we output the result
        --       of a mux. The alternative would be to output an entire row which
        --       I prefer not to do just yet as it would force fetch2 to know about
        --       some of the cache geometry information.
        --
        i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
        i_out.valid <= r.hit_valid;
        i_out.nia <= r.hit_nia;
        i_out.stop_mark <= r.hit_smark;

        -- Stall fetch1 if we have a miss
        stall_out <= not is_hit;

        -- Wishbone requests output (from the cache miss reload machine)
        wishbone_out <= r.wb;
    end process;

    -- Cache hit synchronous machine
    icache_hit : process(clk)
    begin
        if rising_edge(clk) then
            -- On a hit, latch the request for the next cycle, when the BRAM data
            -- will be available on the cache_out output of the corresponding way
            --
            if req_is_hit = '1' then
                r.hit_way <= req_hit_way;
                r.hit_nia <= i_in.nia;
                r.hit_smark <= i_in.stop_mark;
                r.hit_valid <= '1';

                report "cache hit nia:" & to_hstring(i_in.nia) &
                    " SM:" & std_ulogic'image(i_in.stop_mark) &
                    " idx:" & integer'image(req_index) &
                    " tag:" & to_hstring(req_tag) &
                    " way: " & integer'image(req_hit_way);
            else
                r.hit_valid <= '0';

                -- Send stop marks down regardless of validity
                r.hit_smark <= i_in.stop_mark;
            end if;
        end if;
    end process;

    -- Cache miss/reload synchronous machine
    icache_miss : process(clk)
        variable tagset    : cache_tags_set_t;
        variable stbs_done : boolean;
    begin
        if rising_edge(clk) then
            -- On reset, clear all valid bits to force misses
            if rst = '1' then
                for i in index_t loop
                    cache_valids(i) <= (others => '0');
                end loop;
                r.state <= IDLE;
                r.wb.cyc <= '0';
                r.wb.stb <= '0';

                -- We only ever do reads on wishbone
                r.wb.dat <= (others => '0');
                r.wb.sel <= "11111111";
                r.wb.we  <= '0';

                -- Not useful normally but helps avoid tons of sim warnings
                r.wb.adr <= (others => '0');
            else
                -- Main state machine
                case r.state is
                when IDLE =>
                    -- We need to read a cache line
                    if req_is_miss = '1' then
                        report "cache miss nia:" & to_hstring(i_in.nia) &
                            " SM:" & std_ulogic'image(i_in.stop_mark) &
                            " idx:" & integer'image(req_index) &
                            " way:" & integer'image(replace_way) &
                            " tag:" & to_hstring(req_tag);

                        -- Force misses on that way while reloading that line
                        cache_valids(req_index)(replace_way) <= '0';

                        -- Store new tag in selected way
                        for i in 0 to NUM_WAYS-1 loop
                            if i = replace_way then
                                tagset := cache_tags(req_index);
                                write_tag(i, tagset, req_tag);
                                cache_tags(req_index) <= tagset;
                            end if;
                        end loop;

                        -- Keep track of our index and way for subsequent stores
                        r.store_index <= req_index;
                        r.store_way <= replace_way;
                        r.store_row <= get_row(req_laddr);

                        -- Prep for first wishbone read. We calculate the address of
                        -- the start of the cache line and start the WB cycle.
                        --
                        r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
                        r.wb.cyc <= '1';
                        r.wb.stb <= '1';

                        -- Track that we had one request sent
                        r.state <= WAIT_ACK;
                    end if;

                when WAIT_ACK =>
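                    -- While the line reloads, cyc stays asserted for the whole
                    -- transfer; stb is held until the last row address has been
                    -- accepted (stall = '0'), and each incoming ack writes one
                    -- row into the victim way's BRAM (see the rams generate above).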

                    -- Requests are all sent if stb is 0
                    stbs_done := r.wb.stb = '0';

                    -- If we are still sending requests, was one accepted ?
                    if wishbone_in.stall = '0' and not stbs_done then
                        -- That was the last word ? We are done sending. Clear
                        -- stb and set stbs_done so we can handle a possible last
                        -- ack on the same cycle.
                        --
                        if is_last_row_addr(r.wb.adr) then
                            r.wb.stb <= '0';
                            stbs_done := true;
                        end if;

                        -- Calculate the next row address
                        r.wb.adr <= next_row_addr(r.wb.adr);
                    end if;

                    -- Incoming acks processing
                    if wishbone_in.ack = '1' then
                        -- Check for completion
                        if stbs_done and is_last_row(r.store_row) then
                            -- Complete wishbone cycle
                            r.wb.cyc <= '0';

                            -- Cache line is now valid
                            cache_valids(r.store_index)(r.store_way) <= '1';

                            -- We are done
                            r.state <= IDLE;
                        end if;

                        -- Increment store row counter
                        r.store_row <= next_row(r.store_row);
                    end if;
                end case;
            end if;
        end if;
    end process;
end;