2 use ieee.std_logic_1164.all;
3 use ieee.numeric_std.all;
6 use work.decode_types.all;
8 use work.insn_helpers.all;
12 -- We calculate the address in the first cycle
-- Generics and ports of the load/store unit.
-- HAS_FPU gates generation of the single/double-precision conversion
-- logic (ls_fp_conv) and the FPR_CONV handling in the main process.
16 HAS_FPU : boolean := true;
17 -- Non-zero to enable log data collection (the l1_log block is only
-- generated when LOG_LENGTH > 0)
18 LOG_LENGTH : natural := 0
-- Interface naming follows the record types: l_in/e_out/l_out talk to
-- execute1 and writeback, d_out/d_in to the dcache, m_out/m_in to the MMU.
24 l_in : in Execute1ToLoadstore1Type; -- incoming request (valid, addr1/addr2, data, insn, ...)
25 e_out : out Loadstore1ToExecute1Type; -- busy and exception information back to execute1
26 l_out : out Loadstore1ToWritebackType; -- register write-back data and enables
28 d_out : out Loadstore1ToDcacheType; -- request to the dcache
29 d_in : in DcacheToLoadstore1Type; -- dcache response (valid, error, data, store_done, ...)
31 m_out : out Loadstore1ToMmuType; -- request to the MMU
32 m_in : in MmuToLoadstore1Type; -- MMU response (done, err, fault details, sprval)
34 dc_stall : in std_ulogic; -- dcache stall; the note in this file says it is currently unused
36 log_out : out std_ulogic_vector(9 downto 0) -- 10-bit log output
40 -- Note, we don't currently use the stall output from the dcache because
41 -- we know it can take two requests without stalling when idle, we are
42 -- its only user, and we know it never stalls when idle.
44 architecture behave of loadstore1 is
46 -- State machine for unaligned loads/stores
-- The same state also sequences MMU lookups, tlbie, and the
-- single-precision FP conversion cycles (see the per-state notes).
-- There are eight states, so state_t'pos fits the 3-bit field used
-- by the logging code.
47 type state_t is (IDLE, -- ready for instruction
48 FPR_CONV, -- converting double to float for store
49 SECOND_REQ, -- send 2nd request of unaligned xfer
50 ACK_WAIT, -- waiting for ack from dcache
51 MMU_LOOKUP, -- waiting for MMU to look up translation
52 TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie
53 FINISH_LFS, -- write back converted SP data for lfs*
54 COMPLETE -- extra cycle to complete an operation
-- Register record for the unit: the latched request plus the progress
-- and fault state of the operation in flight. Signals r and rin have
-- this type.
57 type reg_stage_t is record
58 -- latch most of the input request
63 addr : std_ulogic_vector(63 downto 0); -- access address; bits 2..0 are the byte offset; also the rA update value
64 store_data : std_ulogic_vector(63 downto 0); -- store data after byte reversal/rotation
65 load_data : std_ulogic_vector(63 downto 0); -- permuted load data captured from the dcache (used with dwords_done)
66 write_reg : gspr_index_t; -- destination register for write-back
67 length : std_ulogic_vector(3 downto 0); -- access length from l_in.length
68 byte_reverse : std_ulogic; -- from l_in.byte_reverse
69 sign_extend : std_ulogic; -- sign-extend the load result when set
71 update_reg : gpr_index_t; -- register that receives the rA update
75 nc : std_ulogic; -- non-cacheable access
76 virt_mode : std_ulogic; -- forwarded to the dcache
77 priv_mode : std_ulogic; -- forwarded to the dcache and the MMU
79 dwords_done : std_ulogic; -- set when dcache data arrives while last_dword = '0'
80 last_dword : std_ulogic; -- presumably '1' once the final doubleword has been requested -- TODO confirm
81 first_bytes : std_ulogic_vector(7 downto 0); -- byte selects for the first transfer
82 second_bytes : std_ulogic_vector(7 downto 0); -- byte selects for the second transfer (zero if none)
83 dar : std_ulogic_vector(63 downto 0); -- presumably the DAR image -- TODO confirm against the unseen lines
84 dsisr : std_ulogic_vector(31 downto 0); -- DSISR image; read by mfspr and written by mtspr below
85 instr_fault : std_ulogic; -- set for OP_FETCH_FAILED (instruction-side MMU request)
86 align_intr : std_ulogic; -- alignment interrupt pending; reported as an exception
87 sprval : std_ulogic_vector(63 downto 0); -- value written back for mfspr
89 wait_dcache : std_ulogic; -- busy is released when d_in.valid arrives
90 wait_mmu : std_ulogic; -- busy is released when m_in.done arrives
91 do_update : std_ulogic; -- rA update deferred to a later cycle
92 extra_cycle : std_ulogic; -- operation needs one more cycle after the dcache ack
93 mode_32bit : std_ulogic; -- when set, address bits 63..32 are forced to zero
95 ld_sp_data : std_ulogic_vector(31 downto 0); -- low word of the trimmed load data, input to SP->DP conversion
96 ld_sp_nz : std_ulogic; -- OR of the 23 fraction bits of the loaded SP value
97 ld_sp_lz : std_ulogic_vector(5 downto 0); -- count_left_zeroes of those 23 fraction bits
98 st_sp_data : std_ulogic_vector(31 downto 0); -- latched DP->SP converted store data
-- One flag per byte lane; used for use_second, which marks lanes whose
-- data comes from the second doubleword of an unaligned load.
101 type byte_sel_t is array(0 to 7) of std_ulogic;
-- Two-bit per-lane control used by the load trim/sign-extend step.
-- Bit 1 is set for lanes inside the access length; see the trim_ctl
-- assignments in loadstore1_1 for how bit 0 is formed.
102 subtype byte_trim_t is std_ulogic_vector(1 downto 0);
103 type trim_ctl_t is array(0 to 7) of byte_trim_t;
-- r is the register state read throughout the architecture; rin is
-- presumably its next value (the assignment is not visible here).
105 signal r, rin : reg_stage_t;
-- Sum of the two address operands, computed combinationally below.
106 signal lsu_sum : std_ulogic_vector(63 downto 0);
-- Outputs of the dp_to_sp and sp_to_dp conversion processes.
108 signal store_sp_data : std_ulogic_vector(31 downto 0);
109 signal load_dp_data : std_ulogic_vector(63 downto 0);
111 -- Generate byte enables from sizes
-- Takes the 4-bit access length. The result is used as an 8-bit vector
-- by xfer_data_sel, which prepends eight zero bits to form 16 bits.
112 function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
126 end function length_to_sel;
128 -- Calculate byte enables
129 -- This returns 16 bits, giving the select signals for two transfers,
130 -- to account for unaligned loads or stores
-- size    : access length, passed to length_to_sel
-- address : low three address bits, i.e. the byte offset in the doubleword
-- The caller takes bits 7..0 as the first transfer's byte selects and
-- bits 15..8 as the second transfer's; the upper byte is all zero when
-- the access does not cross a doubleword boundary.
131 function xfer_data_sel(size : in std_logic_vector(3 downto 0);
132 address : in std_logic_vector(2 downto 0))
133 return std_ulogic_vector is
134 variable longsel : std_ulogic_vector(15 downto 0);
-- Zero-extend the 8-bit select to 16 bits, then shift it left by the
-- byte offset so any overflow lands in the upper byte.
136 longsel := "00000000" & length_to_sel(size);
137 return std_ulogic_vector(shift_left(unsigned(longsel),
138 to_integer(unsigned(address))));
139 end function xfer_data_sel;
141 -- 23-bit right shifter for DP -> SP float conversions
-- Logical right shift of frac, zero-filling from the left, done in two
-- stages: shift(1 downto 0) selects a shift of up to 3 bits into fs1,
-- then shift(4 downto 2) selects a further shift in multiples of 4 bits
-- (4, 8, 12, 16 or 20 in the visible branches) into fs2.
-- NOTE(review): the case choices and the return statement are not
-- visible here; presumably fs2 is returned -- confirm.
142 function shifter_23r(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
143 return std_ulogic_vector is
144 variable fs1 : std_ulogic_vector(22 downto 0);
145 variable fs2 : std_ulogic_vector(22 downto 0);
-- Stage 1: fine shift by 1, 2 or 3 bits
147 case shift(1 downto 0) is
151 fs1 := '0' & frac(22 downto 1);
153 fs1 := "00" & frac(22 downto 2);
155 fs1 := "000" & frac(22 downto 3);
-- Stage 2: coarse shift; each hex digit of padding is 4 bits
157 case shift(4 downto 2) is
161 fs2 := x"0" & fs1(22 downto 4);
163 fs2 := x"00" & fs1(22 downto 8);
165 fs2 := x"000" & fs1(22 downto 12);
167 fs2 := x"0000" & fs1(22 downto 16);
169 fs2 := x"00000" & fs1(22 downto 20);
174 -- 23-bit left shifter for SP -> DP float conversions
-- Logical left shift of frac, zero-filling from the right, done in two
-- stages that mirror shifter_23r: shift(1 downto 0) selects a shift of
-- up to 3 bits into fs1, then shift(4 downto 2) selects a further shift
-- in multiples of 4 bits (4, 8, 12, 16 or 20 in the visible branches).
-- NOTE(review): the case choices and the return statement are not
-- visible here; presumably fs2 is returned -- confirm.
175 function shifter_23l(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
176 return std_ulogic_vector is
177 variable fs1 : std_ulogic_vector(22 downto 0);
178 variable fs2 : std_ulogic_vector(22 downto 0);
-- Stage 1: fine shift by 1, 2 or 3 bits
180 case shift(1 downto 0) is
184 fs1 := frac(21 downto 0) & '0';
186 fs1 := frac(20 downto 0) & "00";
188 fs1 := frac(19 downto 0) & "000";
-- Stage 2: coarse shift; each hex digit of padding is 4 bits
190 case shift(4 downto 2) is
194 fs2 := fs1(18 downto 0) & x"0" ;
196 fs2 := fs1(14 downto 0) & x"00";
198 fs2 := fs1(10 downto 0) & x"000";
200 fs2 := fs1(6 downto 0) & x"0000";
202 fs2 := fs1(2 downto 0) & x"00000";
208 -- Calculate the address in the first cycle
-- lsu_sum is the 64-bit sum of the two address operands while a request
-- is being presented (l_in.valid = '1'); otherwise it is forced to zero.
209 lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
-- Clocked process: everything inside the if runs on the rising edge
-- of clk.
211 loadstore1_0: process(clk)
213 if rising_edge(clk) then
224 ls_fp_conv: if HAS_FPU generate
225 -- Convert DP data to SP for stfs
-- Combinational. Reads the double-precision value in l_in.data and
-- drives the 32-bit store_sp_data.
226 dp_to_sp: process(all)
227 variable exp : unsigned(10 downto 0);
228 variable frac : std_ulogic_vector(22 downto 0);
229 variable shift : unsigned(4 downto 0);
-- Default result: the sign bit is copied, exponent and fraction are zero
231 store_sp_data(31) <= l_in.data(63);
232 store_sp_data(30 downto 0) <= (others => '0');
233 exp := unsigned(l_in.data(62 downto 52));
-- First branch (its condition is not visible here): keep the top
-- exponent bit, the low 7 exponent bits and the top 23 fraction bits
235 store_sp_data(30) <= l_in.data(62);
236 store_sp_data(29 downto 0) <= l_in.data(58 downto 29);
-- NOTE(review): 874 = 1023 - 149, presumably the smallest DP exponent
-- that still yields a non-zero SP denormal -- confirm.
237 elsif exp >= 874 then
238 -- denormalization required
-- Make the implicit leading 1 explicit, then shift right by the
-- negated low five exponent bits; the SP exponent field stays zero.
239 frac := '1' & l_in.data(51 downto 30);
240 shift := 0 - exp(4 downto 0);
241 store_sp_data(22 downto 0) <= shifter_23r(frac, shift);
245 -- Convert SP data to DP for lfs
-- Combinational. Reads the latched single-precision word r.ld_sp_data
-- together with r.ld_sp_nz and r.ld_sp_lz, and drives the 64-bit
-- load_dp_data.
246 sp_to_dp: process(all)
247 variable exp : unsigned(7 downto 0);
248 variable exp_dp : unsigned(10 downto 0);
249 variable exp_nz : std_ulogic;
250 variable exp_ao : std_ulogic;
251 variable frac : std_ulogic_vector(22 downto 0);
252 variable frac_shift : unsigned(4 downto 0);
254 frac := r.ld_sp_data(22 downto 0);
255 exp := unsigned(r.ld_sp_data(30 downto 23));
-- exp_nz: exponent field is non-zero; exp_ao: exponent field is all ones
256 exp_nz := or (r.ld_sp_data(30 downto 23));
257 exp_ao := and (r.ld_sp_data(30 downto 23));
258 frac_shift := (others => '0');
260 exp_dp := to_unsigned(2047, 11); -- infinity or NaN
261 elsif exp_nz = '1' then
-- 896 = 1023 - 127, the difference between the DP and SP exponent biases
262 exp_dp := 896 + resize(exp, 11); -- finite normalized value
263 elsif r.ld_sp_nz = '0' then
264 exp_dp := to_unsigned(0, 11); -- zero
266 -- denormalized SP operand, need to normalize
-- Lower the exponent by the number of leading zeros in the fraction,
-- and shift the fraction left by one more than that count, which
-- moves the leading 1 out of the 23-bit field.
267 exp_dp := 896 - resize(unsigned(r.ld_sp_lz), 11);
268 frac_shift := unsigned(r.ld_sp_lz(4 downto 0)) + 1;
-- Assemble the DP result: sign, exponent, 23 fraction bits, zero padding
270 load_dp_data(63) <= r.ld_sp_data(31);
271 load_dp_data(62 downto 52) <= std_ulogic_vector(exp_dp);
272 load_dp_data(51 downto 29) <= shifter_23l(frac, frac_shift);
273 load_dp_data(28 downto 0) <= (others => '0');
-- Main combinational process of the unit. From the register state r and
-- the inputs l_in, d_in and m_in it:
--   * formats returned load data (byte permutation, trim, sign extension),
--   * byte-reverses and rotates store data,
--   * advances the state machine for the operation in flight,
--   * latches a new request when l_in.valid = '1',
--   * drives the outputs to the dcache, MMU, writeback and execute1.
-- Updates to the register record are built in the variable v.
277 loadstore1_1: process(all)
278 variable v : reg_stage_t;
279 variable brev_lenm1 : unsigned(2 downto 0);
280 variable byte_offset : unsigned(2 downto 0);
281 variable j : integer;
282 variable k : unsigned(2 downto 0);
283 variable kk : unsigned(3 downto 0);
284 variable long_sel : std_ulogic_vector(15 downto 0);
285 variable byte_sel : std_ulogic_vector(7 downto 0);
286 variable req : std_ulogic;
287 variable busy : std_ulogic;
288 variable addr : std_ulogic_vector(63 downto 0);
289 variable maddr : std_ulogic_vector(63 downto 0);
290 variable wdata : std_ulogic_vector(63 downto 0);
291 variable write_enable : std_ulogic;
292 variable do_update : std_ulogic;
293 variable done : std_ulogic;
294 variable data_permuted : std_ulogic_vector(63 downto 0);
295 variable data_trimmed : std_ulogic_vector(63 downto 0);
296 variable store_data : std_ulogic_vector(63 downto 0);
297 variable data_in : std_ulogic_vector(63 downto 0);
298 variable byte_rev : std_ulogic;
299 variable length : std_ulogic_vector(3 downto 0);
300 variable use_second : byte_sel_t;
301 variable trim_ctl : trim_ctl_t;
302 variable negative : std_ulogic;
303 variable sprn : std_ulogic_vector(9 downto 0);
304 variable exception : std_ulogic;
305 variable next_addr : std_ulogic_vector(63 downto 0);
306 variable mmureq : std_ulogic;
307 variable dsisr : std_ulogic_vector(31 downto 0);
308 variable mmu_mtspr : std_ulogic;
309 variable itlb_fault : std_ulogic;
310 variable misaligned : std_ulogic;
311 variable fp_reg_conv : std_ulogic;
312 variable lfs_done : std_ulogic;
319 sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
320 dsisr := (others => '0');
327 do_update := r.do_update;
330 -- load data formatting
331 byte_offset := unsigned(r.addr(2 downto 0));
333 if r.byte_reverse = '1' then
334 brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
337 -- shift and byte-reverse data bytes
-- kk is the source byte index for result byte i, as a 4-bit sum. Its
-- carry bit kk(3) marks bytes that wrap past the end of the doubleword
-- and so come from the second transfer.
339 kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
340 use_second(i) := kk(3);
341 j := to_integer(kk(2 downto 0)) * 8;
342 data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
345 -- Work out the sign bit for sign extension.
346 -- For unaligned loads crossing two dwords, the sign bit is in the
347 -- first dword for big-endian (byte_reverse = 1), or the second dword
348 -- for little-endian.
349 if r.dwords_done = '1' and r.byte_reverse = '1' then
350 negative := (r.length(3) and r.load_data(63)) or
351 (r.length(2) and r.load_data(31)) or
352 (r.length(1) and r.load_data(15)) or
353 (r.length(0) and r.load_data(7));
355 negative := (r.length(3) and data_permuted(63)) or
356 (r.length(2) and data_permuted(31)) or
357 (r.length(1) and data_permuted(15)) or
358 (r.length(0) and data_permuted(7));
361 -- trim and sign-extend
-- Per byte lane, trim_ctl picks one of four sources: the saved first
-- dword (r.load_data), the current permuted data, x"FF" or x"00".
-- Lanes inside the access length get bit 1 set; for the others bit 0
-- carries the sign-extension decision.
363 if i < to_integer(unsigned(r.length)) then
364 if r.dwords_done = '1' then
365 trim_ctl(i) := '1' & not use_second(i);
370 trim_ctl(i) := '0' & (negative and r.sign_extend);
374 data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
376 data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
378 data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
380 data_trimmed(i * 8 + 7 downto i * 8) := x"00";
385 -- Single-precision FP conversion
-- Latch the conversion inputs: the converted store word, and the
-- loaded SP word with its fraction non-zero flag and leading-zero count.
386 v.st_sp_data := store_sp_data;
387 v.ld_sp_data := data_trimmed(31 downto 0);
388 v.ld_sp_nz := or (data_trimmed(22 downto 0));
389 v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0));
392 -- Byte reversing and rotating for stores.
393 -- Done in the first cycle (when l_in.valid = 1) for integer stores
394 -- and DP float stores, and in the second cycle for SP float stores.
395 store_data := r.store_data;
396 if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then
397 if HAS_FPU and r.state = FPR_CONV then
398 data_in := x"00000000" & r.st_sp_data;
399 byte_offset := unsigned(r.addr(2 downto 0));
400 byte_rev := r.byte_reverse;
403 data_in := l_in.data;
404 byte_offset := unsigned(lsu_sum(2 downto 0));
405 byte_rev := l_in.byte_reverse;
406 length := l_in.length;
409 if byte_rev = '1' then
410 brev_lenm1 := unsigned(length(2 downto 0)) - 1;
-- Source byte for store lane i: rotate by the byte offset, then apply
-- the byte-reversal mask (the XOR with brev_lenm1).
413 k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1;
414 j := to_integer(k) * 8;
415 store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j);
418 v.store_data := store_data;
420 -- compute (addr + 8) & ~7 for the second doubleword when unaligned
421 next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
424 -- We need to minimize the delay from clock to busy valid because it
425 -- gates the start of execution of the next instruction.
-- busy drops in the same cycle that the awaited dcache ack or MMU
-- completion arrives.
426 busy := r.busy and not ((r.wait_dcache and d_in.valid) or (r.wait_mmu and m_in.done));
430 if r.state /= IDLE and busy = '0' then
435 if r.dwords_done = '1' or r.state = SECOND_REQ then
437 byte_sel := r.second_bytes;
440 byte_sel := r.first_bytes;
442 if r.mode_32bit = '1' then
443 addr(63 downto 32) := (others => '0');
452 if r.second_bytes /= "00000000" then
453 v.state := SECOND_REQ;
464 if d_in.error = '1' then
465 -- dcache will discard the second request if it
466 -- gets an error on the 1st of two requests
467 if d_in.cache_paradox = '1' then
468 -- signal an interrupt straight away
-- NOTE(review): DSISR indices are written as 63 - n, so n looks like
-- a bit number counted from the MSB of a 64-bit word -- confirm
-- against the ISA.
470 dsisr(63 - 38) := not r.load;
471 -- XXX there is no architected bit for this
472 dsisr(63 - 35) := d_in.cache_paradox;
474 -- Look up the translation for TLB miss
475 -- and also for permission error and RC error
476 -- in case the PTE has been updated.
478 v.state := MMU_LOOKUP;
481 if d_in.valid = '1' then
482 if r.last_dword = '0' then
483 v.dwords_done := '1';
486 v.load_data := data_permuted;
489 write_enable := r.load and not r.load_sp;
490 if HAS_FPU and r.load_sp = '1' then
491 -- SP to DP conversion takes a cycle
492 -- Write back rA update in this cycle if needed
493 do_update := r.update;
494 v.state := FINISH_LFS;
495 elsif r.extra_cycle = '1' then
496 -- loads with rA update need an extra cycle
498 v.do_update := r.update;
500 -- stores write back rA update in this cycle
501 do_update := r.update;
506 -- r.wait_dcache gets set one cycle after we come into ACK_WAIT state,
507 -- which is OK because the dcache always takes at least two cycles.
508 v.wait_dcache := r.last_dword and not r.extra_cycle;
511 if m_in.done = '1' then
512 if r.instr_fault = '0' then
513 -- retry the request now that the MMU has installed a TLB entry
515 if r.last_dword = '0' then
516 v.state := SECOND_REQ;
522 if m_in.err = '1' then
-- Record the MMU-reported fault causes in DSISR
524 dsisr(63 - 33) := m_in.invalid;
525 dsisr(63 - 36) := m_in.perm_error;
526 dsisr(63 - 38) := not r.load;
527 dsisr(63 - 44) := m_in.badtree;
528 dsisr(63 - 45) := m_in.rc_error;
537 exception := r.align_intr;
541 if done = '1' or exception = '1' then
546 -- Note that l_in.valid is gated with busy inside execute1
547 if l_in.valid = '1' then
-- Latch the new request into v and clear per-operation progress flags
549 v.mode_32bit := l_in.mode_32bit;
553 v.instr_fault := '0';
555 v.dwords_done := '0';
557 v.write_reg := l_in.write_reg;
558 v.length := l_in.length;
559 v.byte_reverse := l_in.byte_reverse;
560 v.sign_extend := l_in.sign_extend;
561 v.update := l_in.update;
562 v.update_reg := l_in.update_reg;
564 v.reserve := l_in.reserve;
567 v.virt_mode := l_in.virt_mode;
568 v.priv_mode := l_in.priv_mode;
570 v.wait_dcache := '0';
573 v.extra_cycle := '0';
576 if l_in.mode_32bit = '1' then
577 addr(63 downto 32) := (others => '0');
579 maddr := l_in.addr2; -- address from RB for tlbie
581 -- XXX Temporary hack. Mark the op as non-cacheable if the address
582 -- is of the form 0xc------- for a real-mode access.
583 if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then
587 -- Do length_to_sel and work out if we are doing 2 dwords
588 long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
589 byte_sel := long_sel(7 downto 0);
590 v.first_bytes := byte_sel;
591 v.second_bytes := long_sel(15 downto 8);
593 -- check alignment for larx/stcx
-- misaligned when any address bit below the access size is set,
-- tested as (length - 1) AND addr(2 downto 0)
594 misaligned := or (std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1) and addr(2 downto 0));
595 v.align_intr := l_in.reserve and misaligned;
603 -- Allow an extra cycle for RA update on loads
604 v.extra_cycle := l_in.update;
606 v.align_intr := v.nc;
611 if l_in.is_32bit = '1' then
622 -- Allow an extra cycle for SP->DP precision conversion
624 v.extra_cycle := l_in.update;
625 if l_in.is_32bit = '1' then
627 v.extra_cycle := '1';
633 v.state := TLBIE_WAIT;
637 -- partial decode on SPR number should be adequate given
638 -- the restricted set that get sent down this path
639 if sprn(9) = '0' and sprn(5) = '0' then
640 if sprn(0) = '0' then
641 v.sprval := x"00000000" & r.dsisr;
646 -- reading one of the SPRs in the MMU
647 v.sprval := m_in.sprval;
651 if sprn(9) = '0' and sprn(5) = '0' then
652 if sprn(0) = '0' then
653 v.dsisr := l_in.data(31 downto 0);
659 -- writing one of the SPRs in the MMU
661 v.state := TLBIE_WAIT;
664 when OP_FETCH_FAILED =>
665 -- send it to the MMU to do the radix walk
667 v.instr_fault := '1';
669 v.state := MMU_LOOKUP;
672 assert false report "unknown op sent to loadstore1";
676 if v.align_intr = '1' then
678 elsif long_sel(15 downto 8) = "00000000" then
681 v.state := SECOND_REQ;
-- The unit is busy whenever the new request started any activity
685 v.busy := req or mmureq or mmu_mtspr or fp_reg_conv;
688 -- Update outputs to dcache
-- A request that raised an alignment interrupt is not sent to the dcache
689 d_out.valid <= req and not v.align_intr;
690 d_out.load <= v.load;
691 d_out.dcbz <= v.dcbz;
693 d_out.reserve <= v.reserve;
695 d_out.data <= store_data;
696 d_out.byte_sel <= byte_sel;
697 d_out.virt_mode <= v.virt_mode;
698 d_out.priv_mode <= v.priv_mode;
700 -- Update outputs to MMU
701 m_out.valid <= mmureq;
702 m_out.iside <= v.instr_fault;
703 m_out.load <= r.load;
704 m_out.priv <= r.priv_mode;
705 m_out.tlbie <= v.tlbie;
706 m_out.mtspr <= mmu_mtspr;
709 m_out.slbia <= l_in.insn(7);
710 m_out.rs <= l_in.data;
712 -- Update outputs to writeback
713 -- Multiplex either cache data to the destination GPR or
714 -- the address for the rA update.
-- Priority order: mfspr result, then rA update, then the converted
-- lfs result, then the trimmed load data.
716 if r.mfspr = '1' then
717 l_out.write_enable <= '1';
718 l_out.write_reg <= r.write_reg;
719 l_out.write_data <= r.sprval;
720 elsif do_update = '1' then
721 l_out.write_enable <= '1';
722 l_out.write_reg <= gpr_to_gspr(r.update_reg);
723 l_out.write_data <= r.addr;
724 elsif lfs_done = '1' then
725 l_out.write_enable <= '1';
726 l_out.write_reg <= r.write_reg;
727 l_out.write_data <= load_dp_data;
729 l_out.write_enable <= write_enable;
730 l_out.write_reg <= r.write_reg;
731 l_out.write_data <= data_trimmed;
733 l_out.xerc <= r.xerc;
734 l_out.rc <= r.rc and done;
735 l_out.store_done <= d_in.store_done;
737 -- update exception info back to execute1
739 e_out.exception <= exception;
740 e_out.alignment <= r.align_intr;
741 e_out.instr_fault <= r.instr_fault;
742 e_out.invalid <= m_in.invalid;
743 e_out.badtree <= m_in.badtree;
744 e_out.perm_error <= m_in.perm_error;
745 e_out.rc_error <= m_in.rc_error;
746 e_out.segment_fault <= m_in.segerr;
747 if exception = '1' and r.instr_fault = '0' then
749 if m_in.segerr = '0' and r.align_intr = '0' then
-- Optional logging, only generated when LOG_LENGTH > 0. On each rising
-- clock edge a 10-bit word is captured into log_data. The visible parts
-- of that word are e_out.busy and the 3-bit position of the current
-- state in state_t.
759 l1_log: if LOG_LENGTH > 0 generate
760 signal log_data : std_ulogic_vector(9 downto 0);
762 ls1_log: process(clk)
764 if rising_edge(clk) then
765 log_data <= e_out.busy &
772 std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3));