library work;
use work.decode_types.all;
use work.common.all;
+use work.insn_helpers.all;
+use work.helpers.all;
-- 2 cycle LSU
-- We calculate the address in the first cycle
entity loadstore1 is
+ generic (
+ HAS_FPU : boolean := true; -- when true, the SP<->DP float conversion logic (ls_fp_conv) is generated and used
+ -- Non-zero to enable log data collection
+ LOG_LENGTH : natural := 0
+ );
port (
clk : in std_ulogic;
rst : in std_ulogic;
);
end loadstore1;
--- Note, we don't currently use the stall output from the dcache because
--- we know it can take two requests without stalling when idle, we are
--- its only user, and we know it never stalls when idle.
-
architecture behave of loadstore1 is
-- State machine for unaligned loads/stores
type state_t is (IDLE, -- ready for instruction
- SECOND_REQ, -- send 2nd request of unaligned xfer
- ACK_WAIT, -- waiting for ack from dcache
- LD_UPDATE, -- writing rA with computed addr on load
MMU_LOOKUP, -- waiting for MMU to look up translation
- TLBIE_WAIT -- waiting for MMU to finish doing a tlbie
+ TLBIE_WAIT, -- waiting for MMU to finish an mmu_op that is not an instruction fault (tlbie, or mtspr to an MMU SPR)
+ FINISH_LFS -- extra cycle: write back the SP->DP converted data for lfs*
);
- type reg_stage_t is record
- busy : std_ulogic;
- -- latch most of the input request
+ type byte_index_t is array(0 to 7) of unsigned(2 downto 0);
+ subtype byte_trim_t is std_ulogic_vector(1 downto 0);
+ type trim_ctl_t is array(0 to 7) of byte_trim_t;
+
+ type request_t is record -- decoded form of one load/store unit operation, as built by loadstore1_in
+ valid : std_ulogic; -- copy of l_in.valid
+ dc_req : std_ulogic; -- valid load, store or dcbz with no alignment interrupt: must be sent to the dcache
load : std_ulogic;
+ store : std_ulogic; -- set for OP_STORE
tlbie : std_ulogic;
dcbz : std_ulogic;
+ read_spr : std_ulogic; -- set for OP_MFSPR
+ write_spr : std_ulogic; -- set for OP_MTSPR
+ mmu_op : std_ulogic; -- tlbie, failed instruction fetch, or mtspr with sprn(9) or sprn(5) set
+ instr_fault : std_ulogic; -- set for OP_FETCH_FAILED
+ load_zero : std_ulogic; -- NOTE(review): never set in the visible code; when set, stage 2 treats the op as one-cycle and stage 3 asserts write_enable
+ do_update : std_ulogic; -- second pass of an update-form load: write the address back to RA
+ noop : std_ulogic; -- NOTE(review): never set in the visible code; when set, stage 2 treats the op as one-cycle
+ mode_32bit : std_ulogic; -- copy of l_in.mode_32bit; address bits 63..32 are forced to zero when set
addr : std_ulogic_vector(63 downto 0);
+ addr0 : std_ulogic_vector(63 downto 0); -- address as first computed; kept when addr is overridden (tlbie, fetch fault, second dword)
+ byte_sel : std_ulogic_vector(7 downto 0); -- byte enables for the doubleword currently being requested
+ second_bytes : std_ulogic_vector(7 downto 0); -- byte enables for the following doubleword (all zero if not needed)
store_data : std_ulogic_vector(63 downto 0);
- load_data : std_ulogic_vector(63 downto 0);
- write_reg : gpr_index_t;
+ instr_tag : instr_tag_t; -- copy of l_in.instr_tag
+ write_reg : gspr_index_t; -- copy of l_in.write_reg
length : std_ulogic_vector(3 downto 0);
+ elt_length : std_ulogic_vector(3 downto 0); -- currently assigned the same value as length (l_in.length)
byte_reverse : std_ulogic;
+ brev_mask : unsigned(2 downto 0); -- length - 1 when byte_reverse is set, else 0; XORed into byte indices
sign_extend : std_ulogic;
update : std_ulogic;
- update_reg : gpr_index_t;
xerc : xer_common_t;
reserve : std_ulogic;
+ atomic : std_ulogic; -- set when the access is not misaligned
+ atomic_last : std_ulogic; -- atomic and either the second half of a repeated op or not a repeated op
rc : std_ulogic;
nc : std_ulogic; -- non-cacheable access
virt_mode : std_ulogic;
priv_mode : std_ulogic;
+ load_sp : std_ulogic; -- 32-bit load with HAS_FPU: needs the SP->DP conversion cycle
+ sprn : std_ulogic_vector(9 downto 0); -- SPR number decoded from l_in.insn
+ is_slbia : std_ulogic; -- l_in.insn(7), captured for OP_TLBIE
+ align_intr : std_ulogic; -- this request must raise an alignment interrupt instead of accessing the dcache
+ dword_index : std_ulogic; -- '1' once this request has been turned into the second-doubleword request
+ two_dwords : std_ulogic; -- access spans two doublewords (second_bytes is non-zero)
+ nia : std_ulogic_vector(63 downto 0); -- copy of l_in.nia; recorded in r3.nia when an exception is raised
+ end record;
+    -- Default value for a request_t: no operation selected and every
+    -- address, data and byte-enable field zero.  Associations are listed
+    -- in record declaration order, one per line.
+    constant request_init : request_t := (
+        valid        => '0',
+        dc_req       => '0',
+        load         => '0',
+        store        => '0',
+        tlbie        => '0',
+        dcbz         => '0',
+        read_spr     => '0',
+        write_spr    => '0',
+        mmu_op       => '0',
+        instr_fault  => '0',
+        load_zero    => '0',
+        do_update    => '0',
+        noop         => '0',
+        mode_32bit   => '0',
+        addr         => (others => '0'),
+        addr0        => (others => '0'),
+        byte_sel     => x"00",
+        second_bytes => x"00",
+        store_data   => (others => '0'),
+        instr_tag    => instr_tag_init,
+        write_reg    => 7x"00",
+        length       => x"0",
+        elt_length   => x"0",
+        byte_reverse => '0',
+        brev_mask    => "000",
+        sign_extend  => '0',
+        update       => '0',
+        xerc         => xerc_init,
+        reserve      => '0',
+        atomic       => '0',
+        atomic_last  => '0',
+        rc           => '0',
+        nc           => '0',
+        virt_mode    => '0',
+        priv_mode    => '0',
+        load_sp      => '0',
+        sprn         => 10x"0",
+        is_slbia     => '0',
+        align_intr   => '0',
+        dword_index  => '0',
+        two_dwords   => '0',
+        nia          => (others => '0')
+    );
+
+ type reg_stage1_t is record -- stage 1 pipeline register
+ req : request_t; -- the request currently held in stage 1
+ issued : std_ulogic; -- takes the value of the dcache request strobe for req; cleared again on d_in.error
+ end record;
+
+ type reg_stage2_t is record -- stage 2 pipeline register
+ req : request_t; -- request copied from r1, with store_data replaced by the formatted store data
+ byte_index : byte_index_t; -- for each result byte, which byte of d_in.data to take (load formatter control)
+ use_second : std_ulogic_vector(7 downto 0); -- carry out of the byte index sum; used by the trim logic for second-doubleword requests
+ wait_dc : std_ulogic; -- completion is signalled by d_in.valid
+ wait_mmu : std_ulogic; -- completion is signalled by m_in.done
+ one_cycle : std_ulogic; -- operation completes without waiting for dcache or MMU
+ wr_sel : std_ulogic_vector(1 downto 0); -- write data select: 00 mfspr value, 01 addr0 (RA update), 10 lfs result, 11 load data
+ end record;
+
+ type reg_stage3_t is record -- stage 3 pipeline register
state : state_t;
- dwords_done : std_ulogic;
- first_bytes : std_ulogic_vector(7 downto 0);
- second_bytes : std_ulogic_vector(7 downto 0);
+ instr_tag : instr_tag_t;
+ write_enable : std_ulogic;
+ write_reg : gspr_index_t;
+ write_data : std_ulogic_vector(63 downto 0);
+ rc : std_ulogic;
+ xerc : xer_common_t;
+ store_done : std_ulogic;
+ convert_lfs : std_ulogic; -- set when an lfs* load has returned data and the conversion cycle follows; also raises complete
+ load_data : std_ulogic_vector(63 downto 0); -- permuted dcache data captured on d_in.valid for a load
dar : std_ulogic_vector(63 downto 0);
dsisr : std_ulogic_vector(31 downto 0);
- instr_fault : std_ulogic;
+ ld_sp_data : std_ulogic_vector(31 downto 0); -- low 32 bits of the trimmed load data (SP operand for lfs*)
+ ld_sp_nz : std_ulogic; -- OR of the SP fraction bits (22 downto 0)
+ ld_sp_lz : std_ulogic_vector(5 downto 0); -- count_left_zeroes of the SP fraction bits
+ stage1_en : std_ulogic; -- cleared when a dcache error starts an MMU lookup; gates stage1_issue_enable
+ interrupt : std_ulogic; -- registered copy of stage 3's exception flag
+ intr_vec : integer range 0 to 16#fff#; -- interrupt vector: 16#300#, 16#380#, 16#400#, 16#480# or 16#600#
+ nia : std_ulogic_vector(63 downto 0); -- r2.req.nia of the request that raised the exception
+ srr1 : std_ulogic_vector(15 downto 0); -- status bits set from m_in for instruction-fetch faults
end record;
- type byte_sel_t is array(0 to 7) of std_ulogic;
- subtype byte_trim_t is std_ulogic_vector(1 downto 0);
- type trim_ctl_t is array(0 to 7) of byte_trim_t;
+ signal req_in : request_t;
+ signal r1, r1in : reg_stage1_t;
+ signal r2, r2in : reg_stage2_t;
+ signal r3, r3in : reg_stage3_t;
+
+ signal busy : std_ulogic;
+ signal complete : std_ulogic;
+ signal in_progress : std_ulogic;
+ signal flushing : std_ulogic;
- signal r, rin : reg_stage_t;
- signal lsu_sum : std_ulogic_vector(63 downto 0);
+ signal store_sp_data : std_ulogic_vector(31 downto 0);
+ signal load_dp_data : std_ulogic_vector(63 downto 0);
+ signal store_data : std_ulogic_vector(63 downto 0);
- signal log_data : std_ulogic_vector(9 downto 0);
+ signal stage1_issue_enable : std_ulogic;
+ signal stage1_req : request_t;
+ signal stage1_dcreq : std_ulogic;
+ signal stage1_dreq : std_ulogic;
+ signal stage2_busy_next : std_ulogic;
+ signal stage3_busy_next : std_ulogic;
-- Generate byte enables from sizes
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
to_integer(unsigned(address))));
end function xfer_data_sel;
-begin
- -- Calculate the address in the first cycle
- lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
+    -- 23-bit right shifter for DP -> SP float conversions.
+    -- Logical (zero-filling) right shift built from two cascaded mux stages:
+    --   shift(1 downto 0) shifts by 0 to 3 places;
+    --   shift(4 downto 2) shifts by a further 0, 4, 8, 12 or 16 places,
+    --   and any code above "100" shifts by 20.
+    function shifter_23r(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+        return std_ulogic_vector is
+        variable fine   : std_ulogic_vector(22 downto 0);
+        variable coarse : std_ulogic_vector(22 downto 0);
+    begin
+        -- Stage 1: pre-clear, then drop the shifted bits into the low end.
+        fine := (others => '0');
+        case shift(1 downto 0) is
+            when "00" =>
+                fine := frac;
+            when "01" =>
+                fine(21 downto 0) := frac(22 downto 1);
+            when "10" =>
+                fine(20 downto 0) := frac(22 downto 2);
+            when others =>
+                fine(19 downto 0) := frac(22 downto 3);
+        end case;
+
+        -- Stage 2: same scheme in steps of four bit positions.
+        coarse := (others => '0');
+        case shift(4 downto 2) is
+            when "000" =>
+                coarse := fine;
+            when "001" =>
+                coarse(18 downto 0) := fine(22 downto 4);
+            when "010" =>
+                coarse(14 downto 0) := fine(22 downto 8);
+            when "011" =>
+                coarse(10 downto 0) := fine(22 downto 12);
+            when "100" =>
+                coarse(6 downto 0) := fine(22 downto 16);
+            when others =>
+                coarse(2 downto 0) := fine(22 downto 20);
+        end case;
+        return coarse;
+    end;
+
+    -- 23-bit left shifter for SP -> DP float conversions.
+    -- Logical (zero-filling) left shift built from two cascaded mux stages:
+    --   shift(1 downto 0) shifts by 0 to 3 places;
+    --   shift(4 downto 2) shifts by a further 0, 4, 8, 12 or 16 places,
+    --   and any code above "100" shifts by 20.
+    function shifter_23l(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+        return std_ulogic_vector is
+        variable fine   : std_ulogic_vector(22 downto 0);
+        variable coarse : std_ulogic_vector(22 downto 0);
+    begin
+        -- Stage 1: pre-clear, then drop the shifted bits into the high end.
+        fine := (others => '0');
+        case shift(1 downto 0) is
+            when "00" =>
+                fine := frac;
+            when "01" =>
+                fine(22 downto 1) := frac(21 downto 0);
+            when "10" =>
+                fine(22 downto 2) := frac(20 downto 0);
+            when others =>
+                fine(22 downto 3) := frac(19 downto 0);
+        end case;
+
+        -- Stage 2: same scheme in steps of four bit positions.
+        coarse := (others => '0');
+        case shift(4 downto 2) is
+            when "000" =>
+                coarse := fine;
+            when "001" =>
+                coarse(22 downto 4) := fine(18 downto 0);
+            when "010" =>
+                coarse(22 downto 8) := fine(14 downto 0);
+            when "011" =>
+                coarse(22 downto 12) := fine(10 downto 0);
+            when "100" =>
+                coarse(22 downto 16) := fine(6 downto 0);
+            when others =>
+                coarse(22 downto 20) := fine(2 downto 0);
+        end case;
+        return coarse;
+    end;
- loadstore1_0: process(clk)
+begin
+ loadstore1_reg: process(clk) -- clocked process holding the r1/r2/r3 pipeline registers and the flushing flag
begin
if rising_edge(clk) then
if rst = '1' then
- r.state <= IDLE;
- r.busy <= '0';
+ r1.req.valid <= '0'; -- reset only assigns selected control fields; the rest of r1/r2/r3 is not assigned here
+ r2.req.valid <= '0';
+ r2.wait_dc <= '0';
+ r2.wait_mmu <= '0';
+ r2.one_cycle <= '0';
+ r3.dar <= (others => '0');
+ r3.dsisr <= (others => '0');
+ r3.state <= IDLE;
+ r3.write_enable <= '0';
+ r3.interrupt <= '0';
+ r3.stage1_en <= '1'; -- stage 1 is allowed to issue dcache requests after reset
+ r3.convert_lfs <= '0';
+ flushing <= '0';
else
- r <= rin;
+ r1 <= r1in;
+ r2 <= r2in;
+ r3 <= r3in;
+ flushing <= (flushing or (r1in.req.valid and r1in.req.align_intr)) and -- set when a request with align_intr enters r1
+ not r3in.interrupt; -- and cleared when stage 3 raises an interrupt
+ end if;
+ stage1_dreq <= stage1_dcreq; -- registered copy of stage1_dcreq; updated on every clock edge, including during reset
+ if d_in.valid = '1' then -- simulation check: a dcache completion must match a dcache request in r2 with stage 3 idle
+ assert r2.req.valid = '1' and r2.req.dc_req = '1' and r3.state = IDLE severity failure;
+ end if;
+ if d_in.error = '1' then -- same check for a dcache error response
+ assert r2.req.valid = '1' and r2.req.dc_req = '1' and r3.state = IDLE severity failure;
+ end if;
+ if m_in.done = '1' or m_in.err = '1' then -- an MMU response is only expected while stage 3 waits in MMU_LOOKUP or TLBIE_WAIT
+ assert r2.req.valid = '1' and (r3.state = MMU_LOOKUP or r3.state = TLBIE_WAIT) severity failure;
end if;
end if;
end process;
- loadstore1_1: process(all)
- variable v : reg_stage_t;
+    ls_fp_conv: if HAS_FPU generate
+        -- Convert DP data to SP for stfs.
+        -- The result is assembled in a local variable and driven onto
+        -- store_sp_data with a single assignment.
+        dp_to_sp: process(all)
+            variable dp_exp : unsigned(10 downto 0);
+            variable mant   : std_ulogic_vector(22 downto 0);
+            variable rshift : unsigned(4 downto 0);
+            variable sp     : std_ulogic_vector(31 downto 0);
+        begin
+            dp_exp := unsigned(l_in.data(62 downto 52));
+
+            -- default: signed zero
+            sp := (others => '0');
+            sp(31) := l_in.data(63);
+
+            if dp_exp > 896 then
+                -- exponent above 896: take the top exponent bit plus the
+                -- next 30 bits of exponent and fraction directly
+                sp(30) := l_in.data(62);
+                sp(29 downto 0) := l_in.data(58 downto 29);
+            elsif dp_exp >= 874 then
+                -- denormalization required: make the implicit 1 explicit
+                -- and shift the fraction right
+                mant := '1' & l_in.data(51 downto 30);
+                rshift := 0 - dp_exp(4 downto 0);
+                sp(22 downto 0) := shifter_23r(mant, rshift);
+            end if;
+
+            store_sp_data <= sp;
+        end process;
+
+        -- Convert SP data to DP for lfs.
+        -- Operates on the SP word and fraction statistics held in r3.
+        sp_to_dp: process(all)
+            variable sp_exp   : unsigned(7 downto 0);
+            variable dp_exp   : unsigned(10 downto 0);
+            variable exp_any  : std_ulogic;
+            variable exp_all  : std_ulogic;
+            variable mant     : std_ulogic_vector(22 downto 0);
+            variable lshift   : unsigned(4 downto 0);
+            variable dp       : std_ulogic_vector(63 downto 0);
+        begin
+            mant := r3.ld_sp_data(22 downto 0);
+            sp_exp := unsigned(r3.ld_sp_data(30 downto 23));
+            exp_any := or (r3.ld_sp_data(30 downto 23));
+            exp_all := and (r3.ld_sp_data(30 downto 23));
+
+            lshift := (others => '0');
+            if exp_all = '1' then
+                -- infinity or NaN
+                dp_exp := to_unsigned(2047, 11);
+            elsif exp_any = '1' then
+                -- finite normalized value
+                dp_exp := 896 + resize(sp_exp, 11);
+            elsif r3.ld_sp_nz = '0' then
+                -- zero
+                dp_exp := to_unsigned(0, 11);
+            else
+                -- denormalized SP operand, need to normalize
+                dp_exp := 896 - resize(unsigned(r3.ld_sp_lz), 11);
+                lshift := unsigned(r3.ld_sp_lz(4 downto 0)) + 1;
+            end if;
+
+            dp := (others => '0');
+            dp(63) := r3.ld_sp_data(31);
+            dp(62 downto 52) := std_ulogic_vector(dp_exp);
+            dp(51 downto 29) := shifter_23l(mant, lshift);
+            load_dp_data <= dp;
+        end process;
+    end generate;
+
+ -- Translate a load/store instruction into the internal request format
+ -- XXX this should only depend on l_in, but actually depends on
+ -- r1.req.addr0 as well (in the l_in.second = 1 case).
+ loadstore1_in: process(all)
+ variable v : request_t;
+ variable lsu_sum : std_ulogic_vector(63 downto 0);
variable brev_lenm1 : unsigned(2 downto 0);
- variable byte_offset : unsigned(2 downto 0);
+ variable long_sel : std_ulogic_vector(15 downto 0);
+ variable addr : std_ulogic_vector(63 downto 0);
+ variable sprn : std_ulogic_vector(9 downto 0);
+ variable misaligned : std_ulogic;
+ variable addr_mask : std_ulogic_vector(2 downto 0);
+ begin
+ v := request_init; -- start from the default request so every field has a defined value
+ sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
+
+ v.valid := l_in.valid;
+ v.instr_tag := l_in.instr_tag;
+ v.mode_32bit := l_in.mode_32bit;
+ v.write_reg := l_in.write_reg;
+ v.length := l_in.length;
+ v.elt_length := l_in.length;
+ v.byte_reverse := l_in.byte_reverse;
+ v.sign_extend := l_in.sign_extend;
+ v.update := l_in.update;
+ v.xerc := l_in.xerc;
+ v.reserve := l_in.reserve;
+ v.rc := l_in.rc;
+ v.nc := l_in.ci;
+ v.virt_mode := l_in.virt_mode;
+ v.priv_mode := l_in.priv_mode;
+ v.sprn := sprn;
+ v.nia := l_in.nia;
+
+ lsu_sum := std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)); -- effective address = addr1 + addr2
+
+ if HAS_FPU and l_in.is_32bit = '1' then -- 32-bit store data comes from the DP->SP converter
+ v.store_data := x"00000000" & store_sp_data;
+ else
+ v.store_data := l_in.data;
+ end if;
+
+ addr := lsu_sum;
+
+ if l_in.second = '1' then
+ if l_in.update = '0' then
+ -- for the second half of a 16-byte transfer,
+ -- use the previous address plus 8.
+ addr := std_ulogic_vector(unsigned(r1.req.addr0(63 downto 3)) + 1) & r1.req.addr0(2 downto 0);
+ else
+ -- for an update-form load, use the previous address
+ -- as the value to write back to RA.
+ addr := r1.req.addr0;
+ end if;
+ end if;
+ if l_in.mode_32bit = '1' then -- 32-bit mode: clear the upper half of the address
+ addr(63 downto 32) := (others => '0');
+ end if;
+ v.addr := addr;
+ v.addr0 := addr; -- addr0 keeps this value even when v.addr is overridden in the case statement below
+
+ -- XXX Temporary hack. Mark the op as non-cachable if the address
+ -- is the form 0xc------- for a real-mode access.
+ if addr(31 downto 28) = "1100" and l_in.virt_mode = '0' then
+ v.nc := '1';
+ end if;
+
+ addr_mask := std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1); -- length - 1 on the low 3 bits: address bits that must be zero for alignment
+
+ -- Do length_to_sel and work out if we are doing 2 dwords
+ long_sel := xfer_data_sel(v.length, addr(2 downto 0));
+ v.byte_sel := long_sel(7 downto 0);
+ v.second_bytes := long_sel(15 downto 8);
+ if long_sel(15 downto 8) /= "00000000" then
+ v.two_dwords := '1';
+ end if;
+
+ -- check alignment for larx/stcx
+ misaligned := or (addr_mask and addr(2 downto 0));
+ v.align_intr := l_in.reserve and misaligned;
+ if l_in.repeat = '1' and l_in.second = '0' and l_in.update = '0' and addr(3) = '1' then
+ -- length is really 16 not 8
+ -- Make misaligned lq cause an alignment interrupt in LE mode,
+ -- in order to avoid the case with RA = RT + 1 where the second half
+ -- faults but the first doesn't (and updates RT+1, destroying RA).
+ -- The equivalent BE case doesn't occur because RA = RT is illegal.
+ misaligned := '1';
+ if l_in.reserve = '1' or (l_in.op = OP_LOAD and l_in.byte_reverse = '0') then
+ v.align_intr := '1';
+ end if;
+ end if;
+
+ v.atomic := not misaligned;
+ v.atomic_last := not misaligned and (l_in.second or not l_in.repeat);
+
+ case l_in.op is
+ when OP_STORE =>
+ v.store := '1';
+ when OP_LOAD =>
+ if l_in.update = '0' or l_in.second = '0' then -- non-update load, or first pass of an update-form load
+ v.load := '1';
+ if HAS_FPU and l_in.is_32bit = '1' then
+ -- Allow an extra cycle for SP->DP precision conversion
+ v.load_sp := '1';
+ end if;
+ else
+ -- write back address to RA
+ v.do_update := '1';
+ end if;
+ when OP_DCBZ =>
+ v.dcbz := '1';
+ v.align_intr := v.nc; -- dcbz on a non-cacheable access raises an alignment interrupt (overrides the value computed above)
+ when OP_TLBIE =>
+ v.tlbie := '1';
+ v.addr := l_in.addr2; -- address from RB for tlbie
+ v.is_slbia := l_in.insn(7);
+ v.mmu_op := '1';
+ when OP_MFSPR =>
+ v.read_spr := '1';
+ when OP_MTSPR =>
+ v.write_spr := '1';
+ v.mmu_op := sprn(9) or sprn(5); -- SPRs with sprn(9) or sprn(5) set are handled by the MMU
+ when OP_FETCH_FAILED =>
+ -- send it to the MMU to do the radix walk
+ v.instr_fault := '1';
+ v.addr := l_in.nia;
+ v.mmu_op := '1';
+ when others =>
+ end case;
+ v.dc_req := l_in.valid and (v.load or v.store or v.dcbz) and not v.align_intr; -- only these ops go to the dcache, and never when an alignment interrupt is pending
+
+ -- Work out controls for load and store formatting
+ brev_lenm1 := "000";
+ if v.byte_reverse = '1' then
+ brev_lenm1 := unsigned(v.length(2 downto 0)) - 1;
+ end if;
+ v.brev_mask := brev_lenm1;
+
+ req_in <= v; -- decoded request, consumed by loadstore1_1
+ end process;
+
+ busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or -- r1 holds a dcache request that has not been issued yet,
+ (r1.issued and d_in.error) or -- or an issued request is seeing a dcache error,
+ stage2_busy_next or -- or r1 is valid while stage 3 will be busy next cycle,
+ (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index)); -- or the second-doubleword request still has to be generated
+ complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or -- the operation in r2 finishes this cycle
+ (r2.wait_mmu and m_in.done) or r3.convert_lfs;
+ in_progress <= r1.req.valid or (r2.req.valid and not complete); -- some request is still in r1 or unfinished in r2
+
+ stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and -- no new dcache requests while stage 3 has disabled stage 1
+ not (r2.req.valid and r2.req.mmu_op); -- or while an MMU operation is in r1 or r2
+
+    -- Processing done in the first cycle of a load/store instruction.
+    --
+    -- Computes the next value of the stage 1 register (r1in) and the
+    -- request/strobe pair presented by this stage (stage1_req and
+    -- stage1_dcreq).  Three situations are handled:
+    --   * a new instruction arrives (l_in.valid): latch req_in and try to
+    --     issue its dcache request immediately;
+    --   * r1 holds a dcache request that is not issued, or whose issue
+    --     coincided with d_in.error: (re)issue it;
+    --   * r1 can move on to r2: either turn it into the second-doubleword
+    --     request of an access spanning two doublewords, or invalidate it.
+    -- An interrupt raised by stage 3 (r3in.interrupt) cancels everything.
+    loadstore1_1: process(all)
+        variable v     : reg_stage1_t;
+        variable req   : request_t;
+        variable dcreq : std_ulogic;
+    begin
+        v := r1;
+        dcreq := '0';
+        req := req_in;
+        if flushing = '1' then
+            -- Make this a no-op request rather than simply invalid.
+            -- It will never get to stage 3 since there is a request ahead of
+            -- it with align_intr = 1.
+            req.dc_req := '0';
+        end if;
+
+        -- Note that l_in.valid is gated with busy inside execute1
+        if l_in.valid = '1' then
+            dcreq := req.dc_req and stage1_issue_enable and not d_in.error and not dc_stall;
+            v.req := req;
+            v.issued := dcreq;
+        elsif r1.req.valid = '1' then
+            if r1.req.dc_req = '1' and r1.issued = '0' then
+                -- request is still waiting to be issued: try again
+                req := r1.req;
+                dcreq := stage1_issue_enable and not dc_stall and not d_in.error;
+                v.issued := dcreq;
+            elsif r1.issued = '1' and d_in.error = '1' then
+                -- mark the request as not issued so that it is sent again
+                v.issued := '0';
+            elsif stage2_busy_next = '0' then
+                -- we can change what's in r1 next cycle because the current thing
+                -- in r1 will go into r2
+                if r1.req.dc_req = '1' and r1.req.two_dwords = '1' and r1.req.dword_index = '0' then
+                    -- construct the second request for a misaligned access
+                    v.req.dword_index := '1';
+                    v.req.addr := std_ulogic_vector(unsigned(r1.req.addr(63 downto 3)) + 1) & "000";
+                    if r1.req.mode_32bit = '1' then
+                        -- discard the carry out of bit 31 in 32-bit mode
+                        v.req.addr(32) := '0';
+                    end if;
+                    v.req.byte_sel := r1.req.second_bytes;
+                    v.issued := stage1_issue_enable and not dc_stall;
+                    dcreq := stage1_issue_enable and not dc_stall;
+                    req := v.req;
+                else
+                    v.req.valid := '0';
+                end if;
+            end if;
+        end if;
+        if r3in.interrupt = '1' then
+            -- stage 3 is raising an interrupt: drop the request in stage 1
+            v.req.valid := '0';
+            dcreq := '0';
+        end if;
+
+        stage1_req <= req;
+        stage1_dcreq <= dcreq;
+        r1in <= v;
+    end process;
+
+ -- Processing done in the second cycle of a load/store instruction.
+ -- Store data is formatted here and sent to the dcache.
+ -- The request in r1 is sent to stage 3 if stage 3 will not be busy next cycle.
+ loadstore1_2: process(all)
+ variable v : reg_stage2_t;
variable j : integer;
variable k : unsigned(2 downto 0);
variable kk : unsigned(3 downto 0);
- variable long_sel : std_ulogic_vector(15 downto 0);
- variable byte_sel : std_ulogic_vector(7 downto 0);
- variable req : std_ulogic;
- variable stall : std_ulogic;
- variable addr : std_ulogic_vector(63 downto 0);
- variable wdata : std_ulogic_vector(63 downto 0);
- variable write_enable : std_ulogic;
- variable do_update : std_ulogic;
- variable two_dwords : std_ulogic;
- variable done : std_ulogic;
+ variable idx : unsigned(2 downto 0);
+ variable byte_offset : unsigned(2 downto 0);
+ begin
+ v := r2;
+
+ -- Byte reversing and rotating for stores.
+ -- Done in the second cycle (the cycle after l_in.valid = 1).
+ byte_offset := unsigned(r1.req.addr0(2 downto 0));
+ for i in 0 to 7 loop
+ k := (to_unsigned(i, 3) - byte_offset) xor r1.req.brev_mask; -- source byte for output byte lane i: rotate by the address offset, then apply the byte-reverse mask
+ j := to_integer(k) * 8;
+ store_data(i * 8 + 7 downto i * 8) <= r1.req.store_data(j + 7 downto j);
+ end loop;
+
+ if stage3_busy_next = '0' and
+ (r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0') then -- r1 is empty, already issued to the dcache, or needs no dcache access
+ v.req := r1.req;
+ v.req.store_data := store_data; -- keep the formatted (rotated/reversed) store data from here on
+ v.wait_dc := r1.req.valid and r1.req.dc_req and not r1.req.load_sp and
+ not (r1.req.two_dwords and not r1.req.dword_index); -- not for lfs* loads, nor for the first half of a two-doubleword access
+ v.wait_mmu := r1.req.valid and r1.req.mmu_op;
+ v.one_cycle := r1.req.valid and (r1.req.noop or r1.req.read_spr or
+ (r1.req.write_spr and not r1.req.mmu_op) or
+ r1.req.load_zero or r1.req.do_update);
+ if r1.req.read_spr = '1' then -- choose the source of the register write data used in stage 3
+ v.wr_sel := "00";
+ elsif r1.req.do_update = '1' or r1.req.store = '1' then
+ v.wr_sel := "01";
+ elsif r1.req.load_sp = '1' then
+ v.wr_sel := "10";
+ else
+ v.wr_sel := "11";
+ end if;
+
+ -- Work out load formatter controls for next cycle
+ for i in 0 to 7 loop
+ idx := to_unsigned(i, 3) xor r1.req.brev_mask;
+ kk := ('0' & idx) + ('0' & byte_offset); -- 4-bit sum: bit 3 is recorded in use_second, the low 3 bits are the byte index
+ v.use_second(i) := kk(3);
+ v.byte_index(i) := kk(2 downto 0);
+ end loop;
+ elsif stage3_busy_next = '0' then -- r1 holds a dcache request that has not been issued yet: r2 becomes empty
+ v.req.valid := '0';
+ v.wait_dc := '0';
+ v.wait_mmu := '0'; -- NOTE(review): one_cycle is not cleared in this branch although complete reads r2.one_cycle ungated; confirm a stale value cannot occur
+ end if;
+
+ stage2_busy_next <= r1.req.valid and stage3_busy_next;
+
+ if r3in.interrupt = '1' then -- stage 3 is raising an interrupt: drop the request in stage 2
+ v.req.valid := '0';
+ end if;
+
+ r2in <= v;
+ end process;
+
+ -- Processing done in the third cycle of a load/store instruction.
+ -- At this stage we can do things that have side effects without
+ -- fear of the instruction getting flushed. This is the point at
+ -- which requests get sent to the MMU.
+ loadstore1_3: process(all)
+ variable v : reg_stage3_t;
+ variable j : integer;
+ variable req : std_ulogic;
+ variable mmureq : std_ulogic;
+ variable mmu_mtspr : std_ulogic;
+ variable write_enable : std_ulogic;
+ variable write_data : std_ulogic_vector(63 downto 0);
+ variable do_update : std_ulogic;
+ variable done : std_ulogic;
+ variable part_done : std_ulogic;
+ variable exception : std_ulogic;
variable data_permuted : std_ulogic_vector(63 downto 0);
- variable data_trimmed : std_ulogic_vector(63 downto 0);
- variable use_second : byte_sel_t;
- variable trim_ctl : trim_ctl_t;
- variable negative : std_ulogic;
- variable mfspr : std_ulogic;
- variable sprn : std_ulogic_vector(9 downto 0);
- variable sprval : std_ulogic_vector(63 downto 0);
- variable exception : std_ulogic;
- variable next_addr : std_ulogic_vector(63 downto 0);
- variable mmureq : std_ulogic;
- variable dsisr : std_ulogic_vector(31 downto 0);
- variable mmu_mtspr : std_ulogic;
- variable itlb_fault : std_ulogic;
+ variable data_trimmed : std_ulogic_vector(63 downto 0);
+ variable sprval : std_ulogic_vector(63 downto 0);
+ variable negative : std_ulogic;
+ variable dsisr : std_ulogic_vector(31 downto 0);
+ variable itlb_fault : std_ulogic;
+ variable trim_ctl : trim_ctl_t;
begin
- v := r;
+ v := r3;
+
req := '0';
- stall := '0';
- done := '0';
- byte_sel := (others => '0');
- addr := lsu_sum;
- mfspr := '0';
+ mmureq := '0';
mmu_mtspr := '0';
- itlb_fault := '0';
- sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
- sprval := (others => '0'); -- avoid inferred latches
+ done := '0';
+ part_done := '0';
exception := '0';
dsisr := (others => '0');
- mmureq := '0';
-
write_enable := '0';
+ sprval := (others => '0');
do_update := '0';
- two_dwords := or (r.second_bytes);
+ v.convert_lfs := '0';
+ v.srr1 := (others => '0');
-- load data formatting
- byte_offset := unsigned(r.addr(2 downto 0));
- brev_lenm1 := "000";
- if r.byte_reverse = '1' then
- brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
- end if;
-
-- shift and byte-reverse data bytes
for i in 0 to 7 loop
- kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
- use_second(i) := kk(3);
- j := to_integer(kk(2 downto 0)) * 8;
+ j := to_integer(r2.byte_index(i)) * 8;
data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
end loop;
-- Work out the sign bit for sign extension.
- -- Assumes we are not doing both sign extension and byte reversal,
- -- in that for unaligned loads crossing two dwords we end up
- -- using a bit from the second dword, whereas for a byte-reversed
- -- (i.e. big-endian) load the sign bit would be in the first dword.
- negative := (r.length(3) and data_permuted(63)) or
- (r.length(2) and data_permuted(31)) or
- (r.length(1) and data_permuted(15)) or
- (r.length(0) and data_permuted(7));
+ -- For unaligned loads crossing two dwords, the sign bit is in the
+ -- first dword for big-endian (byte_reverse = 1), or the second dword
+ -- for little-endian.
+ if r2.req.dword_index = '1' and r2.req.byte_reverse = '1' then
+ negative := (r2.req.length(3) and r3.load_data(63)) or
+ (r2.req.length(2) and r3.load_data(31)) or
+ (r2.req.length(1) and r3.load_data(15)) or
+ (r2.req.length(0) and r3.load_data(7));
+ else
+ negative := (r2.req.length(3) and data_permuted(63)) or
+ (r2.req.length(2) and data_permuted(31)) or
+ (r2.req.length(1) and data_permuted(15)) or
+ (r2.req.length(0) and data_permuted(7));
+ end if;
-- trim and sign-extend
for i in 0 to 7 loop
- if i < to_integer(unsigned(r.length)) then
- if two_dwords = '1' then
- trim_ctl(i) := '1' & not use_second(i);
+ if i < to_integer(unsigned(r2.req.length)) then
+ if r2.req.dword_index = '1' then
+ trim_ctl(i) := '1' & not r2.use_second(i);
else
- trim_ctl(i) := not use_second(i) & '0';
+ trim_ctl(i) := "10";
end if;
else
- trim_ctl(i) := '0' & (negative and r.sign_extend);
+ trim_ctl(i) := "00";
end if;
+ end loop;
+
+ for i in 0 to 7 loop
case trim_ctl(i) is
when "11" =>
- data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
+ data_trimmed(i * 8 + 7 downto i * 8) := r3.load_data(i * 8 + 7 downto i * 8);
when "10" =>
data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
- when "01" =>
- data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
when others =>
- data_trimmed(i * 8 + 7 downto i * 8) := x"00";
+ data_trimmed(i * 8 + 7 downto i * 8) := (others => negative and r2.req.sign_extend);
end case;
end loop;
- -- compute (addr + 8) & ~7 for the second doubleword when unaligned
- next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
+ if HAS_FPU then
+ -- Single-precision FP conversion for loads
+ v.ld_sp_data := data_trimmed(31 downto 0);
+ v.ld_sp_nz := or (data_trimmed(22 downto 0));
+ v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0));
+ end if;
- case r.state is
- when IDLE =>
- if l_in.valid = '1' then
- v.addr := lsu_sum;
- v.load := '0';
- v.dcbz := '0';
- v.tlbie := '0';
- v.instr_fault := '0';
- v.dwords_done := '0';
- case l_in.op is
- when OP_STORE =>
- req := '1';
- when OP_LOAD =>
- req := '1';
- v.load := '1';
- when OP_DCBZ =>
- req := '1';
- v.dcbz := '1';
- when OP_TLBIE =>
- mmureq := '1';
- stall := '1';
- v.tlbie := '1';
- v.state := TLBIE_WAIT;
- when OP_MFSPR =>
- done := '1';
- mfspr := '1';
- -- partial decode on SPR number should be adequate given
- -- the restricted set that get sent down this path
- if sprn(9) = '0' and sprn(5) = '0' then
- if sprn(0) = '0' then
- sprval := x"00000000" & r.dsisr;
- else
- sprval := r.dar;
- end if;
- else
- -- reading one of the SPRs in the MMU
- sprval := m_in.sprval;
- end if;
- when OP_MTSPR =>
- if sprn(9) = '0' and sprn(5) = '0' then
- if sprn(0) = '0' then
- v.dsisr := l_in.data(31 downto 0);
- else
- v.dar := l_in.data;
- end if;
- done := '1';
- else
- -- writing one of the SPRs in the MMU
- mmu_mtspr := '1';
- stall := '1';
- v.state := TLBIE_WAIT;
- end if;
- when OP_FETCH_FAILED =>
- -- send it to the MMU to do the radix walk
- addr := l_in.nia;
- v.addr := l_in.nia;
- v.instr_fault := '1';
- mmureq := '1';
- stall := '1';
- v.state := MMU_LOOKUP;
- when others =>
- assert false report "unknown op sent to loadstore1";
- end case;
-
- v.write_reg := l_in.write_reg;
- v.length := l_in.length;
- v.byte_reverse := l_in.byte_reverse;
- v.sign_extend := l_in.sign_extend;
- v.update := l_in.update;
- v.update_reg := l_in.update_reg;
- v.xerc := l_in.xerc;
- v.reserve := l_in.reserve;
- v.rc := l_in.rc;
- v.nc := l_in.ci;
- v.virt_mode := l_in.virt_mode;
- v.priv_mode := l_in.priv_mode;
-
- -- XXX Temporary hack. Mark the op as non-cachable if the address
- -- is the form 0xc------- for a real-mode access.
- --
- -- This will have to be replaced by a combination of implementing the
- -- proper HV CI load/store instructions and having an MMU to get the I
- -- bit otherwise.
- if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then
- v.nc := '1';
- end if;
+ if d_in.valid = '1' and r2.req.load = '1' then
+ v.load_data := data_permuted;
+ end if;
- -- Do length_to_sel and work out if we are doing 2 dwords
- long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
- byte_sel := long_sel(7 downto 0);
- v.first_bytes := byte_sel;
- v.second_bytes := long_sel(15 downto 8);
-
- -- Do byte reversing and rotating for stores in the first cycle
- byte_offset := unsigned(lsu_sum(2 downto 0));
- brev_lenm1 := "000";
- if l_in.byte_reverse = '1' then
- brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
- end if;
- for i in 0 to 7 loop
- k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
- j := to_integer(k) * 8;
- v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
- end loop;
-
- if req = '1' then
- stall := '1';
- if long_sel(15 downto 8) = "00000000" then
- v.state := ACK_WAIT;
+ if r2.req.valid = '1' then
+ if r2.req.read_spr = '1' then
+ write_enable := '1';
+ -- partial decode on SPR number should be adequate given
+ -- the restricted set that get sent down this path
+ if r2.req.sprn(9) = '0' and r2.req.sprn(5) = '0' then
+ if r2.req.sprn(0) = '0' then
+ sprval := x"00000000" & r3.dsisr;
else
- v.state := SECOND_REQ;
+ sprval := r3.dar;
end if;
+ else
+ -- reading one of the SPRs in the MMU
+ sprval := m_in.sprval;
end if;
end if;
+ if r2.req.align_intr = '1' then
+ -- generate alignment interrupt
+ exception := '1';
+ end if;
+ if r2.req.load_zero = '1' then
+ write_enable := '1';
+ end if;
+ if r2.req.do_update = '1' then
+ do_update := '1';
+ end if;
+ end if;
- when SECOND_REQ =>
- addr := next_addr;
- byte_sel := r.second_bytes;
- req := '1';
- stall := '1';
- v.state := ACK_WAIT;
-
- when ACK_WAIT =>
- stall := '1';
+ case r3.state is
+ when IDLE =>
if d_in.valid = '1' then
- if d_in.error = '1' then
- -- dcache will discard the second request if it
- -- gets an error on the 1st of two requests
- if r.dwords_done = '1' then
- addr := next_addr;
+ if r2.req.two_dwords = '0' or r2.req.dword_index = '1' then
+ write_enable := r2.req.load and not r2.req.load_sp;
+ if HAS_FPU and r2.req.load_sp = '1' then
+ -- SP to DP conversion takes a cycle
+ v.state := FINISH_LFS;
+ v.convert_lfs := '1';
else
- addr := r.addr;
+ -- stores write back rA update
+ do_update := r2.req.update and r2.req.store;
end if;
- if d_in.cache_paradox = '1' then
- -- signal an interrupt straight away
- exception := '1';
- dsisr(63 - 38) := not r.load;
- -- XXX there is no architected bit for this
- dsisr(63 - 35) := d_in.cache_paradox;
- v.state := IDLE;
- else
- -- Look up the translation for TLB miss
- -- and also for permission error and RC error
- -- in case the PTE has been updated.
- mmureq := '1';
+ else
+ part_done := '1';
+ end if;
+ end if;
+ if d_in.error = '1' then
+ if d_in.cache_paradox = '1' then
+ -- signal an interrupt straight away
+ exception := '1';
+ dsisr(63 - 38) := not r2.req.load;
+ -- XXX there is no architected bit for this
+ -- (probably should be a machine check in fact)
+ dsisr(63 - 35) := d_in.cache_paradox;
+ else
+ -- Look up the translation for TLB miss
+ -- and also for permission error and RC error
+ -- in case the PTE has been updated.
+ mmureq := '1';
+ v.state := MMU_LOOKUP;
+ v.stage1_en := '0';
+ end if;
+ end if;
+ if r2.req.valid = '1' then
+ if r2.req.mmu_op = '1' then
+ -- send request (tlbie, mtspr, itlb miss) to MMU
+ mmureq := not r2.req.write_spr;
+ mmu_mtspr := r2.req.write_spr;
+ if r2.req.instr_fault = '1' then
v.state := MMU_LOOKUP;
+ else
+ v.state := TLBIE_WAIT;
end if;
- else
- if two_dwords = '1' and r.dwords_done = '0' then
- v.dwords_done := '1';
- if r.load = '1' then
- v.load_data := data_permuted;
- end if;
+ elsif r2.req.write_spr = '1' then
+ if r2.req.sprn(0) = '0' then
+ v.dsisr := r2.req.store_data(31 downto 0);
else
- write_enable := r.load;
- if r.load = '1' and r.update = '1' then
- -- loads with rA update need an extra cycle
- v.state := LD_UPDATE;
- else
- -- stores write back rA update in this cycle
- do_update := r.update;
- stall := '0';
- done := '1';
- v.state := IDLE;
- end if;
+ v.dar := r2.req.store_data;
end if;
end if;
end if;
when MMU_LOOKUP =>
- stall := '1';
- if r.dwords_done = '1' then
- addr := next_addr;
- byte_sel := r.second_bytes;
- else
- addr := r.addr;
- byte_sel := r.first_bytes;
- end if;
if m_in.done = '1' then
- if m_in.invalid = '0' and m_in.perm_error = '0' and m_in.rc_error = '0' and
- m_in.badtree = '0' and m_in.segerr = '0' then
- if r.instr_fault = '0' then
- -- retry the request now that the MMU has installed a TLB entry
- req := '1';
- if two_dwords = '1' and r.dwords_done = '0' then
- v.state := SECOND_REQ;
- else
- v.state := ACK_WAIT;
- end if;
- else
- -- nothing to do, the icache retries automatically
- stall := '0';
- done := '1';
- v.state := IDLE;
- end if;
- else
- exception := '1';
- dsisr(63 - 33) := m_in.invalid;
- dsisr(63 - 36) := m_in.perm_error;
- dsisr(63 - 38) := not r.load;
- dsisr(63 - 44) := m_in.badtree;
- dsisr(63 - 45) := m_in.rc_error;
+ if r2.req.instr_fault = '0' then
+ -- retry the request now that the MMU has installed a TLB entry
+ req := '1';
+ v.stage1_en := '1';
v.state := IDLE;
end if;
end if;
+ if m_in.err = '1' then
+ exception := '1';
+ dsisr(63 - 33) := m_in.invalid;
+ dsisr(63 - 36) := m_in.perm_error;
+ dsisr(63 - 38) := r2.req.store or r2.req.dcbz;
+ dsisr(63 - 44) := m_in.badtree;
+ dsisr(63 - 45) := m_in.rc_error;
+ end if;
when TLBIE_WAIT =>
- stall := '1';
- if m_in.done = '1' then
- -- tlbie is finished
- stall := '0';
- done := '1';
- v.state := IDLE;
- end if;
- when LD_UPDATE =>
- do_update := '1';
+ when FINISH_LFS =>
+ write_enable := '1';
+
+ end case;
+
+ if complete = '1' or exception = '1' then
+ v.stage1_en := '1';
v.state := IDLE;
- done := '1';
+ end if;
+
+ -- generate DSI or DSegI for load/store exceptions
+ -- or ISI or ISegI for instruction fetch exceptions
+ v.interrupt := exception;
+ if exception = '1' then
+ v.nia := r2.req.nia;
+ if r2.req.align_intr = '1' then
+ v.intr_vec := 16#600#;
+ v.dar := r2.req.addr;
+ elsif r2.req.instr_fault = '0' then
+ v.dar := r2.req.addr;
+ if m_in.segerr = '0' then
+ v.intr_vec := 16#300#;
+ v.dsisr := dsisr;
+ else
+ v.intr_vec := 16#380#;
+ end if;
+ else
+ if m_in.segerr = '0' then
+ v.srr1(47 - 33) := m_in.invalid;
+ v.srr1(47 - 35) := m_in.perm_error; -- noexec fault
+ v.srr1(47 - 44) := m_in.badtree;
+ v.srr1(47 - 45) := m_in.rc_error;
+ v.intr_vec := 16#400#;
+ else
+ v.intr_vec := 16#480#;
+ end if;
+ end if;
+ end if;
+ case r2.wr_sel is
+ when "00" =>
+ -- mfspr result
+ write_data := sprval;
+ when "01" =>
+ -- update reg
+ write_data := r2.req.addr0;
+ when "10" =>
+ -- lfs result
+ write_data := load_dp_data;
+ when others =>
+ -- load data
+ write_data := data_trimmed;
end case;
-- Update outputs to dcache
- d_out.valid <= req;
- d_out.load <= v.load;
- d_out.dcbz <= v.dcbz;
- d_out.nc <= v.nc;
- d_out.reserve <= v.reserve;
- d_out.addr <= addr;
- d_out.data <= v.store_data;
- d_out.byte_sel <= byte_sel;
- d_out.virt_mode <= v.virt_mode;
- d_out.priv_mode <= v.priv_mode;
+ if stage1_issue_enable = '1' then
+ d_out.valid <= stage1_dcreq;
+ d_out.load <= stage1_req.load;
+ d_out.dcbz <= stage1_req.dcbz;
+ d_out.nc <= stage1_req.nc;
+ d_out.reserve <= stage1_req.reserve;
+ d_out.atomic <= stage1_req.atomic;
+ d_out.atomic_last <= stage1_req.atomic_last;
+ d_out.addr <= stage1_req.addr;
+ d_out.byte_sel <= stage1_req.byte_sel;
+ d_out.virt_mode <= stage1_req.virt_mode;
+ d_out.priv_mode <= stage1_req.priv_mode;
+ else
+ d_out.valid <= req;
+ d_out.load <= r2.req.load;
+ d_out.dcbz <= r2.req.dcbz;
+ d_out.nc <= r2.req.nc;
+ d_out.reserve <= r2.req.reserve;
+ d_out.atomic <= r2.req.atomic;
+ d_out.atomic_last <= r2.req.atomic_last;
+ d_out.addr <= r2.req.addr;
+ d_out.byte_sel <= r2.req.byte_sel;
+ d_out.virt_mode <= r2.req.virt_mode;
+ d_out.priv_mode <= r2.req.priv_mode;
+ end if;
+ if stage1_dreq = '1' then
+ d_out.data <= store_data;
+ else
+ d_out.data <= r2.req.store_data;
+ end if;
+ d_out.hold <= r2.req.valid and r2.req.load_sp and d_in.valid;
-- Update outputs to MMU
m_out.valid <= mmureq;
- m_out.iside <= v.instr_fault;
- m_out.load <= r.load;
- m_out.priv <= r.priv_mode;
- m_out.tlbie <= v.tlbie;
+ m_out.iside <= r2.req.instr_fault;
+ m_out.load <= r2.req.load;
+ m_out.priv <= r2.req.priv_mode;
+ m_out.tlbie <= r2.req.tlbie;
m_out.mtspr <= mmu_mtspr;
- m_out.sprn <= sprn;
- m_out.addr <= addr;
- m_out.slbia <= l_in.insn(7);
- m_out.rs <= l_in.data;
+ m_out.sprn <= r2.req.sprn;
+ m_out.addr <= r2.req.addr;
+ m_out.slbia <= r2.req.is_slbia;
+ m_out.rs <= r2.req.store_data;
-- Update outputs to writeback
- -- Multiplex either cache data to the destination GPR or
- -- the address for the rA update.
- l_out.valid <= done;
- if mfspr = '1' then
- l_out.write_enable <= '1';
- l_out.write_reg <= l_in.write_reg;
- l_out.write_data <= sprval;
- elsif do_update = '1' then
- l_out.write_enable <= '1';
- l_out.write_reg <= r.update_reg;
- l_out.write_data <= r.addr;
- else
- l_out.write_enable <= write_enable;
- l_out.write_reg <= r.write_reg;
- l_out.write_data <= data_trimmed;
- end if;
- l_out.xerc <= r.xerc;
- l_out.rc <= r.rc and done;
+ l_out.valid <= complete;
+ l_out.instr_tag <= r2.req.instr_tag;
+ l_out.write_enable <= write_enable or do_update;
+ l_out.write_reg <= r2.req.write_reg;
+ l_out.write_data <= write_data;
+ l_out.xerc <= r2.req.xerc;
+ l_out.rc <= r2.req.rc and complete;
l_out.store_done <= d_in.store_done;
+ l_out.interrupt <= r3.interrupt;
+ l_out.intr_vec <= r3.intr_vec;
+ l_out.srr0 <= r3.nia;
+ l_out.srr1 <= r3.srr1;
- -- update exception info back to execute1
- e_out.busy <= r.busy;
- e_out.exception <= exception;
- e_out.instr_fault <= r.instr_fault;
- e_out.invalid <= m_in.invalid;
- e_out.badtree <= m_in.badtree;
- e_out.perm_error <= m_in.perm_error;
- e_out.rc_error <= m_in.rc_error;
- e_out.segment_fault <= m_in.segerr;
- if exception = '1' and r.instr_fault = '0' then
- v.dar := addr;
- if m_in.segerr = '0' then
- v.dsisr := dsisr;
- end if;
- end if;
+ -- update busy signal back to execute1
+ e_out.busy <= busy;
+ e_out.in_progress <= in_progress;
- v.busy := stall;
+ -- Busy calculation.
+ stage3_busy_next <= r2.req.valid and not (complete or part_done or exception);
-- Update registers
- rin <= v;
+ r3in <= v;
end process;
- ls1_log: process(clk)
+ l1_log: if LOG_LENGTH > 0 generate -- log capture logic is only elaborated when LOG_LENGTH is non-zero
+ signal log_data : std_ulogic_vector(9 downto 0); -- registered 10-bit log sample, local to this generate block
begin
- if rising_edge(clk) then
- log_data <= r.busy &
- e_out.exception &
- l_out.valid &
- m_out.valid &
- d_out.valid &
- m_in.done &
- r.dwords_done &
- std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3));
- end if;
- end process;
- log_out <= log_data;
+ ls1_log: process(clk) -- samples the status signals below once per clock
+ begin
+ if rising_edge(clk) then
+ log_data <= e_out.busy & -- leftmost element of the sample: busy indication sent to execute1
+ l_out.interrupt & -- interrupt flag presented to writeback
+ l_out.valid & -- completion valid presented to writeback
+ m_out.valid & -- request valid sent to the MMU
+ d_out.valid & -- request valid sent to the dcache
+ m_in.done & -- done indication received from the MMU
+ r2.req.dword_index & -- NOTE(review): presumably a single bit selecting the doubleword of an unaligned access; the 10-bit total only fits if so -- confirm
+ std_ulogic_vector(to_unsigned(state_t'pos(r3.state), 3)); -- position of r3.state within state_t, encoded in 3 bits
+ end if;
+ end process;
+ log_out <= log_data; -- drive the log output from the registered sample
+ end generate;
+
end;