The idea here is that we can have multiple instructions in progress at
the same time as long as they all go to the same unit, because that
unit will keep them in order. If we get an instruction for a
different unit, we wait for all the previous instructions to finish
before executing it. Since the loadstore unit is the only one that is
currently pipelined, this boils down to saying that loadstore
instructions can go ahead while l_in.in_progress = 1 but other
instructions have to wait until it is 0.
This gives a 2% increase in CoreMark performance on the Arty A7-100
(from ~190 to ~194).
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
type Loadstore1ToExecute1Type is record
busy : std_ulogic;
type Loadstore1ToExecute1Type is record
busy : std_ulogic;
+ in_progress : std_ulogic;
end record;
type Loadstore1ToDcacheType is record
end record;
type Loadstore1ToDcacheType is record
entity control is
generic (
EX1_BYPASS : boolean := true;
entity control is
generic (
EX1_BYPASS : boolean := true;
- PIPELINE_DEPTH : natural := 2
+ PIPELINE_DEPTH : natural := 3
);
port (
clk : in std_ulogic;
);
port (
clk : in std_ulogic;
elsif complete_in.valid = '1' then
v_int.outstanding := r_int.outstanding - 1;
end if;
elsif complete_in.valid = '1' then
v_int.outstanding := r_int.outstanding - 1;
end if;
+ if r_int.outstanding >= PIPELINE_DEPTH + 1 then
+ valid_tmp := '0';
+ stall_tmp := '1';
+ end if;
if rst = '1' then
v_int := reg_internal_init;
if rst = '1' then
v_int := reg_internal_init;
begin
control_0: entity work.control
generic map (
begin
control_0: entity work.control
generic map (
- EX1_BYPASS => EX1_BYPASS,
- PIPELINE_DEPTH => 1
+ EX1_BYPASS => EX1_BYPASS
-- writeback.
xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc;
-- writeback.
xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc;
- busy_out <= l_in.busy or r.busy or fp_in.busy;
+ with e_in.unit select busy_out <=
+ l_in.busy or r.busy or fp_in.busy when LDST,
+ l_in.busy or l_in.in_progress or r.busy or fp_in.busy when others;
+
valid_in <= e_in.valid and not busy_out and not flush_in;
terminate_out <= r.terminate;
valid_in <= e_in.valid and not busy_out and not flush_in;
terminate_out <= r.terminate;
-- Determine if there is any exception to be taken
-- before/instead of executing this instruction
-- Determine if there is any exception to be taken
-- before/instead of executing this instruction
- if valid_in = '1' and e_in.second = '0' then
+ if valid_in = '1' and e_in.second = '0' and l_in.in_progress = '0' then
if HAS_FPU and r.fp_exception_next = '1' then
-- This is used for FP-type program interrupts that
-- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
if HAS_FPU and r.fp_exception_next = '1' then
-- This is used for FP-type program interrupts that
-- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
signal busy : std_ulogic;
signal complete : std_ulogic;
signal busy : std_ulogic;
signal complete : std_ulogic;
+ signal in_progress : std_ulogic;
signal flushing : std_ulogic;
signal store_sp_data : std_ulogic_vector(31 downto 0);
signal flushing : std_ulogic;
signal store_sp_data : std_ulogic_vector(31 downto 0);
req_in <= v;
end process;
req_in <= v;
end process;
- --busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or
- -- (r1.issued and d_in.error) or
- -- stage2_busy_next or
- -- (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index));
+ busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or
+ (r1.issued and d_in.error) or
+ stage2_busy_next or
+ (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index));
complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or
(r2.wait_mmu and m_in.done) or r3.convert_lfs;
complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or
(r2.wait_mmu and m_in.done) or r3.convert_lfs;
- busy <= r1.req.valid or (r2.req.valid and not complete);
+ in_progress <= r1.req.valid or (r2.req.valid and not complete);
stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and
not (r2.req.valid and r2.req.mmu_op);
stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and
not (r2.req.valid and r2.req.mmu_op);
-- update busy signal back to execute1
e_out.busy <= busy;
-- update busy signal back to execute1
e_out.busy <= busy;
+ e_out.in_progress <= in_progress;
-- Busy calculation.
stage3_busy_next <= r2.req.valid and not (complete or part_done or exception);
-- Busy calculation.
stage3_busy_next <= r2.req.valid and not (complete or part_done or exception);