From e711b67a9081ae84c66174a50705dc98ba993a43 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 18 Jan 2021 16:55:32 +0800 Subject: [PATCH] Fix incorrect optimization by cprop_hardreg. If SRC had been assigned a mode narrower than the copy, we can't always link DEST into the chain even they have same hard_regno_nregs(i.e. HImode/SImode in i386 backend). i.e kmovw %k0, %edi vmovd %edi, %xmm2 vpshuflw $0, %xmm2, %xmm0 kmovw %k0, %r8d kmovd %k0, %r9d ... - movl %r9d, %r11d + vmovd %xmm2, %r11d gcc/ChangeLog: PR rtl-optimization/98694 * regcprop.c (copy_value): If SRC had been assigned a mode narrower than the copy, we can't link DEST into the chain even they have same hard_regno_nregs(i.e. HImode/SImode in i386 backend). gcc/testsuite/ChangeLog: PR rtl-optimization/98694 * gcc.target/i386/pr98694.c: New test. --- gcc/regcprop.c | 29 +++++++++++++++++ gcc/testsuite/gcc.target/i386/pr98694.c | 41 +++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/pr98694.c diff --git a/gcc/regcprop.c b/gcc/regcprop.c index dd62cb36013..e1342f56bd1 100644 --- a/gcc/regcprop.c +++ b/gcc/regcprop.c @@ -358,6 +358,35 @@ copy_value (rtx dest, rtx src, struct value_data *vd) else if (sn > hard_regno_nregs (sr, vd->e[sr].mode)) return; + /* It is not safe to link DEST into the chain if SRC was defined in some + narrower mode M and if M is also narrower than the mode of the first + register in the chain. For example: + (set (reg:DI r1) (reg:DI r0)) + (set (reg:HI r2) (reg:HI r1)) + (set (reg:SI r3) (reg:SI r2)) //Should be a new chain start at r3 + (set (reg:SI r4) (reg:SI r1)) + (set (reg:SI r5) (reg:SI r4)) + + the upper part of r3 is undefined. If we added it to the chain, + it may be used to replace r5, which has defined upper bits. + See PR98694 for details. + + [A] partial_subreg_p (vd->e[sr].mode, GET_MODE (src)) + [B] partial_subreg_p (vd->e[sr].mode, vd->e[vd->e[sr].oldest_regno].mode) + Condition B is added to to catch optimization opportunities of + + (set (reg:HI R1) (reg:HI R0)) + (set (reg:SI R2) (reg:SI R1)) // [A] + (set (reg:DI R3) (reg:DI R2)) // [A] + (set (reg:SI R4) (reg:SI R[0-3])) + (set (reg:HI R5) (reg:HI R[0-4])) + + in which all registers have only 16 defined bits. */ + else if (partial_subreg_p (vd->e[sr].mode, GET_MODE (src)) + && partial_subreg_p (vd->e[sr].mode, + vd->e[vd->e[sr].oldest_regno].mode)) + return; + /* Link DR at the end of the value chain used by SR. */ vd->e[dr].oldest_regno = vd->e[sr].oldest_regno; diff --git a/gcc/testsuite/gcc.target/i386/pr98694.c b/gcc/testsuite/gcc.target/i386/pr98694.c new file mode 100644 index 00000000000..45889d482c1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr98694.c @@ -0,0 +1,41 @@ +/* PR rtl-optimization/98694 */ +/* { dg-do run { target { ! ia32 } } } */ +/* { dg-options "-O2 -mavx512bw" } */ +/* { dg-require-effective-target avx512bw } */ + +#include +typedef short v4hi __attribute__ ((vector_size (8))); +typedef int v2si __attribute__ ((vector_size (8))); +v4hi b; + +__attribute__ ((noipa)) +v2si +foo (__m512i src1, __m512i src2) +{ + __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2); + short s = (short) m; + int i = (int)m; + b = __extension__ (v4hi) {s, s, s, s}; + return __extension__ (v2si) {i, i}; +} + +int main () +{ + if (!__builtin_cpu_supports ("avx512bw")) + return 0; + + __m512i src1 = _mm512_setzero_si512 (); + __m512i src2 = _mm512_set_epi8 (0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1); + __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2); + v2si a = foo (src1, src2); + if (a[0] != (int)m) + __builtin_abort (); + return 0; +} -- 2.30.2