Enable mask movement for VCOND_EXPR under avx512f for 128/256-bit vector when integer mask is available.
author Hongtao Liu <liuhongt@gcc.gnu.org>
Mon, 9 Dec 2019 04:16:24 +0000 (04:16 +0000)
committer Hongtao Liu <liuhongt@gcc.gnu.org>
Mon, 9 Dec 2019 04:16:24 +0000 (04:16 +0000)

Changelog
gcc/
PR target/92686
* config/i386/sse.md
(*<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>,
*<avx512>_cmp<mode>3<mask_scalar_merge_name>,
*<avx512>_ucmp<mode>3<mask_scalar_merge_name>,
*<avx512>_ucmp<mode>3<mask_scalar_merge_name>): New.
* config/i386/i386.c (ix86_print_operand): New operand substitution.
* config/i386/i386-expand.c (ix86_valid_mask_cmp_mode):
New function.
(ix86_expand_sse_cmp): Relax condition for integer mask from
512-bit vector to all 128/256/512-bit vector. Delete code gen
for avx512f compare patterns since we have generic pattern now.
(ix86_expand_sse_movcc): Adjust condition and codegen for
maskcmp.
(ix86_expand_int_sse_cmp): Don't canonicalize the comparison
when corresponding vector compare is available.

gcc/testsuite/
* gcc.target/i386/pr92686.inc: New file.
* gcc.target/i386/avx512bw-pr92686-vpcmp-1.c: New test.
* gcc.target/i386/avx512bw-pr92686-vpcmp-intelasm-1.c: Ditto.
* gcc.target/i386/avx512bw-pr92686-vpcmp-2.c: Ditto.
* gcc.target/i386/avx512vl-pr92686-vpcmp-1.c: Ditto.
* gcc.target/i386/avx512vl-pr92686-vpcmp-intelasm-1.c: Ditto.
* gcc.target/i386/avx512vl-pr92686-vpcmp-2.c: Ditto.
* gcc.target/i386/avx512bw-pr92686-movcc-1.c: Ditto.
* gcc.target/i386/avx512bw-pr92686-movcc-2.c: Ditto.
* gcc.target/i386/avx512vl-pr92686-movcc-1.c: Ditto.
* gcc.target/i386/avx512vl-pr92686-movcc-2.c: Ditto.
* gcc.target/i386/avx512vl-pr88547-1.c: Adjust testcase.
* gcc.target/i386/pr88547-1.c: Ditto.

From-SVN: r279107

18 files changed:
gcc/ChangeLog
gcc/config/i386/i386-expand.c
gcc/config/i386/i386.c
gcc/config/i386/sse.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/i386/avx512bw-pr92686-movcc-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512bw-pr92686-movcc-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512bw-pr92686-vpcmp-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512bw-pr92686-vpcmp-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512bw-pr92686-vpcmp-intelasm-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-pr88547-1.c
gcc/testsuite/gcc.target/i386/avx512vl-pr92686-movcc-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-pr92686-movcc-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-pr92686-vpcmp-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-pr92686-vpcmp-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-pr92686-vpcmp-intelasm-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/pr88547-1.c
gcc/testsuite/gcc.target/i386/pr92686.inc [new file with mode: 0644]

index e84fb302fd5a38e3270e8bc4bed08e79c50ac6c0..9155667222ad674d1af968e99a6d700c6a999122 100644 (file)
@@ -1,3 +1,22 @@
+2019-12-09  Hongtao Liu  <hongtao.liu@intel.com>
+
+       PR target/92686
+       * config/i386/sse.md
+       (*<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>,
+       *<avx512>_cmp<mode>3<mask_scalar_merge_name>,
+       *<avx512>_ucmp<mode>3<mask_scalar_merge_name>,
+       *<avx512>_ucmp<mode>3<mask_scalar_merge_name>): New.
+       * config/i386/i386.c (ix86_print_operand): New operand substitution.
+       * config/i386/i386-expand.c (ix86_valid_mask_cmp_mode):
+       New function.
+       (ix86_expand_sse_cmp): Relax condition for integer mask from
+       512-bit vector to all 128/256/512-bit vector. Delete code gen
+       for avx512f compare patterns since we have generic pattern now.
+       (ix86_expand_sse_movcc): Adjust condition and codegen for
+       maskcmp.
+       (ix86_expand_int_sse_cmp): Don't canonicalize the comparison
+       when corresponding vector compare is available.
+
 2019-12-08  Sandra Loosemore  <sandra@codesourcery.com>
 
        Revert:
index 1ff1153e105307d8094f05c49f8e79ac92748fd0..ff3c24cc5b72b10d20ef53765029a679770659b3 100644 (file)
@@ -3422,6 +3422,29 @@ ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
   return true;
 }
 
+/* Return true if MODE is valid for a vector compare into a mask register;
+   the same answer applies to a conditional vector move under a mask.  */
+static bool
+ix86_valid_mask_cmp_mode (machine_mode mode)
+{
+  /* XOP has its own vector conditional movement, so never use masks there.  */
+  if (TARGET_XOP)
+    return false;
+
+  /* Mask operations need AVX512F, and only vector modes qualify.  */
+  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
+    return false;
+
+  /* AVX512BW is needed for vector QI/HImode elements;
+     AVX512VL is needed for any vector that is not 64 bytes (512 bits).  */
+  machine_mode inner_mode = GET_MODE_INNER (mode);
+  int vector_size = GET_MODE_SIZE (mode);
+  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
+    return false;
+
+  return vector_size == 64 || TARGET_AVX512VL;
+}
+
 /* Expand an SSE comparison.  Return the register with the result.  */
 
 static rtx
@@ -3438,11 +3461,11 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
   bool maskcmp = false;
   rtx x;
 
-  if (GET_MODE_SIZE (cmp_ops_mode) == 64)
+  if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
     {
       unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
-      cmp_mode = int_mode_for_size (nbits, 0).require ();
       maskcmp = true;
+      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
     }
   else
     cmp_mode = cmp_ops_mode;
@@ -3461,37 +3484,6 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
       || (op_false && reg_overlap_mentioned_p (dest, op_false)))
     dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
 
-  /* Compare patterns for int modes are unspec in AVX512F only.  */
-  if (maskcmp && (code == GT || code == EQ))
-    {
-      rtx (*gen)(rtx, rtx, rtx);
-
-      switch (cmp_ops_mode)
-       {
-       case E_V64QImode:
-         gcc_assert (TARGET_AVX512BW);
-         gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
-         break;
-       case E_V32HImode:
-         gcc_assert (TARGET_AVX512BW);
-         gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
-         break;
-       case E_V16SImode:
-         gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
-         break;
-       case E_V8DImode:
-         gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
-         break;
-       default:
-         gen = NULL;
-       }
-
-      if (gen)
-       {
-         emit_insn (gen (dest, cmp_op0, cmp_op1));
-         return dest;
-       }
-    }
   x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
 
   if (cmp_mode != mode && !maskcmp)
@@ -3515,7 +3507,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
   machine_mode cmpmode = GET_MODE (cmp);
 
   /* In AVX512F the result of comparison is an integer mask.  */
-  bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
+  bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
 
   rtx t2, t3, x;
 
@@ -3529,85 +3521,34 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
 
   if (maskcmp)
     {
-      rtx (*gen) (rtx, rtx) = NULL;
-      if ((op_true == CONST0_RTX (mode)
-          && vector_all_ones_operand (op_false, mode))
-         || (op_false == CONST0_RTX (mode)
-             && vector_all_ones_operand (op_true, mode)))
-       switch (mode)
-         {
-         case E_V64QImode:
-           if (TARGET_AVX512BW)
-             gen = gen_avx512bw_cvtmask2bv64qi;
-           break;
-         case E_V32QImode:
-           if (TARGET_AVX512VL && TARGET_AVX512BW)
-             gen = gen_avx512vl_cvtmask2bv32qi;
-           break;
-         case E_V16QImode:
-           if (TARGET_AVX512VL && TARGET_AVX512BW)
-             gen = gen_avx512vl_cvtmask2bv16qi;
-           break;
-         case E_V32HImode:
-           if (TARGET_AVX512BW)
-             gen = gen_avx512bw_cvtmask2wv32hi;
-           break;
-         case E_V16HImode:
-           if (TARGET_AVX512VL && TARGET_AVX512BW)
-             gen = gen_avx512vl_cvtmask2wv16hi;
-           break;
-         case E_V8HImode:
-           if (TARGET_AVX512VL && TARGET_AVX512BW)
-             gen = gen_avx512vl_cvtmask2wv8hi;
-           break;
-         case E_V16SImode:
-           if (TARGET_AVX512DQ)
-             gen = gen_avx512f_cvtmask2dv16si;
-           break;
-         case E_V8SImode:
-           if (TARGET_AVX512VL && TARGET_AVX512DQ)
-             gen = gen_avx512vl_cvtmask2dv8si;
-           break;
-         case E_V4SImode:
-           if (TARGET_AVX512VL && TARGET_AVX512DQ)
-             gen = gen_avx512vl_cvtmask2dv4si;
-           break;
-         case E_V8DImode:
-           if (TARGET_AVX512DQ)
-             gen = gen_avx512f_cvtmask2qv8di;
-           break;
-         case E_V4DImode:
-           if (TARGET_AVX512VL && TARGET_AVX512DQ)
-             gen = gen_avx512vl_cvtmask2qv4di;
-           break;
-         case E_V2DImode:
-           if (TARGET_AVX512VL && TARGET_AVX512DQ)
-             gen = gen_avx512vl_cvtmask2qv2di;
-           break;
-         default:
-           break;
-         }
-      if (gen && SCALAR_INT_MODE_P (cmpmode))
-       {
-         cmp = force_reg (cmpmode, cmp);
-         if (op_true == CONST0_RTX (mode))
+      /* Using vector move with mask register.  */
+      cmp = force_reg (cmpmode, cmp);
+      /* Optimize for mask zero.  */
+      op_true = (op_true != CONST0_RTX (mode)
+                ? force_reg (mode, op_true) : op_true);
+      op_false = (op_false != CONST0_RTX (mode)
+                 ? force_reg (mode, op_false) : op_false);
+      if (op_true == CONST0_RTX (mode))
+       {
+         rtx (*gen_not) (rtx, rtx);
+         switch (cmpmode)
            {
-             rtx (*gen_not) (rtx, rtx);
-             switch (cmpmode)
-               {
-               case E_QImode: gen_not = gen_knotqi; break;
-               case E_HImode: gen_not = gen_knothi; break;
-               case E_SImode: gen_not = gen_knotsi; break;
-               case E_DImode: gen_not = gen_knotdi; break;
-               default: gcc_unreachable ();
-               }
-             rtx n = gen_reg_rtx (cmpmode);
-             emit_insn (gen_not (n, cmp));
-             cmp = n;
+           case E_QImode: gen_not = gen_knotqi; break;
+           case E_HImode: gen_not = gen_knothi; break;
+           case E_SImode: gen_not = gen_knotsi; break;
+           case E_DImode: gen_not = gen_knotdi; break;
+           default: gcc_unreachable ();
            }
-         emit_insn (gen (dest, cmp));
-         return;
+         rtx n = gen_reg_rtx (cmpmode);
+         emit_insn (gen_not (n, cmp));
+         cmp = n;
+         /* Reverse op_true op_false.  */
+         std::swap (op_true, op_false);
        }
+
+      rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
+      emit_insn (gen_rtx_SET (dest, vec_merge));
+      return;
     }
   else if (vector_all_ones_operand (op_true, mode)
           && op_false == CONST0_RTX (mode))
@@ -4068,6 +4009,10 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
       && (mode == V16QImode || mode == V8HImode
          || mode == V4SImode || mode == V2DImode))
     ;
+  /* AVX512F supports all of the comparisons
+     on all 128/256/512-bit vector int types.  */
+  else if (ix86_valid_mask_cmp_mode (mode))
+    ;
   else
     {
       /* Canonicalize the comparison to EQ, GT, GTU.  */
index 04cbbd532c0d7df8288e87555696bc32477beeca..99d60bc9813dd207f6e41748670b9dcc9989ac2d 100644 (file)
@@ -12468,6 +12468,40 @@ ix86_print_operand (FILE *file, rtx x, int code)
            }
          return;
 
+       case 'I':
+         if (ASSEMBLER_DIALECT == ASM_ATT)
+           putc ('$', file);
+         switch (GET_CODE (x))
+           {
+           case EQ:
+             putc ('0', file);
+             break;
+           case NE:
+             putc ('4', file);
+             break;
+           case GE:
+           case GEU:
+             putc ('5', file);
+             break;
+           case GT:
+           case GTU:
+             putc ('6', file);
+             break;
+           case LE:
+           case LEU:
+             putc ('2', file);
+             break;
+           case LT:
+           case LTU:
+             putc ('1', file);
+             break;
+           default:
+             output_operand_lossage ("operand is not a condition code, "
+                                     "invalid operand code 'I'");
+             return;
+           }
+         return;
+
        case 'Y':
          switch (GET_CODE (x))
            {
index fb43cafaad047708e2cb1398f0aed33a762d90b9..bbceb8b83ad0822a28b69e4a5144daf9145b6b87 100644 (file)
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+       (match_operator:<avx512fmaskmode> 3 "ix86_comparison_int_operator"
+         [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
+          (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "<round_saeonly_constraint>")]))]
+  "TARGET_AVX512F && <round_saeonly_mode512bit_condition>"
+  "vpcmp<ssemodesuffix>\t{%I3, <round_saeonly_mask_scalar_merge_op4>%2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2<round_saeonly_mask_scalar_merge_op4>, %I3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*<avx512>_cmp<mode>3<mask_scalar_merge_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+       (match_operator:<avx512fmaskmode> 3 "ix86_comparison_int_operator"
+         [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+          (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")]))]
+  "TARGET_AVX512BW"
+  "vpcmp<ssemodesuffix>\t{%I3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %I3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+       (match_operator:<avx512fmaskmode> 3 "ix86_comparison_uns_operator"
+         [(match_operand:VI12_AVX512VL 1 "register_operand" "v")
+          (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")]))]
+  "TARGET_AVX512BW"
+  "vpcmpu<ssemodesuffix>\t{%I3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %I3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (unspec:<avx512fmaskmode>
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*<avx512>_ucmp<mode>3<mask_scalar_merge_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
+       (match_operator:<avx512fmaskmode> 3 "ix86_comparison_uns_operator"
+         [(match_operand:VI48_AVX512VL 1 "register_operand" "v")
+          (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")]))]
+  "TARGET_AVX512F"
+  "vpcmpu<ssemodesuffix>\t{%I3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %I3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn "avx512f_vmcmp<mode>3<round_saeonly_name>"
   [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k")
        (and:<avx512fmaskmode>
index d069d0ca2c307dce9df96e0b305a7e1ed9fbdb34..c6094cc3913f084c80082e14c116450316d39ed8 100644 (file)
@@ -1,3 +1,19 @@
+2019-12-09  Hongtao Liu  <hongtao@intel.com>
+
+       * gcc.target/i386/pr92686.inc: New file.
+       * gcc.target/i386/avx512bw-pr92686-vpcmp-1.c: New test.
+       * gcc.target/i386/avx512bw-pr92686-vpcmp-intelasm-1.c: Ditto.
+       * gcc.target/i386/avx512bw-pr92686-vpcmp-2.c: Ditto.
+       * gcc.target/i386/avx512vl-pr92686-vpcmp-1.c: Ditto.
+       * gcc.target/i386/avx512vl-pr92686-vpcmp-intelasm-1.c: Ditto.
+       * gcc.target/i386/avx512vl-pr92686-vpcmp-2.c: Ditto.
+       * gcc.target/i386/avx512bw-pr92686-movcc-1.c: Ditto.
+       * gcc.target/i386/avx512bw-pr92686-movcc-2.c: Ditto.
+       * gcc.target/i386/avx512vl-pr92686-movcc-1.c: Ditto.
+       * gcc.target/i386/avx512vl-pr92686-movcc-2.c: Ditto.
+       * gcc.target/i386/avx512vl-pr88547-1.c: Adjust testcase.
+       * gcc.target/i386/pr88547-1.c: Ditto.
+
 2019-12-08  Andrew Pinski  <apinski@marvell.com>
 
        * gcc.c-torture/execute/bswap-3.c: New test.
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr92686-movcc-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr92686-movcc-1.c
new file mode 100644 (file)
index 0000000..2a89077
--- /dev/null
@@ -0,0 +1,133 @@
+/* PR target/92686 */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx512bw -mno-avx512dq -mno-avx512vl -mno-xop -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times "vpcmp\[bwdq\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vpcmpu\[bwdq\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]8\[^\{\n\]*%zmm0+\[^\n\]*\{%k\[1-7\]\}" 4 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]16\[^\{\n\]*%zmm0+\[^\n\]*\{%k\[1-7\]\}" 4 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]32\[^\{\n\]*%zmm0+\[^\n\]*\{%k\[1-7\]\}" 4 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]64\[^\{\n\]*%zmm0+\[^\n\]*\{%k\[1-7\]\}" 4 } } */
+
+__attribute__((noipa)) void
+f1 (char *__restrict dst, char *__restrict src1, char *__restrict src2)
+{
+  for (int i = 0; i != 64; i++)
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f2 (unsigned char *__restrict dst, unsigned char *__restrict src1,
+    unsigned char *__restrict src2)
+{
+  for (int i = 0; i != 64; i++)
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f3 (char *__restrict dst, char *__restrict src1, char *__restrict src2)
+{
+  for (int i = 0; i != 64; i++)
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f4 (unsigned char *__restrict dst, unsigned char *__restrict src1,
+    unsigned char *__restrict src2)
+{
+  for (int i = 0; i != 64; i++)
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f5 (short *__restrict dst, short *__restrict src1, short *__restrict src2)
+{
+  for (int i = 0; i != 32; i++)
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f6 (unsigned short *__restrict dst, unsigned short *__restrict src1,
+    unsigned short *__restrict src2)
+{
+  for (int i = 0; i != 32; i++)
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f7 (short *__restrict dst, short *__restrict src1, short *__restrict src2)
+{
+  for (int i = 0; i != 32; i++)
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f8 (unsigned short *__restrict dst, unsigned short *__restrict src1,
+    unsigned short *__restrict src2)
+{
+  for (int i = 0; i != 32; i++)
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f9 (int *__restrict dst, int *__restrict src1, int *__restrict src2)
+{
+  for (int i = 0; i != 16; i++)
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f10 (unsigned int *__restrict dst, unsigned int *__restrict src1,
+     unsigned int *__restrict src2)
+{
+  for (int i = 0; i != 16; i++)
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f11 (int *__restrict dst, int *__restrict src1, int *__restrict src2)
+{
+  for (int i = 0; i != 16; i++)
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f12 (unsigned int *__restrict dst, unsigned int *__restrict src1,
+     unsigned int *__restrict src2)
+{
+  for (int i = 0; i != 16; i++)
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f13 (long long int *__restrict dst, long long int *__restrict src1,
+     long long int *__restrict src2)
+{
+  for (int i = 0; i != 8; i++)
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f14 (unsigned long long int *__restrict dst,
+     unsigned long long int *__restrict src1,
+     unsigned long long int *__restrict src2)
+{
+  for (int i = 0; i != 8; i++)
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f15 (long long int *__restrict dst, long long int *__restrict src1,
+     long long int *__restrict src2)
+{
+  for (int i = 0; i != 8; i++)
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f16 (unsigned long long int *__restrict dst,
+     unsigned long long int *__restrict src1,
+     unsigned long long int *__restrict src2)
+{
+  for (int i = 0; i != 8; i++)
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr92686-movcc-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr92686-movcc-2.c
new file mode 100644 (file)
index 0000000..53a7da1
--- /dev/null
@@ -0,0 +1,102 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-Ofast -mavx512bw -mavx512vl -mprefer-vector-width=256" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512bw-pr92686-movcc-1.c"
+#include "pr92686.inc"
+
+#define NUM 512
+
+
+#define TEST_SIGNED(vtype, type, N, fn, fn2, op)               \
+do                                                             \
+  {                                                            \
+    type dst[NUM], src1[NUM], src2[NUM];                       \
+    int i, j,  sign = 1;                                       \
+    type res[N];                                               \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       src1[i] = i * i * sign;                                 \
+       src2[i] = (i + 20) * sign;                              \
+       dst[i] = i * i * i + 100;                               \
+       sign = -sign;                                           \
+      }                                                                \
+    for (i = 0; i < NUM; i += N)                               \
+      {                                                                \
+       for (j = 0; j < N; j++)                                 \
+         res[j] = dst[i + j];                                  \
+       fn (&dst[i], &src1[i], &src2[i]);                       \
+       for (j = 0; j < N; j++)                                 \
+         {                                                     \
+           res[j] = fn2 (res[j], src1[i + j],                  \
+                         src2[i+ j], op);                      \
+           if (res[j] != dst[i+ j])                            \
+             abort();                                          \
+         }                                                     \
+      }                                                                \
+  }                                                            \
+while (0)
+
+#define TEST_UNSIGNED(vtype, type, N, fn, fn2, op)             \
+do                                                             \
+  {                                                            \
+    type dst[NUM], src1[NUM], src2[NUM];                       \
+    int i,j;                                                   \
+    type res[N];                                               \
+                                                               \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       src1[i] = i * i;                                        \
+       src2[i] = i + 20;                                       \
+       dst[i] = i * i * i + 100;                               \
+       if ((i % 4))                                            \
+         src2[i] |= (1ULL << (sizeof (type)                    \
+                                * __CHAR_BIT__ - 1));          \
+      }                                                                \
+    for (i = 0; i < NUM; i += N)                               \
+      {                                                                \
+       for (j = 0; j < N; j++)                                 \
+         res[j] = dst[i + j];                                  \
+       fn (&dst[i], &src1[i], &src2[i]);                       \
+       for (j = 0; j < N; j++)                                 \
+         {                                                     \
+           res[j] = fn2 (res[j], src1[i + j],                  \
+                         src2[i + j], op);                     \
+           if (res[j] != dst[i + j])                           \
+             abort();                                          \
+         }                                                     \
+      }                                                                \
+  }                                                            \
+while (0)
+
+static void
+TEST (void)
+{
+  TEST_SIGNED (v64qi, signed char, 64, f1, cmpb, 5);
+  TEST_UNSIGNED (v64uqi, unsigned char, 64, f2, cmpub, 5);
+  TEST_SIGNED (v64qi, signed char, 64, f3, cmpb, 2);
+  TEST_UNSIGNED (v64uqi, unsigned char, 64, f4, cmpub, 2);
+  TEST_SIGNED (v32hi, short int, 32, f5, cmpw, 5);
+  TEST_UNSIGNED (v32uhi, unsigned short int, 32, f6, cmpuw, 5);
+  TEST_SIGNED (v32hi, short int, 32, f7, cmpw, 2);
+  TEST_UNSIGNED (v32uhi, unsigned short int, 32, f8, cmpuw, 2);
+  TEST_SIGNED (v16si, int, 16, f9, cmpd, 5);
+  TEST_UNSIGNED (v16usi, unsigned int, 16, f10, cmpud, 5);
+  TEST_SIGNED (v16si, int, 16, f11, cmpd, 2);
+  TEST_UNSIGNED (v16usi, unsigned int, 16, f12, cmpud, 2);
+  TEST_SIGNED (v8di, long long int, 8, f13, cmpq, 5);
+  TEST_UNSIGNED (v8udi, unsigned long long int, 8, f14, cmpuq, 5);
+  TEST_SIGNED (v8di, long long int, 8, f15, cmpq, 2);
+  TEST_UNSIGNED (v8udi, unsigned long long int, 8, f16, cmpuq, 2);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr92686-vpcmp-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr92686-vpcmp-1.c
new file mode 100644 (file)
index 0000000..4fd3b36
--- /dev/null
@@ -0,0 +1,112 @@
+/* PR target/92686 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mno-avx512dq -mno-avx512vl -mno-xop" } */
+/* { dg-final { scan-assembler-times "vpcmp\[bwdq\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vpcmpu\[bwdq\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vpmovm2\[bw\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[\t ]" 8 } } */
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v64uqi __attribute__((vector_size(64)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef unsigned short v32uhi __attribute__((vector_size(64)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef unsigned v16usi __attribute__((vector_size(64)));
+typedef long long v8di __attribute__((vector_size(64)));
+typedef unsigned long long v8udi __attribute__((vector_size(64)));
+
+__attribute__((noipa)) v64qi
+f1 (v64qi x, v64qi y)
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v64uqi
+f2 (v64uqi x, v64uqi y)
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v64qi
+f3 (v64qi x, v64qi y)
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v64uqi
+f4 (v64uqi x, v64uqi y)
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v32hi
+f5 (v32hi x, v32hi y)
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v32uhi
+f6 (v32uhi x, v32uhi y)
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v32hi
+f7 (v32hi x, v32hi y)
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v32uhi
+f8 (v32uhi x, v32uhi y)
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v16si
+f9 (v16si x, v16si y)
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v16usi
+f10 (v16usi x, v16usi y)
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v16si
+f11 (v16si x, v16si y)
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v16usi
+f12 (v16usi x, v16usi y)
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v8di
+f13 (v8di x, v8di y)
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v8udi
+f14 (v8udi x, v8udi y)
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v8di
+f15 (v8di x, v8di y)
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v8udi
+f16 (v8udi x, v8udi y)
+{
+  return x <= y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr92686-vpcmp-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr92686-vpcmp-2.c
new file mode 100644 (file)
index 0000000..0ea5b56
--- /dev/null
@@ -0,0 +1,90 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-options "-O2 -mavx512bw" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512bw-pr92686-vpcmp-1.c"
+
+#define NUM 512
+
+#define TEST_SIGNED(vtype, type, N, fn, op) /* run vector kernel fn over NUM signed elements and check every lane against scalar "op" */ \
+do                                                             \
+  {                                                            \
+    union { vtype x[NUM / N]; type i[NUM]; } dst, src1, src2;  /* same storage viewed as NUM / N vectors or NUM scalars */ \
+    int i, sign = 1;                                           \
+    type res;                                                  \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       src1.i[i] = i * i * sign;                               \
+       src2.i[i] = (i + 20) * sign;                            \
+       sign = -sign;  /* flip the sign on every element */     \
+      }                                                                \
+    for (i = 0; i < NUM; i += N)  /* one fn call per vector; N is passed as the element count of one vtype */ \
+      dst.x[i / N] = fn (src1.x[i / N], src2.x[i / N]);                \
+                                                               \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       res = src1.i[i] op src2.i[i] ? -1 : 0;  /* expected lane: -1 if the scalar compare holds, else 0 */ \
+       if (res != dst.i[i])                                    \
+         abort ();                                             \
+      }                                                                \
+  }                                                            \
+while (0)
+
+#define TEST_UNSIGNED(vtype, type, N, fn, op) /* unsigned counterpart: run fn over NUM elements and check every lane against scalar "op" */ \
+do                                                             \
+  {                                                            \
+    union { vtype x[NUM / N]; type i[NUM]; } dst, src1, src2;  /* same storage viewed as NUM / N vectors or NUM scalars */ \
+    int i;                                                     \
+    type res;                                                  \
+                                                               \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       src1.i[i] = i * i;                                      \
+       src2.i[i] = i + 20;                                     \
+       if ((i % 4))  /* indices not divisible by 4 get the top bit of src2 set; presumably so a signed compare would give a different answer */ \
+         src2.i[i] |= (1ULL << (sizeof (type)                  \
+                                * __CHAR_BIT__ - 1));          \
+      }                                                                \
+                                                               \
+    for (i = 0; i < NUM; i += N)  /* one fn call per vector; N is passed as the element count of one vtype */ \
+      dst.x[i / N] = fn (src1.x[i / N], src2.x[i / N]);                \
+                                                               \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       res = src1.i[i] op src2.i[i] ? -1 : 0;  /* -1 converts to all-ones in the unsigned type */ \
+       if (res != dst.i[i])                                    \
+         abort ();                                             \
+      }                                                                \
+  }                                                            \
+while (0)
+
+static void
+TEST (void)  /* expands to avx512bw_test unless TEST was already defined; runs every f1..f16 kernel through its checker */
+{
+  TEST_SIGNED (v64qi, signed char, 64, f1, >=);  /* NOTE(review): v64qi is declared with plain char, the scalar view here is signed char -- identical only where char is signed */
+  TEST_UNSIGNED (v64uqi, unsigned char, 64, f2, >=);
+  TEST_SIGNED (v64qi, signed char, 64, f3, <=);
+  TEST_UNSIGNED (v64uqi, unsigned char, 64, f4, <=);
+  TEST_SIGNED (v32hi, short int, 32, f5, >=);  /* 32 elements per vector */
+  TEST_UNSIGNED (v32uhi, unsigned short int, 32, f6, >=);
+  TEST_SIGNED (v32hi, short int, 32, f7, <=);
+  TEST_UNSIGNED (v32uhi, unsigned short int, 32, f8, <=);
+  TEST_SIGNED (v16si, int, 16, f9, >=);  /* 16 elements per vector */
+  TEST_UNSIGNED (v16usi, unsigned int, 16, f10, >=);
+  TEST_SIGNED (v16si, int, 16, f11, <=);
+  TEST_UNSIGNED (v16usi, unsigned int, 16, f12, <=);
+  TEST_SIGNED (v8di, long long int, 8, f13, >=);  /* 8 elements per vector */
+  TEST_UNSIGNED (v8udi, unsigned long long int, 8, f14, >=);
+  TEST_SIGNED (v8di, long long int, 8, f15, <=);
+  TEST_UNSIGNED (v8udi, unsigned long long int, 8, f16, <=);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr92686-vpcmp-intelasm-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr92686-vpcmp-intelasm-1.c
new file mode 100644 (file)
index 0000000..23c785c
--- /dev/null
@@ -0,0 +1,110 @@
+/* PR target/92686 */
+/* { dg-do assemble } */
+/* { dg-options "-O2 -mavx512bw -mno-avx512dq -mno-avx512vl -mno-xop -masm=intel" } */
+/* { dg-require-effective-target avx512bw } */
+
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v64uqi __attribute__((vector_size(64)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef unsigned short v32uhi __attribute__((vector_size(64)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef unsigned v16usi __attribute__((vector_size(64)));
+typedef long long v8di __attribute__((vector_size(64)));
+typedef unsigned long long v8udi __attribute__((vector_size(64)));
+
+__attribute__((noipa)) v64qi  /* noipa: GCC attribute disabling interprocedural optimization between this kernel and its callers */
+f1 (v64qi x, v64qi y)  /* char elements, element-wise >=; this file only has to assemble with -masm=intel */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v64uqi
+f2 (v64uqi x, v64uqi y)  /* unsigned char elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v64qi
+f3 (v64qi x, v64qi y)  /* char elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v64uqi
+f4 (v64uqi x, v64uqi y)  /* unsigned char elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v32hi
+f5 (v32hi x, v32hi y)  /* short elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v32uhi
+f6 (v32uhi x, v32uhi y)  /* unsigned short elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v32hi
+f7 (v32hi x, v32hi y)  /* short elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v32uhi
+f8 (v32uhi x, v32uhi y)  /* unsigned short elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v16si
+f9 (v16si x, v16si y)  /* int elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v16usi
+f10 (v16usi x, v16usi y)  /* unsigned int elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v16si
+f11 (v16si x, v16si y)  /* int elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v16usi
+f12 (v16usi x, v16usi y)  /* unsigned int elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v8di
+f13 (v8di x, v8di y)  /* long long elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v8udi
+f14 (v8udi x, v8udi y)  /* unsigned long long elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v8di
+f15 (v8di x, v8di y)  /* long long elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v8udi
+f16 (v8udi x, v8udi y)  /* unsigned long long elements, <= */
+{
+  return x <= y;
+}
index aa64dc2f06e723c42b72b5d301df6a4a5f77f0a2..a3ffeca4354434b258ef662fb90b785c7c8cc959 100644 (file)
@@ -6,9 +6,7 @@
 /* { dg-final { scan-assembler-times "vpminsb\[\t ]" 2 } } */
 /* { dg-final { scan-assembler-times "vpminuw\[\t ]" 2 } } */
 /* { dg-final { scan-assembler-times "vpminsw\[\t ]" 2 } } */
-/* { dg-final { scan-assembler-times "vpminud\[\t ]" 2 } } */
-/* { dg-final { scan-assembler-times "vpminsd\[\t ]" 2 } } */
-/* { dg-final { scan-assembler-times "vpminuq\[\t ]" 2 } } */
-/* { dg-final { scan-assembler-times "vpminsq\[\t ]" 2 } } */
-
+/* { dg-final { scan-assembler-times "vpcmp\[dq\]\[\t ]" 4 } } */
+/* { dg-final { scan-assembler-times "vpcmpu\[dq\]\[\t ]" 4 } } */
+/* { dg-final { scan-assembler-times "vpternlog\[qd\]\[\t ]" 8 } } */
 #include "avx2-pr88547-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr92686-movcc-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr92686-movcc-1.c
new file mode 100644 (file)
index 0000000..1b9644a
--- /dev/null
@@ -0,0 +1,133 @@
+/* PR target/92686 */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx512bw -mavx512vl -mno-xop -mprefer-vector-width=256" } */
+/* { dg-final { scan-assembler-times "vpcmp\[bwdq\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vpcmpu\[bwdq\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]8\[^\{\n\]*%ymm0+\[^\n\]*\{%k\[1-7\]\}" 4 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]16\[^\{\n\]*%ymm0+\[^\n\]*\{%k\[1-7\]\}" 4 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]32\[^\{\n\]*%ymm0+\[^\n\]*\{%k\[1-7\]\}" 4 } } */
+/* { dg-final { scan-assembler-times "vmovdq\[au\]64\[^\{\n\]*%ymm0+\[^\n\]*\{%k\[1-7\]\}" 4 } } */
+
+__attribute__((noipa)) void  /* noipa: GCC attribute disabling interprocedural optimization between this kernel and its callers */
+f1 (char *__restrict dst, char *__restrict src1, char *__restrict src2)
+{
+  for (int i = 0; i != 32; i++)  /* 32 char elements, >= */
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];  /* conditional move: dst[i] is replaced by src1[i] only where the compare holds */
+}
+
+__attribute__((noipa)) void
+f2 (unsigned char *__restrict dst, unsigned char *__restrict src1,
+    unsigned char *__restrict src2)
+{
+  for (int i = 0; i != 32; i++)  /* 32 unsigned char elements, >= */
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f3 (char *__restrict dst, char *__restrict src1, char *__restrict src2)
+{
+  for (int i = 0; i != 32; i++)  /* 32 char elements, <= */
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f4 (unsigned char *__restrict dst, unsigned char *__restrict src1,
+    unsigned char *__restrict src2)
+{
+  for (int i = 0; i != 32; i++)  /* 32 unsigned char elements, <= */
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f5 (short *__restrict dst, short *__restrict src1, short *__restrict src2)
+{
+  for (int i = 0; i != 16; i++)  /* 16 short elements, >= */
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f6 (unsigned short *__restrict dst, unsigned short *__restrict src1,
+    unsigned short *__restrict src2)
+{
+  for (int i = 0; i != 16; i++)  /* 16 unsigned short elements, >= */
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f7 (short *__restrict dst, short *__restrict src1, short *__restrict src2)
+{
+  for (int i = 0; i != 16; i++)  /* 16 short elements, <= */
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f8 (unsigned short *__restrict dst, unsigned short *__restrict src1,
+    unsigned short *__restrict src2)
+{
+  for (int i = 0; i != 16; i++)  /* 16 unsigned short elements, <= */
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f9 (int *__restrict dst, int *__restrict src1, int *__restrict src2)
+{
+  for (int i = 0; i != 8; i++)  /* 8 int elements, >= */
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f10 (unsigned int *__restrict dst, unsigned int *__restrict src1,
+     unsigned int *__restrict src2)
+{
+  for (int i = 0; i != 8; i++)  /* 8 unsigned int elements, >= */
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f11 (int *__restrict dst, int *__restrict src1, int *__restrict src2)
+{
+  for (int i = 0; i != 8; i++)  /* 8 int elements, <= */
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f12 (unsigned int *__restrict dst, unsigned int *__restrict src1,
+     unsigned int *__restrict src2)
+{
+  for (int i = 0; i != 8; i++)  /* 8 unsigned int elements, <= */
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f13 (long long int *__restrict dst, long long int *__restrict src1,
+     long long int *__restrict src2)
+{
+  for (int i = 0; i != 4; i++)  /* 4 long long elements, >= */
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f14 (unsigned long long int *__restrict dst,
+     unsigned long long int *__restrict src1,
+     unsigned long long int *__restrict src2)
+{
+  for (int i = 0; i != 4; i++)  /* 4 unsigned long long elements, >= */
+    dst[i] = src1[i] >= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f15 (long long int *__restrict dst, long long int *__restrict src1,
+     long long int *__restrict src2)
+{
+  for (int i = 0; i != 4; i++)  /* 4 long long elements, <= */
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
+
+__attribute__((noipa)) void
+f16 (unsigned long long int *__restrict dst,
+     unsigned long long int *__restrict src1,
+     unsigned long long int *__restrict src2)
+{
+  for (int i = 0; i != 4; i++)  /* 4 unsigned long long elements, <= */
+    dst[i] = src1[i] <= src2[i] ? src1[i] : dst[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr92686-movcc-2.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr92686-movcc-2.c
new file mode 100644 (file)
index 0000000..5f5562b
--- /dev/null
@@ -0,0 +1,102 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-Ofast -mavx512bw -mavx512vl -mprefer-vector-width=256" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512vl_test
+#endif
+
+#include "avx512vl-pr92686-movcc-1.c"
+#include "pr92686.inc"
+
+#define NUM 256
+
+
+#define TEST_SIGNED(vtype, type, N, fn, fn2, op) /* check loop kernel fn against scalar reference fn2 (selector op); vtype is not referenced in the expansion */ \
+do                                                             \
+  {                                                            \
+    type dst[NUM], src1[NUM], src2[NUM];                       \
+    int i, j,  sign = 1;                                       \
+    type res[N];                                               \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       src1[i] = i * i * sign;                                 \
+       src2[i] = (i + 20) * sign;                              \
+       dst[i] = i * i * i + 100;                               \
+       sign = -sign;  /* flip the sign on every element */     \
+      }                                                                \
+    for (i = 0; i < NUM; i += N)  /* process NUM elements in chunks of N */ \
+      {                                                                \
+       for (j = 0; j < N; j++)  /* save dst before fn overwrites it in place */ \
+         res[j] = dst[i + j];                                  \
+       fn (&dst[i], &src1[i], &src2[i]);                       \
+       for (j = 0; j < N; j++)                                 \
+         {                                                     \
+           res[j] = fn2 (res[j], src1[i + j],                  \
+                         src2[i+ j], op);                      \
+           if (res[j] != dst[i+ j])  /* kernel result must equal the scalar reference */ \
+             abort();                                          \
+         }                                                     \
+      }                                                                \
+  }                                                            \
+while (0)
+
+#define TEST_UNSIGNED(vtype, type, N, fn, fn2, op) /* unsigned counterpart: check fn against scalar reference fn2 (selector op); vtype is not referenced in the expansion */ \
+do                                                             \
+  {                                                            \
+    type dst[NUM], src1[NUM], src2[NUM];                       \
+    int i,j;                                                   \
+    type res[N];                                               \
+                                                               \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       src1[i] = i * i;                                        \
+       src2[i] = i + 20;                                       \
+       dst[i] = i * i * i + 100;                               \
+       if ((i % 4))  /* indices not divisible by 4 get the top bit of src2 set; presumably so a signed compare would give a different answer */ \
+         src2[i] |= (1ULL << (sizeof (type)                    \
+                                * __CHAR_BIT__ - 1));          \
+      }                                                                \
+    for (i = 0; i < NUM; i += N)  /* process NUM elements in chunks of N */ \
+      {                                                                \
+       for (j = 0; j < N; j++)  /* save dst before fn overwrites it in place */ \
+         res[j] = dst[i + j];                                  \
+       fn (&dst[i], &src1[i], &src2[i]);                       \
+       for (j = 0; j < N; j++)                                 \
+         {                                                     \
+           res[j] = fn2 (res[j], src1[i + j],                  \
+                         src2[i + j], op);                     \
+           if (res[j] != dst[i + j])  /* kernel result must equal the scalar reference */ \
+             abort();                                          \
+         }                                                     \
+      }                                                                \
+  }                                                            \
+while (0)
+
+static void
+TEST (void)  /* expands to avx512vl_test unless TEST was already defined; checks every f1..f16 loop kernel against its scalar helper */
+{
+  TEST_SIGNED (v32qi, signed char, 32, f1, cmpb, 5);  /* selector 5 is >= in the cmp* helpers; NOTE(review): f1/f3 and cmpb take plain char, the arrays here are signed char */
+  TEST_UNSIGNED (v32uqi, unsigned char, 32, f2, cmpub, 5);
+  TEST_SIGNED (v32qi, signed char, 32, f3, cmpb, 2);  /* selector 2 is <= in the cmp* helpers */
+  TEST_UNSIGNED (v32uqi, unsigned char, 32, f4, cmpub, 2);
+  TEST_SIGNED (v16hi, short int, 16, f5, cmpw, 5);
+  TEST_UNSIGNED (v16uhi, unsigned short int, 16, f6, cmpuw, 5);
+  TEST_SIGNED (v16hi, short int, 16, f7, cmpw, 2);
+  TEST_UNSIGNED (v16uhi, unsigned short int, 16, f8, cmpuw, 2);
+  TEST_SIGNED (v8si, int, 8, f9, cmpd, 5);
+  TEST_UNSIGNED (v8usi, unsigned int, 8, f10, cmpud, 5);
+  TEST_SIGNED (v8si, int, 8, f11, cmpd, 2);
+  TEST_UNSIGNED (v8usi, unsigned int, 8, f12, cmpud, 2);
+  TEST_SIGNED (v4di, long long int, 4, f13, cmpq, 5);
+  TEST_UNSIGNED (v4udi, unsigned long long int, 4, f14, cmpuq, 5);
+  TEST_SIGNED (v4di, long long int, 4, f15, cmpq, 2);
+  TEST_UNSIGNED (v4udi, unsigned long long int, 4, f16, cmpuq, 2);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr92686-vpcmp-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr92686-vpcmp-1.c
new file mode 100644 (file)
index 0000000..5b79d4d
--- /dev/null
@@ -0,0 +1,112 @@
+/* PR target/92686 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl -mno-avx512dq -mno-xop" } */
+/* { dg-final { scan-assembler-times "vpcmp\[bwdq\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vpcmpu\[bwdq\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vpmovm2\[bw\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[\t ]" 8 } } */
+
+typedef signed char v32qi __attribute__((vector_size(32)));
+typedef unsigned char v32uqi __attribute__((vector_size(32)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef unsigned short v16uhi __attribute__((vector_size(32)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef unsigned v8usi __attribute__((vector_size(32)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef unsigned long long v4udi __attribute__((vector_size(32)));
+
+__attribute__((noipa)) v32qi  /* noipa: GCC attribute disabling interprocedural optimization between this kernel and its callers */
+f1 (v32qi x, v32qi y)  /* signed char elements, element-wise >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v32uqi
+f2 (v32uqi x, v32uqi y)  /* unsigned char elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v32qi
+f3 (v32qi x, v32qi y)  /* signed char elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v32uqi
+f4 (v32uqi x, v32uqi y)  /* unsigned char elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v16hi
+f5 (v16hi x, v16hi y)  /* short elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v16uhi
+f6 (v16uhi x, v16uhi y)  /* unsigned short elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v16hi
+f7 (v16hi x, v16hi y)  /* short elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v16uhi
+f8 (v16uhi x, v16uhi y)  /* unsigned short elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v8si
+f9 (v8si x, v8si y)  /* int elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v8usi
+f10 (v8usi x, v8usi y)  /* unsigned int elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v8si
+f11 (v8si x, v8si y)  /* int elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v8usi
+f12 (v8usi x, v8usi y)  /* unsigned int elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v4di
+f13 (v4di x, v4di y)  /* long long elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v4udi
+f14 (v4udi x, v4udi y)  /* unsigned long long elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v4di
+f15 (v4di x, v4di y)  /* long long elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v4udi
+f16 (v4udi x, v4udi y)  /* unsigned long long elements, <= */
+{
+  return x <= y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr92686-vpcmp-2.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr92686-vpcmp-2.c
new file mode 100644 (file)
index 0000000..6be24ff
--- /dev/null
@@ -0,0 +1,91 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512vl_test
+#endif
+
+#include "avx512vl-pr92686-vpcmp-1.c"
+
+#define NUM 256
+
+#define TEST_SIGNED(vtype, type, N, fn, op) /* run vector kernel fn over NUM signed elements and check every lane against scalar "op" */ \
+do                                                             \
+  {                                                            \
+    union { vtype x[NUM / N]; type i[NUM]; } dst, src1, src2;  /* same storage viewed as NUM / N vectors or NUM scalars */ \
+    int i, sign = 1;                                           \
+    type res;                                                  \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       src1.i[i] = i * i * sign;                               \
+       src2.i[i] = (i + 20) * sign;                            \
+       sign = -sign;  /* flip the sign on every element */     \
+      }                                                                \
+    for (i = 0; i < NUM; i += N)  /* one fn call per vector; N is passed as the element count of one vtype */ \
+      dst.x[i / N] = fn (src1.x[i / N], src2.x[i / N]);                \
+                                                               \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       res = src1.i[i] op src2.i[i] ? -1 : 0;  /* expected lane: -1 if the scalar compare holds, else 0 */ \
+       if (res != dst.i[i])                                    \
+         abort ();                                             \
+      }                                                                \
+  }                                                            \
+while (0)
+
+#define TEST_UNSIGNED(vtype, type, N, fn, op) /* unsigned counterpart: run fn over NUM elements and check every lane against scalar "op" */ \
+do                                                             \
+  {                                                            \
+    union { vtype x[NUM / N]; type i[NUM]; } dst, src1, src2;  /* same storage viewed as NUM / N vectors or NUM scalars */ \
+    int i;                                                     \
+    type res;                                                  \
+                                                               \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       src1.i[i] = i * i;                                      \
+       src2.i[i] = i + 20;                                     \
+       if ((i % 4))  /* indices not divisible by 4 get the top bit of src2 set; presumably so a signed compare would give a different answer */ \
+         src2.i[i] |= (1ULL << (sizeof (type)                  \
+                                * __CHAR_BIT__ - 1));          \
+      }                                                                \
+                                                               \
+    for (i = 0; i < NUM; i += N)  /* one fn call per vector; N is passed as the element count of one vtype */ \
+      dst.x[i / N] = fn (src1.x[i / N], src2.x[i / N]);                \
+                                                               \
+    for (i = 0; i < NUM; i++)                                  \
+      {                                                                \
+       res = src1.i[i] op src2.i[i] ? -1 : 0;  /* -1 converts to all-ones in the unsigned type */ \
+       if (res != dst.i[i])                                    \
+         abort ();                                             \
+      }                                                                \
+  }                                                            \
+while (0)
+
+static void
+TEST (void)  /* expands to avx512vl_test unless TEST was already defined; runs every f1..f16 kernel through its checker */
+{
+  TEST_SIGNED (v32qi, signed char, 32, f1, >=);  /* 32 elements per vector */
+  TEST_UNSIGNED (v32uqi, unsigned char, 32, f2, >=);
+  TEST_SIGNED (v32qi, signed char, 32, f3, <=);
+  TEST_UNSIGNED (v32uqi, unsigned char, 32, f4, <=);
+  TEST_SIGNED (v16hi, short int, 16, f5, >=);  /* 16 elements per vector */
+  TEST_UNSIGNED (v16uhi, unsigned short int, 16, f6, >=);
+  TEST_SIGNED (v16hi, short int, 16, f7, <=);
+  TEST_UNSIGNED (v16uhi, unsigned short int, 16, f8, <=);
+  TEST_SIGNED (v8si, int, 8, f9, >=);  /* 8 elements per vector */
+  TEST_UNSIGNED (v8usi, unsigned int, 8, f10, >=);
+  TEST_SIGNED (v8si, int, 8, f11, <=);
+  TEST_UNSIGNED (v8usi, unsigned int, 8, f12, <=);
+  TEST_SIGNED (v4di, long long int, 4, f13, >=);  /* 4 elements per vector */
+  TEST_UNSIGNED (v4udi, unsigned long long int, 4, f14, >=);
+  TEST_SIGNED (v4di, long long int, 4, f15, <=);
+  TEST_UNSIGNED (v4udi, unsigned long long int, 4, f16, <=);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr92686-vpcmp-intelasm-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr92686-vpcmp-intelasm-1.c
new file mode 100644 (file)
index 0000000..c9a1b69
--- /dev/null
@@ -0,0 +1,110 @@
+/* PR target/92686 */
+/* { dg-do assemble } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl -mno-avx512dq -mno-xop -masm=intel" } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+
+typedef signed char v32qi __attribute__((vector_size(32)));
+typedef unsigned char v32uqi __attribute__((vector_size(32)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef unsigned short v16uhi __attribute__((vector_size(32)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef unsigned v8usi __attribute__((vector_size(32)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef unsigned long long v4udi __attribute__((vector_size(32)));
+
+__attribute__((noipa)) v32qi  /* noipa: GCC attribute disabling interprocedural optimization between this kernel and its callers */
+f1 (v32qi x, v32qi y)  /* signed char elements, element-wise >=; this file only has to assemble with -masm=intel */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v32uqi
+f2 (v32uqi x, v32uqi y)  /* unsigned char elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v32qi
+f3 (v32qi x, v32qi y)  /* signed char elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v32uqi
+f4 (v32uqi x, v32uqi y)  /* unsigned char elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v16hi
+f5 (v16hi x, v16hi y)  /* short elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v16uhi
+f6 (v16uhi x, v16uhi y)  /* unsigned short elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v16hi
+f7 (v16hi x, v16hi y)  /* short elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v16uhi
+f8 (v16uhi x, v16uhi y)  /* unsigned short elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v8si
+f9 (v8si x, v8si y)  /* int elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v8usi
+f10 (v8usi x, v8usi y)  /* unsigned int elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v8si
+f11 (v8si x, v8si y)  /* int elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v8usi
+f12 (v8usi x, v8usi y)  /* unsigned int elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v4di
+f13 (v4di x, v4di y)  /* long long elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v4udi
+f14 (v4udi x, v4udi y)  /* unsigned long long elements, >= */
+{
+  return x >= y;
+}
+
+__attribute__((noipa)) v4di
+f15 (v4di x, v4di y)  /* long long elements, <= */
+{
+  return x <= y;
+}
+
+__attribute__((noipa)) v4udi
+f16 (v4udi x, v4udi y)  /* unsigned long long elements, <= */
+{
+  return x <= y;
+}
index b6c82157bba6605c2ae2859f328157bebf8f87b2..fa6832c7fab7a0210ea59724ec9a2e0fe3a383d6 100644 (file)
@@ -1,19 +1,9 @@
 /* PR target/88547 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512vl -mavx512bw -mavx512dq" } */
-/* { dg-final { scan-assembler-not "vpternlog" } } */
-/* { dg-final { scan-assembler-times "vpmovm2b\[\t  ]" 4 } } */
-/* { dg-final { scan-assembler-times "vpmovm2w\[\t  ]" 4 } } */
-/* { dg-final { scan-assembler-times "vpmovm2d\[\t  ]" 4 } } */
-/* { dg-final { scan-assembler-times "vpmovm2q\[\t  ]" 4 } } */
-/* { dg-final { scan-assembler-times "knotb\[\t  ]" 2 } } */
-/* { dg-final { scan-assembler-times "knotw\[\t  ]" 2 } } */
-/* { dg-final { scan-assembler-times "knotd\[\t  ]" 2 } } */
-/* { dg-final { scan-assembler-times "knotq\[\t  ]" 2 } } */
-/* { dg-final { scan-assembler-times "vpminud\[\t  ]" 2 } } */
-/* { dg-final { scan-assembler-times "vpminuq\[\t  ]" 2 } } */
-/* { dg-final { scan-assembler-not "vpsubd\[\t  ]" } } */
-/* { dg-final { scan-assembler-not "vpsubq\[\t  ]" } } */
+/* { dg-final { scan-assembler-times "vpcmp\[bwdq\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vpcmpu\[bwdq\]\[\t ]" 8 } } */
+/* { dg-final { scan-assembler-times "vpmovm2\[bwdq\]\[\t ]" 16 } } */
 
 typedef signed char v64qi __attribute__((vector_size(64)));
 typedef unsigned char v64uqi __attribute__((vector_size(64)));
diff --git a/gcc/testsuite/gcc.target/i386/pr92686.inc b/gcc/testsuite/gcc.target/i386/pr92686.inc
new file mode 100644 (file)
index 0000000..260581e
--- /dev/null
@@ -0,0 +1,189 @@
+/* Included by avx512bw-pr92686-movcc-2.c and avx512vl-pr92686-movcc-2.c.  */
+/* Return SRC1 if comparison NUM holds for SRC1 against SRC2, else DST.
+   NUM: 0 ==, 1 <, 2 <=, 4 !=, 5 >=, 6 >; any other value aborts.  */
+__attribute__((noipa)) char
+cmpb (char dst, char src1, char src2, int num)
+{
+  int take;
+
+  switch (num)
+    {
+    case 0: take = src1 == src2; break;
+    case 1: take = src1 < src2; break;
+    case 2: take = src1 <= src2; break;
+    case 4: take = src1 != src2; break;
+    case 5: take = src1 >= src2; break;
+    case 6: take = src1 > src2; break;
+    default: abort ();
+    }
+
+  if (!take)
+    return dst;
+  return src1;
+}
+
+/* Return SRC1 if comparison NUM holds for SRC1 against SRC2, else DST.
+   NUM: 0 ==, 1 <, 2 <=, 4 !=, 5 >=, 6 >; any other value aborts.  */
+__attribute__((noipa)) unsigned char
+cmpub (unsigned char dst, unsigned char src1,
+       unsigned char src2, int num)
+{
+  int take;
+
+  switch (num)
+    {
+    case 0: take = src1 == src2; break;
+    case 1: take = src1 < src2; break;
+    case 2: take = src1 <= src2; break;
+    case 4: take = src1 != src2; break;
+    case 5: take = src1 >= src2; break;
+    case 6: take = src1 > src2; break;
+    default: abort ();
+    }
+
+  if (!take)
+    return dst;
+  return src1;
+}
+
+/* Return SRC1 if comparison NUM holds for SRC1 against SRC2, else DST.
+   NUM: 0 ==, 1 <, 2 <=, 4 !=, 5 >=, 6 >; any other value aborts.  */
+__attribute__((noipa)) short
+cmpw (short dst, short src1, short src2, int num)
+{
+  int take;
+
+  switch (num)
+    {
+    case 0: take = src1 == src2; break;
+    case 1: take = src1 < src2; break;
+    case 2: take = src1 <= src2; break;
+    case 4: take = src1 != src2; break;
+    case 5: take = src1 >= src2; break;
+    case 6: take = src1 > src2; break;
+    default: abort ();
+    }
+
+  if (!take)
+    return dst;
+  return src1;
+}
+
+/* Return SRC1 if comparison NUM holds for SRC1 against SRC2, else DST.
+   NUM: 0 ==, 1 <, 2 <=, 4 !=, 5 >=, 6 >; any other value aborts.  */
+__attribute__((noipa)) unsigned short
+cmpuw (unsigned short dst, unsigned short src1,
+       unsigned short src2, int num)
+{
+  int take;
+
+  switch (num)
+    {
+    case 0: take = src1 == src2; break;
+    case 1: take = src1 < src2; break;
+    case 2: take = src1 <= src2; break;
+    case 4: take = src1 != src2; break;
+    case 5: take = src1 >= src2; break;
+    case 6: take = src1 > src2; break;
+    default: abort ();
+    }
+
+  if (!take)
+    return dst;
+  return src1;
+}
+
+/* Return SRC1 if comparison NUM holds for SRC1 against SRC2, else DST.
+   NUM: 0 ==, 1 <, 2 <=, 4 !=, 5 >=, 6 >; any other value aborts.  */
+__attribute__((noipa)) int
+cmpd (int dst, int src1, int src2, int num)
+{
+  int take;
+
+  switch (num)
+    {
+    case 0: take = src1 == src2; break;
+    case 1: take = src1 < src2; break;
+    case 2: take = src1 <= src2; break;
+    case 4: take = src1 != src2; break;
+    case 5: take = src1 >= src2; break;
+    case 6: take = src1 > src2; break;
+    default: abort ();
+    }
+
+  if (!take)
+    return dst;
+  return src1;
+}
+
+/* Return SRC1 if comparison NUM holds for SRC1 against SRC2, else DST.
+   NUM: 0 ==, 1 <, 2 <=, 4 !=, 5 >=, 6 >; any other value aborts.  */
+__attribute__((noipa)) unsigned int
+cmpud (unsigned int dst, unsigned int src1,
+       unsigned int src2, int num)
+{
+  int take;
+
+  switch (num)
+    {
+    case 0: take = src1 == src2; break;
+    case 1: take = src1 < src2; break;
+    case 2: take = src1 <= src2; break;
+    case 4: take = src1 != src2; break;
+    case 5: take = src1 >= src2; break;
+    case 6: take = src1 > src2; break;
+    default: abort ();
+    }
+
+  if (!take)
+    return dst;
+  return src1;
+}
+
+/* Return SRC1 if comparison NUM holds for SRC1 against SRC2, else DST.
+   NUM: 0 ==, 1 <, 2 <=, 4 !=, 5 >=, 6 >; any other value aborts.  */
+__attribute__((noipa)) long long int
+cmpq (long long int dst, long long int src1,
+      long long int src2, int num)
+{
+  int take;
+
+  switch (num)
+    {
+    case 0: take = src1 == src2; break;
+    case 1: take = src1 < src2; break;
+    case 2: take = src1 <= src2; break;
+    case 4: take = src1 != src2; break;
+    case 5: take = src1 >= src2; break;
+    case 6: take = src1 > src2; break;
+    default: abort ();
+    }
+
+  if (!take)
+    return dst;
+  return src1;
+}
+
+/* Return SRC1 if comparison NUM holds for SRC1 against SRC2, else DST.
+   NUM: 0 ==, 1 <, 2 <=, 4 !=, 5 >=, 6 >; any other value aborts.  */
+__attribute__((noipa)) unsigned long long int
+cmpuq (unsigned long long int dst, unsigned long long int src1,
+       unsigned long long int src2, int num)
+{
+  int take;
+
+  switch (num)
+    {
+    case 0: take = src1 == src2; break;
+    case 1: take = src1 < src2; break;
+    case 2: take = src1 <= src2; break;
+    case 4: take = src1 != src2; break;
+    case 5: take = src1 >= src2; break;
+    case 6: take = src1 > src2; break;
+    default: abort ();
+    }
+
+  if (!take)
+    return dst;
+  return src1;
+}