[AArch64] Support for LDP/STP of Q-registers
author Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Wed, 20 Jun 2018 08:57:17 +0000 (08:57 +0000)
committer Kyrylo Tkachov <ktkachov@gcc.gnu.org>
Wed, 20 Jun 2018 08:57:17 +0000 (08:57 +0000)
This patch adds support for generating LDPs and STPs of Q-registers.
This allows for more compact code generation and makes better use of the ISA.

It's implemented in a straightforward way by allowing 16-byte modes in the
sched-fusion machinery and adding appropriate peepholes in aarch64-ldpstp.md
as well as the patterns themselves in aarch64-simd.md.

It adds a new no_ldp_stp_qregs tuning flag.
I use it to restrict the peepholes in aarch64-ldpstp.md from merging the
operations together into PARALLELs. I also use it to restrict the sched fusion
check that brings such loads and stores together. This is enough to avoid
forming the pairs when the tuning flag is set.

I didn't see any non-noise performance effect on SPEC2017 on Cortex-A72 and Cortex-A53.

        * config/aarch64/aarch64-tuning-flags.def (no_ldp_stp_qregs): New.
        * config/aarch64/aarch64.c (xgene1_tunings): Add
        AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS to tune_flags.
        (aarch64_mode_valid_for_sched_fusion_p):
        Allow 16-byte modes.
        (aarch64_classify_address): Allow 16-byte modes for load_store_pair_p.
        * config/aarch64/aarch64-ldpstp.md: Add peepholes for LDP STP of
        128-bit modes.
        * config/aarch64/aarch64-simd.md (load_pair<VQ:mode><VQ2:mode>):
        New pattern.
        (vec_store_pair<VQ:mode><VQ2:mode>): Likewise.
        * config/aarch64/iterators.md (VQ2): New mode iterator.

        * gcc.target/aarch64/ldp_stp_q.c: New test.
        * gcc.target/aarch64/stp_vec_128_1.c: Likewise.
        * gcc.target/aarch64/ldp_stp_q_disable.c: Likewise.

From-SVN: r261796

gcc/ChangeLog
gcc/config/aarch64/aarch64-ldpstp.md
gcc/config/aarch64/aarch64-simd.md
gcc/config/aarch64/aarch64-tuning-flags.def
gcc/config/aarch64/aarch64.c
gcc/config/aarch64/iterators.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/ldp_stp_q.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/stp_vec_128_1.c [new file with mode: 0644]

index c2639798fb34232ddd32f1f0bd666c4e6db11570..ae490155217d5a84018ca9887552458b435d5460 100644 (file)
@@ -1,3 +1,18 @@
+2018-06-20  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+       * config/aarch64/aarch64-tuning-flags.def (no_ldp_stp_qregs): New.
+       * config/aarch64/aarch64.c (xgene1_tunings): Add
+       AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS to tune_flags.
+       (aarch64_mode_valid_for_sched_fusion_p):
+       Allow 16-byte modes.
+       (aarch64_classify_address): Allow 16-byte modes for load_store_pair_p.
+       * config/aarch64/aarch64-ldpstp.md: Add peepholes for LDP STP of
+       128-bit modes.
+       * config/aarch64/aarch64-simd.md (load_pair<VQ:mode><VQ2:mode>):
+       New pattern.
+       (vec_store_pair<VQ:mode><VQ2:mode>): Likewise.
+       * config/aarch64/iterators.md (VQ2): New mode iterator.
+
 2018-06-20  Martin Liska  <mliska@suse.cz>
 
        * tree-switch-conversion.c (jump_table_cluster::can_be_handled):
index 7f1031dc80fab31f691c0b03d6a485c1b6fd7e53..650a80dcb2fed1c148a5bff83468fb2b768aa14a 100644 (file)
   aarch64_swap_ldrstr_operands (operands, false);
 })
 
+;; Merge two consecutive loads of 128-bit vector modes into one PARALLEL.
+;; The two destinations may use different quad vector modes (VQ and VQ2).
+;; The merge is only done when aarch64_operands_ok_for_ldpstp accepts the
+;; operands as a load pair and the no_ldp_stp_qregs tuning flag is clear.
+(define_peephole2
+  [(set (match_operand:VQ 0 "register_operand" "")
+       (match_operand:VQ 1 "memory_operand" ""))
+   (set (match_operand:VQ2 2 "register_operand" "")
+       (match_operand:VQ2 3 "memory_operand" ""))]
+  "TARGET_SIMD
+   && aarch64_operands_ok_for_ldpstp (operands, true, <VQ:MODE>mode)
+   && (aarch64_tune_params.extra_tuning_flags
+       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
+  [(parallel [(set (match_dup 0) (match_dup 1))
+             (set (match_dup 2) (match_dup 3))])]
+{
+  aarch64_swap_ldrstr_operands (operands, true);
+})
+
+;; Merge two consecutive stores of 128-bit vector modes into one PARALLEL.
+;; The two sources may use different quad vector modes (VQ and VQ2).
+;; The merge is only done when aarch64_operands_ok_for_ldpstp accepts the
+;; operands as a store pair and the no_ldp_stp_qregs tuning flag is clear.
+(define_peephole2
+  [(set (match_operand:VQ 0 "memory_operand" "")
+       (match_operand:VQ 1 "register_operand" ""))
+   (set (match_operand:VQ2 2 "memory_operand" "")
+       (match_operand:VQ2 3 "register_operand" ""))]
+  "TARGET_SIMD
+   && aarch64_operands_ok_for_ldpstp (operands, false, <VQ:MODE>mode)
+   && (aarch64_tune_params.extra_tuning_flags
+       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
+  [(parallel [(set (match_dup 0) (match_dup 1))
+             (set (match_dup 2) (match_dup 3))])]
+{
+  aarch64_swap_ldrstr_operands (operands, false);
+})
+
+
 ;; Handle sign/zero extended consecutive load/store.
 
 (define_peephole2
index dc4e0263096923097ac003fd7131a86bc661297e..aac5fa146ed8dde4507a0eb4ad6a07ce78d2f0cd 100644 (file)
   [(set_attr "type" "neon_stp")]
 )
 
+;; Load two Q-registers with a single LDP.  The insn only matches when the
+;; address of operand 3 equals the address of operand 1 plus the size of
+;; the VQ mode, i.e. the second memory location directly follows the first.
+(define_insn "load_pair<VQ:mode><VQ2:mode>"
+  [(set (match_operand:VQ 0 "register_operand" "=w")
+       (match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump"))
+   (set (match_operand:VQ2 2 "register_operand" "=w")
+       (match_operand:VQ2 3 "memory_operand" "m"))]
+  "TARGET_SIMD
+    && rtx_equal_p (XEXP (operands[3], 0),
+                   plus_constant (Pmode,
+                              XEXP (operands[1], 0),
+                              GET_MODE_SIZE (<VQ:MODE>mode)))"
+  "ldp\\t%q0, %q2, %1"
+  [(set_attr "type" "neon_ldp_q")]
+)
+
+;; Store two Q-registers with a single STP.  The insn only matches when the
+;; address of operand 2 equals the address of operand 0 plus the size of
+;; the VQ mode, i.e. the second memory location directly follows the first.
+(define_insn "vec_store_pair<VQ:mode><VQ2:mode>"
+  [(set (match_operand:VQ 0 "aarch64_mem_pair_operand" "=Ump")
+       (match_operand:VQ 1 "register_operand" "w"))
+   (set (match_operand:VQ2 2 "memory_operand" "=m")
+       (match_operand:VQ2 3 "register_operand" "w"))]
+  "TARGET_SIMD && rtx_equal_p (XEXP (operands[2], 0),
+               plus_constant (Pmode,
+                              XEXP (operands[0], 0),
+                              GET_MODE_SIZE (<VQ:MODE>mode)))"
+  "stp\\t%q1, %q3, %0"
+  [(set_attr "type" "neon_stp_q")]
+)
+
+
 (define_split
   [(set (match_operand:VQ 0 "register_operand" "")
       (match_operand:VQ 1 "register_operand" ""))]
index ea9ead234cbc9abcff874dcc1fef4e76c76239ad..fb9700ca42f4660e2d5e19c14866a9d784318dd5 100644 (file)
@@ -41,4 +41,7 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
    are not considered cheap.  */
 AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
+/* Disallow load/store pair instructions on Q-registers.  */
+AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
index bd0ac2f04d8f43fd54b58ff9581645493b8d0cd1..b88e7cac27ab76e01b9769563ec9077d2a81bd7b 100644 (file)
@@ -880,7 +880,7 @@ static const struct tune_params xgene1_tunings =
   2,   /* min_div_recip_mul_df.  */
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_OFF,     /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),   /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),       /* tune_flags.  */
   &generic_prefetch_tune
 };
 
@@ -5690,7 +5690,10 @@ aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
   return mode == SImode || mode == DImode
         || mode == SFmode || mode == DFmode
         || (aarch64_vector_mode_supported_p (mode)
-            && known_eq (GET_MODE_SIZE (mode), 8));
+            && (known_eq (GET_MODE_SIZE (mode), 8)
+                || (known_eq (GET_MODE_SIZE (mode), 16)
+                   && (aarch64_tune_params.extra_tuning_flags
+                       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
 }
 
 /* Return true if REGNO is a virtual pointer register, or an eliminable
@@ -5847,7 +5850,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
 
          if (load_store_pair_p)
            return ((known_eq (GET_MODE_SIZE (mode), 4)
-                    || known_eq (GET_MODE_SIZE (mode), 8))
+                    || known_eq (GET_MODE_SIZE (mode), 8)
+                    || known_eq (GET_MODE_SIZE (mode), 16))
                    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
          else
            return (offset_9bit_signed_unscaled_p (mode, offset)
@@ -5907,7 +5911,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
 
          if (load_store_pair_p)
            return ((known_eq (GET_MODE_SIZE (mode), 4)
-                    || known_eq (GET_MODE_SIZE (mode), 8))
+                    || known_eq (GET_MODE_SIZE (mode), 8)
+                    || known_eq (GET_MODE_SIZE (mode), 16))
                    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
          else
            return offset_9bit_signed_unscaled_p (mode, offset);
index dbe1e34b61d1dcc96eebbf80b9745df94f130232..c5ef2eecf20c4167c071f1b964e45092222b147c 100644 (file)
@@ -84,6 +84,9 @@
 ;; Quad vector modes.
 (define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
+;; Copy of the above, for patterns using two independent quad modes.
+(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
+
 ;; Quad integer vector modes.
 (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
 
index 4dd987bcd647afef14ded9a49e8afa88dc9f97b6..a2c5eabd5a6efcaae2df3cd016eb8a1560c41ac5 100644 (file)
@@ -1,3 +1,9 @@
+2018-06-20  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+       * gcc.target/aarch64/ldp_stp_q.c: New test.
+       * gcc.target/aarch64/stp_vec_128_1.c: Likewise.
+       * gcc.target/aarch64/ldp_stp_q_disable.c: Likewise.
+
 2018-06-20  Martin Liska  <mliska@suse.cz>
 
        * gcc.dg/tree-ssa/vrp104.c: Grep just for GIMPLE IL.
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_q.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_q.c
new file mode 100644 (file)
index 0000000..b4f2586
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-options "-O2 -moverride=tune=none" } */
+/* Check that the 14 Q-register stores below are emitted as 7 STPs.  */
+typedef float float32x4_t __attribute__ ((__vector_size__ ((16))));
+
+float32x4_t arr[4][4];
+
+void
+foo (float32x4_t x, float32x4_t y)
+{
+  arr[0][1] = x;
+  arr[1][0] = y;
+  arr[2][0] = x;
+  arr[1][1] = y;
+  arr[0][2] = x;
+  arr[0][3] = y;
+  arr[1][2] = x;
+  arr[2][1] = y;
+  arr[3][0] = x;
+  arr[3][1] = y;
+  arr[2][2] = x;
+  arr[1][3] = y;
+  arr[2][3] = x;
+  arr[3][2] = y;
+}
+
+/* { dg-final { scan-assembler-times "stp\tq\[0-9\]+, q\[0-9\]" 7 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c
new file mode 100644 (file)
index 0000000..38c1870
--- /dev/null
@@ -0,0 +1,26 @@
+/* { dg-options "-O2 -moverride=tune=no_ldp_stp_qregs" } */
+/* Check that no STP of Q-registers is formed with no_ldp_stp_qregs set.  */
+typedef float float32x4_t __attribute__ ((__vector_size__ ((16))));
+
+float32x4_t arr[4][4];
+
+void
+foo (float32x4_t x, float32x4_t y)
+{
+  arr[0][1] = x;
+  arr[1][0] = y;
+  arr[2][0] = x;
+  arr[1][1] = y;
+  arr[0][2] = x;
+  arr[0][3] = y;
+  arr[1][2] = x;
+  arr[2][1] = y;
+  arr[3][0] = x;
+  arr[3][1] = y;
+  arr[2][2] = x;
+  arr[1][3] = y;
+  arr[2][3] = x;
+  arr[3][2] = y;
+}
+
+/* { dg-final { scan-assembler-not "stp\tq\[0-9\]+, q\[0-9\]" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_128_1.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_128_1.c
new file mode 100644 (file)
index 0000000..7d8d54e
--- /dev/null
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -moverride=tune=none" } */
+/* Check that the two stores per iteration use an STP of Q-registers.  */
+
+typedef int int32x4_t __attribute__ ((__vector_size__ ((16))));
+
+void
+bar (int32x4_t *foo)
+{
+  int i = 0;
+  int32x4_t val = { 3, 2, 5, 1 };
+
+  for (i = 0; i < 256; i+=2)
+    {
+      foo[i] = val;
+      foo[i+1] = val;
+    }
+}
+
+/* { dg-final { scan-assembler "stp\tq\[0-9\]+, q\[0-9\]" } } */