AArch64: Add NEON, SVE and SVE2 RTL patterns for Multiply, FMS and FMA.
author Tamar Christina <tamar.christina@arm.com>
Fri, 15 Jan 2021 18:50:27 +0000 (18:50 +0000)
committer Tamar Christina <tamar.christina@arm.com>
Fri, 15 Jan 2021 18:50:27 +0000 (18:50 +0000)
This adds implementations of the optabs for complex operations.  With this, the
following C code:

  void g (float complex a[restrict N], float complex b[restrict N],
  float complex c[restrict N])
  {
    for (int i=0; i < N; i++)
      c[i] =  a[i] * b[i];
  }

generates

NEON:

g:
        movi    v3.4s, 0
        mov     x3, 0
        .p2align 3,,7
.L2:
        mov     v0.16b, v3.16b
        ldr     q2, [x1, x3]
        ldr     q1, [x0, x3]
        fcmla   v0.4s, v1.4s, v2.4s, #0
        fcmla   v0.4s, v1.4s, v2.4s, #90
        str     q0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, 1600
        bne     .L2
        ret

SVE:

g:
        mov     x3, 0
        mov     x4, 400
        ptrue   p1.b, all
        whilelo p0.s, xzr, x4
        mov     z3.s, #0
        .p2align 3,,7
.L2:
        ld1w    z1.s, p0/z, [x0, x3, lsl 2]
        ld1w    z2.s, p0/z, [x1, x3, lsl 2]
        movprfx z0, z3
        fcmla   z0.s, p1/m, z1.s, z2.s, #0
        fcmla   z0.s, p1/m, z1.s, z2.s, #90
        st1w    z0.s, p0, [x2, x3, lsl 2]
        incw    x3
        whilelo p0.s, x3, x4
        b.any   .L2
        ret

SVE2 (with int instead of float):
g:
        mov     x3, 0
        mov     x4, 400
        mov     z3.b, #0
        whilelo p0.s, xzr, x4
        .p2align 3,,7
.L2:
        ld1w    z1.s, p0/z, [x0, x3, lsl 2]
        ld1w    z2.s, p0/z, [x1, x3, lsl 2]
        movprfx z0, z3
        cmla    z0.s, z1.s, z2.s, #0
        cmla    z0.s, z1.s, z2.s, #90
        st1w    z0.s, p0, [x2, x3, lsl 2]
        incw    x3
        whilelo p0.s, x3, x4
        b.any   .L2
        ret

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (cml<fcmac1><conj_op><mode>4,
cmul<conj_op><mode>3): New.
* config/aarch64/iterators.md (UNSPEC_FCMUL,
UNSPEC_FCMUL_CONJ, UNSPEC_FCMLA_CONJ, UNSPEC_FCMLA180_CONJ,
UNSPEC_CMLA_CONJ, UNSPEC_CMLA180_CONJ, UNSPEC_CMUL, UNSPEC_CMUL_CONJ,
FCMLA_OP, FCMUL_OP, conj_op, rotsplit1, rotsplit2, fcmac1, sve_rot1,
sve_rot2, SVE2_INT_CMLA_OP, SVE2_INT_CMUL_OP, SVE2_INT_CADD_OP): New.
(rot): Add UNSPEC_FCMUL, UNSPEC_FCMUL_CONJ.
(rot_op): Renamed to conj_op.
* config/aarch64/aarch64-sve.md (cml<fcmac1><conj_op><mode>4,
cmul<conj_op><mode>3): New.
* config/aarch64/aarch64-sve2.md (cml<fcmac1><conj_op><mode>4,
cmul<conj_op><mode>3): New.

gcc/config/aarch64/aarch64-simd.md
gcc/config/aarch64/aarch64-sve.md
gcc/config/aarch64/aarch64-sve2.md
gcc/config/aarch64/iterators.md

index eabc915c681bf4700fc2e06826aa589ff6f9eecb..41071b668fd0982f55f9e48510403b9f50fe0f60 100644 (file)
   [(set_attr "type" "neon_fcmla")]
 )
 
+;; Optab expander for the complex mla/mls operations (cmla, cmls and their
+;; _conj forms) on VHSDF vectors.  These always need to expand to two FCMLA
+;; instructions.  The first, using rotation <rotsplit1>, does half the
+;; computation starting from operand 1 (the addend) and writes a fresh
+;; temporary; the second, using rotation <rotsplit2>, does the remainder on
+;; top of that temporary and writes operand 0.  Because of this, expand early.
+(define_expand "cml<fcmac1><conj_op><mode>4"
+  [(set (match_operand:VHSDF 0 "register_operand")
+       (plus:VHSDF (match_operand:VHSDF 1 "register_operand")
+                   (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand")
+                                  (match_operand:VHSDF 3 "register_operand")]
+                                  FCMLA_OP)))]
+  "TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (tmp, operands[1],
+                                                operands[3], operands[2]));
+  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0], tmp,
+                                                operands[3], operands[2]));
+  DONE;
+})
+
+;; Optab expander for the complex mul operations (cmul and cmul_conj) on
+;; VHSDF vectors.  These always need to expand to two FCMLA instructions.
+;; There is no addend, so the first instruction (rotation <rotsplit1>) starts
+;; from a zero vector and does half the computation; the second (rotation
+;; <rotsplit2>) does the remainder on top of that intermediate result and
+;; writes operand 0.  Because of this, expand early.
+(define_expand "cmul<conj_op><mode>3"
+  [(set (match_operand:VHSDF 0 "register_operand")
+       (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")
+                      (match_operand:VHSDF 2 "register_operand")]
+                      FCMUL_OP))]
+  "TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
+{
+  rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+  rtx res1 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_fcmla<rotsplit1><mode> (res1, tmp,
+                                                operands[2], operands[1]));
+  emit_insn (gen_aarch64_fcmla<rotsplit2><mode> (operands[0], res1,
+                                                operands[2], operands[1]));
+  DONE;
+})
+
 ;; These instructions map to the __builtins for the Dot Product operations.
 (define_insn "aarch64_<sur>dot<vsi2qi>"
   [(set (match_operand:VS 0 "register_operand" "=w")
index da15bd8788507feb12d52894c14e099370f34108..608319600318974b414e47285ee1474b041f0e05 100644 (file)
   [(set_attr "movprfx" "*,yes")]
 )
 
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+;; Both instructions are emitted through the predicated FCMLA patterns, using
+;; the register returned by aarch64_ptrue_reg as the predicate (operand 4) and
+;; the SVE_RELAXED_GP constant (operand 5).  Operand 1 is the value accumulated
+;; onto; the result of the first instruction (rotation <sve_rot1>) takes its
+;; place in the second (rotation <sve_rot2>).
+(define_expand "cml<fcmac1><conj_op><mode>4"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+       (unspec:SVE_FULL_F
+         [(match_dup 4)
+          (match_dup 5)
+          (match_operand:SVE_FULL_F 1 "register_operand")
+          (match_operand:SVE_FULL_F 2 "register_operand")
+          (match_operand:SVE_FULL_F 3 "register_operand")]
+         FCMLA_OP))]
+  "TARGET_SVE"
+{
+  operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  operands[5] = gen_int_mode (SVE_RELAXED_GP, SImode);
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn
+    (gen_aarch64_pred_fcmla<sve_rot1><mode> (tmp, operands[4],
+                                            operands[3], operands[2],
+                                            operands[1], operands[5]));
+  emit_insn
+    (gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[4],
+                                            operands[3], operands[2],
+                                            tmp, operands[5]));
+  DONE;
+})
+
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+;; Both instructions are emitted through the predicated FCMLA patterns, using
+;; the register returned by aarch64_ptrue_reg as the predicate and the
+;; SVE_RELAXED_GP constant.  The first instruction (rotation <sve_rot1>)
+;; accumulates onto a zero vector; its result is the accumulator of the
+;; second (rotation <sve_rot2>), which writes operand 0.
+(define_expand "cmul<conj_op><mode>3"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+       (unspec:SVE_FULL_F
+          [(match_operand:SVE_FULL_F 1 "register_operand")
+           (match_operand:SVE_FULL_F 2 "register_operand")]
+         FCMUL_OP))]
+  "TARGET_SVE"
+{
+  rtx pred_reg = aarch64_ptrue_reg (<VPRED>mode);
+  rtx gp_mode = gen_int_mode (SVE_RELAXED_GP, SImode);
+  rtx accum = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn
+    (gen_aarch64_pred_fcmla<sve_rot1><mode> (tmp, pred_reg,
+                                            operands[2], operands[1],
+                                            accum, gp_mode));
+  emit_insn
+    (gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], pred_reg,
+                                            operands[2], operands[1],
+                                            tmp, gp_mode));
+  DONE;
+})
+
 ;; Predicated FCMLA with merging.
 (define_expand "@cond_<optab><mode>"
   [(set (match_operand:SVE_FULL_F 0 "register_operand")
index 5cb9144da98af2d02b83043511a99b5723d7e8c0..e7cd2b86d25f9a74dada4321aec22439dd07ae19 100644 (file)
   [(set_attr "movprfx" "*,yes")]
 )
 
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The integer complex mla/mls operations always need to expand to two CMLA
+;; instructions.  The first (rotation <sve_rot1>) does half the computation
+;; starting from operand 1 (the addend) and writes a fresh temporary; the
+;; second (rotation <sve_rot2>) does the remainder on top of that temporary
+;; and writes operand 0.  Because of this, expand early.
+(define_expand "cml<fcmac1><conj_op><mode>4"
+  [(set (match_operand:SVE_FULL_I 0 "register_operand")
+       (plus:SVE_FULL_I (match_operand:SVE_FULL_I 1 "register_operand")
+         (unspec:SVE_FULL_I
+           [(match_operand:SVE_FULL_I 2 "register_operand")
+            (match_operand:SVE_FULL_I 3 "register_operand")]
+           SVE2_INT_CMLA_OP)))]
+  "TARGET_SVE2"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_sve_cmla<sve_rot1><mode> (tmp, operands[1],
+                                                  operands[3], operands[2]));
+  emit_insn (gen_aarch64_sve_cmla<sve_rot2><mode> (operands[0], tmp,
+                                                  operands[3], operands[2]));
+  DONE;
+})
+
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The integer complex mul operations always need to expand to two CMLA
+;; instructions.  There is no addend, so the first instruction (rotation
+;; <sve_rot1>) starts from a zero vector and does half the computation; the
+;; second (rotation <sve_rot2>) does the remainder on top of that intermediate
+;; result and writes operand 0.  Because of this, expand early.
+(define_expand "cmul<conj_op><mode>3"
+  [(set (match_operand:SVE_FULL_I 0 "register_operand")
+       (unspec:SVE_FULL_I
+         [(match_operand:SVE_FULL_I 1 "register_operand")
+          (match_operand:SVE_FULL_I 2 "register_operand")]
+         SVE2_INT_CMUL_OP))]
+  "TARGET_SVE2"
+{
+  rtx accum = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_sve_cmla<sve_rot1><mode> (tmp, accum,
+                                                  operands[2], operands[1]));
+  emit_insn (gen_aarch64_sve_cmla<sve_rot2><mode> (operands[0], tmp,
+                                                  operands[2], operands[1]));
+  DONE;
+})
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Complex dot product
 ;; -------------------------------------------------------------------------
index d42a70653edb266f2b76924b75a814db25f08f23..b64d77037af7345b2664362be34119eddc14ad93 100644 (file)
     UNSPEC_FCMLA90     ; Used in aarch64-simd.md.
     UNSPEC_FCMLA180    ; Used in aarch64-simd.md.
     UNSPEC_FCMLA270    ; Used in aarch64-simd.md.
+    UNSPEC_FCMUL       ; Used in aarch64-simd.md.
+    UNSPEC_FCMUL_CONJ  ; Used in aarch64-simd.md.
+    UNSPEC_FCMLA_CONJ  ; Used in aarch64-simd.md.
+    UNSPEC_FCMLA180_CONJ       ; Used in aarch64-simd.md.
     UNSPEC_ASRD                ; Used in aarch64-sve.md.
     UNSPEC_ADCLB       ; Used in aarch64-sve2.md.
     UNSPEC_ADCLT       ; Used in aarch64-sve2.md.
     UNSPEC_CMLA180     ; Used in aarch64-sve2.md.
     UNSPEC_CMLA270     ; Used in aarch64-sve2.md.
     UNSPEC_CMLA90      ; Used in aarch64-sve2.md.
+    UNSPEC_CMLA_CONJ   ; Used in aarch64-sve2.md.
+    UNSPEC_CMLA180_CONJ        ; Used in aarch64-sve2.md.
+    UNSPEC_CMUL                ; Used in aarch64-sve2.md.
+    UNSPEC_CMUL_CONJ   ; Used in aarch64-sve2.md.
     UNSPEC_COND_FCVTLT ; Used in aarch64-sve2.md.
     UNSPEC_COND_FCVTNT ; Used in aarch64-sve2.md.
     UNSPEC_COND_FCVTX  ; Used in aarch64-sve2.md.
 
 ;; Widened mode register suffixes for VD_BHSI/VQW/VQ_HSF.
 (define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s")
-                         (V2SI "2d") (V16QI "8h") 
+                         (V2SI "2d") (V16QI "8h")
                          (V8HI "4s") (V4SI "2d")
                          (V8HF "4s") (V4SF "2d")])
 
 
 ;; Widened mode register suffixes for VDW/VQW.
 (define_mode_attr Vmwtype [(V8QI ".8h") (V4HI ".4s")
-                          (V2SI ".2d") (V16QI ".8h") 
+                          (V2SI ".2d") (V16QI ".8h")
                           (V8HI ".4s") (V4SI ".2d")
                           (V4HF ".4s") (V2SF ".2d")
                           (SI   "")    (HI   "")])
                                    UNSPEC_SQRDCMLAH180
                                    UNSPEC_SQRDCMLAH270])
 
+;; Unlike the normal CMLA instructions these represent the actual operation
+;; to be performed.  Each one always needs to be expanded into a sequence of
+;; multiple CMLA instructions.
+(define_int_iterator SVE2_INT_CMLA_OP [UNSPEC_CMLA
+                                      UNSPEC_CMLA_CONJ
+                                      UNSPEC_CMLA180
+                                      UNSPEC_CMLA180_CONJ])
+
+;; Unlike the normal CMLA instructions these represent the actual (complex
+;; multiply) operation to be performed.  Each one always needs to be expanded
+;; into a sequence of multiple CMLA instructions.
+(define_int_iterator SVE2_INT_CMUL_OP [UNSPEC_CMUL
+                                      UNSPEC_CMUL_CONJ])
+
 ;; Same as SVE2_INT_CADD but exclude the saturating instructions
 (define_int_iterator SVE2_INT_CADD_OP [UNSPEC_CADD90
                                       UNSPEC_CADD270])
 (define_int_iterator BF_MLA [UNSPEC_BFMLALB
                             UNSPEC_BFMLALT])
 
+;; The floating-point complex mla/mls operations handled by the
+;; cml<fcmac1><conj_op><mode>4 expanders.  These name the whole operation;
+;; each one is expanded into two FCMLA instructions.
+(define_int_iterator FCMLA_OP [UNSPEC_FCMLA
+                              UNSPEC_FCMLA180
+                              UNSPEC_FCMLA_CONJ
+                              UNSPEC_FCMLA180_CONJ])
+
+;; The floating-point complex mul operations handled by the
+;; cmul<conj_op><mode>3 expanders.  These name the whole operation; each one
+;; is expanded into two FCMLA instructions.
+(define_int_iterator FCMUL_OP [UNSPEC_FCMUL
+                              UNSPEC_FCMUL_CONJ])
+
 ;; Iterators for atomic operations.
 
 (define_int_iterator ATOMIC_LDOP
                      (UNSPEC_COND_FCMLA "0")
                      (UNSPEC_COND_FCMLA90 "90")
                      (UNSPEC_COND_FCMLA180 "180")
-                     (UNSPEC_COND_FCMLA270 "270")])
+                     (UNSPEC_COND_FCMLA270 "270")
+                     (UNSPEC_FCMUL "0")
+                     (UNSPEC_FCMUL_CONJ "180")])
+
+;; A conjugate is a negation of the imaginary component.
+;; The number in the unspecs is the rotation component of the instruction, e.g.
+;; FCMLA180 means use the instruction with #180.
+;; This attribute is used to produce the right name mangling for the function.
+(define_int_attr conj_op [(UNSPEC_FCMLA180 "")
+                         (UNSPEC_FCMLA180_CONJ "_conj")
+                         (UNSPEC_FCMLA "")
+                         (UNSPEC_FCMLA_CONJ "_conj")
+                         (UNSPEC_FCMUL "")
+                         (UNSPEC_FCMUL_CONJ "_conj")
+                         (UNSPEC_CMLA "")
+                         (UNSPEC_CMLA180 "")
+                         (UNSPEC_CMLA180_CONJ "_conj")
+                         (UNSPEC_CMLA_CONJ "_conj")
+                         (UNSPEC_CMUL "")
+                         (UNSPEC_CMUL_CONJ "_conj")])
+
+;; The complex operations when performed on a real complex number require two
+;; instructions to perform the operation, e.g. complex multiplication requires
+;; two FCMLA instructions, each with a particular rotation value.
+;;
+;; These values can be looked up in rotsplit1 and rotsplit2.  As an example,
+;; FCMUL needs the first instruction to use #0 and the second #90.
+(define_int_attr rotsplit1 [(UNSPEC_FCMLA "0")
+                           (UNSPEC_FCMLA_CONJ "0")
+                           (UNSPEC_FCMUL "0")
+                           (UNSPEC_FCMUL_CONJ "0")
+                           (UNSPEC_FCMLA180 "180")
+                           (UNSPEC_FCMLA180_CONJ "180")])
+
+;; Rotation used by the second instruction of each two-instruction expansion:
+;; 90 for the plain forms and 270 for the conjugated ones, with the two
+;; swapped for the 180 operations.
+(define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
+                           (UNSPEC_FCMLA_CONJ "270")
+                           (UNSPEC_FCMUL "90")
+                           (UNSPEC_FCMUL_CONJ "270")
+                           (UNSPEC_FCMLA180 "270")
+                           (UNSPEC_FCMLA180_CONJ "90")])
+
+;; SVE has slightly different namings from NEON (the rotation-0 patterns have
+;; no rotation suffix, hence "" rather than "0"), so we have to use separate
+;; attributes.  sve_rot1 gives the rotation suffix of the first instruction of
+;; each two-instruction expansion.
+(define_int_attr sve_rot1 [(UNSPEC_FCMLA "")
+                          (UNSPEC_FCMLA_CONJ "")
+                          (UNSPEC_FCMUL "")
+                          (UNSPEC_FCMUL_CONJ "")
+                          (UNSPEC_FCMLA180 "180")
+                          (UNSPEC_FCMLA180_CONJ "180")
+                          (UNSPEC_CMLA "")
+                          (UNSPEC_CMLA_CONJ "")
+                          (UNSPEC_CMUL "")
+                          (UNSPEC_CMUL_CONJ "")
+                          (UNSPEC_CMLA180 "180")
+                          (UNSPEC_CMLA180_CONJ "180")])
+
+;; Rotation suffix of the second instruction of each two-instruction SVE
+;; expansion: 90 for the plain forms and 270 for the conjugated ones, with the
+;; two swapped for the 180 operations.
+(define_int_attr sve_rot2 [(UNSPEC_FCMLA "90")
+                          (UNSPEC_FCMLA_CONJ "270")
+                          (UNSPEC_FCMUL "90")
+                          (UNSPEC_FCMUL_CONJ "270")
+                          (UNSPEC_FCMLA180 "270")
+                          (UNSPEC_FCMLA180_CONJ "90")
+                          (UNSPEC_CMLA "90")
+                          (UNSPEC_CMLA_CONJ "270")
+                          (UNSPEC_CMUL "90")
+                          (UNSPEC_CMUL_CONJ "270")
+                          (UNSPEC_CMLA180 "270")
+                          (UNSPEC_CMLA180_CONJ "90")])
+
+
+;; Final letter of the cml<fcmac1> pattern name: "a" (giving cmla) for the
+;; plain unspecs and "s" (giving cmls) for the 180 unspecs.
+(define_int_attr fcmac1 [(UNSPEC_FCMLA "a") (UNSPEC_FCMLA_CONJ "a")
+                        (UNSPEC_FCMLA180 "s") (UNSPEC_FCMLA180_CONJ "s")
+                        (UNSPEC_CMLA "a") (UNSPEC_CMLA_CONJ "a")
+                        (UNSPEC_CMLA180 "s") (UNSPEC_CMLA180_CONJ "s")])
 
 (define_int_attr sve_fmla_op [(UNSPEC_COND_FMLA "fmla")
                              (UNSPEC_COND_FMLS "fmls")