/* Copyright (C) 1988-2021 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */
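/* For example, splitting a DImode pseudo on a 32-bit target produces
   lo_half = (subreg:SI (reg:DI x) 0) and hi_half = (subreg:SI (reg:DI x) 4),
   while an offsettable MEM is split with adjust_address at offsets 0 and
   GET_MODE_SIZE (half_mode).  */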
split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])

  machine_mode half_mode;
  rtx mem_op = NULL_RTX;

      byte = GET_MODE_SIZE (half_mode);

      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
         but we still have to handle it.  */
      if (mem_op && rtx_equal_p (op, mem_op))

          lo_half[num] = lo_half[mem_num];
          hi_half[num] = hi_half[mem_num];

          lo_half[num] = adjust_address (op, half_mode, 0);
          hi_half[num] = adjust_address (op, half_mode, byte);

          lo_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), 0);
          hi_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), byte);
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

ix86_expand_clear (rtx dest)

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())

      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
ix86_expand_move (machine_mode mode, rtx operands[])

  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  /* Avoid complex sets of likely spilled hard registers before reload.  */
  if (!ix86_hardreg_mov_ok (op0, op1))

      tmp = gen_reg_rtx (mode);
      ix86_expand_move (mode, operands);

  switch (GET_CODE (op1))

      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)

      addend = XEXP (tmp, 1);

      model = SYMBOL_REF_TLS_MODEL (op1);

        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))

          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());

          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);

              op1 = force_operand (op1, NULL_RTX);
              op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                         op0, 1, OPTAB_DIRECT);
            op1 = force_operand (op1, op0);

        op1 = convert_to_mode (mode, op1, 1);

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))

      if (TARGET_MACHO && !TARGET_64BIT)

          if (MACHOPIC_INDIRECT)

              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);
              op1 = machopic_legitimize_pic_address (op1, mode,
                                                     temp == op1 ? 0 : temp);

          if (op0 != op1 && GET_CODE (op0) != MEM)

              rtx insn = gen_rtx_SET (op0, op1);

          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);

              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);

            op1 = force_reg (mode, op1);
          else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))

              rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
              op1 = legitimize_pic_address (op1, reg);

        op1 = convert_to_mode (mode, op1, 1);

          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))
        op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into a register
         so that they get CSEd.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)
        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ()
          && CONST_DOUBLE_P (op1))

          /* If we are loading a floating point constant to a register,
             force the value to memory now, since we'll get better code
          op1 = validize_mem (force_const_mem (mode, op1));
          if (!register_operand (op0, mode))

              rtx temp = gen_reg_rtx (mode);
              emit_insn (gen_rtx_SET (temp, op1));
              emit_move_insn (op0, temp);

  emit_insn (gen_rtx_SET (op0, op1));
ix86_expand_vector_move (machine_mode mode, rtx operands[])

  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 bytes for the IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register; once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
           && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)
              && MEM_ALIGN (op0) < align)))

          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
            r = validize_mem (r);
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));

        op1 = validize_mem (force_const_mem (mode, op1));

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))

      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))
        op1 = force_reg (mode, op1);

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))

      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));

  emit_insn (gen_rtx_SET (op0, op1));
/* Split 32-byte AVX unaligned load and store if needed.  */
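/* For instance, when the tuning flags call for it, an unaligned 32-byte load
   is emitted below as two 16-byte loads whose results are combined with a
   vec_concat, and an unaligned 32-byte store as two vextractf128 stores of
   the low and high 128-bit halves.  */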
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)

  rtx (*extract) (rtx, rtx, rtx);

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))

      emit_insn (gen_rtx_SET (op0, op1));

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))

    case MODE_VECTOR_INT:
      if (mode != V32QImode)

          op0 = gen_reg_rtx (V32QImode);

          op0 = gen_lowpart (V32QImode, op0);
          op1 = gen_lowpart (V32QImode, op1);

    case MODE_VECTOR_FLOAT:

      extract = gen_avx_vextractf128v32qi;

      extract = gen_avx_vextractf128v8sf;

      extract = gen_avx_vextractf128v4df;

      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);

  else if (MEM_P (op0))

      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));

    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */

/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
     if (x86_sse_partial_reg_dependency == true)

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
     if (x86_sse_partial_reg_dependency == true)

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
     if (x86_sse_split_regs == true)  */
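/* For example, with the split-regs strategy the expander below loads an
   unaligned V2DFmode value in two halves,

       movlpd mem,   reg
       movhpd mem+8, reg      (gas syntax)

   after the destination has first been either clobbered (split-regs targets)
   or zeroed (to break the dependency chain), so only the final movhpd writes
   the upper half.  */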
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())

      emit_insn (gen_rtx_SET (op0, op1));

      if (GET_MODE_SIZE (mode) == 32)
        ix86_avx256_split_vector_move_misalign (op0, op1);

        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
        emit_insn (gen_rtx_SET (op0, op1));

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)

      emit_insn (gen_rtx_SET (op0, op1));

      /* ??? If we have typed data, then it would appear that using
         movdqu is the only way to get unaligned data loaded with
         integer type.  */
      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)

          emit_insn (gen_rtx_SET (op0, op1));

      if (TARGET_SSE2 && mode == V2DFmode)

          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)

              /* ??? Not sure about the best option for the Intel chips.
                 The following would seem to satisfy; the register is
                 entirely cleared, breaking the dependency chain.  We
                 then store to the upper half, with a dependency depth
                 of one.  A rumor has it that Intel recommends two movsd
                 followed by an unpacklpd, but this is unconfirmed.  And
                 given that the dependency depth of the unpacklpd would
                 still be one, I'm not sure why this would be better.  */
              zero = CONST0_RTX (V2DFmode);

          m = adjust_address (op1, DFmode, 0);
          emit_insn (gen_sse2_loadlpd (op0, zero, m));
          m = adjust_address (op1, DFmode, 8);
          emit_insn (gen_sse2_loadhpd (op0, op0, m));

          if (mode != V4SFmode)
            t = gen_reg_rtx (V4SFmode);

          if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
            emit_move_insn (t, CONST0_RTX (V4SFmode));

          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (t, t, m));
          m = adjust_address (op1, V2SFmode, 8);
          emit_insn (gen_sse_loadhps (t, t, m));
          if (mode != V4SFmode)
            emit_move_insn (op0, gen_lowpart (mode, t));

  else if (MEM_P (op0))

      if (TARGET_SSE2 && mode == V2DFmode)

          m = adjust_address (op0, DFmode, 0);
          emit_insn (gen_sse2_storelpd (m, op1));
          m = adjust_address (op0, DFmode, 8);
          emit_insn (gen_sse2_storehpd (m, op1));

          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

          m = adjust_address (op0, V2SFmode, 0);
          emit_insn (gen_sse_storelps (m, op1));
          m = adjust_address (op0, V2SFmode, 8);
          emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
/* Move bits 64:95 to bits 32:63.  */
ix86_move_vector_high_sse_to_mmx (rtx op)

  rtx mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (0), GEN_INT (2),
                                          GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */
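/* For example, an MMX packsswb of two V4HImode operands is carried out on
   V8HImode subregs of the same registers: each source is saturating-truncated
   to a V8QImode half, the two halves are concatenated into a V16QImode SSE
   result, and the bytes produced from operands[2] (bits 64:95 of that result)
   are then moved down to bits 32:63 by ix86_move_vector_high_sse_to_mmx to
   form the 64-bit MMX value.  */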
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)

  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                                 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,

  ix86_move_vector_high_sse_to_mmx (op0);
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */
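/* For instance, for V8QImode operands the mask below selects elements
   { 0, 16, 1, 17, ..., 7, 23 } of the 32-byte concatenation of the two
   sources, i.e. the low-half interleave; for the punpckhXX (high_p) case the
   interleaved data that lands in bits 64:127 is afterwards moved down to
   bits 0:63 with an additional V4SImode vec_select.  */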
ix86_split_mmx_punpck (rtx operands[], bool high_p)

  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);

  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               GEN_INT (0), GEN_INT (16),
                               GEN_INT (1), GEN_INT (17),
                               GEN_INT (2), GEN_INT (18),
                               GEN_INT (3), GEN_INT (19),
                               GEN_INT (4), GEN_INT (20),
                               GEN_INT (5), GEN_INT (21),
                               GEN_INT (6), GEN_INT (22),
                               GEN_INT (7), GEN_INT (23)));

      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               GEN_INT (0), GEN_INT (8),
                               GEN_INT (1), GEN_INT (9),
                               GEN_INT (2), GEN_INT (10),
                               GEN_INT (3), GEN_INT (11)));

      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               GEN_INT (0), GEN_INT (4),
                               GEN_INT (1), GEN_INT (5)));

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);

      /* Move bits 64:127 to bits 0:63.  */
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                          GEN_INT (0), GEN_INT (0)));
      dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
      insn = gen_rtx_SET (dest, op1);
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */
828 ix86_swap_binary_operands_p (enum rtx_code code
, machine_mode mode
,
831 rtx dst
= operands
[0];
832 rtx src1
= operands
[1];
833 rtx src2
= operands
[2];
835 /* If the operation is not commutative, we can't do anything. */
836 if (GET_RTX_CLASS (code
) != RTX_COMM_ARITH
837 && GET_RTX_CLASS (code
) != RTX_COMM_COMPARE
)
840 /* Highest priority is that src1 should match dst. */
841 if (rtx_equal_p (dst
, src1
))
843 if (rtx_equal_p (dst
, src2
))
846 /* Next highest priority is that immediate constants come second. */
847 if (immediate_operand (src2
, mode
))
849 if (immediate_operand (src1
, mode
))
852 /* Lowest priority is that memory references should come second. */
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */
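/* For example, given reg0 = mem1 + mem2, the second memory operand is
   forced into a register (or, when both MEMs are identical, the value is
   loaded only once); and if the destination is a MEM that does not match
   src1, the result is first computed into a fresh pseudo and the caller
   copies it back to operands[0].  */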
867 ix86_fixup_binary_operands (enum rtx_code code
, machine_mode mode
,
870 rtx dst
= operands
[0];
871 rtx src1
= operands
[1];
872 rtx src2
= operands
[2];
874 /* Canonicalize operand order. */
875 if (ix86_swap_binary_operands_p (code
, mode
, operands
))
877 /* It is invalid to swap operands of different modes. */
878 gcc_assert (GET_MODE (src1
) == GET_MODE (src2
));
880 std::swap (src1
, src2
);
883 /* Both source operands cannot be in memory. */
884 if (MEM_P (src1
) && MEM_P (src2
))
886 /* Optimization: Only read from memory once. */
887 if (rtx_equal_p (src1
, src2
))
889 src2
= force_reg (mode
, src2
);
892 else if (rtx_equal_p (dst
, src1
))
893 src2
= force_reg (mode
, src2
);
895 src1
= force_reg (mode
, src1
);
898 /* If the destination is memory, and we do not have matching source
899 operands, do things in registers. */
900 if (MEM_P (dst
) && !rtx_equal_p (dst
, src1
))
901 dst
= gen_reg_rtx (mode
);
903 /* Source 1 cannot be a constant. */
904 if (CONSTANT_P (src1
))
905 src1
= force_reg (mode
, src1
);
907 /* Source 1 cannot be a non-matching memory. */
908 if (MEM_P (src1
) && !rtx_equal_p (dst
, src1
))
909 src1
= force_reg (mode
, src1
);
911 /* Improve address combine. */
913 && GET_MODE_CLASS (mode
) == MODE_INT
915 src2
= force_reg (mode
, src2
);
922 /* Similarly, but assume that the destination has already been
926 ix86_fixup_binary_operands_no_copy (enum rtx_code code
,
927 machine_mode mode
, rtx operands
[])
929 rtx dst
= ix86_fixup_binary_operands (code
, mode
, operands
);
930 gcc_assert (dst
== operands
[0]);
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */
938 ix86_expand_binary_operator (enum rtx_code code
, machine_mode mode
,
941 rtx src1
, src2
, dst
, op
, clob
;
943 dst
= ix86_fixup_binary_operands (code
, mode
, operands
);
947 /* Emit the instruction. */
949 op
= gen_rtx_SET (dst
, gen_rtx_fmt_ee (code
, mode
, src1
, src2
));
953 && !rtx_equal_p (dst
, src1
))
955 /* This is going to be an LEA; avoid splitting it later. */
960 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
961 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
964 /* Fix up the destination if needed. */
965 if (dst
!= operands
[0])
966 emit_move_insn (operands
[0], dst
);
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */
973 ix86_expand_vector_logical_operator (enum rtx_code code
, machine_mode mode
,
976 rtx op1
= NULL_RTX
, op2
= NULL_RTX
;
977 if (SUBREG_P (operands
[1]))
982 else if (SUBREG_P (operands
[2]))
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
993 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
994 && (SUBREG_P (op2
) || GET_CODE (op2
) == CONST_VECTOR
)
995 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1
))) == MODE_VECTOR_FLOAT
996 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1
))) == GET_MODE_SIZE (mode
)
997 && SUBREG_BYTE (op1
) == 0
998 && (GET_CODE (op2
) == CONST_VECTOR
999 || (GET_MODE (SUBREG_REG (op1
)) == GET_MODE (SUBREG_REG (op2
))
1000 && SUBREG_BYTE (op2
) == 0))
1001 && can_create_pseudo_p ())
1004 switch (GET_MODE (SUBREG_REG (op1
)))
1012 dst
= gen_reg_rtx (GET_MODE (SUBREG_REG (op1
)));
1013 if (GET_CODE (op2
) == CONST_VECTOR
)
1015 op2
= gen_lowpart (GET_MODE (dst
), op2
);
1016 op2
= force_reg (GET_MODE (dst
), op2
);
1021 op2
= SUBREG_REG (operands
[2]);
1022 if (!vector_operand (op2
, GET_MODE (dst
)))
1023 op2
= force_reg (GET_MODE (dst
), op2
);
1025 op1
= SUBREG_REG (op1
);
1026 if (!vector_operand (op1
, GET_MODE (dst
)))
1027 op1
= force_reg (GET_MODE (dst
), op1
);
1028 emit_insn (gen_rtx_SET (dst
,
1029 gen_rtx_fmt_ee (code
, GET_MODE (dst
),
1031 emit_move_insn (operands
[0], gen_lowpart (mode
, dst
));
1037 if (!vector_operand (operands
[1], mode
))
1038 operands
[1] = force_reg (mode
, operands
[1]);
1039 if (!vector_operand (operands
[2], mode
))
1040 operands
[2] = force_reg (mode
, operands
[2]);
1041 ix86_fixup_binary_operands_no_copy (code
, mode
, operands
);
1042 emit_insn (gen_rtx_SET (operands
[0],
1043 gen_rtx_fmt_ee (code
, mode
, operands
[1],
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */
1051 ix86_binary_operator_ok (enum rtx_code code
, machine_mode mode
,
1054 rtx dst
= operands
[0];
1055 rtx src1
= operands
[1];
1056 rtx src2
= operands
[2];
1058 /* Both source operands cannot be in memory. */
1059 if ((MEM_P (src1
) || bcst_mem_operand (src1
, mode
))
1060 && (MEM_P (src2
) || bcst_mem_operand (src2
, mode
)))
1063 /* Canonicalize operand order for commutative operators. */
1064 if (ix86_swap_binary_operands_p (code
, mode
, operands
))
1065 std::swap (src1
, src2
);
1067 /* If the destination is memory, we must have a matching source operand. */
1068 if (MEM_P (dst
) && !rtx_equal_p (dst
, src1
))
1071 /* Source 1 cannot be a constant. */
1072 if (CONSTANT_P (src1
))
1075 /* Source 1 cannot be a non-matching memory. */
1076 if (MEM_P (src1
) && !rtx_equal_p (dst
, src1
))
1077 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1081 || (TARGET_64BIT
&& mode
== DImode
))
1082 && satisfies_constraint_L (src2
));
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */
1092 ix86_expand_unary_operator (enum rtx_code code
, machine_mode mode
,
1095 bool matching_memory
= false;
1096 rtx src
, dst
, op
, clob
;
1101 /* If the destination is memory, and we do not have matching source
1102 operands, do things in registers. */
1105 if (rtx_equal_p (dst
, src
))
1106 matching_memory
= true;
1108 dst
= gen_reg_rtx (mode
);
1111 /* When source operand is memory, destination must match. */
1112 if (MEM_P (src
) && !matching_memory
)
1113 src
= force_reg (mode
, src
);
1115 /* Emit the instruction. */
1117 op
= gen_rtx_SET (dst
, gen_rtx_fmt_e (code
, mode
, src
));
1123 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1124 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
1127 /* Fix up the destination if needed. */
1128 if (dst
!= operands
[0])
1129 emit_move_insn (operands
[0], dst
);
/* Predict the just-emitted jump instruction to be taken with probability PROB.  */
1135 predict_jump (int prob
)
1137 rtx_insn
*insn
= get_last_insn ();
1138 gcc_assert (JUMP_P (insn
));
1139 add_reg_br_prob_note (insn
, profile_probability::from_reg_br_prob_base (prob
));
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */
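/* The payoff is that an 8-bit DIV divides AX by an 8-bit operand, leaving
   the quotient in AL and the remainder in AH, and is much cheaper than a
   full 32-bit/64-bit division.  The expander below ORs the dividend and
   divisor into a scratch register and tests it against -0x100 (~0xFF); if no
   bits above bit 7 are set, both values lie in [0, 255] and the QImode path
   is taken.  */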
1146 ix86_split_idivmod (machine_mode mode
, rtx operands
[],
1149 rtx_code_label
*end_label
, *qimode_label
;
1152 rtx scratch
, tmp0
, tmp1
, tmp2
;
1153 rtx (*gen_divmod4_1
) (rtx
, rtx
, rtx
, rtx
);
1158 if (GET_MODE (operands
[0]) == SImode
)
1160 if (GET_MODE (operands
[1]) == SImode
)
1161 gen_divmod4_1
= unsigned_p
? gen_udivmodsi4_1
: gen_divmodsi4_1
;
1164 = unsigned_p
? gen_udivmodsi4_zext_2
: gen_divmodsi4_zext_2
;
1168 = unsigned_p
? gen_udivmodsi4_zext_1
: gen_divmodsi4_zext_1
;
1172 gen_divmod4_1
= unsigned_p
? gen_udivmoddi4_1
: gen_divmoddi4_1
;
1179 end_label
= gen_label_rtx ();
1180 qimode_label
= gen_label_rtx ();
1182 scratch
= gen_reg_rtx (mode
);
  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
1186 emit_move_insn (scratch
, operands
[2]);
1187 scratch
= expand_simple_binop (mode
, IOR
, scratch
, operands
[3],
1188 scratch
, 1, OPTAB_DIRECT
);
1189 emit_insn (gen_test_ccno_1 (mode
, scratch
, GEN_INT (-0x100)));
1190 tmp0
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
1191 tmp0
= gen_rtx_EQ (VOIDmode
, tmp0
, const0_rtx
);
1192 tmp0
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp0
,
1193 gen_rtx_LABEL_REF (VOIDmode
, qimode_label
),
1195 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp0
));
1196 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
1197 JUMP_LABEL (insn
) = qimode_label
;
  /* Generate original signed/unsigned divmod.  */
1200 emit_insn (gen_divmod4_1 (operands
[0], operands
[1],
1201 operands
[2], operands
[3]));
1203 /* Branch to the end. */
1204 emit_jump_insn (gen_jump (end_label
));
1207 /* Generate 8bit unsigned divide. */
1208 emit_label (qimode_label
);
1209 /* Don't use operands[0] for result of 8bit divide since not all
1210 registers support QImode ZERO_EXTRACT. */
1211 tmp0
= lowpart_subreg (HImode
, scratch
, mode
);
1212 tmp1
= lowpart_subreg (HImode
, operands
[2], mode
);
1213 tmp2
= lowpart_subreg (QImode
, operands
[3], mode
);
1214 emit_insn (gen_udivmodhiqi3 (tmp0
, tmp1
, tmp2
));
1218 div
= gen_rtx_UDIV (mode
, operands
[2], operands
[3]);
1219 mod
= gen_rtx_UMOD (mode
, operands
[2], operands
[3]);
1223 div
= gen_rtx_DIV (mode
, operands
[2], operands
[3]);
1224 mod
= gen_rtx_MOD (mode
, operands
[2], operands
[3]);
1228 if (GET_MODE (operands
[0]) != SImode
)
1229 div
= gen_rtx_ZERO_EXTEND (DImode
, div
);
1230 if (GET_MODE (operands
[1]) != SImode
)
1231 mod
= gen_rtx_ZERO_EXTEND (DImode
, mod
);
1234 /* Extract remainder from AH. */
1235 scratch
= gen_lowpart (GET_MODE (operands
[1]), scratch
);
1236 tmp1
= gen_rtx_ZERO_EXTRACT (GET_MODE (operands
[1]), scratch
,
1237 GEN_INT (8), GEN_INT (8));
1238 insn
= emit_move_insn (operands
[1], tmp1
);
1239 set_unique_reg_note (insn
, REG_EQUAL
, mod
);
1241 /* Zero extend quotient from AL. */
1242 tmp1
= gen_lowpart (QImode
, tmp0
);
1243 insn
= emit_insn (gen_extend_insn
1245 GET_MODE (operands
[0]), QImode
, 1));
1246 set_unique_reg_note (insn
, REG_EQUAL
, div
);
1248 emit_label (end_label
);
1251 /* Emit x86 binary operand CODE in mode MODE, where the first operand
1252 matches destination. RTX includes clobber of FLAGS_REG. */
1255 ix86_emit_binop (enum rtx_code code
, machine_mode mode
,
1260 op
= gen_rtx_SET (dst
, gen_rtx_fmt_ee (code
, mode
, dst
, src
));
1261 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1263 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
1266 /* Return true if regno1 def is nearest to the insn. */
1269 find_nearest_reg_def (rtx_insn
*insn
, int regno1
, int regno2
)
1271 rtx_insn
*prev
= insn
;
1272 rtx_insn
*start
= BB_HEAD (BLOCK_FOR_INSN (insn
));
1276 while (prev
&& prev
!= start
)
1278 if (!INSN_P (prev
) || !NONDEBUG_INSN_P (prev
))
1280 prev
= PREV_INSN (prev
);
1283 if (insn_defines_reg (regno1
, INVALID_REGNUM
, prev
))
1285 else if (insn_defines_reg (regno2
, INVALID_REGNUM
, prev
))
1287 prev
= PREV_INSN (prev
);
1290 /* None of the regs is defined in the bb. */
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.  */
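/* For example, a load-effective-address such as r0 = r1 + r2*4 + 12 can be
   rewritten as a move of r2 into r0, a shift left by 2, an add of r1 and an
   add of the displacement, all of which execute on the ALU ports.  */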
1300 ix86_split_lea_for_addr (rtx_insn
*insn
, rtx operands
[], machine_mode mode
)
1302 unsigned int regno0
, regno1
, regno2
;
1303 struct ix86_address parts
;
1307 ok
= ix86_decompose_address (operands
[1], &parts
);
1310 target
= gen_lowpart (mode
, operands
[0]);
1312 regno0
= true_regnum (target
);
1313 regno1
= INVALID_REGNUM
;
1314 regno2
= INVALID_REGNUM
;
1318 parts
.base
= gen_lowpart (mode
, parts
.base
);
1319 regno1
= true_regnum (parts
.base
);
1324 parts
.index
= gen_lowpart (mode
, parts
.index
);
1325 regno2
= true_regnum (parts
.index
);
1329 parts
.disp
= gen_lowpart (mode
, parts
.disp
);
1331 if (parts
.scale
> 1)
1333 /* Case r1 = r1 + ... */
1334 if (regno1
== regno0
)
          /* If we have a case r1 = r1 + C * r2 then we
             should use multiplication which is very
             expensive.  Assume the cost model is wrong if we
             have such a case here.  */
1340 gcc_assert (regno2
!= regno0
);
1342 for (adds
= parts
.scale
; adds
> 0; adds
--)
1343 ix86_emit_binop (PLUS
, mode
, target
, parts
.index
);
1347 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1348 if (regno0
!= regno2
)
1349 emit_insn (gen_rtx_SET (target
, parts
.index
));
1351 /* Use shift for scaling. */
1352 ix86_emit_binop (ASHIFT
, mode
, target
,
1353 GEN_INT (exact_log2 (parts
.scale
)));
1356 ix86_emit_binop (PLUS
, mode
, target
, parts
.base
);
1358 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1359 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1362 else if (!parts
.base
&& !parts
.index
)
1364 gcc_assert(parts
.disp
);
1365 emit_insn (gen_rtx_SET (target
, parts
.disp
));
1371 if (regno0
!= regno2
)
1372 emit_insn (gen_rtx_SET (target
, parts
.index
));
1374 else if (!parts
.index
)
1376 if (regno0
!= regno1
)
1377 emit_insn (gen_rtx_SET (target
, parts
.base
));
1381 if (regno0
== regno1
)
1383 else if (regno0
== regno2
)
1389 /* Find better operand for SET instruction, depending
1390 on which definition is farther from the insn. */
1391 if (find_nearest_reg_def (insn
, regno1
, regno2
))
1392 tmp
= parts
.index
, tmp1
= parts
.base
;
1394 tmp
= parts
.base
, tmp1
= parts
.index
;
1396 emit_insn (gen_rtx_SET (target
, tmp
));
1398 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1399 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1401 ix86_emit_binop (PLUS
, mode
, target
, tmp1
);
1405 ix86_emit_binop (PLUS
, mode
, target
, tmp
);
1408 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1409 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1413 /* Post-reload splitter for converting an SF or DFmode value in an
1414 SSE register into an unsigned SImode. */
1417 ix86_split_convert_uns_si_sse (rtx operands
[])
1419 machine_mode vecmode
;
1420 rtx value
, large
, zero_or_two31
, input
, two31
, x
;
1422 large
= operands
[1];
1423 zero_or_two31
= operands
[2];
1424 input
= operands
[3];
1425 two31
= operands
[4];
1426 vecmode
= GET_MODE (large
);
1427 value
= gen_rtx_REG (vecmode
, REGNO (operands
[0]));
1429 /* Load up the value into the low element. We must ensure that the other
1430 elements are valid floats -- zero is the easiest such value. */
1433 if (vecmode
== V4SFmode
)
1434 emit_insn (gen_vec_setv4sf_0 (value
, CONST0_RTX (V4SFmode
), input
));
1436 emit_insn (gen_sse2_loadlpd (value
, CONST0_RTX (V2DFmode
), input
));
1440 input
= gen_rtx_REG (vecmode
, REGNO (input
));
1441 emit_move_insn (value
, CONST0_RTX (vecmode
));
1442 if (vecmode
== V4SFmode
)
1443 emit_insn (gen_sse_movss (value
, value
, input
));
1445 emit_insn (gen_sse2_movsd (value
, value
, input
));
1448 emit_move_insn (large
, two31
);
1449 emit_move_insn (zero_or_two31
, MEM_P (two31
) ? large
: two31
);
1451 x
= gen_rtx_fmt_ee (LE
, vecmode
, large
, value
);
1452 emit_insn (gen_rtx_SET (large
, x
));
1454 x
= gen_rtx_AND (vecmode
, zero_or_two31
, large
);
1455 emit_insn (gen_rtx_SET (zero_or_two31
, x
));
1457 x
= gen_rtx_MINUS (vecmode
, value
, zero_or_two31
);
1458 emit_insn (gen_rtx_SET (value
, x
));
1460 large
= gen_rtx_REG (V4SImode
, REGNO (large
));
1461 emit_insn (gen_ashlv4si3 (large
, large
, GEN_INT (31)));
1463 x
= gen_rtx_REG (V4SImode
, REGNO (value
));
1464 if (vecmode
== V4SFmode
)
1465 emit_insn (gen_fix_truncv4sfv4si2 (x
, value
));
1467 emit_insn (gen_sse2_cvttpd2dq (x
, value
));
1470 emit_insn (gen_xorv4si3 (value
, value
, large
));
1473 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok
,
1474 machine_mode mode
, rtx target
,
1475 rtx var
, int one_var
);
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */
1483 ix86_expand_convert_uns_didf_sse (rtx target
, rtx input
)
1485 REAL_VALUE_TYPE bias_lo_rvt
, bias_hi_rvt
;
1486 rtx int_xmm
, fp_xmm
;
1487 rtx biases
, exponents
;
1490 int_xmm
= gen_reg_rtx (V4SImode
);
1491 if (TARGET_INTER_UNIT_MOVES_TO_VEC
)
1492 emit_insn (gen_movdi_to_sse (int_xmm
, input
));
1493 else if (TARGET_SSE_SPLIT_REGS
)
1495 emit_clobber (int_xmm
);
1496 emit_move_insn (gen_lowpart (DImode
, int_xmm
), input
);
1500 x
= gen_reg_rtx (V2DImode
);
1501 ix86_expand_vector_init_one_nonzero (false, V2DImode
, x
, input
, 0);
1502 emit_move_insn (int_xmm
, gen_lowpart (V4SImode
, x
));
1505 x
= gen_rtx_CONST_VECTOR (V4SImode
,
1506 gen_rtvec (4, GEN_INT (0x43300000UL
),
1507 GEN_INT (0x45300000UL
),
1508 const0_rtx
, const0_rtx
));
1509 exponents
= validize_mem (force_const_mem (V4SImode
, x
));
1511 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1512 emit_insn (gen_vec_interleave_lowv4si (int_xmm
, int_xmm
, exponents
));
  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */
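  /* In other words, writing the input as x = hi * 2^32 + lo with two unsigned
     32-bit halves, (0x1.0p52 + lo) - 0x1.0p52 == lo and
     (0x1.0p84 + hi * 2^32) - 0x1.0p84 == hi * 2^32 exactly, so subtracting
     the two biases and summing the lanes reconstructs x as a double.  */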
1520 fp_xmm
= copy_to_mode_reg (V2DFmode
, gen_lowpart (V2DFmode
, int_xmm
));
1522 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1523 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1524 real_ldexp (&bias_lo_rvt
, &dconst1
, 52);
1525 real_ldexp (&bias_hi_rvt
, &dconst1
, 84);
1526 biases
= const_double_from_real_value (bias_lo_rvt
, DFmode
);
1527 x
= const_double_from_real_value (bias_hi_rvt
, DFmode
);
1528 biases
= gen_rtx_CONST_VECTOR (V2DFmode
, gen_rtvec (2, biases
, x
));
1529 biases
= validize_mem (force_const_mem (V2DFmode
, biases
));
1530 emit_insn (gen_subv2df3 (fp_xmm
, fp_xmm
, biases
));
1532 /* Add the upper and lower DFmode values together. */
1534 emit_insn (gen_sse3_haddv2df3 (fp_xmm
, fp_xmm
, fp_xmm
));
1537 x
= copy_to_mode_reg (V2DFmode
, fp_xmm
);
1538 emit_insn (gen_vec_interleave_highv2df (fp_xmm
, fp_xmm
, fp_xmm
));
1539 emit_insn (gen_addv2df3 (fp_xmm
, fp_xmm
, x
));
1542 ix86_expand_vector_extract (false, target
, fp_xmm
, 0);
1545 /* Not used, but eases macroization of patterns. */
1547 ix86_expand_convert_uns_sixf_sse (rtx
, rtx
)
1552 /* Convert an unsigned SImode value into a DFmode. Only currently used
1553 for SSE, but applicable anywhere. */
1556 ix86_expand_convert_uns_sidf_sse (rtx target
, rtx input
)
1558 REAL_VALUE_TYPE TWO31r
;
1561 x
= expand_simple_binop (SImode
, PLUS
, input
, GEN_INT (-2147483647 - 1),
1562 NULL
, 1, OPTAB_DIRECT
);
1564 fp
= gen_reg_rtx (DFmode
);
1565 emit_insn (gen_floatsidf2 (fp
, x
));
1567 real_ldexp (&TWO31r
, &dconst1
, 31);
1568 x
= const_double_from_real_value (TWO31r
, DFmode
);
1570 x
= expand_simple_binop (DFmode
, PLUS
, fp
, x
, target
, 0, OPTAB_DIRECT
);
1572 emit_move_insn (target
, x
);
1575 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1576 32-bit mode; otherwise we have a direct convert instruction. */
1579 ix86_expand_convert_sign_didf_sse (rtx target
, rtx input
)
1581 REAL_VALUE_TYPE TWO32r
;
1582 rtx fp_lo
, fp_hi
, x
;
1584 fp_lo
= gen_reg_rtx (DFmode
);
1585 fp_hi
= gen_reg_rtx (DFmode
);
1587 emit_insn (gen_floatsidf2 (fp_hi
, gen_highpart (SImode
, input
)));
1589 real_ldexp (&TWO32r
, &dconst1
, 32);
1590 x
= const_double_from_real_value (TWO32r
, DFmode
);
1591 fp_hi
= expand_simple_binop (DFmode
, MULT
, fp_hi
, x
, fp_hi
, 0, OPTAB_DIRECT
);
1593 ix86_expand_convert_uns_sidf_sse (fp_lo
, gen_lowpart (SImode
, input
));
1595 x
= expand_simple_binop (DFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1598 emit_move_insn (target
, x
);
1601 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1602 For x86_32, -mfpmath=sse, !optimize_size only. */
1604 ix86_expand_convert_uns_sisf_sse (rtx target
, rtx input
)
1606 REAL_VALUE_TYPE ONE16r
;
1607 rtx fp_hi
, fp_lo
, int_hi
, int_lo
, x
;
1609 real_ldexp (&ONE16r
, &dconst1
, 16);
1610 x
= const_double_from_real_value (ONE16r
, SFmode
);
1611 int_lo
= expand_simple_binop (SImode
, AND
, input
, GEN_INT(0xffff),
1612 NULL
, 0, OPTAB_DIRECT
);
1613 int_hi
= expand_simple_binop (SImode
, LSHIFTRT
, input
, GEN_INT(16),
1614 NULL
, 0, OPTAB_DIRECT
);
1615 fp_hi
= gen_reg_rtx (SFmode
);
1616 fp_lo
= gen_reg_rtx (SFmode
);
1617 emit_insn (gen_floatsisf2 (fp_hi
, int_hi
));
1618 emit_insn (gen_floatsisf2 (fp_lo
, int_lo
));
1619 fp_hi
= expand_simple_binop (SFmode
, MULT
, fp_hi
, x
, fp_hi
,
1621 fp_hi
= expand_simple_binop (SFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1623 if (!rtx_equal_p (target
, fp_hi
))
1624 emit_move_insn (target
, fp_hi
);
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */
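/* Concretely, each lane is split as val = (val >> 16) * 0x10000 + (val & 0xffff);
   both halves fit in 16 bits, so the signed int->float conversion of each is
   exact, and the result is recombined as float(hi) * 2^16 + float(lo).  */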
1631 ix86_expand_vector_convert_uns_vsivsf (rtx target
, rtx val
)
1634 REAL_VALUE_TYPE TWO16r
;
1635 machine_mode intmode
= GET_MODE (val
);
1636 machine_mode fltmode
= GET_MODE (target
);
1637 rtx (*cvt
) (rtx
, rtx
);
1639 if (intmode
== V4SImode
)
1640 cvt
= gen_floatv4siv4sf2
;
1642 cvt
= gen_floatv8siv8sf2
;
1643 tmp
[0] = ix86_build_const_vector (intmode
, 1, GEN_INT (0xffff));
1644 tmp
[0] = force_reg (intmode
, tmp
[0]);
1645 tmp
[1] = expand_simple_binop (intmode
, AND
, val
, tmp
[0], NULL_RTX
, 1,
1647 tmp
[2] = expand_simple_binop (intmode
, LSHIFTRT
, val
, GEN_INT (16),
1648 NULL_RTX
, 1, OPTAB_DIRECT
);
1649 tmp
[3] = gen_reg_rtx (fltmode
);
1650 emit_insn (cvt (tmp
[3], tmp
[1]));
1651 tmp
[4] = gen_reg_rtx (fltmode
);
1652 emit_insn (cvt (tmp
[4], tmp
[2]));
1653 real_ldexp (&TWO16r
, &dconst1
, 16);
1654 tmp
[5] = const_double_from_real_value (TWO16r
, SFmode
);
1655 tmp
[5] = force_reg (fltmode
, ix86_build_const_vector (fltmode
, 1, tmp
[5]));
1656 tmp
[6] = expand_simple_binop (fltmode
, MULT
, tmp
[4], tmp
[5], NULL_RTX
, 1,
1658 tmp
[7] = expand_simple_binop (fltmode
, PLUS
, tmp
[3], tmp
[6], target
, 1,
1660 if (tmp
[7] != target
)
1661 emit_move_insn (target
, tmp
[7]);
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
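/* The signed fix_trunc would overflow for values >= 0x1p31; subtracting
   0x1p31 keeps those inputs in range, and xoring 0x80000000 into the
   converted result adds the 2^31 back, since that bit is known to be clear
   after the signed conversion.  */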
1670 ix86_expand_adjust_ufix_to_sfix_si (rtx val
, rtx
*xorp
)
1672 REAL_VALUE_TYPE TWO31r
;
1674 machine_mode mode
= GET_MODE (val
);
1675 machine_mode scalarmode
= GET_MODE_INNER (mode
);
1676 machine_mode intmode
= GET_MODE_SIZE (mode
) == 32 ? V8SImode
: V4SImode
;
1677 rtx (*cmp
) (rtx
, rtx
, rtx
, rtx
);
1680 for (i
= 0; i
< 3; i
++)
1681 tmp
[i
] = gen_reg_rtx (mode
);
1682 real_ldexp (&TWO31r
, &dconst1
, 31);
1683 two31r
= const_double_from_real_value (TWO31r
, scalarmode
);
1684 two31r
= ix86_build_const_vector (mode
, 1, two31r
);
1685 two31r
= force_reg (mode
, two31r
);
1688 case E_V8SFmode
: cmp
= gen_avx_maskcmpv8sf3
; break;
1689 case E_V4SFmode
: cmp
= gen_sse_maskcmpv4sf3
; break;
1690 case E_V4DFmode
: cmp
= gen_avx_maskcmpv4df3
; break;
1691 case E_V2DFmode
: cmp
= gen_sse2_maskcmpv2df3
; break;
1692 default: gcc_unreachable ();
1694 tmp
[3] = gen_rtx_LE (mode
, two31r
, val
);
1695 emit_insn (cmp (tmp
[0], two31r
, val
, tmp
[3]));
1696 tmp
[1] = expand_simple_binop (mode
, AND
, tmp
[0], two31r
, tmp
[1],
1698 if (intmode
== V4SImode
|| TARGET_AVX2
)
1699 *xorp
= expand_simple_binop (intmode
, ASHIFT
,
1700 gen_lowpart (intmode
, tmp
[0]),
1701 GEN_INT (31), NULL_RTX
, 0,
1705 rtx two31
= gen_int_mode (HOST_WIDE_INT_1U
<< 31, SImode
);
1706 two31
= ix86_build_const_vector (intmode
, 1, two31
);
1707 *xorp
= expand_simple_binop (intmode
, AND
,
1708 gen_lowpart (intmode
, tmp
[0]),
1712 return expand_simple_binop (mode
, MINUS
, val
, tmp
[1], tmp
[2],
1716 /* Generate code for floating point ABS or NEG. */
1719 ix86_expand_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
1723 bool use_sse
= false;
1724 bool vector_mode
= VECTOR_MODE_P (mode
);
1725 machine_mode vmode
= mode
;
1728 if (vector_mode
|| mode
== TFmode
)
1730 else if (TARGET_SSE_MATH
)
1732 use_sse
= SSE_FLOAT_MODE_P (mode
);
1735 else if (mode
== DFmode
)
1742 set
= gen_rtx_fmt_e (code
, mode
, src
);
1743 set
= gen_rtx_SET (dst
, set
);
1747 rtx mask
, use
, clob
;
1749 /* NEG and ABS performed with SSE use bitwise mask operations.
1750 Create the appropriate mask now. */
1751 mask
= ix86_build_signbit_mask (vmode
, vector_mode
, code
== ABS
);
1752 use
= gen_rtx_USE (VOIDmode
, mask
);
1753 if (vector_mode
|| mode
== TFmode
)
1754 par
= gen_rtvec (2, set
, use
);
1757 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1758 par
= gen_rtvec (3, set
, use
, clob
);
      /* Changing the sign of FP values can be done with the integer unit too.  */
1766 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1767 par
= gen_rtvec (2, set
, clob
);
1770 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
1773 /* Deconstruct a floating point ABS or NEG operation
1774 with integer registers into integer operations. */
1777 ix86_split_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
1780 enum rtx_code absneg_op
;
1783 gcc_assert (operands_match_p (operands
[0], operands
[1]));
1788 dst
= gen_lowpart (SImode
, operands
[0]);
1792 set
= gen_int_mode (0x7fffffff, SImode
);
1797 set
= gen_int_mode (0x80000000, SImode
);
1800 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1806 dst
= gen_lowpart (DImode
, operands
[0]);
1807 dst
= gen_rtx_ZERO_EXTRACT (DImode
, dst
, const1_rtx
, GEN_INT (63));
1812 set
= gen_rtx_NOT (DImode
, dst
);
1816 dst
= gen_highpart (SImode
, operands
[0]);
1820 set
= gen_int_mode (0x7fffffff, SImode
);
1825 set
= gen_int_mode (0x80000000, SImode
);
1828 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1833 dst
= gen_rtx_REG (SImode
,
1834 REGNO (operands
[0]) + (TARGET_64BIT
? 1 : 2));
1837 set
= GEN_INT (0x7fff);
1842 set
= GEN_INT (0x8000);
1845 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1852 set
= gen_rtx_SET (dst
, set
);
1854 rtx clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1855 rtvec par
= gen_rtvec (2, set
, clob
);
1857 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
1860 /* Expand a copysign operation. Special case operand 0 being a constant. */
1863 ix86_expand_copysign (rtx operands
[])
1865 machine_mode mode
, vmode
;
1866 rtx dest
, op0
, op1
, mask
;
1872 mode
= GET_MODE (dest
);
1876 else if (mode
== DFmode
)
1878 else if (mode
== TFmode
)
1883 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
1885 if (CONST_DOUBLE_P (op0
))
1887 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0
)))
1888 op0
= simplify_unary_operation (ABS
, mode
, op0
, mode
);
1890 if (mode
== SFmode
|| mode
== DFmode
)
1892 if (op0
== CONST0_RTX (mode
))
1893 op0
= CONST0_RTX (vmode
);
1896 rtx v
= ix86_build_const_vector (vmode
, false, op0
);
1898 op0
= force_reg (vmode
, v
);
1901 else if (op0
!= CONST0_RTX (mode
))
1902 op0
= force_reg (mode
, op0
);
1904 emit_insn (gen_copysign3_const (mode
, dest
, op0
, op1
, mask
));
1908 rtx nmask
= ix86_build_signbit_mask (vmode
, 0, 1);
1910 emit_insn (gen_copysign3_var
1911 (mode
, dest
, NULL_RTX
, op0
, op1
, nmask
, mask
));
1915 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1916 be a constant, and so has already been expanded into a vector constant. */
1919 ix86_split_copysign_const (rtx operands
[])
1921 machine_mode mode
, vmode
;
1922 rtx dest
, op0
, mask
, x
;
1928 mode
= GET_MODE (dest
);
1929 vmode
= GET_MODE (mask
);
1931 dest
= lowpart_subreg (vmode
, dest
, mode
);
1932 x
= gen_rtx_AND (vmode
, dest
, mask
);
1933 emit_insn (gen_rtx_SET (dest
, x
));
1935 if (op0
!= CONST0_RTX (vmode
))
1937 x
= gen_rtx_IOR (vmode
, dest
, op0
);
1938 emit_insn (gen_rtx_SET (dest
, x
));
1942 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1943 so we have to do two masks. */
1946 ix86_split_copysign_var (rtx operands
[])
1948 machine_mode mode
, vmode
;
1949 rtx dest
, scratch
, op0
, op1
, mask
, nmask
, x
;
1952 scratch
= operands
[1];
1955 nmask
= operands
[4];
1958 mode
= GET_MODE (dest
);
1959 vmode
= GET_MODE (mask
);
1961 if (rtx_equal_p (op0
, op1
))
1963 /* Shouldn't happen often (it's useless, obviously), but when it does
1964 we'd generate incorrect code if we continue below. */
1965 emit_move_insn (dest
, op0
);
1969 if (REG_P (mask
) && REGNO (dest
) == REGNO (mask
)) /* alternative 0 */
1971 gcc_assert (REGNO (op1
) == REGNO (scratch
));
1973 x
= gen_rtx_AND (vmode
, scratch
, mask
);
1974 emit_insn (gen_rtx_SET (scratch
, x
));
1977 op0
= lowpart_subreg (vmode
, op0
, mode
);
1978 x
= gen_rtx_NOT (vmode
, dest
);
1979 x
= gen_rtx_AND (vmode
, x
, op0
);
1980 emit_insn (gen_rtx_SET (dest
, x
));
1984 if (REGNO (op1
) == REGNO (scratch
)) /* alternative 1,3 */
1986 x
= gen_rtx_AND (vmode
, scratch
, mask
);
1988 else /* alternative 2,4 */
1990 gcc_assert (REGNO (mask
) == REGNO (scratch
));
1991 op1
= lowpart_subreg (vmode
, op1
, mode
);
1992 x
= gen_rtx_AND (vmode
, scratch
, op1
);
1994 emit_insn (gen_rtx_SET (scratch
, x
));
1996 if (REGNO (op0
) == REGNO (dest
)) /* alternative 1,2 */
1998 dest
= lowpart_subreg (vmode
, op0
, mode
);
1999 x
= gen_rtx_AND (vmode
, dest
, nmask
);
2001 else /* alternative 3,4 */
2003 gcc_assert (REGNO (nmask
) == REGNO (dest
));
2005 op0
= lowpart_subreg (vmode
, op0
, mode
);
2006 x
= gen_rtx_AND (vmode
, dest
, op0
);
2008 emit_insn (gen_rtx_SET (dest
, x
));
2011 x
= gen_rtx_IOR (vmode
, dest
, scratch
);
2012 emit_insn (gen_rtx_SET (dest
, x
));
2015 /* Expand an xorsign operation. */
2018 ix86_expand_xorsign (rtx operands
[])
2020 machine_mode mode
, vmode
;
2021 rtx dest
, op0
, op1
, mask
;
2027 mode
= GET_MODE (dest
);
2031 else if (mode
== DFmode
)
2036 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
2038 emit_insn (gen_xorsign3_1 (mode
, dest
, op0
, op1
, mask
));
2041 /* Deconstruct an xorsign operation into bit masks. */
2044 ix86_split_xorsign (rtx operands
[])
2046 machine_mode mode
, vmode
;
2047 rtx dest
, op0
, mask
, x
;
2053 mode
= GET_MODE (dest
);
2054 vmode
= GET_MODE (mask
);
2056 dest
= lowpart_subreg (vmode
, dest
, mode
);
2057 x
= gen_rtx_AND (vmode
, dest
, mask
);
2058 emit_insn (gen_rtx_SET (dest
, x
));
2060 op0
= lowpart_subreg (vmode
, op0
, mode
);
2061 x
= gen_rtx_XOR (vmode
, dest
, op0
);
2062 emit_insn (gen_rtx_SET (dest
, x
));
2065 static rtx
ix86_expand_compare (enum rtx_code code
, rtx op0
, rtx op1
);
2068 ix86_expand_branch (enum rtx_code code
, rtx op0
, rtx op1
, rtx label
)
2070 machine_mode mode
= GET_MODE (op0
);
  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction.  */
2075 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
)
2077 rtx flag
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
2078 machine_mode p_mode
= GET_MODE_SIZE (mode
) == 32 ? V4DImode
: V2DImode
;
2080 gcc_assert (code
== EQ
|| code
== NE
);
2081 /* Generate XOR since we can't check that one operand is zero vector. */
2082 tmp
= gen_reg_rtx (mode
);
2083 emit_insn (gen_rtx_SET (tmp
, gen_rtx_XOR (mode
, op0
, op1
)));
2084 tmp
= gen_lowpart (p_mode
, tmp
);
2085 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode
, FLAGS_REG
),
2086 gen_rtx_UNSPEC (CCmode
,
2087 gen_rtvec (2, tmp
, tmp
),
2089 tmp
= gen_rtx_fmt_ee (code
, VOIDmode
, flag
, const0_rtx
);
2090 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2091 gen_rtx_LABEL_REF (VOIDmode
, label
),
2093 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
2106 tmp
= ix86_expand_compare (code
, op0
, op1
);
2107 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2108 gen_rtx_LABEL_REF (VOIDmode
, label
),
2110 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
      /* For a 32-bit target, a DImode comparison may be performed in
         SSE registers.  To allow this we should avoid splitting into
         SImode, which is achieved by doing the xor in DImode and then
         comparing with zero (which is recognized by the STV pass).  We
         don't compare using xor when optimizing for size.  */
2122 if (!optimize_insn_for_size_p ()
2124 && (code
== EQ
|| code
== NE
))
2126 op0
= force_reg (mode
, gen_rtx_XOR (mode
, op0
, op1
));
2131 /* Expand DImode branch into multiple compare+branch. */
2134 rtx_code_label
*label2
;
2135 enum rtx_code code1
, code2
, code3
;
2136 machine_mode submode
;
2138 if (CONSTANT_P (op0
) && !CONSTANT_P (op1
))
2140 std::swap (op0
, op1
);
2141 code
= swap_condition (code
);
2144 split_double_mode (mode
, &op0
, 1, lo
+0, hi
+0);
2145 split_double_mode (mode
, &op1
, 1, lo
+1, hi
+1);
2147 submode
= mode
== DImode
? SImode
: DImode
;
      /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
         avoid two branches.  This costs one extra insn, so disable when
         optimizing for size.  */
2153 if ((code
== EQ
|| code
== NE
)
2154 && (!optimize_insn_for_size_p ()
2155 || hi
[1] == const0_rtx
|| lo
[1] == const0_rtx
))
2160 if (hi
[1] != const0_rtx
)
2161 xor1
= expand_binop (submode
, xor_optab
, xor1
, hi
[1],
2162 NULL_RTX
, 0, OPTAB_WIDEN
);
2165 if (lo
[1] != const0_rtx
)
2166 xor0
= expand_binop (submode
, xor_optab
, xor0
, lo
[1],
2167 NULL_RTX
, 0, OPTAB_WIDEN
);
2169 tmp
= expand_binop (submode
, ior_optab
, xor1
, xor0
,
2170 NULL_RTX
, 0, OPTAB_WIDEN
);
2172 ix86_expand_branch (code
, tmp
, const0_rtx
, label
);
2176 /* Otherwise, if we are doing less-than or greater-or-equal-than,
2177 op1 is a constant and the low word is zero, then we can just
2178 examine the high word. Similarly for low word -1 and
2179 less-or-equal-than or greater-than. */
2181 if (CONST_INT_P (hi
[1]))
2184 case LT
: case LTU
: case GE
: case GEU
:
2185 if (lo
[1] == const0_rtx
)
2187 ix86_expand_branch (code
, hi
[0], hi
[1], label
);
2191 case LE
: case LEU
: case GT
: case GTU
:
2192 if (lo
[1] == constm1_rtx
)
2194 ix86_expand_branch (code
, hi
[0], hi
[1], label
);
      /* Emulate comparisons that do not depend on Zero flag with
         double-word subtraction.  Note that only Overflow, Sign
         and Carry flags are valid, so swap arguments and condition
         of comparisons that would otherwise test Zero flag.  */
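      /* For example, a double-word signed "a < b" becomes a CMP of the low
         words followed by an SBB of the high words into a scratch; the branch
         then tests the resulting CCGZ (or CCC for unsigned) flags, none of
         which involve the Zero flag.  */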
2209 case LE
: case LEU
: case GT
: case GTU
:
2210 std::swap (lo
[0], lo
[1]);
2211 std::swap (hi
[0], hi
[1]);
2212 code
= swap_condition (code
);
2215 case LT
: case LTU
: case GE
: case GEU
:
2217 bool uns
= (code
== LTU
|| code
== GEU
);
2218 rtx (*sbb_insn
) (machine_mode
, rtx
, rtx
, rtx
)
2219 = uns
? gen_sub3_carry_ccc
: gen_sub3_carry_ccgz
;
2221 if (!nonimmediate_operand (lo
[0], submode
))
2222 lo
[0] = force_reg (submode
, lo
[0]);
2223 if (!x86_64_general_operand (lo
[1], submode
))
2224 lo
[1] = force_reg (submode
, lo
[1]);
2226 if (!register_operand (hi
[0], submode
))
2227 hi
[0] = force_reg (submode
, hi
[0]);
2228 if ((uns
&& !nonimmediate_operand (hi
[1], submode
))
2229 || (!uns
&& !x86_64_general_operand (hi
[1], submode
)))
2230 hi
[1] = force_reg (submode
, hi
[1]);
2232 emit_insn (gen_cmp_1 (submode
, lo
[0], lo
[1]));
2234 tmp
= gen_rtx_SCRATCH (submode
);
2235 emit_insn (sbb_insn (submode
, tmp
, hi
[0], hi
[1]));
2237 tmp
= gen_rtx_REG (uns
? CCCmode
: CCGZmode
, FLAGS_REG
);
2238 ix86_expand_branch (code
, tmp
, const0_rtx
, label
);
2246 /* Otherwise, we need two or three jumps. */
2248 label2
= gen_label_rtx ();
2251 code2
= swap_condition (code
);
2252 code3
= unsigned_condition (code
);
2256 case LT
: case GT
: case LTU
: case GTU
:
2259 case LE
: code1
= LT
; code2
= GT
; break;
2260 case GE
: code1
= GT
; code2
= LT
; break;
2261 case LEU
: code1
= LTU
; code2
= GTU
; break;
2262 case GEU
: code1
= GTU
; code2
= LTU
; break;
2264 case EQ
: code1
= UNKNOWN
; code2
= NE
; break;
2265 case NE
: code2
= UNKNOWN
; break;
2273 * if (hi(a) < hi(b)) goto true;
2274 * if (hi(a) > hi(b)) goto false;
2275 * if (lo(a) < lo(b)) goto true;
2279 if (code1
!= UNKNOWN
)
2280 ix86_expand_branch (code1
, hi
[0], hi
[1], label
);
2281 if (code2
!= UNKNOWN
)
2282 ix86_expand_branch (code2
, hi
[0], hi
[1], label2
);
2284 ix86_expand_branch (code3
, lo
[0], lo
[1], label
);
2286 if (code2
!= UNKNOWN
)
2287 emit_label (label2
);
2292 gcc_assert (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_CC
);
2297 /* Figure out whether to use unordered fp comparisons. */
2300 ix86_unordered_fp_compare (enum rtx_code code
)
2302 if (!TARGET_IEEE_FP
)
/* Return a comparison we can do that is equivalent to
   swap_condition (code), except possibly for orderedness.
   Never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */
2336 static enum rtx_code
2337 ix86_fp_swap_condition (enum rtx_code code
)
2341 case GT
: /* GTU - CF=0 & ZF=0 */
2342 return TARGET_IEEE_FP
? UNKNOWN
: UNLT
;
2343 case GE
: /* GEU - CF=0 */
2344 return TARGET_IEEE_FP
? UNKNOWN
: UNLE
;
2345 case UNLT
: /* LTU - CF=1 */
2346 return TARGET_IEEE_FP
? UNKNOWN
: GT
;
2347 case UNLE
: /* LEU - CF=1 | ZF=1 */
2348 return TARGET_IEEE_FP
? UNKNOWN
: GE
;
2350 return swap_condition (code
);
/* Return cost of comparison CODE using the best strategy for performance.
   All following functions use the number of instructions as a cost metric.
   In the future this should be tweaked to compute bytes for optimize_size and
   take into account performance of various instructions on various CPUs.  */
2360 ix86_fp_comparison_cost (enum rtx_code code
)
2364 /* The cost of code using bit-twiddling on %ah. */
2381 arith_cost
= TARGET_IEEE_FP
? 5 : 4;
2385 arith_cost
= TARGET_IEEE_FP
? 6 : 4;
2391 switch (ix86_fp_comparison_strategy (code
))
2393 case IX86_FPCMP_COMI
:
2394 return arith_cost
> 4 ? 3 : 2;
2395 case IX86_FPCMP_SAHF
:
2396 return arith_cost
> 4 ? 4 : 3;
/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)

  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))

      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);

      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 ...  */

      if (standard_80387_constant_p (op0) == 0
	  && ! (standard_80387_constant_p (op1) == 0

	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)

	      std::swap (op0, op1);

	  op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))

	  int tmp = standard_80387_constant_p (op1);

	    op1 = validize_mem (force_const_mem (op_mode, op1));

	    op1 = force_reg (op_mode, op1);

	op1 = force_reg (op_mode, op1);

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))

      std::swap (op0, op1);
      code = swap_condition (code);

	op0 = force_reg (op_mode, op0);
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)

  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))

    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */

	  if (code == GT || !TARGET_IEEE_FP)

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));

	  if (code == LT && TARGET_IEEE_FP)

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));

	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));

	  if (code == GE || !TARGET_IEEE_FP)

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));

	  if (code == LE && TARGET_IEEE_FP)

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));

	  if (code == EQ && TARGET_IEEE_FP)

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));

	  if (code == NE && TARGET_IEEE_FP)

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));

	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));

	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
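/* A key to the magic constants above (explanatory note, not generated
   code): after "fnstsw" the FPU condition bits end up in %ah as

	C0 -> 0x01	set when st(0) < operand (or unordered)
	C2 -> 0x04	set when the compare was unordered
	C3 -> 0x40	set when st(0) == operand (or unordered)

   so 0x45 masks C0|C2|C3.  For instance the GT/!IEEE path boils down to
   "testb $0x45, %ah; jz ...", which takes the branch only when none of
   the three bits is set, i.e. st(0) > operand and the compare was
   ordered.  */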
/* Generate insn patterns to do an integer compare of OPERANDS.  */

ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)

  machine_mode cmpmode;

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))

      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);

    ret = ix86_expand_int_compare (code, op0, op1);

ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
/* Expand a comparison setting or clearing the carry flag.  Return true when
   successful and set *POP to the operation.  */

ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)

  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through the special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))

  if (SCALAR_FLOAT_MODE_P (mode))

      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut: the following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)

      /* These comparisons require the zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)

	  std::swap (op0, op1);
	  code = swap_condition (code);

      /* Try to expand the comparison and verify that we end up with a
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand the comparison using arithmetic, which is
	 not a common scenario.  */
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));

	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)

      emit_insn (compare_seq);

  if (!INTEGRAL_MODE_P (mode))

      /* Convert a==0 into (unsigned)a<1.  */
      if (op1 != const0_rtx)

      code = (code == EQ ? LTU : GEU);

      /* Convert a>b into b<a or a>=b-1.  */
      if (CONST_INT_P (op1))

	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into a register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))

	  code = (code == GTU ? GEU : LTU);

	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);

      /* Convert a>=0 into (unsigned)a<0x80000000.  */
      if (mode == DImode || op1 != const0_rtx)

      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);

      if (mode == DImode || op1 != constm1_rtx)

      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);

  /* Swapping operands may cause a constant to appear as the first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))

      if (!can_create_pseudo_p ())

      op0 = force_reg (mode, op0);

  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
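/* For example (illustrative sketch): "a == 0" is rewritten above into the
   unsigned test "(unsigned) a < 1", which a single

	cmpl	$1, a		; CF = (a == 0)

   leaves directly in the carry flag, ready to be consumed by the adc/sbb
   based expanders below; the exact operand forms depend on the surrounding
   expansion.  */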
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */

ix86_expand_int_addcc (rtx operands[])

  enum rtx_code code = GET_CODE (operands[1]);

  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);

  rtx val = const0_rtx;

  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)

  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))

  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)

      code = ix86_fp_compare_code_to_integer (code);

	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		    (GET_CODE (compare_op)));

	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
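/* As a concrete (illustrative) example, "dest = src + (a < b)" with an
   unsigned comparison comes out as

	cmpl	b, a		; CF = (a < b), AT&T operand order
	movl	src, dest
	adcl	$0, dest	; dest = src + CF

   while the conditional-decrement form uses sbb the same way.  */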
ix86_expand_int_movcc (rtx operands[])

  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;

  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode

  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();

  compare_code = GET_CODE (compare_op);

  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (operands[2])
      && CONST_INT_P (operands[3]))

      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (operands[2]);
      HOST_WIDE_INT cf = INTVAL (operands[3]);

      /* Sign bit compares are better done using shifts than we do by using
	 sbb.  */
      if (sign_bit_compare_p
	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))

	  /* Detect overlap between destination and compare sources.  */

	  if (!sign_bit_compare_p)

	      compare_code = GET_CODE (compare_op);

	      flags = XEXP (compare_op, 0);

	      if (GET_MODE (flags) == CCFPmode)
		compare_code
		  = ix86_fp_compare_code_to_integer (compare_code);

	      /* To simplify the rest of the code, restrict to the GEU case.  */
	      if (compare_code == LTU)

		  compare_code = reverse_condition (compare_code);
		  code = reverse_condition (code);

		    PUT_CODE (compare_op,
			      reverse_condition_maybe_unordered
				(GET_CODE (compare_op)));

		    PUT_CODE (compare_op,
			      reverse_condition (GET_CODE (compare_op)));

	      if (reg_overlap_mentioned_p (out, op0)
		  || reg_overlap_mentioned_p (out, op1))
		tmp = gen_reg_rtx (mode);

		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));

		emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
						 flags, compare_op));

	      if (code == GT || code == GE)
		code = reverse_condition (code);

	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);

	      tmp = expand_simple_binop (mode, PLUS,
					 copy_rtx (tmp), 1, OPTAB_DIRECT);

	      tmp = expand_simple_binop (mode, IOR,
					 copy_rtx (tmp), 1, OPTAB_DIRECT);

	  else if (diff == -1 && ct)

	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);

	      tmp = expand_simple_binop (mode, PLUS,
					 copy_rtx (tmp), GEN_INT (cf),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);

	      /* ... andl cf - ct, dest ... */

	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);

	      tmp = expand_simple_binop (mode, AND,
					 gen_int_mode (cf - ct, mode),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);

	      tmp = expand_simple_binop (mode, PLUS,
					 copy_rtx (tmp), GEN_INT (ct),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);

	  if (!rtx_equal_p (tmp, out))
	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
	  machine_mode cmp_mode = GET_MODE (op0);
	  enum rtx_code new_code;

	  if (SCALAR_FLOAT_MODE_P (cmp_mode))

	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

	      /* We may be reversing a non-trapping
		 comparison to a trapping comparison.  */
	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
		  && code != EQ && code != NE
		  && code != ORDERED && code != UNORDERED)

	      new_code = reverse_condition_maybe_unordered (code);

	    new_code = ix86_reverse_condition (code, cmp_mode);
	  if (new_code != UNKNOWN)

      compare_code = UNKNOWN;
      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
	  && CONST_INT_P (op1))

	  if (op1 == const0_rtx
	      && (code == LT || code == GE))
	    compare_code = code;
	  else if (op1 == constm1_rtx)

	  else if (code == GT)

      /* Optimize dest = (op0 < 0) ? -1 : cf.  */
      if (compare_code != UNKNOWN
	  && GET_MODE (op0) == GET_MODE (out)
	  && (cf == -1 || ct == -1))

	  /* If the lea code below could be used, only optimize
	     if it results in a 2 insn sequence.  */

	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
		 || diff == 3 || diff == 5 || diff == 9)
	      || (compare_code == LT && ct == -1)
	      || (compare_code == GE && cf == -1))

	      /* ... notl op1 (if necessary) ... */

		  code = reverse_condition (code);

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

	      out = expand_simple_binop (mode, IOR,
					 out, 1, OPTAB_DIRECT);
	      if (out != operands[0])
		emit_move_insn (operands[0], out);

      if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
	   || diff == 3 || diff == 5 || diff == 9)
	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))

	  /* ... lea cf(dest*(ct-cf)),dest ...
	     This also catches the degenerate setcc-only case.  */

	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	  /* On x86_64 the lea instruction operates on Pmode, so we need
	     to get the arithmetic done in the proper mode to match.  */

	    tmp = copy_rtx (out);

	      out1 = copy_rtx (out);
	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));

		tmp = gen_rtx_PLUS (mode, tmp, out1);

	    tmp = plus_constant (mode, tmp, cf);

	  if (!rtx_equal_p (tmp, out))

	      out = force_operand (tmp, copy_rtx (out));

	    emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));

      if (!rtx_equal_p (out, operands[0]))
	emit_move_insn (operands[0], copy_rtx (out));
      /*
       * General case:			Jumpful:
       *   xorl dest,dest		cmpl op1, op2
       *   cmpl op1, op2		movl ct, dest
       *   decl dest			movl cf, dest
       *   andl (cf-ct),dest		1:
       *
       * This is reasonably steep, but branch mispredict costs are
       * high on modern cpus, so consider failing only if optimizing
       * for space.
       */

      if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
	  && BRANCH_COST (optimize_insn_for_speed_p (),

	      machine_mode cmp_mode = GET_MODE (op0);
	      enum rtx_code new_code;

	      if (SCALAR_FLOAT_MODE_P (cmp_mode))

		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

		  /* We may be reversing a non-trapping
		     comparison to a trapping comparison.  */
		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
		      && code != EQ && code != NE
		      && code != ORDERED && code != UNORDERED)

		  new_code = reverse_condition_maybe_unordered (code);

		  new_code = ix86_reverse_condition (code, cmp_mode);
		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
		    compare_code = reverse_condition (compare_code);

	      if (new_code != UNKNOWN)

	  if (compare_code != UNKNOWN)

	      /* notl op1	(if needed)

		 For x < 0 (resp. x <= -1) there will be no notl,
		 so if possible swap the constants to get rid of the ...
		 True/false will be -1/0 while code below (store flag
		 followed by decrement) is 0/-1, so the constants need
		 to be exchanged once more.  */

	      if (compare_code == GE || !cf)

		  code = reverse_condition (code);

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

	    out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	  out = expand_simple_binop (mode, PLUS, copy_rtx (out),
				     copy_rtx (out), 1, OPTAB_DIRECT);

	  out = expand_simple_binop (mode, AND, copy_rtx (out),
				     gen_int_mode (cf - ct, mode),
				     copy_rtx (out), 1, OPTAB_DIRECT);

	  out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
				     copy_rtx (out), 1, OPTAB_DIRECT);
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));
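	  /* For instance (illustrative only), with a carry-flag condition
	     the branch-free selection of the two constants amounts to:

		cmpl	op2, op1
		sbbl	%eax, %eax	; eax = cond ? -1 : 0
		andl	$(cf - ct), %eax
		addl	$ct, %eax	; eax = cond ? cf : ct

	     which is why the code above keeps massaging the comparison and
	     the constants until a plain LTU/GEU carry test is left.  */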
  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))

      /* Try a few things more with specific constants and a variable.  */

      rtx var, orig_out, out, tmp;

      if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)

      /* If one of the two operands is an interesting constant, load a
	 constant with the above and mask it in with a logical operation.  */

      if (CONST_INT_P (operands[2]))

	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
	    operands[3] = constm1_rtx, op = and_optab;
	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
	    operands[3] = const0_rtx, op = ior_optab;

      else if (CONST_INT_P (operands[3]))

	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)

	      /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
		 "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
	      if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
		operands[1] = simplify_gen_relational (LT, VOIDmode,

	      operands[2] = constm1_rtx;

	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
	    operands[2] = const0_rtx, op = ior_optab;

      orig_out = operands[0];
      tmp = gen_reg_rtx (mode);

      /* Recurse to get the constant loaded.  */
      if (!ix86_expand_int_movcc (operands))

      /* Mask in the interesting variable.  */
      out = expand_binop (mode, op, var, tmp, orig_out, 0,

      if (!rtx_equal_p (out, orig_out))
	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));

  /* ... For comparison with above, ... */

  if (! nonimmediate_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  if (! nonimmediate_operand (operands[3], mode))
    operands[3] = force_reg (mode, operands[3]);

  if (! register_operand (operands[2], VOIDmode)
	  || ! register_operand (operands[3], VOIDmode)))
    operands[2] = force_reg (mode, operands[2]);

      && ! register_operand (operands[3], VOIDmode))
    operands[3] = force_reg (mode, operands[3]);

  emit_insn (compare_seq);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode,
						compare_op, operands[2],
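  /* The fall-through above becomes a plain conditional move once the
     operands are in registers, i.e. roughly (sketch only)

	movl	cf_reg, dest
	cmpl	op1, op0
	cmovcc	ct_reg, dest

     with the actual condition taken from COMPARE_OP.  */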
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)

  else if (code == UNGE)

      std::swap (if_true, if_false);

  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))

  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)

      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);

      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);

  emit_insn (gen_rtx_SET (dest, tmp));
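/* Background note (illustrative, not generated verbatim here): the scalar
   minss/minsd instructions compute "op0 < op1 ? op0 : op1" and return the
   second source operand when the compare is unordered or when both inputs
   are zero (of either sign).  So "x < y ? x : y" maps directly onto

	minss	y, x		; x = x < y ? x : y   (AT&T order)

   only while the operand order above is preserved, which is why this
   function refuses to interchange the operands and uses the
   UNSPEC_IEEE_MIN/MAX patterns whenever NaNs or signed zeros matter.  */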
/* Return true if MODE is valid for a vector compare into a mask register;
   the same result holds for a conditional vector move with a mask register.  */

ix86_valid_mask_cmp_mode (machine_mode mode)

  /* XOP has its own vector conditional movement.  */
  if (TARGET_XOP && !TARGET_AVX512F)

  /* AVX512F is needed for mask operation.  */
  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))

  /* AVX512BW is needed for vector QI/HImode,
     AVX512VL is needed for 128/256-bit vectors.  */
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int vector_size = GET_MODE_SIZE (mode);
  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)

  return vector_size == 64 || TARGET_AVX512VL;
/* Expand an SSE comparison.  Return the register with the result.  */

ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)

  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In the general case the result of the comparison can differ from the
     operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of the comparison is an integer mask.  */
  bool maskcmp = false;

  if (ix86_valid_mask_cmp_mode (cmp_ops_mode))

      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);

      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;

    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  int (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode && !maskcmp)

      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);

    emit_insn (gen_rtx_SET (dest, x));
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.  */

ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)

  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))

      emit_move_insn (dest, op_true);

  /* In AVX512F the result of the comparison is an integer mask.  */
  bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);

      /* If we have an integer mask and an FP value then we need
	 to cast the mask to the FP mode.  */
      if (mode != cmpmode && VECTOR_MODE_P (cmpmode))

	  cmp = force_reg (cmpmode, cmp);
	  cmp = gen_rtx_SUBREG (mode, cmp, 0);

      /* Using vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))

	  rtx (*gen_not) (rtx, rtx);

	    case E_QImode: gen_not = gen_knotqi; break;
	    case E_HImode: gen_not = gen_knothi; break;
	    case E_SImode: gen_not = gen_knotsi; break;
	    case E_DImode: gen_not = gen_knotdi; break;
	    default: gcc_unreachable ();

	  rtx n = gen_reg_rtx (cmpmode);
	  emit_insn (gen_not (n, cmp));

	  /* Reverse op_true and op_false.  */
	  std::swap (op_true, op_false);

      rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
      emit_insn (gen_rtx_SET (dest, vec_merge));

  else if (vector_all_ones_operand (op_true, mode)
	   && op_false == CONST0_RTX (mode))

      emit_insn (gen_rtx_SET (dest, cmp));

  else if (op_false == CONST0_RTX (mode))

      op_true = force_reg (mode, op_true);
      x = gen_rtx_AND (mode, cmp, op_true);
      emit_insn (gen_rtx_SET (dest, x));

  else if (op_true == CONST0_RTX (mode))

      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      x = gen_rtx_AND (mode, x, op_false);
      emit_insn (gen_rtx_SET (dest, x));

  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))

      op_false = force_reg (mode, op_false);
      x = gen_rtx_IOR (mode, cmp, op_false);
      emit_insn (gen_rtx_SET (dest, x));

  else if (TARGET_XOP)

      op_true = force_reg (mode, op_true);

      if (!nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,

      rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

      if (!vector_operand (op_true, mode))
	op_true = force_reg (mode, op_true);

      op_false = force_reg (mode, op_false);

	  gen = gen_sse4_1_blendvps;

	  gen = gen_sse4_1_blendvpd;

	  gen = gen_sse4_1_blendvss;
	  op_true = force_reg (mode, op_true);

	  gen = gen_sse4_1_blendvsd;
	  op_true = force_reg (mode, op_true);

	  gen = gen_sse4_1_pblendvb;
	  if (mode != V16QImode)
	    d = gen_reg_rtx (V16QImode);
	  op_false = gen_lowpart (V16QImode, op_false);
	  op_true = gen_lowpart (V16QImode, op_true);
	  cmp = gen_lowpart (V16QImode, cmp);

	  gen = gen_avx_blendvps256;

	  gen = gen_avx_blendvpd256;

	  gen = gen_avx2_pblendvb;
	  if (mode != V32QImode)
	    d = gen_reg_rtx (V32QImode);
	  op_false = gen_lowpart (V32QImode, op_false);
	  op_true = gen_lowpart (V32QImode, op_true);
	  cmp = gen_lowpart (V32QImode, cmp);

	  gen = gen_avx512bw_blendmv64qi;

	  gen = gen_avx512bw_blendmv32hi;

	  gen = gen_avx512f_blendmv16si;

	  gen = gen_avx512f_blendmv8di;

	  gen = gen_avx512f_blendmv8df;

	  gen = gen_avx512f_blendmv16sf;

	  emit_insn (gen (d, op_false, op_true, cmp));

	    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));

      op_true = force_reg (mode, op_true);

      t2 = gen_reg_rtx (mode);

	t3 = gen_reg_rtx (mode);

      x = gen_rtx_AND (mode, op_true, cmp);
      emit_insn (gen_rtx_SET (t2, x));

      x = gen_rtx_NOT (mode, cmp);
      x = gen_rtx_AND (mode, x, op_false);
      emit_insn (gen_rtx_SET (t3, x));

      x = gen_rtx_IOR (mode, t3, t2);
      emit_insn (gen_rtx_SET (dest, x));
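/* When none of the blend or mask-move forms above apply, the tail of the
   function computes the classic bitwise select, i.e. (sketch)

	t2   = cmp & op_true
	t3   = ~cmp & op_false		(a single andnps/pandn)
	dest = t2 | t3

   which relies on the comparison result being all-ones or all-zeros in
   every element.  */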
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute commutative
   operators.  The POP0/POP1 operands are updated in place.  The new
   comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)

      /* AVX supports all the needed comparisons.  */

      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */

      /* These are supported directly.  */

      /* AVX has 3 operand comparisons, no need to swap anything.  */

      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))

      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
/* Expand a floating-point conditional move.  Return true if successful.  */

ix86_expand_fp_movcc (rtx operands[])

  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))

      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);

      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)

      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))

      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison.  */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))

      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);

  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));
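/* For instance (sketch only), a signed "a < b ? x : y" kept in x87
   registers cannot use fcmov directly, since fcmov only tests the
   CF/ZF/PF style conditions; the code above therefore materializes the
   condition first, roughly

	setl	%al
	testb	%al, %al
	fcmovne	%st(1), %st

   with the setcc result living in a QImode pseudo compared against zero.  */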
/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */

ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)

/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */

ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)

/* Return immediate value to be used in UNSPEC_PCMP
   for comparison CODE in MODE.  */

ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)

  if (FLOAT_MODE_P (mode))
    return ix86_fp_cmp_code_to_pcmp_immediate (code);
  return ix86_int_cmp_code_to_pcmp_immediate (code);
3960 ix86_expand_mask_vec_cmp (rtx dest
, enum rtx_code code
, rtx cmp_op0
, rtx cmp_op1
)
3962 machine_mode mask_mode
= GET_MODE (dest
);
3963 machine_mode cmp_mode
= GET_MODE (cmp_op0
);
3964 rtx imm
= GEN_INT (ix86_cmp_code_to_pcmp_immediate (code
, cmp_mode
));
3974 unspec_code
= UNSPEC_UNSIGNED_PCMP
;
3978 unspec_code
= UNSPEC_PCMP
;
3981 unspec
= gen_rtx_UNSPEC (mask_mode
, gen_rtvec (3, cmp_op0
, cmp_op1
, imm
),
3983 emit_insn (gen_rtx_SET (dest
, unspec
));
/* Expand fp vector comparison.  */

ix86_expand_fp_vec_cmp (rtx operands[])

  enum rtx_code code = GET_CODE (operands[1]);

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[2], &operands[3]);
  if (code == UNKNOWN)

  switch (GET_CODE (operands[1]))

      temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
				  operands[3], NULL, NULL);
      cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
				 operands[3], NULL, NULL);

      temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
				  operands[3], NULL, NULL);
      cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
				 operands[3], NULL, NULL);

      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,

      cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
				 operands[1], operands[2]);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);
4034 ix86_expand_int_sse_cmp (rtx dest
, enum rtx_code code
, rtx cop0
, rtx cop1
,
4035 rtx op_true
, rtx op_false
, bool *negate
)
4037 machine_mode data_mode
= GET_MODE (dest
);
4038 machine_mode mode
= GET_MODE (cop0
);
4043 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4045 && (mode
== V16QImode
|| mode
== V8HImode
4046 || mode
== V4SImode
|| mode
== V2DImode
))
4048 /* AVX512F supports all of the comparsions
4049 on all 128/256/512-bit vector int types. */
4050 else if (ix86_valid_mask_cmp_mode (mode
))
4054 /* Canonicalize the comparison to EQ, GT, GTU. */
4065 code
= reverse_condition (code
);
4071 code
= reverse_condition (code
);
4077 std::swap (cop0
, cop1
);
4078 code
= swap_condition (code
);
4085 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4086 if (mode
== V2DImode
)
4091 /* SSE4.1 supports EQ. */
4098 /* SSE4.2 supports GT/GTU. */
4108 rtx optrue
= op_true
? op_true
: CONSTM1_RTX (data_mode
);
4109 rtx opfalse
= op_false
? op_false
: CONST0_RTX (data_mode
);
4111 std::swap (optrue
, opfalse
);
4113 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4114 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4115 min (x, y) == x). While we add one instruction (the minimum),
4116 we remove the need for two instructions in the negation, as the
4117 result is done this way.
4118 When using masks, do it for SI/DImode element types, as it is shorter
4119 than the two subtractions. */
4121 && GET_MODE_SIZE (mode
) != 64
4122 && vector_all_ones_operand (opfalse
, data_mode
)
4123 && optrue
== CONST0_RTX (data_mode
))
4125 && GET_MODE_SIZE (GET_MODE_INNER (mode
)) >= 4
4126 /* Don't do it if not using integer masks and we'd end up with
4127 the right values in the registers though. */
4128 && (GET_MODE_SIZE (mode
) == 64
4129 || !vector_all_ones_operand (optrue
, data_mode
)
4130 || opfalse
!= CONST0_RTX (data_mode
))))
4132 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4137 gen
= (code
== GTU
) ? gen_uminv16si3
: gen_sminv16si3
;
4140 gen
= (code
== GTU
) ? gen_uminv8di3
: gen_sminv8di3
;
4141 cop0
= force_reg (mode
, cop0
);
4142 cop1
= force_reg (mode
, cop1
);
4146 gen
= (code
== GTU
) ? gen_uminv32qi3
: gen_sminv32qi3
;
4150 gen
= (code
== GTU
) ? gen_uminv16hi3
: gen_sminv16hi3
;
4154 gen
= (code
== GTU
) ? gen_uminv8si3
: gen_sminv8si3
;
4157 if (TARGET_AVX512VL
)
4159 gen
= (code
== GTU
) ? gen_uminv4di3
: gen_sminv4di3
;
4160 cop0
= force_reg (mode
, cop0
);
4161 cop1
= force_reg (mode
, cop1
);
4165 if (code
== GTU
&& TARGET_SSE2
)
4166 gen
= gen_uminv16qi3
;
4167 else if (code
== GT
&& TARGET_SSE4_1
)
4168 gen
= gen_sminv16qi3
;
4171 if (code
== GTU
&& TARGET_SSE4_1
)
4172 gen
= gen_uminv8hi3
;
4173 else if (code
== GT
&& TARGET_SSE2
)
4174 gen
= gen_sminv8hi3
;
4178 gen
= (code
== GTU
) ? gen_uminv4si3
: gen_sminv4si3
;
4181 if (TARGET_AVX512VL
)
4183 gen
= (code
== GTU
) ? gen_uminv2di3
: gen_sminv2di3
;
4184 cop0
= force_reg (mode
, cop0
);
4185 cop1
= force_reg (mode
, cop1
);
4194 rtx tem
= gen_reg_rtx (mode
);
4195 if (!vector_operand (cop0
, mode
))
4196 cop0
= force_reg (mode
, cop0
);
4197 if (!vector_operand (cop1
, mode
))
4198 cop1
= force_reg (mode
, cop1
);
4200 emit_insn (gen (tem
, cop0
, cop1
));
4206 /* Unsigned parallel compare is not supported by the hardware.
4207 Play some tricks to turn this into a signed comparison
4211 cop0
= force_reg (mode
, cop0
);
4224 /* Subtract (-(INT MAX) - 1) from both operands to make
4226 mask
= ix86_build_signbit_mask (mode
, true, false);
4227 t1
= gen_reg_rtx (mode
);
4228 emit_insn (gen_sub3_insn (t1
, cop0
, mask
));
4230 t2
= gen_reg_rtx (mode
);
4231 emit_insn (gen_sub3_insn (t2
, cop1
, mask
));
4245 /* Perform a parallel unsigned saturating subtraction. */
4246 x
= gen_reg_rtx (mode
);
4247 emit_insn (gen_rtx_SET
4248 (x
, gen_rtx_US_MINUS (mode
, cop0
, cop1
)));
4250 cop1
= CONST0_RTX (mode
);
4262 std::swap (op_true
, op_false
);
4264 /* Allow the comparison to be done in one mode, but the movcc to
4265 happen in another mode. */
4266 if (data_mode
== mode
)
4268 x
= ix86_expand_sse_cmp (dest
, code
, cop0
, cop1
,
4273 gcc_assert (GET_MODE_SIZE (data_mode
) == GET_MODE_SIZE (mode
));
4274 x
= ix86_expand_sse_cmp (gen_reg_rtx (mode
), code
, cop0
, cop1
,
4276 if (GET_MODE (x
) == mode
)
4277 x
= gen_lowpart (data_mode
, x
);
4283 /* Expand integer vector comparison. */
4286 ix86_expand_int_vec_cmp (rtx operands
[])
4288 rtx_code code
= GET_CODE (operands
[1]);
4289 bool negate
= false;
4290 rtx cmp
= ix86_expand_int_sse_cmp (operands
[0], code
, operands
[2],
4291 operands
[3], NULL
, NULL
, &negate
);
4297 cmp
= ix86_expand_int_sse_cmp (operands
[0], EQ
, cmp
,
4298 CONST0_RTX (GET_MODE (cmp
)),
4299 NULL
, NULL
, &negate
);
4301 gcc_assert (!negate
);
4303 if (operands
[0] != cmp
)
4304 emit_move_insn (operands
[0], cmp
);
4309 /* Expand a floating-point vector conditional move; a vcond operation
4310 rather than a movcc operation. */
4313 ix86_expand_fp_vcond (rtx operands
[])
4315 enum rtx_code code
= GET_CODE (operands
[3]);
4318 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4319 &operands
[4], &operands
[5]);
4320 if (code
== UNKNOWN
)
4323 switch (GET_CODE (operands
[3]))
4326 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[4],
4327 operands
[5], operands
[0], operands
[0]);
4328 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[4],
4329 operands
[5], operands
[1], operands
[2]);
4333 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[4],
4334 operands
[5], operands
[0], operands
[0]);
4335 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[4],
4336 operands
[5], operands
[1], operands
[2]);
4342 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4344 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4348 if (ix86_expand_sse_fp_minmax (operands
[0], code
, operands
[4],
4349 operands
[5], operands
[1], operands
[2]))
4352 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[4], operands
[5],
4353 operands
[1], operands
[2]);
4354 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4358 /* Expand a signed/unsigned integral vector conditional move. */
4361 ix86_expand_int_vcond (rtx operands
[])
4363 machine_mode data_mode
= GET_MODE (operands
[0]);
4364 machine_mode mode
= GET_MODE (operands
[4]);
4365 enum rtx_code code
= GET_CODE (operands
[3]);
4366 bool negate
= false;
4372 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4373 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4374 if ((code
== LT
|| code
== GE
)
4375 && data_mode
== mode
4376 && cop1
== CONST0_RTX (mode
)
4377 && operands
[1 + (code
== LT
)] == CONST0_RTX (data_mode
)
4378 && GET_MODE_UNIT_SIZE (data_mode
) > 1
4379 && GET_MODE_UNIT_SIZE (data_mode
) <= 8
4380 && (GET_MODE_SIZE (data_mode
) == 16
4381 || (TARGET_AVX2
&& GET_MODE_SIZE (data_mode
) == 32)))
4383 rtx negop
= operands
[2 - (code
== LT
)];
4384 int shift
= GET_MODE_UNIT_BITSIZE (data_mode
) - 1;
4385 if (negop
== CONST1_RTX (data_mode
))
4387 rtx res
= expand_simple_binop (mode
, LSHIFTRT
, cop0
, GEN_INT (shift
),
4388 operands
[0], 1, OPTAB_DIRECT
);
4389 if (res
!= operands
[0])
4390 emit_move_insn (operands
[0], res
);
4393 else if (GET_MODE_INNER (data_mode
) != DImode
4394 && vector_all_ones_operand (negop
, data_mode
))
4396 rtx res
= expand_simple_binop (mode
, ASHIFTRT
, cop0
, GEN_INT (shift
),
4397 operands
[0], 0, OPTAB_DIRECT
);
4398 if (res
!= operands
[0])
4399 emit_move_insn (operands
[0], res
);
4404 if (!nonimmediate_operand (cop1
, mode
))
4405 cop1
= force_reg (mode
, cop1
);
4406 if (!general_operand (operands
[1], data_mode
))
4407 operands
[1] = force_reg (data_mode
, operands
[1]);
4408 if (!general_operand (operands
[2], data_mode
))
4409 operands
[2] = force_reg (data_mode
, operands
[2]);
4411 x
= ix86_expand_int_sse_cmp (operands
[0], code
, cop0
, cop1
,
4412 operands
[1], operands
[2], &negate
);
4417 ix86_expand_sse_movcc (operands
[0], x
, operands
[1+negate
],
4418 operands
[2-negate
]);
4423 ix86_expand_vec_perm_vpermt2 (rtx target
, rtx mask
, rtx op0
, rtx op1
,
4424 struct expand_vec_perm_d
*d
)
4426 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4427 expander, so args are either in d, or in op0, op1 etc. */
4428 machine_mode mode
= GET_MODE (d
? d
->op0
: op0
);
4429 machine_mode maskmode
= mode
;
4430 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
4435 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4436 gen
= gen_avx512vl_vpermt2varv8hi3
;
4439 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4440 gen
= gen_avx512vl_vpermt2varv16hi3
;
4443 if (TARGET_AVX512VBMI
)
4444 gen
= gen_avx512bw_vpermt2varv64qi3
;
4447 if (TARGET_AVX512BW
)
4448 gen
= gen_avx512bw_vpermt2varv32hi3
;
4451 if (TARGET_AVX512VL
)
4452 gen
= gen_avx512vl_vpermt2varv4si3
;
4455 if (TARGET_AVX512VL
)
4456 gen
= gen_avx512vl_vpermt2varv8si3
;
4460 gen
= gen_avx512f_vpermt2varv16si3
;
4463 if (TARGET_AVX512VL
)
4465 gen
= gen_avx512vl_vpermt2varv4sf3
;
4466 maskmode
= V4SImode
;
4470 if (TARGET_AVX512VL
)
4472 gen
= gen_avx512vl_vpermt2varv8sf3
;
4473 maskmode
= V8SImode
;
4479 gen
= gen_avx512f_vpermt2varv16sf3
;
4480 maskmode
= V16SImode
;
4484 if (TARGET_AVX512VL
)
4485 gen
= gen_avx512vl_vpermt2varv2di3
;
4488 if (TARGET_AVX512VL
)
4489 gen
= gen_avx512vl_vpermt2varv4di3
;
4493 gen
= gen_avx512f_vpermt2varv8di3
;
4496 if (TARGET_AVX512VL
)
4498 gen
= gen_avx512vl_vpermt2varv2df3
;
4499 maskmode
= V2DImode
;
4503 if (TARGET_AVX512VL
)
4505 gen
= gen_avx512vl_vpermt2varv4df3
;
4506 maskmode
= V4DImode
;
4512 gen
= gen_avx512f_vpermt2varv8df3
;
4513 maskmode
= V8DImode
;
4523 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4524 expander, so args are either in d, or in op0, op1 etc. */
4531 for (int i
= 0; i
< d
->nelt
; ++i
)
4532 vec
[i
] = GEN_INT (d
->perm
[i
]);
4533 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
4536 emit_insn (gen (target
, force_reg (maskmode
, mask
), op0
, op1
));
4540 /* Expand a variable vector permutation. */
4543 ix86_expand_vec_perm (rtx operands
[])
4545 rtx target
= operands
[0];
4546 rtx op0
= operands
[1];
4547 rtx op1
= operands
[2];
4548 rtx mask
= operands
[3];
4549 rtx t1
, t2
, t3
, t4
, t5
, t6
, t7
, t8
, vt
, vt2
, vec
[32];
4550 machine_mode mode
= GET_MODE (op0
);
4551 machine_mode maskmode
= GET_MODE (mask
);
4553 bool one_operand_shuffle
= rtx_equal_p (op0
, op1
);
4555 /* Number of elements in the vector. */
4556 w
= GET_MODE_NUNITS (mode
);
4557 e
= GET_MODE_UNIT_SIZE (mode
);
4558 gcc_assert (w
<= 64);
4560 if (TARGET_AVX512F
&& one_operand_shuffle
)
4562 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4566 gen
=gen_avx512f_permvarv16si
;
4569 gen
= gen_avx512f_permvarv16sf
;
4572 gen
= gen_avx512f_permvarv8di
;
4575 gen
= gen_avx512f_permvarv8df
;
4582 emit_insn (gen (target
, op0
, mask
));
4587 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
4592 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
4594 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4595 an constant shuffle operand. With a tiny bit of effort we can
4596 use VPERMD instead. A re-interpretation stall for V4DFmode is
4597 unfortunate but there's no avoiding it.
4598 Similarly for V16HImode we don't have instructions for variable
4599 shuffling, while for V32QImode we can use after preparing suitable
4600 masks vpshufb; vpshufb; vpermq; vpor. */
4602 if (mode
== V16HImode
)
4604 maskmode
= mode
= V32QImode
;
4610 maskmode
= mode
= V8SImode
;
4614 t1
= gen_reg_rtx (maskmode
);
4616 /* Replicate the low bits of the V4DImode mask into V8SImode:
4618 t1 = { A A B B C C D D }. */
4619 for (i
= 0; i
< w
/ 2; ++i
)
4620 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
4621 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4622 vt
= force_reg (maskmode
, vt
);
4623 mask
= gen_lowpart (maskmode
, mask
);
4624 if (maskmode
== V8SImode
)
4625 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
4627 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
4629 /* Multiply the shuffle indicies by two. */
4630 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
4633 /* Add one to the odd shuffle indicies:
4634 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4635 for (i
= 0; i
< w
/ 2; ++i
)
4637 vec
[i
* 2] = const0_rtx
;
4638 vec
[i
* 2 + 1] = const1_rtx
;
4640 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4641 vt
= validize_mem (force_const_mem (maskmode
, vt
));
4642 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
4645 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4646 operands
[3] = mask
= t1
;
4647 target
= gen_reg_rtx (mode
);
4648 op0
= gen_lowpart (mode
, op0
);
4649 op1
= gen_lowpart (mode
, op1
);
4655 /* The VPERMD and VPERMPS instructions already properly ignore
4656 the high bits of the shuffle elements. No need for us to
4657 perform an AND ourselves. */
4658 if (one_operand_shuffle
)
4660 emit_insn (gen_avx2_permvarv8si (target
, op0
, mask
));
4661 if (target
!= operands
[0])
4662 emit_move_insn (operands
[0],
4663 gen_lowpart (GET_MODE (operands
[0]), target
));
4667 t1
= gen_reg_rtx (V8SImode
);
4668 t2
= gen_reg_rtx (V8SImode
);
4669 emit_insn (gen_avx2_permvarv8si (t1
, op0
, mask
));
4670 emit_insn (gen_avx2_permvarv8si (t2
, op1
, mask
));
4676 mask
= gen_lowpart (V8SImode
, mask
);
4677 if (one_operand_shuffle
)
4678 emit_insn (gen_avx2_permvarv8sf (target
, op0
, mask
));
4681 t1
= gen_reg_rtx (V8SFmode
);
4682 t2
= gen_reg_rtx (V8SFmode
);
4683 emit_insn (gen_avx2_permvarv8sf (t1
, op0
, mask
));
4684 emit_insn (gen_avx2_permvarv8sf (t2
, op1
, mask
));
4690 /* By combining the two 128-bit input vectors into one 256-bit
4691 input vector, we can use VPERMD and VPERMPS for the full
4692 two-operand shuffle. */
4693 t1
= gen_reg_rtx (V8SImode
);
4694 t2
= gen_reg_rtx (V8SImode
);
4695 emit_insn (gen_avx_vec_concatv8si (t1
, op0
, op1
));
4696 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4697 emit_insn (gen_avx2_permvarv8si (t1
, t1
, t2
));
4698 emit_insn (gen_avx_vextractf128v8si (target
, t1
, const0_rtx
));
4702 t1
= gen_reg_rtx (V8SFmode
);
4703 t2
= gen_reg_rtx (V8SImode
);
4704 mask
= gen_lowpart (V4SImode
, mask
);
4705 emit_insn (gen_avx_vec_concatv8sf (t1
, op0
, op1
));
4706 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4707 emit_insn (gen_avx2_permvarv8sf (t1
, t1
, t2
));
4708 emit_insn (gen_avx_vextractf128v8sf (target
, t1
, const0_rtx
));
4712 t1
= gen_reg_rtx (V32QImode
);
4713 t2
= gen_reg_rtx (V32QImode
);
4714 t3
= gen_reg_rtx (V32QImode
);
4715 vt2
= GEN_INT (-128);
4716 vt
= gen_const_vec_duplicate (V32QImode
, vt2
);
4717 vt
= force_reg (V32QImode
, vt
);
4718 for (i
= 0; i
< 32; i
++)
4719 vec
[i
] = i
< 16 ? vt2
: const0_rtx
;
4720 vt2
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, vec
));
4721 vt2
= force_reg (V32QImode
, vt2
);
4722 /* From mask create two adjusted masks, which contain the same
4723 bits as mask in the low 7 bits of each vector element.
4724 The first mask will have the most significant bit clear
4725 if it requests element from the same 128-bit lane
4726 and MSB set if it requests element from the other 128-bit lane.
4727 The second mask will have the opposite values of the MSB,
4728 and additionally will have its 128-bit lanes swapped.
4729 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4730 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4731 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4732 stands for other 12 bytes. */
4733 /* The bit whether element is from the same lane or the other
4734 lane is bit 4, so shift it up by 3 to the MSB position. */
4735 t5
= gen_reg_rtx (V4DImode
);
4736 emit_insn (gen_ashlv4di3 (t5
, gen_lowpart (V4DImode
, mask
),
4738 /* Clear MSB bits from the mask just in case it had them set. */
4739 emit_insn (gen_avx2_andnotv32qi3 (t2
, vt
, mask
));
4740 /* After this t1 will have MSB set for elements from other lane. */
4741 emit_insn (gen_xorv32qi3 (t1
, gen_lowpart (V32QImode
, t5
), vt2
));
4742 /* Clear bits other than MSB. */
4743 emit_insn (gen_andv32qi3 (t1
, t1
, vt
));
4744 /* Or in the lower bits from mask into t3. */
4745 emit_insn (gen_iorv32qi3 (t3
, t1
, t2
));
4746 /* And invert MSB bits in t1, so MSB is set for elements from the same
4748 emit_insn (gen_xorv32qi3 (t1
, t1
, vt
));
4749 /* Swap 128-bit lanes in t3. */
4750 t6
= gen_reg_rtx (V4DImode
);
4751 emit_insn (gen_avx2_permv4di_1 (t6
, gen_lowpart (V4DImode
, t3
),
4752 const2_rtx
, GEN_INT (3),
4753 const0_rtx
, const1_rtx
));
4754 /* And or in the lower bits from mask into t1. */
4755 emit_insn (gen_iorv32qi3 (t1
, t1
, t2
));
4756 if (one_operand_shuffle
)
4758 /* Each of these shuffles will put 0s in places where
4759 element from the other 128-bit lane is needed, otherwise
4760 will shuffle in the requested value. */
4761 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op0
,
4762 gen_lowpart (V32QImode
, t6
)));
4763 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op0
, t1
));
4764 /* For t3 the 128-bit lanes are swapped again. */
4765 t7
= gen_reg_rtx (V4DImode
);
4766 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t3
),
4767 const2_rtx
, GEN_INT (3),
4768 const0_rtx
, const1_rtx
));
4769 /* And oring both together leads to the result. */
4770 emit_insn (gen_iorv32qi3 (target
, t1
,
4771 gen_lowpart (V32QImode
, t7
)));
4772 if (target
!= operands
[0])
4773 emit_move_insn (operands
[0],
4774 gen_lowpart (GET_MODE (operands
[0]), target
));
4778 t4
= gen_reg_rtx (V32QImode
);
4779 /* Similarly to the above one_operand_shuffle code,
4780 just for repeated twice for each operand. merge_two:
4781 code will merge the two results together. */
4782 emit_insn (gen_avx2_pshufbv32qi3 (t4
, op0
,
4783 gen_lowpart (V32QImode
, t6
)));
4784 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op1
,
4785 gen_lowpart (V32QImode
, t6
)));
4786 emit_insn (gen_avx2_pshufbv32qi3 (t2
, op0
, t1
));
4787 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op1
, t1
));
4788 t7
= gen_reg_rtx (V4DImode
);
4789 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t4
),
4790 const2_rtx
, GEN_INT (3),
4791 const0_rtx
, const1_rtx
));
4792 t8
= gen_reg_rtx (V4DImode
);
4793 emit_insn (gen_avx2_permv4di_1 (t8
, gen_lowpart (V4DImode
, t3
),
4794 const2_rtx
, GEN_INT (3),
4795 const0_rtx
, const1_rtx
));
4796 emit_insn (gen_iorv32qi3 (t4
, t2
, gen_lowpart (V32QImode
, t7
)));
4797 emit_insn (gen_iorv32qi3 (t3
, t1
, gen_lowpart (V32QImode
, t8
)));
4803 gcc_assert (GET_MODE_SIZE (mode
) <= 16);
4810 /* The XOP VPPERM insn supports three inputs. By ignoring the
4811 one_operand_shuffle special case, we avoid creating another
4812 set of constant vectors in memory. */
4813 one_operand_shuffle
= false;
4815 /* mask = mask & {2*w-1, ...} */
4816 vt
= GEN_INT (2*w
- 1);
4820 /* mask = mask & {w-1, ...} */
4821 vt
= GEN_INT (w
- 1);
4824 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4825 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4826 NULL_RTX
, 0, OPTAB_DIRECT
);
4828 /* For non-QImode operations, convert the word permutation control
4829 into a byte permutation control. */
4830 if (mode
!= V16QImode
)
4832 mask
= expand_simple_binop (maskmode
, ASHIFT
, mask
,
4833 GEN_INT (exact_log2 (e
)),
4834 NULL_RTX
, 0, OPTAB_DIRECT
);
4836 /* Convert mask to vector of chars. */
4837 mask
= force_reg (V16QImode
, gen_lowpart (V16QImode
, mask
));
4839 /* Replicate each of the input bytes into byte positions:
4840 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4841 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4842 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4843 for (i
= 0; i
< 16; ++i
)
4844 vec
[i
] = GEN_INT (i
/e
* e
);
4845 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4846 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4848 emit_insn (gen_xop_pperm (mask
, mask
, mask
, vt
));
4850 emit_insn (gen_ssse3_pshufbv16qi3 (mask
, mask
, vt
));
4852 /* Convert it into the byte positions by doing
4853 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4854 for (i
= 0; i
< 16; ++i
)
4855 vec
[i
] = GEN_INT (i
% e
);
4856 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4857 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4858 emit_insn (gen_addv16qi3 (mask
, mask
, vt
));
4861 /* The actual shuffle operations all operate on V16QImode. */
4862 op0
= gen_lowpart (V16QImode
, op0
);
4863 op1
= gen_lowpart (V16QImode
, op1
);
4867 if (GET_MODE (target
) != V16QImode
)
4868 target
= gen_reg_rtx (V16QImode
);
4869 emit_insn (gen_xop_pperm (target
, op0
, op1
, mask
));
4870 if (target
!= operands
[0])
4871 emit_move_insn (operands
[0],
4872 gen_lowpart (GET_MODE (operands
[0]), target
));
4874 else if (one_operand_shuffle
)
4876 if (GET_MODE (target
) != V16QImode
)
4877 target
= gen_reg_rtx (V16QImode
);
4878 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, mask
));
4879 if (target
!= operands
[0])
4880 emit_move_insn (operands
[0],
4881 gen_lowpart (GET_MODE (operands
[0]), target
));
4888 /* Shuffle the two input vectors independently. */
4889 t1
= gen_reg_rtx (V16QImode
);
4890 t2
= gen_reg_rtx (V16QImode
);
4891 emit_insn (gen_ssse3_pshufbv16qi3 (t1
, op0
, mask
));
4892 emit_insn (gen_ssse3_pshufbv16qi3 (t2
, op1
, mask
));
4895 /* Then merge them together. The key is whether any given control
4896 element contained a bit set that indicates the second word. */
4899 if (maskmode
== V2DImode
&& !TARGET_SSE4_1
)
4901 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4902 more shuffle to convert the V2DI input mask into a V4SI
4903 input mask. At which point the masking that expand_int_vcond
4904 will work as desired. */
4905 rtx t3
= gen_reg_rtx (V4SImode
);
4906 emit_insn (gen_sse2_pshufd_1 (t3
, gen_lowpart (V4SImode
, mask
),
4907 const0_rtx
, const0_rtx
,
4908 const2_rtx
, const2_rtx
));
4910 maskmode
= V4SImode
;
4914 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4915 vt
= force_reg (maskmode
, vt
);
4916 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4917 NULL_RTX
, 0, OPTAB_DIRECT
);
4919 if (GET_MODE (target
) != mode
)
4920 target
= gen_reg_rtx (mode
);
4922 xops
[1] = gen_lowpart (mode
, t2
);
4923 xops
[2] = gen_lowpart (mode
, t1
);
4924 xops
[3] = gen_rtx_EQ (maskmode
, mask
, vt
);
4927 ok
= ix86_expand_int_vcond (xops
);
4929 if (target
!= operands
[0])
4930 emit_move_insn (operands
[0],
4931 gen_lowpart (GET_MODE (operands
[0]), target
));
/* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  HIGH_P is
   true if we want the N/2 high elements, else the low elements.  */

void
ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
{
  machine_mode imode = GET_MODE (src);
  rtx tmp;

  if (TARGET_SSE4_1)
    {
      rtx (*unpack) (rtx, rtx);
      rtx (*extract) (rtx, rtx) = NULL;
      machine_mode halfmode = BLKmode;

      switch (imode)
        {
        case E_V64QImode:
          if (unsigned_p)
            unpack = gen_avx512bw_zero_extendv32qiv32hi2;
          else
            unpack = gen_avx512bw_sign_extendv32qiv32hi2;
          halfmode = V32QImode;
          extract
            = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
          break;
        case E_V32QImode:
          if (unsigned_p)
            unpack = gen_avx2_zero_extendv16qiv16hi2;
          else
            unpack = gen_avx2_sign_extendv16qiv16hi2;
          halfmode = V16QImode;
          extract
            = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
          break;
        case E_V32HImode:
          if (unsigned_p)
            unpack = gen_avx512f_zero_extendv16hiv16si2;
          else
            unpack = gen_avx512f_sign_extendv16hiv16si2;
          halfmode = V16HImode;
          extract
            = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
          break;
        case E_V16HImode:
          if (unsigned_p)
            unpack = gen_avx2_zero_extendv8hiv8si2;
          else
            unpack = gen_avx2_sign_extendv8hiv8si2;
          halfmode = V8HImode;
          extract
            = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
          break;
        case E_V16SImode:
          if (unsigned_p)
            unpack = gen_avx512f_zero_extendv8siv8di2;
          else
            unpack = gen_avx512f_sign_extendv8siv8di2;
          halfmode = V8SImode;
          extract
            = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
          break;
        case E_V8SImode:
          if (unsigned_p)
            unpack = gen_avx2_zero_extendv4siv4di2;
          else
            unpack = gen_avx2_sign_extendv4siv4di2;
          halfmode = V4SImode;
          extract
            = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
          break;
        case E_V16QImode:
          if (unsigned_p)
            unpack = gen_sse4_1_zero_extendv8qiv8hi2;
          else
            unpack = gen_sse4_1_sign_extendv8qiv8hi2;
          break;
        case E_V8HImode:
          if (unsigned_p)
            unpack = gen_sse4_1_zero_extendv4hiv4si2;
          else
            unpack = gen_sse4_1_sign_extendv4hiv4si2;
          break;
        case E_V4SImode:
          if (unsigned_p)
            unpack = gen_sse4_1_zero_extendv2siv2di2;
          else
            unpack = gen_sse4_1_sign_extendv2siv2di2;
          break;
        default:
          gcc_unreachable ();
        }

      if (GET_MODE_SIZE (imode) >= 32)
        {
          tmp = gen_reg_rtx (halfmode);
          emit_insn (extract (tmp, src));
        }
      else if (high_p)
        {
          /* Shift higher 8 bytes to lower 8 bytes.  */
          tmp = gen_reg_rtx (V1TImode);
          emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
                                         GEN_INT (64)));
          tmp = gen_lowpart (imode, tmp);
        }
      else
        tmp = src;

      emit_insn (unpack (dest, tmp));
    }
  else
    {
      rtx (*unpack) (rtx, rtx, rtx);

      switch (imode)
        {
        case E_V16QImode:
          if (high_p)
            unpack = gen_vec_interleave_highv16qi;
          else
            unpack = gen_vec_interleave_lowv16qi;
          break;
        case E_V8HImode:
          if (high_p)
            unpack = gen_vec_interleave_highv8hi;
          else
            unpack = gen_vec_interleave_lowv8hi;
          break;
        case E_V4SImode:
          if (high_p)
            unpack = gen_vec_interleave_highv4si;
          else
            unpack = gen_vec_interleave_lowv4si;
          break;
        default:
          gcc_unreachable ();
        }

      if (unsigned_p)
        tmp = force_reg (imode, CONST0_RTX (imode));
      else
        tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
                                   src, pc_rtx, pc_rtx);

      rtx tmp2 = gen_reg_rtx (imode);
      emit_insn (unpack (tmp2, src, tmp));
      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
    }
}
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating point parameters and non-offsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally three parts are generated.  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size, i;

  if (!TARGET_64BIT)
    size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsettable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);
      gcc_assert (ok);
      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (GET_CODE (operand) == CONST_VECTOR)
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
         the operand may actually have a different mode now.  That's
         ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
        {
          if (REG_P (operand))
            {
              gcc_assert (reload_completed);
              for (i = 0; i < size; i++)
                parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
            }
          else if (offsettable_memref_p (operand))
            {
              operand = adjust_address (operand, SImode, 0);
              parts[0] = operand;
              for (i = 1; i < size; i++)
                parts[i] = adjust_address (operand, SImode, 4 * i);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              const REAL_VALUE_TYPE *r;
              long l[4];

              r = CONST_DOUBLE_REAL_VALUE (operand);
              switch (mode)
                {
                case E_TFmode:
                  real_to_target (l, r, mode);
                  parts[3] = gen_int_mode (l[3], SImode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_XFmode:
                  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
                     long double may not be 80-bit.  */
                  real_to_target (l, r, mode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_DFmode:
                  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
                  break;
                default:
                  gcc_unreachable ();
                }
              parts[1] = gen_int_mode (l[1], SImode);
              parts[0] = gen_int_mode (l[0], SImode);
            }
        }
    }
  else
    {
      if (mode == TImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
        {
          machine_mode upper_mode = mode == XFmode ? SImode : DImode;
          if (REG_P (operand))
            {
              gcc_assert (reload_completed);
              parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
              parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
            }
          else if (offsettable_memref_p (operand))
            {
              operand = adjust_address (operand, DImode, 0);
              parts[0] = operand;
              parts[1] = adjust_address (operand, upper_mode, 8);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              long l[4];

              real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

              /* real_to_target puts 32-bit pieces in each long.  */
              parts[0]
                = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
                                | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
                                   << 32), DImode);

              if (upper_mode == SImode)
                parts[1] = gen_int_mode (l[2], SImode);
              else
                parts[1]
                  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
                                  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
                                     << 32), DImode);
            }
        }
    }

  return size;
}
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */

void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  rtx tmp;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
         fp moves, that force all constants to memory to allow combining.  */
      if (MEM_P (operands[1])
          && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
          && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
        operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
        {
          operands[0] = copy_rtx (operands[0]);
          PUT_MODE (operands[0], word_mode);
        }
      else
        {
          operands[0] = gen_lowpart (DImode, operands[0]);
          operands[1] = gen_lowpart (DImode, operands[1]);
        }
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
                || offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
          && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
        src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
         automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
        part[1][i] = change_address (part[1][i],
                                     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      for (i = 0; i < nparts; i++)
        {
          collisionparts[i]
            = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
          if (collisionparts[i])
            collisions++;
        }

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts[1])
        {
          std::swap (part[0][1], part[0][2]);
          std::swap (part[1][1], part[1][2]);
        }
      else if (collisions == 1
               && nparts == 4
               && (collisionparts[1] || collisionparts[2]))
        {
          if (collisionparts[1])
            {
              std::swap (part[0][1], part[0][2]);
              std::swap (part[1][1], part[1][2]);
            }
          else
            {
              std::swap (part[0][2], part[0][3]);
              std::swap (part[1][2], part[1][3]);
            }
        }

      /* If there are more collisions, we can't handle it by reordering.
         Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
        {
          rtx base, addr;

          collisions = 1;

          base = part[0][nparts - 1];

          /* Handle the case when the last part isn't valid for lea.
             Happens in 64-bit mode storing the 12-byte XFmode.  */
          if (GET_MODE (base) != Pmode)
            base = gen_rtx_REG (Pmode, REGNO (base));

          addr = XEXP (part[1][0], 0);
          if (TARGET_TLS_DIRECT_SEG_REFS)
            {
              struct ix86_address parts;
              int ok = ix86_decompose_address (addr, &parts);
              gcc_assert (ok);
              /* It is not valid to use %gs: or %fs: in lea.  */
              gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
            }
          emit_insn (gen_rtx_SET (base, addr));
          part[1][0] = replace_equiv_address (part[1][0], base);
          for (i = 1; i < nparts; i++)
            {
              tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
              part[1][i] = replace_equiv_address (part[1][i], tmp);
            }
        }
    }

  if (push)
    {
      if (!TARGET_64BIT)
        {
          if (nparts == 3)
            {
              if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
                emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
              emit_move_insn (part[0][2], part[1][2]);
            }
          else if (nparts == 4)
            {
              emit_move_insn (part[0][3], part[1][3]);
              emit_move_insn (part[0][2], part[1][2]);
            }
        }
      else
        {
          /* In 64bit mode we don't have 32bit push available.  In case this is
             register, it is OK - we will just use larger counterpart.  We also
             retype memory - these come from an attempt to avoid REX prefix on
             moving of second half of TFmode value.  */
          if (GET_MODE (part[1][1]) == SImode)
            {
              switch (GET_CODE (part[1][1]))
                {
                case MEM:
                  part[1][1] = adjust_address (part[1][1], DImode, 0);
                  break;
                case REG:
                  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
                  break;
                default:
                  gcc_unreachable ();
                }

              if (GET_MODE (part[1][0]) == SImode)
                part[1][0] = part[1][1];
            }
        }
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
           || (nparts == 3
               && REGNO (part[0][0]) == REGNO (part[1][2]))
           || (nparts == 4
               && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
          && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
        {
          operands[2 + i] = part[0][j];
          operands[6 + i] = part[1][j];
        }
    }
  else
    {
      for (i = 0; i < nparts; i++)
        {
          operands[2 + i] = part[0][i];
          operands[6 + i] = part[1][i];
        }
    }

  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
  if (optimize_insn_for_size_p ())
    {
      for (j = 0; j < nparts - 1; j++)
        if (CONST_INT_P (operands[6 + j])
            && operands[6 + j] != const0_rtx
            && REG_P (operands[2 + j]))
          for (i = j; i < nparts - 1; i++)
            if (CONST_INT_P (operands[7 + i])
                && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
              operands[7 + i] = operands[2 + j];
    }

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);
}
/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  */

static void
ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
{
  if (count == 1
      || (count * ix86_cost->add <= ix86_cost->shift_const
          && !optimize_insn_for_size_p ()))
    {
      while (count-- > 0)
        emit_insn (gen_add2_insn (operand, operand));
    }
  else
    {
      rtx (*insn)(rtx, rtx, rtx);

      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
      emit_insn (insn (operand, operand, GEN_INT (count)));
    }
}
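
/* Example: with count == 2 and a cost model where two adds are no more
   expensive than one immediate shift, the first branch emits
   "add %reg, %reg" twice (doubling twice equals shifting left by 2);
   otherwise a single shift instruction is emitted.  */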
void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;
  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          emit_move_insn (high[0], low[1]);
          emit_move_insn (low[0], const0_rtx);

          if (count > half_width)
            ix86_expand_ashl_const (high[0], count - half_width, mode);
        }
      else
        {
          gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
          ix86_expand_ashl_const (low[0], count, mode);
        }
      return;
    }

  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen QImode-capable registers, then 1 << N
         can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
        {
          rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

          ix86_expand_clear (low[0]);
          ix86_expand_clear (high[0]);
          emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

          d = gen_lowpart (QImode, low[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_EQ (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));

          d = gen_lowpart (QImode, high[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_NE (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));
        }

      /* Otherwise, we can get the same results by manually performing
         a bit extract operation on bit 5/6, and then performing the two
         shifts.  The two methods of getting 0/1 into low/high are exactly
         the same size.  Avoiding the shift in the bit extract case helps
         pentium4 a bit; no one else seems to care much either way.  */
      else
        {
          rtx (*gen_lshr3)(rtx, rtx, rtx);
          rtx (*gen_and3)(rtx, rtx, rtx);
          rtx (*gen_xor3)(rtx, rtx, rtx);
          HOST_WIDE_INT bits;
          rtx x;

          if (mode == DImode)
            {
              gen_lshr3 = gen_lshrsi3;
              gen_and3 = gen_andsi3;
              gen_xor3 = gen_xorsi3;
              bits = 5;
            }
          else
            {
              gen_lshr3 = gen_lshrdi3;
              gen_and3 = gen_anddi3;
              gen_xor3 = gen_xordi3;
              bits = 6;
            }

          if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
            x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
          else
            x = gen_lowpart (half_mode, operands[2]);
          emit_insn (gen_rtx_SET (high[0], x));

          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
          emit_insn (gen_and3 (high[0], high[0], const1_rtx));
          emit_move_insn (low[0], high[0]);
          emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
        }

      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
         know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
        emit_move_insn (high[0], low[0]);
      else
        emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
                 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
        {
          emit_move_insn (high[0], high[1]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));
          emit_move_insn (low[0], high[0]);
        }
      else if (count >= half_width)
        {
          emit_move_insn (low[0], high[1]);
          emit_move_insn (high[0], low[0]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));

          if (count > half_width)
            emit_insn (gen_ashr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
        {
          emit_move_insn (scratch, high[0]);
          emit_insn (gen_ashr3 (scratch, scratch,
                                GEN_INT (half_width - 1)));
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_3
                   (half_mode, low[0], high[0], operands[2]));
    }
}
void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          emit_move_insn (low[0], high[1]);
          ix86_expand_clear (high[0]);

          if (count > half_width)
            emit_insn (gen_lshr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
        {
          ix86_expand_clear (scratch);
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_2
                   (half_mode, low[0], high[0], operands[2]));
    }
}
/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */

static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
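
/* Example: a constant count of 0x12345 fits in 32 bits and gets SImode
   (shorter encoding), while a constant above 0xffffffff on a 64-bit
   target needs DImode; a non-constant count keeps its own mode, or
   Pmode when it has none.  */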
/* When ISSETMEM is FALSE, output a simple loop to move memory from SRCPTR
   to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is
   COUNT, specified in bytes.  When ISSETMEM is TRUE, output the equivalent
   loop to fill memory with VALUE (assumed to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide MEM rtxes to supply proper aliasing info.  */

static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx count, machine_mode mode, int unroll,
                               int expected_size, bool issetmem)
{
  rtx_code_label *out_label, *top_label;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
                              NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
                               true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     the smallest power of two containing PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
         we can save registers by using single temporary.
         Also using 4 temporaries is overkill in 32bit mode.  */
      if (!TARGET_64BIT && 0)
        {
          for (i = 0; i < unroll; i++)
            {
              if (i)
                {
                  destmem = adjust_address (copy_rtx (destmem), mode,
                                            GET_MODE_SIZE (mode));
                  srcmem = adjust_address (copy_rtx (srcmem), mode,
                                           GET_MODE_SIZE (mode));
                }
              emit_move_insn (destmem, srcmem);
            }
        }
      else
        {
          rtx tmpreg[4];
          gcc_assert (unroll <= 4);
          for (i = 0; i < unroll; i++)
            {
              tmpreg[i] = gen_reg_rtx (mode);
              if (i)
                srcmem = adjust_address (copy_rtx (srcmem), mode,
                                         GET_MODE_SIZE (mode));
              emit_move_insn (tmpreg[i], srcmem);
            }
          for (i = 0; i < unroll; i++)
            {
              if (i)
                destmem = adjust_address (copy_rtx (destmem), mode,
                                          GET_MODE_SIZE (mode));
              emit_move_insn (destmem, tmpreg[i]);
            }
        }
    }
  else
    for (i = 0; i < unroll; i++)
      {
        if (i)
          destmem = adjust_address (copy_rtx (destmem), mode,
                                    GET_MODE_SIZE (mode));
        emit_move_insn (destmem, value);
      }

  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
                           true, top_label);
  if (expected_size != -1)
    {
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
        predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
        predict_jump (REG_BR_PROB_BASE - 1);
      else
        predict_jump (REG_BR_PROB_BASE
                      - (REG_BR_PROB_BASE + expected_size / 2)
                        / expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
                                 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
        emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}
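
/* In outline, the RTL emitted above roughly behaves like:

     size = count & ~(piece_size - 1);
     iter = 0;
     (for single-byte pieces: if (size == 0) goto out;)
   top:
     copy or set UNROLL chunks of MODE at dest + iter (and src + iter);
     iter += piece_size;
     if (iter < size) goto top;
   out:
     destptr += iter;  (and srcptr += iter for copies)  */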
/* Divide COUNTREG by SCALE.  */
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
                            GEN_INT (exact_log2 (scale)),
                            NULL, 1, OPTAB_DIRECT);
  return sc;
}
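
/* Example: for a DImode "rep movq", SCALE is 8, so a constant byte count
   of 40 becomes 5 iterations, and a register count is shifted right by
   exact_log2 (8) == 3.  SCALE is expected to be a power of two here.  */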
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have the same meaning as for the previous function.  */

static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
                              rtx destptr, rtx srcptr, rtx value, rtx orig_value,
                              rtx count,
                              machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
                                                       GET_MODE_SIZE (mode)));
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
                                GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
        = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
        srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
        {
          srcexp = gen_rtx_ASHIFT (Pmode, countreg,
                                   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
          srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
        }
      else
        srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
        {
          rounded_count
            = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
          srcmem = shallow_copy_rtx (srcmem);
          set_mem_size (srcmem, rounded_count);
        }
      else
        {
          if (MEM_SIZE_KNOWN_P (srcmem))
            clear_mem_size (srcmem);
        }
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
                              srcexp, destexp));
    }
}
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is the updated DST.  */

static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
         || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
          || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
        {
          move_mode = word_mode;
          piece_size = GET_MODE_SIZE (move_mode);
          code = optab_handler (mov_optab, move_mode);
        }
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
         a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      emit_move_insn (destptr,
                      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
                      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
                                          piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
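
/* Example: emit_memmov with size_to_move == 24 and DImode as the widest
   supported move emits three load/store pairs through a fresh temporary
   register, advancing DESTPTR and SRCPTR by 8 bytes after each pair.  */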
/* Helper function for the string operations below.  Test whether VARIABLE
   is aligned to VALUE bytes (i.e. the VALUE bit is clear).  If it is,
   jump to the label that is returned.  */

static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
                           1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
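
/* Example: ix86_expand_aligntest (count, 4, true) emits roughly
   "test $4, count; je .Lskip" and returns .Lskip, so the caller can emit
   a 4-byte copy or store that executes only when that bit of COUNT is
   set, and then place the label after it.  */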
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
                        rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest, tmp;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
         relaxed, but it'll require a bit more complicated epilogue
         expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
        {
          if (epilogue_size & i)
            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
        }
      return;
    }

  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count,
                                   GEN_INT (max_size - 1),
                                   count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
                                     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.  */
  if (TARGET_SINGLE_STRINGOP)
    {
      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          src = change_address (srcmem, HImode, srcptr);
          dest = change_address (destmem, HImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          src = change_address (srcmem, QImode, srcptr);
          dest = change_address (destmem, QImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
  else
    {
      rtx offset = force_reg (Pmode, const0_rtx);

      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, HImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, HImode, tmp);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, QImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, QImode, tmp);
          emit_move_insn (dest, src);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
}
/* This function emits stores to fill SIZE_TO_MOVE bytes starting from
   DESTMEM with value PROMOTED_VAL.
   Return value is the updated DST.  */

static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = word_mode;

  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      if (piece_size <= GET_MODE_SIZE (word_mode))
        {
          emit_insn (gen_strset (destptr, dst, promoted_val));
          dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                              piece_size);
          continue;
        }

      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
                      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
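
/* Example: a 16-byte memset with PROMOTED_VAL in V16QImode is emitted as
   one vector store; with PROMOTED_VAL in DImode on a 64-bit target the
   same 16 bytes become two 8-byte "strset" stores, advancing DESTPTR
   after each one.  */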
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
                                 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
                               GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
                                 gen_lowpart (QImode, value), count, QImode,
                                 1, max_size / 2, true);
}
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
                        rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
         relaxed, but it'll require a bit more complicated epilogue
         expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
        {
          if (epilogue_size & i)
            {
              if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
                destmem = emit_memset (destmem, destptr, vec_value, i);
              else
                destmem = emit_memset (destmem, destptr, value, i);
            }
        }
      return;
    }
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
/* Decrease COUNTREG by VALUE.  */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough of
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   used.
   Return value is updated DESTMEM.  */

static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx vec_value, rtx count, int align,
                               int desired_alignment, bool issetmem)
{
  int i;
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
        {
          rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
          if (issetmem)
            {
              if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
                destmem = emit_memset (destmem, destptr, vec_value, i);
              else
                destmem = emit_memset (destmem, destptr, value, i);
            }
          else
            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
          ix86_adjust_counter (count, i);
          emit_label (label);
          LABEL_NUSES (label) = 1;
          set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
        }
    }
  return destmem;
}
/* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr,
                               rtx value, rtx vec_value,
                               rtx count, int size,
                               rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
        {
          if (GET_MODE (value) == VOIDmode && size > 8)
            mode = Pmode;
          else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
            mode = GET_MODE (value);
        }
      else
        mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
        mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
        mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (mode, value));
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
                            GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
                               GET_MODE_SIZE (mode));
    }
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (mode, value));
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}
/* Handle a small memcpy (up to SIZE, which is supposed to be a small power
   of 2), and get ready for the main copy loop by copying the initial
   DESIRED_ALIGN - ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT so that we can proceed with a loop copying SIZE
   bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is the minimal size of the block copied.  This value gets
   adjusted for new bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   Assume that SIZE is 4.  Bigger sizes are handled analogously.

       copy 4 bytes from SRCPTR to DESTPTR
       copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4

       copy 1 byte from SRCPTR to DESTPTR

       copy 2 bytes from SRCPTR to DESTPTR
       copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2

   copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
   copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE

   OLD_DESTPTR = DESTPTR;
   Align DESTPTR up to DESIRED_ALIGN
   SRCPTR += DESTPTR - OLD_DESTPTR
   COUNT -= DESTPTR - OLD_DESTPTR

   Round COUNT down to multiple of SIZE
   << optional caller supplied zero size guard is here >>
   << optional caller supplied dynamic check is here >>
   << caller supplied main copy loop is here >>  */

static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
                                                            rtx *destptr, rtx *srcptr,
                                                            rtx value, rtx vec_value,
                                                            rtx_code_label **done_label,
                                                            unsigned HOST_WIDE_INT *min_size,
                                                            ...)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Chose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
        *done_label = gen_label_rtx ();

      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
                               1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      for (; size2 > 2; size2 >>= 1)
        expand_small_cpymem_or_setmem (destmem, srcmem,
                                       *destptr, *srcptr,
                                       value, vec_value,
                                       *count,
                                       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so.  */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
                               1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
        {
          srcmem = change_address (srcmem, QImode, *srcptr);
          emit_move_insn (destmem, srcmem);
        }

      /* Handle sizes 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
        {
          srcmem = change_address (srcmem, HImode, *srcptr);
          srcmem = offset_address (srcmem, *count, 1);
          srcmem = offset_address (srcmem, GEN_INT (-2), 2);
          emit_move_insn (destmem, srcmem);
        }

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
                || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
      emit_label (loop_label);
      LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, mode_value);
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }

  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
                            GEN_INT (-size - prolog_size),
                            GET_MODE_SIZE (mode));
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
                               GEN_INT (-size - prolog_size),
                               GET_MODE_SIZE (mode));
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
        emit_move_insn (destmem, mode_value);
      else
        {
          srcmem = offset_address (srcmem, modesize, 1);
          emit_move_insn (destmem, srcmem);
        }
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
                                      GEN_INT (prolog_size),
                                      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
        REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
                                      GEN_INT (-desired_align),
                                      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
                                       *destptr,
                                       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
        *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
                                       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
                                    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
        *min_size
          = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);

      /* Our loops always round down the block size, but for dispatch to
         library we need precise value.  */
      *count = expand_simple_binop (GET_MODE (*count), AND, *count,
                                    GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
        *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
                                      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
        *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
                                      (unsigned HOST_WIDE_INT)size));
      *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST, which
   is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
                                        rtx srcreg, rtx value, rtx vec_value,
                                        int desired_align, int align_bytes,
                                        bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
        {
          if (issetmem)
            {
              if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
                dst = emit_memset (dst, destreg, vec_value, piece_size);
              else
                dst = emit_memset (dst, destreg, value, piece_size);
            }
          else
            dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
          copied_bytes += piece_size;
        }
    }
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      int src_align_bytes = get_mem_align_offset (src, desired_align
                                                  * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
        src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
        {
          unsigned int src_align;
          for (src_align = desired_align; src_align >= 2; src_align >>= 1)
            {
              if ((src_align_bytes & (src_align - 1))
                  == (align_bytes & (src_align - 1)))
                break;
            }
          if (src_align > (unsigned int) desired_align)
            src_align = desired_align;
          if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
            set_mem_align (src, src_align * BITS_PER_UNIT);
        }
      if (MEM_SIZE_KNOWN_P (orig_src))
        set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  */
static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
        return false;
      if (fixed_regs[CX_REG]
          || fixed_regs[DI_REG]
          || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
        return false;
    }
  return true;
}
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
            unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
            bool memset, bool zero_memset, bool have_as,
            int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
          && (max_size < 256
              || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
        max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      if (!count || (count & 3) || (memset && !zero_memset))
        return alg_usable_p (rep_prefix_1_byte, memset, have_as)
               ? rep_prefix_1_byte : loop_1_byte;
      else
        return alg_usable_p (rep_prefix_4_byte, memset, have_as)
               ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     set up.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
        {
          /* We get here if the algorithms that were not libcall-based
             were rep-prefix based and we are unable to use rep prefixes
             based on global register usage.  Break out of the loop and
             use the heuristic below.  */
          if (algs->size[i].max == 0)
            break;
          if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
            {
              enum stringop_alg candidate = algs->size[i].alg;

              if (candidate != libcall
                  && alg_usable_p (candidate, memset, have_as))
                {
                  alg = candidate;
                  alg_noalign = algs->size[i].noalign;
                }
              /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
                 last non-libcall inline algorithm.  */
              if (TARGET_INLINE_ALL_STRINGOPS)
                {
                  /* When the current size is best to be copied by a libcall,
                     but we are still forced to inline, run the heuristic below
                     that will pick code for medium sized blocks.  */
                  if (alg != libcall)
                    {
                      *noalign = alg_noalign;
                      return alg;
                    }
                  else if (!any_alg_usable_p)
                    break;
                }
              else if (alg_usable_p (candidate, memset, have_as))
                {
                  *noalign = algs->size[i].noalign;
                  return candidate;
                }
            }
        }
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
          || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
         then recursing on smaller sizes or same size isn't going to
         find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
        {
          /* Pick something reasonable.  */
          if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
            *dynamic_check = 128;
          return loop_1_byte;
        }
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
                        zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
        *dynamic_check = max;
      else
        gcc_assert (alg != libcall);
      return alg;
    }
  return (alg_usable_p (algs->unknown_size, memset, have_as)
          ? algs->unknown_size : libcall);
}
/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
static int
decide_alignment (int align,
                  enum stringop_alg alg,
                  int expected_size,
                  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);

  if (move_mode == VOIDmode)
    return 1;

  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
     copying a whole cache line at once.  */
  if (TARGET_PENTIUMPRO
      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    desired_align = 8;

  if (desired_align < align)
    desired_align = align;
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}
7045 /* Helper function for memcpy. For QImode value 0xXY produce
7046 0xXYXYXYXY of wide specified by MODE. This is essentially
7047 a * 0x10101010, but we can do slightly better than
7048 synth_mult by unwinding the sequence by hand on CPUs with
7051 promote_duplicated_reg (machine_mode mode
, rtx val
)
7053 machine_mode valmode
= GET_MODE (val
);
7055 int nops
= mode
== DImode
? 3 : 2;
7057 gcc_assert (mode
== SImode
|| mode
== DImode
|| val
== const0_rtx
);
7058 if (val
== const0_rtx
)
7059 return copy_to_mode_reg (mode
, CONST0_RTX (mode
));
7060 if (CONST_INT_P (val
))
7062 HOST_WIDE_INT v
= INTVAL (val
) & 255;
7067 v
|= (v
<< 16) << 16;
7068 return copy_to_mode_reg (mode
, gen_int_mode (v
, mode
));
7071 if (valmode
== VOIDmode
)
7073 if (valmode
!= QImode
)
7074 val
= gen_lowpart (QImode
, val
);
7077 if (!TARGET_PARTIAL_REG_STALL
)
7079 if (ix86_cost
->mult_init
[mode
== DImode
? 3 : 2]
7080 + ix86_cost
->mult_bit
* (mode
== DImode
? 8 : 4)
7081 <= (ix86_cost
->shift_const
+ ix86_cost
->add
) * nops
7082 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL
== 0)))
7084 rtx reg
= convert_modes (mode
, QImode
, val
, true);
7085 tmp
= promote_duplicated_reg (mode
, const1_rtx
);
7086 return expand_simple_binop (mode
, MULT
, reg
, tmp
, NULL
, 1,
7091 rtx reg
= convert_modes (mode
, QImode
, val
, true);
7093 if (!TARGET_PARTIAL_REG_STALL
)
7094 emit_insn (gen_insv_1 (mode
, reg
, reg
));
7097 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (8),
7098 NULL
, 1, OPTAB_DIRECT
);
7099 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1,
7102 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (16),
7103 NULL
, 1, OPTAB_DIRECT
);
7104 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1, OPTAB_DIRECT
);
7107 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (32),
7108 NULL
, 1, OPTAB_DIRECT
);
7109 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1, OPTAB_DIRECT
);
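/* Illustrative sketch only (hypothetical helper, not part of GCC): the
   shift-and-or sequence emitted above is equivalent to multiplying the
   zero-extended byte by 0x01010101 (0x0101010101010101 for DImode),
   i.e. it replicates the low byte across the whole word.  */
static inline unsigned long long
replicate_byte_sketch (unsigned char xy, int bytes)
{
  unsigned long long v = xy;	/* 0x000000XY */
  v |= v << 8;			/* 0x0000XYXY */
  v |= v << 16;			/* 0xXYXYXYXY */
  if (bytes == 8)
    v |= v << 32;		/* 0xXYXYXYXYXYXYXYXY */
  return v;
}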
7114 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
7115 will be needed by the main loop copying SIZE_NEEDED chunks and by the
7116 prologue getting alignment from ALIGN to DESIRED_ALIGN. */
7118 promote_duplicated_reg_to_size (rtx val
, int size_needed
, int desired_align
,
7124 && (size_needed
> 4 || (desired_align
> align
&& desired_align
> 4)))
7125 promoted_val
= promote_duplicated_reg (DImode
, val
);
7126 else if (size_needed
> 2 || (desired_align
> align
&& desired_align
> 2))
7127 promoted_val
= promote_duplicated_reg (SImode
, val
);
7128 else if (size_needed
> 1 || (desired_align
> align
&& desired_align
> 1))
7129 promoted_val
= promote_duplicated_reg (HImode
, val
);
7133 return promoted_val
;
7136 /* Copy the address to a Pmode register. This is used for x32 to
7137 truncate a DImode TLS address to an SImode register. */
7140 ix86_copy_addr_to_reg (rtx addr
)
7143 if (GET_MODE (addr
) == Pmode
|| GET_MODE (addr
) == VOIDmode
)
7145 reg
= copy_addr_to_reg (addr
);
7146 REG_POINTER (reg
) = 1;
7151 gcc_assert (GET_MODE (addr
) == DImode
&& Pmode
== SImode
);
7152 reg
= copy_to_mode_reg (DImode
, addr
);
7153 REG_POINTER (reg
) = 1;
7154 return gen_rtx_SUBREG (SImode
, reg
, 0);
7158 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
7159 operations when profitable. The code depends upon architecture, block size
7160 and alignment, but always has one of the following overall structures:
7162 Aligned move sequence:
7164 1) Prologue guard: Conditional that jumps up to epilogues for small
7165 blocks that can be handled by the epilogue alone. This is faster
7166 but also needed for correctness, since the prologue assumes the block
7167 is larger than the desired alignment.
7169 Optional dynamic check for size and libcall for large
7170 blocks is emitted here too, with -minline-stringops-dynamically.
7172 2) Prologue: copy first few bytes in order to get destination
7173 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7174 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7175 copied. We emit either a jump tree on power of two sized
7176 blocks, or a byte loop.
7178 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7179 with specified algorithm.
7181 4) Epilogue: code copying tail of the block that is too small to be
7182 handled by main body (or up to size guarded by prologue guard).
7184 Misaligned move sequence
7186 1) misaligned move prologue/epilogue containing:
7187 a) Prologue handling small memory blocks and jumping to done_label
7188 (skipped if blocks are known to be large enough)
7189 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
7190 needed by single possibly misaligned move
7191 (skipped if alignment is not needed)
7192 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7194 2) Zero size guard dispatching to done_label, if needed
7196 3) Dispatch to library call, if needed,
7198 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7199 with specified algorithm. */
7201 ix86_expand_set_or_cpymem (rtx dst
, rtx src
, rtx count_exp
, rtx val_exp
,
7202 rtx align_exp
, rtx expected_align_exp
,
7203 rtx expected_size_exp
, rtx min_size_exp
,
7204 rtx max_size_exp
, rtx probable_max_size_exp
,
7209 rtx_code_label
*label
= NULL
;
7211 rtx_code_label
*jump_around_label
= NULL
;
7212 HOST_WIDE_INT align
= 1;
7213 unsigned HOST_WIDE_INT count
= 0;
7214 HOST_WIDE_INT expected_size
= -1;
7215 int size_needed
= 0, epilogue_size_needed
;
7216 int desired_align
= 0, align_bytes
= 0;
7217 enum stringop_alg alg
;
7218 rtx promoted_val
= NULL
;
7219 rtx vec_promoted_val
= NULL
;
7220 bool force_loopy_epilogue
= false;
7222 bool need_zero_guard
= false;
7224 machine_mode move_mode
= VOIDmode
;
7225 machine_mode wider_mode
;
7226 int unroll_factor
= 1;
7227 /* TODO: Once value ranges are available, fill in proper data. */
7228 unsigned HOST_WIDE_INT min_size
= 0;
7229 unsigned HOST_WIDE_INT max_size
= -1;
7230 unsigned HOST_WIDE_INT probable_max_size
= -1;
7231 bool misaligned_prologue_used
= false;
7234 if (CONST_INT_P (align_exp
))
7235 align
= INTVAL (align_exp
);
7236 /* i386 can do misaligned access at a reasonably increased cost. */
7237 if (CONST_INT_P (expected_align_exp
)
7238 && INTVAL (expected_align_exp
) > align
)
7239 align
= INTVAL (expected_align_exp
);
7240 /* ALIGN is the minimum of destination and source alignment, but we care here
7241 just about destination alignment. */
7243 && MEM_ALIGN (dst
) > (unsigned HOST_WIDE_INT
) align
* BITS_PER_UNIT
)
7244 align
= MEM_ALIGN (dst
) / BITS_PER_UNIT
;
7246 if (CONST_INT_P (count_exp
))
7248 min_size
= max_size
= probable_max_size
= count
= expected_size
7249 = INTVAL (count_exp
);
7250 /* When COUNT is 0, there is nothing to do. */
7257 min_size
= INTVAL (min_size_exp
);
7259 max_size
= INTVAL (max_size_exp
);
7260 if (probable_max_size_exp
)
7261 probable_max_size
= INTVAL (probable_max_size_exp
);
7262 if (CONST_INT_P (expected_size_exp
))
7263 expected_size
= INTVAL (expected_size_exp
);
7266 /* Make sure we don't need to care about overflow later on. */
7267 if (count
> (HOST_WIDE_INT_1U
<< 30))
7270 have_as
= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst
));
7272 have_as
|= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src
));
7274 /* Step 0: Decide on preferred algorithm, desired alignment and
7275 size of chunks to be copied by main loop. */
7276 alg
= decide_alg (count
, expected_size
, min_size
, probable_max_size
,
7278 issetmem
&& val_exp
== const0_rtx
, have_as
,
7279 &dynamic_check
, &noalign
, false);
7282 fprintf (dump_file
, "Selected stringop expansion strategy: %s\n",
7283 stringop_alg_names
[alg
]);
7287 gcc_assert (alg
!= no_stringop
);
7289 /* For now the vector version of memset is generated only for memory zeroing,
7290 as creating the promoted vector value is very cheap in this case. */
7291 if (issetmem
&& alg
== vector_loop
&& val_exp
!= const0_rtx
)
7292 alg
= unrolled_loop
;
7295 count_exp
= copy_to_mode_reg (GET_MODE (count_exp
), count_exp
);
7296 destreg
= ix86_copy_addr_to_reg (XEXP (dst
, 0));
7298 srcreg
= ix86_copy_addr_to_reg (XEXP (src
, 0));
7301 move_mode
= word_mode
;
7309 need_zero_guard
= true;
7313 need_zero_guard
= true;
7316 need_zero_guard
= true;
7317 unroll_factor
= (TARGET_64BIT
? 4 : 2);
7320 need_zero_guard
= true;
7322 /* Find the widest supported mode. */
7323 move_mode
= word_mode
;
7324 while (GET_MODE_WIDER_MODE (move_mode
).exists (&wider_mode
)
7325 && optab_handler (mov_optab
, wider_mode
) != CODE_FOR_nothing
)
7326 move_mode
= wider_mode
;
7328 if (TARGET_AVX256_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 128)
7331 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7332 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7333 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
7335 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
7336 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
7337 || optab_handler (mov_optab
, move_mode
) == CODE_FOR_nothing
)
7338 move_mode
= word_mode
;
7340 gcc_assert (optab_handler (mov_optab
, move_mode
) != CODE_FOR_nothing
);
7342 case rep_prefix_8_byte
:
7345 case rep_prefix_4_byte
:
7348 case rep_prefix_1_byte
:
7352 size_needed
= GET_MODE_SIZE (move_mode
) * unroll_factor
;
7353 epilogue_size_needed
= size_needed
;
7355 /* If we are going to emit any library calls conditionally, make sure any
7356 pending stack adjustments happen before the first conditional branch,
7357 otherwise they will be emitted before the library call only and won't
7358 happen from the other branches. */
7359 if (dynamic_check
!= -1)
7360 do_pending_stack_adjust ();
7362 desired_align
= decide_alignment (align
, alg
, expected_size
, move_mode
);
7363 if (!TARGET_ALIGN_STRINGOPS
|| noalign
)
7364 align
= desired_align
;
7366 /* Step 1: Prologue guard. */
7368 /* Alignment code needs count to be in register. */
7369 if (CONST_INT_P (count_exp
) && desired_align
> align
)
7371 if (INTVAL (count_exp
) > desired_align
7372 && INTVAL (count_exp
) > size_needed
)
7375 = get_mem_align_offset (dst
, desired_align
* BITS_PER_UNIT
);
7376 if (align_bytes
<= 0)
7379 align_bytes
= desired_align
- align_bytes
;
7381 if (align_bytes
== 0)
7382 count_exp
= force_reg (counter_mode (count_exp
), count_exp
);
7384 gcc_assert (desired_align
>= 1 && align
>= 1);
7386 /* Misaligned move sequences handle both prologue and epilogue at once.
7387 Default code generation results in smaller code for large alignments
7388 and also avoids redundant work when sizes are known precisely. */
7389 misaligned_prologue_used
7390 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7391 && MAX (desired_align
, epilogue_size_needed
) <= 32
7392 && desired_align
<= epilogue_size_needed
7393 && ((desired_align
> align
&& !align_bytes
)
7394 || (!count
&& epilogue_size_needed
> 1)));
7396 /* Do the cheap promotion to allow better CSE across the
7397 main loop and epilogue (i.e. one load of the big constant up front).
7399 For now the misaligned move sequences do not have a fast path
7400 without broadcasting. */
7401 if (issetmem
&& ((CONST_INT_P (val_exp
) || misaligned_prologue_used
)))
7403 if (alg
== vector_loop
)
7405 gcc_assert (val_exp
== const0_rtx
);
7406 vec_promoted_val
= promote_duplicated_reg (move_mode
, val_exp
);
7407 promoted_val
= promote_duplicated_reg_to_size (val_exp
,
7408 GET_MODE_SIZE (word_mode
),
7409 desired_align
, align
);
7413 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7414 desired_align
, align
);
7417 /* Misaligned move sequences handle both prologues and epilogues at once.
7418 Default code generation results in smaller code for large alignments and
7419 also avoids redundant work when sizes are known precisely. */
7420 if (misaligned_prologue_used
)
7422 /* Misaligned move prologue handles small blocks by itself. */
7423 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7424 (dst
, src
, &destreg
, &srcreg
,
7425 move_mode
, promoted_val
, vec_promoted_val
,
7428 desired_align
< align
7429 ? MAX (desired_align
, epilogue_size_needed
) : epilogue_size_needed
,
7430 desired_align
, align
, &min_size
, dynamic_check
, issetmem
);
7432 src
= change_address (src
, BLKmode
, srcreg
);
7433 dst
= change_address (dst
, BLKmode
, destreg
);
7434 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
7435 epilogue_size_needed
= 0;
7437 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
)
7439 /* It is possible that we copied enough so the main loop will not execute. */
7441 gcc_assert (size_needed
> 1);
7442 if (jump_around_label
== NULL_RTX
)
7443 jump_around_label
= gen_label_rtx ();
7444 emit_cmp_and_jump_insns (count_exp
,
7445 GEN_INT (size_needed
),
7446 LTU
, 0, counter_mode (count_exp
), 1, jump_around_label
);
7447 if (expected_size
== -1
7448 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7449 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7451 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7454 /* Ensure that the alignment prologue won't copy past the end of the block. */
7455 else if (size_needed
> 1 || (desired_align
> 1 && desired_align
> align
))
7457 epilogue_size_needed
= MAX (size_needed
- 1, desired_align
- align
);
7458 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7459 Make sure it is a power of 2. */
7460 epilogue_size_needed
= 1 << (floor_log2 (epilogue_size_needed
) + 1);
7462 /* To improve performance of small blocks, we jump around the VAL
7463 promoting mode. This means that if the promoted VAL is not constant,
7464 we might not use it in the epilogue and have to use a byte loop instead. */
7466 if (issetmem
&& epilogue_size_needed
> 2 && !promoted_val
)
7467 force_loopy_epilogue
= true;
7468 if ((count
&& count
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7469 || max_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7471 /* If the main algorithm works on QImode, no epilogue is needed.
7472 For small sizes just don't align anything. */
7473 if (size_needed
== 1)
7474 desired_align
= align
;
7479 && min_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7481 label
= gen_label_rtx ();
7482 emit_cmp_and_jump_insns (count_exp
,
7483 GEN_INT (epilogue_size_needed
),
7484 LTU
, 0, counter_mode (count_exp
), 1, label
);
7485 if (expected_size
== -1 || expected_size
< epilogue_size_needed
)
7486 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7488 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7492 /* Emit code to decide at run time whether a library call or inline code should be used. */
7494 if (dynamic_check
!= -1)
7496 if (!issetmem
&& CONST_INT_P (count_exp
))
7498 if (UINTVAL (count_exp
) >= (unsigned HOST_WIDE_INT
)dynamic_check
)
7500 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7501 count_exp
= const0_rtx
;
7507 rtx_code_label
*hot_label
= gen_label_rtx ();
7508 if (jump_around_label
== NULL_RTX
)
7509 jump_around_label
= gen_label_rtx ();
7510 emit_cmp_and_jump_insns (count_exp
, GEN_INT (dynamic_check
- 1),
7511 LEU
, 0, counter_mode (count_exp
),
7513 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
7515 set_storage_via_libcall (dst
, count_exp
, val_exp
);
7517 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7518 emit_jump (jump_around_label
);
7519 emit_label (hot_label
);
7523 /* Step 2: Alignment prologue. */
7524 /* Do the expensive promotion once we have branched off the small blocks. */
7525 if (issetmem
&& !promoted_val
)
7526 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7527 desired_align
, align
);
7529 if (desired_align
> align
&& !misaligned_prologue_used
)
7531 if (align_bytes
== 0)
7533 /* Except for the first move in prologue, we no longer know
7534 constant offset in aliasing info. It doesn't seem worth the pain
7535 to maintain it for the first move, so throw the info away early. */
7537 dst
= change_address (dst
, BLKmode
, destreg
);
7539 src
= change_address (src
, BLKmode
, srcreg
);
7540 dst
= expand_set_or_cpymem_prologue (dst
, src
, destreg
, srcreg
,
7541 promoted_val
, vec_promoted_val
,
7542 count_exp
, align
, desired_align
,
7544 /* At most desired_align - align bytes are copied. */
7545 if (min_size
< (unsigned)(desired_align
- align
))
7548 min_size
-= desired_align
- align
;
7552 /* If we know how many bytes need to be stored before dst is
7553 sufficiently aligned, maintain aliasing info accurately. */
7554 dst
= expand_set_or_cpymem_constant_prologue (dst
, &src
, destreg
,
7562 count_exp
= plus_constant (counter_mode (count_exp
),
7563 count_exp
, -align_bytes
);
7564 count
-= align_bytes
;
7565 min_size
-= align_bytes
;
7566 max_size
-= align_bytes
;
7569 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
7570 && (count
< (unsigned HOST_WIDE_INT
) size_needed
7571 || (align_bytes
== 0
7572 && count
< ((unsigned HOST_WIDE_INT
) size_needed
7573 + desired_align
- align
))))
7575 /* It is possible that we copied enough so the main loop will not execute. */
7577 gcc_assert (size_needed
> 1);
7578 if (label
== NULL_RTX
)
7579 label
= gen_label_rtx ();
7580 emit_cmp_and_jump_insns (count_exp
,
7581 GEN_INT (size_needed
),
7582 LTU
, 0, counter_mode (count_exp
), 1, label
);
7583 if (expected_size
== -1
7584 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7585 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7587 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7590 if (label
&& size_needed
== 1)
7593 LABEL_NUSES (label
) = 1;
7595 epilogue_size_needed
= 1;
7597 promoted_val
= val_exp
;
7599 else if (label
== NULL_RTX
&& !misaligned_prologue_used
)
7600 epilogue_size_needed
= size_needed
;
7602 /* Step 3: Main loop. */
7613 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
, promoted_val
,
7614 count_exp
, move_mode
, unroll_factor
,
7615 expected_size
, issetmem
);
7618 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
,
7619 vec_promoted_val
, count_exp
, move_mode
,
7620 unroll_factor
, expected_size
, issetmem
);
7622 case rep_prefix_8_byte
:
7623 case rep_prefix_4_byte
:
7624 case rep_prefix_1_byte
:
7625 expand_set_or_cpymem_via_rep (dst
, src
, destreg
, srcreg
, promoted_val
,
7626 val_exp
, count_exp
, move_mode
, issetmem
);
7629 /* Properly adjust the offsets of src and dest memory for aliasing. */
7630 if (CONST_INT_P (count_exp
))
7633 src
= adjust_automodify_address_nv (src
, BLKmode
, srcreg
,
7634 (count
/ size_needed
) * size_needed
);
7635 dst
= adjust_automodify_address_nv (dst
, BLKmode
, destreg
,
7636 (count
/ size_needed
) * size_needed
);
7641 src
= change_address (src
, BLKmode
, srcreg
);
7642 dst
= change_address (dst
, BLKmode
, destreg
);
7645 /* Step 4: Epilogue to copy the remaining bytes. */
7649 /* When the main loop is done, COUNT_EXP might hold original count,
7650 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
7651 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
7652 bytes. Compensate if needed. */
7654 if (size_needed
< epilogue_size_needed
)
7656 tmp
= expand_simple_binop (counter_mode (count_exp
), AND
, count_exp
,
7657 GEN_INT (size_needed
- 1), count_exp
, 1,
7659 if (tmp
!= count_exp
)
7660 emit_move_insn (count_exp
, tmp
);
7663 LABEL_NUSES (label
) = 1;
7666 if (count_exp
!= const0_rtx
&& epilogue_size_needed
> 1)
7668 if (force_loopy_epilogue
)
7669 expand_setmem_epilogue_via_loop (dst
, destreg
, val_exp
, count_exp
,
7670 epilogue_size_needed
);
7674 expand_setmem_epilogue (dst
, destreg
, promoted_val
,
7675 vec_promoted_val
, count_exp
,
7676 epilogue_size_needed
);
7678 expand_cpymem_epilogue (dst
, src
, destreg
, srcreg
, count_exp
,
7679 epilogue_size_needed
);
7682 if (jump_around_label
)
7683 emit_label (jump_around_label
);
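/* Illustrative sketch only (hypothetical helper, never emitted or called by
   GCC): a plain-C model of the aligned memset expansion shape documented
   above -- prologue guard, alignment prologue, main loop of SIZE_NEEDED
   chunks, and an epilogue for the tail.  The real expander works on RTL and
   may instead use rep prefixes, vector modes, unrolling or a dynamic
   libcall check.  */
static void
memset_shape_sketch (unsigned char *dst, unsigned char val, unsigned long count)
{
  unsigned long i = 0;
  unsigned long chunk = sizeof (unsigned long);
  unsigned long word = (~0UL / 0xff) * val;		/* promoted (duplicated) value */

  if (count >= chunk)					/* 1) prologue guard */
    {
      while (((unsigned long) (dst + i)) % chunk)	/* 2) alignment prologue */
	dst[i++] = val;
      for (; i + chunk <= count; i += chunk)		/* 3) main body */
	__builtin_memcpy (dst + i, &word, chunk);
    }
  while (i < count)					/* 4) epilogue */
    dst[i++] = val;
}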
7687 /* Expand cmpstrn or memcmp. */
7690 ix86_expand_cmpstrn_or_cmpmem (rtx result
, rtx src1
, rtx src2
,
7691 rtx length
, rtx align
, bool is_cmpstrn
)
7693 /* Expand strncmp and memcmp only with -minline-all-stringops since
7694 "repz cmpsb" can be much slower than strncmp and memcmp functions
7695 implemented with vector instructions, see
7697 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
7699 if (!TARGET_INLINE_ALL_STRINGOPS
)
7702 /* Can't use this if the user has appropriated ecx, esi or edi. */
7703 if (fixed_regs
[CX_REG
] || fixed_regs
[SI_REG
] || fixed_regs
[DI_REG
])
7708 /* For strncmp, length is the maximum length, which can be larger
7709 than actual string lengths. We can expand the cmpstrn pattern
7710 to "repz cmpsb" only if one of the strings is a constant so
7711 that expand_builtin_strncmp() can write the length argument to
7712 be the minimum of the const string length and the actual length
7713 argument. Otherwise, "repz cmpsb" may run past the terminating zero byte. */
7714 tree t1
= MEM_EXPR (src1
);
7715 tree t2
= MEM_EXPR (src2
);
7716 if (!((t1
&& TREE_CODE (t1
) == MEM_REF
7717 && TREE_CODE (TREE_OPERAND (t1
, 0)) == ADDR_EXPR
7718 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1
, 0), 0))
7720 || (t2
&& TREE_CODE (t2
) == MEM_REF
7721 && TREE_CODE (TREE_OPERAND (t2
, 0)) == ADDR_EXPR
7722 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2
, 0), 0))
7727 rtx addr1
= copy_addr_to_reg (XEXP (src1
, 0));
7728 rtx addr2
= copy_addr_to_reg (XEXP (src2
, 0));
7729 if (addr1
!= XEXP (src1
, 0))
7730 src1
= replace_equiv_address_nv (src1
, addr1
);
7731 if (addr2
!= XEXP (src2
, 0))
7732 src2
= replace_equiv_address_nv (src2
, addr2
);
7734 /* NB: Make a copy of the data length to avoid changing the original
7735 data length by cmpstrnqi patterns. */
7736 length
= ix86_zero_extend_to_Pmode (length
);
7737 rtx lengthreg
= gen_reg_rtx (Pmode
);
7738 emit_move_insn (lengthreg
, length
);
7740 /* If we are testing strict equality, we can use known alignment to
7741 good advantage. This may be possible with combine, particularly
7742 once cc0 is dead. */
7743 if (CONST_INT_P (length
))
7745 if (length
== const0_rtx
)
7747 emit_move_insn (result
, const0_rtx
);
7750 emit_insn (gen_cmpstrnqi_nz_1 (addr1
, addr2
, lengthreg
, align
,
7755 emit_insn (gen_cmp_1 (Pmode
, lengthreg
, lengthreg
));
7756 emit_insn (gen_cmpstrnqi_1 (addr1
, addr2
, lengthreg
, align
,
7760 rtx out
= gen_lowpart (QImode
, result
);
7761 emit_insn (gen_cmpintqi (out
));
7762 emit_move_insn (result
, gen_rtx_SIGN_EXTEND (SImode
, out
));
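/* Illustrative sketch only (hypothetical helper, not part of GCC): "repz
   cmpsb" behaves like the loop below -- it keeps comparing bytes until the
   count is exhausted or a mismatch is found, and does NOT stop at a NUL
   byte.  That is why the expansion above requires one constant string, so
   that the caller can clamp LENGTH to that string's length first.  */
static int
repz_cmpsb_sketch (const unsigned char *s1, const unsigned char *s2,
		   unsigned long count)
{
  while (count--)
    {
      if (*s1 != *s2)
	return *s1 < *s2 ? -1 : 1;
      s1++, s2++;
    }
  return 0;
}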
7767 /* Expand the appropriate insns for doing strlen if not just doing repnz; scasb
7770 out = result, initialized with the start address
7771 align_rtx = alignment of the address.
7772 scratch = scratch register, initialized with the start address when
7773 not aligned, otherwise undefined
7775 This is just the body. It needs the initializations mentioned above and
7776 some address computing at the end. These things are done in i386.md. */
7779 ix86_expand_strlensi_unroll_1 (rtx out
, rtx src
, rtx align_rtx
)
7783 rtx_code_label
*align_2_label
= NULL
;
7784 rtx_code_label
*align_3_label
= NULL
;
7785 rtx_code_label
*align_4_label
= gen_label_rtx ();
7786 rtx_code_label
*end_0_label
= gen_label_rtx ();
7788 rtx tmpreg
= gen_reg_rtx (SImode
);
7789 rtx scratch
= gen_reg_rtx (SImode
);
7793 if (CONST_INT_P (align_rtx
))
7794 align
= INTVAL (align_rtx
);
7796 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7798 /* Is there a known alignment and is it less than 4? */
7801 rtx scratch1
= gen_reg_rtx (Pmode
);
7802 emit_move_insn (scratch1
, out
);
7803 /* Is there a known alignment and is it not 2? */
7806 align_3_label
= gen_label_rtx (); /* Label when aligned to 3-byte */
7807 align_2_label
= gen_label_rtx (); /* Label when aligned to 2-byte */
7809 /* Leave just the 3 lower bits. */
7810 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, GEN_INT (3),
7811 NULL_RTX
, 0, OPTAB_WIDEN
);
7813 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7814 Pmode
, 1, align_4_label
);
7815 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, EQ
, NULL
,
7816 Pmode
, 1, align_2_label
);
7817 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, GTU
, NULL
,
7818 Pmode
, 1, align_3_label
);
7822 /* Since the alignment is 2, we have to check 2 or 0 bytes;
7823 check whether it is aligned to 4 bytes. */
7825 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, const2_rtx
,
7826 NULL_RTX
, 0, OPTAB_WIDEN
);
7828 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7829 Pmode
, 1, align_4_label
);
7832 mem
= change_address (src
, QImode
, out
);
7834 /* Now compare the bytes. */
7836 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
7837 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
,
7838 QImode
, 1, end_0_label
);
7840 /* Increment the address. */
7841 emit_insn (gen_add2_insn (out
, const1_rtx
));
7843 /* Not needed with an alignment of 2 */
7846 emit_label (align_2_label
);
7848 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7851 emit_insn (gen_add2_insn (out
, const1_rtx
));
7853 emit_label (align_3_label
);
7856 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7859 emit_insn (gen_add2_insn (out
, const1_rtx
));
7862 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
7863 align this loop: it only bloats the program and does not help to speed it up. */
7865 emit_label (align_4_label
);
7867 mem
= change_address (src
, SImode
, out
);
7868 emit_move_insn (scratch
, mem
);
7869 emit_insn (gen_add2_insn (out
, GEN_INT (4)));
7871 /* This formula yields a nonzero result iff one of the bytes is zero.
7872 This saves three branches inside the loop and many cycles. */
7874 emit_insn (gen_addsi3 (tmpreg
, scratch
, GEN_INT (-0x01010101)));
7875 emit_insn (gen_one_cmplsi2 (scratch
, scratch
));
7876 emit_insn (gen_andsi3 (tmpreg
, tmpreg
, scratch
));
7877 emit_insn (gen_andsi3 (tmpreg
, tmpreg
,
7878 gen_int_mode (0x80808080, SImode
)));
7879 emit_cmp_and_jump_insns (tmpreg
, const0_rtx
, EQ
, 0, SImode
, 1,
7884 rtx reg
= gen_reg_rtx (SImode
);
7885 rtx reg2
= gen_reg_rtx (Pmode
);
7886 emit_move_insn (reg
, tmpreg
);
7887 emit_insn (gen_lshrsi3 (reg
, reg
, GEN_INT (16)));
7889 /* If zero is not in the first two bytes, move two bytes forward. */
7890 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7891 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7892 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7893 emit_insn (gen_rtx_SET (tmpreg
,
7894 gen_rtx_IF_THEN_ELSE (SImode
, tmp
,
7897 /* Emit lea manually to avoid clobbering of flags. */
7898 emit_insn (gen_rtx_SET (reg2
, plus_constant (Pmode
, out
, 2)));
7900 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7901 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7902 emit_insn (gen_rtx_SET (out
,
7903 gen_rtx_IF_THEN_ELSE (Pmode
, tmp
,
7909 rtx_code_label
*end_2_label
= gen_label_rtx ();
7910 /* Is zero in the first two bytes? */
7912 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7913 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7914 tmp
= gen_rtx_NE (VOIDmode
, tmp
, const0_rtx
);
7915 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
7916 gen_rtx_LABEL_REF (VOIDmode
, end_2_label
),
7918 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
7919 JUMP_LABEL (tmp
) = end_2_label
;
7921 /* Not in the first two. Move two bytes forward. */
7922 emit_insn (gen_lshrsi3 (tmpreg
, tmpreg
, GEN_INT (16)));
7923 emit_insn (gen_add2_insn (out
, const2_rtx
));
7925 emit_label (end_2_label
);
7929 /* Avoid branch in fixing the byte. */
7930 tmpreg
= gen_lowpart (QImode
, tmpreg
);
7931 emit_insn (gen_addqi3_cconly_overflow (tmpreg
, tmpreg
));
7932 tmp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
7933 cmp
= gen_rtx_LTU (VOIDmode
, tmp
, const0_rtx
);
7934 emit_insn (gen_sub3_carry (Pmode
, out
, out
, GEN_INT (3), tmp
, cmp
));
7936 emit_label (end_0_label
);
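/* Illustrative sketch only (hypothetical helper, not part of GCC): the
   word-at-a-time test built above is the classic "contains a zero byte"
   trick.  For a 32-bit word W the emitted insns compute
   (W - 0x01010101) & ~W & 0x80808080, which is nonzero iff some byte of W
   is zero, so a single branch per word replaces four byte tests.  */
static inline int
word_has_zero_byte_sketch (unsigned int w)
{
  return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
}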
7939 /* Expand strlen. */
7942 ix86_expand_strlen (rtx out
, rtx src
, rtx eoschar
, rtx align
)
7944 if (TARGET_UNROLL_STRLEN
7945 && TARGET_INLINE_ALL_STRINGOPS
7946 && eoschar
== const0_rtx
7949 /* The generic case of the strlen expander is long. Avoid expanding it
7950 unless TARGET_INLINE_ALL_STRINGOPS. */
7951 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
7952 /* Well it seems that some optimizer does not combine a call like
7953 foo(strlen(bar), strlen(bar));
7954 when the move and the subtraction are done here. It does calculate
7955 the length just once when these instructions are done inside of
7956 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
7957 often used and I use one fewer register for the lifetime of
7958 output_strlen_unroll() this is better. */
7960 emit_move_insn (out
, addr
);
7962 ix86_expand_strlensi_unroll_1 (out
, src
, align
);
7964 /* strlensi_unroll_1 returns the address of the zero at the end of
7965 the string, like memchr(), so compute the length by subtracting
7966 the start address. */
7967 emit_insn (gen_sub2_insn (out
, addr
));
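/* Illustrative sketch only (hypothetical helper, not part of GCC): the
   subtraction just emitted mirrors the usual "scan to the terminating NUL,
   then subtract the start address" way of computing strlen:  */
static unsigned long
strlen_shape_sketch (const char *s)
{
  const char *p = s;
  while (*p)	/* what ix86_expand_strlensi_unroll_1 models, a word at a time */
    p++;
  return (unsigned long) (p - s);	/* end-of-string address minus start */
}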
7974 /* For a given symbol (function), construct code to compute the address of its
7975 PLT entry in the large x86-64 PIC model. */
7978 construct_plt_address (rtx symbol
)
7982 gcc_assert (GET_CODE (symbol
) == SYMBOL_REF
);
7983 gcc_assert (ix86_cmodel
== CM_LARGE_PIC
&& !TARGET_PECOFF
);
7984 gcc_assert (Pmode
== DImode
);
7986 tmp
= gen_reg_rtx (Pmode
);
7987 unspec
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, symbol
), UNSPEC_PLTOFF
);
7989 emit_move_insn (tmp
, gen_rtx_CONST (Pmode
, unspec
));
7990 emit_insn (gen_add2_insn (tmp
, pic_offset_table_rtx
));
7994 /* Additional registers that are clobbered by SYSV calls. */
7996 static int const x86_64_ms_sysv_extra_clobbered_registers
7997 [NUM_X86_64_MS_CLOBBERED_REGS
] =
8001 XMM8_REG
, XMM9_REG
, XMM10_REG
, XMM11_REG
,
8002 XMM12_REG
, XMM13_REG
, XMM14_REG
, XMM15_REG
8006 ix86_expand_call (rtx retval
, rtx fnaddr
, rtx callarg1
,
8008 rtx pop
, bool sibcall
)
8011 rtx use
= NULL
, call
;
8012 unsigned int vec_len
= 0;
8015 if (GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
8017 fndecl
= SYMBOL_REF_DECL (XEXP (fnaddr
, 0));
8019 && (lookup_attribute ("interrupt",
8020 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
)))))
8021 error ("interrupt service routine cannot be called directly");
8026 if (pop
== const0_rtx
)
8028 gcc_assert (!TARGET_64BIT
|| !pop
);
8030 if (TARGET_MACHO
&& !TARGET_64BIT
)
8033 if (flag_pic
&& GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
8034 fnaddr
= machopic_indirect_call_target (fnaddr
);
8039 /* Static functions and indirect calls don't need the pic register. Also,
8040 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
8041 it an indirect call. */
8042 rtx addr
= XEXP (fnaddr
, 0);
8044 && GET_CODE (addr
) == SYMBOL_REF
8045 && !SYMBOL_REF_LOCAL_P (addr
))
8048 && (SYMBOL_REF_DECL (addr
) == NULL_TREE
8049 || !lookup_attribute ("noplt",
8050 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr
)))))
8053 || (ix86_cmodel
== CM_LARGE_PIC
8054 && DEFAULT_ABI
!= MS_ABI
))
8056 use_reg (&use
, gen_rtx_REG (Pmode
,
8057 REAL_PIC_OFFSET_TABLE_REGNUM
));
8058 if (ix86_use_pseudo_pic_reg ())
8059 emit_move_insn (gen_rtx_REG (Pmode
,
8060 REAL_PIC_OFFSET_TABLE_REGNUM
),
8061 pic_offset_table_rtx
);
8064 else if (!TARGET_PECOFF
&& !TARGET_MACHO
)
8067 && ix86_cmodel
== CM_LARGE_PIC
8068 && DEFAULT_ABI
!= MS_ABI
)
8070 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
8072 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8073 fnaddr
= force_reg (Pmode
, fnaddr
);
8074 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
, fnaddr
);
8076 else if (TARGET_64BIT
)
8078 fnaddr
= gen_rtx_UNSPEC (Pmode
,
8079 gen_rtvec (1, addr
),
8081 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8085 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
8087 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8088 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
,
8091 fnaddr
= gen_const_mem (Pmode
, fnaddr
);
8092 /* Pmode may not be the same as word_mode for x32, which
8093 doesn't support indirect branch via 32-bit memory slot.
8094 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8095 indirect branch via x32 GOT slot is OK. */
8096 if (GET_MODE (fnaddr
) != word_mode
)
8097 fnaddr
= gen_rtx_ZERO_EXTEND (word_mode
, fnaddr
);
8098 fnaddr
= gen_rtx_MEM (QImode
, fnaddr
);
8103 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8104 parameters passed in vector registers. */
8106 && (INTVAL (callarg2
) > 0
8107 || (INTVAL (callarg2
) == 0
8108 && (TARGET_SSE
|| !flag_skip_rax_setup
))))
8110 rtx al
= gen_rtx_REG (QImode
, AX_REG
);
8111 emit_move_insn (al
, callarg2
);
8115 if (ix86_cmodel
== CM_LARGE_PIC
8118 && GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
8119 && !local_symbolic_operand (XEXP (fnaddr
, 0), VOIDmode
))
8120 fnaddr
= gen_rtx_MEM (QImode
, construct_plt_address (XEXP (fnaddr
, 0)));
8121 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8122 branch via x32 GOT slot is OK. */
8123 else if (!(TARGET_X32
8125 && GET_CODE (XEXP (fnaddr
, 0)) == ZERO_EXTEND
8126 && GOT_memory_operand (XEXP (XEXP (fnaddr
, 0), 0), Pmode
))
8128 ? !sibcall_insn_operand (XEXP (fnaddr
, 0), word_mode
)
8129 : !call_insn_operand (XEXP (fnaddr
, 0), word_mode
)))
8131 fnaddr
= convert_to_mode (word_mode
, XEXP (fnaddr
, 0), 1);
8132 fnaddr
= gen_rtx_MEM (QImode
, copy_to_mode_reg (word_mode
, fnaddr
));
8135 call
= gen_rtx_CALL (VOIDmode
, fnaddr
, callarg1
);
8138 call
= gen_rtx_SET (retval
, call
);
8139 vec
[vec_len
++] = call
;
8143 pop
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, pop
);
8144 pop
= gen_rtx_SET (stack_pointer_rtx
, pop
);
8145 vec
[vec_len
++] = pop
;
8148 if (cfun
->machine
->no_caller_saved_registers
8150 || (!TREE_THIS_VOLATILE (fndecl
)
8151 && !lookup_attribute ("no_caller_saved_registers",
8152 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))))
8154 static const char ix86_call_used_regs
[] = CALL_USED_REGISTERS
;
8155 bool is_64bit_ms_abi
= (TARGET_64BIT
8156 && ix86_function_abi (fndecl
) == MS_ABI
);
8157 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
8159 /* If there are no caller-saved registers, add all registers
8160 that are clobbered by the call which returns. */
8161 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
8163 && (ix86_call_used_regs
[i
] == 1
8164 || (ix86_call_used_regs
[i
] & c_mask
))
8165 && !STACK_REGNO_P (i
)
8166 && !MMX_REGNO_P (i
))
8168 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
8170 else if (TARGET_64BIT_MS_ABI
8171 && (!callarg2
|| INTVAL (callarg2
) != -2))
8175 for (i
= 0; i
< NUM_X86_64_MS_CLOBBERED_REGS
; i
++)
8177 int regno
= x86_64_ms_sysv_extra_clobbered_registers
[i
];
8178 machine_mode mode
= SSE_REGNO_P (regno
) ? TImode
: DImode
;
8180 clobber_reg (&use
, gen_rtx_REG (mode
, regno
));
8183 /* Set here, but it may get cleared later. */
8184 if (TARGET_CALL_MS2SYSV_XLOGUES
)
8189 /* Don't break hot-patched functions. */
8190 else if (ix86_function_ms_hook_prologue (current_function_decl
))
8193 /* TODO: Cases not yet examined. */
8194 else if (flag_split_stack
)
8195 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8199 gcc_assert (!reload_completed
);
8200 cfun
->machine
->call_ms2sysv
= true;
8206 call
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec_v (vec_len
, vec
));
8207 rtx_insn
*call_insn
= emit_call_insn (call
);
8209 CALL_INSN_FUNCTION_USAGE (call_insn
) = use
;
8214 /* Split a simple return that pops POPC bytes from the stack into an indirect
8215 branch with a stack adjustment. */
8218 ix86_split_simple_return_pop_internal (rtx popc
)
8220 struct machine_function
*m
= cfun
->machine
;
8221 rtx ecx
= gen_rtx_REG (SImode
, CX_REG
);
8224 /* There is no "pascal" calling convention in any 64-bit ABI. */
8225 gcc_assert (!TARGET_64BIT
);
8227 insn
= emit_insn (gen_pop (ecx
));
8228 m
->fs
.cfa_offset
-= UNITS_PER_WORD
;
8229 m
->fs
.sp_offset
-= UNITS_PER_WORD
;
8231 rtx x
= plus_constant (Pmode
, stack_pointer_rtx
, UNITS_PER_WORD
);
8232 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
8233 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
8234 add_reg_note (insn
, REG_CFA_REGISTER
, gen_rtx_SET (ecx
, pc_rtx
));
8235 RTX_FRAME_RELATED_P (insn
) = 1;
8237 x
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, popc
);
8238 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
8239 insn
= emit_insn (x
);
8240 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
8241 RTX_FRAME_RELATED_P (insn
) = 1;
8243 /* Now the return address is in ECX. */
8244 emit_jump_insn (gen_simple_return_indirect_internal (ecx
));
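/* Illustratively (a sketch, not emitted literally here -- the real insns
   come from the gen_* calls above), the split sequence for a 32-bit
   "ret $N" is roughly:

	popl	%ecx		# return address -> ECX
	addl	$N, %esp	# drop the N (= POPC) bytes of arguments
	jmp	*%ecx		# return via indirect branch  */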
8247 /* Errors in the source file can cause expand_expr to return const0_rtx
8248 where we expect a vector. To avoid crashing, use one of the vector
8249 clear instructions. */
8252 safe_vector_operand (rtx x
, machine_mode mode
)
8254 if (x
== const0_rtx
)
8255 x
= CONST0_RTX (mode
);
8259 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8262 ix86_expand_binop_builtin (enum insn_code icode
, tree exp
, rtx target
)
8265 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8266 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8267 rtx op0
= expand_normal (arg0
);
8268 rtx op1
= expand_normal (arg1
);
8269 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8270 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
8271 machine_mode mode1
= insn_data
[icode
].operand
[2].mode
;
8273 if (VECTOR_MODE_P (mode0
))
8274 op0
= safe_vector_operand (op0
, mode0
);
8275 if (VECTOR_MODE_P (mode1
))
8276 op1
= safe_vector_operand (op1
, mode1
);
8278 if (optimize
|| !target
8279 || GET_MODE (target
) != tmode
8280 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8281 target
= gen_reg_rtx (tmode
);
8283 if (GET_MODE (op1
) == SImode
&& mode1
== TImode
)
8285 rtx x
= gen_reg_rtx (V4SImode
);
8286 emit_insn (gen_sse2_loadd (x
, op1
));
8287 op1
= gen_lowpart (TImode
, x
);
8290 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
8291 op0
= copy_to_mode_reg (mode0
, op0
);
8292 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode1
))
8293 op1
= copy_to_mode_reg (mode1
, op1
);
8295 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
8304 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8307 ix86_expand_multi_arg_builtin (enum insn_code icode
, tree exp
, rtx target
,
8308 enum ix86_builtin_func_type m_type
,
8309 enum rtx_code sub_code
)
8312 unsigned int i
, nargs
;
8313 bool comparison_p
= false;
8315 bool last_arg_constant
= false;
8319 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8323 case MULTI_ARG_4_DF2_DI_I
:
8324 case MULTI_ARG_4_DF2_DI_I1
:
8325 case MULTI_ARG_4_SF2_SI_I
:
8326 case MULTI_ARG_4_SF2_SI_I1
:
8328 last_arg_constant
= true;
8331 case MULTI_ARG_3_SF
:
8332 case MULTI_ARG_3_DF
:
8333 case MULTI_ARG_3_SF2
:
8334 case MULTI_ARG_3_DF2
:
8335 case MULTI_ARG_3_DI
:
8336 case MULTI_ARG_3_SI
:
8337 case MULTI_ARG_3_SI_DI
:
8338 case MULTI_ARG_3_HI
:
8339 case MULTI_ARG_3_HI_SI
:
8340 case MULTI_ARG_3_QI
:
8341 case MULTI_ARG_3_DI2
:
8342 case MULTI_ARG_3_SI2
:
8343 case MULTI_ARG_3_HI2
:
8344 case MULTI_ARG_3_QI2
:
8348 case MULTI_ARG_2_SF
:
8349 case MULTI_ARG_2_DF
:
8350 case MULTI_ARG_2_DI
:
8351 case MULTI_ARG_2_SI
:
8352 case MULTI_ARG_2_HI
:
8353 case MULTI_ARG_2_QI
:
8357 case MULTI_ARG_2_DI_IMM
:
8358 case MULTI_ARG_2_SI_IMM
:
8359 case MULTI_ARG_2_HI_IMM
:
8360 case MULTI_ARG_2_QI_IMM
:
8362 last_arg_constant
= true;
8365 case MULTI_ARG_1_SF
:
8366 case MULTI_ARG_1_DF
:
8367 case MULTI_ARG_1_SF2
:
8368 case MULTI_ARG_1_DF2
:
8369 case MULTI_ARG_1_DI
:
8370 case MULTI_ARG_1_SI
:
8371 case MULTI_ARG_1_HI
:
8372 case MULTI_ARG_1_QI
:
8373 case MULTI_ARG_1_SI_DI
:
8374 case MULTI_ARG_1_HI_DI
:
8375 case MULTI_ARG_1_HI_SI
:
8376 case MULTI_ARG_1_QI_DI
:
8377 case MULTI_ARG_1_QI_SI
:
8378 case MULTI_ARG_1_QI_HI
:
8382 case MULTI_ARG_2_DI_CMP
:
8383 case MULTI_ARG_2_SI_CMP
:
8384 case MULTI_ARG_2_HI_CMP
:
8385 case MULTI_ARG_2_QI_CMP
:
8387 comparison_p
= true;
8390 case MULTI_ARG_2_SF_TF
:
8391 case MULTI_ARG_2_DF_TF
:
8392 case MULTI_ARG_2_DI_TF
:
8393 case MULTI_ARG_2_SI_TF
:
8394 case MULTI_ARG_2_HI_TF
:
8395 case MULTI_ARG_2_QI_TF
:
8404 if (optimize
|| !target
8405 || GET_MODE (target
) != tmode
8406 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8407 target
= gen_reg_rtx (tmode
);
8408 else if (memory_operand (target
, tmode
))
8411 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
8413 for (i
= 0; i
< nargs
; i
++)
8415 tree arg
= CALL_EXPR_ARG (exp
, i
);
8416 rtx op
= expand_normal (arg
);
8417 int adjust
= (comparison_p
) ? 1 : 0;
8418 machine_mode mode
= insn_data
[icode
].operand
[i
+adjust
+1].mode
;
8420 if (last_arg_constant
&& i
== nargs
- 1)
8422 if (!insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
))
8424 enum insn_code new_icode
= icode
;
8427 case CODE_FOR_xop_vpermil2v2df3
:
8428 case CODE_FOR_xop_vpermil2v4sf3
:
8429 case CODE_FOR_xop_vpermil2v4df3
:
8430 case CODE_FOR_xop_vpermil2v8sf3
:
8431 error ("the last argument must be a 2-bit immediate");
8432 return gen_reg_rtx (tmode
);
8433 case CODE_FOR_xop_rotlv2di3
:
8434 new_icode
= CODE_FOR_rotlv2di3
;
8436 case CODE_FOR_xop_rotlv4si3
:
8437 new_icode
= CODE_FOR_rotlv4si3
;
8439 case CODE_FOR_xop_rotlv8hi3
:
8440 new_icode
= CODE_FOR_rotlv8hi3
;
8442 case CODE_FOR_xop_rotlv16qi3
:
8443 new_icode
= CODE_FOR_rotlv16qi3
;
8445 if (CONST_INT_P (op
))
8447 int mask
= GET_MODE_UNIT_BITSIZE (tmode
) - 1;
8448 op
= GEN_INT (INTVAL (op
) & mask
);
8450 (insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
));
8456 && insn_data
[new_icode
].operand
[0].mode
== tmode
8457 && insn_data
[new_icode
].operand
[1].mode
== tmode
8458 && insn_data
[new_icode
].operand
[2].mode
== mode
8459 && insn_data
[new_icode
].operand
[0].predicate
8460 == insn_data
[icode
].operand
[0].predicate
8461 && insn_data
[new_icode
].operand
[1].predicate
8462 == insn_data
[icode
].operand
[1].predicate
);
8475 if (VECTOR_MODE_P (mode
))
8476 op
= safe_vector_operand (op
, mode
);
8478 /* If we aren't optimizing, only allow one memory operand to be generated. */
8480 if (memory_operand (op
, mode
))
8483 gcc_assert (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
);
8486 || !insn_data
[icode
].operand
[i
+adjust
+1].predicate (op
, mode
)
8488 op
= force_reg (mode
, op
);
8497 pat
= GEN_FCN (icode
) (target
, xops
[0]);
8502 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
8503 GEN_INT ((int)sub_code
));
8504 else if (! comparison_p
)
8505 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
8508 rtx cmp_op
= gen_rtx_fmt_ee (sub_code
, GET_MODE (target
),
8511 pat
= GEN_FCN (icode
) (target
, cmp_op
, xops
[0], xops
[1]);
8516 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
8520 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2], xops
[3]);
8534 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8535 insns with vec_merge. */
8538 ix86_expand_unop_vec_merge_builtin (enum insn_code icode
, tree exp
,
8542 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8543 rtx op1
, op0
= expand_normal (arg0
);
8544 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8545 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
8547 if (optimize
|| !target
8548 || GET_MODE (target
) != tmode
8549 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8550 target
= gen_reg_rtx (tmode
);
8552 if (VECTOR_MODE_P (mode0
))
8553 op0
= safe_vector_operand (op0
, mode0
);
8555 if ((optimize
&& !register_operand (op0
, mode0
))
8556 || !insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
8557 op0
= copy_to_mode_reg (mode0
, op0
);
8560 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode0
))
8561 op1
= copy_to_mode_reg (mode0
, op1
);
8563 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
8570 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8573 ix86_expand_sse_compare (const struct builtin_description
*d
,
8574 tree exp
, rtx target
, bool swap
)
8577 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8578 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8579 rtx op0
= expand_normal (arg0
);
8580 rtx op1
= expand_normal (arg1
);
8582 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8583 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8584 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8585 enum rtx_code comparison
= d
->comparison
;
8587 if (VECTOR_MODE_P (mode0
))
8588 op0
= safe_vector_operand (op0
, mode0
);
8589 if (VECTOR_MODE_P (mode1
))
8590 op1
= safe_vector_operand (op1
, mode1
);
8592 /* Swap operands if we have a comparison that isn't available in hardware. */
8595 std::swap (op0
, op1
);
8597 if (optimize
|| !target
8598 || GET_MODE (target
) != tmode
8599 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8600 target
= gen_reg_rtx (tmode
);
8602 if ((optimize
&& !register_operand (op0
, mode0
))
8603 || !insn_data
[d
->icode
].operand
[1].predicate (op0
, mode0
))
8604 op0
= copy_to_mode_reg (mode0
, op0
);
8605 if ((optimize
&& !register_operand (op1
, mode1
))
8606 || !insn_data
[d
->icode
].operand
[2].predicate (op1
, mode1
))
8607 op1
= copy_to_mode_reg (mode1
, op1
);
8609 op2
= gen_rtx_fmt_ee (comparison
, mode0
, op0
, op1
);
8610 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8617 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8620 ix86_expand_sse_comi (const struct builtin_description
*d
, tree exp
,
8624 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8625 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8626 rtx op0
= expand_normal (arg0
);
8627 rtx op1
= expand_normal (arg1
);
8628 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8629 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8630 enum rtx_code comparison
= d
->comparison
;
8632 if (VECTOR_MODE_P (mode0
))
8633 op0
= safe_vector_operand (op0
, mode0
);
8634 if (VECTOR_MODE_P (mode1
))
8635 op1
= safe_vector_operand (op1
, mode1
);
8637 target
= gen_reg_rtx (SImode
);
8638 emit_move_insn (target
, const0_rtx
);
8639 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8641 if ((optimize
&& !register_operand (op0
, mode0
))
8642 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8643 op0
= copy_to_mode_reg (mode0
, op0
);
8644 if ((optimize
&& !register_operand (op1
, mode1
))
8645 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8646 op1
= copy_to_mode_reg (mode1
, op1
);
8648 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8652 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8653 gen_rtx_fmt_ee (comparison
, QImode
,
8657 return SUBREG_REG (target
);
8660 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8663 ix86_expand_sse_round (const struct builtin_description
*d
, tree exp
,
8667 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8668 rtx op1
, op0
= expand_normal (arg0
);
8669 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8670 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8672 if (optimize
|| target
== 0
8673 || GET_MODE (target
) != tmode
8674 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8675 target
= gen_reg_rtx (tmode
);
8677 if (VECTOR_MODE_P (mode0
))
8678 op0
= safe_vector_operand (op0
, mode0
);
8680 if ((optimize
&& !register_operand (op0
, mode0
))
8681 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8682 op0
= copy_to_mode_reg (mode0
, op0
);
8684 op1
= GEN_INT (d
->comparison
);
8686 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
);
8694 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description
*d
,
8695 tree exp
, rtx target
)
8698 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8699 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8700 rtx op0
= expand_normal (arg0
);
8701 rtx op1
= expand_normal (arg1
);
8703 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8704 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8705 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8707 if (optimize
|| target
== 0
8708 || GET_MODE (target
) != tmode
8709 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8710 target
= gen_reg_rtx (tmode
);
8712 op0
= safe_vector_operand (op0
, mode0
);
8713 op1
= safe_vector_operand (op1
, mode1
);
8715 if ((optimize
&& !register_operand (op0
, mode0
))
8716 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8717 op0
= copy_to_mode_reg (mode0
, op0
);
8718 if ((optimize
&& !register_operand (op1
, mode1
))
8719 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8720 op1
= copy_to_mode_reg (mode1
, op1
);
8722 op2
= GEN_INT (d
->comparison
);
8724 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8731 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8734 ix86_expand_sse_ptest (const struct builtin_description
*d
, tree exp
,
8738 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8739 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8740 rtx op0
= expand_normal (arg0
);
8741 rtx op1
= expand_normal (arg1
);
8742 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8743 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8744 enum rtx_code comparison
= d
->comparison
;
8746 if (VECTOR_MODE_P (mode0
))
8747 op0
= safe_vector_operand (op0
, mode0
);
8748 if (VECTOR_MODE_P (mode1
))
8749 op1
= safe_vector_operand (op1
, mode1
);
8751 target
= gen_reg_rtx (SImode
);
8752 emit_move_insn (target
, const0_rtx
);
8753 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8755 if ((optimize
&& !register_operand (op0
, mode0
))
8756 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8757 op0
= copy_to_mode_reg (mode0
, op0
);
8758 if ((optimize
&& !register_operand (op1
, mode1
))
8759 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8760 op1
= copy_to_mode_reg (mode1
, op1
);
8762 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8766 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8767 gen_rtx_fmt_ee (comparison
, QImode
,
8771 return SUBREG_REG (target
);
8774 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8777 ix86_expand_sse_pcmpestr (const struct builtin_description
*d
,
8778 tree exp
, rtx target
)
8781 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8782 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8783 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8784 tree arg3
= CALL_EXPR_ARG (exp
, 3);
8785 tree arg4
= CALL_EXPR_ARG (exp
, 4);
8786 rtx scratch0
, scratch1
;
8787 rtx op0
= expand_normal (arg0
);
8788 rtx op1
= expand_normal (arg1
);
8789 rtx op2
= expand_normal (arg2
);
8790 rtx op3
= expand_normal (arg3
);
8791 rtx op4
= expand_normal (arg4
);
8792 machine_mode tmode0
, tmode1
, modev2
, modei3
, modev4
, modei5
, modeimm
;
8794 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8795 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8796 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8797 modei3
= insn_data
[d
->icode
].operand
[3].mode
;
8798 modev4
= insn_data
[d
->icode
].operand
[4].mode
;
8799 modei5
= insn_data
[d
->icode
].operand
[5].mode
;
8800 modeimm
= insn_data
[d
->icode
].operand
[6].mode
;
8802 if (VECTOR_MODE_P (modev2
))
8803 op0
= safe_vector_operand (op0
, modev2
);
8804 if (VECTOR_MODE_P (modev4
))
8805 op2
= safe_vector_operand (op2
, modev4
);
8807 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8808 op0
= copy_to_mode_reg (modev2
, op0
);
8809 if (!insn_data
[d
->icode
].operand
[3].predicate (op1
, modei3
))
8810 op1
= copy_to_mode_reg (modei3
, op1
);
8811 if ((optimize
&& !register_operand (op2
, modev4
))
8812 || !insn_data
[d
->icode
].operand
[4].predicate (op2
, modev4
))
8813 op2
= copy_to_mode_reg (modev4
, op2
);
8814 if (!insn_data
[d
->icode
].operand
[5].predicate (op3
, modei5
))
8815 op3
= copy_to_mode_reg (modei5
, op3
);
8817 if (!insn_data
[d
->icode
].operand
[6].predicate (op4
, modeimm
))
8819 error ("the fifth argument must be an 8-bit immediate");
8823 if (d
->code
== IX86_BUILTIN_PCMPESTRI128
)
8825 if (optimize
|| !target
8826 || GET_MODE (target
) != tmode0
8827 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8828 target
= gen_reg_rtx (tmode0
);
8830 scratch1
= gen_reg_rtx (tmode1
);
8832 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8834 else if (d
->code
== IX86_BUILTIN_PCMPESTRM128
)
8836 if (optimize
|| !target
8837 || GET_MODE (target
) != tmode1
8838 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8839 target
= gen_reg_rtx (tmode1
);
8841 scratch0
= gen_reg_rtx (tmode0
);
8843 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
, op3
, op4
);
8847 gcc_assert (d
->flag
);
8849 scratch0
= gen_reg_rtx (tmode0
);
8850 scratch1
= gen_reg_rtx (tmode1
);
8852 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8862 target
= gen_reg_rtx (SImode
);
8863 emit_move_insn (target
, const0_rtx
);
8864 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8867 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8868 gen_rtx_fmt_ee (EQ
, QImode
,
8869 gen_rtx_REG ((machine_mode
) d
->flag
,
8872 return SUBREG_REG (target
);
8879 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8882 ix86_expand_sse_pcmpistr (const struct builtin_description
*d
,
8883 tree exp
, rtx target
)
8886 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8887 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8888 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8889 rtx scratch0
, scratch1
;
8890 rtx op0
= expand_normal (arg0
);
8891 rtx op1
= expand_normal (arg1
);
8892 rtx op2
= expand_normal (arg2
);
8893 machine_mode tmode0
, tmode1
, modev2
, modev3
, modeimm
;
8895 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8896 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8897 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8898 modev3
= insn_data
[d
->icode
].operand
[3].mode
;
8899 modeimm
= insn_data
[d
->icode
].operand
[4].mode
;
8901 if (VECTOR_MODE_P (modev2
))
8902 op0
= safe_vector_operand (op0
, modev2
);
8903 if (VECTOR_MODE_P (modev3
))
8904 op1
= safe_vector_operand (op1
, modev3
);
8906 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8907 op0
= copy_to_mode_reg (modev2
, op0
);
8908 if ((optimize
&& !register_operand (op1
, modev3
))
8909 || !insn_data
[d
->icode
].operand
[3].predicate (op1
, modev3
))
8910 op1
= copy_to_mode_reg (modev3
, op1
);
8912 if (!insn_data
[d
->icode
].operand
[4].predicate (op2
, modeimm
))
8914 error ("the third argument must be an 8-bit immediate");
8918 if (d
->code
== IX86_BUILTIN_PCMPISTRI128
)
8920 if (optimize
|| !target
8921 || GET_MODE (target
) != tmode0
8922 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8923 target
= gen_reg_rtx (tmode0
);
8925 scratch1
= gen_reg_rtx (tmode1
);
8927 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
);
8929 else if (d
->code
== IX86_BUILTIN_PCMPISTRM128
)
8931 if (optimize
|| !target
8932 || GET_MODE (target
) != tmode1
8933 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8934 target
= gen_reg_rtx (tmode1
);
8936 scratch0
= gen_reg_rtx (tmode0
);
8938 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
);
8942 gcc_assert (d
->flag
);
8944 scratch0
= gen_reg_rtx (tmode0
);
8945 scratch1
= gen_reg_rtx (tmode1
);
8947 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
);
8957 target
= gen_reg_rtx (SImode
);
8958 emit_move_insn (target
, const0_rtx
);
8959 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8962 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8963 gen_rtx_fmt_ee (EQ
, QImode
,
8964 gen_rtx_REG ((machine_mode
) d
->flag
,
8967 return SUBREG_REG (target
);
8973 /* Fix up modeless constants to fit the required mode. */
8976 fixup_modeless_constant (rtx x
, machine_mode mode
)
8978 if (GET_MODE (x
) == VOIDmode
)
8979 x
= convert_to_mode (mode
, x
, 1);
8983 /* Subroutine of ix86_expand_builtin to take care of insns with
8984 a variable number of operands. */
8987 ix86_expand_args_builtin (const struct builtin_description
*d
,
8988 tree exp
, rtx target
)
8990 rtx pat
, real_target
;
8991 unsigned int i
, nargs
;
8992 unsigned int nargs_constant
= 0;
8993 unsigned int mask_pos
= 0;
8996 bool second_arg_count
= false;
8997 enum insn_code icode
= d
->icode
;
8998 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
8999 machine_mode tmode
= insn_p
->operand
[0].mode
;
9000 machine_mode rmode
= VOIDmode
;
9002 enum rtx_code comparison
= d
->comparison
;
9004 switch ((enum ix86_builtin_func_type
) d
->flag
)
9006 case V2DF_FTYPE_V2DF_ROUND
:
9007 case V4DF_FTYPE_V4DF_ROUND
:
9008 case V8DF_FTYPE_V8DF_ROUND
:
9009 case V4SF_FTYPE_V4SF_ROUND
:
9010 case V8SF_FTYPE_V8SF_ROUND
:
9011 case V16SF_FTYPE_V16SF_ROUND
:
9012 case V4SI_FTYPE_V4SF_ROUND
:
9013 case V8SI_FTYPE_V8SF_ROUND
:
9014 case V16SI_FTYPE_V16SF_ROUND
:
9015 return ix86_expand_sse_round (d
, exp
, target
);
9016 case V4SI_FTYPE_V2DF_V2DF_ROUND
:
9017 case V8SI_FTYPE_V4DF_V4DF_ROUND
:
9018 case V16SI_FTYPE_V8DF_V8DF_ROUND
:
9019 return ix86_expand_sse_round_vec_pack_sfix (d
, exp
, target
);
9020 case INT_FTYPE_V8SF_V8SF_PTEST
:
9021 case INT_FTYPE_V4DI_V4DI_PTEST
:
9022 case INT_FTYPE_V4DF_V4DF_PTEST
:
9023 case INT_FTYPE_V4SF_V4SF_PTEST
:
9024 case INT_FTYPE_V2DI_V2DI_PTEST
:
9025 case INT_FTYPE_V2DF_V2DF_PTEST
:
9026 return ix86_expand_sse_ptest (d
, exp
, target
);
9027 case FLOAT128_FTYPE_FLOAT128
:
9028 case FLOAT_FTYPE_FLOAT
:
9030 case UINT_FTYPE_UINT
:
9031 case UINT16_FTYPE_UINT16
:
9032 case UINT64_FTYPE_INT
:
9033 case UINT64_FTYPE_UINT64
:
9034 case INT64_FTYPE_INT64
:
9035 case INT64_FTYPE_V4SF
:
9036 case INT64_FTYPE_V2DF
:
9037 case INT_FTYPE_V16QI
:
9038 case INT_FTYPE_V8QI
:
9039 case INT_FTYPE_V8SF
:
9040 case INT_FTYPE_V4DF
:
9041 case INT_FTYPE_V4SF
:
9042 case INT_FTYPE_V2DF
:
9043 case INT_FTYPE_V32QI
:
9044 case V16QI_FTYPE_V16QI
:
9045 case V8SI_FTYPE_V8SF
:
9046 case V8SI_FTYPE_V4SI
:
9047 case V8HI_FTYPE_V8HI
:
9048 case V8HI_FTYPE_V16QI
:
9049 case V8QI_FTYPE_V8QI
:
9050 case V8SF_FTYPE_V8SF
:
9051 case V8SF_FTYPE_V8SI
:
9052 case V8SF_FTYPE_V4SF
:
9053 case V8SF_FTYPE_V8HI
:
9054 case V4SI_FTYPE_V4SI
:
9055 case V4SI_FTYPE_V16QI
:
9056 case V4SI_FTYPE_V4SF
:
9057 case V4SI_FTYPE_V8SI
:
9058 case V4SI_FTYPE_V8HI
:
9059 case V4SI_FTYPE_V4DF
:
9060 case V4SI_FTYPE_V2DF
:
9061 case V4HI_FTYPE_V4HI
:
9062 case V4DF_FTYPE_V4DF
:
9063 case V4DF_FTYPE_V4SI
:
9064 case V4DF_FTYPE_V4SF
:
9065 case V4DF_FTYPE_V2DF
:
9066 case V4SF_FTYPE_V4SF
:
9067 case V4SF_FTYPE_V4SI
:
9068 case V4SF_FTYPE_V8SF
:
9069 case V4SF_FTYPE_V4DF
:
9070 case V4SF_FTYPE_V8HI
:
9071 case V4SF_FTYPE_V2DF
:
9072 case V2DI_FTYPE_V2DI
:
9073 case V2DI_FTYPE_V16QI
:
9074 case V2DI_FTYPE_V8HI
:
9075 case V2DI_FTYPE_V4SI
:
9076 case V2DF_FTYPE_V2DF
:
9077 case V2DF_FTYPE_V4SI
:
9078 case V2DF_FTYPE_V4DF
:
9079 case V2DF_FTYPE_V4SF
:
9080 case V2DF_FTYPE_V2SI
:
9081 case V2SI_FTYPE_V2SI
:
9082 case V2SI_FTYPE_V4SF
:
9083 case V2SI_FTYPE_V2SF
:
9084 case V2SI_FTYPE_V2DF
:
9085 case V2SF_FTYPE_V2SF
:
9086 case V2SF_FTYPE_V2SI
:
9087 case V32QI_FTYPE_V32QI
:
9088 case V32QI_FTYPE_V16QI
:
9089 case V16HI_FTYPE_V16HI
:
9090 case V16HI_FTYPE_V8HI
:
9091 case V8SI_FTYPE_V8SI
:
9092 case V16HI_FTYPE_V16QI
:
9093 case V8SI_FTYPE_V16QI
:
9094 case V4DI_FTYPE_V16QI
:
9095 case V8SI_FTYPE_V8HI
:
9096 case V4DI_FTYPE_V8HI
:
9097 case V4DI_FTYPE_V4SI
:
9098 case V4DI_FTYPE_V2DI
:
9105 case UHI_FTYPE_V16QI
:
9106 case USI_FTYPE_V32QI
:
9107 case UDI_FTYPE_V64QI
:
9108 case V16QI_FTYPE_UHI
:
9109 case V32QI_FTYPE_USI
:
9110 case V64QI_FTYPE_UDI
:
9111 case V8HI_FTYPE_UQI
:
9112 case V16HI_FTYPE_UHI
:
9113 case V32HI_FTYPE_USI
:
9114 case V4SI_FTYPE_UQI
:
9115 case V8SI_FTYPE_UQI
:
9116 case V4SI_FTYPE_UHI
:
9117 case V8SI_FTYPE_UHI
:
9118 case UQI_FTYPE_V8HI
:
9119 case UHI_FTYPE_V16HI
:
9120 case USI_FTYPE_V32HI
:
9121 case UQI_FTYPE_V4SI
:
9122 case UQI_FTYPE_V8SI
:
9123 case UHI_FTYPE_V16SI
:
9124 case UQI_FTYPE_V2DI
:
9125 case UQI_FTYPE_V4DI
:
9126 case UQI_FTYPE_V8DI
:
9127 case V16SI_FTYPE_UHI
:
9128 case V2DI_FTYPE_UQI
:
9129 case V4DI_FTYPE_UQI
:
9130 case V16SI_FTYPE_INT
:
9131 case V16SF_FTYPE_V8SF
:
9132 case V16SI_FTYPE_V8SI
:
9133 case V16SF_FTYPE_V4SF
:
9134 case V16SI_FTYPE_V4SI
:
9135 case V16SI_FTYPE_V16SF
:
9136 case V16SI_FTYPE_V16SI
:
9137 case V64QI_FTYPE_V64QI
:
9138 case V32HI_FTYPE_V32HI
:
9139 case V16SF_FTYPE_V16SF
:
9140 case V8DI_FTYPE_UQI
:
9141 case V8DI_FTYPE_V8DI
:
9142 case V8DF_FTYPE_V4DF
:
9143 case V8DF_FTYPE_V2DF
:
9144 case V8DF_FTYPE_V8DF
:
9145 case V4DI_FTYPE_V4DI
:
9146 case V16HI_FTYPE_V16SF
:
9147 case V8HI_FTYPE_V8SF
:
9148 case V8HI_FTYPE_V4SF
:
    case V4SF_FTYPE_V4SF_VEC_MERGE:
    case V2DF_FTYPE_V2DF_VEC_MERGE:
      return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9154 case FLOAT128_FTYPE_FLOAT128_FLOAT128
:
9155 case V16QI_FTYPE_V16QI_V16QI
:
9156 case V16QI_FTYPE_V8HI_V8HI
:
9157 case V16SF_FTYPE_V16SF_V16SF
:
9158 case V8QI_FTYPE_V8QI_V8QI
:
9159 case V8QI_FTYPE_V4HI_V4HI
:
9160 case V8HI_FTYPE_V8HI_V8HI
:
9161 case V8HI_FTYPE_V16QI_V16QI
:
9162 case V8HI_FTYPE_V4SI_V4SI
:
9163 case V8SF_FTYPE_V8SF_V8SF
:
9164 case V8SF_FTYPE_V8SF_V8SI
:
9165 case V8DF_FTYPE_V8DF_V8DF
:
9166 case V4SI_FTYPE_V4SI_V4SI
:
9167 case V4SI_FTYPE_V8HI_V8HI
:
9168 case V4SI_FTYPE_V2DF_V2DF
:
9169 case V4HI_FTYPE_V4HI_V4HI
:
9170 case V4HI_FTYPE_V8QI_V8QI
:
9171 case V4HI_FTYPE_V2SI_V2SI
:
9172 case V4DF_FTYPE_V4DF_V4DF
:
9173 case V4DF_FTYPE_V4DF_V4DI
:
9174 case V4SF_FTYPE_V4SF_V4SF
:
9175 case V4SF_FTYPE_V4SF_V4SI
:
9176 case V4SF_FTYPE_V4SF_V2SI
:
9177 case V4SF_FTYPE_V4SF_V2DF
:
9178 case V4SF_FTYPE_V4SF_UINT
:
9179 case V4SF_FTYPE_V4SF_DI
:
9180 case V4SF_FTYPE_V4SF_SI
:
9181 case V2DI_FTYPE_V2DI_V2DI
:
9182 case V2DI_FTYPE_V16QI_V16QI
:
9183 case V2DI_FTYPE_V4SI_V4SI
:
9184 case V2DI_FTYPE_V2DI_V16QI
:
9185 case V2SI_FTYPE_V2SI_V2SI
:
9186 case V2SI_FTYPE_V4HI_V4HI
:
9187 case V2SI_FTYPE_V2SF_V2SF
:
9188 case V2DF_FTYPE_V2DF_V2DF
:
9189 case V2DF_FTYPE_V2DF_V4SF
:
9190 case V2DF_FTYPE_V2DF_V2DI
:
9191 case V2DF_FTYPE_V2DF_DI
:
9192 case V2DF_FTYPE_V2DF_SI
:
9193 case V2DF_FTYPE_V2DF_UINT
:
9194 case V2SF_FTYPE_V2SF_V2SF
:
9195 case V1DI_FTYPE_V1DI_V1DI
:
9196 case V1DI_FTYPE_V8QI_V8QI
:
9197 case V1DI_FTYPE_V2SI_V2SI
:
9198 case V32QI_FTYPE_V16HI_V16HI
:
9199 case V16HI_FTYPE_V8SI_V8SI
:
9200 case V64QI_FTYPE_V64QI_V64QI
:
9201 case V32QI_FTYPE_V32QI_V32QI
:
9202 case V16HI_FTYPE_V32QI_V32QI
:
9203 case V16HI_FTYPE_V16HI_V16HI
:
9204 case V8SI_FTYPE_V4DF_V4DF
:
9205 case V8SI_FTYPE_V8SI_V8SI
:
9206 case V8SI_FTYPE_V16HI_V16HI
:
9207 case V4DI_FTYPE_V4DI_V4DI
:
9208 case V4DI_FTYPE_V8SI_V8SI
:
9209 case V8DI_FTYPE_V64QI_V64QI
:
      if (comparison == UNKNOWN)
	return ix86_expand_binop_builtin (icode, exp, target);
      nargs = 2;
      break;
    case V4SF_FTYPE_V4SF_V4SF_SWAP:
    case V2DF_FTYPE_V2DF_V2DF_SWAP:
      gcc_assert (comparison != UNKNOWN);
      nargs = 2;
      swap = true;
      break;
9220 case V16HI_FTYPE_V16HI_V8HI_COUNT
:
9221 case V16HI_FTYPE_V16HI_SI_COUNT
:
9222 case V8SI_FTYPE_V8SI_V4SI_COUNT
:
9223 case V8SI_FTYPE_V8SI_SI_COUNT
:
9224 case V4DI_FTYPE_V4DI_V2DI_COUNT
:
9225 case V4DI_FTYPE_V4DI_INT_COUNT
:
9226 case V8HI_FTYPE_V8HI_V8HI_COUNT
:
9227 case V8HI_FTYPE_V8HI_SI_COUNT
:
9228 case V4SI_FTYPE_V4SI_V4SI_COUNT
:
9229 case V4SI_FTYPE_V4SI_SI_COUNT
:
9230 case V4HI_FTYPE_V4HI_V4HI_COUNT
:
9231 case V4HI_FTYPE_V4HI_SI_COUNT
:
9232 case V2DI_FTYPE_V2DI_V2DI_COUNT
:
9233 case V2DI_FTYPE_V2DI_SI_COUNT
:
9234 case V2SI_FTYPE_V2SI_V2SI_COUNT
:
9235 case V2SI_FTYPE_V2SI_SI_COUNT
:
9236 case V1DI_FTYPE_V1DI_V1DI_COUNT
:
9237 case V1DI_FTYPE_V1DI_SI_COUNT
:
9239 second_arg_count
= true;
9241 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT
:
9242 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT
:
9243 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT
:
9244 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT
:
9245 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT
:
9246 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT
:
9247 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT
:
9248 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT
:
9249 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT
:
9250 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT
:
9251 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT
:
9252 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT
:
9253 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT
:
9254 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT
:
9255 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
9256 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
9257 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
9258 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
9260 second_arg_count
= true;
9262 case UINT64_FTYPE_UINT64_UINT64
:
9263 case UINT_FTYPE_UINT_UINT
:
9264 case UINT_FTYPE_UINT_USHORT
:
9265 case UINT_FTYPE_UINT_UCHAR
:
9266 case UINT16_FTYPE_UINT16_INT
:
9267 case UINT8_FTYPE_UINT8_INT
:
9268 case UQI_FTYPE_UQI_UQI
:
9269 case UHI_FTYPE_UHI_UHI
:
9270 case USI_FTYPE_USI_USI
:
9271 case UDI_FTYPE_UDI_UDI
:
9272 case V16SI_FTYPE_V8DF_V8DF
:
9273 case V32HI_FTYPE_V16SF_V16SF
:
9274 case V16HI_FTYPE_V8SF_V8SF
:
9275 case V8HI_FTYPE_V4SF_V4SF
:
9276 case V16HI_FTYPE_V16SF_UHI
:
9277 case V8HI_FTYPE_V8SF_UQI
:
9278 case V8HI_FTYPE_V4SF_UQI
:
9281 case V2DI_FTYPE_V2DI_INT_CONVERT
:
9286 case V4DI_FTYPE_V4DI_INT_CONVERT
:
9291 case V8DI_FTYPE_V8DI_INT_CONVERT
:
9296 case V8HI_FTYPE_V8HI_INT
:
9297 case V8HI_FTYPE_V8SF_INT
:
9298 case V16HI_FTYPE_V16SF_INT
:
9299 case V8HI_FTYPE_V4SF_INT
:
9300 case V8SF_FTYPE_V8SF_INT
:
9301 case V4SF_FTYPE_V16SF_INT
:
9302 case V16SF_FTYPE_V16SF_INT
:
9303 case V4SI_FTYPE_V4SI_INT
:
9304 case V4SI_FTYPE_V8SI_INT
:
9305 case V4HI_FTYPE_V4HI_INT
:
9306 case V4DF_FTYPE_V4DF_INT
:
9307 case V4DF_FTYPE_V8DF_INT
:
9308 case V4SF_FTYPE_V4SF_INT
:
9309 case V4SF_FTYPE_V8SF_INT
:
9310 case V2DI_FTYPE_V2DI_INT
:
9311 case V2DF_FTYPE_V2DF_INT
:
9312 case V2DF_FTYPE_V4DF_INT
:
9313 case V16HI_FTYPE_V16HI_INT
:
9314 case V8SI_FTYPE_V8SI_INT
:
9315 case V16SI_FTYPE_V16SI_INT
:
9316 case V4SI_FTYPE_V16SI_INT
:
9317 case V4DI_FTYPE_V4DI_INT
:
9318 case V2DI_FTYPE_V4DI_INT
:
9319 case V4DI_FTYPE_V8DI_INT
:
9320 case UQI_FTYPE_UQI_UQI_CONST
:
9321 case UHI_FTYPE_UHI_UQI
:
9322 case USI_FTYPE_USI_UQI
:
9323 case UDI_FTYPE_UDI_UQI
:
9327 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
9328 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
9329 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
9330 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
9331 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
9332 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
9333 case UHI_FTYPE_V16SI_V16SI_UHI
:
9334 case UQI_FTYPE_V8DI_V8DI_UQI
:
9335 case V16HI_FTYPE_V16SI_V16HI_UHI
:
9336 case V16QI_FTYPE_V16SI_V16QI_UHI
:
9337 case V16QI_FTYPE_V8DI_V16QI_UQI
:
9338 case V16SF_FTYPE_V16SF_V16SF_UHI
:
9339 case V16SF_FTYPE_V4SF_V16SF_UHI
:
9340 case V16SI_FTYPE_SI_V16SI_UHI
:
9341 case V16SI_FTYPE_V16HI_V16SI_UHI
:
9342 case V16SI_FTYPE_V16QI_V16SI_UHI
:
9343 case V8SF_FTYPE_V4SF_V8SF_UQI
:
9344 case V4DF_FTYPE_V2DF_V4DF_UQI
:
9345 case V8SI_FTYPE_V4SI_V8SI_UQI
:
9346 case V8SI_FTYPE_SI_V8SI_UQI
:
9347 case V4SI_FTYPE_V4SI_V4SI_UQI
:
9348 case V4SI_FTYPE_SI_V4SI_UQI
:
9349 case V4DI_FTYPE_V2DI_V4DI_UQI
:
9350 case V4DI_FTYPE_DI_V4DI_UQI
:
9351 case V2DI_FTYPE_V2DI_V2DI_UQI
:
9352 case V2DI_FTYPE_DI_V2DI_UQI
:
9353 case V64QI_FTYPE_V64QI_V64QI_UDI
:
9354 case V64QI_FTYPE_V16QI_V64QI_UDI
:
9355 case V64QI_FTYPE_QI_V64QI_UDI
:
9356 case V32QI_FTYPE_V32QI_V32QI_USI
:
9357 case V32QI_FTYPE_V16QI_V32QI_USI
:
9358 case V32QI_FTYPE_QI_V32QI_USI
:
9359 case V16QI_FTYPE_V16QI_V16QI_UHI
:
9360 case V16QI_FTYPE_QI_V16QI_UHI
:
9361 case V32HI_FTYPE_V8HI_V32HI_USI
:
9362 case V32HI_FTYPE_HI_V32HI_USI
:
9363 case V16HI_FTYPE_V8HI_V16HI_UHI
:
9364 case V16HI_FTYPE_HI_V16HI_UHI
:
9365 case V8HI_FTYPE_V8HI_V8HI_UQI
:
9366 case V8HI_FTYPE_HI_V8HI_UQI
:
9367 case V8SF_FTYPE_V8HI_V8SF_UQI
:
9368 case V4SF_FTYPE_V8HI_V4SF_UQI
:
9369 case V8SI_FTYPE_V8SF_V8SI_UQI
:
9370 case V4SI_FTYPE_V4SF_V4SI_UQI
:
9371 case V4DI_FTYPE_V4SF_V4DI_UQI
:
9372 case V2DI_FTYPE_V4SF_V2DI_UQI
:
9373 case V4SF_FTYPE_V4DI_V4SF_UQI
:
9374 case V4SF_FTYPE_V2DI_V4SF_UQI
:
9375 case V4DF_FTYPE_V4DI_V4DF_UQI
:
9376 case V2DF_FTYPE_V2DI_V2DF_UQI
:
9377 case V16QI_FTYPE_V8HI_V16QI_UQI
:
9378 case V16QI_FTYPE_V16HI_V16QI_UHI
:
9379 case V16QI_FTYPE_V4SI_V16QI_UQI
:
9380 case V16QI_FTYPE_V8SI_V16QI_UQI
:
9381 case V8HI_FTYPE_V4SI_V8HI_UQI
:
9382 case V8HI_FTYPE_V8SI_V8HI_UQI
:
9383 case V16QI_FTYPE_V2DI_V16QI_UQI
:
9384 case V16QI_FTYPE_V4DI_V16QI_UQI
:
9385 case V8HI_FTYPE_V2DI_V8HI_UQI
:
9386 case V8HI_FTYPE_V4DI_V8HI_UQI
:
9387 case V4SI_FTYPE_V2DI_V4SI_UQI
:
9388 case V4SI_FTYPE_V4DI_V4SI_UQI
:
9389 case V32QI_FTYPE_V32HI_V32QI_USI
:
9390 case UHI_FTYPE_V16QI_V16QI_UHI
:
9391 case USI_FTYPE_V32QI_V32QI_USI
:
9392 case UDI_FTYPE_V64QI_V64QI_UDI
:
9393 case UQI_FTYPE_V8HI_V8HI_UQI
:
9394 case UHI_FTYPE_V16HI_V16HI_UHI
:
9395 case USI_FTYPE_V32HI_V32HI_USI
:
9396 case UQI_FTYPE_V4SI_V4SI_UQI
:
9397 case UQI_FTYPE_V8SI_V8SI_UQI
:
9398 case UQI_FTYPE_V2DI_V2DI_UQI
:
9399 case UQI_FTYPE_V4DI_V4DI_UQI
:
9400 case V4SF_FTYPE_V2DF_V4SF_UQI
:
9401 case V4SF_FTYPE_V4DF_V4SF_UQI
:
9402 case V16SI_FTYPE_V16SI_V16SI_UHI
:
9403 case V16SI_FTYPE_V4SI_V16SI_UHI
:
9404 case V2DI_FTYPE_V4SI_V2DI_UQI
:
9405 case V2DI_FTYPE_V8HI_V2DI_UQI
:
9406 case V2DI_FTYPE_V16QI_V2DI_UQI
:
9407 case V4DI_FTYPE_V4DI_V4DI_UQI
:
9408 case V4DI_FTYPE_V4SI_V4DI_UQI
:
9409 case V4DI_FTYPE_V8HI_V4DI_UQI
:
9410 case V4DI_FTYPE_V16QI_V4DI_UQI
:
9411 case V4DI_FTYPE_V4DF_V4DI_UQI
:
9412 case V2DI_FTYPE_V2DF_V2DI_UQI
:
9413 case V4SI_FTYPE_V4DF_V4SI_UQI
:
9414 case V4SI_FTYPE_V2DF_V4SI_UQI
:
9415 case V4SI_FTYPE_V8HI_V4SI_UQI
:
9416 case V4SI_FTYPE_V16QI_V4SI_UQI
:
9417 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
9418 case V8DF_FTYPE_V2DF_V8DF_UQI
:
9419 case V8DF_FTYPE_V4DF_V8DF_UQI
:
9420 case V8DF_FTYPE_V8DF_V8DF_UQI
:
9421 case V8SF_FTYPE_V8SF_V8SF_UQI
:
9422 case V8SF_FTYPE_V8SI_V8SF_UQI
:
9423 case V4DF_FTYPE_V4DF_V4DF_UQI
:
9424 case V4SF_FTYPE_V4SF_V4SF_UQI
:
9425 case V2DF_FTYPE_V2DF_V2DF_UQI
:
9426 case V2DF_FTYPE_V4SF_V2DF_UQI
:
9427 case V2DF_FTYPE_V4SI_V2DF_UQI
:
9428 case V4SF_FTYPE_V4SI_V4SF_UQI
:
9429 case V4DF_FTYPE_V4SF_V4DF_UQI
:
9430 case V4DF_FTYPE_V4SI_V4DF_UQI
:
9431 case V8SI_FTYPE_V8SI_V8SI_UQI
:
9432 case V8SI_FTYPE_V8HI_V8SI_UQI
:
9433 case V8SI_FTYPE_V16QI_V8SI_UQI
:
9434 case V8DF_FTYPE_V8SI_V8DF_UQI
:
9435 case V8DI_FTYPE_DI_V8DI_UQI
:
9436 case V16SF_FTYPE_V8SF_V16SF_UHI
:
9437 case V16SI_FTYPE_V8SI_V16SI_UHI
:
9438 case V16HI_FTYPE_V16HI_V16HI_UHI
:
9439 case V8HI_FTYPE_V16QI_V8HI_UQI
:
9440 case V16HI_FTYPE_V16QI_V16HI_UHI
:
9441 case V32HI_FTYPE_V32HI_V32HI_USI
:
9442 case V32HI_FTYPE_V32QI_V32HI_USI
:
9443 case V8DI_FTYPE_V16QI_V8DI_UQI
:
9444 case V8DI_FTYPE_V2DI_V8DI_UQI
:
9445 case V8DI_FTYPE_V4DI_V8DI_UQI
:
9446 case V8DI_FTYPE_V8DI_V8DI_UQI
:
9447 case V8DI_FTYPE_V8HI_V8DI_UQI
:
9448 case V8DI_FTYPE_V8SI_V8DI_UQI
:
9449 case V8HI_FTYPE_V8DI_V8HI_UQI
:
9450 case V8SI_FTYPE_V8DI_V8SI_UQI
:
9451 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
9452 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
9453 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
9454 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
9455 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
9456 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
9457 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
9458 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
9459 case V32HI_FTYPE_V16SF_V16SF_USI
:
9460 case V16HI_FTYPE_V8SF_V8SF_UHI
:
9461 case V8HI_FTYPE_V4SF_V4SF_UQI
:
9462 case V16HI_FTYPE_V16SF_V16HI_UHI
:
9463 case V8HI_FTYPE_V8SF_V8HI_UQI
:
9464 case V8HI_FTYPE_V4SF_V8HI_UQI
:
9465 case V16SF_FTYPE_V16SF_V32HI_V32HI
:
9466 case V8SF_FTYPE_V8SF_V16HI_V16HI
:
9467 case V4SF_FTYPE_V4SF_V8HI_V8HI
:
9470 case V32QI_FTYPE_V32QI_V32QI_INT
:
9471 case V16HI_FTYPE_V16HI_V16HI_INT
:
9472 case V16QI_FTYPE_V16QI_V16QI_INT
:
9473 case V4DI_FTYPE_V4DI_V4DI_INT
:
9474 case V8HI_FTYPE_V8HI_V8HI_INT
:
9475 case V8SI_FTYPE_V8SI_V8SI_INT
:
9476 case V8SI_FTYPE_V8SI_V4SI_INT
:
9477 case V8SF_FTYPE_V8SF_V8SF_INT
:
9478 case V8SF_FTYPE_V8SF_V4SF_INT
:
9479 case V4SI_FTYPE_V4SI_V4SI_INT
:
9480 case V4DF_FTYPE_V4DF_V4DF_INT
:
9481 case V16SF_FTYPE_V16SF_V16SF_INT
:
9482 case V16SF_FTYPE_V16SF_V4SF_INT
:
9483 case V16SI_FTYPE_V16SI_V4SI_INT
:
9484 case V4DF_FTYPE_V4DF_V2DF_INT
:
9485 case V4SF_FTYPE_V4SF_V4SF_INT
:
9486 case V2DI_FTYPE_V2DI_V2DI_INT
:
9487 case V4DI_FTYPE_V4DI_V2DI_INT
:
9488 case V2DF_FTYPE_V2DF_V2DF_INT
:
9489 case UQI_FTYPE_V8DI_V8UDI_INT
:
9490 case UQI_FTYPE_V8DF_V8DF_INT
:
9491 case UQI_FTYPE_V2DF_V2DF_INT
:
9492 case UQI_FTYPE_V4SF_V4SF_INT
:
9493 case UHI_FTYPE_V16SI_V16SI_INT
:
9494 case UHI_FTYPE_V16SF_V16SF_INT
:
9495 case V64QI_FTYPE_V64QI_V64QI_INT
:
9496 case V32HI_FTYPE_V32HI_V32HI_INT
:
9497 case V16SI_FTYPE_V16SI_V16SI_INT
:
9498 case V8DI_FTYPE_V8DI_V8DI_INT
:
9502 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
9507 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
9512 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
9517 case V2DI_FTYPE_V2DI_UINT_UINT
:
9521 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
9526 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
9532 case QI_FTYPE_V8DF_INT_UQI
:
9533 case QI_FTYPE_V4DF_INT_UQI
:
9534 case QI_FTYPE_V2DF_INT_UQI
:
9535 case HI_FTYPE_V16SF_INT_UHI
:
9536 case QI_FTYPE_V8SF_INT_UQI
:
9537 case QI_FTYPE_V4SF_INT_UQI
:
9538 case V4SI_FTYPE_V4SI_V4SI_UHI
:
9539 case V8SI_FTYPE_V8SI_V8SI_UHI
:
9544 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
9550 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
9556 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
9557 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
9558 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
9559 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
9560 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
9561 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
9562 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
9563 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
9564 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
9565 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
9566 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
9567 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
9568 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
9569 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
9570 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
9571 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
9572 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
9573 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
9574 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
9575 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
9576 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
9577 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
9578 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
9579 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
9580 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
9581 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
9582 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
9583 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
9584 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
9585 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
9586 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
9587 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
9588 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
9589 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
9590 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
9591 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
9592 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
9593 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
9594 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
9595 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
9596 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
9597 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
9598 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
9599 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
9600 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
9601 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
9602 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
9603 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
9604 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
9605 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
9606 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
9607 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI
:
9608 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI
:
9609 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI
:
9612 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
9613 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
9614 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
9615 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
9616 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
9620 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
9621 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
9622 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
9623 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
9624 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
9625 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
9626 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
9627 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
9628 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
9629 case USI_FTYPE_V32QI_V32QI_INT_USI
:
9630 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
9631 case USI_FTYPE_V32HI_V32HI_INT_USI
:
9632 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
9633 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
9638 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
9642 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
9643 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
9644 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI
:
9645 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI
:
9646 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI
:
9649 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
9650 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
9655 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
9656 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
9657 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
9658 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
9659 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
9660 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
9661 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
9662 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
9663 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
9664 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
9665 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
9666 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
9667 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
9668 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
9669 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
9670 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
9671 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
9672 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
9673 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
9674 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
9675 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
9676 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
9677 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
9678 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
9679 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
9680 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
9681 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
9682 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
9683 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
9684 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
9689 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
9690 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
9691 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
9692 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
9693 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
9694 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
9695 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
9696 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
9697 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
9698 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
9699 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
9700 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
9701 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
9702 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
9703 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
9704 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
9705 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
9706 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
9707 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
9708 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
9709 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
9710 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
9711 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
9712 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
9713 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
9714 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
9715 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
9720 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
9721 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
9722 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
9723 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
9724 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
9725 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
9726 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
9727 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
9728 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
9729 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
9734 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
9735 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
9736 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
9737 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
9738 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
9739 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
9740 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
9741 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
9742 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
9743 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
9744 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
9745 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (comparison != UNKNOWN)
    {
      gcc_assert (nargs == 2);
      return ix86_expand_sse_compare (d, exp, target, swap);
    }

  if (rmode == VOIDmode || rmode == tmode)
    {
      if (optimize
	  || !target
	  || GET_MODE (target) != tmode
	  || !insn_p->operand[0].predicate (target, tmode))
	target = gen_reg_rtx (tmode);
      else if (memory_operand (target, tmode))
	num_memory++;
      real_target = target;
    }
  else
    {
      real_target = gen_reg_rtx (tmode);
      target = lowpart_subreg (rmode, real_target, tmode);
    }

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (second_arg_count && i == 1)
	{
	  /* SIMD shift insns take either an 8-bit immediate or
	     register as count.  But builtin functions take int as
	     count.  If count doesn't match, we put it in register.
	     The instructions are using 64-bit count, if op is just
	     32-bit, zero-extend it, as negative shift counts
	     are undefined behavior and zero-extension is more
	     efficient.  */
	  if (!match)
	    {
	      if (SCALAR_INT_MODE_P (GET_MODE (op)))
		op = convert_modes (mode, GET_MODE (op), op, 1);
	      else
		op = lowpart_subreg (mode, op, GET_MODE (op));
	      if (!insn_p->operand[i + 1].predicate (op, mode))
		op = copy_to_reg (op);
	    }
	}
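      /* Illustrative example (assumed, not taken from the original sources):
	 a shift builtin such as __builtin_ia32_pslldi128 (v, n) passes its
	 count as a plain int, while the shift insn wants an immediate or a
	 wider register count; a 32-bit variable count is therefore
	 zero-extended here and, if it still fails the operand predicate,
	 copied into a register.  */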
      else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
	       (!mask_pos && (nargs - i) <= nargs_constant))
	{
9812 case CODE_FOR_avx_vinsertf128v4di
:
9813 case CODE_FOR_avx_vextractf128v4di
:
9814 error ("the last argument must be an 1-bit immediate");
9817 case CODE_FOR_avx512f_cmpv8di3_mask
:
9818 case CODE_FOR_avx512f_cmpv16si3_mask
:
9819 case CODE_FOR_avx512f_ucmpv8di3_mask
:
9820 case CODE_FOR_avx512f_ucmpv16si3_mask
:
9821 case CODE_FOR_avx512vl_cmpv4di3_mask
:
9822 case CODE_FOR_avx512vl_cmpv8si3_mask
:
9823 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
9824 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
9825 case CODE_FOR_avx512vl_cmpv2di3_mask
:
9826 case CODE_FOR_avx512vl_cmpv4si3_mask
:
9827 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
9828 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
9829 error ("the last argument must be a 3-bit immediate");
9832 case CODE_FOR_sse4_1_roundsd
:
9833 case CODE_FOR_sse4_1_roundss
:
9835 case CODE_FOR_sse4_1_roundpd
:
9836 case CODE_FOR_sse4_1_roundps
:
9837 case CODE_FOR_avx_roundpd256
:
9838 case CODE_FOR_avx_roundps256
:
9840 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
9841 case CODE_FOR_sse4_1_roundps_sfix
:
9842 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
9843 case CODE_FOR_avx_roundps_sfix256
:
9845 case CODE_FOR_sse4_1_blendps
:
9846 case CODE_FOR_avx_blendpd256
:
9847 case CODE_FOR_avx_vpermilv4df
:
9848 case CODE_FOR_avx_vpermilv4df_mask
:
9849 case CODE_FOR_avx512f_getmantv8df_mask
:
9850 case CODE_FOR_avx512f_getmantv16sf_mask
:
9851 case CODE_FOR_avx512vl_getmantv8sf_mask
:
9852 case CODE_FOR_avx512vl_getmantv4df_mask
:
9853 case CODE_FOR_avx512vl_getmantv4sf_mask
:
9854 case CODE_FOR_avx512vl_getmantv2df_mask
:
9855 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
9856 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
9857 case CODE_FOR_avx512dq_rangepv4df_mask
:
9858 case CODE_FOR_avx512dq_rangepv8sf_mask
:
9859 case CODE_FOR_avx512dq_rangepv2df_mask
:
9860 case CODE_FOR_avx512dq_rangepv4sf_mask
:
9861 case CODE_FOR_avx_shufpd256_mask
:
9862 error ("the last argument must be a 4-bit immediate");
9865 case CODE_FOR_sha1rnds4
:
9866 case CODE_FOR_sse4_1_blendpd
:
9867 case CODE_FOR_avx_vpermilv2df
:
9868 case CODE_FOR_avx_vpermilv2df_mask
:
9869 case CODE_FOR_xop_vpermil2v2df3
:
9870 case CODE_FOR_xop_vpermil2v4sf3
:
9871 case CODE_FOR_xop_vpermil2v4df3
:
9872 case CODE_FOR_xop_vpermil2v8sf3
:
9873 case CODE_FOR_avx512f_vinsertf32x4_mask
:
9874 case CODE_FOR_avx512f_vinserti32x4_mask
:
9875 case CODE_FOR_avx512f_vextractf32x4_mask
:
9876 case CODE_FOR_avx512f_vextracti32x4_mask
:
9877 case CODE_FOR_sse2_shufpd
:
9878 case CODE_FOR_sse2_shufpd_mask
:
9879 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
9880 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
9881 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
9882 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
9883 error ("the last argument must be a 2-bit immediate");
9886 case CODE_FOR_avx_vextractf128v4df
:
9887 case CODE_FOR_avx_vextractf128v8sf
:
9888 case CODE_FOR_avx_vextractf128v8si
:
9889 case CODE_FOR_avx_vinsertf128v4df
:
9890 case CODE_FOR_avx_vinsertf128v8sf
:
9891 case CODE_FOR_avx_vinsertf128v8si
:
9892 case CODE_FOR_avx512f_vinsertf64x4_mask
:
9893 case CODE_FOR_avx512f_vinserti64x4_mask
:
9894 case CODE_FOR_avx512f_vextractf64x4_mask
:
9895 case CODE_FOR_avx512f_vextracti64x4_mask
:
9896 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
9897 case CODE_FOR_avx512dq_vinserti32x8_mask
:
9898 case CODE_FOR_avx512vl_vinsertv4df
:
9899 case CODE_FOR_avx512vl_vinsertv4di
:
9900 case CODE_FOR_avx512vl_vinsertv8sf
:
9901 case CODE_FOR_avx512vl_vinsertv8si
:
9902 error ("the last argument must be a 1-bit immediate");
9905 case CODE_FOR_avx_vmcmpv2df3
:
9906 case CODE_FOR_avx_vmcmpv4sf3
:
9907 case CODE_FOR_avx_cmpv2df3
:
9908 case CODE_FOR_avx_cmpv4sf3
:
9909 case CODE_FOR_avx_cmpv4df3
:
9910 case CODE_FOR_avx_cmpv8sf3
:
9911 case CODE_FOR_avx512f_cmpv8df3_mask
:
9912 case CODE_FOR_avx512f_cmpv16sf3_mask
:
9913 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
9914 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
9915 error ("the last argument must be a 5-bit immediate");
	    default:
	      switch (nargs_constant)
		{
		case 2:
		  if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
		      (!mask_pos && (nargs - i) == nargs_constant))
		    {
		      error ("the next to last argument must be an 8-bit immediate");
		      break;
		    }
		  /* FALLTHRU */
		case 1:
		  error ("the last argument must be an 8-bit immediate");
		  break;
		}
	      return const0_rtx;
	    }
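	  /* Hypothetical usage note (not from the original sources): a
	     builtin whose last operand is modeled as an immediate, for
	     example __builtin_ia32_palignr128, reaches one of these
	     diagnostics when it is called with a non-constant last
	     argument, since no insn form accepts a register there.  */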
	}
      else
	{
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  /* If we aren't optimizing, only allow one memory operand to
	     be generated.  */
	  if (memory_operand (op, mode))
	    num_memory++;

	  op = fixup_modeless_constant (op, mode);

	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    {
	      if (optimize || !match || num_memory > 1)
		op = copy_to_mode_reg (mode, op);
	    }
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i] = op;
    }
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (real_target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
			     xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
			     xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
			     xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}
/* Transform pattern of following layout:
     (set A
       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into:
     (set (A B))  */

static rtx
ix86_erase_embedded_rounding (rtx pat)
{
  if (GET_CODE (pat) == INSN)
    pat = PATTERN (pat);

  gcc_assert (GET_CODE (pat) == SET);
  rtx src = SET_SRC (pat);
  gcc_assert (XVECLEN (src, 0) == 2);
  rtx p0 = XVECEXP (src, 0, 0);
  gcc_assert (GET_CODE (src) == UNSPEC
	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
  rtx res = gen_rtx_SET (SET_DEST (pat), p0);
  return res;
}
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with rounding.  */

static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  /* See avxintrin.h for values.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false,
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false
    };
  static const bool non_signalings[32] =
    {
      true, false, false, true, true, false, false, true,
      true, false, false, true, true, false, false, true,
      false, true, true, false, false, true, true, false,
      false, true, true, false, false, true, true, false
    };
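  /* For illustration (derived from the table layout above, not stated in
     the original sources): the index is the _CMP_* predicate value from
     avxintrin.h, e.g. index 0 (_CMP_EQ_OQ) maps to EQ/ordered/non-signaling
     and index 1 (_CMP_LT_OS) to LT/ordered/signaling; entries 16-31 reuse
     the same comparison codes but with the opposite signaling behavior.  */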
  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
10094 switch (comparison
)
10099 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10100 if (!non_signaling
)
10106 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10116 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10123 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10124 if (!non_signaling
)
    case LE:		/* -> GE  */
    case LT:		/* -> GT  */
    case UNGE:		/* -> UNLE  */
    case UNGT:		/* -> UNLT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GE:
    case GT:
    case UNLE:
    case UNLT:
      /* These are supported by CCFPmode.  NB: Use ordered/signaling
	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
	 with NAN operands.  */
      if (ordered == non_signaling)
	ordered = !ordered;
      break;
    case EQ:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_EQ_OQ/_CMP_EQ_OS.  */
      check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_NEQ_UQ/_CMP_NEQ_US.  */
      gcc_assert (!ordered);
      check_unordered = true;
      mode = CCZmode;
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);
  /* 1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.  */
  if (!ordered)
    icode = (icode == CODE_FOR_sse_comi_round
	     ? CODE_FOR_sse_ucomi_round
	     : CODE_FOR_sse2_ucomi_round);
  pat = GEN_FCN (icode) (op0, op1, op3);
  if (! pat)
    return 0;

  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      pat = ix86_erase_embedded_rounding (pat);
      if (! pat)
	return 0;

      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }

  emit_insn (pat);

  rtx_code_label *label = NULL;
  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst,
					  const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}
static rtx
ix86_expand_round_builtin (const struct builtin_description *d,
			   tree exp, rtx target)
{
  rtx pat;
  unsigned int i, nargs;
  rtx xops[6];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  unsigned int nargs_constant = 0;
  unsigned int redundant_embed_rnd = 0;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
10262 case UINT64_FTYPE_V2DF_INT
:
10263 case UINT64_FTYPE_V4SF_INT
:
10264 case UINT_FTYPE_V2DF_INT
:
10265 case UINT_FTYPE_V4SF_INT
:
10266 case INT64_FTYPE_V2DF_INT
:
10267 case INT64_FTYPE_V4SF_INT
:
10268 case INT_FTYPE_V2DF_INT
:
10269 case INT_FTYPE_V4SF_INT
:
10272 case V4SF_FTYPE_V4SF_UINT_INT
:
10273 case V4SF_FTYPE_V4SF_UINT64_INT
:
10274 case V2DF_FTYPE_V2DF_UINT64_INT
:
10275 case V4SF_FTYPE_V4SF_INT_INT
:
10276 case V4SF_FTYPE_V4SF_INT64_INT
:
10277 case V2DF_FTYPE_V2DF_INT64_INT
:
10278 case V4SF_FTYPE_V4SF_V4SF_INT
:
10279 case V2DF_FTYPE_V2DF_V2DF_INT
:
10280 case V4SF_FTYPE_V4SF_V2DF_INT
:
10281 case V2DF_FTYPE_V2DF_V4SF_INT
:
10284 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
10285 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
10286 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
10287 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
10288 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
10289 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
10290 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
10291 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
10292 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
10293 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
10294 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
10295 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
10296 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
10297 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
10300 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
10301 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
10302 nargs_constant
= 2;
10305 case INT_FTYPE_V4SF_V4SF_INT_INT
:
10306 case INT_FTYPE_V2DF_V2DF_INT_INT
:
10307 return ix86_expand_sse_comi_round (d
, exp
, target
);
10308 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
10309 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
10310 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
10311 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
10312 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
10313 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
10314 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT
:
10315 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
10316 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
10317 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT
:
10320 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
10321 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
10322 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT
:
10323 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT
:
10324 nargs_constant
= 4;
10327 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
10328 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
10329 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
10330 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
10331 nargs_constant
= 3;
10334 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
10335 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
10336 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
10337 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
10338 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
10339 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
10341 nargs_constant
= 4;
10343 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
10344 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
10345 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
10346 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
10348 nargs_constant
= 3;
      break;
    default:
      gcc_unreachable ();
    }

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (optimize
      || !target
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (i == nargs - nargs_constant)
	{
	  if (!match)
	    switch (icode)
	      {
	      case CODE_FOR_avx512f_getmantv8df_mask_round:
	      case CODE_FOR_avx512f_getmantv16sf_mask_round:
	      case CODE_FOR_avx512f_vgetmantv2df_round:
	      case CODE_FOR_avx512f_vgetmantv2df_mask_round:
	      case CODE_FOR_avx512f_vgetmantv4sf_round:
	      case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
		error ("the immediate argument must be a 4-bit immediate");
		return const0_rtx;
	      case CODE_FOR_avx512f_cmpv8df3_mask_round:
	      case CODE_FOR_avx512f_cmpv16sf3_mask_round:
	      case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
	      case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
		error ("the immediate argument must be a 5-bit immediate");
		return const0_rtx;
	      default:
		error ("the immediate argument must be an 8-bit immediate");
		return const0_rtx;
	      }
	}
      else if (i == nargs-1)
	{
	  if (!insn_p->operand[nargs].predicate (op, SImode))
	    {
	      error ("incorrect rounding operand");
	      return const0_rtx;
	    }

	  /* If there is no rounding use normal version of the pattern.  */
	  if (INTVAL (op) == NO_ROUND)
	    redundant_embed_rnd = 1;
	}
      else
	{
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    {
	      if (optimize || !match)
		op = copy_to_mode_reg (mode, op);
	    }
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i] = op;
    }
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);

  emit_insn (pat);
  return target;
}
/* Subroutine of ix86_expand_builtin to take care of special insns
   with variable number of operands.  */

static rtx
ix86_expand_special_args_builtin (const struct builtin_description *d,
				  tree exp, rtx target)
{
  tree arg;
  rtx pat, op;
  unsigned int i, nargs, arg_adjust, memory;
  bool aligned_mem = false;
  rtx xops[3];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  enum { load, store } klass;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
10484 case VOID_FTYPE_VOID
:
10485 emit_insn (GEN_FCN (icode
) (target
));
10487 case VOID_FTYPE_UINT64
:
10488 case VOID_FTYPE_UNSIGNED
:
10494 case INT_FTYPE_VOID
:
10495 case USHORT_FTYPE_VOID
:
10496 case UINT64_FTYPE_VOID
:
10497 case UINT_FTYPE_VOID
:
10498 case UINT8_FTYPE_VOID
:
10499 case UNSIGNED_FTYPE_VOID
:
10504 case UINT64_FTYPE_PUNSIGNED
:
10505 case V2DI_FTYPE_PV2DI
:
10506 case V4DI_FTYPE_PV4DI
:
10507 case V32QI_FTYPE_PCCHAR
:
10508 case V16QI_FTYPE_PCCHAR
:
10509 case V8SF_FTYPE_PCV4SF
:
10510 case V8SF_FTYPE_PCFLOAT
:
10511 case V4SF_FTYPE_PCFLOAT
:
10512 case V4DF_FTYPE_PCV2DF
:
10513 case V4DF_FTYPE_PCDOUBLE
:
10514 case V2DF_FTYPE_PCDOUBLE
:
10515 case VOID_FTYPE_PVOID
:
10516 case V8DI_FTYPE_PV8DI
:
10522 case CODE_FOR_sse4_1_movntdqa
:
10523 case CODE_FOR_avx2_movntdqa
:
10524 case CODE_FOR_avx512f_movntdqa
:
10525 aligned_mem
= true;
10531 case VOID_FTYPE_PV2SF_V4SF
:
10532 case VOID_FTYPE_PV8DI_V8DI
:
10533 case VOID_FTYPE_PV4DI_V4DI
:
10534 case VOID_FTYPE_PV2DI_V2DI
:
10535 case VOID_FTYPE_PCHAR_V32QI
:
10536 case VOID_FTYPE_PCHAR_V16QI
:
10537 case VOID_FTYPE_PFLOAT_V16SF
:
10538 case VOID_FTYPE_PFLOAT_V8SF
:
10539 case VOID_FTYPE_PFLOAT_V4SF
:
10540 case VOID_FTYPE_PDOUBLE_V8DF
:
10541 case VOID_FTYPE_PDOUBLE_V4DF
:
10542 case VOID_FTYPE_PDOUBLE_V2DF
:
10543 case VOID_FTYPE_PLONGLONG_LONGLONG
:
10544 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
10545 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
10546 case VOID_FTYPE_PINT_INT
:
10549 /* Reserve memory operand for target. */
10550 memory
= ARRAY_SIZE (xops
);
10553 /* These builtins and instructions require the memory
10554 to be properly aligned. */
10555 case CODE_FOR_avx_movntv4di
:
10556 case CODE_FOR_sse2_movntv2di
:
10557 case CODE_FOR_avx_movntv8sf
:
10558 case CODE_FOR_sse_movntv4sf
:
10559 case CODE_FOR_sse4a_vmmovntv4sf
:
10560 case CODE_FOR_avx_movntv4df
:
10561 case CODE_FOR_sse2_movntv2df
:
10562 case CODE_FOR_sse4a_vmmovntv2df
:
10563 case CODE_FOR_sse2_movntidi
:
10564 case CODE_FOR_sse_movntq
:
10565 case CODE_FOR_sse2_movntisi
:
10566 case CODE_FOR_avx512f_movntv16sf
:
10567 case CODE_FOR_avx512f_movntv8df
:
10568 case CODE_FOR_avx512f_movntv8di
:
10569 aligned_mem
= true;
10575 case VOID_FTYPE_PVOID_PCVOID
:
10581 case V4SF_FTYPE_V4SF_PCV2SF
:
10582 case V2DF_FTYPE_V2DF_PCDOUBLE
:
10587 case V8SF_FTYPE_PCV8SF_V8SI
:
10588 case V4DF_FTYPE_PCV4DF_V4DI
:
10589 case V4SF_FTYPE_PCV4SF_V4SI
:
10590 case V2DF_FTYPE_PCV2DF_V2DI
:
10591 case V8SI_FTYPE_PCV8SI_V8SI
:
10592 case V4DI_FTYPE_PCV4DI_V4DI
:
10593 case V4SI_FTYPE_PCV4SI_V4SI
:
10594 case V2DI_FTYPE_PCV2DI_V2DI
:
10595 case VOID_FTYPE_INT_INT64
:
10600 case VOID_FTYPE_PV8DF_V8DF_UQI
:
10601 case VOID_FTYPE_PV4DF_V4DF_UQI
:
10602 case VOID_FTYPE_PV2DF_V2DF_UQI
:
10603 case VOID_FTYPE_PV16SF_V16SF_UHI
:
10604 case VOID_FTYPE_PV8SF_V8SF_UQI
:
10605 case VOID_FTYPE_PV4SF_V4SF_UQI
:
10606 case VOID_FTYPE_PV8DI_V8DI_UQI
:
10607 case VOID_FTYPE_PV4DI_V4DI_UQI
:
10608 case VOID_FTYPE_PV2DI_V2DI_UQI
:
10609 case VOID_FTYPE_PV16SI_V16SI_UHI
:
10610 case VOID_FTYPE_PV8SI_V8SI_UQI
:
10611 case VOID_FTYPE_PV4SI_V4SI_UQI
:
10612 case VOID_FTYPE_PV64QI_V64QI_UDI
:
10613 case VOID_FTYPE_PV32HI_V32HI_USI
:
10614 case VOID_FTYPE_PV32QI_V32QI_USI
:
10615 case VOID_FTYPE_PV16QI_V16QI_UHI
:
10616 case VOID_FTYPE_PV16HI_V16HI_UHI
:
10617 case VOID_FTYPE_PV8HI_V8HI_UQI
:
10620 /* These builtins and instructions require the memory
10621 to be properly aligned. */
10622 case CODE_FOR_avx512f_storev16sf_mask
:
10623 case CODE_FOR_avx512f_storev16si_mask
:
10624 case CODE_FOR_avx512f_storev8df_mask
:
10625 case CODE_FOR_avx512f_storev8di_mask
:
10626 case CODE_FOR_avx512vl_storev8sf_mask
:
10627 case CODE_FOR_avx512vl_storev8si_mask
:
10628 case CODE_FOR_avx512vl_storev4df_mask
:
10629 case CODE_FOR_avx512vl_storev4di_mask
:
10630 case CODE_FOR_avx512vl_storev4sf_mask
:
10631 case CODE_FOR_avx512vl_storev4si_mask
:
10632 case CODE_FOR_avx512vl_storev2df_mask
:
10633 case CODE_FOR_avx512vl_storev2di_mask
:
10634 aligned_mem
= true;
10640 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
10641 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
10642 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
10643 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
10644 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
10645 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
10646 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
10647 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
10648 case VOID_FTYPE_PV8SI_V8DI_UQI
:
10649 case VOID_FTYPE_PV8HI_V8DI_UQI
:
10650 case VOID_FTYPE_PV16HI_V16SI_UHI
:
10651 case VOID_FTYPE_PUDI_V8DI_UQI
:
10652 case VOID_FTYPE_PV16QI_V16SI_UHI
:
10653 case VOID_FTYPE_PV4SI_V4DI_UQI
:
10654 case VOID_FTYPE_PUDI_V2DI_UQI
:
10655 case VOID_FTYPE_PUDI_V4DI_UQI
:
10656 case VOID_FTYPE_PUSI_V2DI_UQI
:
10657 case VOID_FTYPE_PV8HI_V8SI_UQI
:
10658 case VOID_FTYPE_PUDI_V4SI_UQI
:
10659 case VOID_FTYPE_PUSI_V4DI_UQI
:
10660 case VOID_FTYPE_PUHI_V2DI_UQI
:
10661 case VOID_FTYPE_PUDI_V8SI_UQI
:
10662 case VOID_FTYPE_PUSI_V4SI_UQI
:
10663 case VOID_FTYPE_PCHAR_V64QI_UDI
:
10664 case VOID_FTYPE_PCHAR_V32QI_USI
:
10665 case VOID_FTYPE_PCHAR_V16QI_UHI
:
10666 case VOID_FTYPE_PSHORT_V32HI_USI
:
10667 case VOID_FTYPE_PSHORT_V16HI_UHI
:
10668 case VOID_FTYPE_PSHORT_V8HI_UQI
:
10669 case VOID_FTYPE_PINT_V16SI_UHI
:
10670 case VOID_FTYPE_PINT_V8SI_UQI
:
10671 case VOID_FTYPE_PINT_V4SI_UQI
:
10672 case VOID_FTYPE_PINT64_V8DI_UQI
:
10673 case VOID_FTYPE_PINT64_V4DI_UQI
:
10674 case VOID_FTYPE_PINT64_V2DI_UQI
:
10675 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
10676 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
10677 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
10678 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
10679 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
10680 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
10681 case VOID_FTYPE_PV32QI_V32HI_USI
:
10682 case VOID_FTYPE_PV16QI_V16HI_UHI
:
10683 case VOID_FTYPE_PUDI_V8HI_UQI
:
10686 /* Reserve memory operand for target. */
10687 memory
= ARRAY_SIZE (xops
);
10689 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
10690 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
10691 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
10692 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
10693 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
10694 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
10695 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
10696 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
10697 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
10698 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
10699 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
10700 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
10701 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
10702 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
10703 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
10704 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
10705 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
10706 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
10709 /* These builtins and instructions require the memory
10710 to be properly aligned. */
10711 case CODE_FOR_avx512f_loadv16sf_mask
:
10712 case CODE_FOR_avx512f_loadv16si_mask
:
10713 case CODE_FOR_avx512f_loadv8df_mask
:
10714 case CODE_FOR_avx512f_loadv8di_mask
:
10715 case CODE_FOR_avx512vl_loadv8sf_mask
:
10716 case CODE_FOR_avx512vl_loadv8si_mask
:
10717 case CODE_FOR_avx512vl_loadv4df_mask
:
10718 case CODE_FOR_avx512vl_loadv4di_mask
:
10719 case CODE_FOR_avx512vl_loadv4sf_mask
:
10720 case CODE_FOR_avx512vl_loadv4si_mask
:
10721 case CODE_FOR_avx512vl_loadv2df_mask
:
10722 case CODE_FOR_avx512vl_loadv2di_mask
:
10723 case CODE_FOR_avx512bw_loadv64qi_mask
:
10724 case CODE_FOR_avx512vl_loadv32qi_mask
:
10725 case CODE_FOR_avx512vl_loadv16qi_mask
:
10726 case CODE_FOR_avx512bw_loadv32hi_mask
:
10727 case CODE_FOR_avx512vl_loadv16hi_mask
:
10728 case CODE_FOR_avx512vl_loadv8hi_mask
:
10729 aligned_mem
= true;
10735 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
10736 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
10737 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
10738 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
10739 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
10740 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
10741 case V16SI_FTYPE_PCINT_V16SI_UHI
:
10742 case V8SI_FTYPE_PCINT_V8SI_UQI
:
10743 case V4SI_FTYPE_PCINT_V4SI_UQI
:
10744 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
10745 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
10746 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
10747 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
10748 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
10749 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
10750 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
10751 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
10752 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
10758 gcc_unreachable ();
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (klass == store)
    {
      arg = CALL_EXPR_ARG (exp, 0);
      op = expand_normal (arg);
      gcc_assert (target == 0);
      if (memory)
	{
	  op = ix86_zero_extend_to_Pmode (op);
	  target = gen_rtx_MEM (tmode, op);
	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
	    align = GET_MODE_ALIGNMENT (tmode);
	  if (MEM_ALIGN (target) < align)
	    set_mem_align (target, align);
	}
      else
	target = force_reg (tmode, op);
      arg_adjust = 1;
    }
  else
    {
      arg_adjust = 0;
      if (optimize
	  || !target
	  || !register_operand (target, tmode)
	  || GET_MODE (target) != tmode)
	target = gen_reg_rtx (tmode);
    }
  for (i = 0; i < nargs; i++)
    {
      machine_mode mode = insn_p->operand[i + 1].mode;

      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
      op = expand_normal (arg);

      if (i == memory)
	{
	  /* This must be the memory operand.  */
	  op = ix86_zero_extend_to_Pmode (op);
	  op = gen_rtx_MEM (mode, op);
	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
	    align = GET_MODE_ALIGNMENT (mode);
	  if (MEM_ALIGN (op) < align)
	    set_mem_align (op, align);
	}
      else
	{
	  /* This must be register.  */
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  /* NB: 3-operands load implied it's a mask load,
	     and that mask operand should be at the end.
	     Keep all-ones mask which would be simplified by the expander.  */
	  if (nargs == 3 && i == 2 && klass == load
	      && constm1_operand (op, mode))
	    ;
	  else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    op = copy_to_mode_reg (mode, op);
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i] = op;
    }
  switch (nargs)
    {
    case 0:
      pat = GEN_FCN (icode) (target);
      break;
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return klass == store ? 0 : target;
}
/* Return the integer constant in ARG.  Constrain it to be in the range
   of the subparts of VEC_TYPE; issue an error if not.  */

static unsigned HOST_WIDE_INT
get_element_number (tree vec_type, tree arg)
{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;

  if (!tree_fits_uhwi_p (arg)
      || (elt = tree_to_uhwi (arg), elt > max))
    {
      error ("selector must be an integer constant in the range "
	     "[0, %wu]", max);
      return 0;
    }

  return elt;
}
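/* Usage sketch (illustrative, not from the original sources): for a vector
   type with four subparts, e.g. __v4sf, MAX is 3, so selectors 0 through 3
   are returned unchanged while a selector of 4 or more triggers the
   diagnostic above and the function falls back to element 0.  */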
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that give us license to emit
   these sorts of instructions.  */

static rtx
ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
{
  machine_mode tmode = TYPE_MODE (type);
  machine_mode inner_mode = GET_MODE_INNER (tmode);
  int i, n_elt = GET_MODE_NUNITS (tmode);
  rtvec v = rtvec_alloc (n_elt);

  gcc_assert (VECTOR_MODE_P (tmode));
  gcc_assert (call_expr_nargs (exp) == n_elt);

  for (i = 0; i < n_elt; ++i)
    {
      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
    }

  if (!target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
  return target;
}
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   had a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  machine_mode tmode, mode0;
  tree arg0, arg1;
  rtx op0;
  unsigned HOST_WIDE_INT elt;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);

  return target;
}
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_set_builtin (tree exp)
{
  machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  unsigned HOST_WIDE_INT elt;
  rtx op0, op1, target;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.  */
  target = gen_reg_rtx (tmode);
  emit_move_insn (target, op0);
  ix86_expand_vector_set (true, target, op1, elt);

  return target;
}
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

rtx
ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
		     machine_mode mode, int ignore)
{
  enum insn_code icode, icode2;
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  tree arg0, arg1, arg2, arg3, arg4;
  rtx op0, op1, op2, op3, op4, pat, pat2, insn;
  machine_mode mode0, mode1, mode2, mode3, mode4;
  unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);

  /* For CPU builtins that can be folded, fold first and expand the fold.  */
  switch (fcode)
    {
    case IX86_BUILTIN_CPU_INIT:
      {
	/* Make it call __cpu_indicator_init in libgcc.  */
	tree call_expr, fndecl, type;
	type = build_function_type_list (integer_type_node, NULL_TREE);
	fndecl = build_fn_decl ("__cpu_indicator_init", type);
	call_expr = build_call_expr (fndecl, 0);
	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
      }
    case IX86_BUILTIN_CPU_IS:
    case IX86_BUILTIN_CPU_SUPPORTS:
      {
	tree arg0 = CALL_EXPR_ARG (exp, 0);
	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
	gcc_assert (fold_expr != NULL_TREE);
	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
      }
    default:
      break;
    }
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;

  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXVNNI
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.
     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
  if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
       == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
      && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
    isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);

  if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
       == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
      && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
    isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);

  if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
       == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
      && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
    isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);

  if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	== (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
       || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
      && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	   == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	  || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
    {
      isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
      isa2 |= OPTION_MASK_ISA2_AVXVNNI;
    }

  if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
      /* __builtin_ia32_maskmovq requires MMX registers.  */
      && fcode != IX86_BUILTIN_MASKMOVQ)
    {
      bisa &= ~OPTION_MASK_ISA_MMX;
      bisa |= OPTION_MASK_ISA_SSE2;
    }

  if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
    {
      bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
      if (TARGET_ABI_X32)
	bisa |= OPTION_MASK_ABI_X32;
      else
	bisa |= OPTION_MASK_ABI_64;
      char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
				       (enum fpmath_unit) 0,
				       (enum prefer_vector_width) 0,
				       false, add_abi_p);
      if (!opts)
	error ("%qE needs unknown isa option", fndecl);
      else
	{
	  gcc_assert (opts != NULL);
	  error ("%qE needs isa option %s", fndecl, opts);
	  free (opts);
	}
      return expand_call (exp, target, ignore);
    }
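/* For illustration only (not part of GCC): each pair exception above boils
   down to "if the builtin asks for both members of a pair, any one enabled
   member satisfies the requirement".  A minimal standalone sketch of that
   bitmask test, with hypothetical FLAG_A/FLAG_B masks standing in for the
   OPTION_MASK_ISA_* values:

     #include <stdbool.h>
     #include <stdint.h>

     #define FLAG_A (1u << 0)   // e.g. OPTION_MASK_ISA_FMA
     #define FLAG_B (1u << 1)   // e.g. OPTION_MASK_ISA_FMA4

     static bool
     pair_satisfied (uint64_t required, uint64_t enabled)
     {
       // Builtin wants both members of the pair, and at least one is on.
       return (required & (FLAG_A | FLAG_B)) == (FLAG_A | FLAG_B)
	      && (enabled & (FLAG_A | FLAG_B)) != 0;
     }
*/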
  switch (fcode)
    {
    case IX86_BUILTIN_MASKMOVQ:
    case IX86_BUILTIN_MASKMOVDQU:
      icode = (fcode == IX86_BUILTIN_MASKMOVQ
	       ? CODE_FOR_mmx_maskmovq
	       : CODE_FOR_sse2_maskmovdqu);
      /* Note the arg order is different from the operand order.  */
      arg1 = CALL_EXPR_ARG (exp, 0);
      arg2 = CALL_EXPR_ARG (exp, 1);
      arg0 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;

      op0 = ix86_zero_extend_to_Pmode (op0);
      op0 = gen_rtx_MEM (mode1, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);
      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      pat = GEN_FCN (icode) (op0, op1, op2);
      if (!pat)
	return 0;
      emit_insn (pat);
      return 0;
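/* For illustration only (not part of GCC): the arg/operand reordering above
   is visible from the intrinsic, where the destination address is the last
   builtin argument but the first insn operand.  A minimal user-level sketch,
   assuming the usual <xmmintrin.h> intrinsic name:

     #include <xmmintrin.h>

     void
     masked_store (__m64 data, __m64 mask, char *p)
     {
       // Bytes of DATA whose MASK byte has the high bit set are stored at P.
       _mm_maskmove_si64 (data, mask, p);
     }
*/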
    case IX86_BUILTIN_LDMXCSR:
      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_move_insn (target, op0);
      emit_insn (gen_sse_ldmxcsr (target));
      return 0;

    case IX86_BUILTIN_STMXCSR:
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_insn (gen_sse_stmxcsr (target));
      return copy_to_mode_reg (SImode, target);
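/* For illustration only (not part of GCC): these two expanders go through a
   stack temporary because the LDMXCSR/STMXCSR insns only take a memory
   operand.  A minimal user-level round trip, assuming the usual
   <xmmintrin.h> intrinsic names:

     #include <xmmintrin.h>

     void
     enable_ftz (void)
     {
       unsigned int csr = _mm_getcsr ();   // expands via STMXCSR
       _mm_setcsr (csr | 0x8000);          // set flush-to-zero; expands via LDMXCSR
     }
*/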
    case IX86_BUILTIN_CLFLUSH:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_sse2_clflush;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_sse2_clflush (op0));
      return 0;

    case IX86_BUILTIN_CLWB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clwb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clwb (op0));
      return 0;

    case IX86_BUILTIN_CLFLUSHOPT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clflushopt;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clflushopt (op0));
      return 0;
    case IX86_BUILTIN_MONITOR:
    case IX86_BUILTIN_MONITORX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);

      emit_insn (fcode == IX86_BUILTIN_MONITOR
		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
		 : gen_monitorx (Pmode, op0, op1, op2));
      return 0;

    case IX86_BUILTIN_MWAIT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      emit_insn (gen_sse3_mwait (op0, op1));
      return 0;

    case IX86_BUILTIN_MWAITX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);
      emit_insn (gen_mwaitx (op0, op1, op2));
      return 0;
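/* For illustration only (not part of GCC): MONITOR arms an address range and
   MWAIT then idles until that range is written (or an interrupt arrives),
   which is how these two builtins are normally paired.  A minimal user-level
   sketch, assuming the usual <pmmintrin.h> intrinsic names:

     #include <pmmintrin.h>

     void
     wait_for_store (volatile int *flag)
     {
       _mm_monitor ((const void *) flag, 0, 0);  // arm the monitor on *FLAG
       if (*flag == 0)
	 _mm_mwait (0, 0);                       // sleep until *FLAG is written
     }
*/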
    case IX86_BUILTIN_UMONITOR:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_umonitor (Pmode, op0));
      return 0;
    case IX86_BUILTIN_UMWAIT:
    case IX86_BUILTIN_TPAUSE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait_rex64;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause_rex64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (!pat)
	return 0;

      emit_insn (pat);

      if (!target
	  || !register_operand (target, QImode))
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));
      return target;
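/* For illustration only (not part of GCC): UMWAIT/TPAUSE take a 64-bit TSC
   deadline in EDX:EAX, which is why the expander above shifts the DImode
   counter right by 32 and passes the two SImode halves separately.  The same
   split, sketched as plain C on hypothetical variables:

     #include <stdint.h>

     void
     split_deadline (uint64_t deadline, uint32_t *lo, uint32_t *hi)
     {
       *lo = (uint32_t) deadline;          // goes in EAX
       *hi = (uint32_t) (deadline >> 32);  // goes in EDX
     }
*/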
11301 case IX86_BUILTIN_TESTUI
:
11302 emit_insn (gen_testui ());
11305 || !register_operand (target
, QImode
))
11306 target
= gen_reg_rtx (QImode
);
11308 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
11310 emit_insn (gen_rtx_SET (target
, pat
));
11314 case IX86_BUILTIN_CLZERO
:
11315 arg0
= CALL_EXPR_ARG (exp
, 0);
11316 op0
= expand_normal (arg0
);
11318 op0
= ix86_zero_extend_to_Pmode (op0
);
11319 emit_insn (gen_clzero (Pmode
, op0
));
11322 case IX86_BUILTIN_CLDEMOTE
:
11323 arg0
= CALL_EXPR_ARG (exp
, 0);
11324 op0
= expand_normal (arg0
);
11325 icode
= CODE_FOR_cldemote
;
11326 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
11327 op0
= ix86_zero_extend_to_Pmode (op0
);
11329 emit_insn (gen_cldemote (op0
));
11332 case IX86_BUILTIN_LOADIWKEY
:
11334 arg0
= CALL_EXPR_ARG (exp
, 0);
11335 arg1
= CALL_EXPR_ARG (exp
, 1);
11336 arg2
= CALL_EXPR_ARG (exp
, 2);
11337 arg3
= CALL_EXPR_ARG (exp
, 3);
11339 op0
= expand_normal (arg0
);
11340 op1
= expand_normal (arg1
);
11341 op2
= expand_normal (arg2
);
11342 op3
= expand_normal (arg3
);
11345 op0
= copy_to_mode_reg (V2DImode
, op0
);
11347 op1
= copy_to_mode_reg (V2DImode
, op1
);
11349 op2
= copy_to_mode_reg (V2DImode
, op2
);
11351 op3
= copy_to_mode_reg (SImode
, op3
);
11353 emit_insn (gen_loadiwkey (op0
, op1
, op2
, op3
));
11358 case IX86_BUILTIN_AESDEC128KLU8
:
11359 icode
= CODE_FOR_aesdec128klu8
;
11360 goto aesdecenc_expand
;
11362 case IX86_BUILTIN_AESDEC256KLU8
:
11363 icode
= CODE_FOR_aesdec256klu8
;
11364 goto aesdecenc_expand
;
11366 case IX86_BUILTIN_AESENC128KLU8
:
11367 icode
= CODE_FOR_aesenc128klu8
;
11368 goto aesdecenc_expand
;
11370 case IX86_BUILTIN_AESENC256KLU8
:
11371 icode
= CODE_FOR_aesenc256klu8
;
11375 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i *odata
11376 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i idata
11377 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
11379 op0
= expand_normal (arg0
);
11380 op1
= expand_normal (arg1
);
11381 op2
= expand_normal (arg2
);
11383 if (!address_operand (op0
, V2DImode
))
11385 op0
= convert_memory_address (Pmode
, op0
);
11386 op0
= copy_addr_to_reg (op0
);
11388 op0
= gen_rtx_MEM (V2DImode
, op0
);
11391 op1
= copy_to_mode_reg (V2DImode
, op1
);
11393 if (!address_operand (op2
, VOIDmode
))
11395 op2
= convert_memory_address (Pmode
, op2
);
11396 op2
= copy_addr_to_reg (op2
);
11398 op2
= gen_rtx_MEM (BLKmode
, op2
);
11400 emit_insn (GEN_FCN (icode
) (op1
, op1
, op2
));
11403 target
= gen_reg_rtx (QImode
);
11405 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCZmode
, FLAGS_REG
),
11407 emit_insn (gen_rtx_SET (target
, pat
));
11409 emit_insn (gen_rtx_SET (op0
, op1
));
11413 case IX86_BUILTIN_AESDECWIDE128KLU8
:
11414 icode
= CODE_FOR_aesdecwide128klu8
;
11415 goto wideaesdecenc_expand
;
11417 case IX86_BUILTIN_AESDECWIDE256KLU8
:
11418 icode
= CODE_FOR_aesdecwide256klu8
;
11419 goto wideaesdecenc_expand
;
11421 case IX86_BUILTIN_AESENCWIDE128KLU8
:
11422 icode
= CODE_FOR_aesencwide128klu8
;
11423 goto wideaesdecenc_expand
;
11425 case IX86_BUILTIN_AESENCWIDE256KLU8
:
11426 icode
= CODE_FOR_aesencwide256klu8
;
11428 wideaesdecenc_expand
:
11433 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i * odata
11434 arg1
= CALL_EXPR_ARG (exp
, 1); // const __m128i * idata
11435 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
11437 op0
= expand_normal (arg0
);
11438 op1
= expand_normal (arg1
);
11439 op2
= expand_normal (arg2
);
11441 if (!address_operand (op2
, VOIDmode
))
11443 op2
= convert_memory_address (Pmode
, op2
);
11444 op2
= copy_addr_to_reg (op2
);
11446 op2
= gen_rtx_MEM (BLKmode
, op2
);
11448 for (i
= 0; i
< 8; i
++)
11450 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
11452 op
= gen_rtx_MEM (V2DImode
,
11453 plus_constant (Pmode
, op1
, (i
* 16)));
11455 emit_move_insn (xmm_regs
[i
], op
);
11458 emit_insn (GEN_FCN (icode
) (op2
));
11461 target
= gen_reg_rtx (QImode
);
11463 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCZmode
, FLAGS_REG
),
11465 emit_insn (gen_rtx_SET (target
, pat
));
11467 for (i
= 0; i
< 8; i
++)
11469 op
= gen_rtx_MEM (V2DImode
,
11470 plus_constant (Pmode
, op0
, (i
* 16)));
11471 emit_move_insn (op
, xmm_regs
[i
]);
11476 case IX86_BUILTIN_ENCODEKEY128U32
:
11478 rtx op
, xmm_regs
[7];
11480 arg0
= CALL_EXPR_ARG (exp
, 0); // unsigned int htype
11481 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i key
11482 arg2
= CALL_EXPR_ARG (exp
, 2); // void *h
11484 op0
= expand_normal (arg0
);
11485 op1
= expand_normal (arg1
);
11486 op2
= expand_normal (arg2
);
11489 op0
= copy_to_mode_reg (SImode
, op0
);
11491 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (0));
11492 emit_move_insn (op
, op1
);
11494 for (i
= 0; i
< 3; i
++)
11495 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
11498 target
= gen_reg_rtx (SImode
);
11500 emit_insn (gen_encodekey128u32 (target
, op0
));
11502 for (i
= 0; i
< 3; i
++)
11504 op
= gen_rtx_MEM (V2DImode
,
11505 plus_constant (Pmode
, op2
, (i
* 16)));
11506 emit_move_insn (op
, xmm_regs
[i
]);
11511 case IX86_BUILTIN_ENCODEKEY256U32
:
11513 rtx op
, xmm_regs
[7];
11515 arg0
= CALL_EXPR_ARG (exp
, 0); // unsigned int htype
11516 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i keylow
11517 arg2
= CALL_EXPR_ARG (exp
, 2); // __m128i keyhi
11518 arg3
= CALL_EXPR_ARG (exp
, 3); // void *h
11520 op0
= expand_normal (arg0
);
11521 op1
= expand_normal (arg1
);
11522 op2
= expand_normal (arg2
);
11523 op3
= expand_normal (arg3
);
11526 op0
= copy_to_mode_reg (SImode
, op0
);
11528 /* Force to use xmm0, xmm1 for keylow, keyhi*/
11529 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (0));
11530 emit_move_insn (op
, op1
);
11531 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (1));
11532 emit_move_insn (op
, op2
);
11534 for (i
= 0; i
< 4; i
++)
11535 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
11538 target
= gen_reg_rtx (SImode
);
11540 emit_insn (gen_encodekey256u32 (target
, op0
));
11542 for (i
= 0; i
< 4; i
++)
11544 op
= gen_rtx_MEM (V2DImode
,
11545 plus_constant (Pmode
, op3
, (i
* 16)));
11546 emit_move_insn (op
, xmm_regs
[i
]);
11552 case IX86_BUILTIN_VEC_INIT_V2SI
:
11553 case IX86_BUILTIN_VEC_INIT_V4HI
:
11554 case IX86_BUILTIN_VEC_INIT_V8QI
:
11555 return ix86_expand_vec_init_builtin (TREE_TYPE (exp
), exp
, target
);
11557 case IX86_BUILTIN_VEC_EXT_V2DF
:
11558 case IX86_BUILTIN_VEC_EXT_V2DI
:
11559 case IX86_BUILTIN_VEC_EXT_V4SF
:
11560 case IX86_BUILTIN_VEC_EXT_V4SI
:
11561 case IX86_BUILTIN_VEC_EXT_V8HI
:
11562 case IX86_BUILTIN_VEC_EXT_V2SI
:
11563 case IX86_BUILTIN_VEC_EXT_V4HI
:
11564 case IX86_BUILTIN_VEC_EXT_V16QI
:
11565 return ix86_expand_vec_ext_builtin (exp
, target
);
11567 case IX86_BUILTIN_VEC_SET_V2DI
:
11568 case IX86_BUILTIN_VEC_SET_V4SF
:
11569 case IX86_BUILTIN_VEC_SET_V4SI
:
11570 case IX86_BUILTIN_VEC_SET_V8HI
:
11571 case IX86_BUILTIN_VEC_SET_V4HI
:
11572 case IX86_BUILTIN_VEC_SET_V16QI
:
11573 return ix86_expand_vec_set_builtin (exp
);
    case IX86_BUILTIN_NANQ:
    case IX86_BUILTIN_NANSQ:
      return expand_call (exp, target, ignore);

    case IX86_BUILTIN_RDPID:

      op0 = gen_reg_rtx (word_mode);

      if (TARGET_64BIT)
	{
	  insn = gen_rdpid_rex64 (op0);
	  op0 = convert_to_mode (SImode, op0, 1);
	}
      else
	insn = gen_rdpid (op0);

      emit_insn (insn);

      if (!target
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_move_insn (target, op0);
      return target;
11600 case IX86_BUILTIN_2INTERSECTD512
:
11601 case IX86_BUILTIN_2INTERSECTQ512
:
11602 case IX86_BUILTIN_2INTERSECTD256
:
11603 case IX86_BUILTIN_2INTERSECTQ256
:
11604 case IX86_BUILTIN_2INTERSECTD128
:
11605 case IX86_BUILTIN_2INTERSECTQ128
:
11606 arg0
= CALL_EXPR_ARG (exp
, 0);
11607 arg1
= CALL_EXPR_ARG (exp
, 1);
11608 arg2
= CALL_EXPR_ARG (exp
, 2);
11609 arg3
= CALL_EXPR_ARG (exp
, 3);
11610 op0
= expand_normal (arg0
);
11611 op1
= expand_normal (arg1
);
11612 op2
= expand_normal (arg2
);
11613 op3
= expand_normal (arg3
);
11615 if (!address_operand (op0
, VOIDmode
))
11617 op0
= convert_memory_address (Pmode
, op0
);
11618 op0
= copy_addr_to_reg (op0
);
11620 if (!address_operand (op1
, VOIDmode
))
11622 op1
= convert_memory_address (Pmode
, op1
);
11623 op1
= copy_addr_to_reg (op1
);
11628 case IX86_BUILTIN_2INTERSECTD512
:
11630 icode
= CODE_FOR_avx512vp2intersect_2intersectv16si
;
11632 case IX86_BUILTIN_2INTERSECTQ512
:
11634 icode
= CODE_FOR_avx512vp2intersect_2intersectv8di
;
11636 case IX86_BUILTIN_2INTERSECTD256
:
11638 icode
= CODE_FOR_avx512vp2intersect_2intersectv8si
;
11640 case IX86_BUILTIN_2INTERSECTQ256
:
11642 icode
= CODE_FOR_avx512vp2intersect_2intersectv4di
;
11644 case IX86_BUILTIN_2INTERSECTD128
:
11646 icode
= CODE_FOR_avx512vp2intersect_2intersectv4si
;
11648 case IX86_BUILTIN_2INTERSECTQ128
:
11650 icode
= CODE_FOR_avx512vp2intersect_2intersectv2di
;
11653 gcc_unreachable ();
11656 mode2
= insn_data
[icode
].operand
[1].mode
;
11657 mode3
= insn_data
[icode
].operand
[2].mode
;
11658 if (!insn_data
[icode
].operand
[1].predicate (op2
, mode2
))
11659 op2
= copy_to_mode_reg (mode2
, op2
);
11660 if (!insn_data
[icode
].operand
[2].predicate (op3
, mode3
))
11661 op3
= copy_to_mode_reg (mode3
, op3
);
11663 op4
= gen_reg_rtx (mode4
);
11664 emit_insn (GEN_FCN (icode
) (op4
, op2
, op3
));
11665 mode0
= mode4
== P2HImode
? HImode
: QImode
;
11666 emit_move_insn (gen_rtx_MEM (mode0
, op0
),
11667 gen_lowpart (mode0
, op4
));
11668 emit_move_insn (gen_rtx_MEM (mode0
, op1
),
11669 gen_highpart (mode0
, op4
));
11673 case IX86_BUILTIN_RDPMC
:
11674 case IX86_BUILTIN_RDTSC
:
11675 case IX86_BUILTIN_RDTSCP
:
11676 case IX86_BUILTIN_XGETBV
:
11678 op0
= gen_reg_rtx (DImode
);
11679 op1
= gen_reg_rtx (DImode
);
11681 if (fcode
== IX86_BUILTIN_RDPMC
)
11683 arg0
= CALL_EXPR_ARG (exp
, 0);
11684 op2
= expand_normal (arg0
);
11685 if (!register_operand (op2
, SImode
))
11686 op2
= copy_to_mode_reg (SImode
, op2
);
11688 insn
= (TARGET_64BIT
11689 ? gen_rdpmc_rex64 (op0
, op1
, op2
)
11690 : gen_rdpmc (op0
, op2
));
11693 else if (fcode
== IX86_BUILTIN_XGETBV
)
11695 arg0
= CALL_EXPR_ARG (exp
, 0);
11696 op2
= expand_normal (arg0
);
11697 if (!register_operand (op2
, SImode
))
11698 op2
= copy_to_mode_reg (SImode
, op2
);
11700 insn
= (TARGET_64BIT
11701 ? gen_xgetbv_rex64 (op0
, op1
, op2
)
11702 : gen_xgetbv (op0
, op2
));
11705 else if (fcode
== IX86_BUILTIN_RDTSC
)
11707 insn
= (TARGET_64BIT
11708 ? gen_rdtsc_rex64 (op0
, op1
)
11709 : gen_rdtsc (op0
));
11714 op2
= gen_reg_rtx (SImode
);
11716 insn
= (TARGET_64BIT
11717 ? gen_rdtscp_rex64 (op0
, op1
, op2
)
11718 : gen_rdtscp (op0
, op2
));
11721 arg0
= CALL_EXPR_ARG (exp
, 0);
11722 op4
= expand_normal (arg0
);
11723 if (!address_operand (op4
, VOIDmode
))
11725 op4
= convert_memory_address (Pmode
, op4
);
11726 op4
= copy_addr_to_reg (op4
);
11728 emit_move_insn (gen_rtx_MEM (SImode
, op4
), op2
);
11732 || !register_operand (target
, DImode
))
11733 target
= gen_reg_rtx (DImode
);
11737 op1
= expand_simple_binop (DImode
, ASHIFT
, op1
, GEN_INT (32),
11738 op1
, 1, OPTAB_DIRECT
);
11739 op0
= expand_simple_binop (DImode
, IOR
, op0
, op1
,
11740 op0
, 1, OPTAB_DIRECT
);
11743 emit_move_insn (target
, op0
);
11746 case IX86_BUILTIN_ENQCMD
:
11747 case IX86_BUILTIN_ENQCMDS
:
11748 case IX86_BUILTIN_MOVDIR64B
:
11750 arg0
= CALL_EXPR_ARG (exp
, 0);
11751 arg1
= CALL_EXPR_ARG (exp
, 1);
11752 op0
= expand_normal (arg0
);
11753 op1
= expand_normal (arg1
);
11755 op0
= ix86_zero_extend_to_Pmode (op0
);
11756 if (!address_operand (op1
, VOIDmode
))
11758 op1
= convert_memory_address (Pmode
, op1
);
11759 op1
= copy_addr_to_reg (op1
);
11761 op1
= gen_rtx_MEM (XImode
, op1
);
11763 if (fcode
== IX86_BUILTIN_MOVDIR64B
)
11765 emit_insn (gen_movdir64b (Pmode
, op0
, op1
));
11771 || !register_operand (target
, SImode
))
11772 target
= gen_reg_rtx (SImode
);
11774 emit_move_insn (target
, const0_rtx
);
11775 target
= gen_rtx_SUBREG (QImode
, target
, 0);
11777 int unspecv
= (fcode
== IX86_BUILTIN_ENQCMD
11779 : UNSPECV_ENQCMDS
);
11780 icode
= code_for_enqcmd (unspecv
, Pmode
);
11781 emit_insn (GEN_FCN (icode
) (op0
, op1
));
11784 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
11785 gen_rtx_fmt_ee (EQ
, QImode
,
11786 gen_rtx_REG (CCZmode
, FLAGS_REG
),
11788 return SUBREG_REG (target
);
11791 case IX86_BUILTIN_FXSAVE
:
11792 case IX86_BUILTIN_FXRSTOR
:
11793 case IX86_BUILTIN_FXSAVE64
:
11794 case IX86_BUILTIN_FXRSTOR64
:
11795 case IX86_BUILTIN_FNSTENV
:
11796 case IX86_BUILTIN_FLDENV
:
11800 case IX86_BUILTIN_FXSAVE
:
11801 icode
= CODE_FOR_fxsave
;
11803 case IX86_BUILTIN_FXRSTOR
:
11804 icode
= CODE_FOR_fxrstor
;
11806 case IX86_BUILTIN_FXSAVE64
:
11807 icode
= CODE_FOR_fxsave64
;
11809 case IX86_BUILTIN_FXRSTOR64
:
11810 icode
= CODE_FOR_fxrstor64
;
11812 case IX86_BUILTIN_FNSTENV
:
11813 icode
= CODE_FOR_fnstenv
;
11815 case IX86_BUILTIN_FLDENV
:
11816 icode
= CODE_FOR_fldenv
;
11819 gcc_unreachable ();
11822 arg0
= CALL_EXPR_ARG (exp
, 0);
11823 op0
= expand_normal (arg0
);
11825 if (!address_operand (op0
, VOIDmode
))
11827 op0
= convert_memory_address (Pmode
, op0
);
11828 op0
= copy_addr_to_reg (op0
);
11830 op0
= gen_rtx_MEM (mode0
, op0
);
11832 pat
= GEN_FCN (icode
) (op0
);
11837 case IX86_BUILTIN_XSETBV
:
11838 arg0
= CALL_EXPR_ARG (exp
, 0);
11839 arg1
= CALL_EXPR_ARG (exp
, 1);
11840 op0
= expand_normal (arg0
);
11841 op1
= expand_normal (arg1
);
11844 op0
= copy_to_mode_reg (SImode
, op0
);
11846 op1
= force_reg (DImode
, op1
);
11850 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11851 NULL
, 1, OPTAB_DIRECT
);
11853 icode
= CODE_FOR_xsetbv_rex64
;
11855 op2
= gen_lowpart (SImode
, op2
);
11856 op1
= gen_lowpart (SImode
, op1
);
11857 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11861 icode
= CODE_FOR_xsetbv
;
11863 pat
= GEN_FCN (icode
) (op0
, op1
);
11869 case IX86_BUILTIN_XSAVE
:
11870 case IX86_BUILTIN_XRSTOR
:
11871 case IX86_BUILTIN_XSAVE64
:
11872 case IX86_BUILTIN_XRSTOR64
:
11873 case IX86_BUILTIN_XSAVEOPT
:
11874 case IX86_BUILTIN_XSAVEOPT64
:
11875 case IX86_BUILTIN_XSAVES
:
11876 case IX86_BUILTIN_XRSTORS
:
11877 case IX86_BUILTIN_XSAVES64
:
11878 case IX86_BUILTIN_XRSTORS64
:
11879 case IX86_BUILTIN_XSAVEC
:
11880 case IX86_BUILTIN_XSAVEC64
:
11881 arg0
= CALL_EXPR_ARG (exp
, 0);
11882 arg1
= CALL_EXPR_ARG (exp
, 1);
11883 op0
= expand_normal (arg0
);
11884 op1
= expand_normal (arg1
);
11886 if (!address_operand (op0
, VOIDmode
))
11888 op0
= convert_memory_address (Pmode
, op0
);
11889 op0
= copy_addr_to_reg (op0
);
11891 op0
= gen_rtx_MEM (BLKmode
, op0
);
11893 op1
= force_reg (DImode
, op1
);
11897 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11898 NULL
, 1, OPTAB_DIRECT
);
11901 case IX86_BUILTIN_XSAVE
:
11902 icode
= CODE_FOR_xsave_rex64
;
11904 case IX86_BUILTIN_XRSTOR
:
11905 icode
= CODE_FOR_xrstor_rex64
;
11907 case IX86_BUILTIN_XSAVE64
:
11908 icode
= CODE_FOR_xsave64
;
11910 case IX86_BUILTIN_XRSTOR64
:
11911 icode
= CODE_FOR_xrstor64
;
11913 case IX86_BUILTIN_XSAVEOPT
:
11914 icode
= CODE_FOR_xsaveopt_rex64
;
11916 case IX86_BUILTIN_XSAVEOPT64
:
11917 icode
= CODE_FOR_xsaveopt64
;
11919 case IX86_BUILTIN_XSAVES
:
11920 icode
= CODE_FOR_xsaves_rex64
;
11922 case IX86_BUILTIN_XRSTORS
:
11923 icode
= CODE_FOR_xrstors_rex64
;
11925 case IX86_BUILTIN_XSAVES64
:
11926 icode
= CODE_FOR_xsaves64
;
11928 case IX86_BUILTIN_XRSTORS64
:
11929 icode
= CODE_FOR_xrstors64
;
11931 case IX86_BUILTIN_XSAVEC
:
11932 icode
= CODE_FOR_xsavec_rex64
;
11934 case IX86_BUILTIN_XSAVEC64
:
11935 icode
= CODE_FOR_xsavec64
;
11938 gcc_unreachable ();
11941 op2
= gen_lowpart (SImode
, op2
);
11942 op1
= gen_lowpart (SImode
, op1
);
11943 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11949 case IX86_BUILTIN_XSAVE
:
11950 icode
= CODE_FOR_xsave
;
11952 case IX86_BUILTIN_XRSTOR
:
11953 icode
= CODE_FOR_xrstor
;
11955 case IX86_BUILTIN_XSAVEOPT
:
11956 icode
= CODE_FOR_xsaveopt
;
11958 case IX86_BUILTIN_XSAVES
:
11959 icode
= CODE_FOR_xsaves
;
11961 case IX86_BUILTIN_XRSTORS
:
11962 icode
= CODE_FOR_xrstors
;
11964 case IX86_BUILTIN_XSAVEC
:
11965 icode
= CODE_FOR_xsavec
;
11968 gcc_unreachable ();
11970 pat
= GEN_FCN (icode
) (op0
, op1
);
11977 case IX86_BUILTIN_LLWPCB
:
11978 arg0
= CALL_EXPR_ARG (exp
, 0);
11979 op0
= expand_normal (arg0
);
11981 if (!register_operand (op0
, Pmode
))
11982 op0
= ix86_zero_extend_to_Pmode (op0
);
11983 emit_insn (gen_lwp_llwpcb (Pmode
, op0
));
11986 case IX86_BUILTIN_SLWPCB
:
11988 || !register_operand (target
, Pmode
))
11989 target
= gen_reg_rtx (Pmode
);
11990 emit_insn (gen_lwp_slwpcb (Pmode
, target
));
11993 case IX86_BUILTIN_LWPVAL32
:
11994 case IX86_BUILTIN_LWPVAL64
:
11995 case IX86_BUILTIN_LWPINS32
:
11996 case IX86_BUILTIN_LWPINS64
:
11997 mode
= ((fcode
== IX86_BUILTIN_LWPVAL32
11998 || fcode
== IX86_BUILTIN_LWPINS32
)
11999 ? SImode
: DImode
);
12001 if (fcode
== IX86_BUILTIN_LWPVAL32
12002 || fcode
== IX86_BUILTIN_LWPVAL64
)
12003 icode
= code_for_lwp_lwpval (mode
);
12005 icode
= code_for_lwp_lwpins (mode
);
12007 arg0
= CALL_EXPR_ARG (exp
, 0);
12008 arg1
= CALL_EXPR_ARG (exp
, 1);
12009 arg2
= CALL_EXPR_ARG (exp
, 2);
12010 op0
= expand_normal (arg0
);
12011 op1
= expand_normal (arg1
);
12012 op2
= expand_normal (arg2
);
12013 mode0
= insn_data
[icode
].operand
[0].mode
;
12015 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12016 op0
= copy_to_mode_reg (mode0
, op0
);
12017 if (!insn_data
[icode
].operand
[1].predicate (op1
, SImode
))
12018 op1
= copy_to_mode_reg (SImode
, op1
);
12020 if (!CONST_INT_P (op2
))
12022 error ("the last argument must be a 32-bit immediate");
12026 emit_insn (GEN_FCN (icode
) (op0
, op1
, op2
));
12028 if (fcode
== IX86_BUILTIN_LWPINS32
12029 || fcode
== IX86_BUILTIN_LWPINS64
)
12032 || !nonimmediate_operand (target
, QImode
))
12033 target
= gen_reg_rtx (QImode
);
12035 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12037 emit_insn (gen_rtx_SET (target
, pat
));
12044 case IX86_BUILTIN_BEXTRI32
:
12045 case IX86_BUILTIN_BEXTRI64
:
12046 mode
= (fcode
== IX86_BUILTIN_BEXTRI32
? SImode
: DImode
);
12048 arg0
= CALL_EXPR_ARG (exp
, 0);
12049 arg1
= CALL_EXPR_ARG (exp
, 1);
12050 op0
= expand_normal (arg0
);
12051 op1
= expand_normal (arg1
);
12053 if (!CONST_INT_P (op1
))
12055 error ("last argument must be an immediate");
12060 unsigned char lsb_index
= UINTVAL (op1
);
12061 unsigned char length
= UINTVAL (op1
) >> 8;
12063 unsigned char bitsize
= GET_MODE_BITSIZE (mode
);
12065 icode
= code_for_tbm_bextri (mode
);
12067 mode1
= insn_data
[icode
].operand
[1].mode
;
12068 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode1
))
12069 op0
= copy_to_mode_reg (mode1
, op0
);
12071 mode0
= insn_data
[icode
].operand
[0].mode
;
12073 || !register_operand (target
, mode0
))
12074 target
= gen_reg_rtx (mode0
);
12076 if (length
== 0 || lsb_index
>= bitsize
)
12078 emit_move_insn (target
, const0_rtx
);
12082 if (length
+ lsb_index
> bitsize
)
12083 length
= bitsize
- lsb_index
;
12085 op1
= GEN_INT (length
);
12086 op2
= GEN_INT (lsb_index
);
12088 emit_insn (GEN_FCN (icode
) (target
, op0
, op1
, op2
));
12092 case IX86_BUILTIN_RDRAND16_STEP
:
12096 case IX86_BUILTIN_RDRAND32_STEP
:
12100 case IX86_BUILTIN_RDRAND64_STEP
:
12104 arg0
= CALL_EXPR_ARG (exp
, 0);
12105 op1
= expand_normal (arg0
);
12106 if (!address_operand (op1
, VOIDmode
))
12108 op1
= convert_memory_address (Pmode
, op1
);
12109 op1
= copy_addr_to_reg (op1
);
12112 op0
= gen_reg_rtx (mode
);
12113 emit_insn (gen_rdrand (mode
, op0
));
12115 emit_move_insn (gen_rtx_MEM (mode
, op1
), op0
);
12117 op1
= force_reg (SImode
, const1_rtx
);
12119 /* Emit SImode conditional move. */
12120 if (mode
== HImode
)
12122 if (TARGET_ZERO_EXTEND_WITH_AND
12123 && optimize_function_for_speed_p (cfun
))
12125 op2
= force_reg (SImode
, const0_rtx
);
12127 emit_insn (gen_movstricthi
12128 (gen_lowpart (HImode
, op2
), op0
));
12132 op2
= gen_reg_rtx (SImode
);
12134 emit_insn (gen_zero_extendhisi2 (op2
, op0
));
12137 else if (mode
== SImode
)
12140 op2
= gen_rtx_SUBREG (SImode
, op0
, 0);
12143 || !register_operand (target
, SImode
))
12144 target
= gen_reg_rtx (SImode
);
12146 pat
= gen_rtx_GEU (VOIDmode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12148 emit_insn (gen_rtx_SET (target
,
12149 gen_rtx_IF_THEN_ELSE (SImode
, pat
, op2
, op1
)));
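/* For illustration only (not part of GCC): the *_step builtins return the
   RDRAND carry flag so callers can retry when the hardware has no entropy
   ready.  A minimal user-level retry loop, assuming the usual <immintrin.h>
   intrinsic name:

     #include <immintrin.h>

     int
     get_random_u32 (unsigned int *out)
     {
       for (int tries = 0; tries < 10; tries++)
	 if (_rdrand32_step (out))   // returns 1 when *OUT holds a fresh value
	   return 1;
       return 0;  // hardware entropy temporarily unavailable
     }
*/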
12152 case IX86_BUILTIN_RDSEED16_STEP
:
12156 case IX86_BUILTIN_RDSEED32_STEP
:
12160 case IX86_BUILTIN_RDSEED64_STEP
:
12164 arg0
= CALL_EXPR_ARG (exp
, 0);
12165 op1
= expand_normal (arg0
);
12166 if (!address_operand (op1
, VOIDmode
))
12168 op1
= convert_memory_address (Pmode
, op1
);
12169 op1
= copy_addr_to_reg (op1
);
12172 op0
= gen_reg_rtx (mode
);
12173 emit_insn (gen_rdseed (mode
, op0
));
12175 emit_move_insn (gen_rtx_MEM (mode
, op1
), op0
);
12177 op2
= gen_reg_rtx (QImode
);
12179 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12181 emit_insn (gen_rtx_SET (op2
, pat
));
12184 || !register_operand (target
, SImode
))
12185 target
= gen_reg_rtx (SImode
);
12187 emit_insn (gen_zero_extendqisi2 (target
, op2
));
12190 case IX86_BUILTIN_SBB32
:
12191 icode
= CODE_FOR_subborrowsi
;
12192 icode2
= CODE_FOR_subborrowsi_0
;
12198 case IX86_BUILTIN_SBB64
:
12199 icode
= CODE_FOR_subborrowdi
;
12200 icode2
= CODE_FOR_subborrowdi_0
;
12206 case IX86_BUILTIN_ADDCARRYX32
:
12207 icode
= CODE_FOR_addcarrysi
;
12208 icode2
= CODE_FOR_addcarrysi_0
;
12214 case IX86_BUILTIN_ADDCARRYX64
:
12215 icode
= CODE_FOR_addcarrydi
;
12216 icode2
= CODE_FOR_addcarrydi_0
;
12222 arg0
= CALL_EXPR_ARG (exp
, 0); /* unsigned char c_in. */
12223 arg1
= CALL_EXPR_ARG (exp
, 1); /* unsigned int src1. */
12224 arg2
= CALL_EXPR_ARG (exp
, 2); /* unsigned int src2. */
12225 arg3
= CALL_EXPR_ARG (exp
, 3); /* unsigned int *sum_out. */
12227 op1
= expand_normal (arg0
);
12228 if (!integer_zerop (arg0
))
12229 op1
= copy_to_mode_reg (QImode
, convert_to_mode (QImode
, op1
, 1));
12231 op2
= expand_normal (arg1
);
12232 if (!register_operand (op2
, mode0
))
12233 op2
= copy_to_mode_reg (mode0
, op2
);
12235 op3
= expand_normal (arg2
);
12236 if (!register_operand (op3
, mode0
))
12237 op3
= copy_to_mode_reg (mode0
, op3
);
12239 op4
= expand_normal (arg3
);
12240 if (!address_operand (op4
, VOIDmode
))
12242 op4
= convert_memory_address (Pmode
, op4
);
12243 op4
= copy_addr_to_reg (op4
);
      op0 = gen_reg_rtx (mode0);
      if (integer_zerop (arg0))
	{
	  /* If arg0 is 0, optimize right away into add or sub
	     instruction that sets CCCmode flags.  */
	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
	}
      else
	{
	  /* Generate CF from input operand.  */
	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));

	  /* Generate instruction that consumes CF.  */
	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
	}

      /* Return current CF value.  */
      if (!target)
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, op1, const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      /* Store the result.  */
      emit_move_insn (gen_rtx_MEM (mode0, op4), op0);

      return target;
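/* For illustration only (not part of GCC): the CF value returned above is
   what a caller feeds into the next link of a multi-word addition.  A
   minimal user-level chain, assuming the usual <x86intrin.h> _addcarry_u32
   intrinsic:

     #include <x86intrin.h>

     // 64-bit add built from two 32-bit halves; the carry flows through CF.
     void
     add64 (unsigned a_lo, unsigned a_hi, unsigned b_lo, unsigned b_hi,
	    unsigned *r_lo, unsigned *r_hi)
     {
       unsigned char c = _addcarry_u32 (0, a_lo, b_lo, r_lo);
       (void) _addcarry_u32 (c, a_hi, b_hi, r_hi);
     }
*/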
12278 case IX86_BUILTIN_READ_FLAGS
:
12279 emit_insn (gen_push (gen_rtx_REG (word_mode
, FLAGS_REG
)));
12282 || target
== NULL_RTX
12283 || !nonimmediate_operand (target
, word_mode
)
12284 || GET_MODE (target
) != word_mode
)
12285 target
= gen_reg_rtx (word_mode
);
12287 emit_insn (gen_pop (target
));
12290 case IX86_BUILTIN_WRITE_FLAGS
:
12292 arg0
= CALL_EXPR_ARG (exp
, 0);
12293 op0
= expand_normal (arg0
);
12294 if (!general_no_elim_operand (op0
, word_mode
))
12295 op0
= copy_to_mode_reg (word_mode
, op0
);
12297 emit_insn (gen_push (op0
));
12298 emit_insn (gen_pop (gen_rtx_REG (word_mode
, FLAGS_REG
)));
12301 case IX86_BUILTIN_KTESTC8
:
12302 icode
= CODE_FOR_ktestqi
;
12306 case IX86_BUILTIN_KTESTZ8
:
12307 icode
= CODE_FOR_ktestqi
;
12311 case IX86_BUILTIN_KTESTC16
:
12312 icode
= CODE_FOR_ktesthi
;
12316 case IX86_BUILTIN_KTESTZ16
:
12317 icode
= CODE_FOR_ktesthi
;
12321 case IX86_BUILTIN_KTESTC32
:
12322 icode
= CODE_FOR_ktestsi
;
12326 case IX86_BUILTIN_KTESTZ32
:
12327 icode
= CODE_FOR_ktestsi
;
12331 case IX86_BUILTIN_KTESTC64
:
12332 icode
= CODE_FOR_ktestdi
;
12336 case IX86_BUILTIN_KTESTZ64
:
12337 icode
= CODE_FOR_ktestdi
;
12341 case IX86_BUILTIN_KORTESTC8
:
12342 icode
= CODE_FOR_kortestqi
;
12346 case IX86_BUILTIN_KORTESTZ8
:
12347 icode
= CODE_FOR_kortestqi
;
12351 case IX86_BUILTIN_KORTESTC16
:
12352 icode
= CODE_FOR_kortesthi
;
12356 case IX86_BUILTIN_KORTESTZ16
:
12357 icode
= CODE_FOR_kortesthi
;
12361 case IX86_BUILTIN_KORTESTC32
:
12362 icode
= CODE_FOR_kortestsi
;
12366 case IX86_BUILTIN_KORTESTZ32
:
12367 icode
= CODE_FOR_kortestsi
;
12371 case IX86_BUILTIN_KORTESTC64
:
12372 icode
= CODE_FOR_kortestdi
;
12376 case IX86_BUILTIN_KORTESTZ64
:
12377 icode
= CODE_FOR_kortestdi
;
12381 arg0
= CALL_EXPR_ARG (exp
, 0); /* Mask reg src1. */
12382 arg1
= CALL_EXPR_ARG (exp
, 1); /* Mask reg src2. */
12383 op0
= expand_normal (arg0
);
12384 op1
= expand_normal (arg1
);
12386 mode0
= insn_data
[icode
].operand
[0].mode
;
12387 mode1
= insn_data
[icode
].operand
[1].mode
;
12389 if (GET_MODE (op0
) != VOIDmode
)
12390 op0
= force_reg (GET_MODE (op0
), op0
);
12392 op0
= gen_lowpart (mode0
, op0
);
12394 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12395 op0
= copy_to_mode_reg (mode0
, op0
);
12397 if (GET_MODE (op1
) != VOIDmode
)
12398 op1
= force_reg (GET_MODE (op1
), op1
);
12400 op1
= gen_lowpart (mode1
, op1
);
12402 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12403 op1
= copy_to_mode_reg (mode1
, op1
);
12405 target
= gen_reg_rtx (QImode
);
12407 /* Emit kortest. */
12408 emit_insn (GEN_FCN (icode
) (op0
, op1
));
12409 /* And use setcc to return result from flags. */
12410 ix86_expand_setcc (target
, EQ
,
12411 gen_rtx_REG (mode3
, FLAGS_REG
), const0_rtx
);
12414 case IX86_BUILTIN_GATHERSIV2DF
:
12415 icode
= CODE_FOR_avx2_gathersiv2df
;
12417 case IX86_BUILTIN_GATHERSIV4DF
:
12418 icode
= CODE_FOR_avx2_gathersiv4df
;
12420 case IX86_BUILTIN_GATHERDIV2DF
:
12421 icode
= CODE_FOR_avx2_gatherdiv2df
;
12423 case IX86_BUILTIN_GATHERDIV4DF
:
12424 icode
= CODE_FOR_avx2_gatherdiv4df
;
12426 case IX86_BUILTIN_GATHERSIV4SF
:
12427 icode
= CODE_FOR_avx2_gathersiv4sf
;
12429 case IX86_BUILTIN_GATHERSIV8SF
:
12430 icode
= CODE_FOR_avx2_gathersiv8sf
;
12432 case IX86_BUILTIN_GATHERDIV4SF
:
12433 icode
= CODE_FOR_avx2_gatherdiv4sf
;
12435 case IX86_BUILTIN_GATHERDIV8SF
:
12436 icode
= CODE_FOR_avx2_gatherdiv8sf
;
12438 case IX86_BUILTIN_GATHERSIV2DI
:
12439 icode
= CODE_FOR_avx2_gathersiv2di
;
12441 case IX86_BUILTIN_GATHERSIV4DI
:
12442 icode
= CODE_FOR_avx2_gathersiv4di
;
12444 case IX86_BUILTIN_GATHERDIV2DI
:
12445 icode
= CODE_FOR_avx2_gatherdiv2di
;
12447 case IX86_BUILTIN_GATHERDIV4DI
:
12448 icode
= CODE_FOR_avx2_gatherdiv4di
;
12450 case IX86_BUILTIN_GATHERSIV4SI
:
12451 icode
= CODE_FOR_avx2_gathersiv4si
;
12453 case IX86_BUILTIN_GATHERSIV8SI
:
12454 icode
= CODE_FOR_avx2_gathersiv8si
;
12456 case IX86_BUILTIN_GATHERDIV4SI
:
12457 icode
= CODE_FOR_avx2_gatherdiv4si
;
12459 case IX86_BUILTIN_GATHERDIV8SI
:
12460 icode
= CODE_FOR_avx2_gatherdiv8si
;
12462 case IX86_BUILTIN_GATHERALTSIV4DF
:
12463 icode
= CODE_FOR_avx2_gathersiv4df
;
12465 case IX86_BUILTIN_GATHERALTDIV8SF
:
12466 icode
= CODE_FOR_avx2_gatherdiv8sf
;
12468 case IX86_BUILTIN_GATHERALTSIV4DI
:
12469 icode
= CODE_FOR_avx2_gathersiv4di
;
12471 case IX86_BUILTIN_GATHERALTDIV8SI
:
12472 icode
= CODE_FOR_avx2_gatherdiv8si
;
12474 case IX86_BUILTIN_GATHER3SIV16SF
:
12475 icode
= CODE_FOR_avx512f_gathersiv16sf
;
12477 case IX86_BUILTIN_GATHER3SIV8DF
:
12478 icode
= CODE_FOR_avx512f_gathersiv8df
;
12480 case IX86_BUILTIN_GATHER3DIV16SF
:
12481 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
12483 case IX86_BUILTIN_GATHER3DIV8DF
:
12484 icode
= CODE_FOR_avx512f_gatherdiv8df
;
12486 case IX86_BUILTIN_GATHER3SIV16SI
:
12487 icode
= CODE_FOR_avx512f_gathersiv16si
;
12489 case IX86_BUILTIN_GATHER3SIV8DI
:
12490 icode
= CODE_FOR_avx512f_gathersiv8di
;
12492 case IX86_BUILTIN_GATHER3DIV16SI
:
12493 icode
= CODE_FOR_avx512f_gatherdiv16si
;
12495 case IX86_BUILTIN_GATHER3DIV8DI
:
12496 icode
= CODE_FOR_avx512f_gatherdiv8di
;
12498 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
12499 icode
= CODE_FOR_avx512f_gathersiv8df
;
12501 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
12502 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
12504 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
12505 icode
= CODE_FOR_avx512f_gathersiv8di
;
12507 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
12508 icode
= CODE_FOR_avx512f_gatherdiv16si
;
12510 case IX86_BUILTIN_GATHER3SIV2DF
:
12511 icode
= CODE_FOR_avx512vl_gathersiv2df
;
12513 case IX86_BUILTIN_GATHER3SIV4DF
:
12514 icode
= CODE_FOR_avx512vl_gathersiv4df
;
12516 case IX86_BUILTIN_GATHER3DIV2DF
:
12517 icode
= CODE_FOR_avx512vl_gatherdiv2df
;
12519 case IX86_BUILTIN_GATHER3DIV4DF
:
12520 icode
= CODE_FOR_avx512vl_gatherdiv4df
;
12522 case IX86_BUILTIN_GATHER3SIV4SF
:
12523 icode
= CODE_FOR_avx512vl_gathersiv4sf
;
12525 case IX86_BUILTIN_GATHER3SIV8SF
:
12526 icode
= CODE_FOR_avx512vl_gathersiv8sf
;
12528 case IX86_BUILTIN_GATHER3DIV4SF
:
12529 icode
= CODE_FOR_avx512vl_gatherdiv4sf
;
12531 case IX86_BUILTIN_GATHER3DIV8SF
:
12532 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
12534 case IX86_BUILTIN_GATHER3SIV2DI
:
12535 icode
= CODE_FOR_avx512vl_gathersiv2di
;
12537 case IX86_BUILTIN_GATHER3SIV4DI
:
12538 icode
= CODE_FOR_avx512vl_gathersiv4di
;
12540 case IX86_BUILTIN_GATHER3DIV2DI
:
12541 icode
= CODE_FOR_avx512vl_gatherdiv2di
;
12543 case IX86_BUILTIN_GATHER3DIV4DI
:
12544 icode
= CODE_FOR_avx512vl_gatherdiv4di
;
12546 case IX86_BUILTIN_GATHER3SIV4SI
:
12547 icode
= CODE_FOR_avx512vl_gathersiv4si
;
12549 case IX86_BUILTIN_GATHER3SIV8SI
:
12550 icode
= CODE_FOR_avx512vl_gathersiv8si
;
12552 case IX86_BUILTIN_GATHER3DIV4SI
:
12553 icode
= CODE_FOR_avx512vl_gatherdiv4si
;
12555 case IX86_BUILTIN_GATHER3DIV8SI
:
12556 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
12558 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
12559 icode
= CODE_FOR_avx512vl_gathersiv4df
;
12561 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
12562 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
12564 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
12565 icode
= CODE_FOR_avx512vl_gathersiv4di
;
12567 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
12568 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
12570 case IX86_BUILTIN_SCATTERSIV16SF
:
12571 icode
= CODE_FOR_avx512f_scattersiv16sf
;
12573 case IX86_BUILTIN_SCATTERSIV8DF
:
12574 icode
= CODE_FOR_avx512f_scattersiv8df
;
12576 case IX86_BUILTIN_SCATTERDIV16SF
:
12577 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
12579 case IX86_BUILTIN_SCATTERDIV8DF
:
12580 icode
= CODE_FOR_avx512f_scatterdiv8df
;
12582 case IX86_BUILTIN_SCATTERSIV16SI
:
12583 icode
= CODE_FOR_avx512f_scattersiv16si
;
12585 case IX86_BUILTIN_SCATTERSIV8DI
:
12586 icode
= CODE_FOR_avx512f_scattersiv8di
;
12588 case IX86_BUILTIN_SCATTERDIV16SI
:
12589 icode
= CODE_FOR_avx512f_scatterdiv16si
;
12591 case IX86_BUILTIN_SCATTERDIV8DI
:
12592 icode
= CODE_FOR_avx512f_scatterdiv8di
;
12594 case IX86_BUILTIN_SCATTERSIV8SF
:
12595 icode
= CODE_FOR_avx512vl_scattersiv8sf
;
12597 case IX86_BUILTIN_SCATTERSIV4SF
:
12598 icode
= CODE_FOR_avx512vl_scattersiv4sf
;
12600 case IX86_BUILTIN_SCATTERSIV4DF
:
12601 icode
= CODE_FOR_avx512vl_scattersiv4df
;
12603 case IX86_BUILTIN_SCATTERSIV2DF
:
12604 icode
= CODE_FOR_avx512vl_scattersiv2df
;
12606 case IX86_BUILTIN_SCATTERDIV8SF
:
12607 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
12609 case IX86_BUILTIN_SCATTERDIV4SF
:
12610 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
12612 case IX86_BUILTIN_SCATTERDIV4DF
:
12613 icode
= CODE_FOR_avx512vl_scatterdiv4df
;
12615 case IX86_BUILTIN_SCATTERDIV2DF
:
12616 icode
= CODE_FOR_avx512vl_scatterdiv2df
;
12618 case IX86_BUILTIN_SCATTERSIV8SI
:
12619 icode
= CODE_FOR_avx512vl_scattersiv8si
;
12621 case IX86_BUILTIN_SCATTERSIV4SI
:
12622 icode
= CODE_FOR_avx512vl_scattersiv4si
;
12624 case IX86_BUILTIN_SCATTERSIV4DI
:
12625 icode
= CODE_FOR_avx512vl_scattersiv4di
;
12627 case IX86_BUILTIN_SCATTERSIV2DI
:
12628 icode
= CODE_FOR_avx512vl_scattersiv2di
;
12630 case IX86_BUILTIN_SCATTERDIV8SI
:
12631 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
12633 case IX86_BUILTIN_SCATTERDIV4SI
:
12634 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
12636 case IX86_BUILTIN_SCATTERDIV4DI
:
12637 icode
= CODE_FOR_avx512vl_scatterdiv4di
;
12639 case IX86_BUILTIN_SCATTERDIV2DI
:
12640 icode
= CODE_FOR_avx512vl_scatterdiv2di
;
12642 case IX86_BUILTIN_GATHERPFDPD
:
12643 icode
= CODE_FOR_avx512pf_gatherpfv8sidf
;
12644 goto vec_prefetch_gen
;
12645 case IX86_BUILTIN_SCATTERALTSIV8DF
:
12646 icode
= CODE_FOR_avx512f_scattersiv8df
;
12648 case IX86_BUILTIN_SCATTERALTDIV16SF
:
12649 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
12651 case IX86_BUILTIN_SCATTERALTSIV8DI
:
12652 icode
= CODE_FOR_avx512f_scattersiv8di
;
12654 case IX86_BUILTIN_SCATTERALTDIV16SI
:
12655 icode
= CODE_FOR_avx512f_scatterdiv16si
;
12657 case IX86_BUILTIN_SCATTERALTSIV4DF
:
12658 icode
= CODE_FOR_avx512vl_scattersiv4df
;
12660 case IX86_BUILTIN_SCATTERALTDIV8SF
:
12661 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
12663 case IX86_BUILTIN_SCATTERALTSIV4DI
:
12664 icode
= CODE_FOR_avx512vl_scattersiv4di
;
12666 case IX86_BUILTIN_SCATTERALTDIV8SI
:
12667 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
12669 case IX86_BUILTIN_SCATTERALTSIV2DF
:
12670 icode
= CODE_FOR_avx512vl_scattersiv2df
;
12672 case IX86_BUILTIN_SCATTERALTDIV4SF
:
12673 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
12675 case IX86_BUILTIN_SCATTERALTSIV2DI
:
12676 icode
= CODE_FOR_avx512vl_scattersiv2di
;
12678 case IX86_BUILTIN_SCATTERALTDIV4SI
:
12679 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
12681 case IX86_BUILTIN_GATHERPFDPS
:
12682 icode
= CODE_FOR_avx512pf_gatherpfv16sisf
;
12683 goto vec_prefetch_gen
;
12684 case IX86_BUILTIN_GATHERPFQPD
:
12685 icode
= CODE_FOR_avx512pf_gatherpfv8didf
;
12686 goto vec_prefetch_gen
;
12687 case IX86_BUILTIN_GATHERPFQPS
:
12688 icode
= CODE_FOR_avx512pf_gatherpfv8disf
;
12689 goto vec_prefetch_gen
;
12690 case IX86_BUILTIN_SCATTERPFDPD
:
12691 icode
= CODE_FOR_avx512pf_scatterpfv8sidf
;
12692 goto vec_prefetch_gen
;
12693 case IX86_BUILTIN_SCATTERPFDPS
:
12694 icode
= CODE_FOR_avx512pf_scatterpfv16sisf
;
12695 goto vec_prefetch_gen
;
12696 case IX86_BUILTIN_SCATTERPFQPD
:
12697 icode
= CODE_FOR_avx512pf_scatterpfv8didf
;
12698 goto vec_prefetch_gen
;
12699 case IX86_BUILTIN_SCATTERPFQPS
:
12700 icode
= CODE_FOR_avx512pf_scatterpfv8disf
;
12701 goto vec_prefetch_gen
;
12705 rtx (*gen
) (rtx
, rtx
);
12707 arg0
= CALL_EXPR_ARG (exp
, 0);
12708 arg1
= CALL_EXPR_ARG (exp
, 1);
12709 arg2
= CALL_EXPR_ARG (exp
, 2);
12710 arg3
= CALL_EXPR_ARG (exp
, 3);
12711 arg4
= CALL_EXPR_ARG (exp
, 4);
12712 op0
= expand_normal (arg0
);
12713 op1
= expand_normal (arg1
);
12714 op2
= expand_normal (arg2
);
12715 op3
= expand_normal (arg3
);
12716 op4
= expand_normal (arg4
);
12717 /* Note the arg order is different from the operand order. */
12718 mode0
= insn_data
[icode
].operand
[1].mode
;
12719 mode2
= insn_data
[icode
].operand
[3].mode
;
12720 mode3
= insn_data
[icode
].operand
[4].mode
;
12721 mode4
= insn_data
[icode
].operand
[5].mode
;
12723 if (target
== NULL_RTX
12724 || GET_MODE (target
) != insn_data
[icode
].operand
[0].mode
12725 || !insn_data
[icode
].operand
[0].predicate (target
,
12726 GET_MODE (target
)))
12727 subtarget
= gen_reg_rtx (insn_data
[icode
].operand
[0].mode
);
12729 subtarget
= target
;
12733 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
12734 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
12735 half
= gen_reg_rtx (V8SImode
);
12736 if (!nonimmediate_operand (op2
, V16SImode
))
12737 op2
= copy_to_mode_reg (V16SImode
, op2
);
12738 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12741 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
12742 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
12743 case IX86_BUILTIN_GATHERALTSIV4DF
:
12744 case IX86_BUILTIN_GATHERALTSIV4DI
:
12745 half
= gen_reg_rtx (V4SImode
);
12746 if (!nonimmediate_operand (op2
, V8SImode
))
12747 op2
= copy_to_mode_reg (V8SImode
, op2
);
12748 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
12751 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
12752 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
12753 half
= gen_reg_rtx (mode0
);
12754 if (mode0
== V8SFmode
)
12755 gen
= gen_vec_extract_lo_v16sf
;
12757 gen
= gen_vec_extract_lo_v16si
;
12758 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12759 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12760 emit_insn (gen (half
, op0
));
12762 op3
= lowpart_subreg (QImode
, op3
, HImode
);
12764 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
12765 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
12766 case IX86_BUILTIN_GATHERALTDIV8SF
:
12767 case IX86_BUILTIN_GATHERALTDIV8SI
:
12768 half
= gen_reg_rtx (mode0
);
12769 if (mode0
== V4SFmode
)
12770 gen
= gen_vec_extract_lo_v8sf
;
12772 gen
= gen_vec_extract_lo_v8si
;
12773 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12774 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12775 emit_insn (gen (half
, op0
));
12777 if (VECTOR_MODE_P (GET_MODE (op3
)))
12779 half
= gen_reg_rtx (mode0
);
12780 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12781 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12782 emit_insn (gen (half
, op3
));
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op1 = ix86_zero_extend_to_Pmode (op1);
12795 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
12796 op0
= copy_to_mode_reg (mode0
, op0
);
12797 if (!insn_data
[icode
].operand
[2].predicate (op1
, Pmode
))
12798 op1
= copy_to_mode_reg (Pmode
, op1
);
12799 if (!insn_data
[icode
].operand
[3].predicate (op2
, mode2
))
12800 op2
= copy_to_mode_reg (mode2
, op2
);
12802 op3
= fixup_modeless_constant (op3
, mode3
);
12804 if (GET_MODE (op3
) == mode3
|| GET_MODE (op3
) == VOIDmode
)
12806 if (!insn_data
[icode
].operand
[4].predicate (op3
, mode3
))
12807 op3
= copy_to_mode_reg (mode3
, op3
);
12811 op3
= copy_to_reg (op3
);
12812 op3
= lowpart_subreg (mode3
, op3
, GET_MODE (op3
));
12814 if (!insn_data
[icode
].operand
[5].predicate (op4
, mode4
))
12816 error ("the last argument must be scale 1, 2, 4, 8");
12820 /* Optimize. If mask is known to have all high bits set,
12821 replace op0 with pc_rtx to signal that the instruction
12822 overwrites the whole destination and doesn't use its
12823 previous contents. */
12826 if (TREE_CODE (arg3
) == INTEGER_CST
)
12828 if (integer_all_onesp (arg3
))
12831 else if (TREE_CODE (arg3
) == VECTOR_CST
)
12833 unsigned int negative
= 0;
12834 for (i
= 0; i
< VECTOR_CST_NELTS (arg3
); ++i
)
12836 tree cst
= VECTOR_CST_ELT (arg3
, i
);
12837 if (TREE_CODE (cst
) == INTEGER_CST
12838 && tree_int_cst_sign_bit (cst
))
12840 else if (TREE_CODE (cst
) == REAL_CST
12841 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst
)))
12844 if (negative
== TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3
)))
12847 else if (TREE_CODE (arg3
) == SSA_NAME
12848 && TREE_CODE (TREE_TYPE (arg3
)) == VECTOR_TYPE
)
12850 /* Recognize also when mask is like:
12851 __v2df src = _mm_setzero_pd ();
12852 __v2df mask = _mm_cmpeq_pd (src, src);
12854 __v8sf src = _mm256_setzero_ps ();
12855 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12856 as that is a cheaper way to load all ones into
12857 a register than having to load a constant from
12859 gimple
*def_stmt
= SSA_NAME_DEF_STMT (arg3
);
12860 if (is_gimple_call (def_stmt
))
12862 tree fndecl
= gimple_call_fndecl (def_stmt
);
12864 && fndecl_built_in_p (fndecl
, BUILT_IN_MD
))
12865 switch (DECL_MD_FUNCTION_CODE (fndecl
))
12867 case IX86_BUILTIN_CMPPD
:
12868 case IX86_BUILTIN_CMPPS
:
12869 case IX86_BUILTIN_CMPPD256
:
12870 case IX86_BUILTIN_CMPPS256
:
12871 if (!integer_zerop (gimple_call_arg (def_stmt
, 2)))
12874 case IX86_BUILTIN_CMPEQPD
:
12875 case IX86_BUILTIN_CMPEQPS
:
12876 if (initializer_zerop (gimple_call_arg (def_stmt
, 0))
12877 && initializer_zerop (gimple_call_arg (def_stmt
,
12888 pat
= GEN_FCN (icode
) (subtarget
, op0
, op1
, op2
, op3
, op4
);
12895 case IX86_BUILTIN_GATHER3DIV16SF
:
12896 if (target
== NULL_RTX
)
12897 target
= gen_reg_rtx (V8SFmode
);
12898 emit_insn (gen_vec_extract_lo_v16sf (target
, subtarget
));
12900 case IX86_BUILTIN_GATHER3DIV16SI
:
12901 if (target
== NULL_RTX
)
12902 target
= gen_reg_rtx (V8SImode
);
12903 emit_insn (gen_vec_extract_lo_v16si (target
, subtarget
));
12905 case IX86_BUILTIN_GATHER3DIV8SF
:
12906 case IX86_BUILTIN_GATHERDIV8SF
:
12907 if (target
== NULL_RTX
)
12908 target
= gen_reg_rtx (V4SFmode
);
12909 emit_insn (gen_vec_extract_lo_v8sf (target
, subtarget
));
12911 case IX86_BUILTIN_GATHER3DIV8SI
:
12912 case IX86_BUILTIN_GATHERDIV8SI
:
12913 if (target
== NULL_RTX
)
12914 target
= gen_reg_rtx (V4SImode
);
12915 emit_insn (gen_vec_extract_lo_v8si (target
, subtarget
));
12918 target
= subtarget
;
12924 arg0
= CALL_EXPR_ARG (exp
, 0);
12925 arg1
= CALL_EXPR_ARG (exp
, 1);
12926 arg2
= CALL_EXPR_ARG (exp
, 2);
12927 arg3
= CALL_EXPR_ARG (exp
, 3);
12928 arg4
= CALL_EXPR_ARG (exp
, 4);
12929 op0
= expand_normal (arg0
);
12930 op1
= expand_normal (arg1
);
12931 op2
= expand_normal (arg2
);
12932 op3
= expand_normal (arg3
);
12933 op4
= expand_normal (arg4
);
12934 mode1
= insn_data
[icode
].operand
[1].mode
;
12935 mode2
= insn_data
[icode
].operand
[2].mode
;
12936 mode3
= insn_data
[icode
].operand
[3].mode
;
12937 mode4
= insn_data
[icode
].operand
[4].mode
;
      /* Scatter instruction stores operand op3 to memory with
	 indices from op2 and scale from op4 under writemask op1.
	 If index operand op2 has more elements than source operand
	 op3, one needs to use only its low half.  And vice versa.  */
12945 case IX86_BUILTIN_SCATTERALTSIV8DF
:
12946 case IX86_BUILTIN_SCATTERALTSIV8DI
:
12947 half
= gen_reg_rtx (V8SImode
);
12948 if (!nonimmediate_operand (op2
, V16SImode
))
12949 op2
= copy_to_mode_reg (V16SImode
, op2
);
12950 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12953 case IX86_BUILTIN_SCATTERALTDIV16SF
:
12954 case IX86_BUILTIN_SCATTERALTDIV16SI
:
12955 half
= gen_reg_rtx (mode3
);
12956 if (mode3
== V8SFmode
)
12957 gen
= gen_vec_extract_lo_v16sf
;
12959 gen
= gen_vec_extract_lo_v16si
;
12960 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12961 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12962 emit_insn (gen (half
, op3
));
12965 case IX86_BUILTIN_SCATTERALTSIV4DF
:
12966 case IX86_BUILTIN_SCATTERALTSIV4DI
:
12967 half
= gen_reg_rtx (V4SImode
);
12968 if (!nonimmediate_operand (op2
, V8SImode
))
12969 op2
= copy_to_mode_reg (V8SImode
, op2
);
12970 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
12973 case IX86_BUILTIN_SCATTERALTDIV8SF
:
12974 case IX86_BUILTIN_SCATTERALTDIV8SI
:
12975 half
= gen_reg_rtx (mode3
);
12976 if (mode3
== V4SFmode
)
12977 gen
= gen_vec_extract_lo_v8sf
;
12979 gen
= gen_vec_extract_lo_v8si
;
12980 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12981 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12982 emit_insn (gen (half
, op3
));
12985 case IX86_BUILTIN_SCATTERALTSIV2DF
:
12986 case IX86_BUILTIN_SCATTERALTSIV2DI
:
12987 if (!nonimmediate_operand (op2
, V4SImode
))
12988 op2
= copy_to_mode_reg (V4SImode
, op2
);
12990 case IX86_BUILTIN_SCATTERALTDIV4SF
:
12991 case IX86_BUILTIN_SCATTERALTDIV4SI
:
12992 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12993 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
13004 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
13005 op0
= copy_to_mode_reg (Pmode
, op0
);
13007 op1
= fixup_modeless_constant (op1
, mode1
);
13009 if (GET_MODE (op1
) == mode1
|| GET_MODE (op1
) == VOIDmode
)
13011 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
13012 op1
= copy_to_mode_reg (mode1
, op1
);
13016 op1
= copy_to_reg (op1
);
13017 op1
= lowpart_subreg (mode1
, op1
, GET_MODE (op1
));
13020 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
13021 op2
= copy_to_mode_reg (mode2
, op2
);
13023 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
13024 op3
= copy_to_mode_reg (mode3
, op3
);
13026 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
13028 error ("the last argument must be scale 1, 2, 4, 8");
13032 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
13040 arg0
= CALL_EXPR_ARG (exp
, 0);
13041 arg1
= CALL_EXPR_ARG (exp
, 1);
13042 arg2
= CALL_EXPR_ARG (exp
, 2);
13043 arg3
= CALL_EXPR_ARG (exp
, 3);
13044 arg4
= CALL_EXPR_ARG (exp
, 4);
13045 op0
= expand_normal (arg0
);
13046 op1
= expand_normal (arg1
);
13047 op2
= expand_normal (arg2
);
13048 op3
= expand_normal (arg3
);
13049 op4
= expand_normal (arg4
);
13050 mode0
= insn_data
[icode
].operand
[0].mode
;
13051 mode1
= insn_data
[icode
].operand
[1].mode
;
13052 mode3
= insn_data
[icode
].operand
[3].mode
;
13053 mode4
= insn_data
[icode
].operand
[4].mode
;
13055 op0
= fixup_modeless_constant (op0
, mode0
);
13057 if (GET_MODE (op0
) == mode0
|| GET_MODE (op0
) == VOIDmode
)
13059 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
13060 op0
= copy_to_mode_reg (mode0
, op0
);
13064 op0
= copy_to_reg (op0
);
13065 op0
= lowpart_subreg (mode0
, op0
, GET_MODE (op0
));
13068 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
13069 op1
= copy_to_mode_reg (mode1
, op1
);
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
13076 if (!insn_data
[icode
].operand
[2].predicate (op2
, Pmode
))
13077 op2
= copy_to_mode_reg (Pmode
, op2
);
13079 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
	  error ("the fourth argument must be scale 1, 2, 4, 8");
13085 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
13087 error ("incorrect hint operand");
13091 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
13099 case IX86_BUILTIN_XABORT
:
13100 icode
= CODE_FOR_xabort
;
13101 arg0
= CALL_EXPR_ARG (exp
, 0);
13102 op0
= expand_normal (arg0
);
13103 mode0
= insn_data
[icode
].operand
[0].mode
;
13104 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
13106 error ("the argument to %<xabort%> intrinsic must "
13107 "be an 8-bit immediate");
13110 emit_insn (gen_xabort (op0
));
13113 case IX86_BUILTIN_RDSSPD
:
13114 case IX86_BUILTIN_RDSSPQ
:
13115 mode
= (fcode
== IX86_BUILTIN_RDSSPD
? SImode
: DImode
);
13118 || !register_operand (target
, mode
))
13119 target
= gen_reg_rtx (mode
);
13121 op0
= force_reg (mode
, const0_rtx
);
13123 emit_insn (gen_rdssp (mode
, target
, op0
));
13126 case IX86_BUILTIN_INCSSPD
:
13127 case IX86_BUILTIN_INCSSPQ
:
13128 mode
= (fcode
== IX86_BUILTIN_INCSSPD
? SImode
: DImode
);
13130 arg0
= CALL_EXPR_ARG (exp
, 0);
13131 op0
= expand_normal (arg0
);
13133 op0
= force_reg (mode
, op0
);
13135 emit_insn (gen_incssp (mode
, op0
));
13138 case IX86_BUILTIN_HRESET
:
13139 icode
= CODE_FOR_hreset
;
13140 arg0
= CALL_EXPR_ARG (exp
, 0);
13141 op0
= expand_normal (arg0
);
13142 op0
= force_reg (SImode
, op0
);
13143 emit_insn (gen_hreset (op0
));
13146 case IX86_BUILTIN_RSTORSSP
:
13147 case IX86_BUILTIN_CLRSSBSY
:
13148 arg0
= CALL_EXPR_ARG (exp
, 0);
13149 op0
= expand_normal (arg0
);
13150 icode
= (fcode
== IX86_BUILTIN_RSTORSSP
13151 ? CODE_FOR_rstorssp
13152 : CODE_FOR_clrssbsy
);
13154 if (!address_operand (op0
, VOIDmode
))
13156 op0
= convert_memory_address (Pmode
, op0
);
13157 op0
= copy_addr_to_reg (op0
);
13159 emit_insn (GEN_FCN (icode
) (gen_rtx_MEM (DImode
, op0
)));
13162 case IX86_BUILTIN_WRSSD
:
13163 case IX86_BUILTIN_WRSSQ
:
13164 case IX86_BUILTIN_WRUSSD
:
13165 case IX86_BUILTIN_WRUSSQ
:
13166 mode
= ((fcode
== IX86_BUILTIN_WRSSD
13167 || fcode
== IX86_BUILTIN_WRUSSD
)
13168 ? SImode
: DImode
);
13170 arg0
= CALL_EXPR_ARG (exp
, 0);
13171 op0
= expand_normal (arg0
);
13172 arg1
= CALL_EXPR_ARG (exp
, 1);
13173 op1
= expand_normal (arg1
);
13175 op0
= force_reg (mode
, op0
);
13177 if (!address_operand (op1
, VOIDmode
))
13179 op1
= convert_memory_address (Pmode
, op1
);
13180 op1
= copy_addr_to_reg (op1
);
13182 op1
= gen_rtx_MEM (mode
, op1
);
13184 icode
= ((fcode
== IX86_BUILTIN_WRSSD
13185 || fcode
== IX86_BUILTIN_WRSSQ
)
13186 ? code_for_wrss (mode
)
13187 : code_for_wruss (mode
));
13188 emit_insn (GEN_FCN (icode
) (op0
, op1
));
  if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
      return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
					       target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
      rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
      rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
      rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
      machine_mode mode, wide_mode, nar_mode;
13214 nar_mode
= V4SFmode
;
13216 wide_mode
= V64SFmode
;
13217 fcn_mask
= gen_avx5124fmaddps_4fmaddps_mask
;
13218 fcn_maskz
= gen_avx5124fmaddps_4fmaddps_maskz
;
13222 case IX86_BUILTIN_4FMAPS
:
13223 fcn
= gen_avx5124fmaddps_4fmaddps
;
13227 case IX86_BUILTIN_4DPWSSD
:
13228 nar_mode
= V4SImode
;
13230 wide_mode
= V64SImode
;
13231 fcn
= gen_avx5124vnniw_vp4dpwssd
;
13235 case IX86_BUILTIN_4DPWSSDS
:
13236 nar_mode
= V4SImode
;
13238 wide_mode
= V64SImode
;
13239 fcn
= gen_avx5124vnniw_vp4dpwssds
;
13243 case IX86_BUILTIN_4FNMAPS
:
13244 fcn
= gen_avx5124fmaddps_4fnmaddps
;
13248 case IX86_BUILTIN_4FNMAPS_MASK
:
13249 fcn_mask
= gen_avx5124fmaddps_4fnmaddps_mask
;
13250 fcn_maskz
= gen_avx5124fmaddps_4fnmaddps_maskz
;
13253 case IX86_BUILTIN_4DPWSSD_MASK
:
13254 nar_mode
= V4SImode
;
13256 wide_mode
= V64SImode
;
13257 fcn_mask
= gen_avx5124vnniw_vp4dpwssd_mask
;
13258 fcn_maskz
= gen_avx5124vnniw_vp4dpwssd_maskz
;
13261 case IX86_BUILTIN_4DPWSSDS_MASK
:
13262 nar_mode
= V4SImode
;
13264 wide_mode
= V64SImode
;
13265 fcn_mask
= gen_avx5124vnniw_vp4dpwssds_mask
;
13266 fcn_maskz
= gen_avx5124vnniw_vp4dpwssds_maskz
;
13269 case IX86_BUILTIN_4FMAPS_MASK
:
13279 wide_reg
= gen_reg_rtx (wide_mode
);
13280 for (i
= 0; i
< 4; i
++)
13282 args
[i
] = CALL_EXPR_ARG (exp
, i
);
13283 ops
[i
] = expand_normal (args
[i
]);
13285 emit_move_insn (gen_rtx_SUBREG (mode
, wide_reg
, i
* 64),
13289 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
13290 accum
= force_reg (mode
, accum
);
13292 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
13293 addr
= force_reg (Pmode
, addr
);
13295 mem
= gen_rtx_MEM (nar_mode
, addr
);
13297 target
= gen_reg_rtx (mode
);
13299 emit_move_insn (target
, accum
);
13302 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
13306 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
13308 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
13310 if (CONST_INT_P (mask
))
13311 mask
= fixup_modeless_constant (mask
, HImode
);
13313 mask
= force_reg (HImode
, mask
);
13315 if (GET_MODE (mask
) != HImode
)
13316 mask
= gen_rtx_SUBREG (HImode
, mask
, 0);
13318 /* If merge is 0 then we're about to emit z-masked variant. */
13319 if (const0_operand (merge
, mode
))
13320 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
13321 /* If merge is the same as accum then emit merge-masked variant. */
13322 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
13324 merge
= force_reg (mode
, merge
);
13325 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
13327 /* Merge with something unknown might happen if we z-mask w/ -O0. */
13330 target
= gen_reg_rtx (mode
);
13331 emit_move_insn (target
, merge
);
13332 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
13338 case IX86_BUILTIN_4FNMASS
:
13339 fcn
= gen_avx5124fmaddps_4fnmaddss
;
13343 case IX86_BUILTIN_4FMASS
:
13344 fcn
= gen_avx5124fmaddps_4fmaddss
;
13348 case IX86_BUILTIN_4FNMASS_MASK
:
13349 fcn_mask
= gen_avx5124fmaddps_4fnmaddss_mask
;
13350 fcn_maskz
= gen_avx5124fmaddps_4fnmaddss_maskz
;
13353 case IX86_BUILTIN_4FMASS_MASK
:
13362 fcn_mask
= gen_avx5124fmaddps_4fmaddss_mask
;
13363 fcn_maskz
= gen_avx5124fmaddps_4fmaddss_maskz
;
13367 wide_reg
= gen_reg_rtx (V64SFmode
);
13368 for (i
= 0; i
< 4; i
++)
13371 args
[i
] = CALL_EXPR_ARG (exp
, i
);
13372 ops
[i
] = expand_normal (args
[i
]);
13374 tmp
= gen_reg_rtx (SFmode
);
13375 emit_move_insn (tmp
, gen_rtx_SUBREG (SFmode
, ops
[i
], 0));
13377 emit_move_insn (gen_rtx_SUBREG (V16SFmode
, wide_reg
, i
* 64),
13378 gen_rtx_SUBREG (V16SFmode
, tmp
, 0));
13381 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
13382 accum
= force_reg (V4SFmode
, accum
);
13384 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
13385 addr
= force_reg (Pmode
, addr
);
13387 mem
= gen_rtx_MEM (V4SFmode
, addr
);
13389 target
= gen_reg_rtx (V4SFmode
);
13391 emit_move_insn (target
, accum
);
13394 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
13398 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
13400 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
13402 if (CONST_INT_P (mask
))
13403 mask
= fixup_modeless_constant (mask
, QImode
);
13405 mask
= force_reg (QImode
, mask
);
13407 if (GET_MODE (mask
) != QImode
)
13408 mask
= gen_rtx_SUBREG (QImode
, mask
, 0);
13410 /* If merge is 0 then we're about to emit z-masked variant. */
13411 if (const0_operand (merge
, mode
))
13412 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
13413 /* If merge is the same as accum then emit merge-masked
13415 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
13417 merge
= force_reg (mode
, merge
);
13418 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
13420 /* Merge with something unknown might happen if we z-mask
13424 target
= gen_reg_rtx (mode
);
13425 emit_move_insn (target
, merge
);
13426 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
	case IX86_BUILTIN_RDPID:
	  return ix86_expand_special_args_builtin (bdesc_args + i, exp,
						   target);
	case IX86_BUILTIN_FABSQ:
	case IX86_BUILTIN_COPYSIGNQ:
	  if (!TARGET_SSE)
	    /* Emit a normal call if SSE isn't available.  */
	    return expand_call (exp, target, ignore);
	  /* FALLTHRU */
	default:
	  return ix86_expand_args_builtin (bdesc_args + i, exp, target);
	}
    }

  if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
      && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
      return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
      return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
      return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
      return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
      && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
      const struct builtin_description *d = bdesc_multi_arg + i;
      return ix86_expand_multi_arg_builtin (d->icode, exp, target,
					    (enum ix86_builtin_func_type)
					    d->flag, d->comparison);
    }

  if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
      && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
      return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
					       target);
    }

  gcc_unreachable ();
}
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */
      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
	reg = gen_lowpart (innermode, reg);
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  return true;
}

/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}

static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
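/* For instance, under the genmodes.c ordering relied on above,
   get_mode_wider_vector maps V16QImode (sixteen 8-bit lanes) to
   V8HImode (eight 16-bit lanes): same 128-bit vector size, elements
   twice as wide, half as many units.  */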
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.  */

static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
13576 return ix86_vector_duplicate_value (mode
, target
, val
);
13581 if (TARGET_SSE
|| TARGET_3DNOW_A
)
13585 val
= gen_lowpart (SImode
, val
);
13586 x
= gen_rtx_TRUNCATE (HImode
, val
);
13587 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
13588 emit_insn (gen_rtx_SET (target
, x
));
13600 return ix86_vector_duplicate_value (mode
, target
, val
);
13604 struct expand_vec_perm_d dperm
;
13608 memset (&dperm
, 0, sizeof (dperm
));
13609 dperm
.target
= target
;
13610 dperm
.vmode
= mode
;
13611 dperm
.nelt
= GET_MODE_NUNITS (mode
);
13612 dperm
.op0
= dperm
.op1
= gen_reg_rtx (mode
);
13613 dperm
.one_operand_p
= true;
13615 /* Extend to SImode using a paradoxical SUBREG. */
13616 tmp1
= gen_reg_rtx (SImode
);
13617 emit_move_insn (tmp1
, gen_lowpart (SImode
, val
));
13619 /* Insert the SImode value as low element of a V4SImode vector. */
13620 tmp2
= gen_reg_rtx (V4SImode
);
13621 emit_insn (gen_vec_setv4si_0 (tmp2
, CONST0_RTX (V4SImode
), tmp1
));
13622 emit_move_insn (dperm
.op0
, gen_lowpart (mode
, tmp2
));
13624 ok
= (expand_vec_perm_1 (&dperm
)
13625 || expand_vec_perm_broadcast_1 (&dperm
));
13633 return ix86_vector_duplicate_value (mode
, target
, val
);
13640 /* Replicate the value once into the next wider mode and recurse. */
13642 machine_mode smode
, wsmode
, wvmode
;
13645 smode
= GET_MODE_INNER (mode
);
13646 wvmode
= get_mode_wider_vector (mode
);
13647 wsmode
= GET_MODE_INNER (wvmode
);
13649 val
= convert_modes (wsmode
, smode
, val
, true);
13650 x
= expand_simple_binop (wsmode
, ASHIFT
, val
,
13651 GEN_INT (GET_MODE_BITSIZE (smode
)),
13652 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
13653 val
= expand_simple_binop (wsmode
, IOR
, val
, x
, x
, 1, OPTAB_LIB_WIDEN
);
13655 x
= gen_reg_rtx (wvmode
);
13656 ok
= ix86_expand_vector_init_duplicate (mmx_ok
, wvmode
, x
, val
);
13658 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), x
));
13665 return ix86_vector_duplicate_value (mode
, target
, val
);
13668 machine_mode hvmode
= (mode
== V16HImode
? V8HImode
: V16QImode
);
13669 rtx x
= gen_reg_rtx (hvmode
);
13671 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
13674 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
13675 emit_insn (gen_rtx_SET (target
, x
));
13681 if (TARGET_AVX512BW
)
13682 return ix86_vector_duplicate_value (mode
, target
, val
);
13685 machine_mode hvmode
= (mode
== V32HImode
? V16HImode
: V32QImode
);
13686 rtx x
= gen_reg_rtx (hvmode
);
13688 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
13691 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
13692 emit_insn (gen_rtx_SET (target
, x
));
13701 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13702 whose ONE_VAR element is VAR, and other elements are zero. Return true
13706 ix86_expand_vector_init_one_nonzero (bool mmx_ok
, machine_mode mode
,
13707 rtx target
, rtx var
, int one_var
)
13709 machine_mode vsimode
;
13712 bool use_vector_set
= false;
13713 rtx (*gen_vec_set_0
) (rtx
, rtx
, rtx
) = NULL
;
13718 /* For SSE4.1, we normally use vector set. But if the second
13719 element is zero and inter-unit moves are OK, we use movq
13721 use_vector_set
= (TARGET_64BIT
&& TARGET_SSE4_1
13722 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13728 use_vector_set
= TARGET_SSE4_1
;
13731 use_vector_set
= TARGET_SSE2
;
13734 use_vector_set
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
13737 use_vector_set
= TARGET_SSE
|| TARGET_3DNOW_A
;
13741 use_vector_set
= TARGET_AVX
;
13744 use_vector_set
= TARGET_AVX
;
13745 gen_vec_set_0
= gen_vec_setv8si_0
;
13748 use_vector_set
= TARGET_AVX
;
13749 gen_vec_set_0
= gen_vec_setv8sf_0
;
13752 use_vector_set
= TARGET_AVX
;
13753 gen_vec_set_0
= gen_vec_setv4df_0
;
13756 /* Use ix86_expand_vector_set in 64bit mode only. */
13757 use_vector_set
= TARGET_AVX
&& TARGET_64BIT
;
13758 gen_vec_set_0
= gen_vec_setv4di_0
;
13761 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13762 gen_vec_set_0
= gen_vec_setv16si_0
;
13765 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13766 gen_vec_set_0
= gen_vec_setv16sf_0
;
13769 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13770 gen_vec_set_0
= gen_vec_setv8df_0
;
13773 /* Use ix86_expand_vector_set in 64bit mode only. */
13774 use_vector_set
= TARGET_AVX512F
&& TARGET_64BIT
&& one_var
== 0;
13775 gen_vec_set_0
= gen_vec_setv8di_0
;
13781 if (use_vector_set
)
13783 if (gen_vec_set_0
&& one_var
== 0)
13785 var
= force_reg (GET_MODE_INNER (mode
), var
);
13786 emit_insn (gen_vec_set_0 (target
, CONST0_RTX (mode
), var
));
13789 emit_insn (gen_rtx_SET (target
, CONST0_RTX (mode
)));
13790 var
= force_reg (GET_MODE_INNER (mode
), var
);
13791 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
13807 var
= force_reg (GET_MODE_INNER (mode
), var
);
13808 x
= gen_rtx_VEC_CONCAT (mode
, var
, CONST0_RTX (GET_MODE_INNER (mode
)));
13809 emit_insn (gen_rtx_SET (target
, x
));
13814 if (!REG_P (target
) || REGNO (target
) < FIRST_PSEUDO_REGISTER
)
13815 new_target
= gen_reg_rtx (mode
);
13817 new_target
= target
;
13818 var
= force_reg (GET_MODE_INNER (mode
), var
);
13819 x
= gen_rtx_VEC_DUPLICATE (mode
, var
);
13820 x
= gen_rtx_VEC_MERGE (mode
, x
, CONST0_RTX (mode
), const1_rtx
);
13821 emit_insn (gen_rtx_SET (new_target
, x
));
13824 /* We need to shuffle the value to the correct position, so
13825 create a new pseudo to store the intermediate result. */
13827 /* With SSE2, we can use the integer shuffle insns. */
13828 if (mode
!= V4SFmode
&& TARGET_SSE2
)
13830 emit_insn (gen_sse2_pshufd_1 (new_target
, new_target
,
13832 GEN_INT (one_var
== 1 ? 0 : 1),
13833 GEN_INT (one_var
== 2 ? 0 : 1),
13834 GEN_INT (one_var
== 3 ? 0 : 1)));
13835 if (target
!= new_target
)
13836 emit_move_insn (target
, new_target
);
13840 /* Otherwise convert the intermediate result to V4SFmode and
13841 use the SSE1 shuffle instructions. */
13842 if (mode
!= V4SFmode
)
13844 tmp
= gen_reg_rtx (V4SFmode
);
13845 emit_move_insn (tmp
, gen_lowpart (V4SFmode
, new_target
));
13850 emit_insn (gen_sse_shufps_v4sf (tmp
, tmp
, tmp
,
13852 GEN_INT (one_var
== 1 ? 0 : 1),
13853 GEN_INT (one_var
== 2 ? 0+4 : 1+4),
13854 GEN_INT (one_var
== 3 ? 0+4 : 1+4)));
13856 if (mode
!= V4SFmode
)
13857 emit_move_insn (target
, gen_lowpart (V4SImode
, tmp
));
13858 else if (tmp
!= target
)
13859 emit_move_insn (target
, tmp
);
13861 else if (target
!= new_target
)
13862 emit_move_insn (target
, new_target
);
13867 vsimode
= V4SImode
;
13873 vsimode
= V2SImode
;
13879 /* Zero extend the variable element to SImode and recurse. */
13880 var
= convert_modes (SImode
, GET_MODE_INNER (mode
), var
, true);
13882 x
= gen_reg_rtx (vsimode
);
13883 if (!ix86_expand_vector_init_one_nonzero (mmx_ok
, vsimode
, x
,
13885 gcc_unreachable ();
13887 emit_move_insn (target
, gen_lowpart (mode
, x
));
13895 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13896 consisting of the values in VALS. It is known that all elements
13897 except ONE_VAR are constants. Return true if successful. */
13900 ix86_expand_vector_init_one_var (bool mmx_ok
, machine_mode mode
,
13901 rtx target
, rtx vals
, int one_var
)
13903 rtx var
= XVECEXP (vals
, 0, one_var
);
13904 machine_mode wmode
;
13907 const_vec
= copy_rtx (vals
);
13908 XVECEXP (const_vec
, 0, one_var
) = CONST0_RTX (GET_MODE_INNER (mode
));
13909 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (const_vec
, 0));
13917 /* For the two element vectors, it's just as easy to use
13918 the general case. */
13922 /* Use ix86_expand_vector_set in 64bit mode only. */
13943 if (TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
)
13948 /* There's no way to set one QImode entry easily. Combine
13949 the variable value with its adjacent constant value, and
13950 promote to an HImode set. */
13951 x
= XVECEXP (vals
, 0, one_var
^ 1);
13954 var
= convert_modes (HImode
, QImode
, var
, true);
13955 var
= expand_simple_binop (HImode
, ASHIFT
, var
, GEN_INT (8),
13956 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
13957 x
= GEN_INT (INTVAL (x
) & 0xff);
13961 var
= convert_modes (HImode
, QImode
, var
, true);
13962 x
= gen_int_mode (UINTVAL (x
) << 8, HImode
);
13964 if (x
!= const0_rtx
)
13965 var
= expand_simple_binop (HImode
, IOR
, var
, x
, var
,
13966 1, OPTAB_LIB_WIDEN
);
13968 x
= gen_reg_rtx (wmode
);
13969 emit_move_insn (x
, gen_lowpart (wmode
, const_vec
));
13970 ix86_expand_vector_set (mmx_ok
, x
, var
, one_var
>> 1);
13972 emit_move_insn (target
, gen_lowpart (mode
, x
));
13979 emit_move_insn (target
, const_vec
);
13980 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
13984 /* A subroutine of ix86_expand_vector_init_general. Use vector
13985 concatenate to handle the most general case: all values variable,
13986 and none identical. */
13989 ix86_expand_vector_init_concat (machine_mode mode
,
13990 rtx target
, rtx
*ops
, int n
)
13992 machine_mode half_mode
= VOIDmode
;
14003 half_mode
= V8SImode
;
14006 half_mode
= V8SFmode
;
14009 half_mode
= V4DImode
;
14012 half_mode
= V4DFmode
;
14015 half_mode
= V4SImode
;
14018 half_mode
= V4SFmode
;
14021 half_mode
= V2DImode
;
14024 half_mode
= V2DFmode
;
14027 half_mode
= V2SImode
;
14030 half_mode
= V2SFmode
;
14033 half_mode
= DImode
;
14036 half_mode
= SImode
;
14039 half_mode
= DFmode
;
14042 half_mode
= SFmode
;
14045 gcc_unreachable ();
14048 if (!register_operand (ops
[1], half_mode
))
14049 ops
[1] = force_reg (half_mode
, ops
[1]);
14050 if (!register_operand (ops
[0], half_mode
))
14051 ops
[0] = force_reg (half_mode
, ops
[0]);
14052 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, ops
[0],
14060 half_mode
= V2DImode
;
14063 half_mode
= V2DFmode
;
14066 half_mode
= V2SImode
;
14069 half_mode
= V2SFmode
;
14072 gcc_unreachable ();
14080 half_mode
= V4DImode
;
14083 half_mode
= V4DFmode
;
14086 half_mode
= V4SImode
;
14089 half_mode
= V4SFmode
;
14092 gcc_unreachable ();
14100 half_mode
= V8SImode
;
14103 half_mode
= V8SFmode
;
14106 gcc_unreachable ();
14111 /* FIXME: We process inputs backward to help RA. PR 36222. */
14113 for (j
= 1; j
!= -1; j
--)
14115 half
[j
] = gen_reg_rtx (half_mode
);
14119 v
= gen_rtvec (2, ops
[i
-1], ops
[i
]);
14123 v
= gen_rtvec (4, ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
14127 v
= gen_rtvec (8, ops
[i
-7], ops
[i
-6], ops
[i
-5], ops
[i
-4],
14128 ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
14132 gcc_unreachable ();
14134 ix86_expand_vector_init (false, half
[j
],
14135 gen_rtx_PARALLEL (half_mode
, v
));
14138 ix86_expand_vector_init_concat (mode
, target
, half
, 2);
14142 gcc_unreachable ();
14146 /* A subroutine of ix86_expand_vector_init_general. Use vector
14147 interleave to handle the most general case: all values variable,
14148 and none identical. */
14151 ix86_expand_vector_init_interleave (machine_mode mode
,
14152 rtx target
, rtx
*ops
, int n
)
14154 machine_mode first_imode
, second_imode
, third_imode
, inner_mode
;
14157 rtx (*gen_load_even
) (rtx
, rtx
, rtx
);
14158 rtx (*gen_interleave_first_low
) (rtx
, rtx
, rtx
);
14159 rtx (*gen_interleave_second_low
) (rtx
, rtx
, rtx
);
14164 gen_load_even
= gen_vec_setv8hi
;
14165 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
14166 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
14167 inner_mode
= HImode
;
14168 first_imode
= V4SImode
;
14169 second_imode
= V2DImode
;
14170 third_imode
= VOIDmode
;
14173 gen_load_even
= gen_vec_setv16qi
;
14174 gen_interleave_first_low
= gen_vec_interleave_lowv8hi
;
14175 gen_interleave_second_low
= gen_vec_interleave_lowv4si
;
14176 inner_mode
= QImode
;
14177 first_imode
= V8HImode
;
14178 second_imode
= V4SImode
;
14179 third_imode
= V2DImode
;
14182 gcc_unreachable ();
14185 for (i
= 0; i
< n
; i
++)
14187 /* Extend the odd elment to SImode using a paradoxical SUBREG. */
14188 op0
= gen_reg_rtx (SImode
);
14189 emit_move_insn (op0
, gen_lowpart (SImode
, ops
[i
+ i
]));
14191 /* Insert the SImode value as low element of V4SImode vector. */
14192 op1
= gen_reg_rtx (V4SImode
);
14193 op0
= gen_rtx_VEC_MERGE (V4SImode
,
14194 gen_rtx_VEC_DUPLICATE (V4SImode
,
14196 CONST0_RTX (V4SImode
),
14198 emit_insn (gen_rtx_SET (op1
, op0
));
14200 /* Cast the V4SImode vector back to a vector in orignal mode. */
14201 op0
= gen_reg_rtx (mode
);
14202 emit_move_insn (op0
, gen_lowpart (mode
, op1
));
14204 /* Load even elements into the second position. */
14205 emit_insn (gen_load_even (op0
,
14206 force_reg (inner_mode
,
14210 /* Cast vector to FIRST_IMODE vector. */
14211 ops
[i
] = gen_reg_rtx (first_imode
);
14212 emit_move_insn (ops
[i
], gen_lowpart (first_imode
, op0
));
14215 /* Interleave low FIRST_IMODE vectors. */
14216 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
14218 op0
= gen_reg_rtx (first_imode
);
14219 emit_insn (gen_interleave_first_low (op0
, ops
[i
], ops
[i
+ 1]));
14221 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
14222 ops
[j
] = gen_reg_rtx (second_imode
);
14223 emit_move_insn (ops
[j
], gen_lowpart (second_imode
, op0
));
14226 /* Interleave low SECOND_IMODE vectors. */
14227 switch (second_imode
)
14230 for (i
= j
= 0; i
< n
/ 2; i
+= 2, j
++)
14232 op0
= gen_reg_rtx (second_imode
);
14233 emit_insn (gen_interleave_second_low (op0
, ops
[i
],
14236 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
14238 ops
[j
] = gen_reg_rtx (third_imode
);
14239 emit_move_insn (ops
[j
], gen_lowpart (third_imode
, op0
));
14241 second_imode
= V2DImode
;
14242 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
14246 op0
= gen_reg_rtx (second_imode
);
14247 emit_insn (gen_interleave_second_low (op0
, ops
[0],
14250 /* Cast the SECOND_IMODE vector back to a vector on original
14252 emit_insn (gen_rtx_SET (target
, gen_lowpart (mode
, op0
)));
14256 gcc_unreachable ();
14260 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
14261 all values variable, and none identical. */
14264 ix86_expand_vector_init_general (bool mmx_ok
, machine_mode mode
,
14265 rtx target
, rtx vals
)
14267 rtx ops
[64], op0
, op1
, op2
, op3
, op4
, op5
;
14268 machine_mode half_mode
= VOIDmode
;
14269 machine_mode quarter_mode
= VOIDmode
;
14276 if (!mmx_ok
&& !TARGET_SSE
)
14292 n
= GET_MODE_NUNITS (mode
);
14293 for (i
= 0; i
< n
; i
++)
14294 ops
[i
] = XVECEXP (vals
, 0, i
);
14295 ix86_expand_vector_init_concat (mode
, target
, ops
, n
);
14299 for (i
= 0; i
< 2; i
++)
14300 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
14301 op0
= gen_reg_rtx (V4DImode
);
14302 ix86_expand_vector_init_concat (V4DImode
, op0
, ops
, 2);
14303 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
14307 for (i
= 0; i
< 4; i
++)
14308 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
14309 ops
[4] = gen_reg_rtx (V4DImode
);
14310 ix86_expand_vector_init_concat (V4DImode
, ops
[4], ops
, 2);
14311 ops
[5] = gen_reg_rtx (V4DImode
);
14312 ix86_expand_vector_init_concat (V4DImode
, ops
[5], ops
+ 2, 2);
14313 op0
= gen_reg_rtx (V8DImode
);
14314 ix86_expand_vector_init_concat (V8DImode
, op0
, ops
+ 4, 2);
14315 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
14319 half_mode
= V16QImode
;
14323 half_mode
= V8HImode
;
14327 n
= GET_MODE_NUNITS (mode
);
14328 for (i
= 0; i
< n
; i
++)
14329 ops
[i
] = XVECEXP (vals
, 0, i
);
14330 op0
= gen_reg_rtx (half_mode
);
14331 op1
= gen_reg_rtx (half_mode
);
14332 ix86_expand_vector_init_interleave (half_mode
, op0
, ops
,
14334 ix86_expand_vector_init_interleave (half_mode
, op1
,
14335 &ops
[n
>> 1], n
>> 2);
14336 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op0
, op1
)));
14340 quarter_mode
= V16QImode
;
14341 half_mode
= V32QImode
;
14345 quarter_mode
= V8HImode
;
14346 half_mode
= V16HImode
;
14350 n
= GET_MODE_NUNITS (mode
);
14351 for (i
= 0; i
< n
; i
++)
14352 ops
[i
] = XVECEXP (vals
, 0, i
);
14353 op0
= gen_reg_rtx (quarter_mode
);
14354 op1
= gen_reg_rtx (quarter_mode
);
14355 op2
= gen_reg_rtx (quarter_mode
);
14356 op3
= gen_reg_rtx (quarter_mode
);
14357 op4
= gen_reg_rtx (half_mode
);
14358 op5
= gen_reg_rtx (half_mode
);
14359 ix86_expand_vector_init_interleave (quarter_mode
, op0
, ops
,
14361 ix86_expand_vector_init_interleave (quarter_mode
, op1
,
14362 &ops
[n
>> 2], n
>> 3);
14363 ix86_expand_vector_init_interleave (quarter_mode
, op2
,
14364 &ops
[n
>> 1], n
>> 3);
14365 ix86_expand_vector_init_interleave (quarter_mode
, op3
,
14366 &ops
[(n
>> 1) | (n
>> 2)], n
>> 3);
14367 emit_insn (gen_rtx_SET (op4
, gen_rtx_VEC_CONCAT (half_mode
, op0
, op1
)));
14368 emit_insn (gen_rtx_SET (op5
, gen_rtx_VEC_CONCAT (half_mode
, op2
, op3
)));
14369 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op4
, op5
)));
14373 if (!TARGET_SSE4_1
)
14381 /* Don't use ix86_expand_vector_init_interleave if we can't
14382 move from GPR to SSE register directly. */
14383 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
)
14386 n
= GET_MODE_NUNITS (mode
);
14387 for (i
= 0; i
< n
; i
++)
14388 ops
[i
] = XVECEXP (vals
, 0, i
);
14389 ix86_expand_vector_init_interleave (mode
, target
, ops
, n
>> 1);
14397 gcc_unreachable ();
14401 int i
, j
, n_elts
, n_words
, n_elt_per_word
;
14402 machine_mode inner_mode
;
14403 rtx words
[4], shift
;
14405 inner_mode
= GET_MODE_INNER (mode
);
14406 n_elts
= GET_MODE_NUNITS (mode
);
14407 n_words
= GET_MODE_SIZE (mode
) / UNITS_PER_WORD
;
14408 n_elt_per_word
= n_elts
/ n_words
;
14409 shift
= GEN_INT (GET_MODE_BITSIZE (inner_mode
));
14411 for (i
= 0; i
< n_words
; ++i
)
14413 rtx word
= NULL_RTX
;
14415 for (j
= 0; j
< n_elt_per_word
; ++j
)
14417 rtx elt
= XVECEXP (vals
, 0, (i
+1)*n_elt_per_word
- j
- 1);
14418 elt
= convert_modes (word_mode
, inner_mode
, elt
, true);
14424 word
= expand_simple_binop (word_mode
, ASHIFT
, word
, shift
,
14425 word
, 1, OPTAB_LIB_WIDEN
);
14426 word
= expand_simple_binop (word_mode
, IOR
, word
, elt
,
14427 word
, 1, OPTAB_LIB_WIDEN
);
14435 emit_move_insn (target
, gen_lowpart (mode
, words
[0]));
14436 else if (n_words
== 2)
14438 rtx tmp
= gen_reg_rtx (mode
);
14439 emit_clobber (tmp
);
14440 emit_move_insn (gen_lowpart (word_mode
, tmp
), words
[0]);
14441 emit_move_insn (gen_highpart (word_mode
, tmp
), words
[1]);
14442 emit_move_insn (target
, tmp
);
14444 else if (n_words
== 4)
14446 rtx tmp
= gen_reg_rtx (V4SImode
);
14447 gcc_assert (word_mode
== SImode
);
14448 vals
= gen_rtx_PARALLEL (V4SImode
, gen_rtvec_v (4, words
));
14449 ix86_expand_vector_init_general (false, V4SImode
, tmp
, vals
);
14450 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
14453 gcc_unreachable ();
/* Initialize vector TARGET via VALS.  Suppress the use of MMX
   instructions unless MMX_OK is true.  */

void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;

  /* Handle first initialization from vector elts.  */
  if (n_elts != XVECLEN (vals, 0))
    {
      rtx subtarget = target;
      x = XVECEXP (vals, 0, 0);
      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
	{
	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
	  if (inner_mode == QImode || inner_mode == HImode)
	    {
	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
	      mode = mode_for_vector (SImode, n_bits / 4).require ();
	      inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
	      ops[0] = gen_lowpart (inner_mode, ops[0]);
	      ops[1] = gen_lowpart (inner_mode, ops[1]);
	      subtarget = gen_reg_rtx (mode);
	    }
	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
	  if (subtarget != target)
	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
	  return;
	}
      gcc_unreachable ();
    }

  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
	    || CONST_DOUBLE_P (x)
	    || CONST_FIXED_P (x)))
	n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
	all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
	all_same = false;
    }

  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
					    XVECEXP (vals, 0, 0)))
    return;

  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
						  XVECEXP (vals, 0, one_var),
						  one_var))
	return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
	return;
    }

  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
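/* An illustrative sketch (GNU C vector notation, not compiled here) of
   how the cases above divide up a V4SI initializer:

     (v4si) {x, x, x, x}  -> ix86_expand_vector_init_duplicate (broadcast)
     (v4si) {x, 0, 0, 0}  -> ix86_expand_vector_init_one_nonzero
     (v4si) {x, 1, 2, 3}  -> ix86_expand_vector_init_one_var
			     (constant-pool load, then overwrite one element)
     (v4si) {a, b, c, d}  -> ix86_expand_vector_init_general
			     (concat / interleave of variable elements).  */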
/* Implemented as
     V setg (V v, int idx, T val)
     {
       V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
       V valv = (V){val, val, val, val, val, val, val, val};
       V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
       v = (v & ~mask) | (valv & mask);
       return v;
     }.  */
void
ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
{
14553 machine_mode mode
= GET_MODE (target
);
14554 machine_mode cmp_mode
= mode
;
14555 int n_elts
= GET_MODE_NUNITS (mode
);
14556 rtx valv
,idxv
,constv
,idx_tmp
;
14559 /* 512-bits vector byte/word broadcast and comparison only available
14560 under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
14561 when without TARGET_AVX512BW. */
14562 if ((mode
== V32HImode
|| mode
== V64QImode
) && !TARGET_AVX512BW
)
14564 gcc_assert (TARGET_AVX512F
);
14565 rtx vhi
, vlo
, idx_hi
;
14566 machine_mode half_mode
;
14567 rtx (*extract_hi
)(rtx
, rtx
);
14568 rtx (*extract_lo
)(rtx
, rtx
);
14570 if (mode
== V32HImode
)
14572 half_mode
= V16HImode
;
14573 extract_hi
= gen_vec_extract_hi_v32hi
;
14574 extract_lo
= gen_vec_extract_lo_v32hi
;
14578 half_mode
= V32QImode
;
14579 extract_hi
= gen_vec_extract_hi_v64qi
;
14580 extract_lo
= gen_vec_extract_lo_v64qi
;
14583 vhi
= gen_reg_rtx (half_mode
);
14584 vlo
= gen_reg_rtx (half_mode
);
14585 idx_hi
= gen_reg_rtx (GET_MODE (idx
));
14586 emit_insn (extract_hi (vhi
, target
));
14587 emit_insn (extract_lo (vlo
, target
));
14590 vec
[2] = GEN_INT (n_elts
/2);
14591 ix86_expand_binary_operator (MINUS
, GET_MODE (idx
), vec
);
14592 ix86_expand_vector_set_var (vhi
, val
, idx_hi
);
14593 ix86_expand_vector_set_var (vlo
, val
, idx
);
14594 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, vlo
, vhi
)));
14598 if (FLOAT_MODE_P (GET_MODE_INNER (mode
)))
14603 cmp_mode
= V2DImode
;
14606 cmp_mode
= V4DImode
;
14609 cmp_mode
= V8DImode
;
14612 cmp_mode
= V4SImode
;
14615 cmp_mode
= V8SImode
;
14618 cmp_mode
= V16SImode
;
14621 gcc_unreachable ();
14625 for (int i
= 0; i
!= n_elts
; i
++)
14626 vec
[i
] = GEN_INT (i
);
14627 constv
= gen_rtx_CONST_VECTOR (cmp_mode
, gen_rtvec_v (n_elts
, vec
));
14628 valv
= gen_reg_rtx (mode
);
14629 idxv
= gen_reg_rtx (cmp_mode
);
14630 idx_tmp
= convert_to_mode (GET_MODE_INNER (cmp_mode
), idx
, 1);
14632 ok
= ix86_expand_vector_init_duplicate (false, mode
, valv
, val
);
14634 ok
= ix86_expand_vector_init_duplicate (false, cmp_mode
, idxv
, idx_tmp
);
14639 vec
[3] = gen_rtx_EQ (mode
, idxv
, constv
);
14642 ok
= ix86_expand_int_vcond (vec
);
14647 ix86_expand_vector_set (bool mmx_ok
, rtx target
, rtx val
, int elt
)
14649 machine_mode mode
= GET_MODE (target
);
14650 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14651 machine_mode half_mode
;
14652 bool use_vec_merge
= false;
14654 static rtx (*gen_extract
[6][2]) (rtx
, rtx
)
14656 { gen_vec_extract_lo_v32qi
, gen_vec_extract_hi_v32qi
},
14657 { gen_vec_extract_lo_v16hi
, gen_vec_extract_hi_v16hi
},
14658 { gen_vec_extract_lo_v8si
, gen_vec_extract_hi_v8si
},
14659 { gen_vec_extract_lo_v4di
, gen_vec_extract_hi_v4di
},
14660 { gen_vec_extract_lo_v8sf
, gen_vec_extract_hi_v8sf
},
14661 { gen_vec_extract_lo_v4df
, gen_vec_extract_hi_v4df
}
14663 static rtx (*gen_insert
[6][2]) (rtx
, rtx
, rtx
)
14665 { gen_vec_set_lo_v32qi
, gen_vec_set_hi_v32qi
},
14666 { gen_vec_set_lo_v16hi
, gen_vec_set_hi_v16hi
},
14667 { gen_vec_set_lo_v8si
, gen_vec_set_hi_v8si
},
14668 { gen_vec_set_lo_v4di
, gen_vec_set_hi_v4di
},
14669 { gen_vec_set_lo_v8sf
, gen_vec_set_hi_v8sf
},
14670 { gen_vec_set_lo_v4df
, gen_vec_set_hi_v4df
}
14673 machine_mode mmode
= VOIDmode
;
14674 rtx (*gen_blendm
) (rtx
, rtx
, rtx
, rtx
);
14679 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14687 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14688 ix86_expand_vector_extract (true, tmp
, target
, 1 - elt
);
14690 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14692 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14693 emit_insn (gen_rtx_SET (target
, tmp
));
14699 use_vec_merge
= TARGET_SSE4_1
&& TARGET_64BIT
;
14703 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14704 ix86_expand_vector_extract (false, tmp
, target
, 1 - elt
);
14706 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14708 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14709 emit_insn (gen_rtx_SET (target
, tmp
));
14713 /* NB: For ELT == 0, use standard scalar operation patterns which
14714 preserve the rest of the vector for combiner:
14717 (vec_duplicate:V2DF (reg:DF))
14727 /* For the two element vectors, we implement a VEC_CONCAT with
14728 the extraction of the other element. */
14730 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (1 - elt
)));
14731 tmp
= gen_rtx_VEC_SELECT (inner_mode
, target
, tmp
);
14734 op0
= val
, op1
= tmp
;
14736 op0
= tmp
, op1
= val
;
14738 tmp
= gen_rtx_VEC_CONCAT (mode
, op0
, op1
);
14739 emit_insn (gen_rtx_SET (target
, tmp
));
14744 use_vec_merge
= TARGET_SSE4_1
;
14751 use_vec_merge
= true;
14755 /* tmp = target = A B C D */
14756 tmp
= copy_to_reg (target
);
14757 /* target = A A B B */
14758 emit_insn (gen_vec_interleave_lowv4sf (target
, target
, target
));
14759 /* target = X A B B */
14760 ix86_expand_vector_set (false, target
, val
, 0);
14761 /* target = A X C D */
14762 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14763 const1_rtx
, const0_rtx
,
14764 GEN_INT (2+4), GEN_INT (3+4)));
14768 /* tmp = target = A B C D */
14769 tmp
= copy_to_reg (target
);
14770 /* tmp = X B C D */
14771 ix86_expand_vector_set (false, tmp
, val
, 0);
14772 /* target = A B X D */
14773 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14774 const0_rtx
, const1_rtx
,
14775 GEN_INT (0+4), GEN_INT (3+4)));
14779 /* tmp = target = A B C D */
14780 tmp
= copy_to_reg (target
);
14781 /* tmp = X B C D */
14782 ix86_expand_vector_set (false, tmp
, val
, 0);
14783 /* target = A B X D */
14784 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14785 const0_rtx
, const1_rtx
,
14786 GEN_INT (2+4), GEN_INT (0+4)));
14790 gcc_unreachable ();
14795 use_vec_merge
= TARGET_SSE4_1
;
14799 /* Element 0 handled by vec_merge below. */
14802 use_vec_merge
= true;
14808 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14809 store into element 0, then shuffle them back. */
14813 order
[0] = GEN_INT (elt
);
14814 order
[1] = const1_rtx
;
14815 order
[2] = const2_rtx
;
14816 order
[3] = GEN_INT (3);
14817 order
[elt
] = const0_rtx
;
14819 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14820 order
[1], order
[2], order
[3]));
14822 ix86_expand_vector_set (false, target
, val
, 0);
14824 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14825 order
[1], order
[2], order
[3]));
14829 /* For SSE1, we have to reuse the V4SF code. */
14830 rtx t
= gen_reg_rtx (V4SFmode
);
14831 emit_move_insn (t
, gen_lowpart (V4SFmode
, target
));
14832 ix86_expand_vector_set (false, t
, gen_lowpart (SFmode
, val
), elt
);
14833 emit_move_insn (target
, gen_lowpart (mode
, t
));
14838 use_vec_merge
= TARGET_SSE2
;
14841 use_vec_merge
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
14845 use_vec_merge
= TARGET_SSE4_1
;
14849 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14853 half_mode
= V16QImode
;
14859 half_mode
= V8HImode
;
14865 half_mode
= V4SImode
;
14871 half_mode
= V2DImode
;
14877 half_mode
= V4SFmode
;
14883 half_mode
= V2DFmode
;
14889 /* Compute offset. */
14893 gcc_assert (i
<= 1);
14895 /* Extract the half. */
14896 tmp
= gen_reg_rtx (half_mode
);
14897 emit_insn (gen_extract
[j
][i
] (tmp
, target
));
14899 /* Put val in tmp at elt. */
14900 ix86_expand_vector_set (false, tmp
, val
, elt
);
14903 emit_insn (gen_insert
[j
][i
] (target
, target
, tmp
));
14907 if (TARGET_AVX512F
)
14910 gen_blendm
= gen_avx512f_blendmv8df
;
14915 if (TARGET_AVX512F
)
14918 gen_blendm
= gen_avx512f_blendmv8di
;
14923 if (TARGET_AVX512F
)
14926 gen_blendm
= gen_avx512f_blendmv16sf
;
14931 if (TARGET_AVX512F
)
14934 gen_blendm
= gen_avx512f_blendmv16si
;
14939 if (TARGET_AVX512BW
)
14942 gen_blendm
= gen_avx512bw_blendmv32hi
;
14944 else if (TARGET_AVX512F
)
14946 half_mode
= E_V8HImode
;
14953 if (TARGET_AVX512BW
)
14956 gen_blendm
= gen_avx512bw_blendmv64qi
;
14958 else if (TARGET_AVX512F
)
14960 half_mode
= E_V16QImode
;
14967 /* Compute offset. */
14971 gcc_assert (i
<= 3);
14974 /* Extract the quarter. */
14975 tmp
= gen_reg_rtx (V4SImode
);
14976 rtx tmp2
= gen_lowpart (V16SImode
, target
);
14977 rtx mask
= gen_reg_rtx (QImode
);
14979 emit_move_insn (mask
, constm1_rtx
);
14980 emit_insn (gen_avx512f_vextracti32x4_mask (tmp
, tmp2
, GEN_INT (i
),
14983 tmp2
= gen_reg_rtx (half_mode
);
14984 emit_move_insn (tmp2
, gen_lowpart (half_mode
, tmp
));
14987 /* Put val in tmp at elt. */
14988 ix86_expand_vector_set (false, tmp
, val
, elt
);
14991 tmp2
= gen_reg_rtx (V16SImode
);
14992 rtx tmp3
= gen_lowpart (V16SImode
, target
);
14993 mask
= gen_reg_rtx (HImode
);
14994 emit_move_insn (mask
, constm1_rtx
);
14995 tmp
= gen_lowpart (V4SImode
, tmp
);
14996 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2
, tmp3
, tmp
, GEN_INT (i
),
14998 emit_move_insn (target
, gen_lowpart (mode
, tmp2
));
15006 if (mmode
!= VOIDmode
)
15008 tmp
= gen_reg_rtx (mode
);
15009 emit_insn (gen_rtx_SET (tmp
, gen_rtx_VEC_DUPLICATE (mode
, val
)));
15010 /* The avx512*_blendm<mode> expanders have different operand order
15011 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
15012 elements where the mask is set and second input operand otherwise,
15013 in {sse,avx}*_*blend* the first input operand is used for elements
15014 where the mask is clear and second input operand otherwise. */
15015 emit_insn (gen_blendm (target
, target
, tmp
,
15017 gen_int_mode (HOST_WIDE_INT_1U
<< elt
,
15020 else if (use_vec_merge
)
15023 tmp
= gen_rtx_VEC_DUPLICATE (mode
, val
);
15024 tmp
= gen_rtx_VEC_MERGE (mode
, tmp
, target
,
15025 GEN_INT (HOST_WIDE_INT_1U
<< elt
));
15026 emit_insn (gen_rtx_SET (target
, tmp
));
15030 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
15032 emit_move_insn (mem
, target
);
15034 tmp
= adjust_address (mem
, inner_mode
, elt
* GET_MODE_SIZE (inner_mode
));
15035 emit_move_insn (tmp
, val
);
15037 emit_move_insn (target
, mem
);
15042 ix86_expand_vector_extract (bool mmx_ok
, rtx target
, rtx vec
, int elt
)
15044 machine_mode mode
= GET_MODE (vec
);
15045 machine_mode inner_mode
= GET_MODE_INNER (mode
);
15046 bool use_vec_extr
= false;
15052 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
15066 use_vec_extr
= true;
15070 use_vec_extr
= TARGET_SSE4_1
;
15082 tmp
= gen_reg_rtx (mode
);
15083 emit_insn (gen_sse_shufps_v4sf (tmp
, vec
, vec
,
15084 GEN_INT (elt
), GEN_INT (elt
),
15085 GEN_INT (elt
+4), GEN_INT (elt
+4)));
15089 tmp
= gen_reg_rtx (mode
);
15090 emit_insn (gen_vec_interleave_highv4sf (tmp
, vec
, vec
));
15094 gcc_unreachable ();
15097 use_vec_extr
= true;
15102 use_vec_extr
= TARGET_SSE4_1
;
15116 tmp
= gen_reg_rtx (mode
);
15117 emit_insn (gen_sse2_pshufd_1 (tmp
, vec
,
15118 GEN_INT (elt
), GEN_INT (elt
),
15119 GEN_INT (elt
), GEN_INT (elt
)));
15123 tmp
= gen_reg_rtx (mode
);
15124 emit_insn (gen_vec_interleave_highv4si (tmp
, vec
, vec
));
15128 gcc_unreachable ();
15131 use_vec_extr
= true;
15136 /* For SSE1, we have to reuse the V4SF code. */
15137 ix86_expand_vector_extract (false, gen_lowpart (SFmode
, target
),
15138 gen_lowpart (V4SFmode
, vec
), elt
);
15144 use_vec_extr
= TARGET_SSE2
;
15147 use_vec_extr
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
15151 use_vec_extr
= TARGET_SSE4_1
;
15155 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC
))
15157 tmp
= gen_reg_rtx (SImode
);
15158 ix86_expand_vector_extract (false, tmp
, gen_lowpart (V4SImode
, vec
),
15160 emit_insn (gen_rtx_SET (target
, gen_lowpart (QImode
, tmp
)));
15168 tmp
= gen_reg_rtx (V4SFmode
);
15170 emit_insn (gen_vec_extract_lo_v8sf (tmp
, vec
));
15172 emit_insn (gen_vec_extract_hi_v8sf (tmp
, vec
));
15173 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15181 tmp
= gen_reg_rtx (V2DFmode
);
15183 emit_insn (gen_vec_extract_lo_v4df (tmp
, vec
));
15185 emit_insn (gen_vec_extract_hi_v4df (tmp
, vec
));
15186 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
15194 tmp
= gen_reg_rtx (V16QImode
);
15196 emit_insn (gen_vec_extract_lo_v32qi (tmp
, vec
));
15198 emit_insn (gen_vec_extract_hi_v32qi (tmp
, vec
));
15199 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
15207 tmp
= gen_reg_rtx (V8HImode
);
15209 emit_insn (gen_vec_extract_lo_v16hi (tmp
, vec
));
15211 emit_insn (gen_vec_extract_hi_v16hi (tmp
, vec
));
15212 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
15220 tmp
= gen_reg_rtx (V4SImode
);
15222 emit_insn (gen_vec_extract_lo_v8si (tmp
, vec
));
15224 emit_insn (gen_vec_extract_hi_v8si (tmp
, vec
));
15225 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15233 tmp
= gen_reg_rtx (V2DImode
);
15235 emit_insn (gen_vec_extract_lo_v4di (tmp
, vec
));
15237 emit_insn (gen_vec_extract_hi_v4di (tmp
, vec
));
15238 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
15244 if (TARGET_AVX512BW
)
15246 tmp
= gen_reg_rtx (V16HImode
);
15248 emit_insn (gen_vec_extract_lo_v32hi (tmp
, vec
));
15250 emit_insn (gen_vec_extract_hi_v32hi (tmp
, vec
));
15251 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
15257 if (TARGET_AVX512BW
)
15259 tmp
= gen_reg_rtx (V32QImode
);
15261 emit_insn (gen_vec_extract_lo_v64qi (tmp
, vec
));
15263 emit_insn (gen_vec_extract_hi_v64qi (tmp
, vec
));
15264 ix86_expand_vector_extract (false, target
, tmp
, elt
& 31);
15270 tmp
= gen_reg_rtx (V8SFmode
);
15272 emit_insn (gen_vec_extract_lo_v16sf (tmp
, vec
));
15274 emit_insn (gen_vec_extract_hi_v16sf (tmp
, vec
));
15275 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
15279 tmp
= gen_reg_rtx (V4DFmode
);
15281 emit_insn (gen_vec_extract_lo_v8df (tmp
, vec
));
15283 emit_insn (gen_vec_extract_hi_v8df (tmp
, vec
));
15284 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15288 tmp
= gen_reg_rtx (V8SImode
);
15290 emit_insn (gen_vec_extract_lo_v16si (tmp
, vec
));
15292 emit_insn (gen_vec_extract_hi_v16si (tmp
, vec
));
15293 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
15297 tmp
= gen_reg_rtx (V4DImode
);
15299 emit_insn (gen_vec_extract_lo_v8di (tmp
, vec
));
15301 emit_insn (gen_vec_extract_hi_v8di (tmp
, vec
));
15302 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15306 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
15307 /* ??? Could extract the appropriate HImode element and shift. */
15316 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (elt
)));
15317 tmp
= gen_rtx_VEC_SELECT (inner_mode
, vec
, tmp
);
15319 /* Let the rtl optimizers know about the zero extension performed. */
15320 if (inner_mode
== QImode
|| inner_mode
== HImode
)
15322 tmp
= gen_rtx_ZERO_EXTEND (SImode
, tmp
);
15323 target
= gen_lowpart (SImode
, target
);
15326 emit_insn (gen_rtx_SET (target
, tmp
));
15330 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
15332 emit_move_insn (mem
, vec
);
15334 tmp
= adjust_address (mem
, inner_mode
, elt
*GET_MODE_SIZE (inner_mode
));
15335 emit_move_insn (target
, tmp
);
15339 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
15340 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
15341 The upper bits of DEST are undefined, though they shouldn't cause
15342 exceptions (some bits from src or all zeros are ok). */
15345 emit_reduc_half (rtx dest
, rtx src
, int i
)
15348 switch (GET_MODE (src
))
15352 tem
= gen_sse_movhlps (dest
, src
, src
);
15354 tem
= gen_sse_shufps_v4sf (dest
, src
, src
, const1_rtx
, const1_rtx
,
15355 GEN_INT (1 + 4), GEN_INT (1 + 4));
15358 tem
= gen_vec_interleave_highv2df (dest
, src
, src
);
15364 d
= gen_reg_rtx (V1TImode
);
15365 tem
= gen_sse2_lshrv1ti3 (d
, gen_lowpart (V1TImode
, src
),
15370 tem
= gen_avx_vperm2f128v8sf3 (dest
, src
, src
, const1_rtx
);
15372 tem
= gen_avx_shufps256 (dest
, src
, src
,
15373 GEN_INT (i
== 128 ? 2 + (3 << 2) : 1));
15377 tem
= gen_avx_vperm2f128v4df3 (dest
, src
, src
, const1_rtx
);
15379 tem
= gen_avx_shufpd256 (dest
, src
, src
, const1_rtx
);
15387 if (GET_MODE (dest
) != V4DImode
)
15388 d
= gen_reg_rtx (V4DImode
);
15389 tem
= gen_avx2_permv2ti (d
, gen_lowpart (V4DImode
, src
),
15390 gen_lowpart (V4DImode
, src
),
15395 d
= gen_reg_rtx (V2TImode
);
15396 tem
= gen_avx2_lshrv2ti3 (d
, gen_lowpart (V2TImode
, src
),
15404 d
= gen_reg_rtx (V4TImode
);
15405 tem
= gen_avx512bw_lshrv4ti3 (d
, gen_lowpart (V4TImode
, src
),
15415 tem
= gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode
, dest
),
15416 gen_lowpart (V16SImode
, src
),
15417 gen_lowpart (V16SImode
, src
),
15418 GEN_INT (0x4 + (i
== 512 ? 4 : 0)),
15419 GEN_INT (0x5 + (i
== 512 ? 4 : 0)),
15420 GEN_INT (0x6 + (i
== 512 ? 4 : 0)),
15421 GEN_INT (0x7 + (i
== 512 ? 4 : 0)),
15422 GEN_INT (0xC), GEN_INT (0xD),
15423 GEN_INT (0xE), GEN_INT (0xF),
15424 GEN_INT (0x10), GEN_INT (0x11),
15425 GEN_INT (0x12), GEN_INT (0x13),
15426 GEN_INT (0x14), GEN_INT (0x15),
15427 GEN_INT (0x16), GEN_INT (0x17));
15429 tem
= gen_avx512f_pshufd_1 (gen_lowpart (V16SImode
, dest
),
15430 gen_lowpart (V16SImode
, src
),
15431 GEN_INT (i
== 128 ? 0x2 : 0x1),
15435 GEN_INT (i
== 128 ? 0x6 : 0x5),
15439 GEN_INT (i
== 128 ? 0xA : 0x9),
15443 GEN_INT (i
== 128 ? 0xE : 0xD),
15449 gcc_unreachable ();
15453 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
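/* Illustrative sketch of the halving loop above for a 128-bit vector of
   eight HImode elements and FN = smin (pseudo notation, for reference
   only):

     t1 = smin (v,  v  >> 64 bits)   upper half folded onto lower half
     t2 = smin (t1, t1 >> 32 bits)
     t3 = smin (t2, t2 >> 16 bits)   element 0 of t3 holds the reduction

   Each step halves the number of interesting lanes until one remains.  */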
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  */

void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			       gen_rtx_LABEL_REF (VOIDmode, label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
/* Output code to perform a sinh XFmode calculation.  */

void ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
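/* For reference, with m = expm1 (|x|) = e^|x| - 1 the value built above is

     m + m/(m + 1) = (e^|x| - 1) + (1 - e^-|x|) = e^|x| - e^-|x|

   so op0 = 0.5 * e2 is sinh (|x|), negated when op1 is negative.  Going
   through expm1 avoids cancellation for small |x|.  */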
/* Output code to perform a cosh XFmode calculation.  */

void ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a tanh XFmode calculation.  */

void ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
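/* For reference, with m = expm1 (-2*|x|) = e^-2|x| - 1 the quotient
   computed above is

     m / (m + 2) = (e^-2|x| - 1) / (e^-2|x| + 1) = -tanh (|x|)

   so the result is negated when op1 is non-negative, yielding tanh (op1)
   for either sign of the input.  */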
/* Output code to perform an asinh XFmode calculation.  */

void ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
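/* For reference, the value handed to log1p above is

     x*x / (sqrt (x*x + 1) + 1) + |x| = sqrt (x*x + 1) - 1 + |x|

   so e2 = log1p (e1) = log (|x| + sqrt (x*x + 1)) = asinh (|x|), with
   the sign of op1 applied afterwards.  */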
/* Output code to perform an acosh XFmode calculation.  */

void ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
/* Output code to perform an atanh XFmode calculation.  */

void ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
/* Output code to perform a log1p XFmode calculation.  */

void ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
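/* For reference (editorial): 0.29289321881345... is 1 - sqrt(2)/2.  fyl2xp1
   is documented to be accurate only for |x| < 1 - sqrt(2)/2, so the branch
   above uses it (scaled by ln2 via fldln2) for small |op1| and falls back to
   fyl2x on 1 + op1 otherwise; the fyl2xp1 path avoids the cancellation that
   a plain log (1 + x) would suffer for tiny x.  */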
15810 /* Emit code for round calculation. */
15811 void ix86_emit_i387_round (rtx op0
, rtx op1
)
15813 machine_mode inmode
= GET_MODE (op1
);
15814 machine_mode outmode
= GET_MODE (op0
);
15815 rtx e1
= gen_reg_rtx (XFmode
);
15816 rtx e2
= gen_reg_rtx (XFmode
);
15817 rtx scratch
= gen_reg_rtx (HImode
);
15818 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15819 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15820 rtx res
= gen_reg_rtx (outmode
);
15821 rtx_code_label
*jump_label
= gen_label_rtx ();
15822 rtx (*floor_insn
) (rtx
, rtx
);
15823 rtx (*neg_insn
) (rtx
, rtx
);
15831 tmp
= gen_reg_rtx (XFmode
);
15833 emit_insn (gen_rtx_SET (tmp
, gen_rtx_FLOAT_EXTEND (XFmode
, op1
)));
15839 gcc_unreachable ();
15845 floor_insn
= gen_frndintxf2_floor
;
15846 neg_insn
= gen_negsf2
;
15849 floor_insn
= gen_frndintxf2_floor
;
15850 neg_insn
= gen_negdf2
;
15853 floor_insn
= gen_frndintxf2_floor
;
15854 neg_insn
= gen_negxf2
;
15857 floor_insn
= gen_lfloorxfhi2
;
15858 neg_insn
= gen_neghi2
;
15861 floor_insn
= gen_lfloorxfsi2
;
15862 neg_insn
= gen_negsi2
;
15865 floor_insn
= gen_lfloorxfdi2
;
15866 neg_insn
= gen_negdi2
;
15869 gcc_unreachable ();
15872 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15874 /* scratch = fxam(op1) */
15875 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15877 /* e1 = fabs(op1) */
15878 emit_insn (gen_absxf2 (e1
, op1
));
15880 /* e2 = e1 + 0.5 */
15881 half
= force_reg (XFmode
, half
);
15882 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (XFmode
, e1
, half
)));
15884 /* res = floor(e2) */
15890 tmp
= gen_reg_rtx (XFmode
);
15892 emit_insn (floor_insn (tmp
, e2
));
15893 emit_insn (gen_rtx_SET (res
,
15894 gen_rtx_UNSPEC (outmode
, gen_rtvec (1, tmp
),
15895 UNSPEC_TRUNC_NOOP
)));
15899 emit_insn (floor_insn (res
, e2
));
15902 /* flags = signbit(a) */
15903 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15905 /* if (flags) then res = -res */
15906 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15907 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
15908 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15910 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15911 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15912 JUMP_LABEL (insn
) = jump_label
;
15914 emit_insn (neg_insn (res
, res
));
15916 emit_label (jump_label
);
15917 LABEL_NUSES (jump_label
) = 1;
15919 emit_move_insn (op0
, res
);
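/* Worth noting (editorial): floor (fabs (a) + 0.5) implements the C round ()
   family, i.e. halfway cases are rounded away from zero; this intentionally
   differs from the rint ()-style rounding done with the TWO52 addition trick
   in the SSE expanders later in this file.  */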
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */

void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
        {
          emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
                                                      UNSPEC_RCP28)));
          /* res = a * x0 */
          emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
          return;
        }
      else
        emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
                                                    UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
                                                UNSPEC_RCP)));

  /* e0 = x0 * b */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  /* e0 = x0 * e0 */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  /* e1 = x0 + x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  /* x1 = e1 - e0 */
  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  /* res = a * x1 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}
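/* For reference (editorial): this is one Newton-Raphson refinement of the
   reciprocal.  Starting from x0 ~ 1/b, the improved estimate is
       x1 = x0 * (2 - b * x0) = (x0 + x0) - b * x0 * x0,
   and the second form is what gets emitted (e1 = x0 + x0, e0 = b * x0 * x0,
   x1 = e1 - e0), so no 2.0 constant has to be materialized.  */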
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */
15976 void ix86_emit_swsqrtsf (rtx res
, rtx a
, machine_mode mode
, bool recip
)
15978 rtx x0
, e0
, e1
, e2
, e3
, mthree
, mhalf
;
15982 x0
= gen_reg_rtx (mode
);
15983 e0
= gen_reg_rtx (mode
);
15984 e1
= gen_reg_rtx (mode
);
15985 e2
= gen_reg_rtx (mode
);
15986 e3
= gen_reg_rtx (mode
);
15988 if (TARGET_AVX512ER
&& mode
== V16SFmode
)
15991 /* res = rsqrt28(a) estimate */
15992 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
15996 /* x0 = rsqrt28(a) estimate */
15997 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
15999 /* res = rcp28(x0) estimate */
16000 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, x0
),
16006 real_from_integer (&r
, VOIDmode
, -3, SIGNED
);
16007 mthree
= const_double_from_real_value (r
, SFmode
);
16009 real_arithmetic (&r
, NEGATE_EXPR
, &dconsthalf
, NULL
);
16010 mhalf
= const_double_from_real_value (r
, SFmode
);
16011 unspec
= UNSPEC_RSQRT
;
16013 if (VECTOR_MODE_P (mode
))
16015 mthree
= ix86_build_const_vector (mode
, true, mthree
);
16016 mhalf
= ix86_build_const_vector (mode
, true, mhalf
);
16017 /* There is no 512-bit rsqrt. There is however rsqrt14. */
16018 if (GET_MODE_SIZE (mode
) == 64)
16019 unspec
= UNSPEC_RSQRT14
;
16022 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
16023 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
16025 a
= force_reg (mode
, a
);
16027 /* x0 = rsqrt(a) estimate */
16028 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
16031 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
16034 rtx zero
= force_reg (mode
, CONST0_RTX(mode
));
16037 /* Handle masked compare. */
16038 if (VECTOR_MODE_P (mode
) && GET_MODE_SIZE (mode
) == 64)
16040 mask
= gen_reg_rtx (HImode
);
16041 /* Imm value 0x4 corresponds to not-equal comparison. */
16042 emit_insn (gen_avx512f_cmpv16sf3 (mask
, zero
, a
, GEN_INT (0x4)));
16043 emit_insn (gen_avx512f_blendmv16sf (x0
, zero
, x0
, mask
));
16047 mask
= gen_reg_rtx (mode
);
16048 emit_insn (gen_rtx_SET (mask
, gen_rtx_NE (mode
, zero
, a
)));
16049 emit_insn (gen_rtx_SET (x0
, gen_rtx_AND (mode
, x0
, mask
)));
16053 mthree
= force_reg (mode
, mthree
);
16056 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, a
)));
16058 unsigned vector_size
= GET_MODE_SIZE (mode
);
16060 || (TARGET_AVX512F
&& vector_size
== 64)
16061 || (TARGET_AVX512VL
&& (vector_size
== 32 || vector_size
== 16)))
16062 emit_insn (gen_rtx_SET (e2
,
16063 gen_rtx_FMA (mode
, e0
, x0
, mthree
)));
16067 emit_insn (gen_rtx_SET (e1
, gen_rtx_MULT (mode
, e0
, x0
)));
16070 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (mode
, e1
, mthree
)));
16073 mhalf
= force_reg (mode
, mhalf
);
16075 /* e3 = -.5 * x0 */
16076 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, x0
, mhalf
)));
16078 /* e3 = -.5 * e0 */
16079 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, e0
, mhalf
)));
16080 /* ret = e2 * e3 */
16081 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, e2
, e3
)));
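/* For reference (editorial): the Newton-Raphson step for 1/sqrt(a) is
       x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3),
   which is the rsqrt form quoted in the comment above (e3 = -0.5 * x0 and
   e2 = a * x0 * x0 - 3).  Multiplying by one more factor of a approximates
   sqrt(a) = a / sqrt(a), i.e. the sqrt form with e3 = -0.5 * e0
   = -0.5 * a * x0.  */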
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;

  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
                                  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                              gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;
/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
                              bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
                   gen_rtx_fmt_ee (code, mode, op0, op1)));
16169 /* Expand copysign from SIGN to the positive value ABS_VALUE
16170 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
16174 ix86_sse_copysign_to_positive (rtx result
, rtx abs_value
, rtx sign
, rtx mask
)
16176 machine_mode mode
= GET_MODE (sign
);
16177 rtx sgn
= gen_reg_rtx (mode
);
16178 if (mask
== NULL_RTX
)
16180 machine_mode vmode
;
16182 if (mode
== SFmode
)
16184 else if (mode
== DFmode
)
16189 mask
= ix86_build_signbit_mask (vmode
, VECTOR_MODE_P (mode
), false);
16190 if (!VECTOR_MODE_P (mode
))
16192 /* We need to generate a scalar mode mask in this case. */
16193 rtx tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, const0_rtx
));
16194 tmp
= gen_rtx_VEC_SELECT (mode
, mask
, tmp
);
16195 mask
= gen_reg_rtx (mode
);
16196 emit_insn (gen_rtx_SET (mask
, tmp
));
16200 mask
= gen_rtx_NOT (mode
, mask
);
16201 emit_insn (gen_rtx_SET (sgn
, gen_rtx_AND (mode
, mask
, sign
)));
16202 emit_insn (gen_rtx_SET (result
, gen_rtx_IOR (mode
, abs_value
, sgn
)));
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */
void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
       tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
       op0 = (long) tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
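/* Why nextafter (0.5, 0.0) instead of plain 0.5 (editorial note): for an
   argument just below 0.5, adding exactly 0.5 can round up to 1.0 (the exact
   sum falls halfway between representable values and the tie goes to even),
   so lround would return 1 instead of 0.  Adding the largest value below 0.5
   keeps such arguments below 1.0 while genuine halfway cases such as 0.5
   itself are still carried up to the next integer.  */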
/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
       xi = (long) op1;
       xi -= (double)xi > op1 ? 1 : 0;
       return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
                                            freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
                             ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  const struct real_format *fmt;
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&TWO52r, fmt->p - 1, mode);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);
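/* For reference (editorial): for DFmode fmt->p is 53, so TWO52 is 2**52, the
   first magnitude at which every representable value is already an integer.
   Adding and then subtracting this constant therefore rounds a nonnegative
   value to integer in the current rounding mode, which is the trick the
   rint/floor/ceil/round expanders below build on.  */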
16291 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
16294 ix86_expand_rint (rtx operand0
, rtx operand1
)
16296 /* C code for the stuff we're doing below:
16297 xa = fabs (operand1);
16298 if (!isless (xa, 2**52))
16301 if (flag_rounding_math)
16303 two52 = copysign (two52, operand1);
16306 xa = xa + two52 - two52;
16307 return copysign (xa, operand1);
16309 machine_mode mode
= GET_MODE (operand0
);
16310 rtx res
, xa
, TWO52
, mask
;
16311 rtx_code_label
*label
;
16313 TWO52
= ix86_gen_TWO52 (mode
);
16315 /* Temporary for holding the result, initialized to the input
16316 operand to ease control flow. */
16317 res
= copy_to_reg (operand1
);
16319 /* xa = abs (operand1) */
16320 xa
= ix86_expand_sse_fabs (res
, &mask
);
16322 /* if (!isless (xa, TWO52)) goto label; */
16323 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16325 if (flag_rounding_math
)
16327 ix86_sse_copysign_to_positive (TWO52
, TWO52
, res
, mask
);
16331 xa
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
16332 xa
= expand_simple_binop (mode
, MINUS
, xa
, TWO52
, xa
, 0, OPTAB_DIRECT
);
16334 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16335 if (HONOR_SIGNED_ZEROS (mode
) && flag_rounding_math
)
16336 xa
= ix86_expand_sse_fabs (xa
, NULL
);
16338 ix86_sse_copysign_to_positive (res
, xa
, res
, mask
);
16340 emit_label (label
);
16341 LABEL_NUSES (label
) = 1;
16343 emit_move_insn (operand0
, res
);
16346 /* Expand SSE2 sequence for computing floor or ceil
16347 from OPERAND1 storing into OPERAND0. */
16349 ix86_expand_floorceil (rtx operand0
, rtx operand1
, bool do_floor
)
16351 /* C code for the stuff we expand below.
16352 double xa = fabs (x), x2;
16353 if (!isless (xa, TWO52))
16355 x2 = (double)(long)x;
16364 if (HONOR_SIGNED_ZEROS (mode))
16365 return copysign (x2, x);
16368 machine_mode mode
= GET_MODE (operand0
);
16369 rtx xa
, xi
, TWO52
, tmp
, one
, res
, mask
;
16370 rtx_code_label
*label
;
16372 TWO52
= ix86_gen_TWO52 (mode
);
16374 /* Temporary for holding the result, initialized to the input
16375 operand to ease control flow. */
16376 res
= copy_to_reg (operand1
);
16378 /* xa = abs (operand1) */
16379 xa
= ix86_expand_sse_fabs (res
, &mask
);
16381 /* if (!isless (xa, TWO52)) goto label; */
16382 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16384 /* xa = (double)(long)x */
16385 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
16386 expand_fix (xi
, res
, 0);
16387 expand_float (xa
, xi
, 0);
16390 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
16392 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16393 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
16394 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
16395 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
16396 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16397 if (HONOR_SIGNED_ZEROS (mode
))
16399 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16400 if (do_floor
&& flag_rounding_math
)
16401 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
16403 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
16405 emit_move_insn (res
, tmp
);
16407 emit_label (label
);
16408 LABEL_NUSES (label
) = 1;
16410 emit_move_insn (operand0
, res
);
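/* A small worked example of the compensation step above (editorial, for
   do_floor): for x = -1.5, (double)(long)x truncates toward zero to -1.0,
   the UNGT compare of -1.0 against -1.5 yields an all-ones mask, the AND
   with 1.0 turns that mask into 1.0, and the subtraction gives the expected
   floor of -2.0.  For x already integral the mask is zero and nothing is
   subtracted.  */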
16413 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
16414 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16415 that is only available on 64bit targets. */
16417 ix86_expand_floorceildf_32 (rtx operand0
, rtx operand1
, bool do_floor
)
16419 /* C code for the stuff we expand below.
16420 double xa = fabs (x), x2;
16421 if (!isless (xa, TWO52))
16423 xa = xa + TWO52 - TWO52;
16424 x2 = copysign (xa, x);
16433 if (HONOR_SIGNED_ZEROS (mode))
16434 x2 = copysign (x2, x);
16437 machine_mode mode
= GET_MODE (operand0
);
16438 rtx xa
, TWO52
, tmp
, one
, res
, mask
;
16439 rtx_code_label
*label
;
16441 TWO52
= ix86_gen_TWO52 (mode
);
16443 /* Temporary for holding the result, initialized to the input
16444 operand to ease control flow. */
16445 res
= copy_to_reg (operand1
);
16447 /* xa = abs (operand1) */
16448 xa
= ix86_expand_sse_fabs (res
, &mask
);
16450 /* if (!isless (xa, TWO52)) goto label; */
16451 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16453 /* xa = xa + TWO52 - TWO52; */
16454 xa
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
16455 xa
= expand_simple_binop (mode
, MINUS
, xa
, TWO52
, xa
, 0, OPTAB_DIRECT
);
16457 /* xa = copysign (xa, operand1) */
16458 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
16461 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
16463 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16464 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
16465 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
16466 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
16467 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16468 if (HONOR_SIGNED_ZEROS (mode
))
16470 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16471 if (do_floor
&& flag_rounding_math
)
16472 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
16474 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
16476 emit_move_insn (res
, tmp
);
16478 emit_label (label
);
16479 LABEL_NUSES (label
) = 1;
16481 emit_move_insn (operand0
, res
);
16484 /* Expand SSE sequence for computing trunc
16485 from OPERAND1 storing into OPERAND0. */
16487 ix86_expand_trunc (rtx operand0
, rtx operand1
)
16489 /* C code for SSE variant we expand below.
16490 double xa = fabs (x), x2;
16491 if (!isless (xa, TWO52))
16493 x2 = (double)(long)x;
16494 if (HONOR_SIGNED_ZEROS (mode))
16495 return copysign (x2, x);
16498 machine_mode mode
= GET_MODE (operand0
);
16499 rtx xa
, xi
, TWO52
, res
, mask
;
16500 rtx_code_label
*label
;
16502 TWO52
= ix86_gen_TWO52 (mode
);
16504 /* Temporary for holding the result, initialized to the input
16505 operand to ease control flow. */
16506 res
= copy_to_reg (operand1
);
16508 /* xa = abs (operand1) */
16509 xa
= ix86_expand_sse_fabs (res
, &mask
);
16511 /* if (!isless (xa, TWO52)) goto label; */
16512 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16514 /* xa = (double)(long)x */
16515 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
16516 expand_fix (xi
, res
, 0);
16517 expand_float (xa
, xi
, 0);
16519 if (HONOR_SIGNED_ZEROS (mode
))
16520 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
16522 emit_move_insn (res
, xa
);
16524 emit_label (label
);
16525 LABEL_NUSES (label
) = 1;
16527 emit_move_insn (operand0
, res
);
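/* Editorial note on the trunc expansion above: no compensation step is
   needed here because expand_fix already truncates toward zero
   (cvttss2si/cvttsd2si), so (double)(long)x is the exact trunc result
   whenever |x| < TWO52; the copysign under HONOR_SIGNED_ZEROS only restores
   the sign of -0.0 and of negative values that truncate to zero.  */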
16530 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16531 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16532 that is only available on 64bit targets. */
16534 ix86_expand_truncdf_32 (rtx operand0
, rtx operand1
)
16536 machine_mode mode
= GET_MODE (operand0
);
16537 rtx xa
, xa2
, TWO52
, tmp
, one
, res
, mask
;
16538 rtx_code_label
*label
;
16540 /* C code for SSE variant we expand below.
16541 double xa = fabs (x), x2;
16542 if (!isless (xa, TWO52))
16544 xa2 = xa + TWO52 - TWO52;
16548 x2 = copysign (xa2, x);
16552 TWO52
= ix86_gen_TWO52 (mode
);
16554 /* Temporary for holding the result, initialized to the input
16555 operand to ease control flow. */
  res = copy_to_reg (operand1);
16558 /* xa = abs (operand1) */
16559 xa
= ix86_expand_sse_fabs (res
, &mask
);
16561 /* if (!isless (xa, TWO52)) goto label; */
16562 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16564 /* xa2 = xa + TWO52 - TWO52; */
16565 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
16566 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
16569 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
16571 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
16572 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa2
, xa
, false);
16573 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
16574 tmp
= expand_simple_binop (mode
, MINUS
,
16575 xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16576 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16577 if (HONOR_SIGNED_ZEROS (mode
) && flag_rounding_math
)
16578 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
16580 /* res = copysign (xa2, operand1) */
16581 ix86_sse_copysign_to_positive (res
, tmp
, res
, mask
);
16583 emit_label (label
);
16584 LABEL_NUSES (label
) = 1;
16586 emit_move_insn (operand0
, res
);
16589 /* Expand SSE sequence for computing round
16590 from OPERAND1 storing into OPERAND0. */
16592 ix86_expand_round (rtx operand0
, rtx operand1
)
16594 /* C code for the stuff we're doing below:
16595 double xa = fabs (x);
16596 if (!isless (xa, TWO52))
16598 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16599 return copysign (xa, x);
16601 machine_mode mode
= GET_MODE (operand0
);
16602 rtx res
, TWO52
, xa
, xi
, half
, mask
;
16603 rtx_code_label
*label
;
16604 const struct real_format
*fmt
;
16605 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
16607 /* Temporary for holding the result, initialized to the input
16608 operand to ease control flow. */
16609 res
= copy_to_reg (operand1
);
16611 TWO52
= ix86_gen_TWO52 (mode
);
16612 xa
= ix86_expand_sse_fabs (res
, &mask
);
16613 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16615 /* load nextafter (0.5, 0.0) */
16616 fmt
= REAL_MODE_FORMAT (mode
);
16617 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
16618 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
16620 /* xa = xa + 0.5 */
16621 half
= force_reg (mode
, const_double_from_real_value (pred_half
, mode
));
16622 xa
= expand_simple_binop (mode
, PLUS
, xa
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
16624 /* xa = (double)(int64_t)xa */
16625 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
16626 expand_fix (xi
, xa
, 0);
16627 expand_float (xa
, xi
, 0);
16629 /* res = copysign (xa, operand1) */
16630 ix86_sse_copysign_to_positive (res
, xa
, res
, mask
);
16632 emit_label (label
);
16633 LABEL_NUSES (label
) = 1;
16635 emit_move_insn (operand0
, res
);
16638 /* Expand SSE sequence for computing round from OPERAND1 storing
16639 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16640 that is only available on 64bit targets. */
16642 ix86_expand_rounddf_32 (rtx operand0
, rtx operand1
)
16644 /* C code for the stuff we expand below.
16645 double xa = fabs (x), xa2, x2;
16646 if (!isless (xa, TWO52))
16648 Using the absolute value and copying back sign makes
16649 -0.0 -> -0.0 correct.
16650 xa2 = xa + TWO52 - TWO52;
16655 else if (dxa > 0.5)
16657 x2 = copysign (xa2, x);
16660 machine_mode mode
= GET_MODE (operand0
);
16661 rtx xa
, xa2
, dxa
, TWO52
, tmp
, half
, mhalf
, one
, res
, mask
;
16662 rtx_code_label
*label
;
16664 TWO52
= ix86_gen_TWO52 (mode
);
16666 /* Temporary for holding the result, initialized to the input
16667 operand to ease control flow. */
16668 res
= copy_to_reg (operand1
);
16670 /* xa = abs (operand1) */
16671 xa
= ix86_expand_sse_fabs (res
, &mask
);
16673 /* if (!isless (xa, TWO52)) goto label; */
16674 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16676 /* xa2 = xa + TWO52 - TWO52; */
16677 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
16678 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
16680 /* dxa = xa2 - xa; */
16681 dxa
= expand_simple_binop (mode
, MINUS
, xa2
, xa
, NULL_RTX
, 0, OPTAB_DIRECT
);
16683 /* generate 0.5, 1.0 and -0.5 */
16684 half
= force_reg (mode
, const_double_from_real_value (dconsthalf
, mode
));
16685 one
= expand_simple_binop (mode
, PLUS
, half
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
16686 mhalf
= expand_simple_binop (mode
, MINUS
, half
, one
, NULL_RTX
,
16690 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16691 tmp
= ix86_expand_sse_compare_mask (UNGT
, dxa
, half
, false);
16692 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
16693 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16694 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16695 tmp
= ix86_expand_sse_compare_mask (UNGE
, mhalf
, dxa
, false);
16696 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
16697 xa2
= expand_simple_binop (mode
, PLUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16699 /* res = copysign (xa2, operand1) */
16700 ix86_sse_copysign_to_positive (res
, xa2
, res
, mask
);
16702 emit_label (label
);
16703 LABEL_NUSES (label
) = 1;
16705 emit_move_insn (operand0
, res
);
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
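/* Editorial note: with SSE4.1 the whole round () expansion reduces to a
   copysign, an add and one roundss/roundsd with ROUND_TRUNC, since the
   hardware rounding insn performs the final truncation directly.  As in the
   non-SSE4.1 paths above, adding copysign (nextafter (0.5, 0.0), a) before
   truncating yields round-halfway-away-from-zero semantics.  */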
16756 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16757 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16758 insn every time. */
16760 static GTY(()) rtx_insn
*vselect_insn
;
16762 /* Initialize vselect_insn. */
16765 init_vselect_insn (void)
16770 x
= gen_rtx_PARALLEL (VOIDmode
, rtvec_alloc (MAX_VECT_LEN
));
16771 for (i
= 0; i
< MAX_VECT_LEN
; ++i
)
16772 XVECEXP (x
, 0, i
) = const0_rtx
;
16773 x
= gen_rtx_VEC_SELECT (V2DFmode
, gen_rtx_VEC_CONCAT (V4DFmode
, const0_rtx
,
16775 x
= gen_rtx_SET (const0_rtx
, x
);
16777 vselect_insn
= emit_insn (x
);
16781 /* Construct (set target (vec_select op0 (parallel perm))) and
16782 return true if that's a valid instruction in the active ISA. */
16785 expand_vselect (rtx target
, rtx op0
, const unsigned char *perm
,
16786 unsigned nelt
, bool testing_p
)
16789 rtx x
, save_vconcat
;
16792 if (vselect_insn
== NULL_RTX
)
16793 init_vselect_insn ();
16795 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 1);
16796 PUT_NUM_ELEM (XVEC (x
, 0), nelt
);
16797 for (i
= 0; i
< nelt
; ++i
)
16798 XVECEXP (x
, 0, i
) = GEN_INT (perm
[i
]);
16799 save_vconcat
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
16800 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = op0
;
16801 PUT_MODE (SET_SRC (PATTERN (vselect_insn
)), GET_MODE (target
));
16802 SET_DEST (PATTERN (vselect_insn
)) = target
;
16803 icode
= recog_memoized (vselect_insn
);
16805 if (icode
>= 0 && !testing_p
)
16806 emit_insn (copy_rtx (PATTERN (vselect_insn
)));
16808 SET_DEST (PATTERN (vselect_insn
)) = const0_rtx
;
16809 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = save_vconcat
;
16810 INSN_CODE (vselect_insn
) = -1;
16815 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16818 expand_vselect_vconcat (rtx target
, rtx op0
, rtx op1
,
16819 const unsigned char *perm
, unsigned nelt
,
16822 machine_mode v2mode
;
16826 if (vselect_insn
== NULL_RTX
)
16827 init_vselect_insn ();
16829 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0
)).exists (&v2mode
))
16831 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
16832 PUT_MODE (x
, v2mode
);
16835 ok
= expand_vselect (target
, x
, perm
, nelt
, testing_p
);
16836 XEXP (x
, 0) = const0_rtx
;
16837 XEXP (x
, 1) = const0_rtx
;
16841 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16842 using movss or movsd. */
16844 expand_vec_perm_movs (struct expand_vec_perm_d
*d
)
16846 machine_mode vmode
= d
->vmode
;
16847 unsigned i
, nelt
= d
->nelt
;
16850 if (d
->one_operand_p
)
16853 if (!(TARGET_SSE
&& vmode
== V4SFmode
)
16854 && !(TARGET_MMX_WITH_SSE
&& vmode
== V2SFmode
)
16855 && !(TARGET_SSE2
&& vmode
== V2DFmode
))
16858 /* Only the first element is changed. */
16859 if (d
->perm
[0] != nelt
&& d
->perm
[0] != 0)
16861 for (i
= 1; i
< nelt
; ++i
)
16862 if (d
->perm
[i
] != i
+ nelt
- d
->perm
[0])
16868 if (d
->perm
[0] == nelt
)
16869 x
= gen_rtx_VEC_MERGE (vmode
, d
->op1
, d
->op0
, GEN_INT (1));
16871 x
= gen_rtx_VEC_MERGE (vmode
, d
->op0
, d
->op1
, GEN_INT (1));
16873 emit_insn (gen_rtx_SET (d
->target
, x
));
16878 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16879 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16882 expand_vec_perm_blend (struct expand_vec_perm_d
*d
)
16884 machine_mode mmode
, vmode
= d
->vmode
;
16885 unsigned i
, nelt
= d
->nelt
;
16886 unsigned HOST_WIDE_INT mask
;
16887 rtx target
, op0
, op1
, maskop
, x
;
16888 rtx rperm
[32], vperm
;
16890 if (d
->one_operand_p
)
16892 if (TARGET_AVX512F
&& GET_MODE_SIZE (vmode
) == 64
16893 && (TARGET_AVX512BW
16894 || GET_MODE_UNIT_SIZE (vmode
) >= 4))
16896 else if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
16898 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
16900 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
16905 /* This is a blend, not a permute. Elements must stay in their
16906 respective lanes. */
16907 for (i
= 0; i
< nelt
; ++i
)
16909 unsigned e
= d
->perm
[i
];
16910 if (!(e
== i
|| e
== i
+ nelt
))
16917 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16918 decision should be extracted elsewhere, so that we only try that
16919 sequence once all budget==3 options have been tried. */
16920 target
= d
->target
;
16939 for (i
= 0; i
< nelt
; ++i
)
16940 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
16944 for (i
= 0; i
< 2; ++i
)
16945 mask
|= (d
->perm
[i
] >= 2 ? 15 : 0) << (i
* 4);
16950 for (i
= 0; i
< 4; ++i
)
16951 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
16956 /* See if bytes move in pairs so we can use pblendw with
16957 an immediate argument, rather than pblendvb with a vector
16959 for (i
= 0; i
< 16; i
+= 2)
16960 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16963 for (i
= 0; i
< nelt
; ++i
)
16964 rperm
[i
] = (d
->perm
[i
] < nelt
? const0_rtx
: constm1_rtx
);
16967 vperm
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
16968 vperm
= force_reg (vmode
, vperm
);
16970 if (GET_MODE_SIZE (vmode
) == 16)
16971 emit_insn (gen_sse4_1_pblendvb (target
, op0
, op1
, vperm
));
16973 emit_insn (gen_avx2_pblendvb (target
, op0
, op1
, vperm
));
16974 if (target
!= d
->target
)
16975 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
16979 for (i
= 0; i
< 8; ++i
)
16980 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
16985 target
= gen_reg_rtx (vmode
);
16986 op0
= gen_lowpart (vmode
, op0
);
16987 op1
= gen_lowpart (vmode
, op1
);
16991 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16992 for (i
= 0; i
< 32; i
+= 2)
16993 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16995 /* See if bytes move in quadruplets. If yes, vpblendd
16996 with immediate can be used. */
16997 for (i
= 0; i
< 32; i
+= 4)
16998 if (d
->perm
[i
] + 2 != d
->perm
[i
+ 2])
17002 /* See if bytes move the same in both lanes. If yes,
17003 vpblendw with immediate can be used. */
17004 for (i
= 0; i
< 16; i
+= 2)
17005 if (d
->perm
[i
] + 16 != d
->perm
[i
+ 16])
17008 /* Use vpblendw. */
17009 for (i
= 0; i
< 16; ++i
)
17010 mask
|= (d
->perm
[i
* 2] >= 32) << i
;
17015 /* Use vpblendd. */
17016 for (i
= 0; i
< 8; ++i
)
17017 mask
|= (d
->perm
[i
* 4] >= 32) << i
;
17022 /* See if words move in pairs. If yes, vpblendd can be used. */
17023 for (i
= 0; i
< 16; i
+= 2)
17024 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
17028 /* See if words move the same in both lanes. If not,
17029 vpblendvb must be used. */
17030 for (i
= 0; i
< 8; i
++)
17031 if (d
->perm
[i
] + 8 != d
->perm
[i
+ 8])
17033 /* Use vpblendvb. */
17034 for (i
= 0; i
< 32; ++i
)
17035 rperm
[i
] = (d
->perm
[i
/ 2] < 16 ? const0_rtx
: constm1_rtx
);
17039 target
= gen_reg_rtx (vmode
);
17040 op0
= gen_lowpart (vmode
, op0
);
17041 op1
= gen_lowpart (vmode
, op1
);
17042 goto finish_pblendvb
;
17045 /* Use vpblendw. */
17046 for (i
= 0; i
< 16; ++i
)
17047 mask
|= (d
->perm
[i
] >= 16) << i
;
17051 /* Use vpblendd. */
17052 for (i
= 0; i
< 8; ++i
)
17053 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
17058 /* Use vpblendd. */
17059 for (i
= 0; i
< 4; ++i
)
17060 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
17065 gcc_unreachable ();
17088 if (mmode
!= VOIDmode
)
17089 maskop
= force_reg (mmode
, gen_int_mode (mask
, mmode
));
17091 maskop
= GEN_INT (mask
);
17093 /* This matches five different patterns with the different modes. */
17094 x
= gen_rtx_VEC_MERGE (vmode
, op1
, op0
, maskop
);
17095 x
= gen_rtx_SET (target
, x
);
17097 if (target
!= d
->target
)
17098 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17103 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17104 in terms of the variable form of vpermilps.
17106 Note that we will have already failed the immediate input vpermilps,
17107 which requires that the high and low part shuffle be identical; the
17108 variable form doesn't require that. */
17111 expand_vec_perm_vpermil (struct expand_vec_perm_d
*d
)
17113 rtx rperm
[8], vperm
;
17116 if (!TARGET_AVX
|| d
->vmode
!= V8SFmode
|| !d
->one_operand_p
)
17119 /* We can only permute within the 128-bit lane. */
17120 for (i
= 0; i
< 8; ++i
)
17122 unsigned e
= d
->perm
[i
];
17123 if (i
< 4 ? e
>= 4 : e
< 4)
17130 for (i
= 0; i
< 8; ++i
)
17132 unsigned e
= d
->perm
[i
];
17134 /* Within each 128-bit lane, the elements of op0 are numbered
17135 from 0 and the elements of op1 are numbered from 4. */
17141 rperm
[i
] = GEN_INT (e
);
17144 vperm
= gen_rtx_CONST_VECTOR (V8SImode
, gen_rtvec_v (8, rperm
));
17145 vperm
= force_reg (V8SImode
, vperm
);
17146 emit_insn (gen_avx_vpermilvarv8sf3 (d
->target
, d
->op0
, vperm
));
17151 /* Return true if permutation D can be performed as VMODE permutation
17155 valid_perm_using_mode_p (machine_mode vmode
, struct expand_vec_perm_d
*d
)
17157 unsigned int i
, j
, chunk
;
17159 if (GET_MODE_CLASS (vmode
) != MODE_VECTOR_INT
17160 || GET_MODE_CLASS (d
->vmode
) != MODE_VECTOR_INT
17161 || GET_MODE_SIZE (vmode
) != GET_MODE_SIZE (d
->vmode
))
17164 if (GET_MODE_NUNITS (vmode
) >= d
->nelt
)
17167 chunk
= d
->nelt
/ GET_MODE_NUNITS (vmode
);
17168 for (i
= 0; i
< d
->nelt
; i
+= chunk
)
17169 if (d
->perm
[i
] & (chunk
- 1))
17172 for (j
= 1; j
< chunk
; ++j
)
17173 if (d
->perm
[i
] + j
!= d
->perm
[i
+ j
])
17179 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17180 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
17183 expand_vec_perm_pshufb (struct expand_vec_perm_d
*d
)
17185 unsigned i
, nelt
, eltsz
, mask
;
17186 unsigned char perm
[64];
17187 machine_mode vmode
= V16QImode
;
17188 rtx rperm
[64], vperm
, target
, op0
, op1
;
17192 if (!d
->one_operand_p
)
17194 if (!TARGET_XOP
|| GET_MODE_SIZE (d
->vmode
) != 16)
17197 && valid_perm_using_mode_p (V2TImode
, d
))
17202 /* Use vperm2i128 insn. The pattern uses
17203 V4DImode instead of V2TImode. */
17204 target
= d
->target
;
17205 if (d
->vmode
!= V4DImode
)
17206 target
= gen_reg_rtx (V4DImode
);
17207 op0
= gen_lowpart (V4DImode
, d
->op0
);
17208 op1
= gen_lowpart (V4DImode
, d
->op1
);
17210 = GEN_INT ((d
->perm
[0] / (nelt
/ 2))
17211 | ((d
->perm
[nelt
/ 2] / (nelt
/ 2)) * 16));
17212 emit_insn (gen_avx2_permv2ti (target
, op0
, op1
, rperm
[0]));
17213 if (target
!= d
->target
)
17214 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17222 if (GET_MODE_SIZE (d
->vmode
) == 16)
17227 else if (GET_MODE_SIZE (d
->vmode
) == 32)
17232 /* V4DImode should be already handled through
17233 expand_vselect by vpermq instruction. */
17234 gcc_assert (d
->vmode
!= V4DImode
);
17237 if (d
->vmode
== V8SImode
17238 || d
->vmode
== V16HImode
17239 || d
->vmode
== V32QImode
)
17241 /* First see if vpermq can be used for
17242 V8SImode/V16HImode/V32QImode. */
17243 if (valid_perm_using_mode_p (V4DImode
, d
))
17245 for (i
= 0; i
< 4; i
++)
17246 perm
[i
] = (d
->perm
[i
* nelt
/ 4] * 4 / nelt
) & 3;
17249 target
= gen_reg_rtx (V4DImode
);
17250 if (expand_vselect (target
, gen_lowpart (V4DImode
, d
->op0
),
17253 emit_move_insn (d
->target
,
17254 gen_lowpart (d
->vmode
, target
));
17260 /* Next see if vpermd can be used. */
17261 if (valid_perm_using_mode_p (V8SImode
, d
))
17264 /* Or if vpermps can be used. */
17265 else if (d
->vmode
== V8SFmode
)
17268 if (vmode
== V32QImode
)
17270 /* vpshufb only works intra lanes, it is not
17271 possible to shuffle bytes in between the lanes. */
17272 for (i
= 0; i
< nelt
; ++i
)
17273 if ((d
->perm
[i
] ^ i
) & (nelt
/ 2))
17277 else if (GET_MODE_SIZE (d
->vmode
) == 64)
17279 if (!TARGET_AVX512BW
)
17282 /* If vpermq didn't work, vpshufb won't work either. */
17283 if (d
->vmode
== V8DFmode
|| d
->vmode
== V8DImode
)
17287 if (d
->vmode
== V16SImode
17288 || d
->vmode
== V32HImode
17289 || d
->vmode
== V64QImode
)
17291 /* First see if vpermq can be used for
17292 V16SImode/V32HImode/V64QImode. */
17293 if (valid_perm_using_mode_p (V8DImode
, d
))
17295 for (i
= 0; i
< 8; i
++)
17296 perm
[i
] = (d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7;
17299 target
= gen_reg_rtx (V8DImode
);
17300 if (expand_vselect (target
, gen_lowpart (V8DImode
, d
->op0
),
17303 emit_move_insn (d
->target
,
17304 gen_lowpart (d
->vmode
, target
));
17310 /* Next see if vpermd can be used. */
17311 if (valid_perm_using_mode_p (V16SImode
, d
))
17314 /* Or if vpermps can be used. */
17315 else if (d
->vmode
== V16SFmode
)
17317 if (vmode
== V64QImode
)
17319 /* vpshufb only works intra lanes, it is not
17320 possible to shuffle bytes in between the lanes. */
17321 for (i
= 0; i
< nelt
; ++i
)
17322 if ((d
->perm
[i
] ^ i
) & (3 * nelt
/ 4))
17333 if (vmode
== V8SImode
)
17334 for (i
= 0; i
< 8; ++i
)
17335 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7);
17336 else if (vmode
== V16SImode
)
17337 for (i
= 0; i
< 16; ++i
)
17338 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 16] * 16 / nelt
) & 15);
17341 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
17342 if (!d
->one_operand_p
)
17343 mask
= 2 * nelt
- 1;
17344 else if (vmode
== V16QImode
)
17346 else if (vmode
== V64QImode
)
17347 mask
= nelt
/ 4 - 1;
17349 mask
= nelt
/ 2 - 1;
17351 for (i
= 0; i
< nelt
; ++i
)
17353 unsigned j
, e
= d
->perm
[i
] & mask
;
17354 for (j
= 0; j
< eltsz
; ++j
)
17355 rperm
[i
* eltsz
+ j
] = GEN_INT (e
* eltsz
+ j
);
17359 vperm
= gen_rtx_CONST_VECTOR (vmode
,
17360 gen_rtvec_v (GET_MODE_NUNITS (vmode
), rperm
));
17361 vperm
= force_reg (vmode
, vperm
);
17363 target
= d
->target
;
17364 if (d
->vmode
!= vmode
)
17365 target
= gen_reg_rtx (vmode
);
17366 op0
= gen_lowpart (vmode
, d
->op0
);
17367 if (d
->one_operand_p
)
17369 if (vmode
== V16QImode
)
17370 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, vperm
));
17371 else if (vmode
== V32QImode
)
17372 emit_insn (gen_avx2_pshufbv32qi3 (target
, op0
, vperm
));
17373 else if (vmode
== V64QImode
)
17374 emit_insn (gen_avx512bw_pshufbv64qi3 (target
, op0
, vperm
));
17375 else if (vmode
== V8SFmode
)
17376 emit_insn (gen_avx2_permvarv8sf (target
, op0
, vperm
));
17377 else if (vmode
== V8SImode
)
17378 emit_insn (gen_avx2_permvarv8si (target
, op0
, vperm
));
17379 else if (vmode
== V16SFmode
)
17380 emit_insn (gen_avx512f_permvarv16sf (target
, op0
, vperm
));
17381 else if (vmode
== V16SImode
)
17382 emit_insn (gen_avx512f_permvarv16si (target
, op0
, vperm
));
17384 gcc_unreachable ();
17388 op1
= gen_lowpart (vmode
, d
->op1
);
17389 emit_insn (gen_xop_pperm (target
, op0
, op1
, vperm
));
17391 if (target
!= d
->target
)
17392 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17397 /* For V*[QHS]Imode permutations, check if the same permutation
17398 can't be performed in a 2x, 4x or 8x wider inner mode. */
17401 canonicalize_vector_int_perm (const struct expand_vec_perm_d
*d
,
17402 struct expand_vec_perm_d
*nd
)
17405 machine_mode mode
= VOIDmode
;
17409 case E_V16QImode
: mode
= V8HImode
; break;
17410 case E_V32QImode
: mode
= V16HImode
; break;
17411 case E_V64QImode
: mode
= V32HImode
; break;
17412 case E_V8HImode
: mode
= V4SImode
; break;
17413 case E_V16HImode
: mode
= V8SImode
; break;
17414 case E_V32HImode
: mode
= V16SImode
; break;
17415 case E_V4SImode
: mode
= V2DImode
; break;
17416 case E_V8SImode
: mode
= V4DImode
; break;
17417 case E_V16SImode
: mode
= V8DImode
; break;
17418 default: return false;
17420 for (i
= 0; i
< d
->nelt
; i
+= 2)
17421 if ((d
->perm
[i
] & 1) || d
->perm
[i
+ 1] != d
->perm
[i
] + 1)
17424 nd
->nelt
= d
->nelt
/ 2;
17425 for (i
= 0; i
< nd
->nelt
; i
++)
17426 nd
->perm
[i
] = d
->perm
[2 * i
] / 2;
17427 if (GET_MODE_INNER (mode
) != DImode
)
17428 canonicalize_vector_int_perm (nd
, nd
);
17431 nd
->one_operand_p
= d
->one_operand_p
;
17432 nd
->testing_p
= d
->testing_p
;
17433 if (d
->op0
== d
->op1
)
17434 nd
->op0
= nd
->op1
= gen_lowpart (nd
->vmode
, d
->op0
);
17437 nd
->op0
= gen_lowpart (nd
->vmode
, d
->op0
);
17438 nd
->op1
= gen_lowpart (nd
->vmode
, d
->op1
);
17441 nd
->target
= gen_raw_REG (nd
->vmode
, LAST_VIRTUAL_REGISTER
+ 1);
17443 nd
->target
= gen_reg_rtx (nd
->vmode
);
17448 /* Try to expand one-operand permutation with constant mask. */
17451 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d
*d
)
17453 machine_mode mode
= GET_MODE (d
->op0
);
17454 machine_mode maskmode
= mode
;
17455 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
17456 rtx target
, op0
, mask
;
17459 if (!rtx_equal_p (d
->op0
, d
->op1
))
17462 if (!TARGET_AVX512F
)
17468 gen
= gen_avx512f_permvarv16si
;
17471 gen
= gen_avx512f_permvarv16sf
;
17472 maskmode
= V16SImode
;
17475 gen
= gen_avx512f_permvarv8di
;
17478 gen
= gen_avx512f_permvarv8df
;
17479 maskmode
= V8DImode
;
17485 target
= d
->target
;
17487 for (int i
= 0; i
< d
->nelt
; ++i
)
17488 vec
[i
] = GEN_INT (d
->perm
[i
]);
17489 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
17490 emit_insn (gen (target
, op0
, force_reg (maskmode
, mask
)));
17494 static bool expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool);
17496 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
17497 in a single instruction. */
17500 expand_vec_perm_1 (struct expand_vec_perm_d
*d
)
17502 unsigned i
, nelt
= d
->nelt
;
17503 struct expand_vec_perm_d nd
;
17505 /* Check plain VEC_SELECT first, because AVX has instructions that could
17506 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17507 input where SEL+CONCAT may not. */
17508 if (d
->one_operand_p
)
17510 int mask
= nelt
- 1;
17511 bool identity_perm
= true;
17512 bool broadcast_perm
= true;
17514 for (i
= 0; i
< nelt
; i
++)
17516 nd
.perm
[i
] = d
->perm
[i
] & mask
;
17517 if (nd
.perm
[i
] != i
)
17518 identity_perm
= false;
17520 broadcast_perm
= false;
17526 emit_move_insn (d
->target
, d
->op0
);
17529 else if (broadcast_perm
&& TARGET_AVX2
)
17531 /* Use vpbroadcast{b,w,d}. */
17532 rtx (*gen
) (rtx
, rtx
) = NULL
;
17536 if (TARGET_AVX512BW
)
17537 gen
= gen_avx512bw_vec_dupv64qi_1
;
17540 gen
= gen_avx2_pbroadcastv32qi_1
;
17543 if (TARGET_AVX512BW
)
17544 gen
= gen_avx512bw_vec_dupv32hi_1
;
17547 gen
= gen_avx2_pbroadcastv16hi_1
;
17550 if (TARGET_AVX512F
)
17551 gen
= gen_avx512f_vec_dupv16si_1
;
17554 gen
= gen_avx2_pbroadcastv8si_1
;
17557 gen
= gen_avx2_pbroadcastv16qi
;
17560 gen
= gen_avx2_pbroadcastv8hi
;
17563 if (TARGET_AVX512F
)
17564 gen
= gen_avx512f_vec_dupv16sf_1
;
17567 gen
= gen_avx2_vec_dupv8sf_1
;
17570 if (TARGET_AVX512F
)
17571 gen
= gen_avx512f_vec_dupv8df_1
;
17574 if (TARGET_AVX512F
)
17575 gen
= gen_avx512f_vec_dupv8di_1
;
17577 /* For other modes prefer other shuffles this function creates. */
17583 emit_insn (gen (d
->target
, d
->op0
));
17588 if (expand_vselect (d
->target
, d
->op0
, nd
.perm
, nelt
, d
->testing_p
))
17591 /* There are plenty of patterns in sse.md that are written for
17592 SEL+CONCAT and are not replicated for a single op. Perhaps
17593 that should be changed, to avoid the nastiness here. */
17595 /* Recognize interleave style patterns, which means incrementing
17596 every other permutation operand. */
17597 for (i
= 0; i
< nelt
; i
+= 2)
17599 nd
.perm
[i
] = d
->perm
[i
] & mask
;
17600 nd
.perm
[i
+ 1] = (d
->perm
[i
+ 1] & mask
) + nelt
;
17602 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17606 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17609 for (i
= 0; i
< nelt
; i
+= 4)
17611 nd
.perm
[i
+ 0] = d
->perm
[i
+ 0] & mask
;
17612 nd
.perm
[i
+ 1] = d
->perm
[i
+ 1] & mask
;
17613 nd
.perm
[i
+ 2] = (d
->perm
[i
+ 2] & mask
) + nelt
;
17614 nd
.perm
[i
+ 3] = (d
->perm
[i
+ 3] & mask
) + nelt
;
17617 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17623 /* Try movss/movsd instructions. */
17624 if (expand_vec_perm_movs (d
))
17627 /* Finally, try the fully general two operand permute. */
17628 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op1
, d
->perm
, nelt
,
17632 /* Recognize interleave style patterns with reversed operands. */
17633 if (!d
->one_operand_p
)
17635 for (i
= 0; i
< nelt
; ++i
)
17637 unsigned e
= d
->perm
[i
];
17645 if (expand_vselect_vconcat (d
->target
, d
->op1
, d
->op0
, nd
.perm
, nelt
,
17650 /* Try the SSE4.1 blend variable merge instructions. */
17651 if (expand_vec_perm_blend (d
))
17654 /* Try one of the AVX vpermil variable permutations. */
17655 if (expand_vec_perm_vpermil (d
))
17658 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17659 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17660 if (expand_vec_perm_pshufb (d
))
17663 /* Try the AVX2 vpalignr instruction. */
17664 if (expand_vec_perm_palignr (d
, true))
17667 /* Try the AVX512F vperm{s,d} instructions. */
17668 if (ix86_expand_vec_one_operand_perm_avx512 (d
))
17671 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17672 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX
, NULL_RTX
, NULL_RTX
, NULL_RTX
, d
))
17675 /* See if we can get the same permutation in different vector integer
17677 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
17680 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
17686 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17687 in terms of a pair of pshuflw + pshufhw instructions. */
17690 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d
*d
)
17692 unsigned char perm2
[MAX_VECT_LEN
];
17696 if (d
->vmode
!= V8HImode
|| !d
->one_operand_p
)
17699 /* The two permutations only operate in 64-bit lanes. */
17700 for (i
= 0; i
< 4; ++i
)
17701 if (d
->perm
[i
] >= 4)
17703 for (i
= 4; i
< 8; ++i
)
17704 if (d
->perm
[i
] < 4)
17710 /* Emit the pshuflw. */
17711 memcpy (perm2
, d
->perm
, 4);
17712 for (i
= 4; i
< 8; ++i
)
17714 ok
= expand_vselect (d
->target
, d
->op0
, perm2
, 8, d
->testing_p
);
17717 /* Emit the pshufhw. */
17718 memcpy (perm2
+ 4, d
->perm
+ 4, 4);
17719 for (i
= 0; i
< 4; ++i
)
17721 ok
= expand_vselect (d
->target
, d
->target
, perm2
, 8, d
->testing_p
);
17727 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17728 the permutation using the SSSE3 palignr instruction. This succeeds
17729 when all of the elements in PERM fit within one vector and we merely
17730 need to shift them down so that a single vector permutation has a
17731 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
17732 the vpalignr instruction itself can perform the requested permutation. */
17735 expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool single_insn_only_p
)
17737 unsigned i
, nelt
= d
->nelt
;
17738 unsigned min
, max
, minswap
, maxswap
;
17739 bool in_order
, ok
, swap
= false;
17741 struct expand_vec_perm_d dcopy
;
17743 /* Even with AVX, palignr only operates on 128-bit vectors,
17744 in AVX2 palignr operates on both 128-bit lanes. */
17745 if ((!TARGET_SSSE3
|| GET_MODE_SIZE (d
->vmode
) != 16)
17746 && (!TARGET_AVX2
|| GET_MODE_SIZE (d
->vmode
) != 32))
17751 minswap
= 2 * nelt
;
17753 for (i
= 0; i
< nelt
; ++i
)
17755 unsigned e
= d
->perm
[i
];
17756 unsigned eswap
= d
->perm
[i
] ^ nelt
;
17757 if (GET_MODE_SIZE (d
->vmode
) == 32)
17759 e
= (e
& ((nelt
/ 2) - 1)) | ((e
& nelt
) >> 1);
17760 eswap
= e
^ (nelt
/ 2);
17766 if (eswap
< minswap
)
17768 if (eswap
> maxswap
)
17772 || max
- min
>= (GET_MODE_SIZE (d
->vmode
) == 32 ? nelt
/ 2 : nelt
))
17774 if (d
->one_operand_p
17776 || maxswap
- minswap
>= (GET_MODE_SIZE (d
->vmode
) == 32
17777 ? nelt
/ 2 : nelt
))
17784 /* Given that we have SSSE3, we know we'll be able to implement the
17785 single operand permutation after the palignr with pshufb for
17786 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17788 if (d
->testing_p
&& GET_MODE_SIZE (d
->vmode
) == 16 && !single_insn_only_p
)
17794 dcopy
.op0
= d
->op1
;
17795 dcopy
.op1
= d
->op0
;
17796 for (i
= 0; i
< nelt
; ++i
)
17797 dcopy
.perm
[i
] ^= nelt
;
17801 for (i
= 0; i
< nelt
; ++i
)
17803 unsigned e
= dcopy
.perm
[i
];
17804 if (GET_MODE_SIZE (d
->vmode
) == 32
17806 && (e
& (nelt
/ 2 - 1)) < min
)
17807 e
= e
- min
- (nelt
/ 2);
17814 dcopy
.one_operand_p
= true;
17816 if (single_insn_only_p
&& !in_order
)
17819 /* For AVX2, test whether we can permute the result in one instruction. */
17824 dcopy
.op1
= dcopy
.op0
;
17825 return expand_vec_perm_1 (&dcopy
);
17828 shift
= GEN_INT (min
* GET_MODE_UNIT_BITSIZE (d
->vmode
));
17829 if (GET_MODE_SIZE (d
->vmode
) == 16)
17831 target
= gen_reg_rtx (TImode
);
17832 emit_insn (gen_ssse3_palignrti (target
, gen_lowpart (TImode
, dcopy
.op1
),
17833 gen_lowpart (TImode
, dcopy
.op0
), shift
));
17837 target
= gen_reg_rtx (V2TImode
);
17838 emit_insn (gen_avx2_palignrv2ti (target
,
17839 gen_lowpart (V2TImode
, dcopy
.op1
),
17840 gen_lowpart (V2TImode
, dcopy
.op0
),
17844 dcopy
.op0
= dcopy
.op1
= gen_lowpart (d
->vmode
, target
);
17846 /* Test for the degenerate case where the alignment by itself
17847 produces the desired permutation. */
17850 emit_move_insn (d
->target
, dcopy
.op0
);
17854 ok
= expand_vec_perm_1 (&dcopy
);
17855 gcc_assert (ok
|| GET_MODE_SIZE (d
->vmode
) == 32);
17860 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17861 the permutation using the SSE4_1 pblendv instruction. Potentially
17862 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
17865 expand_vec_perm_pblendv (struct expand_vec_perm_d
*d
)
17867 unsigned i
, which
, nelt
= d
->nelt
;
17868 struct expand_vec_perm_d dcopy
, dcopy1
;
17869 machine_mode vmode
= d
->vmode
;
17872 /* Use the same checks as in expand_vec_perm_blend. */
17873 if (d
->one_operand_p
)
17875 if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
17877 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
17879 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
17884 /* Figure out where permutation elements stay not in their
17885 respective lanes. */
17886 for (i
= 0, which
= 0; i
< nelt
; ++i
)
17888 unsigned e
= d
->perm
[i
];
17890 which
|= (e
< nelt
? 1 : 2);
17892 /* We can pblend the part where elements stay not in their
17893 respective lanes only when these elements are all in one
17894 half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are not at their respective
     lanes, but both 8 and 9 >= 8
17897 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
17898 respective lanes and 8 >= 8, but 2 not. */
17899 if (which
!= 1 && which
!= 2)
17901 if (d
->testing_p
&& GET_MODE_SIZE (vmode
) == 16)
17904 /* First we apply one operand permutation to the part where
17905 elements stay not in their respective lanes. */
17908 dcopy
.op0
= dcopy
.op1
= d
->op1
;
17910 dcopy
.op0
= dcopy
.op1
= d
->op0
;
17912 dcopy
.target
= gen_reg_rtx (vmode
);
17913 dcopy
.one_operand_p
= true;
17915 for (i
= 0; i
< nelt
; ++i
)
17916 dcopy
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
17918 ok
= expand_vec_perm_1 (&dcopy
);
17919 if (GET_MODE_SIZE (vmode
) != 16 && !ok
)
17926 /* Next we put permuted elements into their positions. */
17929 dcopy1
.op1
= dcopy
.target
;
17931 dcopy1
.op0
= dcopy
.target
;
17933 for (i
= 0; i
< nelt
; ++i
)
17934 dcopy1
.perm
[i
] = ((d
->perm
[i
] >= nelt
) ? (nelt
+ i
) : i
);
17936 ok
= expand_vec_perm_blend (&dcopy1
);
17942 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
);
17944 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17945 a two vector permutation into a single vector permutation by using
17946 an interleave operation to merge the vectors. */
17949 expand_vec_perm_interleave2 (struct expand_vec_perm_d
*d
)
17951 struct expand_vec_perm_d dremap
, dfinal
;
17952 unsigned i
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
17953 unsigned HOST_WIDE_INT contents
;
17954 unsigned char remap
[2 * MAX_VECT_LEN
];
17956 bool ok
, same_halves
= false;
17958 if (GET_MODE_SIZE (d
->vmode
) == 16)
17960 if (d
->one_operand_p
)
17963 else if (GET_MODE_SIZE (d
->vmode
) == 32)
17967 /* For 32-byte modes allow even d->one_operand_p.
17968 The lack of cross-lane shuffling in some instructions
17969 might prevent a single insn shuffle. */
17971 dfinal
.testing_p
= true;
17972 /* If expand_vec_perm_interleave3 can expand this into
17973 a 3 insn sequence, give up and let it be expanded as
17974 3 insn sequence. While that is one insn longer,
17975 it doesn't need a memory operand and in the common
17976 case that both interleave low and high permutations
17977 with the same operands are adjacent needs 4 insns
17978 for both after CSE. */
17979 if (expand_vec_perm_interleave3 (&dfinal
))
17985 /* Examine from whence the elements come. */
17987 for (i
= 0; i
< nelt
; ++i
)
17988 contents
|= HOST_WIDE_INT_1U
<< d
->perm
[i
];
17990 memset (remap
, 0xff, sizeof (remap
));
17993 if (GET_MODE_SIZE (d
->vmode
) == 16)
17995 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
17997 /* Split the two input vectors into 4 halves. */
17998 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
      /* If all elements come from the low halves, use interleave low; do the
         same with interleave high if they all come from the high halves.  If
         the elements are from mis-matched halves, we can use shufps for
         V4SF/V4SI or do a DImode shuffle.  */
18006 if ((contents
& (h1
| h3
)) == contents
)
18009 for (i
= 0; i
< nelt2
; ++i
)
18012 remap
[i
+ nelt
] = i
* 2 + 1;
18013 dremap
.perm
[i
* 2] = i
;
18014 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
18016 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
18017 dremap
.vmode
= V4SFmode
;
18019 else if ((contents
& (h2
| h4
)) == contents
)
18022 for (i
= 0; i
< nelt2
; ++i
)
18024 remap
[i
+ nelt2
] = i
* 2;
18025 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
18026 dremap
.perm
[i
* 2] = i
+ nelt2
;
18027 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
18029 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
18030 dremap
.vmode
= V4SFmode
;
18032 else if ((contents
& (h1
| h4
)) == contents
)
18035 for (i
= 0; i
< nelt2
; ++i
)
18038 remap
[i
+ nelt
+ nelt2
] = i
+ nelt2
;
18039 dremap
.perm
[i
] = i
;
18040 dremap
.perm
[i
+ nelt2
] = i
+ nelt
+ nelt2
;
18045 dremap
.vmode
= V2DImode
;
18047 dremap
.perm
[0] = 0;
18048 dremap
.perm
[1] = 3;
18051 else if ((contents
& (h2
| h3
)) == contents
)
18054 for (i
= 0; i
< nelt2
; ++i
)
18056 remap
[i
+ nelt2
] = i
;
18057 remap
[i
+ nelt
] = i
+ nelt2
;
18058 dremap
.perm
[i
] = i
+ nelt2
;
18059 dremap
.perm
[i
+ nelt2
] = i
+ nelt
;
18064 dremap
.vmode
= V2DImode
;
18066 dremap
.perm
[0] = 1;
18067 dremap
.perm
[1] = 2;
18075 unsigned int nelt4
= nelt
/ 4, nzcnt
= 0;
18076 unsigned HOST_WIDE_INT q
[8];
18077 unsigned int nonzero_halves
[4];
18079 /* Split the two input vectors into 8 quarters. */
18080 q
[0] = (HOST_WIDE_INT_1U
<< nelt4
) - 1;
18081 for (i
= 1; i
< 8; ++i
)
18082 q
[i
] = q
[0] << (nelt4
* i
);
18083 for (i
= 0; i
< 4; ++i
)
18084 if (((q
[2 * i
] | q
[2 * i
+ 1]) & contents
) != 0)
18086 nonzero_halves
[nzcnt
] = i
;
18092 gcc_assert (d
->one_operand_p
);
18093 nonzero_halves
[1] = nonzero_halves
[0];
18094 same_halves
= true;
18096 else if (d
->one_operand_p
)
18098 gcc_assert (nonzero_halves
[0] == 0);
18099 gcc_assert (nonzero_halves
[1] == 1);
18104 if (d
->perm
[0] / nelt2
== nonzero_halves
[1])
18106 /* Attempt to increase the likelihood that dfinal
18107 shuffle will be intra-lane. */
18108 std::swap (nonzero_halves
[0], nonzero_halves
[1]);
18111 /* vperm2f128 or vperm2i128. */
18112 for (i
= 0; i
< nelt2
; ++i
)
18114 remap
[i
+ nonzero_halves
[1] * nelt2
] = i
+ nelt2
;
18115 remap
[i
+ nonzero_halves
[0] * nelt2
] = i
;
18116 dremap
.perm
[i
+ nelt2
] = i
+ nonzero_halves
[1] * nelt2
;
18117 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * nelt2
;
18120 if (d
->vmode
!= V8SFmode
18121 && d
->vmode
!= V4DFmode
18122 && d
->vmode
!= V8SImode
)
18124 dremap
.vmode
= V8SImode
;
18126 for (i
= 0; i
< 4; ++i
)
18128 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * 4;
18129 dremap
.perm
[i
+ 4] = i
+ nonzero_halves
[1] * 4;
18133 else if (d
->one_operand_p
)
18135 else if (TARGET_AVX2
18136 && (contents
& (q
[0] | q
[2] | q
[4] | q
[6])) == contents
)
18139 for (i
= 0; i
< nelt4
; ++i
)
18142 remap
[i
+ nelt
] = i
* 2 + 1;
18143 remap
[i
+ nelt2
] = i
* 2 + nelt2
;
18144 remap
[i
+ nelt
+ nelt2
] = i
* 2 + nelt2
+ 1;
18145 dremap
.perm
[i
* 2] = i
;
18146 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
18147 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
;
18148 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
;
18151 else if (TARGET_AVX2
18152 && (contents
& (q
[1] | q
[3] | q
[5] | q
[7])) == contents
)
18155 for (i
= 0; i
< nelt4
; ++i
)
18157 remap
[i
+ nelt4
] = i
* 2;
18158 remap
[i
+ nelt
+ nelt4
] = i
* 2 + 1;
18159 remap
[i
+ nelt2
+ nelt4
] = i
* 2 + nelt2
;
18160 remap
[i
+ nelt
+ nelt2
+ nelt4
] = i
* 2 + nelt2
+ 1;
18161 dremap
.perm
[i
* 2] = i
+ nelt4
;
18162 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt4
;
18163 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
+ nelt4
;
18164 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
+ nelt4
;
18171 /* Use the remapping array set up above to move the elements from their
18172 swizzled locations into their final destinations. */
18174 for (i
= 0; i
< nelt
; ++i
)
18176 unsigned e
= remap
[d
->perm
[i
]];
18177 gcc_assert (e
< nelt
);
18178 /* If same_halves is true, both halves of the remapped vector are the
18179 same. Avoid cross-lane accesses if possible. */
18180 if (same_halves
&& i
>= nelt2
)
18182 gcc_assert (e
< nelt2
);
18183 dfinal
.perm
[i
] = e
+ nelt2
;
18186 dfinal
.perm
[i
] = e
;
18190 dremap
.target
= gen_reg_rtx (dremap
.vmode
);
18191 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
18193 dfinal
.op1
= dfinal
.op0
;
18194 dfinal
.one_operand_p
= true;
18196 /* Test if the final remap can be done with a single insn. For V4SFmode or
18197 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
18199 ok
= expand_vec_perm_1 (&dfinal
);
18200 seq
= get_insns ();
18209 if (dremap
.vmode
!= dfinal
.vmode
)
18211 dremap
.op0
= gen_lowpart (dremap
.vmode
, dremap
.op0
);
18212 dremap
.op1
= gen_lowpart (dremap
.vmode
, dremap
.op1
);
18215 ok
= expand_vec_perm_1 (&dremap
);
18222 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18223 a single vector cross-lane permutation into vpermq followed
18224 by any of the single insn permutations. */
18227 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d
*d
)
18229 struct expand_vec_perm_d dremap
, dfinal
;
18230 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, nelt4
= nelt
/ 4;
18231 unsigned contents
[2];
18235 && (d
->vmode
== V32QImode
|| d
->vmode
== V16HImode
)
18236 && d
->one_operand_p
))
18241 for (i
= 0; i
< nelt2
; ++i
)
18243 contents
[0] |= 1u << (d
->perm
[i
] / nelt4
);
18244 contents
[1] |= 1u << (d
->perm
[i
+ nelt2
] / nelt4
);
18247 for (i
= 0; i
< 2; ++i
)
18249 unsigned int cnt
= 0;
18250 for (j
= 0; j
< 4; ++j
)
18251 if ((contents
[i
] & (1u << j
)) != 0 && ++cnt
> 2)
18259 dremap
.vmode
= V4DImode
;
18261 dremap
.target
= gen_reg_rtx (V4DImode
);
18262 dremap
.op0
= gen_lowpart (V4DImode
, d
->op0
);
18263 dremap
.op1
= dremap
.op0
;
18264 dremap
.one_operand_p
= true;
18265 for (i
= 0; i
< 2; ++i
)
18267 unsigned int cnt
= 0;
18268 for (j
= 0; j
< 4; ++j
)
18269 if ((contents
[i
] & (1u << j
)) != 0)
18270 dremap
.perm
[2 * i
+ cnt
++] = j
;
18271 for (; cnt
< 2; ++cnt
)
18272 dremap
.perm
[2 * i
+ cnt
] = 0;
18276 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
18277 dfinal
.op1
= dfinal
.op0
;
18278 dfinal
.one_operand_p
= true;
18279 for (i
= 0, j
= 0; i
< nelt
; ++i
)
18283 dfinal
.perm
[i
] = (d
->perm
[i
] & (nelt4
- 1)) | (j
? nelt2
: 0);
18284 if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
])
18286 else if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
+ 1])
18287 dfinal
.perm
[i
] |= nelt4
;
18289 gcc_unreachable ();
18292 ok
= expand_vec_perm_1 (&dremap
);
18295 ok
= expand_vec_perm_1 (&dfinal
);
18301 static bool canonicalize_perm (struct expand_vec_perm_d
*d
);
18303 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
18304 a vector permutation using two instructions, vperm2f128 resp.
18305 vperm2i128 followed by any single in-lane permutation. */
18308 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d
*d
)
18310 struct expand_vec_perm_d dfirst
, dsecond
;
18311 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, perm
;
18315 || GET_MODE_SIZE (d
->vmode
) != 32
18316 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
))
18320 dsecond
.one_operand_p
= false;
18321 dsecond
.testing_p
= true;
  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
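  /* As an illustration of that encoding (example values only): for
     perm == 6 (binary 00110) the immediate is ((6 << 2) | 6) & 0x33
     == 0x12, i.e. the low lane of the vperm2[fi]128 result is lane 2
     (the low lane of the second operand) and the high lane is lane 1
     (the high lane of the first operand).  */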
18328 for (perm
= 0; perm
< 32; perm
++)
18330 /* Ignore permutations which do not move anything cross-lane. */
18333 /* The second shuffle for e.g. V4DFmode has
18334 0123 and ABCD operands.
18335 Ignore AB23, as 23 is already in the second lane
18336 of the first operand. */
18337 if ((perm
& 0xc) == (1 << 2)) continue;
18338 /* And 01CD, as 01 is in the first lane of the first
18340 if ((perm
& 3) == 0) continue;
18341 /* And 4567, as then the vperm2[fi]128 doesn't change
18342 anything on the original 4567 second operand. */
18343 if ((perm
& 0xf) == ((3 << 2) | 2)) continue;
18347 /* The second shuffle for e.g. V4DFmode has
18348 4567 and ABCD operands.
18349 Ignore AB67, as 67 is already in the second lane
18350 of the first operand. */
18351 if ((perm
& 0xc) == (3 << 2)) continue;
18352 /* And 45CD, as 45 is in the first lane of the first
18354 if ((perm
& 3) == 2) continue;
18355 /* And 0123, as then the vperm2[fi]128 doesn't change
18356 anything on the original 0123 first operand. */
18357 if ((perm
& 0xf) == (1 << 2)) continue;
18360 for (i
= 0; i
< nelt
; i
++)
18362 j
= d
->perm
[i
] / nelt2
;
18363 if (j
== ((perm
>> (2 * (i
>= nelt2
))) & 3))
18364 dsecond
.perm
[i
] = nelt
+ (i
& nelt2
) + (d
->perm
[i
] & (nelt2
- 1));
18365 else if (j
== (unsigned) (i
>= nelt2
) + 2 * (perm
>= 16))
18366 dsecond
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
18374 ok
= expand_vec_perm_1 (&dsecond
);
18385 /* Found a usable second shuffle. dfirst will be
18386 vperm2f128 on d->op0 and d->op1. */
18387 dsecond
.testing_p
= false;
18389 dfirst
.target
= gen_reg_rtx (d
->vmode
);
18390 for (i
= 0; i
< nelt
; i
++)
18391 dfirst
.perm
[i
] = (i
& (nelt2
- 1))
18392 + ((perm
>> (2 * (i
>= nelt2
))) & 3) * nelt2
;
18394 canonicalize_perm (&dfirst
);
18395 ok
= expand_vec_perm_1 (&dfirst
);
18398 /* And dsecond is some single insn shuffle, taking
18399 d->op0 and result of vperm2f128 (if perm < 16) or
18400 d->op1 and result of vperm2f128 (otherwise). */
18402 dsecond
.op0
= dsecond
.op1
;
18403 dsecond
.op1
= dfirst
.target
;
18405 ok
= expand_vec_perm_1 (&dsecond
);
18411 /* For one operand, the only useful vperm2f128 permutation is 0x01
18413 if (d
->one_operand_p
)
18420 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18421 a two vector permutation using 2 intra-lane interleave insns
18422 and cross-lane shuffle for 32-byte vectors. */
18425 expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
)
18428 rtx (*gen
) (rtx
, rtx
, rtx
);
18430 if (d
->one_operand_p
)
18432 if (TARGET_AVX2
&& GET_MODE_SIZE (d
->vmode
) == 32)
18434 else if (TARGET_AVX
&& (d
->vmode
== V8SFmode
|| d
->vmode
== V4DFmode
))
18440 if (d
->perm
[0] != 0 && d
->perm
[0] != nelt
/ 2)
18442 for (i
= 0; i
< nelt
; i
+= 2)
18443 if (d
->perm
[i
] != d
->perm
[0] + i
/ 2
18444 || d
->perm
[i
+ 1] != d
->perm
[0] + i
/ 2 + nelt
)
18454 gen
= gen_vec_interleave_highv32qi
;
18456 gen
= gen_vec_interleave_lowv32qi
;
18460 gen
= gen_vec_interleave_highv16hi
;
18462 gen
= gen_vec_interleave_lowv16hi
;
18466 gen
= gen_vec_interleave_highv8si
;
18468 gen
= gen_vec_interleave_lowv8si
;
18472 gen
= gen_vec_interleave_highv4di
;
18474 gen
= gen_vec_interleave_lowv4di
;
18478 gen
= gen_vec_interleave_highv8sf
;
18480 gen
= gen_vec_interleave_lowv8sf
;
18484 gen
= gen_vec_interleave_highv4df
;
18486 gen
= gen_vec_interleave_lowv4df
;
18489 gcc_unreachable ();
18492 emit_insn (gen (d
->target
, d
->op0
, d
->op1
));
18496 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18497 a single vector permutation using a single intra-lane vector
18498 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18499 the non-swapped and swapped vectors together. */
18502 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
18504 struct expand_vec_perm_d dfirst
, dsecond
;
18505 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
18508 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
18512 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
18513 || !d
->one_operand_p
)
18517 for (i
= 0; i
< nelt
; i
++)
18518 dfirst
.perm
[i
] = 0xff;
18519 for (i
= 0, msk
= 0; i
< nelt
; i
++)
18521 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
18522 if (dfirst
.perm
[j
] != 0xff && dfirst
.perm
[j
] != d
->perm
[i
])
18524 dfirst
.perm
[j
] = d
->perm
[i
];
18528 for (i
= 0; i
< nelt
; i
++)
18529 if (dfirst
.perm
[i
] == 0xff)
18530 dfirst
.perm
[i
] = i
;
18533 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18536 ok
= expand_vec_perm_1 (&dfirst
);
18537 seq
= get_insns ();
18549 dsecond
.op0
= dfirst
.target
;
18550 dsecond
.op1
= dfirst
.target
;
18551 dsecond
.one_operand_p
= true;
18552 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18553 for (i
= 0; i
< nelt
; i
++)
18554 dsecond
.perm
[i
] = i
^ nelt2
;
18556 ok
= expand_vec_perm_1 (&dsecond
);
18559 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
18560 emit_insn (blend (d
->target
, dfirst
.target
, dsecond
.target
, GEN_INT (msk
)));
18564 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18565 permutation using two vperm2f128, followed by a vshufpd insn blending
18566 the two vectors together. */
18569 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d
*d
)
18571 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
18574 if (!TARGET_AVX
|| (d
->vmode
!= V4DFmode
))
18584 dfirst
.perm
[0] = (d
->perm
[0] & ~1);
18585 dfirst
.perm
[1] = (d
->perm
[0] & ~1) + 1;
18586 dfirst
.perm
[2] = (d
->perm
[2] & ~1);
18587 dfirst
.perm
[3] = (d
->perm
[2] & ~1) + 1;
18588 dsecond
.perm
[0] = (d
->perm
[1] & ~1);
18589 dsecond
.perm
[1] = (d
->perm
[1] & ~1) + 1;
18590 dsecond
.perm
[2] = (d
->perm
[3] & ~1);
18591 dsecond
.perm
[3] = (d
->perm
[3] & ~1) + 1;
18592 dthird
.perm
[0] = (d
->perm
[0] % 2);
18593 dthird
.perm
[1] = (d
->perm
[1] % 2) + 4;
18594 dthird
.perm
[2] = (d
->perm
[2] % 2) + 2;
18595 dthird
.perm
[3] = (d
->perm
[3] % 2) + 6;
18597 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18598 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18599 dthird
.op0
= dfirst
.target
;
18600 dthird
.op1
= dsecond
.target
;
18601 dthird
.one_operand_p
= false;
18603 canonicalize_perm (&dfirst
);
18604 canonicalize_perm (&dsecond
);
18606 ok
= expand_vec_perm_1 (&dfirst
)
18607 && expand_vec_perm_1 (&dsecond
)
18608 && expand_vec_perm_1 (&dthird
);
18615 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d
*);
18617 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18618 a two vector permutation using two intra-lane vector
18619 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18620 the non-swapped and swapped vectors together. */
18623 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
18625 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
18626 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, which1
= 0, which2
= 0;
18627 rtx_insn
*seq1
, *seq2
;
18629 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
18633 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
18634 || d
->one_operand_p
)
18639 for (i
= 0; i
< nelt
; i
++)
18641 dfirst
.perm
[i
] = 0xff;
18642 dsecond
.perm
[i
] = 0xff;
18644 for (i
= 0, msk
= 0; i
< nelt
; i
++)
18646 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
18649 dfirst
.perm
[j
] = d
->perm
[i
];
18650 which1
|= (d
->perm
[i
] < nelt
? 1 : 2);
18654 dsecond
.perm
[j
] = d
->perm
[i
];
18655 which2
|= (d
->perm
[i
] < nelt
? 1 : 2);
18659 if (msk
== 0 || msk
== (1U << nelt
) - 1)
18664 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18665 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18668 for (i
= 0; i
< nelt
; i
++)
18670 if (dfirst
.perm
[i
] == 0xff)
18671 dfirst
.perm
[i
] = (which1
== 2 ? i
+ nelt
: i
);
18672 if (dsecond
.perm
[i
] == 0xff)
18673 dsecond
.perm
[i
] = (which2
== 2 ? i
+ nelt
: i
);
18675 canonicalize_perm (&dfirst
);
18677 ok
= ix86_expand_vec_perm_const_1 (&dfirst
);
18678 seq1
= get_insns ();
18684 canonicalize_perm (&dsecond
);
18686 ok
= ix86_expand_vec_perm_const_1 (&dsecond
);
18687 seq2
= get_insns ();
18700 dthird
.op0
= dsecond
.target
;
18701 dthird
.op1
= dsecond
.target
;
18702 dthird
.one_operand_p
= true;
18703 dthird
.target
= gen_reg_rtx (dthird
.vmode
);
18704 for (i
= 0; i
< nelt
; i
++)
18705 dthird
.perm
[i
] = i
^ nelt2
;
18707 ok
= expand_vec_perm_1 (&dthird
);
18710 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
18711 emit_insn (blend (d
->target
, dfirst
.target
, dthird
.target
, GEN_INT (msk
)));
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i * eltsz + j] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][i * eltsz + j] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op0);
  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op1);
  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));

  op = d->target;
  if (d->vmode != V16QImode)
    op = gen_reg_rtx (V16QImode);
  emit_insn (gen_iorv16qi3 (op, l, h));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
18778 /* Implement arbitrary permutation of one V32QImode and V16QImode operand
18779 with two vpshufb insns, vpermq and vpor. We should have already failed
18780 all two or three instruction sequences. */
18783 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d
*d
)
18785 rtx rperm
[2][32], vperm
, l
, h
, hp
, op
, m128
;
18786 unsigned int i
, nelt
, eltsz
;
18789 || !d
->one_operand_p
18790 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
18797 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask an element is non-negative only if it is requested
     from the other lane; it is also moved to the other lane, so that the
     result of vpshufb has its two V2TImode halves swapped.  */
18806 m128
= GEN_INT (-128);
18807 for (i
= 0; i
< nelt
; ++i
)
18809 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
18810 unsigned which
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
18812 for (j
= 0; j
< eltsz
; ++j
)
18814 rperm
[!!which
][(i
* eltsz
+ j
) ^ which
] = GEN_INT (e
* eltsz
+ j
);
18815 rperm
[!which
][(i
* eltsz
+ j
) ^ (which
^ 16)] = m128
;
18819 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
18820 vperm
= force_reg (V32QImode
, vperm
);
18822 h
= gen_reg_rtx (V32QImode
);
18823 op
= gen_lowpart (V32QImode
, d
->op0
);
18824 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
  /* Swap the 128-bit lanes of h into hp.  */
18827 hp
= gen_reg_rtx (V4DImode
);
18828 op
= gen_lowpart (V4DImode
, h
);
18829 emit_insn (gen_avx2_permv4di_1 (hp
, op
, const2_rtx
, GEN_INT (3), const0_rtx
,
18832 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
18833 vperm
= force_reg (V32QImode
, vperm
);
18835 l
= gen_reg_rtx (V32QImode
);
18836 op
= gen_lowpart (V32QImode
, d
->op0
);
18837 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
18840 if (d
->vmode
!= V32QImode
)
18841 op
= gen_reg_rtx (V32QImode
);
18842 emit_insn (gen_iorv32qi3 (op
, l
, gen_lowpart (V32QImode
, hp
)));
18843 if (op
!= d
->target
)
18844 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
18849 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18850 and extract-odd permutations of two V32QImode and V16QImode operand
18851 with two vpshufb insns, vpor and vpermq. We should have already
18852 failed all two or three instruction sequences. */
18855 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d
*d
)
18857 rtx rperm
[2][32], vperm
, l
, h
, ior
, op
, m128
;
18858 unsigned int i
, nelt
, eltsz
;
18861 || d
->one_operand_p
18862 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
18865 for (i
= 0; i
< d
->nelt
; ++i
)
18866 if ((d
->perm
[i
] ^ (i
* 2)) & (3 * d
->nelt
/ 2))
18873 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
18875 /* Generate two permutation masks. In the first permutation mask
18876 the first quarter will contain indexes for the first half
18877 of the op0, the second quarter will contain bit 7 set, third quarter
18878 will contain indexes for the second half of the op0 and the
18879 last quarter bit 7 set. In the second permutation mask
18880 the first quarter will contain bit 7 set, the second quarter
18881 indexes for the first half of the op1, the third quarter bit 7 set
18882 and last quarter indexes for the second half of the op1.
18883 I.e. the first mask e.g. for V32QImode extract even will be:
18884 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18885 (all values masked with 0xf except for -128) and second mask
18886 for extract even will be
18887 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
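  /* The xorv value of 24 used below flips bits 3 and 4 of the byte index,
     i.e. it swaps the second and third 8-byte quarters, which is what
     produces the quarter layout described above.  */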
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }
18902 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
18903 vperm
= force_reg (V32QImode
, vperm
);
18905 l
= gen_reg_rtx (V32QImode
);
18906 op
= gen_lowpart (V32QImode
, d
->op0
);
18907 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
18909 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
18910 vperm
= force_reg (V32QImode
, vperm
);
18912 h
= gen_reg_rtx (V32QImode
);
18913 op
= gen_lowpart (V32QImode
, d
->op1
);
18914 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
18916 ior
= gen_reg_rtx (V32QImode
);
18917 emit_insn (gen_iorv32qi3 (ior
, l
, h
));
18919 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18920 op
= gen_reg_rtx (V4DImode
);
18921 ior
= gen_lowpart (V4DImode
, ior
);
18922 emit_insn (gen_avx2_permv4di_1 (op
, ior
, const0_rtx
, const2_rtx
,
18923 const1_rtx
, GEN_INT (3)));
18924 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
18929 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18930 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18931 with two "and" and "pack" or two "shift" and "pack" insns. We should
18932 have already failed all two instruction sequences. */
18935 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d
*d
)
18937 rtx op
, dop0
, dop1
, t
;
18938 unsigned i
, odd
, c
, s
, nelt
= d
->nelt
;
18939 bool end_perm
= false;
18940 machine_mode half_mode
;
18941 rtx (*gen_and
) (rtx
, rtx
, rtx
);
18942 rtx (*gen_pack
) (rtx
, rtx
, rtx
);
18943 rtx (*gen_shift
) (rtx
, rtx
, rtx
);
18945 if (d
->one_operand_p
)
18951 /* Required for "pack". */
18952 if (!TARGET_SSE4_1
)
18956 half_mode
= V4SImode
;
18957 gen_and
= gen_andv4si3
;
18958 gen_pack
= gen_sse4_1_packusdw
;
18959 gen_shift
= gen_lshrv4si3
;
18962 /* No check as all instructions are SSE2. */
18965 half_mode
= V8HImode
;
18966 gen_and
= gen_andv8hi3
;
18967 gen_pack
= gen_sse2_packuswb
;
18968 gen_shift
= gen_lshrv8hi3
;
18975 half_mode
= V8SImode
;
18976 gen_and
= gen_andv8si3
;
18977 gen_pack
= gen_avx2_packusdw
;
18978 gen_shift
= gen_lshrv8si3
;
18986 half_mode
= V16HImode
;
18987 gen_and
= gen_andv16hi3
;
18988 gen_pack
= gen_avx2_packuswb
;
18989 gen_shift
= gen_lshrv16hi3
;
      /* Only for V8HI, V16QI, V16HI and V32QI modes is this approach more
	 profitable than general shuffles.  */
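      /* For wider element modes an even/odd extraction is already handled
	 by a single shufps/pshufd or unpck-style sequence elsewhere, so
	 the and/shift-plus-pack trick below would not be a win there.  */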
18998 /* Check that permutation is even or odd. */
19003 for (i
= 1; i
< nelt
; ++i
)
19004 if (d
->perm
[i
] != 2 * i
+ odd
)
19010 dop0
= gen_reg_rtx (half_mode
);
19011 dop1
= gen_reg_rtx (half_mode
);
19014 t
= gen_const_vec_duplicate (half_mode
, GEN_INT (c
));
19015 t
= force_reg (half_mode
, t
);
19016 emit_insn (gen_and (dop0
, t
, gen_lowpart (half_mode
, d
->op0
)));
19017 emit_insn (gen_and (dop1
, t
, gen_lowpart (half_mode
, d
->op1
)));
19021 emit_insn (gen_shift (dop0
,
19022 gen_lowpart (half_mode
, d
->op0
),
19024 emit_insn (gen_shift (dop1
,
19025 gen_lowpart (half_mode
, d
->op1
),
  /* For the AVX2 256-bit case we need to permute the pack result.  */
19029 if (TARGET_AVX2
&& end_perm
)
19031 op
= gen_reg_rtx (d
->vmode
);
19032 t
= gen_reg_rtx (V4DImode
);
19033 emit_insn (gen_pack (op
, dop0
, dop1
));
19034 emit_insn (gen_avx2_permv4di_1 (t
,
19035 gen_lowpart (V4DImode
, op
),
19040 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, t
));
19043 emit_insn (gen_pack (d
->target
, dop0
, dop1
));
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insn for "odd"
   and two "truncs" and one "concat" insn for "even".
   Have already failed all two-instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  if (odd)
    {
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
19104 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
19105 and extract-odd permutations. */
19108 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d
*d
, unsigned odd
)
19110 rtx t1
, t2
, t3
, t4
, t5
;
19117 t1
= gen_reg_rtx (V4DFmode
);
19118 t2
= gen_reg_rtx (V4DFmode
);
19120 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19121 emit_insn (gen_avx_vperm2f128v4df3 (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
19122 emit_insn (gen_avx_vperm2f128v4df3 (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
19124 /* Now an unpck[lh]pd will produce the result required. */
19126 t3
= gen_avx_unpckhpd256 (d
->target
, t1
, t2
);
19128 t3
= gen_avx_unpcklpd256 (d
->target
, t1
, t2
);
19134 int mask
= odd
? 0xdd : 0x88;
19138 t1
= gen_reg_rtx (V8SFmode
);
19139 t2
= gen_reg_rtx (V8SFmode
);
19140 t3
= gen_reg_rtx (V8SFmode
);
19142 /* Shuffle within the 128-bit lanes to produce:
19143 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
19144 emit_insn (gen_avx_shufps256 (t1
, d
->op0
, d
->op1
,
19147 /* Shuffle the lanes around to produce:
19148 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
19149 emit_insn (gen_avx_vperm2f128v8sf3 (t2
, t1
, t1
,
19152 /* Shuffle within the 128-bit lanes to produce:
19153 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
19154 emit_insn (gen_avx_shufps256 (t3
, t1
, t2
, GEN_INT (0x44)));
19156 /* Shuffle within the 128-bit lanes to produce:
19157 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
19158 emit_insn (gen_avx_shufps256 (t2
, t1
, t2
, GEN_INT (0xee)));
19160 /* Shuffle the lanes around to produce:
19161 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
19162 emit_insn (gen_avx_vperm2f128v8sf3 (d
->target
, t3
, t2
,
19172 /* These are always directly implementable by expand_vec_perm_1. */
19173 gcc_unreachable ();
19176 gcc_assert (TARGET_MMX_WITH_SSE
);
19177 /* We have no suitable instructions. */
19185 /* We need 2*log2(N)-1 operations to achieve odd/even
19186 with interleave. */
19187 t1
= gen_reg_rtx (V4HImode
);
19188 emit_insn (gen_mmx_punpckhwd (t1
, d
->op0
, d
->op1
));
19189 emit_insn (gen_mmx_punpcklwd (d
->target
, d
->op0
, d
->op1
));
19191 t2
= gen_mmx_punpckhwd (d
->target
, d
->target
, t1
);
19193 t2
= gen_mmx_punpcklwd (d
->target
, d
->target
, t1
);
19199 return expand_vec_perm_even_odd_pack (d
);
19200 else if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
19201 return expand_vec_perm_pshufb2 (d
);
19206 /* We need 2*log2(N)-1 operations to achieve odd/even
19207 with interleave. */
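      /* For V8HImode this is five interleave insns in total: two at the
	 first level, two at the second, and one final interleave that
	 writes the result straight into the target.  */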
19208 t1
= gen_reg_rtx (V8HImode
);
19209 t2
= gen_reg_rtx (V8HImode
);
19210 emit_insn (gen_vec_interleave_highv8hi (t1
, d
->op0
, d
->op1
));
19211 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->op0
, d
->op1
));
19212 emit_insn (gen_vec_interleave_highv8hi (t2
, d
->target
, t1
));
19213 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t1
));
19215 t3
= gen_vec_interleave_highv8hi (d
->target
, d
->target
, t2
);
19217 t3
= gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t2
);
19223 return expand_vec_perm_even_odd_pack (d
);
19227 return expand_vec_perm_even_odd_pack (d
);
19230 return expand_vec_perm_even_odd_trunc (d
);
19235 struct expand_vec_perm_d d_copy
= *d
;
19236 d_copy
.vmode
= V4DFmode
;
19238 d_copy
.target
= gen_raw_REG (V4DFmode
, LAST_VIRTUAL_REGISTER
+ 1);
19240 d_copy
.target
= gen_reg_rtx (V4DFmode
);
19241 d_copy
.op0
= gen_lowpart (V4DFmode
, d
->op0
);
19242 d_copy
.op1
= gen_lowpart (V4DFmode
, d
->op1
);
19243 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
19246 emit_move_insn (d
->target
,
19247 gen_lowpart (V4DImode
, d_copy
.target
));
19256 t1
= gen_reg_rtx (V4DImode
);
19257 t2
= gen_reg_rtx (V4DImode
);
19259 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19260 emit_insn (gen_avx2_permv2ti (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
19261 emit_insn (gen_avx2_permv2ti (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
19263 /* Now an vpunpck[lh]qdq will produce the result required. */
19265 t3
= gen_avx2_interleave_highv4di (d
->target
, t1
, t2
);
19267 t3
= gen_avx2_interleave_lowv4di (d
->target
, t1
, t2
);
19274 struct expand_vec_perm_d d_copy
= *d
;
19275 d_copy
.vmode
= V8SFmode
;
19277 d_copy
.target
= gen_raw_REG (V8SFmode
, LAST_VIRTUAL_REGISTER
+ 1);
19279 d_copy
.target
= gen_reg_rtx (V8SFmode
);
19280 d_copy
.op0
= gen_lowpart (V8SFmode
, d
->op0
);
19281 d_copy
.op1
= gen_lowpart (V8SFmode
, d
->op1
);
19282 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
19285 emit_move_insn (d
->target
,
19286 gen_lowpart (V8SImode
, d_copy
.target
));
19295 t1
= gen_reg_rtx (V8SImode
);
19296 t2
= gen_reg_rtx (V8SImode
);
19297 t3
= gen_reg_rtx (V4DImode
);
19298 t4
= gen_reg_rtx (V4DImode
);
19299 t5
= gen_reg_rtx (V4DImode
);
19301 /* Shuffle the lanes around into
19302 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
19303 emit_insn (gen_avx2_permv2ti (t3
, gen_lowpart (V4DImode
, d
->op0
),
19304 gen_lowpart (V4DImode
, d
->op1
),
19306 emit_insn (gen_avx2_permv2ti (t4
, gen_lowpart (V4DImode
, d
->op0
),
19307 gen_lowpart (V4DImode
, d
->op1
),
19310 /* Swap the 2nd and 3rd position in each lane into
19311 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
19312 emit_insn (gen_avx2_pshufdv3 (t1
, gen_lowpart (V8SImode
, t3
),
19313 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19314 emit_insn (gen_avx2_pshufdv3 (t2
, gen_lowpart (V8SImode
, t4
),
19315 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19317 /* Now an vpunpck[lh]qdq will produce
19318 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
19320 t3
= gen_avx2_interleave_highv4di (t5
, gen_lowpart (V4DImode
, t1
),
19321 gen_lowpart (V4DImode
, t2
));
19323 t3
= gen_avx2_interleave_lowv4di (t5
, gen_lowpart (V4DImode
, t1
),
19324 gen_lowpart (V4DImode
, t2
));
19326 emit_move_insn (d
->target
, gen_lowpart (V8SImode
, t5
));
19330 gcc_unreachable ();
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
19355 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
19356 permutations. We assume that expand_vec_perm_1 has already failed. */
19359 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d
*d
)
19361 unsigned elt
= d
->perm
[0], nelt2
= d
->nelt
/ 2;
19362 machine_mode vmode
= d
->vmode
;
19363 unsigned char perm2
[4];
19364 rtx op0
= d
->op0
, dest
;
19371 /* These are special-cased in sse.md so that we can optionally
19372 use the vbroadcast instruction. They expand to two insns
19373 if the input happens to be in a register. */
19374 gcc_unreachable ();
19382 /* These are always implementable using standard shuffle patterns. */
19383 gcc_unreachable ();
19387 /* These can be implemented via interleave. We save one insn by
19388 stopping once we have promoted to V4SImode and then use pshufd. */
19394 rtx (*gen
) (rtx
, rtx
, rtx
)
19395 = vmode
== V16QImode
? gen_vec_interleave_lowv16qi
19396 : gen_vec_interleave_lowv8hi
;
19400 gen
= vmode
== V16QImode
? gen_vec_interleave_highv16qi
19401 : gen_vec_interleave_highv8hi
;
19406 dest
= gen_reg_rtx (vmode
);
19407 emit_insn (gen (dest
, op0
, op0
));
19408 vmode
= get_mode_wider_vector (vmode
);
19409 op0
= gen_lowpart (vmode
, dest
);
19411 while (vmode
!= V4SImode
);
19413 memset (perm2
, elt
, 4);
19414 dest
= gen_reg_rtx (V4SImode
);
19415 ok
= expand_vselect (dest
, op0
, perm2
, 4, d
->testing_p
);
19418 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
19426 /* For AVX2 broadcasts of the first element vpbroadcast* or
19427 vpermq should be used by expand_vec_perm_1. */
19428 gcc_assert (!TARGET_AVX2
|| d
->perm
[0]);
19432 gcc_unreachable ();
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
19455 /* Implement arbitrary permutations of two V64QImode operands
19456 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
19458 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d
*d
)
19460 if (!TARGET_AVX512BW
|| !(d
->vmode
== V64QImode
))
19466 struct expand_vec_perm_d ds
[2];
19467 rtx rperm
[128], vperm
, target0
, target1
;
19468 unsigned int i
, nelt
;
19469 machine_mode vmode
;
19474 for (i
= 0; i
< 2; i
++)
19477 ds
[i
].vmode
= V32HImode
;
19479 ds
[i
].target
= gen_reg_rtx (V32HImode
);
19480 ds
[i
].op0
= gen_lowpart (V32HImode
, d
->op0
);
19481 ds
[i
].op1
= gen_lowpart (V32HImode
, d
->op1
);
19484 /* Prepare permutations such that the first one takes care of
19485 putting the even bytes into the right positions or one higher
19486 positions (ds[0]) and the second one takes care of
19487 putting the odd bytes into the right positions or one below
19490 for (i
= 0; i
< nelt
; i
++)
19492 ds
[i
& 1].perm
[i
/ 2] = d
->perm
[i
] / 2;
19495 rperm
[i
] = constm1_rtx
;
19496 rperm
[i
+ 64] = GEN_INT ((i
& 14) + (d
->perm
[i
] & 1));
19500 rperm
[i
] = GEN_INT ((i
& 14) + (d
->perm
[i
] & 1));
19501 rperm
[i
+ 64] = constm1_rtx
;
19505 bool ok
= expand_vec_perm_1 (&ds
[0]);
19507 ds
[0].target
= gen_lowpart (V64QImode
, ds
[0].target
);
19509 ok
= expand_vec_perm_1 (&ds
[1]);
19511 ds
[1].target
= gen_lowpart (V64QImode
, ds
[1].target
);
19513 vperm
= gen_rtx_CONST_VECTOR (V64QImode
, gen_rtvec_v (64, rperm
));
19514 vperm
= force_reg (vmode
, vperm
);
19515 target0
= gen_reg_rtx (V64QImode
);
19516 emit_insn (gen_avx512bw_pshufbv64qi3 (target0
, ds
[0].target
, vperm
));
19518 vperm
= gen_rtx_CONST_VECTOR (V64QImode
, gen_rtvec_v (64, rperm
+ 64));
19519 vperm
= force_reg (vmode
, vperm
);
19520 target1
= gen_reg_rtx (V64QImode
);
19521 emit_insn (gen_avx512bw_pshufbv64qi3 (target1
, ds
[1].target
, vperm
));
19523 emit_insn (gen_iorv64qi3 (d
->target
, target0
, target1
));
19527 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
19528 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
19529 all the shorter instruction sequences. */
19532 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d
*d
)
19534 rtx rperm
[4][32], vperm
, l
[2], h
[2], op
, m128
;
19535 unsigned int i
, nelt
, eltsz
;
19539 || d
->one_operand_p
19540 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
19547 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask an element is non-negative only if it is requested
     from the other lane; it is also moved to the other lane, so that the
     result of vpshufb has its two V2TImode halves swapped.  */
19556 m128
= GEN_INT (-128);
19557 for (i
= 0; i
< 32; ++i
)
19559 rperm
[0][i
] = m128
;
19560 rperm
[1][i
] = m128
;
19561 rperm
[2][i
] = m128
;
19562 rperm
[3][i
] = m128
;
19568 for (i
= 0; i
< nelt
; ++i
)
19570 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
19571 unsigned xlane
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
19572 unsigned int which
= ((d
->perm
[i
] & nelt
) ? 2 : 0) + (xlane
? 1 : 0);
19574 for (j
= 0; j
< eltsz
; ++j
)
19575 rperm
[which
][(i
* eltsz
+ j
) ^ xlane
] = GEN_INT (e
* eltsz
+ j
);
19576 used
[which
] = true;
19579 for (i
= 0; i
< 2; ++i
)
19581 if (!used
[2 * i
+ 1])
19586 vperm
= gen_rtx_CONST_VECTOR (V32QImode
,
19587 gen_rtvec_v (32, rperm
[2 * i
+ 1]));
19588 vperm
= force_reg (V32QImode
, vperm
);
19589 h
[i
] = gen_reg_rtx (V32QImode
);
19590 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
19591 emit_insn (gen_avx2_pshufbv32qi3 (h
[i
], op
, vperm
));
  /* Swap the 128-bit lanes of h[X].  */
19595 for (i
= 0; i
< 2; ++i
)
19597 if (h
[i
] == NULL_RTX
)
19599 op
= gen_reg_rtx (V4DImode
);
19600 emit_insn (gen_avx2_permv4di_1 (op
, gen_lowpart (V4DImode
, h
[i
]),
19601 const2_rtx
, GEN_INT (3), const0_rtx
,
19603 h
[i
] = gen_lowpart (V32QImode
, op
);
19606 for (i
= 0; i
< 2; ++i
)
19613 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[2 * i
]));
19614 vperm
= force_reg (V32QImode
, vperm
);
19615 l
[i
] = gen_reg_rtx (V32QImode
);
19616 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
19617 emit_insn (gen_avx2_pshufbv32qi3 (l
[i
], op
, vperm
));
19620 for (i
= 0; i
< 2; ++i
)
19624 op
= gen_reg_rtx (V32QImode
);
19625 emit_insn (gen_iorv32qi3 (op
, l
[i
], h
[i
]));
19632 gcc_assert (l
[0] && l
[1]);
19634 if (d
->vmode
!= V32QImode
)
19635 op
= gen_reg_rtx (V32QImode
);
19636 emit_insn (gen_iorv32qi3 (op
, l
[0], l
[1]));
19637 if (op
!= d
->target
)
19638 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly...  */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
	{
	  d->one_operand_p = false;
	  break;
	}
      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
	d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
19782 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19785 ix86_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
19786 rtx op1
, const vec_perm_indices
&sel
)
19788 struct expand_vec_perm_d d
;
19789 unsigned char perm
[MAX_VECT_LEN
];
19790 unsigned int i
, nelt
, which
;
19798 gcc_assert (VECTOR_MODE_P (d
.vmode
));
19799 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
19800 d
.testing_p
= !target
;
19802 gcc_assert (sel
.length () == nelt
);
19803 gcc_checking_assert (sizeof (d
.perm
) == sizeof (perm
));
19805 /* Given sufficient ISA support we can just return true here
19806 for selected vector modes. */
19813 if (!TARGET_AVX512F
)
19815 /* All implementable with a single vperm[it]2 insn. */
19820 if (!TARGET_AVX512BW
)
19823 /* All implementable with a single vperm[it]2 insn. */
19827 if (!TARGET_AVX512BW
)
19830 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19839 if (d
.testing_p
&& TARGET_AVX512VL
)
19840 /* All implementable with a single vperm[it]2 insn. */
19846 if (d
.testing_p
&& TARGET_AVX2
)
19847 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19853 if (d
.testing_p
&& TARGET_AVX2
)
19854 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19861 /* Fall through. */
19866 /* All implementable with a single vpperm insn. */
19867 if (d
.testing_p
&& TARGET_XOP
)
19869 /* All implementable with 2 pshufb + 1 ior. */
19870 if (d
.testing_p
&& TARGET_SSSE3
)
19876 if (!TARGET_MMX_WITH_SSE
)
19883 /* All implementable with shufpd or unpck[lh]pd. */
19891 for (i
= which
= 0; i
< nelt
; ++i
)
19893 unsigned char e
= sel
[i
];
19894 gcc_assert (e
< 2 * nelt
);
19897 which
|= (e
< nelt
? 1 : 2);
19902 /* For all elements from second vector, fold the elements to first. */
19904 for (i
= 0; i
< nelt
; ++i
)
19907 /* Check whether the mask can be applied to the vector type. */
19908 d
.one_operand_p
= (which
!= 3);
19910 /* Implementable with shufps or pshufd. */
19911 if (d
.one_operand_p
19912 && (d
.vmode
== V4SFmode
|| d
.vmode
== V2SFmode
19913 || d
.vmode
== V4SImode
|| d
.vmode
== V2SImode
))
19916 /* Otherwise we have to go through the motions and see if we can
19917 figure out how to generate the requested permutation. */
19918 d
.target
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 1);
19919 d
.op1
= d
.op0
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 2);
19920 if (!d
.one_operand_p
)
19921 d
.op1
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 3);
19924 bool ret
= ix86_expand_vec_perm_const_1 (&d
);
19930 two_args
= canonicalize_perm (&d
);
19932 /* If one of the operands is a zero vector, try to match pmovzx. */
19933 if (two_args
&& (d
.op0
== CONST0_RTX (vmode
) || d
.op1
== CONST0_RTX (vmode
)))
19935 struct expand_vec_perm_d dzero
= d
;
19936 if (d
.op0
== CONST0_RTX (vmode
))
19938 d
.op1
= dzero
.op1
= force_reg (vmode
, d
.op1
);
19939 std::swap (dzero
.op0
, dzero
.op1
);
19940 for (i
= 0; i
< nelt
; ++i
)
19941 dzero
.perm
[i
] ^= nelt
;
19944 d
.op0
= dzero
.op0
= force_reg (vmode
, d
.op0
);
19946 if (expand_vselect_vconcat (dzero
.target
, dzero
.op0
, dzero
.op1
,
19947 dzero
.perm
, nelt
, dzero
.testing_p
))
19951 /* Force operands into registers. */
19952 rtx nop0
= force_reg (vmode
, d
.op0
);
19953 if (d
.op0
== d
.op1
)
19956 d
.op1
= force_reg (vmode
, d
.op1
);
19958 if (ix86_expand_vec_perm_const_1 (&d
))
19961 /* If the selector says both arguments are needed, but the operands are the
19962 same, the above tried to expand with one_operand_p and flattened selector.
19963 If that didn't work, retry without one_operand_p; we succeeded with that
19965 if (two_args
&& d
.one_operand_p
)
19967 d
.one_operand_p
= false;
19968 memcpy (d
.perm
, perm
, sizeof (perm
));
19969 return ix86_expand_vec_perm_const_1 (&d
);
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}

void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* Optimize vector MUL generation for V8QI, V16QI and V32QI
   under TARGET_AVX512BW, i.e. for v16qi a * b it emits

     vpmovzxbw ymm2, xmm0
     vpmovzxbw ymm3, xmm1
     vpmullw   ymm4, ymm2, ymm3
     vpmovwb   xmm0, ymm4

   which takes fewer instructions than ix86_expand_vecop_qihi.
   Return true on success.  */
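/* In outline: both QImode vector operands are zero-extended to the
   corresponding HImode vector mode (V8QI -> V8HImode, V16QI -> V16HImode,
   V32QI -> V32HImode), multiplied there, and the product is truncated
   back to the QImode vector.  */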
20039 ix86_expand_vecmul_qihi (rtx dest
, rtx op1
, rtx op2
)
20041 machine_mode himode
, qimode
= GET_MODE (dest
);
20042 rtx hop1
, hop2
, hdest
;
20043 rtx (*gen_extend
)(rtx
, rtx
);
20044 rtx (*gen_truncate
)(rtx
, rtx
);
20046 /* There's no V64HImode multiplication instruction. */
20047 if (qimode
== E_V64QImode
)
20050 /* vpmovwb only available under AVX512BW. */
20051 if (!TARGET_AVX512BW
)
20053 if ((qimode
== V8QImode
|| qimode
== V16QImode
)
20054 && !TARGET_AVX512VL
)
  /* Do not generate a zmm instruction when the preferred vector width is
     128 or 256 bits.  */
20057 if (qimode
== V32QImode
20058 && (TARGET_PREFER_AVX128
|| TARGET_PREFER_AVX256
))
20065 gen_extend
= gen_zero_extendv8qiv8hi2
;
20066 gen_truncate
= gen_truncv8hiv8qi2
;
20069 himode
= V16HImode
;
20070 gen_extend
= gen_zero_extendv16qiv16hi2
;
20071 gen_truncate
= gen_truncv16hiv16qi2
;
20074 himode
= V32HImode
;
20075 gen_extend
= gen_zero_extendv32qiv32hi2
;
20076 gen_truncate
= gen_truncv32hiv32qi2
;
20079 gcc_unreachable ();
20082 hop1
= gen_reg_rtx (himode
);
20083 hop2
= gen_reg_rtx (himode
);
20084 hdest
= gen_reg_rtx (himode
);
20085 emit_insn (gen_extend (hop1
, op1
));
20086 emit_insn (gen_extend (hop2
, op2
));
20087 emit_insn (gen_rtx_SET (hdest
, simplify_gen_binary (MULT
, himode
,
20089 emit_insn (gen_truncate (dest
, hdest
));
20093 /* Expand a vector operation shift by constant for a V*QImode in terms of the
20094 same operation on V*HImode. Return true if success. */
20096 ix86_expand_vec_shift_qihi_constant (enum rtx_code code
, rtx dest
, rtx op1
, rtx op2
)
20098 machine_mode qimode
, himode
;
20099 HOST_WIDE_INT and_constant
, xor_constant
;
20100 HOST_WIDE_INT shift_amount
;
20101 rtx vec_const_and
, vec_const_xor
;
20102 rtx tmp
, op1_subreg
;
20103 rtx (*gen_shift
) (rtx
, rtx
, rtx
);
20104 rtx (*gen_and
) (rtx
, rtx
, rtx
);
20105 rtx (*gen_xor
) (rtx
, rtx
, rtx
);
20106 rtx (*gen_sub
) (rtx
, rtx
, rtx
);
20108 /* Only optimize shift by constant. */
20109 if (!CONST_INT_P (op2
))
20112 qimode
= GET_MODE (dest
);
20113 shift_amount
= INTVAL (op2
);
  /* Do nothing when the shift amount is greater than or equal to 8.  */
20115 if (shift_amount
> 7)
20118 gcc_assert (code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
);
20119 /* Record sign bit. */
20120 xor_constant
= 1 << (8 - shift_amount
- 1);
20122 /* Zero upper/lower bits shift from left/right element. */
20124 = (code
== ASHIFT
? 256 - (1 << shift_amount
)
20125 : (1 << (8 - shift_amount
)) - 1);
20134 : (code
== ASHIFTRT
) ? gen_ashrv8hi3
: gen_lshrv8hi3
);
20135 gen_and
= gen_andv16qi3
;
20136 gen_xor
= gen_xorv16qi3
;
20137 gen_sub
= gen_subv16qi3
;
20140 himode
= V16HImode
;
20144 : (code
== ASHIFTRT
) ? gen_ashrv16hi3
: gen_lshrv16hi3
);
20145 gen_and
= gen_andv32qi3
;
20146 gen_xor
= gen_xorv32qi3
;
20147 gen_sub
= gen_subv32qi3
;
20150 himode
= V32HImode
;
20154 : (code
== ASHIFTRT
) ? gen_ashrv32hi3
: gen_lshrv32hi3
);
20155 gen_and
= gen_andv64qi3
;
20156 gen_xor
= gen_xorv64qi3
;
20157 gen_sub
= gen_subv64qi3
;
20160 gcc_unreachable ();
20163 tmp
= gen_reg_rtx (himode
);
20164 vec_const_and
= gen_reg_rtx (qimode
);
20165 op1_subreg
= lowpart_subreg (himode
, op1
, qimode
);
20167 /* For ASHIFT and LSHIFTRT, perform operation like
20168 vpsllw/vpsrlw $shift_amount, %op1, %dest.
20169 vpand %vec_const_and, %dest. */
20170 emit_insn (gen_shift (tmp
, op1_subreg
, op2
));
20171 emit_move_insn (dest
, simplify_gen_subreg (qimode
, tmp
, himode
, 0));
20172 emit_move_insn (vec_const_and
,
20173 ix86_build_const_vector (qimode
, true,
20174 gen_int_mode (and_constant
, QImode
)));
20175 emit_insn (gen_and (dest
, dest
, vec_const_and
));
20177 /* For ASHIFTRT, perform extra operation like
20178 vpxor %vec_const_xor, %dest, %dest
20179 vpsubb %vec_const_xor, %dest, %dest */
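  /* This is the usual (x ^ m) - m identity: after the logical HImode
     shift each byte holds a zero-extended value, and xor-ing then
     subtracting the sign-bit mask m sign-extends it in place.  */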
20180 if (code
== ASHIFTRT
)
20182 vec_const_xor
= gen_reg_rtx (qimode
);
20183 emit_move_insn (vec_const_xor
,
20184 ix86_build_const_vector (qimode
, true,
20185 gen_int_mode (xor_constant
, QImode
)));
20186 emit_insn (gen_xor (dest
, dest
, vec_const_xor
));
20187 emit_insn (gen_sub (dest
, dest
, vec_const_xor
));
20192 /* Expand a vector operation CODE for a V*QImode in terms of the
20193 same operation on V*HImode. */
20196 ix86_expand_vecop_qihi (enum rtx_code code
, rtx dest
, rtx op1
, rtx op2
)
20198 machine_mode qimode
= GET_MODE (dest
);
20199 machine_mode himode
;
20200 rtx (*gen_il
) (rtx
, rtx
, rtx
);
20201 rtx (*gen_ih
) (rtx
, rtx
, rtx
);
20202 rtx op1_l
, op1_h
, op2_l
, op2_h
, res_l
, res_h
;
20203 struct expand_vec_perm_d d
;
20204 bool ok
, full_interleave
;
20205 bool uns_p
= false;
20212 gen_il
= gen_vec_interleave_lowv16qi
;
20213 gen_ih
= gen_vec_interleave_highv16qi
;
20216 himode
= V16HImode
;
20217 gen_il
= gen_avx2_interleave_lowv32qi
;
20218 gen_ih
= gen_avx2_interleave_highv32qi
;
20221 himode
= V32HImode
;
20222 gen_il
= gen_avx512bw_interleave_lowv64qi
;
20223 gen_ih
= gen_avx512bw_interleave_highv64qi
;
20226 gcc_unreachable ();
20229 op2_l
= op2_h
= op2
;
20233 /* Unpack data such that we've got a source byte in each low byte of
20234 each word. We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, it is most convenient to
	 let it be a copy of the low byte.  */
20237 op2_l
= gen_reg_rtx (qimode
);
20238 op2_h
= gen_reg_rtx (qimode
);
20239 emit_insn (gen_il (op2_l
, op2
, op2
));
20240 emit_insn (gen_ih (op2_h
, op2
, op2
));
20242 op1_l
= gen_reg_rtx (qimode
);
20243 op1_h
= gen_reg_rtx (qimode
);
20244 emit_insn (gen_il (op1_l
, op1
, op1
));
20245 emit_insn (gen_ih (op1_h
, op1
, op1
));
20246 full_interleave
= qimode
== V16QImode
;
20254 op1_l
= gen_reg_rtx (himode
);
20255 op1_h
= gen_reg_rtx (himode
);
20256 ix86_expand_sse_unpack (op1_l
, op1
, uns_p
, false);
20257 ix86_expand_sse_unpack (op1_h
, op1
, uns_p
, true);
20258 full_interleave
= true;
20261 gcc_unreachable ();
20264 /* Perform the operation. */
20265 res_l
= expand_simple_binop (himode
, code
, op1_l
, op2_l
, NULL_RTX
,
20267 res_h
= expand_simple_binop (himode
, code
, op1_h
, op2_h
, NULL_RTX
,
20269 gcc_assert (res_l
&& res_h
);
20271 /* Merge the data back into the right place. */
20273 d
.op0
= gen_lowpart (qimode
, res_l
);
20274 d
.op1
= gen_lowpart (qimode
, res_h
);
20276 d
.nelt
= GET_MODE_NUNITS (qimode
);
20277 d
.one_operand_p
= false;
20278 d
.testing_p
= false;
20280 if (full_interleave
)
      /* For SSE2, we used a full interleave, so the desired
	 results are in the even elements.  */
20284 for (i
= 0; i
< d
.nelt
; ++i
)
20289 /* For AVX, the interleave used above was not cross-lane. So the
20290 extraction is evens but with the second and third quarter swapped.
20291 Happily, that is even one insn shorter than even extraction.
20292 For AVX512BW we have 4 lanes. We extract evens from within a lane,
20293 always first from the first and then from the second source operand,
20294 the index bits above the low 4 bits remains the same.
20295 Thus, for d.nelt == 32 we want permutation
20296 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
20297 and for d.nelt == 64 we want permutation
20298 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
20299 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
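      /* Spot-checking the formula below against the layout above
	 (illustrative): with d.nelt == 32, i == 8 gives 0 + 32 + 0 == 32
	 and i == 16 gives 0 + 0 + 16 == 16, matching the listed
	 permutation.  */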
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
20304 ok
= ix86_expand_vec_perm_const_1 (&d
);
20307 set_unique_reg_note (get_last_insn (), REG_EQUAL
,
20308 gen_rtx_fmt_ee (code
, qimode
, op1
, op2
));
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
20330 ix86_expand_mul_widen_evenodd (rtx dest
, rtx op1
, rtx op2
,
20331 bool uns_p
, bool odd_p
)
20333 machine_mode mode
= GET_MODE (op1
);
20334 machine_mode wmode
= GET_MODE (dest
);
20336 rtx orig_op1
= op1
, orig_op2
= op2
;
20338 if (!nonimmediate_operand (op1
, mode
))
20339 op1
= force_reg (mode
, op1
);
20340 if (!nonimmediate_operand (op2
, mode
))
20341 op2
= force_reg (mode
, op2
);
20343 /* We only play even/odd games with vectors of SImode. */
20344 gcc_assert (mode
== V4SImode
|| mode
== V8SImode
|| mode
== V16SImode
);
20346 /* If we're looking for the odd results, shift those members down to
20347 the even slots. For some cpus this is faster than a PSHUFD. */
20350 /* For XOP use vpmacsdqh, but only for smult, as it is only
20352 if (TARGET_XOP
&& mode
== V4SImode
&& !uns_p
)
20354 x
= force_reg (wmode
, CONST0_RTX (wmode
));
20355 emit_insn (gen_xop_pmacsdqh (dest
, op1
, op2
, x
));
20359 x
= GEN_INT (GET_MODE_UNIT_BITSIZE (mode
));
20360 if (!const_vector_equal_evenodd_p (orig_op1
))
20361 op1
= expand_binop (wmode
, lshr_optab
, gen_lowpart (wmode
, op1
),
20362 x
, NULL
, 1, OPTAB_DIRECT
);
20363 if (!const_vector_equal_evenodd_p (orig_op2
))
20364 op2
= expand_binop (wmode
, lshr_optab
, gen_lowpart (wmode
, op2
),
20365 x
, NULL
, 1, OPTAB_DIRECT
);
20366 op1
= gen_lowpart (mode
, op1
);
20367 op2
= gen_lowpart (mode
, op2
);
  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */
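      /* Concretely: sign-extending a 32-bit element A to 64 bits gives
	 lo(A) + (hi(A) << 32), where hi(A) is 0 or 0xffffffff.  The 64-bit
	 product is then lo(A)*lo(B) + ((hi(A)*lo(B) + hi(B)*lo(A)) << 32);
	 the hi(A)*hi(B) term lies entirely above bit 63 and is discarded.  */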
      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }

  emit_insn (x);
}

void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;
    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
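      /* The high_p mask evaluates to 0xfa and selects elements { 2, 2, 3, 3 }
	 within each 128-bit lane; the other mask is 0x50, selecting
	 { 0, 0, 1, 1 }.  */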
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}

void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */
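  /* res_3 holds the widened products of the even-numbered elements of op1
     and op2, res_4 those of the odd-numbered elements.  The pshufd copies
     below move the low 32 bits of each product into elements 0 and 1, and
     the interleave-low then restores the original element order.  */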
  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}

void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H  */
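      /* Viewed as V2DI, each element of op1 is A + (B << 32) (resp. C, D)
	 and each element of op2 is E + (F << 32) (resp. G, H), so the low
	 64 bits of the product are A*E + ((A*F + B*E) << 32), which is what
	 the sequence below assembles.  */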
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all.  */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();
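      /* Writing each 64-bit element as lo + (hi << 32), the low 64 bits of
	 the product are lo1*lo2 + ((lo1*hi2 + hi1*lo2) << 32); the hi1*hi2
	 term falls entirely above bit 63 and can be ignored.  */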
      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}

/* Return 1 if control transfer instruction INSN
   should be encoded with a notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }
  return false;
}

/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead
	 of 32 and use logical instead of arithmetic right shift (which is
	 unimplemented) and subtract.  */
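      /* In the SSE4.2 form, the comparison sets tmp0 to -1 where X is
	 negative and to 0 otherwise, so (X ^ tmp0) - tmp0 yields |X|,
	 the same identity used for the 32-bit case below.  */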
      if (TARGET_SSE4_2)
	{
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;
    case E_V4SImode:
    case E_V8SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
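      /* E.g. with W == 32 and X == -5: X >> 31 == -1, so
	 (-5 ^ -1) - (-1) == 4 + 1 == 5; for non-negative X the shift
	 gives 0 and X is returned unchanged.  */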
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;
    case E_V8HImode:
    case E_V16HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;
    case E_V16QImode:
    case E_V32QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
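      /* E.g. X == -3 is 253 unsigned while -X == 3, so the unsigned minimum
	 picks 3; the lone exception X == -128 maps to itself, the usual
	 wrap-around result of 8-bit negation.  */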
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}

/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}

/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));

	return true;
      }

    default:
      return false;
    }
}

/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */

/* Generate call to __divmoddi4.  */
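/* __divmoddi4 returns the quotient as its value and stores the remainder
   through the pointer passed as its third argument, so the remainder is
   given a stack temporary below.  */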
void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
			    rtx op0, rtx op1,
			    rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
				      mode, op0, mode, op1, mode,
				      XEXP (rem, 0), Pmode);

  *quot_p = quot;
  *rem_p = rem;
}

#include "gt-i386-expand.h"