gcc.git: gcc/config/i386/i386-expand.c (blob at commit 280645f60d5cf2dc571329cd9686327ac212674e)
1 /* Copyright (C) 1988-2021 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95
96 /* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
101
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105 {
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuses to split volatile memory addresses,
136 but we still have to handle them. */
137 if (MEM_P (op))
138 {
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
157 hi_half[num] = simplify_gen_subreg (half_mode, op,
158 GET_MODE (op) == VOIDmode
159 ? mode : GET_MODE (op), byte);
160 }
161 }
162 }
163
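/* Illustrative example (assumed values, for exposition only): splitting a
   DImode memory operand on a 32-bit target, where half_mode is SImode and
   byte is 4,
     operands[i] = (mem:DI (reg:SI base))
     lo_half[i]  = (mem:SI (reg:SI base))                          ; offset 0
     hi_half[i]  = (mem:SI (plus:SI (reg:SI base) (const_int 4)))  ; offset 4
   A register or constant operand is instead split with simplify_gen_subreg
   into subregs at bytes 0 and 4.  */
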
164 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
165 for the target. */
166
167 void
168 ix86_expand_clear (rtx dest)
169 {
170 rtx tmp;
171
172 /* We play register width games, which are only valid after reload. */
173 gcc_assert (reload_completed);
174
175 /* Avoid HImode and its attendant prefix byte. */
176 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
177 dest = gen_rtx_REG (SImode, REGNO (dest));
178 tmp = gen_rtx_SET (dest, const0_rtx);
179
180 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
181 {
182 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
183 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
184 }
185
186 emit_insn (tmp);
187 }
188
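/* Rough sketch of the two alternatives above (assembly shown only as an
   illustration of the matched patterns): when the xor form is chosen (no
   TARGET_USE_MOV0, or optimizing for size) the result is
     xorl  %eax, %eax        ; shorter encoding, clobbers the flags
   otherwise it is
     movl  $0, %eax          ; longer encoding, leaves the flags intact
   which is why only the xor form carries the explicit FLAGS_REG clobber.  */
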
189 void
190 ix86_expand_move (machine_mode mode, rtx operands[])
191 {
192 rtx op0, op1;
193 rtx tmp, addend = NULL_RTX;
194 enum tls_model model;
195
196 op0 = operands[0];
197 op1 = operands[1];
198
199 /* Avoid complex sets of likely spilled hard registers before reload. */
200 if (!ix86_hardreg_mov_ok (op0, op1))
201 {
202 tmp = gen_reg_rtx (mode);
203 operands[0] = tmp;
204 ix86_expand_move (mode, operands);
205 operands[0] = op0;
206 operands[1] = tmp;
207 op1 = tmp;
208 }
209
210 switch (GET_CODE (op1))
211 {
212 case CONST:
213 tmp = XEXP (op1, 0);
214
215 if (GET_CODE (tmp) != PLUS
216 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
217 break;
218
219 op1 = XEXP (tmp, 0);
220 addend = XEXP (tmp, 1);
221 /* FALLTHRU */
222
223 case SYMBOL_REF:
224 model = SYMBOL_REF_TLS_MODEL (op1);
225
226 if (model)
227 op1 = legitimize_tls_address (op1, model, true);
228 else if (ix86_force_load_from_GOT_p (op1))
229 {
230 /* Load the external function address via the GOT slot to avoid the PLT. */
231 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
232 (TARGET_64BIT
233 ? UNSPEC_GOTPCREL
234 : UNSPEC_GOT));
235 op1 = gen_rtx_CONST (Pmode, op1);
236 op1 = gen_const_mem (Pmode, op1);
237 set_mem_alias_set (op1, ix86_GOT_alias_set ());
238 }
239 else
240 {
241 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
242 if (tmp)
243 {
244 op1 = tmp;
245 if (!addend)
246 break;
247 }
248 else
249 {
250 op1 = operands[1];
251 break;
252 }
253 }
254
255 if (addend)
256 {
257 op1 = force_operand (op1, NULL_RTX);
258 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
259 op0, 1, OPTAB_DIRECT);
260 }
261 else
262 op1 = force_operand (op1, op0);
263
264 if (op1 == op0)
265 return;
266
267 op1 = convert_to_mode (mode, op1, 1);
268
269 default:
270 break;
271 }
272
273 if ((flag_pic || MACHOPIC_INDIRECT)
274 && symbolic_operand (op1, mode))
275 {
276 if (TARGET_MACHO && !TARGET_64BIT)
277 {
278 #if TARGET_MACHO
279 /* dynamic-no-pic */
280 if (MACHOPIC_INDIRECT)
281 {
282 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
283 ? op0 : gen_reg_rtx (Pmode);
284 op1 = machopic_indirect_data_reference (op1, temp);
285 if (MACHOPIC_PURE)
286 op1 = machopic_legitimize_pic_address (op1, mode,
287 temp == op1 ? 0 : temp);
288 }
289 if (op0 != op1 && GET_CODE (op0) != MEM)
290 {
291 rtx insn = gen_rtx_SET (op0, op1);
292 emit_insn (insn);
293 return;
294 }
295 if (GET_CODE (op0) == MEM)
296 op1 = force_reg (Pmode, op1);
297 else
298 {
299 rtx temp = op0;
300 if (GET_CODE (temp) != REG)
301 temp = gen_reg_rtx (Pmode);
302 temp = legitimize_pic_address (op1, temp);
303 if (temp == op0)
304 return;
305 op1 = temp;
306 }
307 /* dynamic-no-pic */
308 #endif
309 }
310 else
311 {
312 if (MEM_P (op0))
313 op1 = force_reg (mode, op1);
314 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
315 {
316 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
317 op1 = legitimize_pic_address (op1, reg);
318 if (op0 == op1)
319 return;
320 op1 = convert_to_mode (mode, op1, 1);
321 }
322 }
323 }
324 else
325 {
326 if (MEM_P (op0)
327 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
328 || !push_operand (op0, mode))
329 && MEM_P (op1))
330 op1 = force_reg (mode, op1);
331
332 if (push_operand (op0, mode)
333 && ! general_no_elim_operand (op1, mode))
334 op1 = copy_to_mode_reg (mode, op1);
335
336 /* Force large constants in 64-bit compilation into a register
337 to get them CSEd. */
338 if (can_create_pseudo_p ()
339 && (mode == DImode) && TARGET_64BIT
340 && immediate_operand (op1, mode)
341 && !x86_64_zext_immediate_operand (op1, VOIDmode)
342 && !register_operand (op0, mode)
343 && optimize)
344 op1 = copy_to_mode_reg (mode, op1);
345
346 if (can_create_pseudo_p ()
347 && CONST_DOUBLE_P (op1))
348 {
349 /* If we are loading a floating point constant into a register,
350 force the value to memory now, since we'll get better code
351 out of the back end. */
352
353 op1 = validize_mem (force_const_mem (mode, op1));
354 if (!register_operand (op0, mode))
355 {
356 rtx temp = gen_reg_rtx (mode);
357 emit_insn (gen_rtx_SET (temp, op1));
358 emit_move_insn (op0, temp);
359 return;
360 }
361 }
362 }
363
364 emit_insn (gen_rtx_SET (op0, op1));
365 }
366
367 void
368 ix86_expand_vector_move (machine_mode mode, rtx operands[])
369 {
370 rtx op0 = operands[0], op1 = operands[1];
371 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
372 psABI, since the biggest alignment there is 4 bytes. */
373 unsigned int align = (TARGET_IAMCU
374 ? GET_MODE_BITSIZE (mode)
375 : GET_MODE_ALIGNMENT (mode));
376
377 if (push_operand (op0, VOIDmode))
378 op0 = emit_move_resolve_push (mode, op0);
379
380 /* Force constants other than zero into memory. We do not know how
381 the instructions used to build constants modify the upper 64 bits
382 of the register; once we have that information, we may be able
383 to handle some of them more efficiently. */
384 if (can_create_pseudo_p ()
385 && (CONSTANT_P (op1)
386 || (SUBREG_P (op1)
387 && CONSTANT_P (SUBREG_REG (op1))))
388 && ((register_operand (op0, mode)
389 && !standard_sse_constant_p (op1, mode))
390 /* ix86_expand_vector_move_misalign() does not like constants. */
391 || (SSE_REG_MODE_P (mode)
392 && MEM_P (op0)
393 && MEM_ALIGN (op0) < align)))
394 {
395 if (SUBREG_P (op1))
396 {
397 machine_mode imode = GET_MODE (SUBREG_REG (op1));
398 rtx r = force_const_mem (imode, SUBREG_REG (op1));
399 if (r)
400 r = validize_mem (r);
401 else
402 r = force_reg (imode, SUBREG_REG (op1));
403 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
404 }
405 else
406 op1 = validize_mem (force_const_mem (mode, op1));
407 }
408
409 /* We need to check memory alignment for SSE modes since an attribute
410 can make operands unaligned. */
411 if (can_create_pseudo_p ()
412 && SSE_REG_MODE_P (mode)
413 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
414 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
415 {
416 rtx tmp[2];
417
418 /* ix86_expand_vector_move_misalign() does not like both
419 arguments in memory. */
420 if (!register_operand (op0, mode)
421 && !register_operand (op1, mode))
422 op1 = force_reg (mode, op1);
423
424 tmp[0] = op0; tmp[1] = op1;
425 ix86_expand_vector_move_misalign (mode, tmp);
426 return;
427 }
428
429 /* If neither operand is a register, force operand 1 into a register. */
430 if (can_create_pseudo_p ()
431 && !register_operand (op0, mode)
432 && !register_operand (op1, mode))
433 {
434 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
435 return;
436 }
437
438 emit_insn (gen_rtx_SET (op0, op1));
439 }
440
441 /* Split 32-byte AVX unaligned load and store if needed. */
442
443 static void
444 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
445 {
446 rtx m;
447 rtx (*extract) (rtx, rtx, rtx);
448 machine_mode mode;
449
450 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
451 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
452 {
453 emit_insn (gen_rtx_SET (op0, op1));
454 return;
455 }
456
457 rtx orig_op0 = NULL_RTX;
458 mode = GET_MODE (op0);
459 switch (GET_MODE_CLASS (mode))
460 {
461 case MODE_VECTOR_INT:
462 case MODE_INT:
463 if (mode != V32QImode)
464 {
465 if (!MEM_P (op0))
466 {
467 orig_op0 = op0;
468 op0 = gen_reg_rtx (V32QImode);
469 }
470 else
471 op0 = gen_lowpart (V32QImode, op0);
472 op1 = gen_lowpart (V32QImode, op1);
473 mode = V32QImode;
474 }
475 break;
476 case MODE_VECTOR_FLOAT:
477 break;
478 default:
479 gcc_unreachable ();
480 }
481
482 switch (mode)
483 {
484 default:
485 gcc_unreachable ();
486 case E_V32QImode:
487 extract = gen_avx_vextractf128v32qi;
488 mode = V16QImode;
489 break;
490 case E_V8SFmode:
491 extract = gen_avx_vextractf128v8sf;
492 mode = V4SFmode;
493 break;
494 case E_V4DFmode:
495 extract = gen_avx_vextractf128v4df;
496 mode = V2DFmode;
497 break;
498 }
499
500 if (MEM_P (op1))
501 {
502 rtx r = gen_reg_rtx (mode);
503 m = adjust_address (op1, mode, 0);
504 emit_move_insn (r, m);
505 m = adjust_address (op1, mode, 16);
506 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
507 emit_move_insn (op0, r);
508 }
509 else if (MEM_P (op0))
510 {
511 m = adjust_address (op0, mode, 0);
512 emit_insn (extract (m, op1, const0_rtx));
513 m = adjust_address (op0, mode, 16);
514 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
515 }
516 else
517 gcc_unreachable ();
518
519 if (orig_op0)
520 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
521 }
522
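/* Approximate shape of the emitted code for an unaligned 32-byte load
   (assembly is only an illustration of the RTL built above):
     vmovups      mem,        %xmm0             ; low 16 bytes
     vinsertf128  $1, mem+16, %ymm0, %ymm0      ; high 16 bytes
   and for a store the two 16-byte halves are written with vextractf128.
   Splitting avoids the penalty that some AVX implementations incur on full
   32-byte unaligned accesses.  */
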
523 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
524 straight to ix86_expand_vector_move. */
525 /* Code generation for scalar reg-reg moves of single and double precision data:
526 if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
527 movaps reg, reg
528 else
529 movss reg, reg
530 if (x86_sse_partial_reg_dependency == true)
531 movapd reg, reg
532 else
533 movsd reg, reg
534
535 Code generation for scalar loads of double precision data:
536 if (x86_sse_split_regs == true)
537 movlpd mem, reg (gas syntax)
538 else
539 movsd mem, reg
540
541 Code generation for unaligned packed loads of single precision data
542 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
543 if (x86_sse_unaligned_move_optimal)
544 movups mem, reg
545
546 if (x86_sse_partial_reg_dependency == true)
547 {
548 xorps reg, reg
549 movlps mem, reg
550 movhps mem+8, reg
551 }
552 else
553 {
554 movlps mem, reg
555 movhps mem+8, reg
556 }
557
558 Code generation for unaligned packed loads of double precision data
559 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
560 if (x86_sse_unaligned_move_optimal)
561 movupd mem, reg
562
563 if (x86_sse_split_regs == true)
564 {
565 movlpd mem, reg
566 movhpd mem+8, reg
567 }
568 else
569 {
570 movsd mem, reg
571 movhpd mem+8, reg
572 }
573 */
574
575 void
576 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
577 {
578 rtx op0, op1, m;
579
580 op0 = operands[0];
581 op1 = operands[1];
582
583 /* Use unaligned load/store for AVX512 or when optimizing for size. */
584 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
585 {
586 emit_insn (gen_rtx_SET (op0, op1));
587 return;
588 }
589
590 if (TARGET_AVX)
591 {
592 if (GET_MODE_SIZE (mode) == 32)
593 ix86_avx256_split_vector_move_misalign (op0, op1);
594 else
595 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
596 emit_insn (gen_rtx_SET (op0, op1));
597 return;
598 }
599
600 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
601 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
602 {
603 emit_insn (gen_rtx_SET (op0, op1));
604 return;
605 }
606
607 /* ??? If we have typed data, then it would appear that using
608 movdqu is the only way to get unaligned data loaded with
609 integer type. */
610 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
611 {
612 emit_insn (gen_rtx_SET (op0, op1));
613 return;
614 }
615
616 if (MEM_P (op1))
617 {
618 if (TARGET_SSE2 && mode == V2DFmode)
619 {
620 rtx zero;
621
622 /* When SSE registers are split into halves, we can avoid
623 writing to the top half twice. */
624 if (TARGET_SSE_SPLIT_REGS)
625 {
626 emit_clobber (op0);
627 zero = op0;
628 }
629 else
630 {
631 /* ??? Not sure about the best option for the Intel chips.
632 The following would seem to satisfy; the register is
633 entirely cleared, breaking the dependency chain. We
634 then store to the upper half, with a dependency depth
635 of one. A rumor has it that Intel recommends two movsd
636 followed by an unpacklpd, but this is unconfirmed. And
637 given that the dependency depth of the unpacklpd would
638 still be one, I'm not sure why this would be better. */
639 zero = CONST0_RTX (V2DFmode);
640 }
641
642 m = adjust_address (op1, DFmode, 0);
643 emit_insn (gen_sse2_loadlpd (op0, zero, m));
644 m = adjust_address (op1, DFmode, 8);
645 emit_insn (gen_sse2_loadhpd (op0, op0, m));
646 }
647 else
648 {
649 rtx t;
650
651 if (mode != V4SFmode)
652 t = gen_reg_rtx (V4SFmode);
653 else
654 t = op0;
655
656 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
657 emit_move_insn (t, CONST0_RTX (V4SFmode));
658 else
659 emit_clobber (t);
660
661 m = adjust_address (op1, V2SFmode, 0);
662 emit_insn (gen_sse_loadlps (t, t, m));
663 m = adjust_address (op1, V2SFmode, 8);
664 emit_insn (gen_sse_loadhps (t, t, m));
665 if (mode != V4SFmode)
666 emit_move_insn (op0, gen_lowpart (mode, t));
667 }
668 }
669 else if (MEM_P (op0))
670 {
671 if (TARGET_SSE2 && mode == V2DFmode)
672 {
673 m = adjust_address (op0, DFmode, 0);
674 emit_insn (gen_sse2_storelpd (m, op1));
675 m = adjust_address (op0, DFmode, 8);
676 emit_insn (gen_sse2_storehpd (m, op1));
677 }
678 else
679 {
680 if (mode != V4SFmode)
681 op1 = gen_lowpart (V4SFmode, op1);
682
683 m = adjust_address (op0, V2SFmode, 0);
684 emit_insn (gen_sse_storelps (m, op1));
685 m = adjust_address (op0, V2SFmode, 8);
686 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
687 }
688 }
689 else
690 gcc_unreachable ();
691 }
692
693 /* Move bits 64:95 to bits 32:63. */
694
695 void
696 ix86_move_vector_high_sse_to_mmx (rtx op)
697 {
698 rtx mask = gen_rtx_PARALLEL (VOIDmode,
699 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
700 GEN_INT (0), GEN_INT (0)));
701 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
702 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
703 rtx insn = gen_rtx_SET (dest, op);
704 emit_insn (insn);
705 }
706
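/* Illustrative note (an assumption about the insn this RTL matches): the
   VEC_SELECT with selector {0, 2, 0, 0} built above corresponds roughly to
     pshufd  $0x08, %xmm0, %xmm0
   i.e. element 2 (bits 64:95) is copied into element 1 (bits 32:63), which
   is all that matters for the 64-bit MMX-style value in the low half.  */
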
707 /* Split MMX pack with signed/unsigned saturation using SSE/SSE2. */
708
709 void
710 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
711 {
712 rtx op0 = operands[0];
713 rtx op1 = operands[1];
714 rtx op2 = operands[2];
715
716 machine_mode dmode = GET_MODE (op0);
717 machine_mode smode = GET_MODE (op1);
718 machine_mode inner_dmode = GET_MODE_INNER (dmode);
719 machine_mode inner_smode = GET_MODE_INNER (smode);
720
721 /* Get the corresponding SSE mode for destination. */
722 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
723 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
724 nunits).require ();
725 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
726 nunits / 2).require ();
727
728 /* Get the corresponding SSE mode for source. */
729 nunits = 16 / GET_MODE_SIZE (inner_smode);
730 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
731 nunits).require ();
732
733 /* Generate SSE pack with signed/unsigned saturation. */
734 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
735 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
736 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
737
738 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
739 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
740 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
741 op1, op2));
742 emit_insn (insn);
743
744 ix86_move_vector_high_sse_to_mmx (op0);
745 }
746
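/* Assumed example for exposition: packing two V4HI operands into V8QI.  The
   code above performs the saturating pack in the corresponding 128-bit mode,
     dest(V16QI) = vec_concat (ss_truncate (op1:V8HI), ss_truncate (op2:V8HI))
   so the meaningful bytes land in lanes 0-3 and 8-11, and the final call to
   ix86_move_vector_high_sse_to_mmx moves lanes 8-11 down so that all eight
   valid bytes sit in the low 64 bits of the destination.  */
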
747 /* Split MMX punpcklXX/punpckhXX using SSE punpcklXX. */
748
749 void
750 ix86_split_mmx_punpck (rtx operands[], bool high_p)
751 {
752 rtx op0 = operands[0];
753 rtx op1 = operands[1];
754 rtx op2 = operands[2];
755 machine_mode mode = GET_MODE (op0);
756 rtx mask;
757 /* The corresponding SSE mode. */
758 machine_mode sse_mode, double_sse_mode;
759
760 switch (mode)
761 {
762 case E_V8QImode:
763 sse_mode = V16QImode;
764 double_sse_mode = V32QImode;
765 mask = gen_rtx_PARALLEL (VOIDmode,
766 gen_rtvec (16,
767 GEN_INT (0), GEN_INT (16),
768 GEN_INT (1), GEN_INT (17),
769 GEN_INT (2), GEN_INT (18),
770 GEN_INT (3), GEN_INT (19),
771 GEN_INT (4), GEN_INT (20),
772 GEN_INT (5), GEN_INT (21),
773 GEN_INT (6), GEN_INT (22),
774 GEN_INT (7), GEN_INT (23)));
775 break;
776
777 case E_V4HImode:
778 sse_mode = V8HImode;
779 double_sse_mode = V16HImode;
780 mask = gen_rtx_PARALLEL (VOIDmode,
781 gen_rtvec (8,
782 GEN_INT (0), GEN_INT (8),
783 GEN_INT (1), GEN_INT (9),
784 GEN_INT (2), GEN_INT (10),
785 GEN_INT (3), GEN_INT (11)));
786 break;
787
788 case E_V2SImode:
789 sse_mode = V4SImode;
790 double_sse_mode = V8SImode;
791 mask = gen_rtx_PARALLEL (VOIDmode,
792 gen_rtvec (4,
793 GEN_INT (0), GEN_INT (4),
794 GEN_INT (1), GEN_INT (5)));
795 break;
796
797 default:
798 gcc_unreachable ();
799 }
800
801 /* Generate SSE punpcklXX. */
802 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
803 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
804 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
805
806 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
807 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
808 rtx insn = gen_rtx_SET (dest, op2);
809 emit_insn (insn);
810
811 if (high_p)
812 {
813 /* Move bits 64:127 to bits 0:63. */
814 mask = gen_rtx_PARALLEL (VOIDmode,
815 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
816 GEN_INT (0), GEN_INT (0)));
817 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
818 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
819 insn = gen_rtx_SET (dest, op1);
820 emit_insn (insn);
821 }
822 }
823
824 /* Helper function of ix86_fixup_binary_operands to canonicalize
825 operand order. Returns true if the operands should be swapped. */
826
827 static bool
828 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
829 rtx operands[])
830 {
831 rtx dst = operands[0];
832 rtx src1 = operands[1];
833 rtx src2 = operands[2];
834
835 /* If the operation is not commutative, we can't do anything. */
836 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
837 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
838 return false;
839
840 /* Highest priority is that src1 should match dst. */
841 if (rtx_equal_p (dst, src1))
842 return false;
843 if (rtx_equal_p (dst, src2))
844 return true;
845
846 /* Next highest priority is that immediate constants come second. */
847 if (immediate_operand (src2, mode))
848 return false;
849 if (immediate_operand (src1, mode))
850 return true;
851
852 /* Lowest priority is that memory references should come second. */
853 if (MEM_P (src2))
854 return false;
855 if (MEM_P (src1))
856 return true;
857
858 return false;
859 }
860
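/* Hypothetical example of the canonicalization above: for a commutative
   operation written as
     dst = mem + dst
   src2 already matches dst, so the operands are swapped to dst = dst + mem,
   letting the two-address machine instruction reuse dst as its first input
   instead of requiring an extra copy.  */
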
861
862 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
863 destination to use for the operation. If different from the true
864 destination in operands[0], a copy operation will be required. */
865
866 rtx
867 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
868 rtx operands[])
869 {
870 rtx dst = operands[0];
871 rtx src1 = operands[1];
872 rtx src2 = operands[2];
873
874 /* Canonicalize operand order. */
875 if (ix86_swap_binary_operands_p (code, mode, operands))
876 {
877 /* It is invalid to swap operands of different modes. */
878 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
879
880 std::swap (src1, src2);
881 }
882
883 /* The two source operands cannot both be in memory. */
884 if (MEM_P (src1) && MEM_P (src2))
885 {
886 /* Optimization: Only read from memory once. */
887 if (rtx_equal_p (src1, src2))
888 {
889 src2 = force_reg (mode, src2);
890 src1 = src2;
891 }
892 else if (rtx_equal_p (dst, src1))
893 src2 = force_reg (mode, src2);
894 else
895 src1 = force_reg (mode, src1);
896 }
897
898 /* If the destination is memory, and we do not have matching source
899 operands, do things in registers. */
900 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
901 dst = gen_reg_rtx (mode);
902
903 /* Source 1 cannot be a constant. */
904 if (CONSTANT_P (src1))
905 src1 = force_reg (mode, src1);
906
907 /* Source 1 cannot be a non-matching memory. */
908 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
909 src1 = force_reg (mode, src1);
910
911 /* Improve address combine. */
912 if (code == PLUS
913 && GET_MODE_CLASS (mode) == MODE_INT
914 && MEM_P (src2))
915 src2 = force_reg (mode, src2);
916
917 operands[1] = src1;
918 operands[2] = src2;
919 return dst;
920 }
921
922 /* Similarly, but assume that the destination has already been
923 set up properly. */
924
925 void
926 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
927 machine_mode mode, rtx operands[])
928 {
929 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
930 gcc_assert (dst == operands[0]);
931 }
932
933 /* Attempt to expand a binary operator. Make the expansion closer to the
934 actual machine than just general_operand, which would allow 3 separate
935 memory references (one output, two input) in a single insn. */
936
937 void
938 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
939 rtx operands[])
940 {
941 rtx src1, src2, dst, op, clob;
942
943 dst = ix86_fixup_binary_operands (code, mode, operands);
944 src1 = operands[1];
945 src2 = operands[2];
946
947 /* Emit the instruction. */
948
949 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
950
951 if (reload_completed
952 && code == PLUS
953 && !rtx_equal_p (dst, src1))
954 {
955 /* This is going to be an LEA; avoid splitting it later. */
956 emit_insn (op);
957 }
958 else
959 {
960 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
961 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
962 }
963
964 /* Fix up the destination if needed. */
965 if (dst != operands[0])
966 emit_move_insn (operands[0], dst);
967 }
968
969 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
970 the given OPERANDS. */
971
972 void
973 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
974 rtx operands[])
975 {
976 rtx op1 = NULL_RTX, op2 = NULL_RTX;
977 if (SUBREG_P (operands[1]))
978 {
979 op1 = operands[1];
980 op2 = operands[2];
981 }
982 else if (SUBREG_P (operands[2]))
983 {
984 op1 = operands[2];
985 op2 = operands[1];
986 }
987 /* Optimize (__m128i) d | (__m128i) e and similar code
988 when d and e are float vectors into float vector logical
989 insn. In C/C++ without using intrinsics there is no other way
990 to express vector logical operation on float vectors than
991 to cast them temporarily to integer vectors. */
992 if (op1
993 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
994 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
995 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
996 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
997 && SUBREG_BYTE (op1) == 0
998 && (GET_CODE (op2) == CONST_VECTOR
999 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1000 && SUBREG_BYTE (op2) == 0))
1001 && can_create_pseudo_p ())
1002 {
1003 rtx dst;
1004 switch (GET_MODE (SUBREG_REG (op1)))
1005 {
1006 case E_V4SFmode:
1007 case E_V8SFmode:
1008 case E_V16SFmode:
1009 case E_V2DFmode:
1010 case E_V4DFmode:
1011 case E_V8DFmode:
1012 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1013 if (GET_CODE (op2) == CONST_VECTOR)
1014 {
1015 op2 = gen_lowpart (GET_MODE (dst), op2);
1016 op2 = force_reg (GET_MODE (dst), op2);
1017 }
1018 else
1019 {
1020 op1 = operands[1];
1021 op2 = SUBREG_REG (operands[2]);
1022 if (!vector_operand (op2, GET_MODE (dst)))
1023 op2 = force_reg (GET_MODE (dst), op2);
1024 }
1025 op1 = SUBREG_REG (op1);
1026 if (!vector_operand (op1, GET_MODE (dst)))
1027 op1 = force_reg (GET_MODE (dst), op1);
1028 emit_insn (gen_rtx_SET (dst,
1029 gen_rtx_fmt_ee (code, GET_MODE (dst),
1030 op1, op2)));
1031 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1032 return;
1033 default:
1034 break;
1035 }
1036 }
1037 if (!vector_operand (operands[1], mode))
1038 operands[1] = force_reg (mode, operands[1]);
1039 if (!vector_operand (operands[2], mode))
1040 operands[2] = force_reg (mode, operands[2]);
1041 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1042 emit_insn (gen_rtx_SET (operands[0],
1043 gen_rtx_fmt_ee (code, mode, operands[1],
1044 operands[2])));
1045 }
1046
1047 /* Return TRUE or FALSE depending on whether the binary operator meets the
1048 appropriate constraints. */
1049
1050 bool
1051 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1052 rtx operands[3])
1053 {
1054 rtx dst = operands[0];
1055 rtx src1 = operands[1];
1056 rtx src2 = operands[2];
1057
1058 /* The two source operands cannot both be in memory. */
1059 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1060 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1061 return false;
1062
1063 /* Canonicalize operand order for commutative operators. */
1064 if (ix86_swap_binary_operands_p (code, mode, operands))
1065 std::swap (src1, src2);
1066
1067 /* If the destination is memory, we must have a matching source operand. */
1068 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1069 return false;
1070
1071 /* Source 1 cannot be a constant. */
1072 if (CONSTANT_P (src1))
1073 return false;
1074
1075 /* Source 1 cannot be a non-matching memory. */
1076 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1077 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1078 return (code == AND
1079 && (mode == HImode
1080 || mode == SImode
1081 || (TARGET_64BIT && mode == DImode))
1082 && satisfies_constraint_L (src2));
1083
1084 return true;
1085 }
1086
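/* Note on the final special case above (the assembly is an assumption about
   the pattern that eventually matches): an AND of a non-matching memory
   operand with 0xff or 0xffff (constraint "L") is still accepted because it
   can be implemented as a zero-extending load, e.g.
     movzbl  mem, %eax
   rather than as a genuine read-modify-write AND.  */
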
1087 /* Attempt to expand a unary operator. Make the expansion closer to the
1088 actual machine than just general_operand, which would allow 2 separate
1089 memory references (one output, one input) in a single insn. */
1090
1091 void
1092 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1093 rtx operands[])
1094 {
1095 bool matching_memory = false;
1096 rtx src, dst, op, clob;
1097
1098 dst = operands[0];
1099 src = operands[1];
1100
1101 /* If the destination is memory, and we do not have matching source
1102 operands, do things in registers. */
1103 if (MEM_P (dst))
1104 {
1105 if (rtx_equal_p (dst, src))
1106 matching_memory = true;
1107 else
1108 dst = gen_reg_rtx (mode);
1109 }
1110
1111 /* When source operand is memory, destination must match. */
1112 if (MEM_P (src) && !matching_memory)
1113 src = force_reg (mode, src);
1114
1115 /* Emit the instruction. */
1116
1117 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1118
1119 if (code == NOT)
1120 emit_insn (op);
1121 else
1122 {
1123 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1124 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1125 }
1126
1127 /* Fix up the destination if needed. */
1128 if (dst != operands[0])
1129 emit_move_insn (operands[0], dst);
1130 }
1131
1132 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
1133
1134 static void
1135 predict_jump (int prob)
1136 {
1137 rtx_insn *insn = get_last_insn ();
1138 gcc_assert (JUMP_P (insn));
1139 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1140 }
1141
1142 /* Split 32-bit/64-bit divmod with 8-bit unsigned divmod if the dividend
1143 and divisor are within the range [0-255]. */
1144
1145 void
1146 ix86_split_idivmod (machine_mode mode, rtx operands[],
1147 bool unsigned_p)
1148 {
1149 rtx_code_label *end_label, *qimode_label;
1150 rtx div, mod;
1151 rtx_insn *insn;
1152 rtx scratch, tmp0, tmp1, tmp2;
1153 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1154
1155 switch (mode)
1156 {
1157 case E_SImode:
1158 if (GET_MODE (operands[0]) == SImode)
1159 {
1160 if (GET_MODE (operands[1]) == SImode)
1161 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1162 else
1163 gen_divmod4_1
1164 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1165 }
1166 else
1167 gen_divmod4_1
1168 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1169 break;
1170
1171 case E_DImode:
1172 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1173 break;
1174
1175 default:
1176 gcc_unreachable ();
1177 }
1178
1179 end_label = gen_label_rtx ();
1180 qimode_label = gen_label_rtx ();
1181
1182 scratch = gen_reg_rtx (mode);
1183
1184 /* Use 8-bit unsigned divmod if the dividend and divisor are within
1185 the range [0-255]. */
1186 emit_move_insn (scratch, operands[2]);
1187 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1188 scratch, 1, OPTAB_DIRECT);
1189 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1190 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1191 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1192 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1193 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1194 pc_rtx);
1195 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1196 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1197 JUMP_LABEL (insn) = qimode_label;
1198
1199 /* Generate the original signed/unsigned divmod. */
1200 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1201 operands[2], operands[3]));
1202
1203 /* Branch to the end. */
1204 emit_jump_insn (gen_jump (end_label));
1205 emit_barrier ();
1206
1207 /* Generate the 8-bit unsigned divide. */
1208 emit_label (qimode_label);
1209 /* Don't use operands[0] for the result of the 8-bit divide since not
1210 all registers support QImode ZERO_EXTRACT. */
1211 tmp0 = lowpart_subreg (HImode, scratch, mode);
1212 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1213 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1214 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1215
1216 if (unsigned_p)
1217 {
1218 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1219 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1220 }
1221 else
1222 {
1223 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1224 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1225 }
1226 if (mode == SImode)
1227 {
1228 if (GET_MODE (operands[0]) != SImode)
1229 div = gen_rtx_ZERO_EXTEND (DImode, div);
1230 if (GET_MODE (operands[1]) != SImode)
1231 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1232 }
1233
1234 /* Extract remainder from AH. */
1235 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1236 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1237 GEN_INT (8), GEN_INT (8));
1238 insn = emit_move_insn (operands[1], tmp1);
1239 set_unique_reg_note (insn, REG_EQUAL, mod);
1240
1241 /* Zero extend quotient from AL. */
1242 tmp1 = gen_lowpart (QImode, tmp0);
1243 insn = emit_insn (gen_extend_insn
1244 (operands[0], tmp1,
1245 GET_MODE (operands[0]), QImode, 1));
1246 set_unique_reg_note (insn, REG_EQUAL, div);
1247
1248 emit_label (end_label);
1249 }
1250
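/* Rough shape of the sequence emitted above for a 32-bit unsigned divmod
   (assembly shown only for illustration):
     movl   dividend, %scratch
     orl    divisor,  %scratch
     testl  $-0x100,  %scratch     ; ZF set iff both operands fit in 8 bits
     je     .Lqimode
     ...full 32-bit div...
     jmp    .Lend
   .Lqimode:
     ...divb, quotient in AL, remainder in AH...
   .Lend:
   The 8-bit divide is much cheaper on most implementations when both values
   happen to be small at run time.  */
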
1251 /* Emit the x86 binary operation CODE in mode MODE, where the first
1252 operand matches the destination. The RTX includes a clobber of FLAGS_REG. */
1253
1254 void
1255 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1256 rtx dst, rtx src)
1257 {
1258 rtx op, clob;
1259
1260 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1261 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1262
1263 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1264 }
1265
1266 /* Return true if the definition of regno1 is nearest to the insn. */
1267
1268 static bool
1269 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1270 {
1271 rtx_insn *prev = insn;
1272 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1273
1274 if (insn == start)
1275 return false;
1276 while (prev && prev != start)
1277 {
1278 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1279 {
1280 prev = PREV_INSN (prev);
1281 continue;
1282 }
1283 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1284 return true;
1285 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1286 return false;
1287 prev = PREV_INSN (prev);
1288 }
1289
1290 /* None of the regs is defined in the bb. */
1291 return false;
1292 }
1293
1294 /* Split lea instructions into a sequence of instructions
1295 which are executed on the ALU to avoid AGU stalls.
1296 It is assumed that the flags register may be clobbered
1297 at the lea position. */
1298
1299 void
1300 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1301 {
1302 unsigned int regno0, regno1, regno2;
1303 struct ix86_address parts;
1304 rtx target, tmp;
1305 int ok, adds;
1306
1307 ok = ix86_decompose_address (operands[1], &parts);
1308 gcc_assert (ok);
1309
1310 target = gen_lowpart (mode, operands[0]);
1311
1312 regno0 = true_regnum (target);
1313 regno1 = INVALID_REGNUM;
1314 regno2 = INVALID_REGNUM;
1315
1316 if (parts.base)
1317 {
1318 parts.base = gen_lowpart (mode, parts.base);
1319 regno1 = true_regnum (parts.base);
1320 }
1321
1322 if (parts.index)
1323 {
1324 parts.index = gen_lowpart (mode, parts.index);
1325 regno2 = true_regnum (parts.index);
1326 }
1327
1328 if (parts.disp)
1329 parts.disp = gen_lowpart (mode, parts.disp);
1330
1331 if (parts.scale > 1)
1332 {
1333 /* Case r1 = r1 + ... */
1334 if (regno1 == regno0)
1335 {
1336 /* If we have a case r1 = r1 + C * r2 where r2 is
1337 the same register as r1, we would need a
1338 multiplication, which is very expensive. Assume
1339 the cost model is wrong if we see such a case here. */
1340 gcc_assert (regno2 != regno0);
1341
1342 for (adds = parts.scale; adds > 0; adds--)
1343 ix86_emit_binop (PLUS, mode, target, parts.index);
1344 }
1345 else
1346 {
1347 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1348 if (regno0 != regno2)
1349 emit_insn (gen_rtx_SET (target, parts.index));
1350
1351 /* Use shift for scaling. */
1352 ix86_emit_binop (ASHIFT, mode, target,
1353 GEN_INT (exact_log2 (parts.scale)));
1354
1355 if (parts.base)
1356 ix86_emit_binop (PLUS, mode, target, parts.base);
1357
1358 if (parts.disp && parts.disp != const0_rtx)
1359 ix86_emit_binop (PLUS, mode, target, parts.disp);
1360 }
1361 }
1362 else if (!parts.base && !parts.index)
1363 {
1364 gcc_assert(parts.disp);
1365 emit_insn (gen_rtx_SET (target, parts.disp));
1366 }
1367 else
1368 {
1369 if (!parts.base)
1370 {
1371 if (regno0 != regno2)
1372 emit_insn (gen_rtx_SET (target, parts.index));
1373 }
1374 else if (!parts.index)
1375 {
1376 if (regno0 != regno1)
1377 emit_insn (gen_rtx_SET (target, parts.base));
1378 }
1379 else
1380 {
1381 if (regno0 == regno1)
1382 tmp = parts.index;
1383 else if (regno0 == regno2)
1384 tmp = parts.base;
1385 else
1386 {
1387 rtx tmp1;
1388
1389 /* Find the better operand for the SET instruction, depending
1390 on which definition is farther from the insn. */
1391 if (find_nearest_reg_def (insn, regno1, regno2))
1392 tmp = parts.index, tmp1 = parts.base;
1393 else
1394 tmp = parts.base, tmp1 = parts.index;
1395
1396 emit_insn (gen_rtx_SET (target, tmp));
1397
1398 if (parts.disp && parts.disp != const0_rtx)
1399 ix86_emit_binop (PLUS, mode, target, parts.disp);
1400
1401 ix86_emit_binop (PLUS, mode, target, tmp1);
1402 return;
1403 }
1404
1405 ix86_emit_binop (PLUS, mode, target, tmp);
1406 }
1407
1408 if (parts.disp && parts.disp != const0_rtx)
1409 ix86_emit_binop (PLUS, mode, target, parts.disp);
1410 }
1411 }
1412
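/* Hypothetical example of the splitting above (assuming, as stated before
   the function, that the flags may be clobbered): an AGU-bound
     lea  4(%rax,%rbx,2), %rcx
   could be rewritten into plain ALU instructions roughly as
     mov  %rbx, %rcx
     shl  $1,   %rcx         ; scale
     add  %rax, %rcx         ; base
     add  $4,   %rcx         ; displacement
   following the scale/base/displacement cases handled in the body.  */
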
1413 /* Post-reload splitter for converting an SF or DFmode value in an
1414 SSE register into an unsigned SImode value. */
1415
1416 void
1417 ix86_split_convert_uns_si_sse (rtx operands[])
1418 {
1419 machine_mode vecmode;
1420 rtx value, large, zero_or_two31, input, two31, x;
1421
1422 large = operands[1];
1423 zero_or_two31 = operands[2];
1424 input = operands[3];
1425 two31 = operands[4];
1426 vecmode = GET_MODE (large);
1427 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1428
1429 /* Load up the value into the low element. We must ensure that the other
1430 elements are valid floats -- zero is the easiest such value. */
1431 if (MEM_P (input))
1432 {
1433 if (vecmode == V4SFmode)
1434 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1435 else
1436 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1437 }
1438 else
1439 {
1440 input = gen_rtx_REG (vecmode, REGNO (input));
1441 emit_move_insn (value, CONST0_RTX (vecmode));
1442 if (vecmode == V4SFmode)
1443 emit_insn (gen_sse_movss (value, value, input));
1444 else
1445 emit_insn (gen_sse2_movsd (value, value, input));
1446 }
1447
1448 emit_move_insn (large, two31);
1449 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1450
1451 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1452 emit_insn (gen_rtx_SET (large, x));
1453
1454 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1455 emit_insn (gen_rtx_SET (zero_or_two31, x));
1456
1457 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1458 emit_insn (gen_rtx_SET (value, x));
1459
1460 large = gen_rtx_REG (V4SImode, REGNO (large));
1461 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1462
1463 x = gen_rtx_REG (V4SImode, REGNO (value));
1464 if (vecmode == V4SFmode)
1465 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1466 else
1467 emit_insn (gen_sse2_cvttpd2dq (x, value));
1468 value = x;
1469
1470 emit_insn (gen_xorv4si3 (value, value, large));
1471 }
1472
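/* Worked example of the trick above (the numbers are illustrative): to
   convert an SFmode value v = 3.5e9 (>= 0x1p31) to unsigned SImode,
     large         = (0x1p31 <= v) ? all-ones : 0
     zero_or_two31 = large ? 0x1p31 : 0.0
     v             = v - zero_or_two31        ; 3.5e9 - 2^31 now fits in int
     result        = (int) v  XOR  (large << 31)       ; re-adds 2^31
   For inputs below 2^31 the mask is zero and the plain signed conversion
   goes through unchanged.  */
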
1473 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1474 machine_mode mode, rtx target,
1475 rtx var, int one_var);
1476
1477 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1478 Expects the 64-bit DImode to be supplied in a pair of integral
1479 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1480 -mfpmath=sse, !optimize_size only. */
1481
1482 void
1483 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1484 {
1485 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1486 rtx int_xmm, fp_xmm;
1487 rtx biases, exponents;
1488 rtx x;
1489
1490 int_xmm = gen_reg_rtx (V4SImode);
1491 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1492 emit_insn (gen_movdi_to_sse (int_xmm, input));
1493 else if (TARGET_SSE_SPLIT_REGS)
1494 {
1495 emit_clobber (int_xmm);
1496 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1497 }
1498 else
1499 {
1500 x = gen_reg_rtx (V2DImode);
1501 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1502 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1503 }
1504
1505 x = gen_rtx_CONST_VECTOR (V4SImode,
1506 gen_rtvec (4, GEN_INT (0x43300000UL),
1507 GEN_INT (0x45300000UL),
1508 const0_rtx, const0_rtx));
1509 exponents = validize_mem (force_const_mem (V4SImode, x));
1510
1511 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1512 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1513
1514 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1515 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1516 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1517 (0x1.0p84 + double(fp_value_hi_xmm)).
1518 Note these exponents differ by 32. */
1519
1520 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1521
1522 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1523 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1524 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1525 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1526 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1527 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1528 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1529 biases = validize_mem (force_const_mem (V2DFmode, biases));
1530 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1531
1532 /* Add the upper and lower DFmode values together. */
1533 if (TARGET_SSE3)
1534 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1535 else
1536 {
1537 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1538 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1539 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1540 }
1541
1542 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1543 }
1544
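/* Worked arithmetic for the bias trick above (restating the comments in the
   body, values only for illustration): with lo and hi the two 32-bit halves
   of the input,
     (0x43300000 ## lo) as a double  ==  0x1.0p52 + (double) lo
     (0x45300000 ## hi) as a double  ==  0x1.0p84 + (double) hi * 0x1.0p32
   so subtracting the 0x1.0p52 / 0x1.0p84 biases and adding the two lanes
   yields (double) lo + (double) hi * 2^32, i.e. the unsigned 64-bit value
   up to the usual DFmode rounding.  */
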
1545 /* Not used, but eases macroization of patterns. */
1546 void
1547 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1548 {
1549 gcc_unreachable ();
1550 }
1551
1552 /* Convert an unsigned SImode value into a DFmode. Only currently used
1553 for SSE, but applicable anywhere. */
1554
1555 void
1556 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1557 {
1558 REAL_VALUE_TYPE TWO31r;
1559 rtx x, fp;
1560
1561 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1562 NULL, 1, OPTAB_DIRECT);
1563
1564 fp = gen_reg_rtx (DFmode);
1565 emit_insn (gen_floatsidf2 (fp, x));
1566
1567 real_ldexp (&TWO31r, &dconst1, 31);
1568 x = const_double_from_real_value (TWO31r, DFmode);
1569
1570 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1571 if (x != target)
1572 emit_move_insn (target, x);
1573 }
1574
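/* Worked example for the sequence above (numbers are illustrative): for the
   unsigned input 0xfffffffe,
     x  = 0xfffffffe + (-2^31)  =  0x7ffffffe  (signed 2147483646)
     fp = (double) x            =  2147483646.0
     result = fp + 2147483648.0 =  4294967294.0
   Biasing into the signed range, converting, and adding 2^31 back as a
   DFmode constant is exact, since DFmode can represent every 32-bit
   integer.  */
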
1575 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1576 32-bit mode; otherwise we have a direct convert instruction. */
1577
1578 void
1579 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1580 {
1581 REAL_VALUE_TYPE TWO32r;
1582 rtx fp_lo, fp_hi, x;
1583
1584 fp_lo = gen_reg_rtx (DFmode);
1585 fp_hi = gen_reg_rtx (DFmode);
1586
1587 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1588
1589 real_ldexp (&TWO32r, &dconst1, 32);
1590 x = const_double_from_real_value (TWO32r, DFmode);
1591 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1592
1593 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1594
1595 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1596 0, OPTAB_DIRECT);
1597 if (x != target)
1598 emit_move_insn (target, x);
1599 }
1600
1601 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1602 For x86_32, -mfpmath=sse, !optimize_size only. */
1603 void
1604 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1605 {
1606 REAL_VALUE_TYPE ONE16r;
1607 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1608
1609 real_ldexp (&ONE16r, &dconst1, 16);
1610 x = const_double_from_real_value (ONE16r, SFmode);
1611 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1612 NULL, 0, OPTAB_DIRECT);
1613 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1614 NULL, 0, OPTAB_DIRECT);
1615 fp_hi = gen_reg_rtx (SFmode);
1616 fp_lo = gen_reg_rtx (SFmode);
1617 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1618 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1619 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1620 0, OPTAB_DIRECT);
1621 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1622 0, OPTAB_DIRECT);
1623 if (!rtx_equal_p (target, fp_hi))
1624 emit_move_insn (target, fp_hi);
1625 }
1626
1627 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1628 a vector of unsigned ints VAL to a vector of floats TARGET. */
1629
1630 void
1631 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1632 {
1633 rtx tmp[8];
1634 REAL_VALUE_TYPE TWO16r;
1635 machine_mode intmode = GET_MODE (val);
1636 machine_mode fltmode = GET_MODE (target);
1637 rtx (*cvt) (rtx, rtx);
1638
1639 if (intmode == V4SImode)
1640 cvt = gen_floatv4siv4sf2;
1641 else
1642 cvt = gen_floatv8siv8sf2;
1643 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1644 tmp[0] = force_reg (intmode, tmp[0]);
1645 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1646 OPTAB_DIRECT);
1647 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1648 NULL_RTX, 1, OPTAB_DIRECT);
1649 tmp[3] = gen_reg_rtx (fltmode);
1650 emit_insn (cvt (tmp[3], tmp[1]));
1651 tmp[4] = gen_reg_rtx (fltmode);
1652 emit_insn (cvt (tmp[4], tmp[2]));
1653 real_ldexp (&TWO16r, &dconst1, 16);
1654 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1655 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1656 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1657 OPTAB_DIRECT);
1658 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1659 OPTAB_DIRECT);
1660 if (tmp[7] != target)
1661 emit_move_insn (target, tmp[7]);
1662 }
1663
1664 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1665 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1666 This is done by using just the signed conversion if < 0x1p31, and otherwise
1667 by subtracting 0x1p31 first and XORing in 0x80000000 from *XORP afterwards. */
1668
1669 rtx
1670 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1671 {
1672 REAL_VALUE_TYPE TWO31r;
1673 rtx two31r, tmp[4];
1674 machine_mode mode = GET_MODE (val);
1675 machine_mode scalarmode = GET_MODE_INNER (mode);
1676 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1677 rtx (*cmp) (rtx, rtx, rtx, rtx);
1678 int i;
1679
1680 for (i = 0; i < 3; i++)
1681 tmp[i] = gen_reg_rtx (mode);
1682 real_ldexp (&TWO31r, &dconst1, 31);
1683 two31r = const_double_from_real_value (TWO31r, scalarmode);
1684 two31r = ix86_build_const_vector (mode, 1, two31r);
1685 two31r = force_reg (mode, two31r);
1686 switch (mode)
1687 {
1688 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1689 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1690 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1691 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1692 default: gcc_unreachable ();
1693 }
1694 tmp[3] = gen_rtx_LE (mode, two31r, val);
1695 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1696 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1697 0, OPTAB_DIRECT);
1698 if (intmode == V4SImode || TARGET_AVX2)
1699 *xorp = expand_simple_binop (intmode, ASHIFT,
1700 gen_lowpart (intmode, tmp[0]),
1701 GEN_INT (31), NULL_RTX, 0,
1702 OPTAB_DIRECT);
1703 else
1704 {
1705 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1706 two31 = ix86_build_const_vector (intmode, 1, two31);
1707 *xorp = expand_simple_binop (intmode, AND,
1708 gen_lowpart (intmode, tmp[0]),
1709 two31, NULL_RTX, 0,
1710 OPTAB_DIRECT);
1711 }
1712 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1713 0, OPTAB_DIRECT);
1714 }
1715
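/* Summary of the adjustment above, per lane (an illustration of the comment
   before the function):
     if (val < 0x1p31)   adjusted = val,           xor bits = 0
     else                adjusted = val - 0x1p31,  xor bits = 0x80000000
   so that applying the signed fix_trunc pattern to the returned value and
   then XORing in *XORP produces the unsigned conversion result.  */
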
1716 /* Generate code for floating point ABS or NEG. */
1717
1718 void
1719 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1720 rtx operands[])
1721 {
1722 rtx set, dst, src;
1723 bool use_sse = false;
1724 bool vector_mode = VECTOR_MODE_P (mode);
1725 machine_mode vmode = mode;
1726 rtvec par;
1727
1728 if (vector_mode || mode == TFmode)
1729 use_sse = true;
1730 else if (TARGET_SSE_MATH)
1731 {
1732 use_sse = SSE_FLOAT_MODE_P (mode);
1733 if (mode == SFmode)
1734 vmode = V4SFmode;
1735 else if (mode == DFmode)
1736 vmode = V2DFmode;
1737 }
1738
1739 dst = operands[0];
1740 src = operands[1];
1741
1742 set = gen_rtx_fmt_e (code, mode, src);
1743 set = gen_rtx_SET (dst, set);
1744
1745 if (use_sse)
1746 {
1747 rtx mask, use, clob;
1748
1749 /* NEG and ABS performed with SSE use bitwise mask operations.
1750 Create the appropriate mask now. */
1751 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1752 use = gen_rtx_USE (VOIDmode, mask);
1753 if (vector_mode || mode == TFmode)
1754 par = gen_rtvec (2, set, use);
1755 else
1756 {
1757 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1758 par = gen_rtvec (3, set, use, clob);
1759 }
1760 }
1761 else
1762 {
1763 rtx clob;
1764
1765 /* Changing the sign of FP values can also be done using the integer unit. */
1766 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1767 par = gen_rtvec (2, set, clob);
1768 }
1769
1770 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1771 }
1772
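/* Illustrative sketch of the SSE path above (assembly is only an
   approximation of the patterns that match): for DFmode with -mfpmath=sse,
     abs:  andpd  mask, %xmm0     ; mask clears the sign bit of each lane
     neg:  xorpd  mask, %xmm0     ; mask has only the sign bit of each lane
   ABS clears the sign bit and NEG flips it, which is why the vector/TFmode
   form only needs a USE of the mask constant and no flags clobber.  */
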
1773 /* Deconstruct a floating point ABS or NEG operation
1774 with integer registers into integer operations. */
1775
1776 void
1777 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1778 rtx operands[])
1779 {
1780 enum rtx_code absneg_op;
1781 rtx dst, set;
1782
1783 gcc_assert (operands_match_p (operands[0], operands[1]));
1784
1785 switch (mode)
1786 {
1787 case E_SFmode:
1788 dst = gen_lowpart (SImode, operands[0]);
1789
1790 if (code == ABS)
1791 {
1792 set = gen_int_mode (0x7fffffff, SImode);
1793 absneg_op = AND;
1794 }
1795 else
1796 {
1797 set = gen_int_mode (0x80000000, SImode);
1798 absneg_op = XOR;
1799 }
1800 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1801 break;
1802
1803 case E_DFmode:
1804 if (TARGET_64BIT)
1805 {
1806 dst = gen_lowpart (DImode, operands[0]);
1807 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1808
1809 if (code == ABS)
1810 set = const0_rtx;
1811 else
1812 set = gen_rtx_NOT (DImode, dst);
1813 }
1814 else
1815 {
1816 dst = gen_highpart (SImode, operands[0]);
1817
1818 if (code == ABS)
1819 {
1820 set = gen_int_mode (0x7fffffff, SImode);
1821 absneg_op = AND;
1822 }
1823 else
1824 {
1825 set = gen_int_mode (0x80000000, SImode);
1826 absneg_op = XOR;
1827 }
1828 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1829 }
1830 break;
1831
1832 case E_XFmode:
1833 dst = gen_rtx_REG (SImode,
1834 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1835 if (code == ABS)
1836 {
1837 set = GEN_INT (0x7fff);
1838 absneg_op = AND;
1839 }
1840 else
1841 {
1842 set = GEN_INT (0x8000);
1843 absneg_op = XOR;
1844 }
1845 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1846 break;
1847
1848 default:
1849 gcc_unreachable ();
1850 }
1851
1852 set = gen_rtx_SET (dst, set);
1853
1854 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1855 rtvec par = gen_rtvec (2, set, clob);
1856
1857 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1858 }
1859
1860 /* Expand a copysign operation. Special case operand 0 being a constant. */
1861
1862 void
1863 ix86_expand_copysign (rtx operands[])
1864 {
1865 machine_mode mode, vmode;
1866 rtx dest, op0, op1, mask;
1867
1868 dest = operands[0];
1869 op0 = operands[1];
1870 op1 = operands[2];
1871
1872 mode = GET_MODE (dest);
1873
1874 if (mode == SFmode)
1875 vmode = V4SFmode;
1876 else if (mode == DFmode)
1877 vmode = V2DFmode;
1878 else if (mode == TFmode)
1879 vmode = mode;
1880 else
1881 gcc_unreachable ();
1882
1883 mask = ix86_build_signbit_mask (vmode, 0, 0);
1884
1885 if (CONST_DOUBLE_P (op0))
1886 {
1887 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1888 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1889
1890 if (mode == SFmode || mode == DFmode)
1891 {
1892 if (op0 == CONST0_RTX (mode))
1893 op0 = CONST0_RTX (vmode);
1894 else
1895 {
1896 rtx v = ix86_build_const_vector (vmode, false, op0);
1897
1898 op0 = force_reg (vmode, v);
1899 }
1900 }
1901 else if (op0 != CONST0_RTX (mode))
1902 op0 = force_reg (mode, op0);
1903
1904 emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1905 }
1906 else
1907 {
1908 rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1909
1910 emit_insn (gen_copysign3_var
1911 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1912 }
1913 }
1914
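/* The bit-level identity used above and in the two splitters that follow
   (stated only as an illustration):
     copysign (x, y) == (x & ~SIGN_MASK) | (y & SIGN_MASK)
   where SIGN_MASK has just the sign bit of each lane set.  When x is a
   constant its sign is cleared up front (the ABS above), so the constant
   splitter needs a single AND/IOR pair; the variable case needs both the
   mask and its complement.  */
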
1915 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1916 be a constant, and so has already been expanded into a vector constant. */
1917
1918 void
1919 ix86_split_copysign_const (rtx operands[])
1920 {
1921 machine_mode mode, vmode;
1922 rtx dest, op0, mask, x;
1923
1924 dest = operands[0];
1925 op0 = operands[1];
1926 mask = operands[3];
1927
1928 mode = GET_MODE (dest);
1929 vmode = GET_MODE (mask);
1930
1931 dest = lowpart_subreg (vmode, dest, mode);
1932 x = gen_rtx_AND (vmode, dest, mask);
1933 emit_insn (gen_rtx_SET (dest, x));
1934
1935 if (op0 != CONST0_RTX (vmode))
1936 {
1937 x = gen_rtx_IOR (vmode, dest, op0);
1938 emit_insn (gen_rtx_SET (dest, x));
1939 }
1940 }
1941
1942 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1943 so we have to do two masks. */
1944
1945 void
1946 ix86_split_copysign_var (rtx operands[])
1947 {
1948 machine_mode mode, vmode;
1949 rtx dest, scratch, op0, op1, mask, nmask, x;
1950
1951 dest = operands[0];
1952 scratch = operands[1];
1953 op0 = operands[2];
1954 op1 = operands[3];
1955 nmask = operands[4];
1956 mask = operands[5];
1957
1958 mode = GET_MODE (dest);
1959 vmode = GET_MODE (mask);
1960
1961 if (rtx_equal_p (op0, op1))
1962 {
1963 /* Shouldn't happen often (it's useless, obviously), but when it does
1964 we'd generate incorrect code if we continue below. */
1965 emit_move_insn (dest, op0);
1966 return;
1967 }
1968
1969 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1970 {
1971 gcc_assert (REGNO (op1) == REGNO (scratch));
1972
1973 x = gen_rtx_AND (vmode, scratch, mask);
1974 emit_insn (gen_rtx_SET (scratch, x));
1975
1976 dest = mask;
1977 op0 = lowpart_subreg (vmode, op0, mode);
1978 x = gen_rtx_NOT (vmode, dest);
1979 x = gen_rtx_AND (vmode, x, op0);
1980 emit_insn (gen_rtx_SET (dest, x));
1981 }
1982 else
1983 {
1984 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1985 {
1986 x = gen_rtx_AND (vmode, scratch, mask);
1987 }
1988 else /* alternative 2,4 */
1989 {
1990 gcc_assert (REGNO (mask) == REGNO (scratch));
1991 op1 = lowpart_subreg (vmode, op1, mode);
1992 x = gen_rtx_AND (vmode, scratch, op1);
1993 }
1994 emit_insn (gen_rtx_SET (scratch, x));
1995
1996 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
1997 {
1998 dest = lowpart_subreg (vmode, op0, mode);
1999 x = gen_rtx_AND (vmode, dest, nmask);
2000 }
2001 else /* alternative 3,4 */
2002 {
2003 gcc_assert (REGNO (nmask) == REGNO (dest));
2004 dest = nmask;
2005 op0 = lowpart_subreg (vmode, op0, mode);
2006 x = gen_rtx_AND (vmode, dest, op0);
2007 }
2008 emit_insn (gen_rtx_SET (dest, x));
2009 }
2010
2011 x = gen_rtx_IOR (vmode, dest, scratch);
2012 emit_insn (gen_rtx_SET (dest, x));
2013 }
2014
2015 /* Expand an xorsign operation. */
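/* xorsign (x, y) corresponds to x * copysign (1.0, y) and is implemented
   below as x ^ (y & signmask), i.e. x with its sign bit flipped whenever
   y is negative.  */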
2016
2017 void
2018 ix86_expand_xorsign (rtx operands[])
2019 {
2020 machine_mode mode, vmode;
2021 rtx dest, op0, op1, mask;
2022
2023 dest = operands[0];
2024 op0 = operands[1];
2025 op1 = operands[2];
2026
2027 mode = GET_MODE (dest);
2028
2029 if (mode == SFmode)
2030 vmode = V4SFmode;
2031 else if (mode == DFmode)
2032 vmode = V2DFmode;
2033 else
2034 gcc_unreachable ();
2035
2036 mask = ix86_build_signbit_mask (vmode, 0, 0);
2037
2038 emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2039 }
2040
2041 /* Deconstruct an xorsign operation into bit masks. */
2042
2043 void
2044 ix86_split_xorsign (rtx operands[])
2045 {
2046 machine_mode mode, vmode;
2047 rtx dest, op0, mask, x;
2048
2049 dest = operands[0];
2050 op0 = operands[1];
2051 mask = operands[3];
2052
2053 mode = GET_MODE (dest);
2054 vmode = GET_MODE (mask);
2055
2056 dest = lowpart_subreg (vmode, dest, mode);
2057 x = gen_rtx_AND (vmode, dest, mask);
2058 emit_insn (gen_rtx_SET (dest, x));
2059
2060 op0 = lowpart_subreg (vmode, op0, mode);
2061 x = gen_rtx_XOR (vmode, dest, op0);
2062 emit_insn (gen_rtx_SET (dest, x));
2063 }
2064
2065 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2066
2067 void
2068 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2069 {
2070 machine_mode mode = GET_MODE (op0);
2071 rtx tmp;
2072
2073 /* Handle the special case of a vector comparison with a boolean result;
2074 transform it using the ptest instruction. */
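/*
 * A rough sketch (operand and label names are illustrative only): a
 * V2DImode equality branch is expected to become something like
 *
 * pxor op1, op0
 * ptest op0, op0 (ZF = 1 iff op0 == op1)
 * je .Ltarget (or jne, depending on CODE)
 */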
2075 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2076 {
2077 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2078 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2079
2080 gcc_assert (code == EQ || code == NE);
2081 /* Generate XOR since we can't check that one operand is a zero vector. */
2082 tmp = gen_reg_rtx (mode);
2083 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2084 tmp = gen_lowpart (p_mode, tmp);
2085 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2086 gen_rtx_UNSPEC (CCmode,
2087 gen_rtvec (2, tmp, tmp),
2088 UNSPEC_PTEST)));
2089 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2090 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2091 gen_rtx_LABEL_REF (VOIDmode, label),
2092 pc_rtx);
2093 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2094 return;
2095 }
2096
2097 switch (mode)
2098 {
2099 case E_SFmode:
2100 case E_DFmode:
2101 case E_XFmode:
2102 case E_QImode:
2103 case E_HImode:
2104 case E_SImode:
2105 simple:
2106 tmp = ix86_expand_compare (code, op0, op1);
2107 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2108 gen_rtx_LABEL_REF (VOIDmode, label),
2109 pc_rtx);
2110 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2111 return;
2112
2113 case E_DImode:
2114 if (TARGET_64BIT)
2115 goto simple;
2116 /* For 32-bit targets the DImode comparison may be performed in
2117 SSE registers. To allow this we avoid the split into
2118 SImode, which is achieved by doing the xor in DImode and
2119 then comparing against zero (a pattern recognized by the
2120 STV pass). We don't use the xor form when optimizing
2121 for size. */
2122 if (!optimize_insn_for_size_p ()
2123 && TARGET_STV
2124 && (code == EQ || code == NE))
2125 {
2126 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2127 op1 = const0_rtx;
2128 }
2129 /* FALLTHRU */
2130 case E_TImode:
2131 /* Expand a double-word (DImode/TImode) branch into multiple compare+branch. */
2132 {
2133 rtx lo[2], hi[2];
2134 rtx_code_label *label2;
2135 enum rtx_code code1, code2, code3;
2136 machine_mode submode;
2137
2138 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2139 {
2140 std::swap (op0, op1);
2141 code = swap_condition (code);
2142 }
2143
2144 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2145 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2146
2147 submode = mode == DImode ? SImode : DImode;
2148
2149 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2150 avoid two branches. This costs one extra insn, so disable when
2151 optimizing for size. */
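/*
 * For instance, an ia32 DImode equality branch may become roughly
 * (register and label names are illustrative only):
 *
 * movl a_hi, %eax
 * xorl b_hi, %eax
 * movl a_lo, %edx
 * xorl b_lo, %edx
 * orl %edx, %eax (ZF = 1 iff a == b)
 * je .Ltarget
 */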
2152
2153 if ((code == EQ || code == NE)
2154 && (!optimize_insn_for_size_p ()
2155 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2156 {
2157 rtx xor0, xor1;
2158
2159 xor1 = hi[0];
2160 if (hi[1] != const0_rtx)
2161 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2162 NULL_RTX, 0, OPTAB_WIDEN);
2163
2164 xor0 = lo[0];
2165 if (lo[1] != const0_rtx)
2166 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2167 NULL_RTX, 0, OPTAB_WIDEN);
2168
2169 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2170 NULL_RTX, 0, OPTAB_WIDEN);
2171
2172 ix86_expand_branch (code, tmp, const0_rtx, label);
2173 return;
2174 }
2175
2176 /* Otherwise, if we are doing a less-than or greater-or-equal
2177 comparison, op1 is a constant and its low word is zero, then we
2178 can just examine the high word. Similarly for a low word of -1
2179 and less-or-equal or greater-than. */
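/* For example, with op1 = 0x500000000 (low word zero) the unsigned test
   a < op1 holds iff hi (a) < 5, so a single compare of the high words
   is enough.  */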
2180
2181 if (CONST_INT_P (hi[1]))
2182 switch (code)
2183 {
2184 case LT: case LTU: case GE: case GEU:
2185 if (lo[1] == const0_rtx)
2186 {
2187 ix86_expand_branch (code, hi[0], hi[1], label);
2188 return;
2189 }
2190 break;
2191 case LE: case LEU: case GT: case GTU:
2192 if (lo[1] == constm1_rtx)
2193 {
2194 ix86_expand_branch (code, hi[0], hi[1], label);
2195 return;
2196 }
2197 break;
2198 default:
2199 break;
2200 }
2201
2202 /* Emulate comparisons that do not depend on the Zero flag with a
2203 double-word subtraction. Note that only the Overflow, Sign
2204 and Carry flags are valid, so swap the arguments and condition
2205 of comparisons that would otherwise test the Zero flag. */
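/*
 * A rough sketch of the ia32 code for a signed DImode a < b
 * (names are illustrative; the sbb result itself is discarded,
 * only its flags are used):
 *
 * cmpl b_lo, a_lo (sets CF)
 * sbbl b_hi, a_hi (sets SF/OF)
 * jl .Ltarget (jb for the unsigned variants)
 */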
2206
2207 switch (code)
2208 {
2209 case LE: case LEU: case GT: case GTU:
2210 std::swap (lo[0], lo[1]);
2211 std::swap (hi[0], hi[1]);
2212 code = swap_condition (code);
2213 /* FALLTHRU */
2214
2215 case LT: case LTU: case GE: case GEU:
2216 {
2217 bool uns = (code == LTU || code == GEU);
2218 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2219 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2220
2221 if (!nonimmediate_operand (lo[0], submode))
2222 lo[0] = force_reg (submode, lo[0]);
2223 if (!x86_64_general_operand (lo[1], submode))
2224 lo[1] = force_reg (submode, lo[1]);
2225
2226 if (!register_operand (hi[0], submode))
2227 hi[0] = force_reg (submode, hi[0]);
2228 if ((uns && !nonimmediate_operand (hi[1], submode))
2229 || (!uns && !x86_64_general_operand (hi[1], submode)))
2230 hi[1] = force_reg (submode, hi[1]);
2231
2232 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2233
2234 tmp = gen_rtx_SCRATCH (submode);
2235 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2236
2237 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2238 ix86_expand_branch (code, tmp, const0_rtx, label);
2239 return;
2240 }
2241
2242 default:
2243 break;
2244 }
2245
2246 /* Otherwise, we need two or three jumps. */
2247
2248 label2 = gen_label_rtx ();
2249
2250 code1 = code;
2251 code2 = swap_condition (code);
2252 code3 = unsigned_condition (code);
2253
2254 switch (code)
2255 {
2256 case LT: case GT: case LTU: case GTU:
2257 break;
2258
2259 case LE: code1 = LT; code2 = GT; break;
2260 case GE: code1 = GT; code2 = LT; break;
2261 case LEU: code1 = LTU; code2 = GTU; break;
2262 case GEU: code1 = GTU; code2 = LTU; break;
2263
2264 case EQ: code1 = UNKNOWN; code2 = NE; break;
2265 case NE: code2 = UNKNOWN; break;
2266
2267 default:
2268 gcc_unreachable ();
2269 }
2270
2271 /*
2272 * a < b =>
2273 * if (hi(a) < hi(b)) goto true;
2274 * if (hi(a) > hi(b)) goto false;
2275 * if (lo(a) < lo(b)) goto true;
2276 * false:
2277 */
2278
2279 if (code1 != UNKNOWN)
2280 ix86_expand_branch (code1, hi[0], hi[1], label);
2281 if (code2 != UNKNOWN)
2282 ix86_expand_branch (code2, hi[0], hi[1], label2);
2283
2284 ix86_expand_branch (code3, lo[0], lo[1], label);
2285
2286 if (code2 != UNKNOWN)
2287 emit_label (label2);
2288 return;
2289 }
2290
2291 default:
2292 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2293 goto simple;
2294 }
2295 }
2296
2297 /* Figure out whether to use unordered fp comparisons. */
2298
2299 static bool
2300 ix86_unordered_fp_compare (enum rtx_code code)
2301 {
2302 if (!TARGET_IEEE_FP)
2303 return false;
2304
2305 switch (code)
2306 {
2307 case LT:
2308 case LE:
2309 case GT:
2310 case GE:
2311 case LTGT:
2312 return false;
2313
2314 case EQ:
2315 case NE:
2316
2317 case UNORDERED:
2318 case ORDERED:
2319 case UNLT:
2320 case UNLE:
2321 case UNGT:
2322 case UNGE:
2323 case UNEQ:
2324 return true;
2325
2326 default:
2327 gcc_unreachable ();
2328 }
2329 }
2330
2331 /* Return a comparison we can do that is equivalent to
2332 swap_condition (code), except possibly for orderedness.
2333 However, never change orderedness if TARGET_IEEE_FP, returning
2334 UNKNOWN in that case if necessary. */
2335
2336 static enum rtx_code
2337 ix86_fp_swap_condition (enum rtx_code code)
2338 {
2339 switch (code)
2340 {
2341 case GT: /* GTU - CF=0 & ZF=0 */
2342 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2343 case GE: /* GEU - CF=0 */
2344 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2345 case UNLT: /* LTU - CF=1 */
2346 return TARGET_IEEE_FP ? UNKNOWN : GT;
2347 case UNLE: /* LEU - CF=1 | ZF=1 */
2348 return TARGET_IEEE_FP ? UNKNOWN : GE;
2349 default:
2350 return swap_condition (code);
2351 }
2352 }
2353
2354 /* Return the cost of comparison CODE using the best strategy for performance.
2355 All following functions use the number of instructions as the cost metric.
2356 In the future this should be tweaked to compute bytes for optimize_size and
2357 to take into account the performance of various instructions on various CPUs. */
2358
2359 static int
2360 ix86_fp_comparison_cost (enum rtx_code code)
2361 {
2362 int arith_cost;
2363
2364 /* The cost of code using bit-twiddling on %ah. */
2365 switch (code)
2366 {
2367 case UNLE:
2368 case UNLT:
2369 case LTGT:
2370 case GT:
2371 case GE:
2372 case UNORDERED:
2373 case ORDERED:
2374 case UNEQ:
2375 arith_cost = 4;
2376 break;
2377 case LT:
2378 case NE:
2379 case EQ:
2380 case UNGE:
2381 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2382 break;
2383 case LE:
2384 case UNGT:
2385 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2386 break;
2387 default:
2388 gcc_unreachable ();
2389 }
2390
2391 switch (ix86_fp_comparison_strategy (code))
2392 {
2393 case IX86_FPCMP_COMI:
2394 return arith_cost > 4 ? 3 : 2;
2395 case IX86_FPCMP_SAHF:
2396 return arith_cost > 4 ? 4 : 3;
2397 default:
2398 return arith_cost;
2399 }
2400 }
2401
2402 /* Swap, force into registers, or otherwise massage the two operands
2403 to a fp comparison. The operands are updated in place; the new
2404 comparison code is returned. */
2405
2406 static enum rtx_code
2407 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2408 {
2409 bool unordered_compare = ix86_unordered_fp_compare (code);
2410 rtx op0 = *pop0, op1 = *pop1;
2411 machine_mode op_mode = GET_MODE (op0);
2412 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2413
2414 /* All of the unordered compare instructions only work on registers.
2415 The same is true of the fcomi compare instructions. The XFmode
2416 compare instructions require registers except when comparing
2417 against zero or when converting operand 1 from fixed point to
2418 floating point. */
2419
2420 if (!is_sse
2421 && (unordered_compare
2422 || (op_mode == XFmode
2423 && ! (standard_80387_constant_p (op0) == 1
2424 || standard_80387_constant_p (op1) == 1)
2425 && GET_CODE (op1) != FLOAT)
2426 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2427 {
2428 op0 = force_reg (op_mode, op0);
2429 op1 = force_reg (op_mode, op1);
2430 }
2431 else
2432 {
2433 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2434 things around if they appear profitable, otherwise force op0
2435 into a register. */
2436
2437 if (standard_80387_constant_p (op0) == 0
2438 || (MEM_P (op0)
2439 && ! (standard_80387_constant_p (op1) == 0
2440 || MEM_P (op1))))
2441 {
2442 enum rtx_code new_code = ix86_fp_swap_condition (code);
2443 if (new_code != UNKNOWN)
2444 {
2445 std::swap (op0, op1);
2446 code = new_code;
2447 }
2448 }
2449
2450 if (!REG_P (op0))
2451 op0 = force_reg (op_mode, op0);
2452
2453 if (CONSTANT_P (op1))
2454 {
2455 int tmp = standard_80387_constant_p (op1);
2456 if (tmp == 0)
2457 op1 = validize_mem (force_const_mem (op_mode, op1));
2458 else if (tmp == 1)
2459 {
2460 if (TARGET_CMOVE)
2461 op1 = force_reg (op_mode, op1);
2462 }
2463 else
2464 op1 = force_reg (op_mode, op1);
2465 }
2466 }
2467
2468 /* Try to rearrange the comparison to make it cheaper. */
2469 if (ix86_fp_comparison_cost (code)
2470 > ix86_fp_comparison_cost (swap_condition (code))
2471 && (REG_P (op1) || can_create_pseudo_p ()))
2472 {
2473 std::swap (op0, op1);
2474 code = swap_condition (code);
2475 if (!REG_P (op0))
2476 op0 = force_reg (op_mode, op0);
2477 }
2478
2479 *pop0 = op0;
2480 *pop1 = op1;
2481 return code;
2482 }
2483
2484 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2485
2486 static rtx
2487 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2488 {
2489 bool unordered_compare = ix86_unordered_fp_compare (code);
2490 machine_mode cmp_mode;
2491 rtx tmp, scratch;
2492
2493 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2494
2495 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2496 if (unordered_compare)
2497 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2498
2499 /* Do an fcomi/sahf-based test when profitable. */
2500 switch (ix86_fp_comparison_strategy (code))
2501 {
2502 case IX86_FPCMP_COMI:
2503 cmp_mode = CCFPmode;
2504 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2505 break;
2506
2507 case IX86_FPCMP_SAHF:
2508 cmp_mode = CCFPmode;
2509 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2510 scratch = gen_reg_rtx (HImode);
2511 emit_insn (gen_rtx_SET (scratch, tmp));
2512 emit_insn (gen_x86_sahf_1 (scratch));
2513 break;
2514
2515 case IX86_FPCMP_ARITH:
2516 cmp_mode = CCNOmode;
2517 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2518 scratch = gen_reg_rtx (HImode);
2519 emit_insn (gen_rtx_SET (scratch, tmp));
2520
2521 /* In the unordered case, we have to check C2 for NaNs, which
2522 doesn't happen to work out to anything nice combination-wise.
2523 So do some bit twiddling on the value we've got in AH to come
2524 up with an appropriate set of condition codes. */
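/* For reference: after fnstsw the FPU condition bits end up in AH as
   C0 = 0x01, C2 = 0x04 and C3 = 0x40, so e.g. 0x45 tests C0|C2|C3 and
   0x40 tests C3 alone.  After a compare, all three are set for
   unordered, C0 alone means op0 < op1, C3 alone means equality, and
   all clear means op0 > op1.  */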
2525
2526 switch (code)
2527 {
2528 case GT:
2529 case UNGT:
2530 if (code == GT || !TARGET_IEEE_FP)
2531 {
2532 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2533 code = EQ;
2534 }
2535 else
2536 {
2537 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2538 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2539 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2540 cmp_mode = CCmode;
2541 code = GEU;
2542 }
2543 break;
2544 case LT:
2545 case UNLT:
2546 if (code == LT && TARGET_IEEE_FP)
2547 {
2548 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2549 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2550 cmp_mode = CCmode;
2551 code = EQ;
2552 }
2553 else
2554 {
2555 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2556 code = NE;
2557 }
2558 break;
2559 case GE:
2560 case UNGE:
2561 if (code == GE || !TARGET_IEEE_FP)
2562 {
2563 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2564 code = EQ;
2565 }
2566 else
2567 {
2568 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2569 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2570 code = NE;
2571 }
2572 break;
2573 case LE:
2574 case UNLE:
2575 if (code == LE && TARGET_IEEE_FP)
2576 {
2577 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2578 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2579 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2580 cmp_mode = CCmode;
2581 code = LTU;
2582 }
2583 else
2584 {
2585 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2586 code = NE;
2587 }
2588 break;
2589 case EQ:
2590 case UNEQ:
2591 if (code == EQ && TARGET_IEEE_FP)
2592 {
2593 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2594 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2595 cmp_mode = CCmode;
2596 code = EQ;
2597 }
2598 else
2599 {
2600 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2601 code = NE;
2602 }
2603 break;
2604 case NE:
2605 case LTGT:
2606 if (code == NE && TARGET_IEEE_FP)
2607 {
2608 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2609 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2610 GEN_INT (0x40)));
2611 code = NE;
2612 }
2613 else
2614 {
2615 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2616 code = EQ;
2617 }
2618 break;
2619
2620 case UNORDERED:
2621 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2622 code = NE;
2623 break;
2624 case ORDERED:
2625 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2626 code = EQ;
2627 break;
2628
2629 default:
2630 gcc_unreachable ();
2631 }
2632 break;
2633
2634 default:
2635 gcc_unreachable ();
2636 }
2637
2638 /* Return the test that should be put into the flags user, i.e.
2639 the bcc, scc, or cmov instruction. */
2640 return gen_rtx_fmt_ee (code, VOIDmode,
2641 gen_rtx_REG (cmp_mode, FLAGS_REG),
2642 const0_rtx);
2643 }
2644
2645 /* Generate insn patterns to do an integer compare of OPERANDS. */
2646
2647 static rtx
2648 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2649 {
2650 machine_mode cmpmode;
2651 rtx tmp, flags;
2652
2653 cmpmode = SELECT_CC_MODE (code, op0, op1);
2654 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2655
2656 /* This is very simple, but making the interface the same as in the
2657 FP case makes the rest of the code easier. */
2658 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2659 emit_insn (gen_rtx_SET (flags, tmp));
2660
2661 /* Return the test that should be put into the flags user, i.e.
2662 the bcc, scc, or cmov instruction. */
2663 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2664 }
2665
2666 static rtx
2667 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2668 {
2669 rtx ret;
2670
2671 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2672 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2673
2674 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2675 {
2676 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2677 ret = ix86_expand_fp_compare (code, op0, op1);
2678 }
2679 else
2680 ret = ix86_expand_int_compare (code, op0, op1);
2681
2682 return ret;
2683 }
2684
2685 void
2686 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2687 {
2688 rtx ret;
2689
2690 gcc_assert (GET_MODE (dest) == QImode);
2691
2692 ret = ix86_expand_compare (code, op0, op1);
2693 PUT_MODE (ret, QImode);
2694 emit_insn (gen_rtx_SET (dest, ret));
2695 }
2696
2697 /* Expand a comparison setting or clearing the carry flag. Return true
2698 when successful and set *POP to the comparison operation. */
2699 static bool
2700 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2701 {
2702 machine_mode mode
2703 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2704
2705 /* Do not handle double-mode compares that go through special path. */
2706 if (mode == (TARGET_64BIT ? TImode : DImode))
2707 return false;
2708
2709 if (SCALAR_FLOAT_MODE_P (mode))
2710 {
2711 rtx compare_op;
2712 rtx_insn *compare_seq;
2713
2714 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2715
2716 /* Shortcut: the following common codes never translate
2717 into carry-flag compares. */
2718 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2719 || code == ORDERED || code == UNORDERED)
2720 return false;
2721
2722 /* These comparisons require the zero flag; swap the operands so they don't. */
2723 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2724 && !TARGET_IEEE_FP)
2725 {
2726 std::swap (op0, op1);
2727 code = swap_condition (code);
2728 }
2729
2730 /* Try to expand the comparison and verify that we end up with a
2731 carry-flag-based comparison. This fails only when we decide
2732 to expand the comparison using arithmetic, which is not a
2733 common scenario. */
2734 start_sequence ();
2735 compare_op = ix86_expand_fp_compare (code, op0, op1);
2736 compare_seq = get_insns ();
2737 end_sequence ();
2738
2739 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2740 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2741 else
2742 code = GET_CODE (compare_op);
2743
2744 if (code != LTU && code != GEU)
2745 return false;
2746
2747 emit_insn (compare_seq);
2748 *pop = compare_op;
2749 return true;
2750 }
2751
2752 if (!INTEGRAL_MODE_P (mode))
2753 return false;
2754
2755 switch (code)
2756 {
2757 case LTU:
2758 case GEU:
2759 break;
2760
2761 /* Convert a==0 into (unsigned)a<1. */
2762 case EQ:
2763 case NE:
2764 if (op1 != const0_rtx)
2765 return false;
2766 op1 = const1_rtx;
2767 code = (code == EQ ? LTU : GEU);
2768 break;
2769
2770 /* Convert a>b into b<a or a>=b+1. */
2771 case GTU:
2772 case LEU:
2773 if (CONST_INT_P (op1))
2774 {
2775 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2776 /* Bail out on overflow. We could still swap the operands, but
2777 that would force loading the constant into a register. */
2778 if (op1 == const0_rtx
2779 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2780 return false;
2781 code = (code == GTU ? GEU : LTU);
2782 }
2783 else
2784 {
2785 std::swap (op0, op1);
2786 code = (code == GTU ? LTU : GEU);
2787 }
2788 break;
2789
2790 /* Convert a>=0 into (unsigned)a<0x80000000. */
2791 case LT:
2792 case GE:
2793 if (mode == DImode || op1 != const0_rtx)
2794 return false;
2795 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2796 code = (code == LT ? GEU : LTU);
2797 break;
2798 case LE:
2799 case GT:
2800 if (mode == DImode || op1 != constm1_rtx)
2801 return false;
2802 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2803 code = (code == LE ? GEU : LTU);
2804 break;
2805
2806 default:
2807 return false;
2808 }
2809 /* Swapping operands may cause a constant to appear as the first operand. */
2810 if (!nonimmediate_operand (op0, VOIDmode))
2811 {
2812 if (!can_create_pseudo_p ())
2813 return false;
2814 op0 = force_reg (mode, op0);
2815 }
2816 *pop = ix86_expand_compare (code, op0, op1);
2817 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2818 return true;
2819 }
2820
2821 /* Expand a conditional increment or decrement using adc/sbb instructions.
2822 The default case using setcc followed by a conditional move can be
2823 done by generic code. */
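/* For example, an unsigned conditional increment such as
   "if (a < b) x++;" is expected to end up as roughly

     cmpl b, a (CF = (a < b), unsigned)
     adcl $0, x (x += CF)

   with the carry flag of the compare consumed directly by adc.  */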
2824 bool
2825 ix86_expand_int_addcc (rtx operands[])
2826 {
2827 enum rtx_code code = GET_CODE (operands[1]);
2828 rtx flags;
2829 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2830 rtx compare_op;
2831 rtx val = const0_rtx;
2832 bool fpcmp = false;
2833 machine_mode mode;
2834 rtx op0 = XEXP (operands[1], 0);
2835 rtx op1 = XEXP (operands[1], 1);
2836
2837 if (operands[3] != const1_rtx
2838 && operands[3] != constm1_rtx)
2839 return false;
2840 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2841 return false;
2842 code = GET_CODE (compare_op);
2843
2844 flags = XEXP (compare_op, 0);
2845
2846 if (GET_MODE (flags) == CCFPmode)
2847 {
2848 fpcmp = true;
2849 code = ix86_fp_compare_code_to_integer (code);
2850 }
2851
2852 if (code != LTU)
2853 {
2854 val = constm1_rtx;
2855 if (fpcmp)
2856 PUT_CODE (compare_op,
2857 reverse_condition_maybe_unordered
2858 (GET_CODE (compare_op)));
2859 else
2860 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2861 }
2862
2863 mode = GET_MODE (operands[0]);
2864
2865 /* Construct either adc or sbb insn. */
2866 if ((code == LTU) == (operands[3] == constm1_rtx))
2867 insn = gen_sub3_carry;
2868 else
2869 insn = gen_add3_carry;
2870
2871 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2872
2873 return true;
2874 }
2875
2876 bool
2877 ix86_expand_int_movcc (rtx operands[])
2878 {
2879 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2880 rtx_insn *compare_seq;
2881 rtx compare_op;
2882 machine_mode mode = GET_MODE (operands[0]);
2883 bool sign_bit_compare_p = false;
2884 rtx op0 = XEXP (operands[1], 0);
2885 rtx op1 = XEXP (operands[1], 1);
2886
2887 if (GET_MODE (op0) == TImode
2888 || (GET_MODE (op0) == DImode
2889 && !TARGET_64BIT))
2890 return false;
2891
2892 start_sequence ();
2893 compare_op = ix86_expand_compare (code, op0, op1);
2894 compare_seq = get_insns ();
2895 end_sequence ();
2896
2897 compare_code = GET_CODE (compare_op);
2898
2899 if ((op1 == const0_rtx && (code == GE || code == LT))
2900 || (op1 == constm1_rtx && (code == GT || code == LE)))
2901 sign_bit_compare_p = true;
2902
2903 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2904 HImode insns, we'd be swallowed in word prefix ops. */
2905
2906 if ((mode != HImode || TARGET_FAST_PREFIX)
2907 && (mode != (TARGET_64BIT ? TImode : DImode))
2908 && CONST_INT_P (operands[2])
2909 && CONST_INT_P (operands[3]))
2910 {
2911 rtx out = operands[0];
2912 HOST_WIDE_INT ct = INTVAL (operands[2]);
2913 HOST_WIDE_INT cf = INTVAL (operands[3]);
2914 HOST_WIDE_INT diff;
2915
2916 diff = ct - cf;
2917 /* Sign-bit compares are better done using shifts than by using
2918 sbb. */
2919 if (sign_bit_compare_p
2920 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2921 {
2922 /* Detect overlap between destination and compare sources. */
2923 rtx tmp = out;
2924
2925 if (!sign_bit_compare_p)
2926 {
2927 rtx flags;
2928 bool fpcmp = false;
2929
2930 compare_code = GET_CODE (compare_op);
2931
2932 flags = XEXP (compare_op, 0);
2933
2934 if (GET_MODE (flags) == CCFPmode)
2935 {
2936 fpcmp = true;
2937 compare_code
2938 = ix86_fp_compare_code_to_integer (compare_code);
2939 }
2940
2941 /* To simplify the rest of the code, restrict to the GEU case. */
2942 if (compare_code == LTU)
2943 {
2944 std::swap (ct, cf);
2945 compare_code = reverse_condition (compare_code);
2946 code = reverse_condition (code);
2947 }
2948 else
2949 {
2950 if (fpcmp)
2951 PUT_CODE (compare_op,
2952 reverse_condition_maybe_unordered
2953 (GET_CODE (compare_op)));
2954 else
2955 PUT_CODE (compare_op,
2956 reverse_condition (GET_CODE (compare_op)));
2957 }
2958 diff = ct - cf;
2959
2960 if (reg_overlap_mentioned_p (out, op0)
2961 || reg_overlap_mentioned_p (out, op1))
2962 tmp = gen_reg_rtx (mode);
2963
2964 if (mode == DImode)
2965 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2966 else
2967 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2968 flags, compare_op));
2969 }
2970 else
2971 {
2972 if (code == GT || code == GE)
2973 code = reverse_condition (code);
2974 else
2975 {
2976 std::swap (ct, cf);
2977 diff = ct - cf;
2978 }
2979 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2980 }
2981
2982 if (diff == 1)
2983 {
2984 /*
2985 * cmpl op0,op1
2986 * sbbl dest,dest
2987 * [addl dest, ct]
2988 *
2989 * Size 5 - 8.
2990 */
2991 if (ct)
2992 tmp = expand_simple_binop (mode, PLUS,
2993 tmp, GEN_INT (ct),
2994 copy_rtx (tmp), 1, OPTAB_DIRECT);
2995 }
2996 else if (cf == -1)
2997 {
2998 /*
2999 * cmpl op0,op1
3000 * sbbl dest,dest
3001 * orl $ct, dest
3002 *
3003 * Size 8.
3004 */
3005 tmp = expand_simple_binop (mode, IOR,
3006 tmp, GEN_INT (ct),
3007 copy_rtx (tmp), 1, OPTAB_DIRECT);
3008 }
3009 else if (diff == -1 && ct)
3010 {
3011 /*
3012 * cmpl op0,op1
3013 * sbbl dest,dest
3014 * notl dest
3015 * [addl dest, cf]
3016 *
3017 * Size 8 - 11.
3018 */
3019 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3020 if (cf)
3021 tmp = expand_simple_binop (mode, PLUS,
3022 copy_rtx (tmp), GEN_INT (cf),
3023 copy_rtx (tmp), 1, OPTAB_DIRECT);
3024 }
3025 else
3026 {
3027 /*
3028 * cmpl op0,op1
3029 * sbbl dest,dest
3030 * [notl dest]
3031 * andl cf - ct, dest
3032 * [addl dest, ct]
3033 *
3034 * Size 8 - 11.
3035 */
3036
3037 if (cf == 0)
3038 {
3039 cf = ct;
3040 ct = 0;
3041 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3042 }
3043
3044 tmp = expand_simple_binop (mode, AND,
3045 copy_rtx (tmp),
3046 gen_int_mode (cf - ct, mode),
3047 copy_rtx (tmp), 1, OPTAB_DIRECT);
3048 if (ct)
3049 tmp = expand_simple_binop (mode, PLUS,
3050 copy_rtx (tmp), GEN_INT (ct),
3051 copy_rtx (tmp), 1, OPTAB_DIRECT);
3052 }
3053
3054 if (!rtx_equal_p (tmp, out))
3055 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3056
3057 return true;
3058 }
3059
3060 if (diff < 0)
3061 {
3062 machine_mode cmp_mode = GET_MODE (op0);
3063 enum rtx_code new_code;
3064
3065 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3066 {
3067 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3068
3069 /* We may be reversing a non-trapping
3070 comparison to a trapping comparison. */
3071 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3072 && code != EQ && code != NE
3073 && code != ORDERED && code != UNORDERED)
3074 new_code = UNKNOWN;
3075 else
3076 new_code = reverse_condition_maybe_unordered (code);
3077 }
3078 else
3079 new_code = ix86_reverse_condition (code, cmp_mode);
3080 if (new_code != UNKNOWN)
3081 {
3082 std::swap (ct, cf);
3083 diff = -diff;
3084 code = new_code;
3085 }
3086 }
3087
3088 compare_code = UNKNOWN;
3089 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3090 && CONST_INT_P (op1))
3091 {
3092 if (op1 == const0_rtx
3093 && (code == LT || code == GE))
3094 compare_code = code;
3095 else if (op1 == constm1_rtx)
3096 {
3097 if (code == LE)
3098 compare_code = LT;
3099 else if (code == GT)
3100 compare_code = GE;
3101 }
3102 }
3103
3104 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3105 if (compare_code != UNKNOWN
3106 && GET_MODE (op0) == GET_MODE (out)
3107 && (cf == -1 || ct == -1))
3108 {
3109 /* If the lea code below could be used, only optimize
3110 if it results in a 2-insn sequence. */
3111
3112 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3113 || diff == 3 || diff == 5 || diff == 9)
3114 || (compare_code == LT && ct == -1)
3115 || (compare_code == GE && cf == -1))
3116 {
3117 /*
3118 * notl op1 (if necessary)
3119 * sarl $31, op1
3120 * orl cf, op1
3121 */
3122 if (ct != -1)
3123 {
3124 cf = ct;
3125 ct = -1;
3126 code = reverse_condition (code);
3127 }
3128
3129 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3130
3131 out = expand_simple_binop (mode, IOR,
3132 out, GEN_INT (cf),
3133 out, 1, OPTAB_DIRECT);
3134 if (out != operands[0])
3135 emit_move_insn (operands[0], out);
3136
3137 return true;
3138 }
3139 }
3140
3141
3142 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3143 || diff == 3 || diff == 5 || diff == 9)
3144 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3145 && (mode != DImode
3146 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3147 {
3148 /*
3149 * xorl dest,dest
3150 * cmpl op1,op2
3151 * setcc dest
3152 * lea cf(dest*(ct-cf)),dest
3153 *
3154 * Size 14.
3155 *
3156 * This also catches the degenerate setcc-only case.
3157 */
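/*
 * A concrete sketch: for "c ? 7 : 3" we have diff == 4 and the
 * expected code is roughly
 *
 * xorl %eax, %eax
 * cmpl ... (whatever comparison c is)
 * setcc %al
 * leal 3(,%eax,4), %eax (3 + 4 * flag, i.e. 3 or 7)
 */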
3158
3159 rtx tmp;
3160 int nops;
3161
3162 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3163
3164 nops = 0;
3165 /* On x86_64 the lea instruction operates on Pmode, so we need
3166 to get the arithmetic done in the proper mode to match. */
3167 if (diff == 1)
3168 tmp = copy_rtx (out);
3169 else
3170 {
3171 rtx out1;
3172 out1 = copy_rtx (out);
3173 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3174 nops++;
3175 if (diff & 1)
3176 {
3177 tmp = gen_rtx_PLUS (mode, tmp, out1);
3178 nops++;
3179 }
3180 }
3181 if (cf != 0)
3182 {
3183 tmp = plus_constant (mode, tmp, cf);
3184 nops++;
3185 }
3186 if (!rtx_equal_p (tmp, out))
3187 {
3188 if (nops == 1)
3189 out = force_operand (tmp, copy_rtx (out));
3190 else
3191 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3192 }
3193 if (!rtx_equal_p (out, operands[0]))
3194 emit_move_insn (operands[0], copy_rtx (out));
3195
3196 return true;
3197 }
3198
3199 /*
3200 * General case: Jumpful:
3201 * xorl dest,dest cmpl op1, op2
3202 * cmpl op1, op2 movl ct, dest
3203 * setcc dest jcc 1f
3204 * decl dest movl cf, dest
3205 * andl (cf-ct),dest 1:
3206 * addl ct,dest
3207 *
3208 * Size 20. Size 14.
3209 *
3210 * This is reasonably steep, but branch mispredict costs are
3211 * high on modern CPUs, so consider failing only if optimizing
3212 * for space.
3213 */
3214
3215 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3216 && BRANCH_COST (optimize_insn_for_speed_p (),
3217 false) >= 2)
3218 {
3219 if (cf == 0)
3220 {
3221 machine_mode cmp_mode = GET_MODE (op0);
3222 enum rtx_code new_code;
3223
3224 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3225 {
3226 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3227
3228 /* We may be reversing a non-trapping
3229 comparison to a trapping comparison. */
3230 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3231 && code != EQ && code != NE
3232 && code != ORDERED && code != UNORDERED)
3233 new_code = UNKNOWN;
3234 else
3235 new_code = reverse_condition_maybe_unordered (code);
3236
3237 }
3238 else
3239 {
3240 new_code = ix86_reverse_condition (code, cmp_mode);
3241 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3242 compare_code = reverse_condition (compare_code);
3243 }
3244
3245 if (new_code != UNKNOWN)
3246 {
3247 cf = ct;
3248 ct = 0;
3249 code = new_code;
3250 }
3251 }
3252
3253 if (compare_code != UNKNOWN)
3254 {
3255 /* notl op1 (if needed)
3256 sarl $31, op1
3257 andl (cf-ct), op1
3258 addl ct, op1
3259
3260 For x < 0 (resp. x <= -1) there will be no notl,
3261 so if possible swap the constants to get rid of the
3262 complement.
3263 True/false will be -1/0 while code below (store flag
3264 followed by decrement) is 0/-1, so the constants need
3265 to be exchanged once more. */
3266
3267 if (compare_code == GE || !cf)
3268 {
3269 code = reverse_condition (code);
3270 compare_code = LT;
3271 }
3272 else
3273 std::swap (ct, cf);
3274
3275 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3276 }
3277 else
3278 {
3279 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3280
3281 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3282 constm1_rtx,
3283 copy_rtx (out), 1, OPTAB_DIRECT);
3284 }
3285
3286 out = expand_simple_binop (mode, AND, copy_rtx (out),
3287 gen_int_mode (cf - ct, mode),
3288 copy_rtx (out), 1, OPTAB_DIRECT);
3289 if (ct)
3290 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3291 copy_rtx (out), 1, OPTAB_DIRECT);
3292 if (!rtx_equal_p (out, operands[0]))
3293 emit_move_insn (operands[0], copy_rtx (out));
3294
3295 return true;
3296 }
3297 }
3298
3299 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3300 {
3301 /* Try a few things more with specific constants and a variable. */
3302
3303 optab op;
3304 rtx var, orig_out, out, tmp;
3305
3306 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3307 return false;
3308
3309 /* If one of the two operands is an interesting constant, load that
3310 constant by recursing below and mask in the variable with a logical operation. */
3311
3312 if (CONST_INT_P (operands[2]))
3313 {
3314 var = operands[3];
3315 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3316 operands[3] = constm1_rtx, op = and_optab;
3317 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3318 operands[3] = const0_rtx, op = ior_optab;
3319 else
3320 return false;
3321 }
3322 else if (CONST_INT_P (operands[3]))
3323 {
3324 var = operands[2];
3325 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3326 {
3327 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3328 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3329 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3330 operands[1] = simplify_gen_relational (LT, VOIDmode,
3331 GET_MODE (op0),
3332 op0, const0_rtx);
3333
3334 operands[2] = constm1_rtx;
3335 op = and_optab;
3336 }
3337 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3338 operands[2] = const0_rtx, op = ior_optab;
3339 else
3340 return false;
3341 }
3342 else
3343 return false;
3344
3345 orig_out = operands[0];
3346 tmp = gen_reg_rtx (mode);
3347 operands[0] = tmp;
3348
3349 /* Recurse to get the constant loaded. */
3350 if (!ix86_expand_int_movcc (operands))
3351 return false;
3352
3353 /* Mask in the interesting variable. */
3354 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3355 OPTAB_WIDEN);
3356 if (!rtx_equal_p (out, orig_out))
3357 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3358
3359 return true;
3360 }
3361
3362 /*
3363 * For comparison with above,
3364 *
3365 * movl cf,dest
3366 * movl ct,tmp
3367 * cmpl op1,op2
3368 * cmovcc tmp,dest
3369 *
3370 * Size 15.
3371 */
3372
3373 if (! nonimmediate_operand (operands[2], mode))
3374 operands[2] = force_reg (mode, operands[2]);
3375 if (! nonimmediate_operand (operands[3], mode))
3376 operands[3] = force_reg (mode, operands[3]);
3377
3378 if (! register_operand (operands[2], VOIDmode)
3379 && (mode == QImode
3380 || ! register_operand (operands[3], VOIDmode)))
3381 operands[2] = force_reg (mode, operands[2]);
3382
3383 if (mode == QImode
3384 && ! register_operand (operands[3], VOIDmode))
3385 operands[3] = force_reg (mode, operands[3]);
3386
3387 emit_insn (compare_seq);
3388 emit_insn (gen_rtx_SET (operands[0],
3389 gen_rtx_IF_THEN_ELSE (mode,
3390 compare_op, operands[2],
3391 operands[3])));
3392 return true;
3393 }
3394
3395 /* Detect conditional moves that exactly match min/max operational
3396 semantics. Note that this is IEEE safe, as long as we don't
3397 interchange the operands.
3398
3399 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3400 and TRUE if the operation is successful and instructions are emitted. */
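/* For instance, "a < b ? a : b" maps onto minss/minsd; those instructions
   return their second source operand when an operand is a NaN or when
   both operands are zeros of either sign, so the operand order carries
   the IEEE semantics and must not be interchanged.  */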
3401
3402 static bool
3403 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3404 rtx cmp_op1, rtx if_true, rtx if_false)
3405 {
3406 machine_mode mode;
3407 bool is_min;
3408 rtx tmp;
3409
3410 if (code == LT)
3411 ;
3412 else if (code == UNGE)
3413 std::swap (if_true, if_false);
3414 else
3415 return false;
3416
3417 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3418 is_min = true;
3419 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3420 is_min = false;
3421 else
3422 return false;
3423
3424 mode = GET_MODE (dest);
3425
3426 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3427 but MODE may be a vector mode and thus not appropriate. */
3428 if (!flag_finite_math_only || flag_signed_zeros)
3429 {
3430 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3431 rtvec v;
3432
3433 if_true = force_reg (mode, if_true);
3434 v = gen_rtvec (2, if_true, if_false);
3435 tmp = gen_rtx_UNSPEC (mode, v, u);
3436 }
3437 else
3438 {
3439 code = is_min ? SMIN : SMAX;
3440 if (MEM_P (if_true) && MEM_P (if_false))
3441 if_true = force_reg (mode, if_true);
3442 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3443 }
3444
3445 emit_insn (gen_rtx_SET (dest, tmp));
3446 return true;
3447 }
3448
3449 /* Return true if MODE is valid for a vector compare to a mask register;
3450 the same holds for a conditional vector move with a mask register. */
3451 static bool
3452 ix86_valid_mask_cmp_mode (machine_mode mode)
3453 {
3454 /* XOP has its own vector conditional movement. */
3455 if (TARGET_XOP && !TARGET_AVX512F)
3456 return false;
3457
3458 /* AVX512F is needed for mask operation. */
3459 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3460 return false;
3461
3462 /* AVX512BW is needed for vector QI/HImode,
3463 AVX512VL is needed for 128/256-bit vectors. */
3464 machine_mode inner_mode = GET_MODE_INNER (mode);
3465 int vector_size = GET_MODE_SIZE (mode);
3466 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3467 return false;
3468
3469 return vector_size == 64 || TARGET_AVX512VL;
3470 }
3471
3472 /* Expand an SSE comparison. Return the register with the result. */
3473
3474 static rtx
3475 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3476 rtx op_true, rtx op_false)
3477 {
3478 machine_mode mode = GET_MODE (dest);
3479 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3480
3481 /* In the general case the result of the comparison can differ from the operands' type. */
3482 machine_mode cmp_mode;
3483
3484 /* In AVX512F the result of comparison is an integer mask. */
3485 bool maskcmp = false;
3486 rtx x;
3487
3488 if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
3489 {
3490 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3491 maskcmp = true;
3492 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3493 }
3494 else
3495 cmp_mode = cmp_ops_mode;
3496
3497 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3498
3499 int (*op1_predicate)(rtx, machine_mode)
3500 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3501
3502 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3503 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3504
3505 if (optimize
3506 || (maskcmp && cmp_mode != mode)
3507 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3508 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3509 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3510
3511 if (maskcmp)
3512 {
3513 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3514 gcc_assert (ok);
3515 return dest;
3516 }
3517
3518 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3519
3520 if (cmp_mode != mode && !maskcmp)
3521 {
3522 x = force_reg (cmp_ops_mode, x);
3523 convert_move (dest, x, false);
3524 }
3525 else
3526 emit_insn (gen_rtx_SET (dest, x));
3527
3528 return dest;
3529 }
3530
3531 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3532 operations. This is used for both scalar and vector conditional moves. */
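/* When no blend instruction or mask register applies, the fallback at the
   end of this function computes dest = (op_true & cmp) | (op_false & ~cmp),
   which for V4SFmode comes out as roughly (register names illustrative)

     movaps cmp, tmp
     andps op_true, tmp
     andnps op_false, cmp
     orps tmp, cmp  */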
3533
3534 void
3535 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3536 {
3537 machine_mode mode = GET_MODE (dest);
3538 machine_mode cmpmode = GET_MODE (cmp);
3539
3540 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3541 if (rtx_equal_p (op_true, op_false))
3542 {
3543 emit_move_insn (dest, op_true);
3544 return;
3545 }
3546
3547 /* In AVX512F the result of comparison is an integer mask. */
3548 bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
3549
3550 rtx t2, t3, x;
3551
3552 /* If we have an integer mask and an FP value then we need
3553 to cast the mask to FP mode. */
3554 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3555 {
3556 cmp = force_reg (cmpmode, cmp);
3557 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3558 }
3559
3560 if (maskcmp)
3561 {
3562 /* Using vector move with mask register. */
3563 cmp = force_reg (cmpmode, cmp);
3564 /* Optimize for mask zero. */
3565 op_true = (op_true != CONST0_RTX (mode)
3566 ? force_reg (mode, op_true) : op_true);
3567 op_false = (op_false != CONST0_RTX (mode)
3568 ? force_reg (mode, op_false) : op_false);
3569 if (op_true == CONST0_RTX (mode))
3570 {
3571 rtx (*gen_not) (rtx, rtx);
3572 switch (cmpmode)
3573 {
3574 case E_QImode: gen_not = gen_knotqi; break;
3575 case E_HImode: gen_not = gen_knothi; break;
3576 case E_SImode: gen_not = gen_knotsi; break;
3577 case E_DImode: gen_not = gen_knotdi; break;
3578 default: gcc_unreachable ();
3579 }
3580 rtx n = gen_reg_rtx (cmpmode);
3581 emit_insn (gen_not (n, cmp));
3582 cmp = n;
3583 /* Swap op_true and op_false. */
3584 std::swap (op_true, op_false);
3585 }
3586
3587 rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3588 emit_insn (gen_rtx_SET (dest, vec_merge));
3589 return;
3590 }
3591 else if (vector_all_ones_operand (op_true, mode)
3592 && op_false == CONST0_RTX (mode))
3593 {
3594 emit_insn (gen_rtx_SET (dest, cmp));
3595 return;
3596 }
3597 else if (op_false == CONST0_RTX (mode))
3598 {
3599 op_true = force_reg (mode, op_true);
3600 x = gen_rtx_AND (mode, cmp, op_true);
3601 emit_insn (gen_rtx_SET (dest, x));
3602 return;
3603 }
3604 else if (op_true == CONST0_RTX (mode))
3605 {
3606 op_false = force_reg (mode, op_false);
3607 x = gen_rtx_NOT (mode, cmp);
3608 x = gen_rtx_AND (mode, x, op_false);
3609 emit_insn (gen_rtx_SET (dest, x));
3610 return;
3611 }
3612 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3613 {
3614 op_false = force_reg (mode, op_false);
3615 x = gen_rtx_IOR (mode, cmp, op_false);
3616 emit_insn (gen_rtx_SET (dest, x));
3617 return;
3618 }
3619 else if (TARGET_XOP)
3620 {
3621 op_true = force_reg (mode, op_true);
3622
3623 if (!nonimmediate_operand (op_false, mode))
3624 op_false = force_reg (mode, op_false);
3625
3626 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3627 op_true,
3628 op_false)));
3629 return;
3630 }
3631
3632 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3633 rtx d = dest;
3634
3635 if (!vector_operand (op_true, mode))
3636 op_true = force_reg (mode, op_true);
3637
3638 op_false = force_reg (mode, op_false);
3639
3640 switch (mode)
3641 {
3642 case E_V4SFmode:
3643 if (TARGET_SSE4_1)
3644 gen = gen_sse4_1_blendvps;
3645 break;
3646 case E_V2DFmode:
3647 if (TARGET_SSE4_1)
3648 gen = gen_sse4_1_blendvpd;
3649 break;
3650 case E_SFmode:
3651 if (TARGET_SSE4_1)
3652 {
3653 gen = gen_sse4_1_blendvss;
3654 op_true = force_reg (mode, op_true);
3655 }
3656 break;
3657 case E_DFmode:
3658 if (TARGET_SSE4_1)
3659 {
3660 gen = gen_sse4_1_blendvsd;
3661 op_true = force_reg (mode, op_true);
3662 }
3663 break;
3664 case E_V16QImode:
3665 case E_V8HImode:
3666 case E_V4SImode:
3667 case E_V2DImode:
3668 if (TARGET_SSE4_1)
3669 {
3670 gen = gen_sse4_1_pblendvb;
3671 if (mode != V16QImode)
3672 d = gen_reg_rtx (V16QImode);
3673 op_false = gen_lowpart (V16QImode, op_false);
3674 op_true = gen_lowpart (V16QImode, op_true);
3675 cmp = gen_lowpart (V16QImode, cmp);
3676 }
3677 break;
3678 case E_V8SFmode:
3679 if (TARGET_AVX)
3680 gen = gen_avx_blendvps256;
3681 break;
3682 case E_V4DFmode:
3683 if (TARGET_AVX)
3684 gen = gen_avx_blendvpd256;
3685 break;
3686 case E_V32QImode:
3687 case E_V16HImode:
3688 case E_V8SImode:
3689 case E_V4DImode:
3690 if (TARGET_AVX2)
3691 {
3692 gen = gen_avx2_pblendvb;
3693 if (mode != V32QImode)
3694 d = gen_reg_rtx (V32QImode);
3695 op_false = gen_lowpart (V32QImode, op_false);
3696 op_true = gen_lowpart (V32QImode, op_true);
3697 cmp = gen_lowpart (V32QImode, cmp);
3698 }
3699 break;
3700
3701 case E_V64QImode:
3702 gen = gen_avx512bw_blendmv64qi;
3703 break;
3704 case E_V32HImode:
3705 gen = gen_avx512bw_blendmv32hi;
3706 break;
3707 case E_V16SImode:
3708 gen = gen_avx512f_blendmv16si;
3709 break;
3710 case E_V8DImode:
3711 gen = gen_avx512f_blendmv8di;
3712 break;
3713 case E_V8DFmode:
3714 gen = gen_avx512f_blendmv8df;
3715 break;
3716 case E_V16SFmode:
3717 gen = gen_avx512f_blendmv16sf;
3718 break;
3719
3720 default:
3721 break;
3722 }
3723
3724 if (gen != NULL)
3725 {
3726 emit_insn (gen (d, op_false, op_true, cmp));
3727 if (d != dest)
3728 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3729 }
3730 else
3731 {
3732 op_true = force_reg (mode, op_true);
3733
3734 t2 = gen_reg_rtx (mode);
3735 if (optimize)
3736 t3 = gen_reg_rtx (mode);
3737 else
3738 t3 = dest;
3739
3740 x = gen_rtx_AND (mode, op_true, cmp);
3741 emit_insn (gen_rtx_SET (t2, x));
3742
3743 x = gen_rtx_NOT (mode, cmp);
3744 x = gen_rtx_AND (mode, x, op_false);
3745 emit_insn (gen_rtx_SET (t3, x));
3746
3747 x = gen_rtx_IOR (mode, t3, t2);
3748 emit_insn (gen_rtx_SET (dest, x));
3749 }
3750 }
3751
3752 /* Swap, force into registers, or otherwise massage the two operands
3753 to an SSE comparison with a mask result. Thus we differ a bit from
3754 ix86_prepare_fp_compare_args, which expects to produce a flags result.
3755
3756 The DEST operand exists to help determine whether to commute commutative
3757 operators. The POP0/POP1 operands are updated in place. The new
3758 comparison code is returned, or UNKNOWN if not implementable. */
3759
3760 static enum rtx_code
3761 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3762 rtx *pop0, rtx *pop1)
3763 {
3764 switch (code)
3765 {
3766 case LTGT:
3767 case UNEQ:
3768 /* AVX supports all the needed comparisons. */
3769 if (TARGET_AVX)
3770 break;
3771 /* We have no LTGT as an operator. We could implement it with
3772 NE & ORDERED, but this requires an extra temporary. It's
3773 not clear that it's worth it. */
3774 return UNKNOWN;
3775
3776 case LT:
3777 case LE:
3778 case UNGT:
3779 case UNGE:
3780 /* These are supported directly. */
3781 break;
3782
3783 case EQ:
3784 case NE:
3785 case UNORDERED:
3786 case ORDERED:
3787 /* AVX has 3 operand comparisons, no need to swap anything. */
3788 if (TARGET_AVX)
3789 break;
3790 /* For commutative operators, try to canonicalize the destination
3791 operand to be first in the comparison - this helps reload to
3792 avoid extra moves. */
3793 if (!dest || !rtx_equal_p (dest, *pop1))
3794 break;
3795 /* FALLTHRU */
3796
3797 case GE:
3798 case GT:
3799 case UNLE:
3800 case UNLT:
3801 /* These are not supported directly before AVX, and furthermore
3802 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3803 comparison operands to transform into something that is
3804 supported. */
3805 std::swap (*pop0, *pop1);
3806 code = swap_condition (code);
3807 break;
3808
3809 default:
3810 gcc_unreachable ();
3811 }
3812
3813 return code;
3814 }
3815
3816 /* Expand a floating-point conditional move. Return true if successful. */
3817
3818 bool
3819 ix86_expand_fp_movcc (rtx operands[])
3820 {
3821 machine_mode mode = GET_MODE (operands[0]);
3822 enum rtx_code code = GET_CODE (operands[1]);
3823 rtx tmp, compare_op;
3824 rtx op0 = XEXP (operands[1], 0);
3825 rtx op1 = XEXP (operands[1], 1);
3826
3827 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3828 {
3829 machine_mode cmode;
3830
3831 /* Since we have no cmove for SSE registers, don't force bad register
3832 allocation just to gain access to it. Deny movcc when the
3833 comparison mode doesn't match the move mode. */
3834 cmode = GET_MODE (op0);
3835 if (cmode == VOIDmode)
3836 cmode = GET_MODE (op1);
3837 if (cmode != mode)
3838 return false;
3839
3840 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3841 if (code == UNKNOWN)
3842 return false;
3843
3844 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3845 operands[2], operands[3]))
3846 return true;
3847
3848 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3849 operands[2], operands[3]);
3850 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3851 return true;
3852 }
3853
3854 if (GET_MODE (op0) == TImode
3855 || (GET_MODE (op0) == DImode
3856 && !TARGET_64BIT))
3857 return false;
3858
3859 /* The floating point conditional move instructions don't directly
3860 support conditions resulting from a signed integer comparison. */
3861
3862 compare_op = ix86_expand_compare (code, op0, op1);
3863 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3864 {
3865 tmp = gen_reg_rtx (QImode);
3866 ix86_expand_setcc (tmp, code, op0, op1);
3867
3868 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3869 }
3870
3871 emit_insn (gen_rtx_SET (operands[0],
3872 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3873 operands[2], operands[3])));
3874
3875 return true;
3876 }
3877
3878 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
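/* The returned values follow the vpcmp{b,w,d,q} predicate encoding:
   _MM_CMPINT_EQ (0), _MM_CMPINT_LT (1), _MM_CMPINT_LE (2),
   _MM_CMPINT_NE (4), _MM_CMPINT_NLT (5) and _MM_CMPINT_NLE (6).  */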
3879
3880 static int
3881 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3882 {
3883 switch (code)
3884 {
3885 case EQ:
3886 return 0;
3887 case LT:
3888 case LTU:
3889 return 1;
3890 case LE:
3891 case LEU:
3892 return 2;
3893 case NE:
3894 return 4;
3895 case GE:
3896 case GEU:
3897 return 5;
3898 case GT:
3899 case GTU:
3900 return 6;
3901 default:
3902 gcc_unreachable ();
3903 }
3904 }
3905
3906 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
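/* The returned values follow the vcmpps/vcmppd predicate encoding, e.g.
   0x00 = _CMP_EQ_OQ, 0x01 = _CMP_LT_OS, 0x03 = _CMP_UNORD_Q,
   0x04 = _CMP_NEQ_UQ, 0x0d = _CMP_GE_OS, 0x0e = _CMP_GT_OS and
   0x18 = _CMP_EQ_US.  */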
3907
3908 static int
3909 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3910 {
3911 switch (code)
3912 {
3913 case EQ:
3914 return 0x00;
3915 case NE:
3916 return 0x04;
3917 case GT:
3918 return 0x0e;
3919 case LE:
3920 return 0x02;
3921 case GE:
3922 return 0x0d;
3923 case LT:
3924 return 0x01;
3925 case UNLE:
3926 return 0x0a;
3927 case UNLT:
3928 return 0x09;
3929 case UNGE:
3930 return 0x05;
3931 case UNGT:
3932 return 0x06;
3933 case UNEQ:
3934 return 0x18;
3935 case LTGT:
3936 return 0x0c;
3937 case ORDERED:
3938 return 0x07;
3939 case UNORDERED:
3940 return 0x03;
3941 default:
3942 gcc_unreachable ();
3943 }
3944 }
3945
3946 /* Return immediate value to be used in UNSPEC_PCMP
3947 for comparison CODE in MODE. */
3948
3949 static int
3950 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3951 {
3952 if (FLOAT_MODE_P (mode))
3953 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3954 return ix86_int_cmp_code_to_pcmp_immediate (code);
3955 }
3956
3957 /* Expand AVX-512 vector comparison. */
3958
3959 bool
3960 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
3961 {
3962 machine_mode mask_mode = GET_MODE (dest);
3963 machine_mode cmp_mode = GET_MODE (cmp_op0);
3964 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3965 int unspec_code;
3966 rtx unspec;
3967
3968 switch (code)
3969 {
3970 case LEU:
3971 case GTU:
3972 case GEU:
3973 case LTU:
3974 unspec_code = UNSPEC_UNSIGNED_PCMP;
3975 break;
3976
3977 default:
3978 unspec_code = UNSPEC_PCMP;
3979 }
3980
3981 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
3982 unspec_code);
3983 emit_insn (gen_rtx_SET (dest, unspec));
3984
3985 return true;
3986 }
3987
3988 /* Expand fp vector comparison. */
3989
3990 bool
3991 ix86_expand_fp_vec_cmp (rtx operands[])
3992 {
3993 enum rtx_code code = GET_CODE (operands[1]);
3994 rtx cmp;
3995
3996 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
3997 &operands[2], &operands[3]);
3998 if (code == UNKNOWN)
3999 {
4000 rtx temp;
4001 switch (GET_CODE (operands[1]))
4002 {
4003 case LTGT:
4004 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4005 operands[3], NULL, NULL);
4006 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4007 operands[3], NULL, NULL);
4008 code = AND;
4009 break;
4010 case UNEQ:
4011 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4012 operands[3], NULL, NULL);
4013 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4014 operands[3], NULL, NULL);
4015 code = IOR;
4016 break;
4017 default:
4018 gcc_unreachable ();
4019 }
4020 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4021 OPTAB_DIRECT);
4022 }
4023 else
4024 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4025 operands[1], operands[2]);
4026
4027 if (operands[0] != cmp)
4028 emit_move_insn (operands[0], cmp);
4029
4030 return true;
4031 }
4032
4033 static rtx
4034 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4035 rtx op_true, rtx op_false, bool *negate)
4036 {
4037 machine_mode data_mode = GET_MODE (dest);
4038 machine_mode mode = GET_MODE (cop0);
4039 rtx x;
4040
4041 *negate = false;
4042
4043 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4044 if (TARGET_XOP
4045 && (mode == V16QImode || mode == V8HImode
4046 || mode == V4SImode || mode == V2DImode))
4047 ;
4048 /* AVX512F supports all of the comparisons
4049 on all 128/256/512-bit vector int types. */
4050 else if (ix86_valid_mask_cmp_mode (mode))
4051 ;
4052 else
4053 {
4054 /* Canonicalize the comparison to EQ, GT, GTU. */
4055 switch (code)
4056 {
4057 case EQ:
4058 case GT:
4059 case GTU:
4060 break;
4061
4062 case NE:
4063 case LE:
4064 case LEU:
4065 code = reverse_condition (code);
4066 *negate = true;
4067 break;
4068
4069 case GE:
4070 case GEU:
4071 code = reverse_condition (code);
4072 *negate = true;
4073 /* FALLTHRU */
4074
4075 case LT:
4076 case LTU:
4077 std::swap (cop0, cop1);
4078 code = swap_condition (code);
4079 break;
4080
4081 default:
4082 gcc_unreachable ();
4083 }
4084
4085 /* Only SSE4.1/SSE4.2 support V2DImode. */
4086 if (mode == V2DImode)
4087 {
4088 switch (code)
4089 {
4090 case EQ:
4091 /* SSE4.1 supports EQ. */
4092 if (!TARGET_SSE4_1)
4093 return NULL;
4094 break;
4095
4096 case GT:
4097 case GTU:
4098 /* SSE4.2 supports GT/GTU. */
4099 if (!TARGET_SSE4_2)
4100 return NULL;
4101 break;
4102
4103 default:
4104 gcc_unreachable ();
4105 }
4106 }
4107
4108 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4109 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4110 if (*negate)
4111 std::swap (optrue, opfalse);
4112
4113 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4114 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4115 min (x, y) == x). While we add one instruction (the minimum),
4116 we remove the need for two instructions in the negation, as the
4117 comparison already produces the result in the desired form.
4118 When using masks, do it for SI/DImode element types, as it is shorter
4119 than the two subtractions. */
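      /* An illustrative per-element sketch of the identity used here
         (assuming unsigned elements; the signed case uses smin):

           x <= y           <==>  umin (x, y) == x
           x > y ? 0 : -1   ==    (umin (x, y) == x) ? -1 : 0

         so one PMINU/PMINS plus one PCMPEQ replaces the compare followed
         by a two-instruction negation of the mask.  */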
4120 if ((code != EQ
4121 && GET_MODE_SIZE (mode) != 64
4122 && vector_all_ones_operand (opfalse, data_mode)
4123 && optrue == CONST0_RTX (data_mode))
4124 || (code == GTU
4125 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4126 /* Don't do it if we are not using integer masks and would
4127 already end up with the right values in the registers. */
4128 && (GET_MODE_SIZE (mode) == 64
4129 || !vector_all_ones_operand (optrue, data_mode)
4130 || opfalse != CONST0_RTX (data_mode))))
4131 {
4132 rtx (*gen) (rtx, rtx, rtx) = NULL;
4133
4134 switch (mode)
4135 {
4136 case E_V16SImode:
4137 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4138 break;
4139 case E_V8DImode:
4140 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4141 cop0 = force_reg (mode, cop0);
4142 cop1 = force_reg (mode, cop1);
4143 break;
4144 case E_V32QImode:
4145 if (TARGET_AVX2)
4146 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4147 break;
4148 case E_V16HImode:
4149 if (TARGET_AVX2)
4150 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4151 break;
4152 case E_V8SImode:
4153 if (TARGET_AVX2)
4154 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4155 break;
4156 case E_V4DImode:
4157 if (TARGET_AVX512VL)
4158 {
4159 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4160 cop0 = force_reg (mode, cop0);
4161 cop1 = force_reg (mode, cop1);
4162 }
4163 break;
4164 case E_V16QImode:
4165 if (code == GTU && TARGET_SSE2)
4166 gen = gen_uminv16qi3;
4167 else if (code == GT && TARGET_SSE4_1)
4168 gen = gen_sminv16qi3;
4169 break;
4170 case E_V8HImode:
4171 if (code == GTU && TARGET_SSE4_1)
4172 gen = gen_uminv8hi3;
4173 else if (code == GT && TARGET_SSE2)
4174 gen = gen_sminv8hi3;
4175 break;
4176 case E_V4SImode:
4177 if (TARGET_SSE4_1)
4178 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4179 break;
4180 case E_V2DImode:
4181 if (TARGET_AVX512VL)
4182 {
4183 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4184 cop0 = force_reg (mode, cop0);
4185 cop1 = force_reg (mode, cop1);
4186 }
4187 break;
4188 default:
4189 break;
4190 }
4191
4192 if (gen)
4193 {
4194 rtx tem = gen_reg_rtx (mode);
4195 if (!vector_operand (cop0, mode))
4196 cop0 = force_reg (mode, cop0);
4197 if (!vector_operand (cop1, mode))
4198 cop1 = force_reg (mode, cop1);
4199 *negate = !*negate;
4200 emit_insn (gen (tem, cop0, cop1));
4201 cop1 = tem;
4202 code = EQ;
4203 }
4204 }
4205
4206 /* Unsigned parallel compare is not supported by the hardware.
4207 Play some tricks to turn this into a signed comparison
4208 against 0. */
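      /* Illustrative per-element sketches of the two tricks below, assuming
         32-bit elements for the first and 8/16-bit elements for the second:

           a >u b  <==>  (int) (a - 0x80000000) > (int) (b - 0x80000000)

         i.e. biasing both operands by the sign bit turns the unsigned
         compare into a signed one, and

           a >u b  <==>  SAT_USUB (a, b) != 0

         where SAT_USUB is just notation for the unsigned saturating
         subtraction (PSUBUS); the code below computes the EQ-with-zero
         mask instead and records the extra negation in *NEGATE.  */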
4209 if (code == GTU)
4210 {
4211 cop0 = force_reg (mode, cop0);
4212
4213 switch (mode)
4214 {
4215 case E_V16SImode:
4216 case E_V8DImode:
4217 case E_V8SImode:
4218 case E_V4DImode:
4219 case E_V4SImode:
4220 case E_V2DImode:
4221 {
4222 rtx t1, t2, mask;
4223
4224 /* Subtract (-(INT MAX) - 1) from both operands to make
4225 them signed. */
4226 mask = ix86_build_signbit_mask (mode, true, false);
4227 t1 = gen_reg_rtx (mode);
4228 emit_insn (gen_sub3_insn (t1, cop0, mask));
4229
4230 t2 = gen_reg_rtx (mode);
4231 emit_insn (gen_sub3_insn (t2, cop1, mask));
4232
4233 cop0 = t1;
4234 cop1 = t2;
4235 code = GT;
4236 }
4237 break;
4238
4239 case E_V64QImode:
4240 case E_V32HImode:
4241 case E_V32QImode:
4242 case E_V16HImode:
4243 case E_V16QImode:
4244 case E_V8HImode:
4245 /* Perform a parallel unsigned saturating subtraction. */
4246 x = gen_reg_rtx (mode);
4247 emit_insn (gen_rtx_SET
4248 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4249 cop0 = x;
4250 cop1 = CONST0_RTX (mode);
4251 code = EQ;
4252 *negate = !*negate;
4253 break;
4254
4255 default:
4256 gcc_unreachable ();
4257 }
4258 }
4259 }
4260
4261 if (*negate)
4262 std::swap (op_true, op_false);
4263
4264 /* Allow the comparison to be done in one mode, but the movcc to
4265 happen in another mode. */
4266 if (data_mode == mode)
4267 {
4268 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4269 op_true, op_false);
4270 }
4271 else
4272 {
4273 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4274 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4275 op_true, op_false);
4276 if (GET_MODE (x) == mode)
4277 x = gen_lowpart (data_mode, x);
4278 }
4279
4280 return x;
4281 }
4282
4283 /* Expand integer vector comparison. */
4284
4285 bool
4286 ix86_expand_int_vec_cmp (rtx operands[])
4287 {
4288 rtx_code code = GET_CODE (operands[1]);
4289 bool negate = false;
4290 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4291 operands[3], NULL, NULL, &negate);
4292
4293 if (!cmp)
4294 return false;
4295
4296 if (negate)
4297 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4298 CONST0_RTX (GET_MODE (cmp)),
4299 NULL, NULL, &negate);
4300
4301 gcc_assert (!negate);
4302
4303 if (operands[0] != cmp)
4304 emit_move_insn (operands[0], cmp);
4305
4306 return true;
4307 }
4308
4309 /* Expand a floating-point vector conditional move; a vcond operation
4310 rather than a movcc operation. */
4311
4312 bool
4313 ix86_expand_fp_vcond (rtx operands[])
4314 {
4315 enum rtx_code code = GET_CODE (operands[3]);
4316 rtx cmp;
4317
4318 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4319 &operands[4], &operands[5]);
4320 if (code == UNKNOWN)
4321 {
4322 rtx temp;
4323 switch (GET_CODE (operands[3]))
4324 {
4325 case LTGT:
4326 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4327 operands[5], operands[0], operands[0]);
4328 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4329 operands[5], operands[1], operands[2]);
4330 code = AND;
4331 break;
4332 case UNEQ:
4333 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4334 operands[5], operands[0], operands[0]);
4335 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4336 operands[5], operands[1], operands[2]);
4337 code = IOR;
4338 break;
4339 default:
4340 gcc_unreachable ();
4341 }
4342 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4343 OPTAB_DIRECT);
4344 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4345 return true;
4346 }
4347
4348 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4349 operands[5], operands[1], operands[2]))
4350 return true;
4351
4352 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4353 operands[1], operands[2]);
4354 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4355 return true;
4356 }
4357
4358 /* Expand a signed/unsigned integral vector conditional move. */
4359
4360 bool
4361 ix86_expand_int_vcond (rtx operands[])
4362 {
4363 machine_mode data_mode = GET_MODE (operands[0]);
4364 machine_mode mode = GET_MODE (operands[4]);
4365 enum rtx_code code = GET_CODE (operands[3]);
4366 bool negate = false;
4367 rtx x, cop0, cop1;
4368
4369 cop0 = operands[4];
4370 cop1 = operands[5];
4371
4372 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4373 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
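  /* Illustrative sketch, assuming 32-bit elements:

       x < 0 ? -1 : 0  ==  (int) x >> 31        (arithmetic shift)
       x < 0 ?  1 : 0  ==  (unsigned) x >> 31   (logical shift)

     so when the comparison is against zero and one select arm is zero,
     a single per-element right shift is enough.  */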
4374 if ((code == LT || code == GE)
4375 && data_mode == mode
4376 && cop1 == CONST0_RTX (mode)
4377 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4378 && GET_MODE_UNIT_SIZE (data_mode) > 1
4379 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4380 && (GET_MODE_SIZE (data_mode) == 16
4381 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4382 {
4383 rtx negop = operands[2 - (code == LT)];
4384 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4385 if (negop == CONST1_RTX (data_mode))
4386 {
4387 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4388 operands[0], 1, OPTAB_DIRECT);
4389 if (res != operands[0])
4390 emit_move_insn (operands[0], res);
4391 return true;
4392 }
4393 else if (GET_MODE_INNER (data_mode) != DImode
4394 && vector_all_ones_operand (negop, data_mode))
4395 {
4396 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4397 operands[0], 0, OPTAB_DIRECT);
4398 if (res != operands[0])
4399 emit_move_insn (operands[0], res);
4400 return true;
4401 }
4402 }
4403
4404 if (!nonimmediate_operand (cop1, mode))
4405 cop1 = force_reg (mode, cop1);
4406 if (!general_operand (operands[1], data_mode))
4407 operands[1] = force_reg (data_mode, operands[1]);
4408 if (!general_operand (operands[2], data_mode))
4409 operands[2] = force_reg (data_mode, operands[2]);
4410
4411 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4412 operands[1], operands[2], &negate);
4413
4414 if (!x)
4415 return false;
4416
4417 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4418 operands[2-negate]);
4419 return true;
4420 }
4421
4422 static bool
4423 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4424 struct expand_vec_perm_d *d)
4425 {
4426 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4427 expander, so args are either in d, or in op0, op1 etc. */
4428 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4429 machine_mode maskmode = mode;
4430 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4431
4432 switch (mode)
4433 {
4434 case E_V8HImode:
4435 if (TARGET_AVX512VL && TARGET_AVX512BW)
4436 gen = gen_avx512vl_vpermt2varv8hi3;
4437 break;
4438 case E_V16HImode:
4439 if (TARGET_AVX512VL && TARGET_AVX512BW)
4440 gen = gen_avx512vl_vpermt2varv16hi3;
4441 break;
4442 case E_V64QImode:
4443 if (TARGET_AVX512VBMI)
4444 gen = gen_avx512bw_vpermt2varv64qi3;
4445 break;
4446 case E_V32HImode:
4447 if (TARGET_AVX512BW)
4448 gen = gen_avx512bw_vpermt2varv32hi3;
4449 break;
4450 case E_V4SImode:
4451 if (TARGET_AVX512VL)
4452 gen = gen_avx512vl_vpermt2varv4si3;
4453 break;
4454 case E_V8SImode:
4455 if (TARGET_AVX512VL)
4456 gen = gen_avx512vl_vpermt2varv8si3;
4457 break;
4458 case E_V16SImode:
4459 if (TARGET_AVX512F)
4460 gen = gen_avx512f_vpermt2varv16si3;
4461 break;
4462 case E_V4SFmode:
4463 if (TARGET_AVX512VL)
4464 {
4465 gen = gen_avx512vl_vpermt2varv4sf3;
4466 maskmode = V4SImode;
4467 }
4468 break;
4469 case E_V8SFmode:
4470 if (TARGET_AVX512VL)
4471 {
4472 gen = gen_avx512vl_vpermt2varv8sf3;
4473 maskmode = V8SImode;
4474 }
4475 break;
4476 case E_V16SFmode:
4477 if (TARGET_AVX512F)
4478 {
4479 gen = gen_avx512f_vpermt2varv16sf3;
4480 maskmode = V16SImode;
4481 }
4482 break;
4483 case E_V2DImode:
4484 if (TARGET_AVX512VL)
4485 gen = gen_avx512vl_vpermt2varv2di3;
4486 break;
4487 case E_V4DImode:
4488 if (TARGET_AVX512VL)
4489 gen = gen_avx512vl_vpermt2varv4di3;
4490 break;
4491 case E_V8DImode:
4492 if (TARGET_AVX512F)
4493 gen = gen_avx512f_vpermt2varv8di3;
4494 break;
4495 case E_V2DFmode:
4496 if (TARGET_AVX512VL)
4497 {
4498 gen = gen_avx512vl_vpermt2varv2df3;
4499 maskmode = V2DImode;
4500 }
4501 break;
4502 case E_V4DFmode:
4503 if (TARGET_AVX512VL)
4504 {
4505 gen = gen_avx512vl_vpermt2varv4df3;
4506 maskmode = V4DImode;
4507 }
4508 break;
4509 case E_V8DFmode:
4510 if (TARGET_AVX512F)
4511 {
4512 gen = gen_avx512f_vpermt2varv8df3;
4513 maskmode = V8DImode;
4514 }
4515 break;
4516 default:
4517 break;
4518 }
4519
4520 if (gen == NULL)
4521 return false;
4522
4523 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4524 expander, so args are either in d, or in op0, op1 etc. */
4525 if (d)
4526 {
4527 rtx vec[64];
4528 target = d->target;
4529 op0 = d->op0;
4530 op1 = d->op1;
4531 for (int i = 0; i < d->nelt; ++i)
4532 vec[i] = GEN_INT (d->perm[i]);
4533 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4534 }
4535
4536 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4537 return true;
4538 }
4539
4540 /* Expand a variable vector permutation. */
4541
4542 void
4543 ix86_expand_vec_perm (rtx operands[])
4544 {
4545 rtx target = operands[0];
4546 rtx op0 = operands[1];
4547 rtx op1 = operands[2];
4548 rtx mask = operands[3];
4549 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4550 machine_mode mode = GET_MODE (op0);
4551 machine_mode maskmode = GET_MODE (mask);
4552 int w, e, i;
4553 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4554
4555 /* Number of elements in the vector. */
4556 w = GET_MODE_NUNITS (mode);
4557 e = GET_MODE_UNIT_SIZE (mode);
4558 gcc_assert (w <= 64);
4559
4560 if (TARGET_AVX512F && one_operand_shuffle)
4561 {
4562 rtx (*gen) (rtx, rtx, rtx) = NULL;
4563 switch (mode)
4564 {
4565 case E_V16SImode:
4566 gen = gen_avx512f_permvarv16si;
4567 break;
4568 case E_V16SFmode:
4569 gen = gen_avx512f_permvarv16sf;
4570 break;
4571 case E_V8DImode:
4572 gen = gen_avx512f_permvarv8di;
4573 break;
4574 case E_V8DFmode:
4575 gen = gen_avx512f_permvarv8df;
4576 break;
4577 default:
4578 break;
4579 }
4580 if (gen != NULL)
4581 {
4582 emit_insn (gen (target, op0, mask));
4583 return;
4584 }
4585 }
4586
4587 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4588 return;
4589
4590 if (TARGET_AVX2)
4591 {
4592 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4593 {
4594 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4595 a constant shuffle operand. With a tiny bit of effort we can
4596 use VPERMD instead. A re-interpretation stall for V4DFmode is
4597 unfortunate but there's no avoiding it.
4598 Similarly for V16HImode we don't have instructions for variable
4599 shuffling, while for V32QImode we can use vpshufb; vpshufb;
4600 vpermq; vpor after preparing suitable masks. */
4601
4602 if (mode == V16HImode)
4603 {
4604 maskmode = mode = V32QImode;
4605 w = 32;
4606 e = 1;
4607 }
4608 else
4609 {
4610 maskmode = mode = V8SImode;
4611 w = 8;
4612 e = 4;
4613 }
4614 t1 = gen_reg_rtx (maskmode);
4615
4616 /* Replicate the low bits of the V4DImode mask into V8SImode:
4617 mask = { A B C D }
4618 t1 = { A A B B C C D D }. */
4619 for (i = 0; i < w / 2; ++i)
4620 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4621 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4622 vt = force_reg (maskmode, vt);
4623 mask = gen_lowpart (maskmode, mask);
4624 if (maskmode == V8SImode)
4625 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4626 else
4627 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4628
4629 /* Multiply the shuffle indices by two. */
4630 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4631 OPTAB_DIRECT);
4632
4633 /* Add one to the odd shuffle indices:
4634 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4635 for (i = 0; i < w / 2; ++i)
4636 {
4637 vec[i * 2] = const0_rtx;
4638 vec[i * 2 + 1] = const1_rtx;
4639 }
4640 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4641 vt = validize_mem (force_const_mem (maskmode, vt));
4642 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4643 OPTAB_DIRECT);
4644
4645 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4646 operands[3] = mask = t1;
4647 target = gen_reg_rtx (mode);
4648 op0 = gen_lowpart (mode, op0);
4649 op1 = gen_lowpart (mode, op1);
4650 }
4651
4652 switch (mode)
4653 {
4654 case E_V8SImode:
4655 /* The VPERMD and VPERMPS instructions already properly ignore
4656 the high bits of the shuffle elements. No need for us to
4657 perform an AND ourselves. */
4658 if (one_operand_shuffle)
4659 {
4660 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4661 if (target != operands[0])
4662 emit_move_insn (operands[0],
4663 gen_lowpart (GET_MODE (operands[0]), target));
4664 }
4665 else
4666 {
4667 t1 = gen_reg_rtx (V8SImode);
4668 t2 = gen_reg_rtx (V8SImode);
4669 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4670 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4671 goto merge_two;
4672 }
4673 return;
4674
4675 case E_V8SFmode:
4676 mask = gen_lowpart (V8SImode, mask);
4677 if (one_operand_shuffle)
4678 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4679 else
4680 {
4681 t1 = gen_reg_rtx (V8SFmode);
4682 t2 = gen_reg_rtx (V8SFmode);
4683 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4684 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4685 goto merge_two;
4686 }
4687 return;
4688
4689 case E_V4SImode:
4690 /* By combining the two 128-bit input vectors into one 256-bit
4691 input vector, we can use VPERMD and VPERMPS for the full
4692 two-operand shuffle. */
4693 t1 = gen_reg_rtx (V8SImode);
4694 t2 = gen_reg_rtx (V8SImode);
4695 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4696 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4697 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4698 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4699 return;
4700
4701 case E_V4SFmode:
4702 t1 = gen_reg_rtx (V8SFmode);
4703 t2 = gen_reg_rtx (V8SImode);
4704 mask = gen_lowpart (V4SImode, mask);
4705 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4706 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4707 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4708 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4709 return;
4710
4711 case E_V32QImode:
4712 t1 = gen_reg_rtx (V32QImode);
4713 t2 = gen_reg_rtx (V32QImode);
4714 t3 = gen_reg_rtx (V32QImode);
4715 vt2 = GEN_INT (-128);
4716 vt = gen_const_vec_duplicate (V32QImode, vt2);
4717 vt = force_reg (V32QImode, vt);
4718 for (i = 0; i < 32; i++)
4719 vec[i] = i < 16 ? vt2 : const0_rtx;
4720 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4721 vt2 = force_reg (V32QImode, vt2);
4722 /* From mask create two adjusted masks, which contain the same
4723 bits as mask in the low 7 bits of each vector element.
4724 The first mask will have the most significant bit clear
4725 if it requests an element from the same 128-bit lane
4726 and the MSB set if it requests an element from the other 128-bit lane.
4727 The second mask will have the opposite values of the MSB,
4728 and additionally will have its 128-bit lanes swapped.
4729 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4730 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4731 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4732 stands for the other 12 bytes. */
4733 /* The bit that says whether an element is from the same lane or the
4734 other lane is bit 4, so shift it up by 3 to the MSB position. */
4735 t5 = gen_reg_rtx (V4DImode);
4736 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4737 GEN_INT (3)));
4738 /* Clear MSB bits from the mask just in case it had them set. */
4739 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4740 /* After this t1 will have MSB set for elements from other lane. */
4741 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4742 /* Clear bits other than MSB. */
4743 emit_insn (gen_andv32qi3 (t1, t1, vt));
4744 /* Or in the lower bits from mask into t3. */
4745 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4746 /* And invert MSB bits in t1, so MSB is set for elements from the same
4747 lane. */
4748 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4749 /* Swap 128-bit lanes in t3. */
4750 t6 = gen_reg_rtx (V4DImode);
4751 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4752 const2_rtx, GEN_INT (3),
4753 const0_rtx, const1_rtx));
4754 /* And or in the lower bits from mask into t1. */
4755 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4756 if (one_operand_shuffle)
4757 {
4758 /* Each of these shuffles will put 0s in places where an
4759 element from the other 128-bit lane is needed; otherwise it
4760 will shuffle in the requested value. */
4761 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4762 gen_lowpart (V32QImode, t6)));
4763 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4764 /* For t3 the 128-bit lanes are swapped again. */
4765 t7 = gen_reg_rtx (V4DImode);
4766 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4767 const2_rtx, GEN_INT (3),
4768 const0_rtx, const1_rtx));
4769 /* And oring both together leads to the result. */
4770 emit_insn (gen_iorv32qi3 (target, t1,
4771 gen_lowpart (V32QImode, t7)));
4772 if (target != operands[0])
4773 emit_move_insn (operands[0],
4774 gen_lowpart (GET_MODE (operands[0]), target));
4775 return;
4776 }
4777
4778 t4 = gen_reg_rtx (V32QImode);
4779 /* Similar to the one_operand_shuffle code above, just
4780 repeated twice, once for each operand. The merge_two:
4781 code will merge the two results together. */
4782 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4783 gen_lowpart (V32QImode, t6)));
4784 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4785 gen_lowpart (V32QImode, t6)));
4786 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4787 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4788 t7 = gen_reg_rtx (V4DImode);
4789 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4790 const2_rtx, GEN_INT (3),
4791 const0_rtx, const1_rtx));
4792 t8 = gen_reg_rtx (V4DImode);
4793 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4794 const2_rtx, GEN_INT (3),
4795 const0_rtx, const1_rtx));
4796 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4797 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4798 t1 = t4;
4799 t2 = t3;
4800 goto merge_two;
4801
4802 default:
4803 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4804 break;
4805 }
4806 }
4807
4808 if (TARGET_XOP)
4809 {
4810 /* The XOP VPPERM insn supports three inputs. By ignoring the
4811 one_operand_shuffle special case, we avoid creating another
4812 set of constant vectors in memory. */
4813 one_operand_shuffle = false;
4814
4815 /* mask = mask & {2*w-1, ...} */
4816 vt = GEN_INT (2*w - 1);
4817 }
4818 else
4819 {
4820 /* mask = mask & {w-1, ...} */
4821 vt = GEN_INT (w - 1);
4822 }
4823
4824 vt = gen_const_vec_duplicate (maskmode, vt);
4825 mask = expand_simple_binop (maskmode, AND, mask, vt,
4826 NULL_RTX, 0, OPTAB_DIRECT);
4827
4828 /* For non-QImode operations, convert the word permutation control
4829 into a byte permutation control. */
4830 if (mode != V16QImode)
4831 {
4832 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4833 GEN_INT (exact_log2 (e)),
4834 NULL_RTX, 0, OPTAB_DIRECT);
4835
4836 /* Convert mask to vector of chars. */
4837 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4838
4839 /* Replicate each of the input bytes into byte positions:
4840 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4841 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4842 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4843 for (i = 0; i < 16; ++i)
4844 vec[i] = GEN_INT (i/e * e);
4845 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4846 vt = validize_mem (force_const_mem (V16QImode, vt));
4847 if (TARGET_XOP)
4848 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4849 else
4850 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4851
4852 /* Convert it into the byte positions by doing
4853 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4854 for (i = 0; i < 16; ++i)
4855 vec[i] = GEN_INT (i % e);
4856 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4857 vt = validize_mem (force_const_mem (V16QImode, vt));
4858 emit_insn (gen_addv16qi3 (mask, mask, vt));
4859 }
4860
4861 /* The actual shuffle operations all operate on V16QImode. */
4862 op0 = gen_lowpart (V16QImode, op0);
4863 op1 = gen_lowpart (V16QImode, op1);
4864
4865 if (TARGET_XOP)
4866 {
4867 if (GET_MODE (target) != V16QImode)
4868 target = gen_reg_rtx (V16QImode);
4869 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4870 if (target != operands[0])
4871 emit_move_insn (operands[0],
4872 gen_lowpart (GET_MODE (operands[0]), target));
4873 }
4874 else if (one_operand_shuffle)
4875 {
4876 if (GET_MODE (target) != V16QImode)
4877 target = gen_reg_rtx (V16QImode);
4878 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4879 if (target != operands[0])
4880 emit_move_insn (operands[0],
4881 gen_lowpart (GET_MODE (operands[0]), target));
4882 }
4883 else
4884 {
4885 rtx xops[6];
4886 bool ok;
4887
4888 /* Shuffle the two input vectors independently. */
4889 t1 = gen_reg_rtx (V16QImode);
4890 t2 = gen_reg_rtx (V16QImode);
4891 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4892 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4893
4894 merge_two:
4895 /* Then merge them together. The key is whether any given control
4896 element contained a bit set that indicates the second word. */
4897 mask = operands[3];
4898 vt = GEN_INT (w);
4899 if (maskmode == V2DImode && !TARGET_SSE4_1)
4900 {
4901 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4902 more shuffle to convert the V2DI input mask into a V4SI
4903 input mask. At that point the masking that expand_int_vcond
4904 performs will work as desired. */
4905 rtx t3 = gen_reg_rtx (V4SImode);
4906 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4907 const0_rtx, const0_rtx,
4908 const2_rtx, const2_rtx));
4909 mask = t3;
4910 maskmode = V4SImode;
4911 e = w = 4;
4912 }
4913
4914 vt = gen_const_vec_duplicate (maskmode, vt);
4915 vt = force_reg (maskmode, vt);
4916 mask = expand_simple_binop (maskmode, AND, mask, vt,
4917 NULL_RTX, 0, OPTAB_DIRECT);
4918
4919 if (GET_MODE (target) != mode)
4920 target = gen_reg_rtx (mode);
4921 xops[0] = target;
4922 xops[1] = gen_lowpart (mode, t2);
4923 xops[2] = gen_lowpart (mode, t1);
4924 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4925 xops[4] = mask;
4926 xops[5] = vt;
4927 ok = ix86_expand_int_vcond (xops);
4928 gcc_assert (ok);
4929 if (target != operands[0])
4930 emit_move_insn (operands[0],
4931 gen_lowpart (GET_MODE (operands[0]), target));
4932 }
4933 }
4934
4935 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
4936 true if we should do zero extension, else sign extension. HIGH_P is
4937 true if we want the N/2 high elements, else the low elements. */
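/* For example (illustrative only), with SRC in V16QImode = { b0, ..., b15 }:

     HIGH_P false:  DEST (V8HImode) = { ext (b0), ..., ext (b7) }
     HIGH_P true:   DEST (V8HImode) = { ext (b8), ..., ext (b15) }

   where ext is zero extension if UNSIGNED_P, else sign extension.  With
   SSE4.1 this maps to PMOVZX/PMOVSX (after moving the high half down);
   without it, interleaving SRC with either a zero vector or a computed
   sign mask produces the same result.  */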
4938
4939 void
4940 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4941 {
4942 machine_mode imode = GET_MODE (src);
4943 rtx tmp;
4944
4945 if (TARGET_SSE4_1)
4946 {
4947 rtx (*unpack)(rtx, rtx);
4948 rtx (*extract)(rtx, rtx) = NULL;
4949 machine_mode halfmode = BLKmode;
4950
4951 switch (imode)
4952 {
4953 case E_V64QImode:
4954 if (unsigned_p)
4955 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4956 else
4957 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4958 halfmode = V32QImode;
4959 extract
4960 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4961 break;
4962 case E_V32QImode:
4963 if (unsigned_p)
4964 unpack = gen_avx2_zero_extendv16qiv16hi2;
4965 else
4966 unpack = gen_avx2_sign_extendv16qiv16hi2;
4967 halfmode = V16QImode;
4968 extract
4969 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4970 break;
4971 case E_V32HImode:
4972 if (unsigned_p)
4973 unpack = gen_avx512f_zero_extendv16hiv16si2;
4974 else
4975 unpack = gen_avx512f_sign_extendv16hiv16si2;
4976 halfmode = V16HImode;
4977 extract
4978 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4979 break;
4980 case E_V16HImode:
4981 if (unsigned_p)
4982 unpack = gen_avx2_zero_extendv8hiv8si2;
4983 else
4984 unpack = gen_avx2_sign_extendv8hiv8si2;
4985 halfmode = V8HImode;
4986 extract
4987 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
4988 break;
4989 case E_V16SImode:
4990 if (unsigned_p)
4991 unpack = gen_avx512f_zero_extendv8siv8di2;
4992 else
4993 unpack = gen_avx512f_sign_extendv8siv8di2;
4994 halfmode = V8SImode;
4995 extract
4996 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
4997 break;
4998 case E_V8SImode:
4999 if (unsigned_p)
5000 unpack = gen_avx2_zero_extendv4siv4di2;
5001 else
5002 unpack = gen_avx2_sign_extendv4siv4di2;
5003 halfmode = V4SImode;
5004 extract
5005 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5006 break;
5007 case E_V16QImode:
5008 if (unsigned_p)
5009 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5010 else
5011 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5012 break;
5013 case E_V8HImode:
5014 if (unsigned_p)
5015 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5016 else
5017 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5018 break;
5019 case E_V4SImode:
5020 if (unsigned_p)
5021 unpack = gen_sse4_1_zero_extendv2siv2di2;
5022 else
5023 unpack = gen_sse4_1_sign_extendv2siv2di2;
5024 break;
5025 default:
5026 gcc_unreachable ();
5027 }
5028
5029 if (GET_MODE_SIZE (imode) >= 32)
5030 {
5031 tmp = gen_reg_rtx (halfmode);
5032 emit_insn (extract (tmp, src));
5033 }
5034 else if (high_p)
5035 {
5036 /* Shift higher 8 bytes to lower 8 bytes. */
5037 tmp = gen_reg_rtx (V1TImode);
5038 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5039 GEN_INT (64)));
5040 tmp = gen_lowpart (imode, tmp);
5041 }
5042 else
5043 tmp = src;
5044
5045 emit_insn (unpack (dest, tmp));
5046 }
5047 else
5048 {
5049 rtx (*unpack)(rtx, rtx, rtx);
5050
5051 switch (imode)
5052 {
5053 case E_V16QImode:
5054 if (high_p)
5055 unpack = gen_vec_interleave_highv16qi;
5056 else
5057 unpack = gen_vec_interleave_lowv16qi;
5058 break;
5059 case E_V8HImode:
5060 if (high_p)
5061 unpack = gen_vec_interleave_highv8hi;
5062 else
5063 unpack = gen_vec_interleave_lowv8hi;
5064 break;
5065 case E_V4SImode:
5066 if (high_p)
5067 unpack = gen_vec_interleave_highv4si;
5068 else
5069 unpack = gen_vec_interleave_lowv4si;
5070 break;
5071 default:
5072 gcc_unreachable ();
5073 }
5074
5075 if (unsigned_p)
5076 tmp = force_reg (imode, CONST0_RTX (imode));
5077 else
5078 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5079 src, pc_rtx, pc_rtx);
5080
5081 rtx tmp2 = gen_reg_rtx (imode);
5082 emit_insn (unpack (tmp2, src, tmp));
5083 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5084 }
5085 }
5086
5087 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5088 but works for floating point parameters and non-offsettable memories.
5089 For pushes, it returns just stack offsets; the values will be saved
5090 in the right order. Maximally four parts are generated. */
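/* For example (illustrative only): with !TARGET_64BIT a DFmode constant
   such as 1.5 comes back as two SImode immediates holding its low and
   high words, an XFmode value as three SImode parts and a TFmode value
   as four; with TARGET_64BIT an XFmode register comes back as a DImode
   part plus an SImode upper part.  */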
5091
5092 static int
5093 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5094 {
5095 int size;
5096
5097 if (!TARGET_64BIT)
5098 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5099 else
5100 size = (GET_MODE_SIZE (mode) + 4) / 8;
5101
5102 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5103 gcc_assert (size >= 2 && size <= 4);
5104
5105 /* Optimize constant pool reference to immediates. This is used by fp
5106 moves, which force all constants to memory to allow combining. */
5107 if (MEM_P (operand) && MEM_READONLY_P (operand))
5108 operand = avoid_constant_pool_reference (operand);
5109
5110 if (MEM_P (operand) && !offsettable_memref_p (operand))
5111 {
5112 /* The only non-offsettable memories we handle are pushes. */
5113 int ok = push_operand (operand, VOIDmode);
5114
5115 gcc_assert (ok);
5116
5117 operand = copy_rtx (operand);
5118 PUT_MODE (operand, word_mode);
5119 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5120 return size;
5121 }
5122
5123 if (GET_CODE (operand) == CONST_VECTOR)
5124 {
5125 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5126 /* Caution: if we looked through a constant pool memory above,
5127 the operand may actually have a different mode now. That's
5128 ok, since we want to pun this all the way back to an integer. */
5129 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5130 gcc_assert (operand != NULL);
5131 mode = imode;
5132 }
5133
5134 if (!TARGET_64BIT)
5135 {
5136 if (mode == DImode)
5137 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5138 else
5139 {
5140 int i;
5141
5142 if (REG_P (operand))
5143 {
5144 gcc_assert (reload_completed);
5145 for (i = 0; i < size; i++)
5146 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5147 }
5148 else if (offsettable_memref_p (operand))
5149 {
5150 operand = adjust_address (operand, SImode, 0);
5151 parts[0] = operand;
5152 for (i = 1; i < size; i++)
5153 parts[i] = adjust_address (operand, SImode, 4 * i);
5154 }
5155 else if (CONST_DOUBLE_P (operand))
5156 {
5157 const REAL_VALUE_TYPE *r;
5158 long l[4];
5159
5160 r = CONST_DOUBLE_REAL_VALUE (operand);
5161 switch (mode)
5162 {
5163 case E_TFmode:
5164 real_to_target (l, r, mode);
5165 parts[3] = gen_int_mode (l[3], SImode);
5166 parts[2] = gen_int_mode (l[2], SImode);
5167 break;
5168 case E_XFmode:
5169 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5170 long double may not be 80-bit. */
5171 real_to_target (l, r, mode);
5172 parts[2] = gen_int_mode (l[2], SImode);
5173 break;
5174 case E_DFmode:
5175 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5176 break;
5177 default:
5178 gcc_unreachable ();
5179 }
5180 parts[1] = gen_int_mode (l[1], SImode);
5181 parts[0] = gen_int_mode (l[0], SImode);
5182 }
5183 else
5184 gcc_unreachable ();
5185 }
5186 }
5187 else
5188 {
5189 if (mode == TImode)
5190 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5191 if (mode == XFmode || mode == TFmode)
5192 {
5193 machine_mode upper_mode = mode == XFmode ? SImode : DImode;
5194 if (REG_P (operand))
5195 {
5196 gcc_assert (reload_completed);
5197 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5198 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5199 }
5200 else if (offsettable_memref_p (operand))
5201 {
5202 operand = adjust_address (operand, DImode, 0);
5203 parts[0] = operand;
5204 parts[1] = adjust_address (operand, upper_mode, 8);
5205 }
5206 else if (CONST_DOUBLE_P (operand))
5207 {
5208 long l[4];
5209
5210 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5211
5212 /* real_to_target puts 32-bit pieces in each long. */
5213 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5214 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5215 << 32), DImode);
5216
5217 if (upper_mode == SImode)
5218 parts[1] = gen_int_mode (l[2], SImode);
5219 else
5220 parts[1]
5221 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5222 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5223 << 32), DImode);
5224 }
5225 else
5226 gcc_unreachable ();
5227 }
5228 }
5229
5230 return size;
5231 }
5232
5233 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5234 All required insns are emitted here; the operands array is used as
5235 scratch storage, with operands 2-5 holding the destination parts
5236 and operands 6-9 the source parts, in the correct copy order. */
5237
5238 void
5239 ix86_split_long_move (rtx operands[])
5240 {
5241 rtx part[2][4];
5242 int nparts, i, j;
5243 int push = 0;
5244 int collisions = 0;
5245 machine_mode mode = GET_MODE (operands[0]);
5246 bool collisionparts[4];
5247
5248 /* The DFmode expanders may ask us to move a double.
5249 For a 64-bit target this is a single move. By hiding that fact
5250 here we simplify the i386.md splitters. */
5251 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5252 {
5253 /* Optimize constant pool reference to immediates. This is used by
5254 fp moves, which force all constants to memory to allow combining. */
5255
5256 if (MEM_P (operands[1])
5257 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5258 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5259 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5260 if (push_operand (operands[0], VOIDmode))
5261 {
5262 operands[0] = copy_rtx (operands[0]);
5263 PUT_MODE (operands[0], word_mode);
5264 }
5265 else
5266 operands[0] = gen_lowpart (DImode, operands[0]);
5267 operands[1] = gen_lowpart (DImode, operands[1]);
5268 emit_move_insn (operands[0], operands[1]);
5269 return;
5270 }
5271
5272 /* The only non-offsettable memory we handle is push. */
5273 if (push_operand (operands[0], VOIDmode))
5274 push = 1;
5275 else
5276 gcc_assert (!MEM_P (operands[0])
5277 || offsettable_memref_p (operands[0]));
5278
5279 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5280 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5281
5282 /* When emitting push, take care for source operands on the stack. */
5283 if (push && MEM_P (operands[1])
5284 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5285 {
5286 rtx src_base = XEXP (part[1][nparts - 1], 0);
5287
5288 /* Compensate for the stack decrement by 4. */
5289 if (!TARGET_64BIT && nparts == 3
5290 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5291 src_base = plus_constant (Pmode, src_base, 4);
5292
5293 /* src_base refers to the stack pointer and is
5294 automatically decreased by emitted push. */
5295 for (i = 0; i < nparts; i++)
5296 part[1][i] = change_address (part[1][i],
5297 GET_MODE (part[1][i]), src_base);
5298 }
5299
5300 /* We need to do the copy in the right order in case an address register
5301 of the source overlaps the destination. */
5302 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5303 {
5304 rtx tmp;
5305
5306 for (i = 0; i < nparts; i++)
5307 {
5308 collisionparts[i]
5309 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5310 if (collisionparts[i])
5311 collisions++;
5312 }
5313
5314 /* Collision in the middle part can be handled by reordering. */
5315 if (collisions == 1 && nparts == 3 && collisionparts [1])
5316 {
5317 std::swap (part[0][1], part[0][2]);
5318 std::swap (part[1][1], part[1][2]);
5319 }
5320 else if (collisions == 1
5321 && nparts == 4
5322 && (collisionparts [1] || collisionparts [2]))
5323 {
5324 if (collisionparts [1])
5325 {
5326 std::swap (part[0][1], part[0][2]);
5327 std::swap (part[1][1], part[1][2]);
5328 }
5329 else
5330 {
5331 std::swap (part[0][2], part[0][3]);
5332 std::swap (part[1][2], part[1][3]);
5333 }
5334 }
5335
5336 /* If there are more collisions, we can't handle it by reordering.
5337 Do an lea to the last part and use only one colliding move. */
5338 else if (collisions > 1)
5339 {
5340 rtx base, addr;
5341
5342 collisions = 1;
5343
5344 base = part[0][nparts - 1];
5345
5346 /* Handle the case when the last part isn't valid for lea.
5347 Happens in 64-bit mode storing the 12-byte XFmode. */
5348 if (GET_MODE (base) != Pmode)
5349 base = gen_rtx_REG (Pmode, REGNO (base));
5350
5351 addr = XEXP (part[1][0], 0);
5352 if (TARGET_TLS_DIRECT_SEG_REFS)
5353 {
5354 struct ix86_address parts;
5355 int ok = ix86_decompose_address (addr, &parts);
5356 gcc_assert (ok);
5357 /* It is not valid to use %gs: or %fs: in lea. */
5358 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5359 }
5360 emit_insn (gen_rtx_SET (base, addr));
5361 part[1][0] = replace_equiv_address (part[1][0], base);
5362 for (i = 1; i < nparts; i++)
5363 {
5364 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5365 part[1][i] = replace_equiv_address (part[1][i], tmp);
5366 }
5367 }
5368 }
5369
5370 if (push)
5371 {
5372 if (!TARGET_64BIT)
5373 {
5374 if (nparts == 3)
5375 {
5376 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5377 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5378 emit_move_insn (part[0][2], part[1][2]);
5379 }
5380 else if (nparts == 4)
5381 {
5382 emit_move_insn (part[0][3], part[1][3]);
5383 emit_move_insn (part[0][2], part[1][2]);
5384 }
5385 }
5386 else
5387 {
5388 /* In 64-bit mode we don't have a 32-bit push available. If this is a
5389 register, it is OK - we will just use the larger counterpart. We also
5390 retype memory - these cases come from an attempt to avoid a REX prefix
5391 on moving the second half of a TFmode value. */
5392 if (GET_MODE (part[1][1]) == SImode)
5393 {
5394 switch (GET_CODE (part[1][1]))
5395 {
5396 case MEM:
5397 part[1][1] = adjust_address (part[1][1], DImode, 0);
5398 break;
5399
5400 case REG:
5401 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5402 break;
5403
5404 default:
5405 gcc_unreachable ();
5406 }
5407
5408 if (GET_MODE (part[1][0]) == SImode)
5409 part[1][0] = part[1][1];
5410 }
5411 }
5412 emit_move_insn (part[0][1], part[1][1]);
5413 emit_move_insn (part[0][0], part[1][0]);
5414 return;
5415 }
5416
5417 /* Choose correct order to not overwrite the source before it is copied. */
5418 if ((REG_P (part[0][0])
5419 && REG_P (part[1][1])
5420 && (REGNO (part[0][0]) == REGNO (part[1][1])
5421 || (nparts == 3
5422 && REGNO (part[0][0]) == REGNO (part[1][2]))
5423 || (nparts == 4
5424 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5425 || (collisions > 0
5426 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5427 {
5428 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5429 {
5430 operands[2 + i] = part[0][j];
5431 operands[6 + i] = part[1][j];
5432 }
5433 }
5434 else
5435 {
5436 for (i = 0; i < nparts; i++)
5437 {
5438 operands[2 + i] = part[0][i];
5439 operands[6 + i] = part[1][i];
5440 }
5441 }
5442
5443 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5444 if (optimize_insn_for_size_p ())
5445 {
5446 for (j = 0; j < nparts - 1; j++)
5447 if (CONST_INT_P (operands[6 + j])
5448 && operands[6 + j] != const0_rtx
5449 && REG_P (operands[2 + j]))
5450 for (i = j; i < nparts - 1; i++)
5451 if (CONST_INT_P (operands[7 + i])
5452 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5453 operands[7 + i] = operands[2 + j];
5454 }
5455
5456 for (i = 0; i < nparts; i++)
5457 emit_move_insn (operands[2 + i], operands[6 + i]);
5458
5459 return;
5460 }
5461
5462 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5463 left shift by a constant, either using a single shift or
5464 a sequence of add instructions. */
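/* For example (illustrative only): with COUNT == 2 and OPERAND in %eax
   this emits

     addl %eax, %eax
     addl %eax, %eax

   rather than "sall $2, %eax", provided COUNT * add cost does not exceed
   the constant-shift cost and we are not optimizing for size.  */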
5465
5466 static void
5467 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5468 {
5469 if (count == 1
5470 || (count * ix86_cost->add <= ix86_cost->shift_const
5471 && !optimize_insn_for_size_p ()))
5472 {
5473 while (count-- > 0)
5474 emit_insn (gen_add2_insn (operand, operand));
5475 }
5476 else
5477 {
5478 rtx (*insn)(rtx, rtx, rtx);
5479
5480 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5481 emit_insn (insn (operand, operand, GEN_INT (count)));
5482 }
5483 }
5484
5485 void
5486 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5487 {
5488 rtx (*gen_ashl3)(rtx, rtx, rtx);
5489 rtx (*gen_shld)(rtx, rtx, rtx);
5490 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5491 machine_mode half_mode;
5492
5493 rtx low[2], high[2];
5494 int count;
5495
5496 if (CONST_INT_P (operands[2]))
5497 {
5498 split_double_mode (mode, operands, 2, low, high);
5499 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5500
5501 if (count >= half_width)
5502 {
5503 emit_move_insn (high[0], low[1]);
5504 emit_move_insn (low[0], const0_rtx);
5505
5506 if (count > half_width)
5507 ix86_expand_ashl_const (high[0], count - half_width, mode);
5508 }
5509 else
5510 {
5511 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5512
5513 if (!rtx_equal_p (operands[0], operands[1]))
5514 emit_move_insn (operands[0], operands[1]);
5515
5516 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5517 ix86_expand_ashl_const (low[0], count, mode);
5518 }
5519 return;
5520 }
5521
5522 split_double_mode (mode, operands, 1, low, high);
5523 half_mode = mode == DImode ? SImode : DImode;
5524
5525 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5526
5527 if (operands[1] == const1_rtx)
5528 {
5529 /* Assuming we've chosen QImode-capable registers, then 1 << N
5530 can be done with two 32/64-bit shifts, no branches, no cmoves. */
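      /* An illustrative sketch for DImode on a 32-bit target, with
         N = operands[2]:

           low  = (N & 32) == 0;   via setcc on ZF from the test below
           high = (N & 32) != 0;
           low  <<= N;             SImode shifts mask the count to N % 32
           high <<= N;

         so the result is { 1 << N, 0 } for N < 32 and { 0, 1 << (N - 32) }
         for N >= 32, with no branches or cmoves.  */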
5531 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5532 {
5533 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5534
5535 ix86_expand_clear (low[0]);
5536 ix86_expand_clear (high[0]);
5537 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5538
5539 d = gen_lowpart (QImode, low[0]);
5540 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5541 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5542 emit_insn (gen_rtx_SET (d, s));
5543
5544 d = gen_lowpart (QImode, high[0]);
5545 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5546 s = gen_rtx_NE (QImode, flags, const0_rtx);
5547 emit_insn (gen_rtx_SET (d, s));
5548 }
5549
5550 /* Otherwise, we can get the same results by manually performing
5551 a bit extract operation on bit 5/6, and then performing the two
5552 shifts. The two methods of getting 0/1 into low/high are exactly
5553 the same size. Avoiding the shift in the bit extract case helps
5554 pentium4 a bit; no one else seems to care much either way. */
5555 else
5556 {
5557 rtx (*gen_lshr3)(rtx, rtx, rtx);
5558 rtx (*gen_and3)(rtx, rtx, rtx);
5559 rtx (*gen_xor3)(rtx, rtx, rtx);
5560 HOST_WIDE_INT bits;
5561 rtx x;
5562
5563 if (mode == DImode)
5564 {
5565 gen_lshr3 = gen_lshrsi3;
5566 gen_and3 = gen_andsi3;
5567 gen_xor3 = gen_xorsi3;
5568 bits = 5;
5569 }
5570 else
5571 {
5572 gen_lshr3 = gen_lshrdi3;
5573 gen_and3 = gen_anddi3;
5574 gen_xor3 = gen_xordi3;
5575 bits = 6;
5576 }
5577
5578 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5579 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5580 else
5581 x = gen_lowpart (half_mode, operands[2]);
5582 emit_insn (gen_rtx_SET (high[0], x));
5583
5584 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5585 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5586 emit_move_insn (low[0], high[0]);
5587 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5588 }
5589
5590 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5591 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5592 return;
5593 }
5594
5595 if (operands[1] == constm1_rtx)
5596 {
5597 /* For -1 << N, we can avoid the shld instruction, because we
5598 know that we're shifting 0...31/63 ones into a -1. */
5599 emit_move_insn (low[0], constm1_rtx);
5600 if (optimize_insn_for_size_p ())
5601 emit_move_insn (high[0], low[0]);
5602 else
5603 emit_move_insn (high[0], constm1_rtx);
5604 }
5605 else
5606 {
5607 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5608
5609 if (!rtx_equal_p (operands[0], operands[1]))
5610 emit_move_insn (operands[0], operands[1]);
5611
5612 split_double_mode (mode, operands, 1, low, high);
5613 emit_insn (gen_shld (high[0], low[0], operands[2]));
5614 }
5615
5616 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5617
5618 if (TARGET_CMOVE && scratch)
5619 {
5620 ix86_expand_clear (scratch);
5621 emit_insn (gen_x86_shift_adj_1
5622 (half_mode, high[0], low[0], operands[2], scratch));
5623 }
5624 else
5625 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5626 }
5627
5628 void
5629 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5630 {
5631 rtx (*gen_ashr3)(rtx, rtx, rtx)
5632 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5633 rtx (*gen_shrd)(rtx, rtx, rtx);
5634 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5635
5636 rtx low[2], high[2];
5637 int count;
5638
5639 if (CONST_INT_P (operands[2]))
5640 {
5641 split_double_mode (mode, operands, 2, low, high);
5642 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5643
5644 if (count == GET_MODE_BITSIZE (mode) - 1)
5645 {
5646 emit_move_insn (high[0], high[1]);
5647 emit_insn (gen_ashr3 (high[0], high[0],
5648 GEN_INT (half_width - 1)));
5649 emit_move_insn (low[0], high[0]);
5650
5651 }
5652 else if (count >= half_width)
5653 {
5654 emit_move_insn (low[0], high[1]);
5655 emit_move_insn (high[0], low[0]);
5656 emit_insn (gen_ashr3 (high[0], high[0],
5657 GEN_INT (half_width - 1)));
5658
5659 if (count > half_width)
5660 emit_insn (gen_ashr3 (low[0], low[0],
5661 GEN_INT (count - half_width)));
5662 }
5663 else
5664 {
5665 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5666
5667 if (!rtx_equal_p (operands[0], operands[1]))
5668 emit_move_insn (operands[0], operands[1]);
5669
5670 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5671 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5672 }
5673 }
5674 else
5675 {
5676 machine_mode half_mode;
5677
5678 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5679
5680 if (!rtx_equal_p (operands[0], operands[1]))
5681 emit_move_insn (operands[0], operands[1]);
5682
5683 split_double_mode (mode, operands, 1, low, high);
5684 half_mode = mode == DImode ? SImode : DImode;
5685
5686 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5687 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5688
5689 if (TARGET_CMOVE && scratch)
5690 {
5691 emit_move_insn (scratch, high[0]);
5692 emit_insn (gen_ashr3 (scratch, scratch,
5693 GEN_INT (half_width - 1)));
5694 emit_insn (gen_x86_shift_adj_1
5695 (half_mode, low[0], high[0], operands[2], scratch));
5696 }
5697 else
5698 emit_insn (gen_x86_shift_adj_3
5699 (half_mode, low[0], high[0], operands[2]));
5700 }
5701 }
5702
5703 void
5704 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5705 {
5706 rtx (*gen_lshr3)(rtx, rtx, rtx)
5707 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5708 rtx (*gen_shrd)(rtx, rtx, rtx);
5709 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5710
5711 rtx low[2], high[2];
5712 int count;
5713
5714 if (CONST_INT_P (operands[2]))
5715 {
5716 split_double_mode (mode, operands, 2, low, high);
5717 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5718
5719 if (count >= half_width)
5720 {
5721 emit_move_insn (low[0], high[1]);
5722 ix86_expand_clear (high[0]);
5723
5724 if (count > half_width)
5725 emit_insn (gen_lshr3 (low[0], low[0],
5726 GEN_INT (count - half_width)));
5727 }
5728 else
5729 {
5730 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5731
5732 if (!rtx_equal_p (operands[0], operands[1]))
5733 emit_move_insn (operands[0], operands[1]);
5734
5735 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5736 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5737 }
5738 }
5739 else
5740 {
5741 machine_mode half_mode;
5742
5743 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5744
5745 if (!rtx_equal_p (operands[0], operands[1]))
5746 emit_move_insn (operands[0], operands[1]);
5747
5748 split_double_mode (mode, operands, 1, low, high);
5749 half_mode = mode == DImode ? SImode : DImode;
5750
5751 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5752 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5753
5754 if (TARGET_CMOVE && scratch)
5755 {
5756 ix86_expand_clear (scratch);
5757 emit_insn (gen_x86_shift_adj_1
5758 (half_mode, low[0], high[0], operands[2], scratch));
5759 }
5760 else
5761 emit_insn (gen_x86_shift_adj_2
5762 (half_mode, low[0], high[0], operands[2]));
5763 }
5764 }
5765
5766 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5767 DImode for constant loop counts. */
5768
5769 static machine_mode
5770 counter_mode (rtx count_exp)
5771 {
5772 if (GET_MODE (count_exp) != VOIDmode)
5773 return GET_MODE (count_exp);
5774 if (!CONST_INT_P (count_exp))
5775 return Pmode;
5776 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5777 return DImode;
5778 return SImode;
5779 }
5780
5781 /* When ISSETMEM is FALSE, output a simple loop to copy the memory pointed to
5782 by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
5783 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
5784 equivalent loop to set the memory with VALUE (expected to be in MODE).
5785
5786 The size is rounded down to a whole number of chunks moved at once.
5787 SRCMEM and DESTMEM provide MEM rtxes to supply proper aliasing info. */
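/* An illustrative C-level sketch of the loop emitted below for the copy
   case (ISSETMEM false), in plain C notation rather than the RTL actually
   emitted, and ignoring unrolling and alias-set details:

     size = count & ~(piece_size - 1);
     iter = 0;
     do
       {
         memcpy (dest + iter, src + iter, piece_size);
         iter += piece_size;
       }
     while (iter < size);
     dest += iter;
     src += iter;

   where piece_size == GET_MODE_SIZE (MODE) * UNROLL; for byte-sized
   pieces an extra check skips the loop when SIZE is zero.  */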
5788
5789
5790 static void
5791 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5792 rtx destptr, rtx srcptr, rtx value,
5793 rtx count, machine_mode mode, int unroll,
5794 int expected_size, bool issetmem)
5795 {
5796 rtx_code_label *out_label, *top_label;
5797 rtx iter, tmp;
5798 machine_mode iter_mode = counter_mode (count);
5799 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5800 rtx piece_size = GEN_INT (piece_size_n);
5801 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5802 rtx size;
5803 int i;
5804
5805 top_label = gen_label_rtx ();
5806 out_label = gen_label_rtx ();
5807 iter = gen_reg_rtx (iter_mode);
5808
5809 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5810 NULL, 1, OPTAB_DIRECT);
5811 /* Those two should combine. */
5812 if (piece_size == const1_rtx)
5813 {
5814 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5815 true, out_label);
5816 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5817 }
5818 emit_move_insn (iter, const0_rtx);
5819
5820 emit_label (top_label);
5821
5822 tmp = convert_modes (Pmode, iter_mode, iter, true);
5823
5824 /* This assert could be relaxed - in that case we'd need to compute
5825 the largest power-of-two factor of PIECE_SIZE_N and pass it to
5826 offset_address. */
5827 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5828 destmem = offset_address (destmem, tmp, piece_size_n);
5829 destmem = adjust_address (destmem, mode, 0);
5830
5831 if (!issetmem)
5832 {
5833 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5834 srcmem = adjust_address (srcmem, mode, 0);
5835
5836 /* When unrolling for chips that reorder memory reads and writes,
5837 we can save registers by using a single temporary.
5838 Also, using 4 temporaries is overkill in 32-bit mode. */
5839 if (!TARGET_64BIT && 0)
5840 {
5841 for (i = 0; i < unroll; i++)
5842 {
5843 if (i)
5844 {
5845 destmem = adjust_address (copy_rtx (destmem), mode,
5846 GET_MODE_SIZE (mode));
5847 srcmem = adjust_address (copy_rtx (srcmem), mode,
5848 GET_MODE_SIZE (mode));
5849 }
5850 emit_move_insn (destmem, srcmem);
5851 }
5852 }
5853 else
5854 {
5855 rtx tmpreg[4];
5856 gcc_assert (unroll <= 4);
5857 for (i = 0; i < unroll; i++)
5858 {
5859 tmpreg[i] = gen_reg_rtx (mode);
5860 if (i)
5861 srcmem = adjust_address (copy_rtx (srcmem), mode,
5862 GET_MODE_SIZE (mode));
5863 emit_move_insn (tmpreg[i], srcmem);
5864 }
5865 for (i = 0; i < unroll; i++)
5866 {
5867 if (i)
5868 destmem = adjust_address (copy_rtx (destmem), mode,
5869 GET_MODE_SIZE (mode));
5870 emit_move_insn (destmem, tmpreg[i]);
5871 }
5872 }
5873 }
5874 else
5875 for (i = 0; i < unroll; i++)
5876 {
5877 if (i)
5878 destmem = adjust_address (copy_rtx (destmem), mode,
5879 GET_MODE_SIZE (mode));
5880 emit_move_insn (destmem, value);
5881 }
5882
5883 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5884 true, OPTAB_LIB_WIDEN);
5885 if (tmp != iter)
5886 emit_move_insn (iter, tmp);
5887
5888 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5889 true, top_label);
5890 if (expected_size != -1)
5891 {
5892 expected_size /= GET_MODE_SIZE (mode) * unroll;
5893 if (expected_size == 0)
5894 predict_jump (0);
5895 else if (expected_size > REG_BR_PROB_BASE)
5896 predict_jump (REG_BR_PROB_BASE - 1);
5897 else
5898 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5899 / expected_size);
5900 }
5901 else
5902 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5903 iter = ix86_zero_extend_to_Pmode (iter);
5904 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5905 true, OPTAB_LIB_WIDEN);
5906 if (tmp != destptr)
5907 emit_move_insn (destptr, tmp);
5908 if (!issetmem)
5909 {
5910 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5911 true, OPTAB_LIB_WIDEN);
5912 if (tmp != srcptr)
5913 emit_move_insn (srcptr, tmp);
5914 }
5915 emit_label (out_label);
5916 }
5917
5918 /* Divide COUNTREG by SCALE. */
5919 static rtx
5920 scale_counter (rtx countreg, int scale)
5921 {
5922 rtx sc;
5923
5924 if (scale == 1)
5925 return countreg;
5926 if (CONST_INT_P (countreg))
5927 return GEN_INT (INTVAL (countreg) / scale);
5928 gcc_assert (REG_P (countreg));
5929
5930 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5931 GEN_INT (exact_log2 (scale)),
5932 NULL, 1, OPTAB_DIRECT);
5933 return sc;
5934 }
5935
5936 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5937 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5938 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5939 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5940 ORIG_VALUE is the original value passed to memset to fill the memory with.
5941 Other arguments have the same meaning as for the previous function. */
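/* An illustrative sketch: for the copy case this boils down to

     count = count / GET_MODE_SIZE (MODE);   (scale_counter)
     rep movs{b,l,q}

   and for the set case to "rep stos" with VALUE preloaded into a
   register; DESTEXP/SRCEXP describe the final pointer values so that
   the pointer updates done by the string insn are visible in the RTL.  */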
5942
5943 static void
5944 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5945 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5946 rtx count,
5947 machine_mode mode, bool issetmem)
5948 {
5949 rtx destexp;
5950 rtx srcexp;
5951 rtx countreg;
5952 HOST_WIDE_INT rounded_count;
5953
5954 /* If possible, it is shorter to use rep movs.
5955 TODO: Maybe it is better to move this logic to decide_alg. */
5956 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5957 && (!issetmem || orig_value == const0_rtx))
5958 mode = SImode;
5959
5960 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5961 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5962
5963 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5964 GET_MODE_SIZE (mode)));
5965 if (mode != QImode)
5966 {
5967 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5968 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5969 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5970 }
5971 else
5972 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5973 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5974 {
5975 rounded_count
5976 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5977 destmem = shallow_copy_rtx (destmem);
5978 set_mem_size (destmem, rounded_count);
5979 }
5980 else if (MEM_SIZE_KNOWN_P (destmem))
5981 clear_mem_size (destmem);
5982
5983 if (issetmem)
5984 {
5985 value = force_reg (mode, gen_lowpart (mode, value));
5986 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
5987 }
5988 else
5989 {
5990 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
5991 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
5992 if (mode != QImode)
5993 {
5994 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
5995 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5996 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
5997 }
5998 else
5999 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
6000 if (CONST_INT_P (count))
6001 {
6002 rounded_count
6003 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6004 srcmem = shallow_copy_rtx (srcmem);
6005 set_mem_size (srcmem, rounded_count);
6006 }
6007 else
6008 {
6009 if (MEM_SIZE_KNOWN_P (srcmem))
6010 clear_mem_size (srcmem);
6011 }
6012 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6013 destexp, srcexp));
6014 }
6015 }
6016
6017 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6018 DESTMEM.
6019 SRCMEM is passed by pointer to be updated on return.
6020 Return value is the updated DESTMEM. */
6021 static rtx
6022 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6023 HOST_WIDE_INT size_to_move)
6024 {
6025 rtx dst = destmem, src = *srcmem, tempreg;
6026 enum insn_code code;
6027 machine_mode move_mode;
6028 int piece_size, i;
6029
6030 /* Find the widest mode in which we could perform moves.
6031 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
6032 it until a move of that size is supported. */
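/* For instance, with SIZE_TO_MOVE == 8 on a target without 8 byte moves the
search would settle on 4 byte pieces and the loop below would emit two
load/store pairs. */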
6033 piece_size = 1 << floor_log2 (size_to_move);
6034 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6035 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6036 {
6037 gcc_assert (piece_size > 1);
6038 piece_size >>= 1;
6039 }
6040
6041 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6042 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6043 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6044 {
6045 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6046 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6047 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6048 {
6049 move_mode = word_mode;
6050 piece_size = GET_MODE_SIZE (move_mode);
6051 code = optab_handler (mov_optab, move_mode);
6052 }
6053 }
6054 gcc_assert (code != CODE_FOR_nothing);
6055
6056 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6057 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6058
6059 /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6060 gcc_assert (size_to_move % piece_size == 0);
6061
6062 for (i = 0; i < size_to_move; i += piece_size)
6063 {
6064 /* We move from memory to memory, so we'll need to do it via
6065 a temporary register. */
6066 tempreg = gen_reg_rtx (move_mode);
6067 emit_insn (GEN_FCN (code) (tempreg, src));
6068 emit_insn (GEN_FCN (code) (dst, tempreg));
6069
6070 emit_move_insn (destptr,
6071 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6072 emit_move_insn (srcptr,
6073 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
6074
6075 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6076 piece_size);
6077 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6078 piece_size);
6079 }
6080
6081 /* Update DST and SRC rtx. */
6082 *srcmem = src;
6083 return dst;
6084 }
6085
6086 /* Helper function for the string operations below.  Test whether bit VALUE
6087 of VARIABLE is clear and if so, jump to the returned label. */
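/* A typical use is ix86_expand_aligntest (count, 4, true): the emitted test
skips over the 4 byte copy that the caller emits next whenever bit 2 of
COUNT is clear; the caller then places the returned label after that copy. */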
6088
6089 static rtx_code_label *
6090 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6091 {
6092 rtx_code_label *label = gen_label_rtx ();
6093 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6094 if (GET_MODE (variable) == DImode)
6095 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6096 else
6097 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6098 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6099 1, label);
6100 if (epilogue)
6101 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6102 else
6103 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6104 return label;
6105 }
6106
6107
6108 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6109
6110 static void
6111 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6112 rtx destptr, rtx srcptr, rtx count, int max_size)
6113 {
6114 rtx src, dest;
6115 if (CONST_INT_P (count))
6116 {
6117 HOST_WIDE_INT countval = INTVAL (count);
6118 HOST_WIDE_INT epilogue_size = countval % max_size;
6119 int i;
6120
6121 /* For now MAX_SIZE should be a power of 2. This assert could be
6122 relaxed, but it'll require a bit more complicated epilogue
6123 expanding. */
6124 gcc_assert ((max_size & (max_size - 1)) == 0);
6125 for (i = max_size; i >= 1; i >>= 1)
6126 {
6127 if (epilogue_size & i)
6128 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6129 }
6130 return;
6131 }
6132 if (max_size > 8)
6133 {
6134 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6135 count, 1, OPTAB_DIRECT);
6136 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6137 count, QImode, 1, 4, false);
6138 return;
6139 }
6140
6141 /* When single string operations are available, we can cheaply advance the
6142 dest and src pointers.  Otherwise we save code size by maintaining an offset
6143 (zero is readily available from the preceding rep operation) and using x86 addressing modes.
6144 */
6145 if (TARGET_SINGLE_STRINGOP)
6146 {
6147 if (max_size > 4)
6148 {
6149 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6150 src = change_address (srcmem, SImode, srcptr);
6151 dest = change_address (destmem, SImode, destptr);
6152 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6153 emit_label (label);
6154 LABEL_NUSES (label) = 1;
6155 }
6156 if (max_size > 2)
6157 {
6158 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6159 src = change_address (srcmem, HImode, srcptr);
6160 dest = change_address (destmem, HImode, destptr);
6161 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6162 emit_label (label);
6163 LABEL_NUSES (label) = 1;
6164 }
6165 if (max_size > 1)
6166 {
6167 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6168 src = change_address (srcmem, QImode, srcptr);
6169 dest = change_address (destmem, QImode, destptr);
6170 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6171 emit_label (label);
6172 LABEL_NUSES (label) = 1;
6173 }
6174 }
6175 else
6176 {
6177 rtx offset = force_reg (Pmode, const0_rtx);
6178 rtx tmp;
6179
6180 if (max_size > 4)
6181 {
6182 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6183 src = change_address (srcmem, SImode, srcptr);
6184 dest = change_address (destmem, SImode, destptr);
6185 emit_move_insn (dest, src);
6186 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6187 true, OPTAB_LIB_WIDEN);
6188 if (tmp != offset)
6189 emit_move_insn (offset, tmp);
6190 emit_label (label);
6191 LABEL_NUSES (label) = 1;
6192 }
6193 if (max_size > 2)
6194 {
6195 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6196 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6197 src = change_address (srcmem, HImode, tmp);
6198 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6199 dest = change_address (destmem, HImode, tmp);
6200 emit_move_insn (dest, src);
6201 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6202 true, OPTAB_LIB_WIDEN);
6203 if (tmp != offset)
6204 emit_move_insn (offset, tmp);
6205 emit_label (label);
6206 LABEL_NUSES (label) = 1;
6207 }
6208 if (max_size > 1)
6209 {
6210 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6211 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6212 src = change_address (srcmem, QImode, tmp);
6213 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6214 dest = change_address (destmem, QImode, tmp);
6215 emit_move_insn (dest, src);
6216 emit_label (label);
6217 LABEL_NUSES (label) = 1;
6218 }
6219 }
6220 }
6221
6222 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6223 with value PROMOTED_VAL.
6224 There is no source operand to update here.
6225 Return value is the updated DESTMEM. */
6226 static rtx
6227 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6228 HOST_WIDE_INT size_to_move)
6229 {
6230 rtx dst = destmem;
6231 enum insn_code code;
6232 machine_mode move_mode;
6233 int piece_size, i;
6234
6235 /* Find the widest mode in which we could perform moves.  Normally this is
6236 the mode of PROMOTED_VAL; narrow it when SIZE_TO_MOVE is smaller than
6237 that mode's size. */
6238 move_mode = GET_MODE (promoted_val);
6239 if (move_mode == VOIDmode)
6240 move_mode = QImode;
6241 if (size_to_move < GET_MODE_SIZE (move_mode))
6242 {
6243 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6244 move_mode = int_mode_for_size (move_bits, 0).require ();
6245 promoted_val = gen_lowpart (move_mode, promoted_val);
6246 }
6247 piece_size = GET_MODE_SIZE (move_mode);
6248 code = optab_handler (mov_optab, move_mode);
6249 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6250
6251 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6252
6253 /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6254 gcc_assert (size_to_move % piece_size == 0);
6255
6256 for (i = 0; i < size_to_move; i += piece_size)
6257 {
6258 if (piece_size <= GET_MODE_SIZE (word_mode))
6259 {
6260 emit_insn (gen_strset (destptr, dst, promoted_val));
6261 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6262 piece_size);
6263 continue;
6264 }
6265
6266 emit_insn (GEN_FCN (code) (dst, promoted_val));
6267
6268 emit_move_insn (destptr,
6269 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6270
6271 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6272 piece_size);
6273 }
6274
6275 /* Update DST rtx. */
6276 return dst;
6277 }
6278 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6279 static void
6280 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6281 rtx count, int max_size)
6282 {
6283 count = expand_simple_binop (counter_mode (count), AND, count,
6284 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6285 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6286 gen_lowpart (QImode, value), count, QImode,
6287 1, max_size / 2, true);
6288 }
6289
6290 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6291 static void
6292 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6293 rtx count, int max_size)
6294 {
6295 rtx dest;
6296
6297 if (CONST_INT_P (count))
6298 {
6299 HOST_WIDE_INT countval = INTVAL (count);
6300 HOST_WIDE_INT epilogue_size = countval % max_size;
6301 int i;
6302
6303 /* For now MAX_SIZE should be a power of 2. This assert could be
6304 relaxed, but it'll require a bit more complicated epilogue
6305 expanding. */
6306 gcc_assert ((max_size & (max_size - 1)) == 0);
6307 for (i = max_size; i >= 1; i >>= 1)
6308 {
6309 if (epilogue_size & i)
6310 {
6311 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6312 destmem = emit_memset (destmem, destptr, vec_value, i);
6313 else
6314 destmem = emit_memset (destmem, destptr, value, i);
6315 }
6316 }
6317 return;
6318 }
6319 if (max_size > 32)
6320 {
6321 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6322 return;
6323 }
6324 if (max_size > 16)
6325 {
6326 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6327 if (TARGET_64BIT)
6328 {
6329 dest = change_address (destmem, DImode, destptr);
6330 emit_insn (gen_strset (destptr, dest, value));
6331 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6332 emit_insn (gen_strset (destptr, dest, value));
6333 }
6334 else
6335 {
6336 dest = change_address (destmem, SImode, destptr);
6337 emit_insn (gen_strset (destptr, dest, value));
6338 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6339 emit_insn (gen_strset (destptr, dest, value));
6340 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6341 emit_insn (gen_strset (destptr, dest, value));
6342 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6343 emit_insn (gen_strset (destptr, dest, value));
6344 }
6345 emit_label (label);
6346 LABEL_NUSES (label) = 1;
6347 }
6348 if (max_size > 8)
6349 {
6350 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6351 if (TARGET_64BIT)
6352 {
6353 dest = change_address (destmem, DImode, destptr);
6354 emit_insn (gen_strset (destptr, dest, value));
6355 }
6356 else
6357 {
6358 dest = change_address (destmem, SImode, destptr);
6359 emit_insn (gen_strset (destptr, dest, value));
6360 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6361 emit_insn (gen_strset (destptr, dest, value));
6362 }
6363 emit_label (label);
6364 LABEL_NUSES (label) = 1;
6365 }
6366 if (max_size > 4)
6367 {
6368 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6369 dest = change_address (destmem, SImode, destptr);
6370 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6371 emit_label (label);
6372 LABEL_NUSES (label) = 1;
6373 }
6374 if (max_size > 2)
6375 {
6376 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6377 dest = change_address (destmem, HImode, destptr);
6378 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6379 emit_label (label);
6380 LABEL_NUSES (label) = 1;
6381 }
6382 if (max_size > 1)
6383 {
6384 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6385 dest = change_address (destmem, QImode, destptr);
6386 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6387 emit_label (label);
6388 LABEL_NUSES (label) = 1;
6389 }
6390 }
6391
6392 /* Decrease COUNTREG by VALUE. */
6393 static void
6394 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6395 {
6396 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6397 }
6398
6399 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6400 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6401 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6402 ignored.
6403 Return value is updated DESTMEM. */
6404
6405 static rtx
6406 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6407 rtx destptr, rtx srcptr, rtx value,
6408 rtx vec_value, rtx count, int align,
6409 int desired_alignment, bool issetmem)
6410 {
6411 int i;
6412 for (i = 1; i < desired_alignment; i <<= 1)
6413 {
6414 if (align <= i)
6415 {
6416 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6417 if (issetmem)
6418 {
6419 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6420 destmem = emit_memset (destmem, destptr, vec_value, i);
6421 else
6422 destmem = emit_memset (destmem, destptr, value, i);
6423 }
6424 else
6425 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6426 ix86_adjust_counter (count, i);
6427 emit_label (label);
6428 LABEL_NUSES (label) = 1;
6429 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6430 }
6431 }
6432 return destmem;
6433 }
6434
6435 /* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
6436 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6437 and jump to DONE_LABEL. */
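/* The trick is the classic overlapping access: SIZE bytes are stored at the
start of the block and another SIZE bytes ending exactly at DEST + COUNT,
so every length in SIZE..2*SIZE-1 is covered without a residual loop. */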
6438 static void
6439 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6440 rtx destptr, rtx srcptr,
6441 rtx value, rtx vec_value,
6442 rtx count, int size,
6443 rtx done_label, bool issetmem)
6444 {
6445 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6446 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6447 rtx modesize;
6448 int n;
6449
6450 /* If we do not have a vector value to copy, we must reduce the size. */
6451 if (issetmem)
6452 {
6453 if (!vec_value)
6454 {
6455 if (GET_MODE (value) == VOIDmode && size > 8)
6456 mode = Pmode;
6457 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6458 mode = GET_MODE (value);
6459 }
6460 else
6461 mode = GET_MODE (vec_value), value = vec_value;
6462 }
6463 else
6464 {
6465 /* Choose appropriate vector mode. */
6466 if (size >= 32)
6467 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6468 else if (size >= 16)
6469 mode = TARGET_SSE ? V16QImode : DImode;
6470 srcmem = change_address (srcmem, mode, srcptr);
6471 }
6472 destmem = change_address (destmem, mode, destptr);
6473 modesize = GEN_INT (GET_MODE_SIZE (mode));
6474 gcc_assert (GET_MODE_SIZE (mode) <= size);
6475 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6476 {
6477 if (issetmem)
6478 emit_move_insn (destmem, gen_lowpart (mode, value));
6479 else
6480 {
6481 emit_move_insn (destmem, srcmem);
6482 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6483 }
6484 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6485 }
6486
6487 destmem = offset_address (destmem, count, 1);
6488 destmem = offset_address (destmem, GEN_INT (-2 * size),
6489 GET_MODE_SIZE (mode));
6490 if (!issetmem)
6491 {
6492 srcmem = offset_address (srcmem, count, 1);
6493 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6494 GET_MODE_SIZE (mode));
6495 }
6496 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6497 {
6498 if (issetmem)
6499 emit_move_insn (destmem, gen_lowpart (mode, value));
6500 else
6501 {
6502 emit_move_insn (destmem, srcmem);
6503 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6504 }
6505 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6506 }
6507 emit_jump_insn (gen_jump (done_label));
6508 emit_barrier ();
6509
6510 emit_label (label);
6511 LABEL_NUSES (label) = 1;
6512 }
6513
6514 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
6515 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
6516 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
6517 proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
6518 DONE_LABEL is a label after the whole copying sequence. The label is created
6519 on demand if *DONE_LABEL is NULL.
6520 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
6521 bounds after the initial copies.
6522
6523 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6524 DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
6525 we will dispatch to a library call for large blocks.
6526
6527 In pseudocode we do:
6528
6529 if (COUNT < SIZE)
6530 {
6531 Assume that SIZE is 4. Bigger sizes are handled analogously
6532 if (COUNT & 4)
6533 {
6534 copy 4 bytes from SRCPTR to DESTPTR
6535 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6536 goto done_label
6537 }
6538 if (!COUNT)
6539 goto done_label;
6540 copy 1 byte from SRCPTR to DESTPTR
6541 if (COUNT & 2)
6542 {
6543 copy 2 bytes from SRCPTR to DESTPTR
6544 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6545 }
6546 }
6547 else
6548 {
6549 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6550 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6551
6552 OLD_DESPTR = DESTPTR;
6553 Align DESTPTR up to DESIRED_ALIGN
6554 SRCPTR += DESTPTR - OLD_DESTPTR
6555 COUNT -= DEST_PTR - OLD_DESTPTR
6556 if (DYNAMIC_CHECK)
6557 Round COUNT down to multiple of SIZE
6558 << optional caller supplied zero size guard is here >>
6559 << optional caller supplied dynamic check is here >>
6560 << caller supplied main copy loop is here >>
6561 }
6562 done_label:
6563 */
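/* A concrete example (numbers purely for illustration): with SIZE == 16,
ALIGN == 1, DESIRED_ALIGN == 16 and a runtime COUNT of 40, the code below
copies bytes 0..15 and 24..39 with possibly misaligned moves, rounds
DESTPTR up to a 16 byte boundary, adjusts SRCPTR and COUNT by the same
amount, and leaves the aligned middle part to the main loop. */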
6564 static void
6565 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6566 rtx *destptr, rtx *srcptr,
6567 machine_mode mode,
6568 rtx value, rtx vec_value,
6569 rtx *count,
6570 rtx_code_label **done_label,
6571 int size,
6572 int desired_align,
6573 int align,
6574 unsigned HOST_WIDE_INT *min_size,
6575 bool dynamic_check,
6576 bool issetmem)
6577 {
6578 rtx_code_label *loop_label = NULL, *label;
6579 int n;
6580 rtx modesize;
6581 int prolog_size = 0;
6582 rtx mode_value;
6583
6584 /* Choose the proper value to copy. */
6585 if (issetmem && VECTOR_MODE_P (mode))
6586 mode_value = vec_value;
6587 else
6588 mode_value = value;
6589 gcc_assert (GET_MODE_SIZE (mode) <= size);
6590
6591 /* See if block is big or small, handle small blocks. */
6592 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6593 {
6594 int size2 = size;
6595 loop_label = gen_label_rtx ();
6596
6597 if (!*done_label)
6598 *done_label = gen_label_rtx ();
6599
6600 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6601 1, loop_label);
6602 size2 >>= 1;
6603
6604 /* Handle sizes > 3. */
6605 for (;size2 > 2; size2 >>= 1)
6606 expand_small_cpymem_or_setmem (destmem, srcmem,
6607 *destptr, *srcptr,
6608 value, vec_value,
6609 *count,
6610 size2, *done_label, issetmem);
6611 /* Nothing to copy?  Jump to DONE_LABEL if so. */
6612 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6613 1, *done_label);
6614
6615 /* Do a byte copy. */
6616 destmem = change_address (destmem, QImode, *destptr);
6617 if (issetmem)
6618 emit_move_insn (destmem, gen_lowpart (QImode, value));
6619 else
6620 {
6621 srcmem = change_address (srcmem, QImode, *srcptr);
6622 emit_move_insn (destmem, srcmem);
6623 }
6624
6625 /* Handle sizes 2 and 3. */
6626 label = ix86_expand_aligntest (*count, 2, false);
6627 destmem = change_address (destmem, HImode, *destptr);
6628 destmem = offset_address (destmem, *count, 1);
6629 destmem = offset_address (destmem, GEN_INT (-2), 2);
6630 if (issetmem)
6631 emit_move_insn (destmem, gen_lowpart (HImode, value));
6632 else
6633 {
6634 srcmem = change_address (srcmem, HImode, *srcptr);
6635 srcmem = offset_address (srcmem, *count, 1);
6636 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6637 emit_move_insn (destmem, srcmem);
6638 }
6639
6640 emit_label (label);
6641 LABEL_NUSES (label) = 1;
6642 emit_jump_insn (gen_jump (*done_label));
6643 emit_barrier ();
6644 }
6645 else
6646 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6647 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6648
6649 /* Start memcpy for COUNT >= SIZE. */
6650 if (loop_label)
6651 {
6652 emit_label (loop_label);
6653 LABEL_NUSES (loop_label) = 1;
6654 }
6655
6656 /* Copy first desired_align bytes. */
6657 if (!issetmem)
6658 srcmem = change_address (srcmem, mode, *srcptr);
6659 destmem = change_address (destmem, mode, *destptr);
6660 modesize = GEN_INT (GET_MODE_SIZE (mode));
6661 for (n = 0; prolog_size < desired_align - align; n++)
6662 {
6663 if (issetmem)
6664 emit_move_insn (destmem, mode_value);
6665 else
6666 {
6667 emit_move_insn (destmem, srcmem);
6668 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6669 }
6670 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6671 prolog_size += GET_MODE_SIZE (mode);
6672 }
6673
6674
6675 /* Copy last SIZE bytes. */
6676 destmem = offset_address (destmem, *count, 1);
6677 destmem = offset_address (destmem,
6678 GEN_INT (-size - prolog_size),
6679 1);
6680 if (issetmem)
6681 emit_move_insn (destmem, mode_value);
6682 else
6683 {
6684 srcmem = offset_address (srcmem, *count, 1);
6685 srcmem = offset_address (srcmem,
6686 GEN_INT (-size - prolog_size),
6687 1);
6688 emit_move_insn (destmem, srcmem);
6689 }
6690 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6691 {
6692 destmem = offset_address (destmem, modesize, 1);
6693 if (issetmem)
6694 emit_move_insn (destmem, mode_value);
6695 else
6696 {
6697 srcmem = offset_address (srcmem, modesize, 1);
6698 emit_move_insn (destmem, srcmem);
6699 }
6700 }
6701
6702 /* Align destination. */
6703 if (desired_align > 1 && desired_align > align)
6704 {
6705 rtx saveddest = *destptr;
6706
6707 gcc_assert (desired_align <= size);
6708 /* Align destptr up, place it in a new register. */
6709 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6710 GEN_INT (prolog_size),
6711 NULL_RTX, 1, OPTAB_DIRECT);
6712 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6713 REG_POINTER (*destptr) = 1;
6714 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6715 GEN_INT (-desired_align),
6716 *destptr, 1, OPTAB_DIRECT);
6717 /* See how many bytes we skipped. */
6718 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6719 *destptr,
6720 saveddest, 1, OPTAB_DIRECT);
6721 /* Adjust srcptr and count. */
6722 if (!issetmem)
6723 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6724 saveddest, *srcptr, 1, OPTAB_DIRECT);
6725 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6726 saveddest, *count, 1, OPTAB_DIRECT);
6727 /* We copied at most size + prolog_size. */
6728 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6729 *min_size
6730 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6731 else
6732 *min_size = 0;
6733
6734 /* Our loops always round down the block size, but for dispatch to the
6735 library we need the precise value. */
6736 if (dynamic_check)
6737 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6738 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6739 }
6740 else
6741 {
6742 gcc_assert (prolog_size == 0);
6743 /* Decrease count, so we won't end up copying last word twice. */
6744 if (!CONST_INT_P (*count))
6745 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6746 constm1_rtx, *count, 1, OPTAB_DIRECT);
6747 else
6748 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6749 (unsigned HOST_WIDE_INT)size));
6750 if (*min_size)
6751 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6752 }
6753 }
6754
6755
6756 /* This function is like the previous one, except here we know how many bytes
6757 need to be copied. That allows us to update alignment not only of DST, which
6758 is returned, but also of SRC, which is passed as a pointer for that
6759 reason. */
6760 static rtx
6761 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6762 rtx srcreg, rtx value, rtx vec_value,
6763 int desired_align, int align_bytes,
6764 bool issetmem)
6765 {
6766 rtx src = NULL;
6767 rtx orig_dst = dst;
6768 rtx orig_src = NULL;
6769 int piece_size = 1;
6770 int copied_bytes = 0;
6771
6772 if (!issetmem)
6773 {
6774 gcc_assert (srcp != NULL);
6775 src = *srcp;
6776 orig_src = src;
6777 }
6778
6779 for (piece_size = 1;
6780 piece_size <= desired_align && copied_bytes < align_bytes;
6781 piece_size <<= 1)
6782 {
6783 if (align_bytes & piece_size)
6784 {
6785 if (issetmem)
6786 {
6787 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6788 dst = emit_memset (dst, destreg, vec_value, piece_size);
6789 else
6790 dst = emit_memset (dst, destreg, value, piece_size);
6791 }
6792 else
6793 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6794 copied_bytes += piece_size;
6795 }
6796 }
6797 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6798 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6799 if (MEM_SIZE_KNOWN_P (orig_dst))
6800 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6801
6802 if (!issetmem)
6803 {
6804 int src_align_bytes = get_mem_align_offset (src, desired_align
6805 * BITS_PER_UNIT);
6806 if (src_align_bytes >= 0)
6807 src_align_bytes = desired_align - src_align_bytes;
6808 if (src_align_bytes >= 0)
6809 {
6810 unsigned int src_align;
6811 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6812 {
6813 if ((src_align_bytes & (src_align - 1))
6814 == (align_bytes & (src_align - 1)))
6815 break;
6816 }
6817 if (src_align > (unsigned int) desired_align)
6818 src_align = desired_align;
6819 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6820 set_mem_align (src, src_align * BITS_PER_UNIT);
6821 }
6822 if (MEM_SIZE_KNOWN_P (orig_src))
6823 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6824 *srcp = src;
6825 }
6826
6827 return dst;
6828 }
6829
6830 /* Return true if ALG can be used in current context.
6831 Assume we expand memset if MEMSET is true. */
6832 static bool
6833 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6834 {
6835 if (alg == no_stringop)
6836 return false;
6837 if (alg == vector_loop)
6838 return TARGET_SSE || TARGET_AVX;
6839 /* Algorithms using the rep prefix want at least edi and ecx;
6840 additionally, memset wants eax and memcpy wants esi. Don't
6841 consider such algorithms if the user has appropriated those
6842 registers for their own purposes, or if we have a non-default
6843 address space, since some string insns cannot override the segment. */
6844 if (alg == rep_prefix_1_byte
6845 || alg == rep_prefix_4_byte
6846 || alg == rep_prefix_8_byte)
6847 {
6848 if (have_as)
6849 return false;
6850 if (fixed_regs[CX_REG]
6851 || fixed_regs[DI_REG]
6852 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6853 return false;
6854 }
6855 return true;
6856 }
6857
6858 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6859 static enum stringop_alg
6860 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6861 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6862 bool memset, bool zero_memset, bool have_as,
6863 int *dynamic_check, bool *noalign, bool recur)
6864 {
6865 const struct stringop_algs *algs;
6866 bool optimize_for_speed;
6867 int max = 0;
6868 const struct processor_costs *cost;
6869 int i;
6870 bool any_alg_usable_p = false;
6871
6872 *noalign = false;
6873 *dynamic_check = -1;
6874
6875 /* Even if the string operation call is cold, we still might spend a lot
6876 of time processing large blocks. */
6877 if (optimize_function_for_size_p (cfun)
6878 || (optimize_insn_for_size_p ()
6879 && (max_size < 256
6880 || (expected_size != -1 && expected_size < 256))))
6881 optimize_for_speed = false;
6882 else
6883 optimize_for_speed = true;
6884
6885 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6886 if (memset)
6887 algs = &cost->memset[TARGET_64BIT != 0];
6888 else
6889 algs = &cost->memcpy[TARGET_64BIT != 0];
6890
6891 /* See maximal size for user defined algorithm. */
6892 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6893 {
6894 enum stringop_alg candidate = algs->size[i].alg;
6895 bool usable = alg_usable_p (candidate, memset, have_as);
6896 any_alg_usable_p |= usable;
6897
6898 if (candidate != libcall && candidate && usable)
6899 max = algs->size[i].max;
6900 }
6901
6902 /* If expected size is not known but max size is small enough
6903 so inline version is a win, set expected size into
6904 the range. */
6905 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6906 && expected_size == -1)
6907 expected_size = min_size / 2 + max_size / 2;
6908
6909 /* If user specified the algorithm, honor it if possible. */
6910 if (ix86_stringop_alg != no_stringop
6911 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6912 return ix86_stringop_alg;
6913 /* rep; movq or rep; movl is the smallest variant. */
6914 else if (!optimize_for_speed)
6915 {
6916 *noalign = true;
6917 if (!count || (count & 3) || (memset && !zero_memset))
6918 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6919 ? rep_prefix_1_byte : loop_1_byte;
6920 else
6921 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6922 ? rep_prefix_4_byte : loop;
6923 }
6924 /* Very tiny blocks are best handled via the loop, REP is expensive to
6925 setup. */
6926 else if (expected_size != -1 && expected_size < 4)
6927 return loop_1_byte;
6928 else if (expected_size != -1)
6929 {
6930 enum stringop_alg alg = libcall;
6931 bool alg_noalign = false;
6932 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6933 {
6934 /* We get here if the algorithms that were not libcall-based
6935 were rep-prefix based and we are unable to use rep prefixes
6936 based on global register usage. Break out of the loop and
6937 use the heuristic below. */
6938 if (algs->size[i].max == 0)
6939 break;
6940 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6941 {
6942 enum stringop_alg candidate = algs->size[i].alg;
6943
6944 if (candidate != libcall
6945 && alg_usable_p (candidate, memset, have_as))
6946 {
6947 alg = candidate;
6948 alg_noalign = algs->size[i].noalign;
6949 }
6950 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6951 last non-libcall inline algorithm. */
6952 if (TARGET_INLINE_ALL_STRINGOPS)
6953 {
6954 /* When the current size is best to be copied by a libcall,
6955 but we are still forced to inline, run the heuristic below
6956 that will pick code for medium sized blocks. */
6957 if (alg != libcall)
6958 {
6959 *noalign = alg_noalign;
6960 return alg;
6961 }
6962 else if (!any_alg_usable_p)
6963 break;
6964 }
6965 else if (alg_usable_p (candidate, memset, have_as))
6966 {
6967 *noalign = algs->size[i].noalign;
6968 return candidate;
6969 }
6970 }
6971 }
6972 }
6973 /* When asked to inline the call anyway, try to pick a meaningful choice.
6974 We look for the maximal size of block that is faster to copy by hand and
6975 take blocks of at most that size, guessing that the average size will
6976 be roughly half of the block.
6977
6978 If this turns out to be bad, we might simply specify the preferred
6979 choice in ix86_costs. */
6980 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6981 && (algs->unknown_size == libcall
6982 || !alg_usable_p (algs->unknown_size, memset, have_as)))
6983 {
6984 enum stringop_alg alg;
6985 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
6986
6987 /* If there aren't any usable algorithms or if recursing already,
6988 then recursing on smaller sizes or same size isn't going to
6989 find anything. Just return the simple byte-at-a-time copy loop. */
6990 if (!any_alg_usable_p || recur)
6991 {
6992 /* Pick something reasonable. */
6993 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
6994 *dynamic_check = 128;
6995 return loop_1_byte;
6996 }
6997 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
6998 zero_memset, have_as, dynamic_check, noalign, true);
6999 gcc_assert (*dynamic_check == -1);
7000 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7001 *dynamic_check = max;
7002 else
7003 gcc_assert (alg != libcall);
7004 return alg;
7005 }
7006 return (alg_usable_p (algs->unknown_size, memset, have_as)
7007 ? algs->unknown_size : libcall);
7008 }
7009
7010 /* Decide on alignment. We know that the operand is already aligned to ALIGN
7011 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
7012 static int
7013 decide_alignment (int align,
7014 enum stringop_alg alg,
7015 int expected_size,
7016 machine_mode move_mode)
7017 {
7018 int desired_align = 0;
7019
7020 gcc_assert (alg != no_stringop);
7021
7022 if (alg == libcall)
7023 return 0;
7024 if (move_mode == VOIDmode)
7025 return 0;
7026
7027 desired_align = GET_MODE_SIZE (move_mode);
7028 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
7029 copying a whole cacheline at once. */
7030 if (TARGET_PENTIUMPRO
7031 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7032 desired_align = 8;
7033
7034 if (optimize_size)
7035 desired_align = 1;
7036 if (desired_align < align)
7037 desired_align = align;
7038 if (expected_size != -1 && expected_size < 4)
7039 desired_align = align;
7040
7041 return desired_align;
7042 }
7043
7044
7045 /* Helper function for memset.  For a QImode value 0xXY produce
7046 0xXYXYXYXY of the width specified by MODE.  This is essentially
7047 a multiplication by 0x01010101, but we can do slightly better than
7048 synth_mult by unwinding the sequence by hand on CPUs with
7049 slow multiply. */
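/* For example, promoting the QImode value 0xab to SImode computes
0xab | (0xab << 8) == 0xabab and then 0xabab | (0xabab << 16) == 0xabababab,
i.e. a couple of shift/or (or insert) steps instead of a multiply; the
constant case below simply builds the replicated value directly. */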
7050 static rtx
7051 promote_duplicated_reg (machine_mode mode, rtx val)
7052 {
7053 machine_mode valmode = GET_MODE (val);
7054 rtx tmp;
7055 int nops = mode == DImode ? 3 : 2;
7056
7057 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7058 if (val == const0_rtx)
7059 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7060 if (CONST_INT_P (val))
7061 {
7062 HOST_WIDE_INT v = INTVAL (val) & 255;
7063
7064 v |= v << 8;
7065 v |= v << 16;
7066 if (mode == DImode)
7067 v |= (v << 16) << 16;
7068 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7069 }
7070
7071 if (valmode == VOIDmode)
7072 valmode = QImode;
7073 if (valmode != QImode)
7074 val = gen_lowpart (QImode, val);
7075 if (mode == QImode)
7076 return val;
7077 if (!TARGET_PARTIAL_REG_STALL)
7078 nops--;
7079 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7080 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7081 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7082 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7083 {
7084 rtx reg = convert_modes (mode, QImode, val, true);
7085 tmp = promote_duplicated_reg (mode, const1_rtx);
7086 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7087 OPTAB_DIRECT);
7088 }
7089 else
7090 {
7091 rtx reg = convert_modes (mode, QImode, val, true);
7092
7093 if (!TARGET_PARTIAL_REG_STALL)
7094 emit_insn (gen_insv_1 (mode, reg, reg));
7095 else
7096 {
7097 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7098 NULL, 1, OPTAB_DIRECT);
7099 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7100 OPTAB_DIRECT);
7101 }
7102 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7103 NULL, 1, OPTAB_DIRECT);
7104 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7105 if (mode == SImode)
7106 return reg;
7107 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7108 NULL, 1, OPTAB_DIRECT);
7109 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7110 return reg;
7111 }
7112 }
7113
7114 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
7115 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
7116 getting alignment from ALIGN to DESIRED_ALIGN. */
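/* E.g. a memset expanded with 8 byte chunks on a 64-bit target promotes VAL
to DImode, while a byte-sized main loop that only needs 2 byte alignment
promotes it to HImode; the plain value is kept when no wider store will be
emitted. */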
7117 static rtx
7118 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7119 int align)
7120 {
7121 rtx promoted_val;
7122
7123 if (TARGET_64BIT
7124 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7125 promoted_val = promote_duplicated_reg (DImode, val);
7126 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7127 promoted_val = promote_duplicated_reg (SImode, val);
7128 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7129 promoted_val = promote_duplicated_reg (HImode, val);
7130 else
7131 promoted_val = val;
7132
7133 return promoted_val;
7134 }
7135
7136 /* Copy the address to a Pmode register. This is used for x32 to
7137 truncate DImode TLS address to a SImode register. */
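/* On x32, Pmode is SImode while a TLS address may be computed in DImode;
the DImode value is then copied to a fresh register and used through a
SImode SUBREG instead of being truncated in place. */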
7138
7139 static rtx
7140 ix86_copy_addr_to_reg (rtx addr)
7141 {
7142 rtx reg;
7143 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7144 {
7145 reg = copy_addr_to_reg (addr);
7146 REG_POINTER (reg) = 1;
7147 return reg;
7148 }
7149 else
7150 {
7151 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7152 reg = copy_to_mode_reg (DImode, addr);
7153 REG_POINTER (reg) = 1;
7154 return gen_rtx_SUBREG (SImode, reg, 0);
7155 }
7156 }
7157
7158 /* Expand string move (memcpy) or store (memset) operation.  Use i386 string
7159 operations when profitable. The code depends upon architecture, block size
7160 and alignment, but always has one of the following overall structures:
7161
7162 Aligned move sequence:
7163
7164 1) Prologue guard: Conditional that jumps up to epilogues for small
7165 blocks that can be handled by the epilogue alone.  This is faster
7166 but also needed for correctness, since the prologue assumes the block
7167 is larger than the desired alignment.
7168
7169 Optional dynamic check for size and libcall for large
7170 blocks is emitted here too, with -minline-stringops-dynamically.
7171
7172 2) Prologue: copy first few bytes in order to get destination
7173 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7174 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7175 copied. We emit either a jump tree on power of two sized
7176 blocks, or a byte loop.
7177
7178 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7179 with specified algorithm.
7180
7181 4) Epilogue: code copying tail of the block that is too small to be
7182 handled by main body (or up to size guarded by prologue guard).
7183
7184 Misaligned move sequence
7185
7186 1) misaligned move prologue/epilogue containing:
7187 a) Prologue handling small memory blocks and jumping to done_label
7188 (skipped if blocks are known to be large enough)
7189 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
7190 needed by single possibly misaligned move
7191 (skipped if alignment is not needed)
7192 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7193
7194 2) Zero size guard dispatching to done_label, if needed
7195
7196 3) Dispatch to library call, if needed.
7197 
7198 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7199 with specified algorithm. */
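/* For instance (illustrative only), a memset of unknown size expanded with
the unrolled_loop strategy emits the prologue guard comparing the count
against the chunk size, a prologue aligning the destination, the unrolled
store loop, and a jump-tree epilogue for the remaining 0..chunk-1 bytes. */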
7200 bool
7201 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7202 rtx align_exp, rtx expected_align_exp,
7203 rtx expected_size_exp, rtx min_size_exp,
7204 rtx max_size_exp, rtx probable_max_size_exp,
7205 bool issetmem)
7206 {
7207 rtx destreg;
7208 rtx srcreg = NULL;
7209 rtx_code_label *label = NULL;
7210 rtx tmp;
7211 rtx_code_label *jump_around_label = NULL;
7212 HOST_WIDE_INT align = 1;
7213 unsigned HOST_WIDE_INT count = 0;
7214 HOST_WIDE_INT expected_size = -1;
7215 int size_needed = 0, epilogue_size_needed;
7216 int desired_align = 0, align_bytes = 0;
7217 enum stringop_alg alg;
7218 rtx promoted_val = NULL;
7219 rtx vec_promoted_val = NULL;
7220 bool force_loopy_epilogue = false;
7221 int dynamic_check;
7222 bool need_zero_guard = false;
7223 bool noalign;
7224 machine_mode move_mode = VOIDmode;
7225 machine_mode wider_mode;
7226 int unroll_factor = 1;
7227 /* TODO: Once value ranges are available, fill in proper data. */
7228 unsigned HOST_WIDE_INT min_size = 0;
7229 unsigned HOST_WIDE_INT max_size = -1;
7230 unsigned HOST_WIDE_INT probable_max_size = -1;
7231 bool misaligned_prologue_used = false;
7232 bool have_as;
7233
7234 if (CONST_INT_P (align_exp))
7235 align = INTVAL (align_exp);
7236 /* i386 can do misaligned access at reasonably increased cost. */
7237 if (CONST_INT_P (expected_align_exp)
7238 && INTVAL (expected_align_exp) > align)
7239 align = INTVAL (expected_align_exp);
7240 /* ALIGN is the minimum of destination and source alignment, but we care here
7241 just about destination alignment. */
7242 else if (!issetmem
7243 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7244 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7245
7246 if (CONST_INT_P (count_exp))
7247 {
7248 min_size = max_size = probable_max_size = count = expected_size
7249 = INTVAL (count_exp);
7250 /* When COUNT is 0, there is nothing to do. */
7251 if (!count)
7252 return true;
7253 }
7254 else
7255 {
7256 if (min_size_exp)
7257 min_size = INTVAL (min_size_exp);
7258 if (max_size_exp)
7259 max_size = INTVAL (max_size_exp);
7260 if (probable_max_size_exp)
7261 probable_max_size = INTVAL (probable_max_size_exp);
7262 if (CONST_INT_P (expected_size_exp))
7263 expected_size = INTVAL (expected_size_exp);
7264 }
7265
7266 /* Make sure we don't need to care about overflow later on. */
7267 if (count > (HOST_WIDE_INT_1U << 30))
7268 return false;
7269
7270 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7271 if (!issetmem)
7272 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7273
7274 /* Step 0: Decide on preferred algorithm, desired alignment and
7275 size of chunks to be copied by main loop. */
7276 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7277 issetmem,
7278 issetmem && val_exp == const0_rtx, have_as,
7279 &dynamic_check, &noalign, false);
7280
7281 if (dump_file)
7282 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7283 stringop_alg_names[alg]);
7284
7285 if (alg == libcall)
7286 return false;
7287 gcc_assert (alg != no_stringop);
7288
7289 /* For now the vector version of memset is generated only for memory zeroing,
7290 as creating the promoted vector value is very cheap in this case. */
7291 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7292 alg = unrolled_loop;
7293
7294 if (!count)
7295 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7296 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7297 if (!issetmem)
7298 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7299
7300 unroll_factor = 1;
7301 move_mode = word_mode;
7302 switch (alg)
7303 {
7304 case libcall:
7305 case no_stringop:
7306 case last_alg:
7307 gcc_unreachable ();
7308 case loop_1_byte:
7309 need_zero_guard = true;
7310 move_mode = QImode;
7311 break;
7312 case loop:
7313 need_zero_guard = true;
7314 break;
7315 case unrolled_loop:
7316 need_zero_guard = true;
7317 unroll_factor = (TARGET_64BIT ? 4 : 2);
7318 break;
7319 case vector_loop:
7320 need_zero_guard = true;
7321 unroll_factor = 4;
7322 /* Find the widest supported mode. */
7323 move_mode = word_mode;
7324 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7325 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7326 move_mode = wider_mode;
7327
7328 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
7329 move_mode = TImode;
7330
7331 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7332 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7333 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7334 {
7335 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7336 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7337 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7338 move_mode = word_mode;
7339 }
7340 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7341 break;
7342 case rep_prefix_8_byte:
7343 move_mode = DImode;
7344 break;
7345 case rep_prefix_4_byte:
7346 move_mode = SImode;
7347 break;
7348 case rep_prefix_1_byte:
7349 move_mode = QImode;
7350 break;
7351 }
7352 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7353 epilogue_size_needed = size_needed;
7354
7355 /* If we are going to make any library calls conditionally, make sure any
7356 pending stack adjustments happen before the first conditional branch,
7357 otherwise they will be emitted before the library call only and won't
7358 happen from the other branches. */
7359 if (dynamic_check != -1)
7360 do_pending_stack_adjust ();
7361
7362 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7363 if (!TARGET_ALIGN_STRINGOPS || noalign)
7364 align = desired_align;
7365
7366 /* Step 1: Prologue guard. */
7367
7368 /* Alignment code needs count to be in register. */
7369 if (CONST_INT_P (count_exp) && desired_align > align)
7370 {
7371 if (INTVAL (count_exp) > desired_align
7372 && INTVAL (count_exp) > size_needed)
7373 {
7374 align_bytes
7375 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7376 if (align_bytes <= 0)
7377 align_bytes = 0;
7378 else
7379 align_bytes = desired_align - align_bytes;
7380 }
7381 if (align_bytes == 0)
7382 count_exp = force_reg (counter_mode (count_exp), count_exp);
7383 }
7384 gcc_assert (desired_align >= 1 && align >= 1);
7385
7386 /* Misaligned move sequences handle both prologue and epilogue at once.
7387 Default code generation results in smaller code for large alignments
7388 and also avoids redundant work when sizes are known precisely. */
7389 misaligned_prologue_used
7390 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7391 && MAX (desired_align, epilogue_size_needed) <= 32
7392 && desired_align <= epilogue_size_needed
7393 && ((desired_align > align && !align_bytes)
7394 || (!count && epilogue_size_needed > 1)));
7395
7396 /* Do the cheap promotion to allow better CSE across the
7397 main loop and epilogue (i.e. one load of the big constant in
7398 front of all code).
7399 For now the misaligned move sequences do not have a fast path
7400 without broadcasting. */
7401 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7402 {
7403 if (alg == vector_loop)
7404 {
7405 gcc_assert (val_exp == const0_rtx);
7406 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7407 promoted_val = promote_duplicated_reg_to_size (val_exp,
7408 GET_MODE_SIZE (word_mode),
7409 desired_align, align);
7410 }
7411 else
7412 {
7413 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7414 desired_align, align);
7415 }
7416 }
7417 /* Misaligned move sequences handle both prologues and epilogues at once.
7418 Default code generation results in smaller code for large alignments and
7419 also avoids redundant work when sizes are known precisely. */
7420 if (misaligned_prologue_used)
7421 {
7422 /* The misaligned move prologue handles small blocks by itself. */
7423 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7424 (dst, src, &destreg, &srcreg,
7425 move_mode, promoted_val, vec_promoted_val,
7426 &count_exp,
7427 &jump_around_label,
7428 desired_align < align
7429 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7430 desired_align, align, &min_size, dynamic_check, issetmem);
7431 if (!issetmem)
7432 src = change_address (src, BLKmode, srcreg);
7433 dst = change_address (dst, BLKmode, destreg);
7434 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7435 epilogue_size_needed = 0;
7436 if (need_zero_guard
7437 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7438 {
7439 /* It is possible that we copied enough so the main loop will not
7440 execute. */
7441 gcc_assert (size_needed > 1);
7442 if (jump_around_label == NULL_RTX)
7443 jump_around_label = gen_label_rtx ();
7444 emit_cmp_and_jump_insns (count_exp,
7445 GEN_INT (size_needed),
7446 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7447 if (expected_size == -1
7448 || expected_size < (desired_align - align) / 2 + size_needed)
7449 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7450 else
7451 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7452 }
7453 }
7454 /* Ensure that alignment prologue won't copy past end of block. */
7455 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7456 {
7457 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7458 /* The epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
7459 Make sure EPILOGUE_SIZE_NEEDED is a power of 2. */
7460 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7461
7462 /* To improve performance of small blocks, we jump around the VAL
7463 promoting code.  This means that if the promoted VAL is not constant,
7464 we might not use it in the epilogue and have to use the byte
7465 loop variant. */
7466 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7467 force_loopy_epilogue = true;
7468 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7469 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7470 {
7471 /* If main algorithm works on QImode, no epilogue is needed.
7472 For small sizes just don't align anything. */
7473 if (size_needed == 1)
7474 desired_align = align;
7475 else
7476 goto epilogue;
7477 }
7478 else if (!count
7479 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7480 {
7481 label = gen_label_rtx ();
7482 emit_cmp_and_jump_insns (count_exp,
7483 GEN_INT (epilogue_size_needed),
7484 LTU, 0, counter_mode (count_exp), 1, label);
7485 if (expected_size == -1 || expected_size < epilogue_size_needed)
7486 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7487 else
7488 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7489 }
7490 }
7491
7492 /* Emit code to decide on runtime whether library call or inline should be
7493 used. */
7494 if (dynamic_check != -1)
7495 {
7496 if (!issetmem && CONST_INT_P (count_exp))
7497 {
7498 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7499 {
7500 emit_block_copy_via_libcall (dst, src, count_exp);
7501 count_exp = const0_rtx;
7502 goto epilogue;
7503 }
7504 }
7505 else
7506 {
7507 rtx_code_label *hot_label = gen_label_rtx ();
7508 if (jump_around_label == NULL_RTX)
7509 jump_around_label = gen_label_rtx ();
7510 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7511 LEU, 0, counter_mode (count_exp),
7512 1, hot_label);
7513 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7514 if (issetmem)
7515 set_storage_via_libcall (dst, count_exp, val_exp);
7516 else
7517 emit_block_copy_via_libcall (dst, src, count_exp);
7518 emit_jump (jump_around_label);
7519 emit_label (hot_label);
7520 }
7521 }
7522
7523 /* Step 2: Alignment prologue. */
7524 /* Do the expensive promotion once we branched off the small blocks. */
7525 if (issetmem && !promoted_val)
7526 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7527 desired_align, align);
7528
7529 if (desired_align > align && !misaligned_prologue_used)
7530 {
7531 if (align_bytes == 0)
7532 {
7533 /* Except for the first move in the prologue, we no longer know
7534 the constant offset in aliasing info.  It doesn't seem worth
7535 the pain to maintain it for the first move, so throw away
7536 the info early. */
7537 dst = change_address (dst, BLKmode, destreg);
7538 if (!issetmem)
7539 src = change_address (src, BLKmode, srcreg);
7540 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7541 promoted_val, vec_promoted_val,
7542 count_exp, align, desired_align,
7543 issetmem);
7544 /* At most desired_align - align bytes are copied. */
7545 if (min_size < (unsigned)(desired_align - align))
7546 min_size = 0;
7547 else
7548 min_size -= desired_align - align;
7549 }
7550 else
7551 {
7552 /* If we know how many bytes need to be stored before dst is
7553 sufficiently aligned, maintain aliasing info accurately. */
7554 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7555 srcreg,
7556 promoted_val,
7557 vec_promoted_val,
7558 desired_align,
7559 align_bytes,
7560 issetmem);
7561
7562 count_exp = plus_constant (counter_mode (count_exp),
7563 count_exp, -align_bytes);
7564 count -= align_bytes;
7565 min_size -= align_bytes;
7566 max_size -= align_bytes;
7567 }
7568 if (need_zero_guard
7569 && min_size < (unsigned HOST_WIDE_INT) size_needed
7570 && (count < (unsigned HOST_WIDE_INT) size_needed
7571 || (align_bytes == 0
7572 && count < ((unsigned HOST_WIDE_INT) size_needed
7573 + desired_align - align))))
7574 {
7575 /* It is possible that we copied enough so the main loop will not
7576 execute. */
7577 gcc_assert (size_needed > 1);
7578 if (label == NULL_RTX)
7579 label = gen_label_rtx ();
7580 emit_cmp_and_jump_insns (count_exp,
7581 GEN_INT (size_needed),
7582 LTU, 0, counter_mode (count_exp), 1, label);
7583 if (expected_size == -1
7584 || expected_size < (desired_align - align) / 2 + size_needed)
7585 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7586 else
7587 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7588 }
7589 }
7590 if (label && size_needed == 1)
7591 {
7592 emit_label (label);
7593 LABEL_NUSES (label) = 1;
7594 label = NULL;
7595 epilogue_size_needed = 1;
7596 if (issetmem)
7597 promoted_val = val_exp;
7598 }
7599 else if (label == NULL_RTX && !misaligned_prologue_used)
7600 epilogue_size_needed = size_needed;
7601
7602 /* Step 3: Main loop. */
7603
7604 switch (alg)
7605 {
7606 case libcall:
7607 case no_stringop:
7608 case last_alg:
7609 gcc_unreachable ();
7610 case loop_1_byte:
7611 case loop:
7612 case unrolled_loop:
7613 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7614 count_exp, move_mode, unroll_factor,
7615 expected_size, issetmem);
7616 break;
7617 case vector_loop:
7618 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7619 vec_promoted_val, count_exp, move_mode,
7620 unroll_factor, expected_size, issetmem);
7621 break;
7622 case rep_prefix_8_byte:
7623 case rep_prefix_4_byte:
7624 case rep_prefix_1_byte:
7625 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7626 val_exp, count_exp, move_mode, issetmem);
7627 break;
7628 }
7629 /* Adjust properly the offset of src and dest memory for aliasing. */
7630 if (CONST_INT_P (count_exp))
7631 {
7632 if (!issetmem)
7633 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7634 (count / size_needed) * size_needed);
7635 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7636 (count / size_needed) * size_needed);
7637 }
7638 else
7639 {
7640 if (!issetmem)
7641 src = change_address (src, BLKmode, srcreg);
7642 dst = change_address (dst, BLKmode, destreg);
7643 }
7644
7645 /* Step 4: Epilogue to copy the remaining bytes. */
7646 epilogue:
7647 if (label)
7648 {
7649 /* When the main loop is done, COUNT_EXP might hold original count,
7650 while we want to copy only COUNT_EXP % SIZE_NEEDED bytes.
7651 Epilogue code will actually copy COUNT_EXP % EPILOGUE_SIZE_NEEDED
7652 bytes.  Compensate if needed.  */
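/* Illustrative example (added; not from the original source): with
   SIZE_NEEDED == 16 and a runtime count of 37, the main loop handles
   32 bytes and the remaining 37 % 16 == 5 bytes are left for the
   epilogue.  */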
7653
7654 if (size_needed < epilogue_size_needed)
7655 {
7656 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7657 GEN_INT (size_needed - 1), count_exp, 1,
7658 OPTAB_DIRECT);
7659 if (tmp != count_exp)
7660 emit_move_insn (count_exp, tmp);
7661 }
7662 emit_label (label);
7663 LABEL_NUSES (label) = 1;
7664 }
7665
7666 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7667 {
7668 if (force_loopy_epilogue)
7669 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7670 epilogue_size_needed);
7671 else
7672 {
7673 if (issetmem)
7674 expand_setmem_epilogue (dst, destreg, promoted_val,
7675 vec_promoted_val, count_exp,
7676 epilogue_size_needed);
7677 else
7678 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7679 epilogue_size_needed);
7680 }
7681 }
7682 if (jump_around_label)
7683 emit_label (jump_around_label);
7684 return true;
7685 }
7686
7687 /* Expand cmpstrn or memcmp. */
7688
7689 bool
7690 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
7691 rtx length, rtx align, bool is_cmpstrn)
7692 {
7693 /* Expand strncmp and memcmp only with -minline-all-stringops since
7694 "repz cmpsb" can be much slower than strncmp and memcmp functions
7695 implemented with vector instructions, see
7696
7697 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
7698 */
7699 if (!TARGET_INLINE_ALL_STRINGOPS)
7700 return false;
7701
7702 /* Can't use this if the user has appropriated ecx, esi or edi. */
7703 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
7704 return false;
7705
7706 if (is_cmpstrn)
7707 {
7708 /* For strncmp, length is the maximum length, which can be larger
7709 than actual string lengths. We can expand the cmpstrn pattern
7710 to "repz cmpsb" only if one of the strings is a constant so
7711 that expand_builtin_strncmp() can write the length argument to
7712 be the minimum of the const string length and the actual length
7713 argument.  Otherwise, "repz cmpsb" may run past the terminating NUL byte.  */
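/* Illustration (added): for strncmp (s, "ab", n) with a large n,
   expand_builtin_strncmp can shrink the length to 3, the constant
   string's size including its NUL, so "repz cmpsb" never runs past
   the terminator; with two non-constant strings no such bound exists.  */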
7714 tree t1 = MEM_EXPR (src1);
7715 tree t2 = MEM_EXPR (src2);
7716 if (!((t1 && TREE_CODE (t1) == MEM_REF
7717 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
7718 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
7719 == STRING_CST))
7720 || (t2 && TREE_CODE (t2) == MEM_REF
7721 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
7722 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
7723 == STRING_CST))))
7724 return false;
7725 }
7726
7727 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
7728 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
7729 if (addr1 != XEXP (src1, 0))
7730 src1 = replace_equiv_address_nv (src1, addr1);
7731 if (addr2 != XEXP (src2, 0))
7732 src2 = replace_equiv_address_nv (src2, addr2);
7733
7734 /* NB: Make a copy of the data length so that the cmpstrnqi patterns
7735 do not modify the original length.  */
7736 length = ix86_zero_extend_to_Pmode (length);
7737 rtx lengthreg = gen_reg_rtx (Pmode);
7738 emit_move_insn (lengthreg, length);
7739
7740 /* If we are testing strict equality, we can use known alignment to
7741 good advantage. This may be possible with combine, particularly
7742 once cc0 is dead. */
7743 if (CONST_INT_P (length))
7744 {
7745 if (length == const0_rtx)
7746 {
7747 emit_move_insn (result, const0_rtx);
7748 return true;
7749 }
7750 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
7751 src1, src2));
7752 }
7753 else
7754 {
7755 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
7756 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
7757 src1, src2));
7758 }
7759
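/* Added note: the cmpintqi pattern turns the flags left by the string
   compare into a -1/0/+1 value in a QImode register, and the sign
   extension below widens it to the int-sized result that callers of
   memcmp/strncmp expect.  */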
7760 rtx out = gen_lowpart (QImode, result);
7761 emit_insn (gen_cmpintqi (out));
7762 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
7763
7764 return true;
7765 }
7766
7767 /* Expand the appropriate insns for doing strlen if not just doing
7768 repnz; scasb
7769
7770 out = result, initialized with the start address
7771 align_rtx = alignment of the address.
7772 scratch = scratch register, initialized with the start address when
7773 not aligned, otherwise undefined
7774
7775 This is just the body. It needs the initializations mentioned above and
7776 some address computing at the end. These things are done in i386.md. */
7777
7778 static void
7779 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7780 {
7781 int align;
7782 rtx tmp;
7783 rtx_code_label *align_2_label = NULL;
7784 rtx_code_label *align_3_label = NULL;
7785 rtx_code_label *align_4_label = gen_label_rtx ();
7786 rtx_code_label *end_0_label = gen_label_rtx ();
7787 rtx mem;
7788 rtx tmpreg = gen_reg_rtx (SImode);
7789 rtx scratch = gen_reg_rtx (SImode);
7790 rtx cmp;
7791
7792 align = 0;
7793 if (CONST_INT_P (align_rtx))
7794 align = INTVAL (align_rtx);
7795
7796 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7797
7798 /* Is there a known alignment and is it less than 4? */
7799 if (align < 4)
7800 {
7801 rtx scratch1 = gen_reg_rtx (Pmode);
7802 emit_move_insn (scratch1, out);
7803 /* Is there a known alignment and is it not 2? */
7804 if (align != 2)
7805 {
7806 align_3_label = gen_label_rtx (); /* Label when addr & 3 == 3.  */
7807 align_2_label = gen_label_rtx (); /* Label when addr & 3 == 2.  */
7808
7809 /* Leave just the 3 lower bits. */
7810 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7811 NULL_RTX, 0, OPTAB_WIDEN);
7812
7813 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7814 Pmode, 1, align_4_label);
7815 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7816 Pmode, 1, align_2_label);
7817 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7818 Pmode, 1, align_3_label);
7819 }
7820 else
7821 {
7822 /* Since the alignment is 2, we have to check 2 or 0 bytes;
7823 check whether the address is already 4-byte aligned.  */
7824
7825 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7826 NULL_RTX, 0, OPTAB_WIDEN);
7827
7828 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7829 Pmode, 1, align_4_label);
7830 }
7831
7832 mem = change_address (src, QImode, out);
7833
7834 /* Now compare the bytes. */
7835
7836 /* Compare the first n unaligned bytes one byte at a time.  */
7837 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7838 QImode, 1, end_0_label);
7839
7840 /* Increment the address. */
7841 emit_insn (gen_add2_insn (out, const1_rtx));
7842
7843 /* Not needed with an alignment of 2.  */
7844 if (align != 2)
7845 {
7846 emit_label (align_2_label);
7847
7848 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7849 end_0_label);
7850
7851 emit_insn (gen_add2_insn (out, const1_rtx));
7852
7853 emit_label (align_3_label);
7854 }
7855
7856 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7857 end_0_label);
7858
7859 emit_insn (gen_add2_insn (out, const1_rtx));
7860 }
7861
7862 /* Generate loop to check 4 bytes at a time. It is not a good idea to
7863 align this loop.  It only makes the program larger and does not
7864 speed it up.  */
7865 emit_label (align_4_label);
7866
7867 mem = change_address (src, SImode, out);
7868 emit_move_insn (scratch, mem);
7869 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7870
7871 /* This formula yields a nonzero result iff one of the bytes is zero.
7872 This saves three branches inside the loop and many cycles.  */
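/* Worked example (added for illustration): for SCRATCH == 0x11003344,
   SCRATCH - 0x01010101 == 0x0FFF3243 and ~SCRATCH == 0xEEFFCCBB, so
   0x0FFF3243 & 0xEEFFCCBB & 0x80808080 == 0x00800000; the result is
   nonzero exactly when some byte of SCRATCH is zero.  */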
7873
7874 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7875 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7876 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7877 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7878 gen_int_mode (0x80808080, SImode)));
7879 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7880 align_4_label);
7881
7882 if (TARGET_CMOVE)
7883 {
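/* Added note: this is the branch-free variant.  When the test below
   finds no zero byte in the low 16 bits, conditional moves replace the
   mask with its upper half and advance OUT by two bytes, instead of
   the jump used in the non-CMOV path further down.  */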
7884 rtx reg = gen_reg_rtx (SImode);
7885 rtx reg2 = gen_reg_rtx (Pmode);
7886 emit_move_insn (reg, tmpreg);
7887 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7888
7889 /* If zero is not in the first two bytes, move two bytes forward. */
7890 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7891 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7892 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7893 emit_insn (gen_rtx_SET (tmpreg,
7894 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7895 reg,
7896 tmpreg)));
7897 /* Emit lea manually to avoid clobbering of flags. */
7898 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
7899
7900 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7901 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7902 emit_insn (gen_rtx_SET (out,
7903 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7904 reg2,
7905 out)));
7906 }
7907 else
7908 {
7909 rtx_code_label *end_2_label = gen_label_rtx ();
7910 /* Is zero in the first two bytes? */
7911
7912 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7913 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7914 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7915 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7916 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7917 pc_rtx);
7918 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7919 JUMP_LABEL (tmp) = end_2_label;
7920
7921 /* Not in the first two. Move two bytes forward. */
7922 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7923 emit_insn (gen_add2_insn (out, const2_rtx));
7924
7925 emit_label (end_2_label);
7926
7927 }
7928
7929 /* Avoid a branch in the final byte adjustment.  */
7930 tmpreg = gen_lowpart (QImode, tmpreg);
7931 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7932 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7933 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7934 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7935
7936 emit_label (end_0_label);
7937 }
7938
7939 /* Expand strlen. */
7940
7941 bool
7942 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7943 {
7944 if (TARGET_UNROLL_STRLEN
7945 && TARGET_INLINE_ALL_STRINGOPS
7946 && eoschar == const0_rtx
7947 && optimize > 1)
7948 {
7949 /* The generic case of the strlen expander is long.  Avoid its
7950 expansion unless TARGET_INLINE_ALL_STRINGOPS.  */
7951 rtx addr = force_reg (Pmode, XEXP (src, 0));
7952 /* Well it seems that some optimizer does not combine a call like
7953 foo(strlen(bar), strlen(bar));
7954 when the move and the subtraction are done here.  It does calculate
7955 the length just once when these instructions are done inside of
7956 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
7957 often used and I use one fewer register for the lifetime of
7958 output_strlen_unroll() this is better. */
7959
7960 emit_move_insn (out, addr);
7961
7962 ix86_expand_strlensi_unroll_1 (out, src, align);
7963
7964 /* strlensi_unroll_1 returns the address of the zero at the end of
7965 the string, like memchr(), so compute the length by subtracting
7966 the start address. */
7967 emit_insn (gen_sub2_insn (out, addr));
7968 return true;
7969 }
7970 else
7971 return false;
7972 }
7973
7974 /* For a given symbol (function), construct code to compute the address of
7975 its PLT entry in the large x86-64 PIC model.  */
7976
7977 static rtx
7978 construct_plt_address (rtx symbol)
7979 {
7980 rtx tmp, unspec;
7981
7982 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7983 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7984 gcc_assert (Pmode == DImode);
7985
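/* Added note: UNSPEC_PLTOFF becomes SYMBOL@PLTOFF, the 64-bit offset of
   the symbol's PLT entry from the GOT base; since the PIC register holds
   the GOT base in this model, the addition below yields the PLT entry's
   address.  */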
7986 tmp = gen_reg_rtx (Pmode);
7987 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7988
7989 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
7990 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7991 return tmp;
7992 }
7993
7994 /* Additional registers that are clobbered by SYSV calls. */
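/* Added note: these registers (RSI, RDI and XMM6-XMM15) are call-clobbered
   under the SysV ABI but call-saved under the MS ABI, so an MS-ABI caller
   must list them as explicit clobbers when calling SysV code.  */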
7995
7996 static int const x86_64_ms_sysv_extra_clobbered_registers
7997 [NUM_X86_64_MS_CLOBBERED_REGS] =
7998 {
7999 SI_REG, DI_REG,
8000 XMM6_REG, XMM7_REG,
8001 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
8002 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
8003 };
8004
8005 rtx_insn *
8006 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
8007 rtx callarg2,
8008 rtx pop, bool sibcall)
8009 {
8010 rtx vec[3];
8011 rtx use = NULL, call;
8012 unsigned int vec_len = 0;
8013 tree fndecl;
8014
8015 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8016 {
8017 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
8018 if (fndecl
8019 && (lookup_attribute ("interrupt",
8020 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
8021 error ("interrupt service routine cannot be called directly");
8022 }
8023 else
8024 fndecl = NULL_TREE;
8025
8026 if (pop == const0_rtx)
8027 pop = NULL;
8028 gcc_assert (!TARGET_64BIT || !pop);
8029
8030 if (TARGET_MACHO && !TARGET_64BIT)
8031 {
8032 #if TARGET_MACHO
8033 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8034 fnaddr = machopic_indirect_call_target (fnaddr);
8035 #endif
8036 }
8037 else
8038 {
8039 /* Static functions and indirect calls don't need the pic register. Also,
8040 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
8041 it an indirect call. */
8042 rtx addr = XEXP (fnaddr, 0);
8043 if (flag_pic
8044 && GET_CODE (addr) == SYMBOL_REF
8045 && !SYMBOL_REF_LOCAL_P (addr))
8046 {
8047 if (flag_plt
8048 && (SYMBOL_REF_DECL (addr) == NULL_TREE
8049 || !lookup_attribute ("noplt",
8050 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
8051 {
8052 if (!TARGET_64BIT
8053 || (ix86_cmodel == CM_LARGE_PIC
8054 && DEFAULT_ABI != MS_ABI))
8055 {
8056 use_reg (&use, gen_rtx_REG (Pmode,
8057 REAL_PIC_OFFSET_TABLE_REGNUM));
8058 if (ix86_use_pseudo_pic_reg ())
8059 emit_move_insn (gen_rtx_REG (Pmode,
8060 REAL_PIC_OFFSET_TABLE_REGNUM),
8061 pic_offset_table_rtx);
8062 }
8063 }
8064 else if (!TARGET_PECOFF && !TARGET_MACHO)
8065 {
8066 if (TARGET_64BIT
8067 && ix86_cmodel == CM_LARGE_PIC
8068 && DEFAULT_ABI != MS_ABI)
8069 {
8070 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8071 UNSPEC_GOT);
8072 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8073 fnaddr = force_reg (Pmode, fnaddr);
8074 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
8075 }
8076 else if (TARGET_64BIT)
8077 {
8078 fnaddr = gen_rtx_UNSPEC (Pmode,
8079 gen_rtvec (1, addr),
8080 UNSPEC_GOTPCREL);
8081 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8082 }
8083 else
8084 {
8085 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8086 UNSPEC_GOT);
8087 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8088 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8089 fnaddr);
8090 }
8091 fnaddr = gen_const_mem (Pmode, fnaddr);
8092 /* Pmode may not be the same as word_mode for x32, which
8093 doesn't support indirect branch via 32-bit memory slot.
8094 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8095 indirect branch via x32 GOT slot is OK. */
8096 if (GET_MODE (fnaddr) != word_mode)
8097 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8098 fnaddr = gen_rtx_MEM (QImode, fnaddr);
8099 }
8100 }
8101 }
8102
8103 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8104 parameters passed in vector registers. */
8105 if (TARGET_64BIT
8106 && (INTVAL (callarg2) > 0
8107 || (INTVAL (callarg2) == 0
8108 && (TARGET_SSE || !flag_skip_rax_setup))))
8109 {
8110 rtx al = gen_rtx_REG (QImode, AX_REG);
8111 emit_move_insn (al, callarg2);
8112 use_reg (&use, al);
8113 }
8114
8115 if (ix86_cmodel == CM_LARGE_PIC
8116 && !TARGET_PECOFF
8117 && MEM_P (fnaddr)
8118 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8119 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8120 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8121 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8122 branch via x32 GOT slot is OK. */
8123 else if (!(TARGET_X32
8124 && MEM_P (fnaddr)
8125 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8126 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8127 && (sibcall
8128 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8129 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8130 {
8131 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8132 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8133 }
8134
8135 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8136
8137 if (retval)
8138 call = gen_rtx_SET (retval, call);
8139 vec[vec_len++] = call;
8140
8141 if (pop)
8142 {
8143 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8144 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8145 vec[vec_len++] = pop;
8146 }
8147
8148 if (cfun->machine->no_caller_saved_registers
8149 && (!fndecl
8150 || (!TREE_THIS_VOLATILE (fndecl)
8151 && !lookup_attribute ("no_caller_saved_registers",
8152 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8153 {
8154 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8155 bool is_64bit_ms_abi = (TARGET_64BIT
8156 && ix86_function_abi (fndecl) == MS_ABI);
8157 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8158
8159 /* If there are no caller-saved registers, add all registers
8160 that are clobbered by the call which returns. */
8161 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8162 if (!fixed_regs[i]
8163 && (ix86_call_used_regs[i] == 1
8164 || (ix86_call_used_regs[i] & c_mask))
8165 && !STACK_REGNO_P (i)
8166 && !MMX_REGNO_P (i))
8167 clobber_reg (&use,
8168 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8169 }
8170 else if (TARGET_64BIT_MS_ABI
8171 && (!callarg2 || INTVAL (callarg2) != -2))
8172 {
8173 unsigned i;
8174
8175 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8176 {
8177 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8178 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8179
8180 clobber_reg (&use, gen_rtx_REG (mode, regno));
8181 }
8182
8183 /* Set here, but it may get cleared later. */
8184 if (TARGET_CALL_MS2SYSV_XLOGUES)
8185 {
8186 if (!TARGET_SSE)
8187 ;
8188
8189 /* Don't break hot-patched functions. */
8190 else if (ix86_function_ms_hook_prologue (current_function_decl))
8191 ;
8192
8193 /* TODO: Cases not yet examined. */
8194 else if (flag_split_stack)
8195 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8196
8197 else
8198 {
8199 gcc_assert (!reload_completed);
8200 cfun->machine->call_ms2sysv = true;
8201 }
8202 }
8203 }
8204
8205 if (vec_len > 1)
8206 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8207 rtx_insn *call_insn = emit_call_insn (call);
8208 if (use)
8209 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8210
8211 return call_insn;
8212 }
8213
8214 /* Split a simple return that pops POPC bytes from the stack into an
8215 indirect branch with a stack adjustment.  */
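/* A sketch of the emitted sequence (added for illustration):

       popl  %ecx          # return address -> ECX
       addl  $POPC, %esp   # drop the callee-popped argument bytes
       jmp   *%ecx         # return via indirect branch  */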
8216
8217 void
8218 ix86_split_simple_return_pop_internal (rtx popc)
8219 {
8220 struct machine_function *m = cfun->machine;
8221 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8222 rtx_insn *insn;
8223
8224 /* There is no "pascal" calling convention in any 64-bit ABI.  */
8225 gcc_assert (!TARGET_64BIT);
8226
8227 insn = emit_insn (gen_pop (ecx));
8228 m->fs.cfa_offset -= UNITS_PER_WORD;
8229 m->fs.sp_offset -= UNITS_PER_WORD;
8230
8231 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8232 x = gen_rtx_SET (stack_pointer_rtx, x);
8233 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8234 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8235 RTX_FRAME_RELATED_P (insn) = 1;
8236
8237 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8238 x = gen_rtx_SET (stack_pointer_rtx, x);
8239 insn = emit_insn (x);
8240 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8241 RTX_FRAME_RELATED_P (insn) = 1;
8242
8243 /* Now return address is in ECX. */
8244 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8245 }
8246
8247 /* Errors in the source file can cause expand_expr to return const0_rtx
8248 where we expect a vector. To avoid crashing, use one of the vector
8249 clear instructions. */
8250
8251 static rtx
8252 safe_vector_operand (rtx x, machine_mode mode)
8253 {
8254 if (x == const0_rtx)
8255 x = CONST0_RTX (mode);
8256 return x;
8257 }
8258
8259 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8260
8261 static rtx
8262 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8263 {
8264 rtx pat;
8265 tree arg0 = CALL_EXPR_ARG (exp, 0);
8266 tree arg1 = CALL_EXPR_ARG (exp, 1);
8267 rtx op0 = expand_normal (arg0);
8268 rtx op1 = expand_normal (arg1);
8269 machine_mode tmode = insn_data[icode].operand[0].mode;
8270 machine_mode mode0 = insn_data[icode].operand[1].mode;
8271 machine_mode mode1 = insn_data[icode].operand[2].mode;
8272
8273 if (VECTOR_MODE_P (mode0))
8274 op0 = safe_vector_operand (op0, mode0);
8275 if (VECTOR_MODE_P (mode1))
8276 op1 = safe_vector_operand (op1, mode1);
8277
8278 if (optimize || !target
8279 || GET_MODE (target) != tmode
8280 || !insn_data[icode].operand[0].predicate (target, tmode))
8281 target = gen_reg_rtx (tmode);
8282
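/* Added note: when the second operand is a 32-bit integer but the insn
   expects TImode, load it into the low element of a V4SI register and
   view that register as TImode, as done below.  */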
8283 if (GET_MODE (op1) == SImode && mode1 == TImode)
8284 {
8285 rtx x = gen_reg_rtx (V4SImode);
8286 emit_insn (gen_sse2_loadd (x, op1));
8287 op1 = gen_lowpart (TImode, x);
8288 }
8289
8290 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8291 op0 = copy_to_mode_reg (mode0, op0);
8292 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8293 op1 = copy_to_mode_reg (mode1, op1);
8294
8295 pat = GEN_FCN (icode) (target, op0, op1);
8296 if (! pat)
8297 return 0;
8298
8299 emit_insn (pat);
8300
8301 return target;
8302 }
8303
8304 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8305
8306 static rtx
8307 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8308 enum ix86_builtin_func_type m_type,
8309 enum rtx_code sub_code)
8310 {
8311 rtx pat;
8312 unsigned int i, nargs;
8313 bool comparison_p = false;
8314 bool tf_p = false;
8315 bool last_arg_constant = false;
8316 int num_memory = 0;
8317 rtx xops[4];
8318
8319 machine_mode tmode = insn_data[icode].operand[0].mode;
8320
8321 switch (m_type)
8322 {
8323 case MULTI_ARG_4_DF2_DI_I:
8324 case MULTI_ARG_4_DF2_DI_I1:
8325 case MULTI_ARG_4_SF2_SI_I:
8326 case MULTI_ARG_4_SF2_SI_I1:
8327 nargs = 4;
8328 last_arg_constant = true;
8329 break;
8330
8331 case MULTI_ARG_3_SF:
8332 case MULTI_ARG_3_DF:
8333 case MULTI_ARG_3_SF2:
8334 case MULTI_ARG_3_DF2:
8335 case MULTI_ARG_3_DI:
8336 case MULTI_ARG_3_SI:
8337 case MULTI_ARG_3_SI_DI:
8338 case MULTI_ARG_3_HI:
8339 case MULTI_ARG_3_HI_SI:
8340 case MULTI_ARG_3_QI:
8341 case MULTI_ARG_3_DI2:
8342 case MULTI_ARG_3_SI2:
8343 case MULTI_ARG_3_HI2:
8344 case MULTI_ARG_3_QI2:
8345 nargs = 3;
8346 break;
8347
8348 case MULTI_ARG_2_SF:
8349 case MULTI_ARG_2_DF:
8350 case MULTI_ARG_2_DI:
8351 case MULTI_ARG_2_SI:
8352 case MULTI_ARG_2_HI:
8353 case MULTI_ARG_2_QI:
8354 nargs = 2;
8355 break;
8356
8357 case MULTI_ARG_2_DI_IMM:
8358 case MULTI_ARG_2_SI_IMM:
8359 case MULTI_ARG_2_HI_IMM:
8360 case MULTI_ARG_2_QI_IMM:
8361 nargs = 2;
8362 last_arg_constant = true;
8363 break;
8364
8365 case MULTI_ARG_1_SF:
8366 case MULTI_ARG_1_DF:
8367 case MULTI_ARG_1_SF2:
8368 case MULTI_ARG_1_DF2:
8369 case MULTI_ARG_1_DI:
8370 case MULTI_ARG_1_SI:
8371 case MULTI_ARG_1_HI:
8372 case MULTI_ARG_1_QI:
8373 case MULTI_ARG_1_SI_DI:
8374 case MULTI_ARG_1_HI_DI:
8375 case MULTI_ARG_1_HI_SI:
8376 case MULTI_ARG_1_QI_DI:
8377 case MULTI_ARG_1_QI_SI:
8378 case MULTI_ARG_1_QI_HI:
8379 nargs = 1;
8380 break;
8381
8382 case MULTI_ARG_2_DI_CMP:
8383 case MULTI_ARG_2_SI_CMP:
8384 case MULTI_ARG_2_HI_CMP:
8385 case MULTI_ARG_2_QI_CMP:
8386 nargs = 2;
8387 comparison_p = true;
8388 break;
8389
8390 case MULTI_ARG_2_SF_TF:
8391 case MULTI_ARG_2_DF_TF:
8392 case MULTI_ARG_2_DI_TF:
8393 case MULTI_ARG_2_SI_TF:
8394 case MULTI_ARG_2_HI_TF:
8395 case MULTI_ARG_2_QI_TF:
8396 nargs = 2;
8397 tf_p = true;
8398 break;
8399
8400 default:
8401 gcc_unreachable ();
8402 }
8403
8404 if (optimize || !target
8405 || GET_MODE (target) != tmode
8406 || !insn_data[icode].operand[0].predicate (target, tmode))
8407 target = gen_reg_rtx (tmode);
8408 else if (memory_operand (target, tmode))
8409 num_memory++;
8410
8411 gcc_assert (nargs <= ARRAY_SIZE (xops));
8412
8413 for (i = 0; i < nargs; i++)
8414 {
8415 tree arg = CALL_EXPR_ARG (exp, i);
8416 rtx op = expand_normal (arg);
8417 int adjust = (comparison_p) ? 1 : 0;
8418 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8419
8420 if (last_arg_constant && i == nargs - 1)
8421 {
8422 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8423 {
8424 enum insn_code new_icode = icode;
8425 switch (icode)
8426 {
8427 case CODE_FOR_xop_vpermil2v2df3:
8428 case CODE_FOR_xop_vpermil2v4sf3:
8429 case CODE_FOR_xop_vpermil2v4df3:
8430 case CODE_FOR_xop_vpermil2v8sf3:
8431 error ("the last argument must be a 2-bit immediate");
8432 return gen_reg_rtx (tmode);
8433 case CODE_FOR_xop_rotlv2di3:
8434 new_icode = CODE_FOR_rotlv2di3;
8435 goto xop_rotl;
8436 case CODE_FOR_xop_rotlv4si3:
8437 new_icode = CODE_FOR_rotlv4si3;
8438 goto xop_rotl;
8439 case CODE_FOR_xop_rotlv8hi3:
8440 new_icode = CODE_FOR_rotlv8hi3;
8441 goto xop_rotl;
8442 case CODE_FOR_xop_rotlv16qi3:
8443 new_icode = CODE_FOR_rotlv16qi3;
8444 xop_rotl:
8445 if (CONST_INT_P (op))
8446 {
8447 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8448 op = GEN_INT (INTVAL (op) & mask);
8449 gcc_checking_assert
8450 (insn_data[icode].operand[i + 1].predicate (op, mode));
8451 }
8452 else
8453 {
8454 gcc_checking_assert
8455 (nargs == 2
8456 && insn_data[new_icode].operand[0].mode == tmode
8457 && insn_data[new_icode].operand[1].mode == tmode
8458 && insn_data[new_icode].operand[2].mode == mode
8459 && insn_data[new_icode].operand[0].predicate
8460 == insn_data[icode].operand[0].predicate
8461 && insn_data[new_icode].operand[1].predicate
8462 == insn_data[icode].operand[1].predicate);
8463 icode = new_icode;
8464 goto non_constant;
8465 }
8466 break;
8467 default:
8468 gcc_unreachable ();
8469 }
8470 }
8471 }
8472 else
8473 {
8474 non_constant:
8475 if (VECTOR_MODE_P (mode))
8476 op = safe_vector_operand (op, mode);
8477
8478 /* If we aren't optimizing, only allow one memory operand to be
8479 generated. */
8480 if (memory_operand (op, mode))
8481 num_memory++;
8482
8483 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8484
8485 if (optimize
8486 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8487 || num_memory > 1)
8488 op = force_reg (mode, op);
8489 }
8490
8491 xops[i] = op;
8492 }
8493
8494 switch (nargs)
8495 {
8496 case 1:
8497 pat = GEN_FCN (icode) (target, xops[0]);
8498 break;
8499
8500 case 2:
8501 if (tf_p)
8502 pat = GEN_FCN (icode) (target, xops[0], xops[1],
8503 GEN_INT ((int)sub_code));
8504 else if (! comparison_p)
8505 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
8506 else
8507 {
8508 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8509 xops[0], xops[1]);
8510
8511 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
8512 }
8513 break;
8514
8515 case 3:
8516 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
8517 break;
8518
8519 case 4:
8520 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
8521 break;
8522
8523 default:
8524 gcc_unreachable ();
8525 }
8526
8527 if (! pat)
8528 return 0;
8529
8530 emit_insn (pat);
8531 return target;
8532 }
8533
8534 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8535 insns with vec_merge. */
8536
8537 static rtx
8538 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8539 rtx target)
8540 {
8541 rtx pat;
8542 tree arg0 = CALL_EXPR_ARG (exp, 0);
8543 rtx op1, op0 = expand_normal (arg0);
8544 machine_mode tmode = insn_data[icode].operand[0].mode;
8545 machine_mode mode0 = insn_data[icode].operand[1].mode;
8546
8547 if (optimize || !target
8548 || GET_MODE (target) != tmode
8549 || !insn_data[icode].operand[0].predicate (target, tmode))
8550 target = gen_reg_rtx (tmode);
8551
8552 if (VECTOR_MODE_P (mode0))
8553 op0 = safe_vector_operand (op0, mode0);
8554
8555 if ((optimize && !register_operand (op0, mode0))
8556 || !insn_data[icode].operand[1].predicate (op0, mode0))
8557 op0 = copy_to_mode_reg (mode0, op0);
8558
8559 op1 = op0;
8560 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8561 op1 = copy_to_mode_reg (mode0, op1);
8562
8563 pat = GEN_FCN (icode) (target, op0, op1);
8564 if (! pat)
8565 return 0;
8566 emit_insn (pat);
8567 return target;
8568 }
8569
8570 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8571
8572 static rtx
8573 ix86_expand_sse_compare (const struct builtin_description *d,
8574 tree exp, rtx target, bool swap)
8575 {
8576 rtx pat;
8577 tree arg0 = CALL_EXPR_ARG (exp, 0);
8578 tree arg1 = CALL_EXPR_ARG (exp, 1);
8579 rtx op0 = expand_normal (arg0);
8580 rtx op1 = expand_normal (arg1);
8581 rtx op2;
8582 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8583 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8584 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8585 enum rtx_code comparison = d->comparison;
8586
8587 if (VECTOR_MODE_P (mode0))
8588 op0 = safe_vector_operand (op0, mode0);
8589 if (VECTOR_MODE_P (mode1))
8590 op1 = safe_vector_operand (op1, mode1);
8591
8592 /* Swap operands if we have a comparison that isn't available in
8593 hardware. */
8594 if (swap)
8595 std::swap (op0, op1);
8596
8597 if (optimize || !target
8598 || GET_MODE (target) != tmode
8599 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8600 target = gen_reg_rtx (tmode);
8601
8602 if ((optimize && !register_operand (op0, mode0))
8603 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8604 op0 = copy_to_mode_reg (mode0, op0);
8605 if ((optimize && !register_operand (op1, mode1))
8606 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8607 op1 = copy_to_mode_reg (mode1, op1);
8608
8609 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8610 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8611 if (! pat)
8612 return 0;
8613 emit_insn (pat);
8614 return target;
8615 }
8616
8617 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8618
8619 static rtx
8620 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8621 rtx target)
8622 {
8623 rtx pat;
8624 tree arg0 = CALL_EXPR_ARG (exp, 0);
8625 tree arg1 = CALL_EXPR_ARG (exp, 1);
8626 rtx op0 = expand_normal (arg0);
8627 rtx op1 = expand_normal (arg1);
8628 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8629 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8630 enum rtx_code comparison = d->comparison;
8631
8632 if (VECTOR_MODE_P (mode0))
8633 op0 = safe_vector_operand (op0, mode0);
8634 if (VECTOR_MODE_P (mode1))
8635 op1 = safe_vector_operand (op1, mode1);
8636
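/* Added note: build the int result without a conditional branch: clear
   an SImode pseudo, then (after the compare) store the flags test into
   its QImode low part via a strict_low_part set and return the SImode
   pseudo.  */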
8637 target = gen_reg_rtx (SImode);
8638 emit_move_insn (target, const0_rtx);
8639 target = gen_rtx_SUBREG (QImode, target, 0);
8640
8641 if ((optimize && !register_operand (op0, mode0))
8642 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8643 op0 = copy_to_mode_reg (mode0, op0);
8644 if ((optimize && !register_operand (op1, mode1))
8645 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8646 op1 = copy_to_mode_reg (mode1, op1);
8647
8648 pat = GEN_FCN (d->icode) (op0, op1);
8649 if (! pat)
8650 return 0;
8651 emit_insn (pat);
8652 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8653 gen_rtx_fmt_ee (comparison, QImode,
8654 SET_DEST (pat),
8655 const0_rtx)));
8656
8657 return SUBREG_REG (target);
8658 }
8659
8660 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8661
8662 static rtx
8663 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8664 rtx target)
8665 {
8666 rtx pat;
8667 tree arg0 = CALL_EXPR_ARG (exp, 0);
8668 rtx op1, op0 = expand_normal (arg0);
8669 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8670 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8671
8672 if (optimize || target == 0
8673 || GET_MODE (target) != tmode
8674 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8675 target = gen_reg_rtx (tmode);
8676
8677 if (VECTOR_MODE_P (mode0))
8678 op0 = safe_vector_operand (op0, mode0);
8679
8680 if ((optimize && !register_operand (op0, mode0))
8681 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8682 op0 = copy_to_mode_reg (mode0, op0);
8683
8684 op1 = GEN_INT (d->comparison);
8685
8686 pat = GEN_FCN (d->icode) (target, op0, op1);
8687 if (! pat)
8688 return 0;
8689 emit_insn (pat);
8690 return target;
8691 }
8692
8693 static rtx
8694 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8695 tree exp, rtx target)
8696 {
8697 rtx pat;
8698 tree arg0 = CALL_EXPR_ARG (exp, 0);
8699 tree arg1 = CALL_EXPR_ARG (exp, 1);
8700 rtx op0 = expand_normal (arg0);
8701 rtx op1 = expand_normal (arg1);
8702 rtx op2;
8703 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8704 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8705 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8706
8707 if (optimize || target == 0
8708 || GET_MODE (target) != tmode
8709 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8710 target = gen_reg_rtx (tmode);
8711
8712 op0 = safe_vector_operand (op0, mode0);
8713 op1 = safe_vector_operand (op1, mode1);
8714
8715 if ((optimize && !register_operand (op0, mode0))
8716 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8717 op0 = copy_to_mode_reg (mode0, op0);
8718 if ((optimize && !register_operand (op1, mode1))
8719 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8720 op1 = copy_to_mode_reg (mode1, op1);
8721
8722 op2 = GEN_INT (d->comparison);
8723
8724 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8725 if (! pat)
8726 return 0;
8727 emit_insn (pat);
8728 return target;
8729 }
8730
8731 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8732
8733 static rtx
8734 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8735 rtx target)
8736 {
8737 rtx pat;
8738 tree arg0 = CALL_EXPR_ARG (exp, 0);
8739 tree arg1 = CALL_EXPR_ARG (exp, 1);
8740 rtx op0 = expand_normal (arg0);
8741 rtx op1 = expand_normal (arg1);
8742 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8743 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8744 enum rtx_code comparison = d->comparison;
8745
8746 if (VECTOR_MODE_P (mode0))
8747 op0 = safe_vector_operand (op0, mode0);
8748 if (VECTOR_MODE_P (mode1))
8749 op1 = safe_vector_operand (op1, mode1);
8750
8751 target = gen_reg_rtx (SImode);
8752 emit_move_insn (target, const0_rtx);
8753 target = gen_rtx_SUBREG (QImode, target, 0);
8754
8755 if ((optimize && !register_operand (op0, mode0))
8756 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8757 op0 = copy_to_mode_reg (mode0, op0);
8758 if ((optimize && !register_operand (op1, mode1))
8759 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8760 op1 = copy_to_mode_reg (mode1, op1);
8761
8762 pat = GEN_FCN (d->icode) (op0, op1);
8763 if (! pat)
8764 return 0;
8765 emit_insn (pat);
8766 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8767 gen_rtx_fmt_ee (comparison, QImode,
8768 SET_DEST (pat),
8769 const0_rtx)));
8770
8771 return SUBREG_REG (target);
8772 }
8773
8774 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8775
8776 static rtx
8777 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8778 tree exp, rtx target)
8779 {
8780 rtx pat;
8781 tree arg0 = CALL_EXPR_ARG (exp, 0);
8782 tree arg1 = CALL_EXPR_ARG (exp, 1);
8783 tree arg2 = CALL_EXPR_ARG (exp, 2);
8784 tree arg3 = CALL_EXPR_ARG (exp, 3);
8785 tree arg4 = CALL_EXPR_ARG (exp, 4);
8786 rtx scratch0, scratch1;
8787 rtx op0 = expand_normal (arg0);
8788 rtx op1 = expand_normal (arg1);
8789 rtx op2 = expand_normal (arg2);
8790 rtx op3 = expand_normal (arg3);
8791 rtx op4 = expand_normal (arg4);
8792 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8793
8794 tmode0 = insn_data[d->icode].operand[0].mode;
8795 tmode1 = insn_data[d->icode].operand[1].mode;
8796 modev2 = insn_data[d->icode].operand[2].mode;
8797 modei3 = insn_data[d->icode].operand[3].mode;
8798 modev4 = insn_data[d->icode].operand[4].mode;
8799 modei5 = insn_data[d->icode].operand[5].mode;
8800 modeimm = insn_data[d->icode].operand[6].mode;
8801
8802 if (VECTOR_MODE_P (modev2))
8803 op0 = safe_vector_operand (op0, modev2);
8804 if (VECTOR_MODE_P (modev4))
8805 op2 = safe_vector_operand (op2, modev4);
8806
8807 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8808 op0 = copy_to_mode_reg (modev2, op0);
8809 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8810 op1 = copy_to_mode_reg (modei3, op1);
8811 if ((optimize && !register_operand (op2, modev4))
8812 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8813 op2 = copy_to_mode_reg (modev4, op2);
8814 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8815 op3 = copy_to_mode_reg (modei5, op3);
8816
8817 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8818 {
8819 error ("the fifth argument must be an 8-bit immediate");
8820 return const0_rtx;
8821 }
8822
8823 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8824 {
8825 if (optimize || !target
8826 || GET_MODE (target) != tmode0
8827 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8828 target = gen_reg_rtx (tmode0);
8829
8830 scratch1 = gen_reg_rtx (tmode1);
8831
8832 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8833 }
8834 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8835 {
8836 if (optimize || !target
8837 || GET_MODE (target) != tmode1
8838 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8839 target = gen_reg_rtx (tmode1);
8840
8841 scratch0 = gen_reg_rtx (tmode0);
8842
8843 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8844 }
8845 else
8846 {
8847 gcc_assert (d->flag);
8848
8849 scratch0 = gen_reg_rtx (tmode0);
8850 scratch1 = gen_reg_rtx (tmode1);
8851
8852 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8853 }
8854
8855 if (! pat)
8856 return 0;
8857
8858 emit_insn (pat);
8859
8860 if (d->flag)
8861 {
8862 target = gen_reg_rtx (SImode);
8863 emit_move_insn (target, const0_rtx);
8864 target = gen_rtx_SUBREG (QImode, target, 0);
8865
8866 emit_insn
8867 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8868 gen_rtx_fmt_ee (EQ, QImode,
8869 gen_rtx_REG ((machine_mode) d->flag,
8870 FLAGS_REG),
8871 const0_rtx)));
8872 return SUBREG_REG (target);
8873 }
8874 else
8875 return target;
8876 }
8877
8878
8879 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8880
8881 static rtx
8882 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8883 tree exp, rtx target)
8884 {
8885 rtx pat;
8886 tree arg0 = CALL_EXPR_ARG (exp, 0);
8887 tree arg1 = CALL_EXPR_ARG (exp, 1);
8888 tree arg2 = CALL_EXPR_ARG (exp, 2);
8889 rtx scratch0, scratch1;
8890 rtx op0 = expand_normal (arg0);
8891 rtx op1 = expand_normal (arg1);
8892 rtx op2 = expand_normal (arg2);
8893 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8894
8895 tmode0 = insn_data[d->icode].operand[0].mode;
8896 tmode1 = insn_data[d->icode].operand[1].mode;
8897 modev2 = insn_data[d->icode].operand[2].mode;
8898 modev3 = insn_data[d->icode].operand[3].mode;
8899 modeimm = insn_data[d->icode].operand[4].mode;
8900
8901 if (VECTOR_MODE_P (modev2))
8902 op0 = safe_vector_operand (op0, modev2);
8903 if (VECTOR_MODE_P (modev3))
8904 op1 = safe_vector_operand (op1, modev3);
8905
8906 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8907 op0 = copy_to_mode_reg (modev2, op0);
8908 if ((optimize && !register_operand (op1, modev3))
8909 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8910 op1 = copy_to_mode_reg (modev3, op1);
8911
8912 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8913 {
8914 error ("the third argument must be an 8-bit immediate");
8915 return const0_rtx;
8916 }
8917
8918 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8919 {
8920 if (optimize || !target
8921 || GET_MODE (target) != tmode0
8922 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8923 target = gen_reg_rtx (tmode0);
8924
8925 scratch1 = gen_reg_rtx (tmode1);
8926
8927 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8928 }
8929 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8930 {
8931 if (optimize || !target
8932 || GET_MODE (target) != tmode1
8933 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8934 target = gen_reg_rtx (tmode1);
8935
8936 scratch0 = gen_reg_rtx (tmode0);
8937
8938 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8939 }
8940 else
8941 {
8942 gcc_assert (d->flag);
8943
8944 scratch0 = gen_reg_rtx (tmode0);
8945 scratch1 = gen_reg_rtx (tmode1);
8946
8947 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8948 }
8949
8950 if (! pat)
8951 return 0;
8952
8953 emit_insn (pat);
8954
8955 if (d->flag)
8956 {
8957 target = gen_reg_rtx (SImode);
8958 emit_move_insn (target, const0_rtx);
8959 target = gen_rtx_SUBREG (QImode, target, 0);
8960
8961 emit_insn
8962 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8963 gen_rtx_fmt_ee (EQ, QImode,
8964 gen_rtx_REG ((machine_mode) d->flag,
8965 FLAGS_REG),
8966 const0_rtx)));
8967 return SUBREG_REG (target);
8968 }
8969 else
8970 return target;
8971 }
8972
8973 /* Fix up modeless constants to fit the required mode.  */
8974
8975 static rtx
8976 fixup_modeless_constant (rtx x, machine_mode mode)
8977 {
8978 if (GET_MODE (x) == VOIDmode)
8979 x = convert_to_mode (mode, x, 1);
8980 return x;
8981 }
8982
8983 /* Subroutine of ix86_expand_builtin to take care of insns with
8984 variable number of operands. */
8985
8986 static rtx
8987 ix86_expand_args_builtin (const struct builtin_description *d,
8988 tree exp, rtx target)
8989 {
8990 rtx pat, real_target;
8991 unsigned int i, nargs;
8992 unsigned int nargs_constant = 0;
8993 unsigned int mask_pos = 0;
8994 int num_memory = 0;
8995 rtx xops[6];
8996 bool second_arg_count = false;
8997 enum insn_code icode = d->icode;
8998 const struct insn_data_d *insn_p = &insn_data[icode];
8999 machine_mode tmode = insn_p->operand[0].mode;
9000 machine_mode rmode = VOIDmode;
9001 bool swap = false;
9002 enum rtx_code comparison = d->comparison;
9003
9004 switch ((enum ix86_builtin_func_type) d->flag)
9005 {
9006 case V2DF_FTYPE_V2DF_ROUND:
9007 case V4DF_FTYPE_V4DF_ROUND:
9008 case V8DF_FTYPE_V8DF_ROUND:
9009 case V4SF_FTYPE_V4SF_ROUND:
9010 case V8SF_FTYPE_V8SF_ROUND:
9011 case V16SF_FTYPE_V16SF_ROUND:
9012 case V4SI_FTYPE_V4SF_ROUND:
9013 case V8SI_FTYPE_V8SF_ROUND:
9014 case V16SI_FTYPE_V16SF_ROUND:
9015 return ix86_expand_sse_round (d, exp, target);
9016 case V4SI_FTYPE_V2DF_V2DF_ROUND:
9017 case V8SI_FTYPE_V4DF_V4DF_ROUND:
9018 case V16SI_FTYPE_V8DF_V8DF_ROUND:
9019 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
9020 case INT_FTYPE_V8SF_V8SF_PTEST:
9021 case INT_FTYPE_V4DI_V4DI_PTEST:
9022 case INT_FTYPE_V4DF_V4DF_PTEST:
9023 case INT_FTYPE_V4SF_V4SF_PTEST:
9024 case INT_FTYPE_V2DI_V2DI_PTEST:
9025 case INT_FTYPE_V2DF_V2DF_PTEST:
9026 return ix86_expand_sse_ptest (d, exp, target);
9027 case FLOAT128_FTYPE_FLOAT128:
9028 case FLOAT_FTYPE_FLOAT:
9029 case INT_FTYPE_INT:
9030 case UINT_FTYPE_UINT:
9031 case UINT16_FTYPE_UINT16:
9032 case UINT64_FTYPE_INT:
9033 case UINT64_FTYPE_UINT64:
9034 case INT64_FTYPE_INT64:
9035 case INT64_FTYPE_V4SF:
9036 case INT64_FTYPE_V2DF:
9037 case INT_FTYPE_V16QI:
9038 case INT_FTYPE_V8QI:
9039 case INT_FTYPE_V8SF:
9040 case INT_FTYPE_V4DF:
9041 case INT_FTYPE_V4SF:
9042 case INT_FTYPE_V2DF:
9043 case INT_FTYPE_V32QI:
9044 case V16QI_FTYPE_V16QI:
9045 case V8SI_FTYPE_V8SF:
9046 case V8SI_FTYPE_V4SI:
9047 case V8HI_FTYPE_V8HI:
9048 case V8HI_FTYPE_V16QI:
9049 case V8QI_FTYPE_V8QI:
9050 case V8SF_FTYPE_V8SF:
9051 case V8SF_FTYPE_V8SI:
9052 case V8SF_FTYPE_V4SF:
9053 case V8SF_FTYPE_V8HI:
9054 case V4SI_FTYPE_V4SI:
9055 case V4SI_FTYPE_V16QI:
9056 case V4SI_FTYPE_V4SF:
9057 case V4SI_FTYPE_V8SI:
9058 case V4SI_FTYPE_V8HI:
9059 case V4SI_FTYPE_V4DF:
9060 case V4SI_FTYPE_V2DF:
9061 case V4HI_FTYPE_V4HI:
9062 case V4DF_FTYPE_V4DF:
9063 case V4DF_FTYPE_V4SI:
9064 case V4DF_FTYPE_V4SF:
9065 case V4DF_FTYPE_V2DF:
9066 case V4SF_FTYPE_V4SF:
9067 case V4SF_FTYPE_V4SI:
9068 case V4SF_FTYPE_V8SF:
9069 case V4SF_FTYPE_V4DF:
9070 case V4SF_FTYPE_V8HI:
9071 case V4SF_FTYPE_V2DF:
9072 case V2DI_FTYPE_V2DI:
9073 case V2DI_FTYPE_V16QI:
9074 case V2DI_FTYPE_V8HI:
9075 case V2DI_FTYPE_V4SI:
9076 case V2DF_FTYPE_V2DF:
9077 case V2DF_FTYPE_V4SI:
9078 case V2DF_FTYPE_V4DF:
9079 case V2DF_FTYPE_V4SF:
9080 case V2DF_FTYPE_V2SI:
9081 case V2SI_FTYPE_V2SI:
9082 case V2SI_FTYPE_V4SF:
9083 case V2SI_FTYPE_V2SF:
9084 case V2SI_FTYPE_V2DF:
9085 case V2SF_FTYPE_V2SF:
9086 case V2SF_FTYPE_V2SI:
9087 case V32QI_FTYPE_V32QI:
9088 case V32QI_FTYPE_V16QI:
9089 case V16HI_FTYPE_V16HI:
9090 case V16HI_FTYPE_V8HI:
9091 case V8SI_FTYPE_V8SI:
9092 case V16HI_FTYPE_V16QI:
9093 case V8SI_FTYPE_V16QI:
9094 case V4DI_FTYPE_V16QI:
9095 case V8SI_FTYPE_V8HI:
9096 case V4DI_FTYPE_V8HI:
9097 case V4DI_FTYPE_V4SI:
9098 case V4DI_FTYPE_V2DI:
9099 case UQI_FTYPE_UQI:
9100 case UHI_FTYPE_UHI:
9101 case USI_FTYPE_USI:
9102 case USI_FTYPE_UQI:
9103 case USI_FTYPE_UHI:
9104 case UDI_FTYPE_UDI:
9105 case UHI_FTYPE_V16QI:
9106 case USI_FTYPE_V32QI:
9107 case UDI_FTYPE_V64QI:
9108 case V16QI_FTYPE_UHI:
9109 case V32QI_FTYPE_USI:
9110 case V64QI_FTYPE_UDI:
9111 case V8HI_FTYPE_UQI:
9112 case V16HI_FTYPE_UHI:
9113 case V32HI_FTYPE_USI:
9114 case V4SI_FTYPE_UQI:
9115 case V8SI_FTYPE_UQI:
9116 case V4SI_FTYPE_UHI:
9117 case V8SI_FTYPE_UHI:
9118 case UQI_FTYPE_V8HI:
9119 case UHI_FTYPE_V16HI:
9120 case USI_FTYPE_V32HI:
9121 case UQI_FTYPE_V4SI:
9122 case UQI_FTYPE_V8SI:
9123 case UHI_FTYPE_V16SI:
9124 case UQI_FTYPE_V2DI:
9125 case UQI_FTYPE_V4DI:
9126 case UQI_FTYPE_V8DI:
9127 case V16SI_FTYPE_UHI:
9128 case V2DI_FTYPE_UQI:
9129 case V4DI_FTYPE_UQI:
9130 case V16SI_FTYPE_INT:
9131 case V16SF_FTYPE_V8SF:
9132 case V16SI_FTYPE_V8SI:
9133 case V16SF_FTYPE_V4SF:
9134 case V16SI_FTYPE_V4SI:
9135 case V16SI_FTYPE_V16SF:
9136 case V16SI_FTYPE_V16SI:
9137 case V64QI_FTYPE_V64QI:
9138 case V32HI_FTYPE_V32HI:
9139 case V16SF_FTYPE_V16SF:
9140 case V8DI_FTYPE_UQI:
9141 case V8DI_FTYPE_V8DI:
9142 case V8DF_FTYPE_V4DF:
9143 case V8DF_FTYPE_V2DF:
9144 case V8DF_FTYPE_V8DF:
9145 case V4DI_FTYPE_V4DI:
9146 case V16HI_FTYPE_V16SF:
9147 case V8HI_FTYPE_V8SF:
9148 case V8HI_FTYPE_V4SF:
9149 nargs = 1;
9150 break;
9151 case V4SF_FTYPE_V4SF_VEC_MERGE:
9152 case V2DF_FTYPE_V2DF_VEC_MERGE:
9153 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9154 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9155 case V16QI_FTYPE_V16QI_V16QI:
9156 case V16QI_FTYPE_V8HI_V8HI:
9157 case V16SF_FTYPE_V16SF_V16SF:
9158 case V8QI_FTYPE_V8QI_V8QI:
9159 case V8QI_FTYPE_V4HI_V4HI:
9160 case V8HI_FTYPE_V8HI_V8HI:
9161 case V8HI_FTYPE_V16QI_V16QI:
9162 case V8HI_FTYPE_V4SI_V4SI:
9163 case V8SF_FTYPE_V8SF_V8SF:
9164 case V8SF_FTYPE_V8SF_V8SI:
9165 case V8DF_FTYPE_V8DF_V8DF:
9166 case V4SI_FTYPE_V4SI_V4SI:
9167 case V4SI_FTYPE_V8HI_V8HI:
9168 case V4SI_FTYPE_V2DF_V2DF:
9169 case V4HI_FTYPE_V4HI_V4HI:
9170 case V4HI_FTYPE_V8QI_V8QI:
9171 case V4HI_FTYPE_V2SI_V2SI:
9172 case V4DF_FTYPE_V4DF_V4DF:
9173 case V4DF_FTYPE_V4DF_V4DI:
9174 case V4SF_FTYPE_V4SF_V4SF:
9175 case V4SF_FTYPE_V4SF_V4SI:
9176 case V4SF_FTYPE_V4SF_V2SI:
9177 case V4SF_FTYPE_V4SF_V2DF:
9178 case V4SF_FTYPE_V4SF_UINT:
9179 case V4SF_FTYPE_V4SF_DI:
9180 case V4SF_FTYPE_V4SF_SI:
9181 case V2DI_FTYPE_V2DI_V2DI:
9182 case V2DI_FTYPE_V16QI_V16QI:
9183 case V2DI_FTYPE_V4SI_V4SI:
9184 case V2DI_FTYPE_V2DI_V16QI:
9185 case V2SI_FTYPE_V2SI_V2SI:
9186 case V2SI_FTYPE_V4HI_V4HI:
9187 case V2SI_FTYPE_V2SF_V2SF:
9188 case V2DF_FTYPE_V2DF_V2DF:
9189 case V2DF_FTYPE_V2DF_V4SF:
9190 case V2DF_FTYPE_V2DF_V2DI:
9191 case V2DF_FTYPE_V2DF_DI:
9192 case V2DF_FTYPE_V2DF_SI:
9193 case V2DF_FTYPE_V2DF_UINT:
9194 case V2SF_FTYPE_V2SF_V2SF:
9195 case V1DI_FTYPE_V1DI_V1DI:
9196 case V1DI_FTYPE_V8QI_V8QI:
9197 case V1DI_FTYPE_V2SI_V2SI:
9198 case V32QI_FTYPE_V16HI_V16HI:
9199 case V16HI_FTYPE_V8SI_V8SI:
9200 case V64QI_FTYPE_V64QI_V64QI:
9201 case V32QI_FTYPE_V32QI_V32QI:
9202 case V16HI_FTYPE_V32QI_V32QI:
9203 case V16HI_FTYPE_V16HI_V16HI:
9204 case V8SI_FTYPE_V4DF_V4DF:
9205 case V8SI_FTYPE_V8SI_V8SI:
9206 case V8SI_FTYPE_V16HI_V16HI:
9207 case V4DI_FTYPE_V4DI_V4DI:
9208 case V4DI_FTYPE_V8SI_V8SI:
9209 case V8DI_FTYPE_V64QI_V64QI:
9210 if (comparison == UNKNOWN)
9211 return ix86_expand_binop_builtin (icode, exp, target);
9212 nargs = 2;
9213 break;
9214 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9215 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9216 gcc_assert (comparison != UNKNOWN);
9217 nargs = 2;
9218 swap = true;
9219 break;
9220 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9221 case V16HI_FTYPE_V16HI_SI_COUNT:
9222 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9223 case V8SI_FTYPE_V8SI_SI_COUNT:
9224 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9225 case V4DI_FTYPE_V4DI_INT_COUNT:
9226 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9227 case V8HI_FTYPE_V8HI_SI_COUNT:
9228 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9229 case V4SI_FTYPE_V4SI_SI_COUNT:
9230 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9231 case V4HI_FTYPE_V4HI_SI_COUNT:
9232 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9233 case V2DI_FTYPE_V2DI_SI_COUNT:
9234 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9235 case V2SI_FTYPE_V2SI_SI_COUNT:
9236 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9237 case V1DI_FTYPE_V1DI_SI_COUNT:
9238 nargs = 2;
9239 second_arg_count = true;
9240 break;
9241 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9242 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9243 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9244 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9245 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9246 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9247 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9248 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9249 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9250 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9251 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9252 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9253 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9254 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9255 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9256 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9257 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9258 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9259 nargs = 4;
9260 second_arg_count = true;
9261 break;
9262 case UINT64_FTYPE_UINT64_UINT64:
9263 case UINT_FTYPE_UINT_UINT:
9264 case UINT_FTYPE_UINT_USHORT:
9265 case UINT_FTYPE_UINT_UCHAR:
9266 case UINT16_FTYPE_UINT16_INT:
9267 case UINT8_FTYPE_UINT8_INT:
9268 case UQI_FTYPE_UQI_UQI:
9269 case UHI_FTYPE_UHI_UHI:
9270 case USI_FTYPE_USI_USI:
9271 case UDI_FTYPE_UDI_UDI:
9272 case V16SI_FTYPE_V8DF_V8DF:
9273 case V32HI_FTYPE_V16SF_V16SF:
9274 case V16HI_FTYPE_V8SF_V8SF:
9275 case V8HI_FTYPE_V4SF_V4SF:
9276 case V16HI_FTYPE_V16SF_UHI:
9277 case V8HI_FTYPE_V8SF_UQI:
9278 case V8HI_FTYPE_V4SF_UQI:
9279 nargs = 2;
9280 break;
9281 case V2DI_FTYPE_V2DI_INT_CONVERT:
9282 nargs = 2;
9283 rmode = V1TImode;
9284 nargs_constant = 1;
9285 break;
9286 case V4DI_FTYPE_V4DI_INT_CONVERT:
9287 nargs = 2;
9288 rmode = V2TImode;
9289 nargs_constant = 1;
9290 break;
9291 case V8DI_FTYPE_V8DI_INT_CONVERT:
9292 nargs = 2;
9293 rmode = V4TImode;
9294 nargs_constant = 1;
9295 break;
9296 case V8HI_FTYPE_V8HI_INT:
9297 case V8HI_FTYPE_V8SF_INT:
9298 case V16HI_FTYPE_V16SF_INT:
9299 case V8HI_FTYPE_V4SF_INT:
9300 case V8SF_FTYPE_V8SF_INT:
9301 case V4SF_FTYPE_V16SF_INT:
9302 case V16SF_FTYPE_V16SF_INT:
9303 case V4SI_FTYPE_V4SI_INT:
9304 case V4SI_FTYPE_V8SI_INT:
9305 case V4HI_FTYPE_V4HI_INT:
9306 case V4DF_FTYPE_V4DF_INT:
9307 case V4DF_FTYPE_V8DF_INT:
9308 case V4SF_FTYPE_V4SF_INT:
9309 case V4SF_FTYPE_V8SF_INT:
9310 case V2DI_FTYPE_V2DI_INT:
9311 case V2DF_FTYPE_V2DF_INT:
9312 case V2DF_FTYPE_V4DF_INT:
9313 case V16HI_FTYPE_V16HI_INT:
9314 case V8SI_FTYPE_V8SI_INT:
9315 case V16SI_FTYPE_V16SI_INT:
9316 case V4SI_FTYPE_V16SI_INT:
9317 case V4DI_FTYPE_V4DI_INT:
9318 case V2DI_FTYPE_V4DI_INT:
9319 case V4DI_FTYPE_V8DI_INT:
9320 case UQI_FTYPE_UQI_UQI_CONST:
9321 case UHI_FTYPE_UHI_UQI:
9322 case USI_FTYPE_USI_UQI:
9323 case UDI_FTYPE_UDI_UQI:
9324 nargs = 2;
9325 nargs_constant = 1;
9326 break;
9327 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9328 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9329 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9330 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9331 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9332 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9333 case UHI_FTYPE_V16SI_V16SI_UHI:
9334 case UQI_FTYPE_V8DI_V8DI_UQI:
9335 case V16HI_FTYPE_V16SI_V16HI_UHI:
9336 case V16QI_FTYPE_V16SI_V16QI_UHI:
9337 case V16QI_FTYPE_V8DI_V16QI_UQI:
9338 case V16SF_FTYPE_V16SF_V16SF_UHI:
9339 case V16SF_FTYPE_V4SF_V16SF_UHI:
9340 case V16SI_FTYPE_SI_V16SI_UHI:
9341 case V16SI_FTYPE_V16HI_V16SI_UHI:
9342 case V16SI_FTYPE_V16QI_V16SI_UHI:
9343 case V8SF_FTYPE_V4SF_V8SF_UQI:
9344 case V4DF_FTYPE_V2DF_V4DF_UQI:
9345 case V8SI_FTYPE_V4SI_V8SI_UQI:
9346 case V8SI_FTYPE_SI_V8SI_UQI:
9347 case V4SI_FTYPE_V4SI_V4SI_UQI:
9348 case V4SI_FTYPE_SI_V4SI_UQI:
9349 case V4DI_FTYPE_V2DI_V4DI_UQI:
9350 case V4DI_FTYPE_DI_V4DI_UQI:
9351 case V2DI_FTYPE_V2DI_V2DI_UQI:
9352 case V2DI_FTYPE_DI_V2DI_UQI:
9353 case V64QI_FTYPE_V64QI_V64QI_UDI:
9354 case V64QI_FTYPE_V16QI_V64QI_UDI:
9355 case V64QI_FTYPE_QI_V64QI_UDI:
9356 case V32QI_FTYPE_V32QI_V32QI_USI:
9357 case V32QI_FTYPE_V16QI_V32QI_USI:
9358 case V32QI_FTYPE_QI_V32QI_USI:
9359 case V16QI_FTYPE_V16QI_V16QI_UHI:
9360 case V16QI_FTYPE_QI_V16QI_UHI:
9361 case V32HI_FTYPE_V8HI_V32HI_USI:
9362 case V32HI_FTYPE_HI_V32HI_USI:
9363 case V16HI_FTYPE_V8HI_V16HI_UHI:
9364 case V16HI_FTYPE_HI_V16HI_UHI:
9365 case V8HI_FTYPE_V8HI_V8HI_UQI:
9366 case V8HI_FTYPE_HI_V8HI_UQI:
9367 case V8SF_FTYPE_V8HI_V8SF_UQI:
9368 case V4SF_FTYPE_V8HI_V4SF_UQI:
9369 case V8SI_FTYPE_V8SF_V8SI_UQI:
9370 case V4SI_FTYPE_V4SF_V4SI_UQI:
9371 case V4DI_FTYPE_V4SF_V4DI_UQI:
9372 case V2DI_FTYPE_V4SF_V2DI_UQI:
9373 case V4SF_FTYPE_V4DI_V4SF_UQI:
9374 case V4SF_FTYPE_V2DI_V4SF_UQI:
9375 case V4DF_FTYPE_V4DI_V4DF_UQI:
9376 case V2DF_FTYPE_V2DI_V2DF_UQI:
9377 case V16QI_FTYPE_V8HI_V16QI_UQI:
9378 case V16QI_FTYPE_V16HI_V16QI_UHI:
9379 case V16QI_FTYPE_V4SI_V16QI_UQI:
9380 case V16QI_FTYPE_V8SI_V16QI_UQI:
9381 case V8HI_FTYPE_V4SI_V8HI_UQI:
9382 case V8HI_FTYPE_V8SI_V8HI_UQI:
9383 case V16QI_FTYPE_V2DI_V16QI_UQI:
9384 case V16QI_FTYPE_V4DI_V16QI_UQI:
9385 case V8HI_FTYPE_V2DI_V8HI_UQI:
9386 case V8HI_FTYPE_V4DI_V8HI_UQI:
9387 case V4SI_FTYPE_V2DI_V4SI_UQI:
9388 case V4SI_FTYPE_V4DI_V4SI_UQI:
9389 case V32QI_FTYPE_V32HI_V32QI_USI:
9390 case UHI_FTYPE_V16QI_V16QI_UHI:
9391 case USI_FTYPE_V32QI_V32QI_USI:
9392 case UDI_FTYPE_V64QI_V64QI_UDI:
9393 case UQI_FTYPE_V8HI_V8HI_UQI:
9394 case UHI_FTYPE_V16HI_V16HI_UHI:
9395 case USI_FTYPE_V32HI_V32HI_USI:
9396 case UQI_FTYPE_V4SI_V4SI_UQI:
9397 case UQI_FTYPE_V8SI_V8SI_UQI:
9398 case UQI_FTYPE_V2DI_V2DI_UQI:
9399 case UQI_FTYPE_V4DI_V4DI_UQI:
9400 case V4SF_FTYPE_V2DF_V4SF_UQI:
9401 case V4SF_FTYPE_V4DF_V4SF_UQI:
9402 case V16SI_FTYPE_V16SI_V16SI_UHI:
9403 case V16SI_FTYPE_V4SI_V16SI_UHI:
9404 case V2DI_FTYPE_V4SI_V2DI_UQI:
9405 case V2DI_FTYPE_V8HI_V2DI_UQI:
9406 case V2DI_FTYPE_V16QI_V2DI_UQI:
9407 case V4DI_FTYPE_V4DI_V4DI_UQI:
9408 case V4DI_FTYPE_V4SI_V4DI_UQI:
9409 case V4DI_FTYPE_V8HI_V4DI_UQI:
9410 case V4DI_FTYPE_V16QI_V4DI_UQI:
9411 case V4DI_FTYPE_V4DF_V4DI_UQI:
9412 case V2DI_FTYPE_V2DF_V2DI_UQI:
9413 case V4SI_FTYPE_V4DF_V4SI_UQI:
9414 case V4SI_FTYPE_V2DF_V4SI_UQI:
9415 case V4SI_FTYPE_V8HI_V4SI_UQI:
9416 case V4SI_FTYPE_V16QI_V4SI_UQI:
9417 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9418 case V8DF_FTYPE_V2DF_V8DF_UQI:
9419 case V8DF_FTYPE_V4DF_V8DF_UQI:
9420 case V8DF_FTYPE_V8DF_V8DF_UQI:
9421 case V8SF_FTYPE_V8SF_V8SF_UQI:
9422 case V8SF_FTYPE_V8SI_V8SF_UQI:
9423 case V4DF_FTYPE_V4DF_V4DF_UQI:
9424 case V4SF_FTYPE_V4SF_V4SF_UQI:
9425 case V2DF_FTYPE_V2DF_V2DF_UQI:
9426 case V2DF_FTYPE_V4SF_V2DF_UQI:
9427 case V2DF_FTYPE_V4SI_V2DF_UQI:
9428 case V4SF_FTYPE_V4SI_V4SF_UQI:
9429 case V4DF_FTYPE_V4SF_V4DF_UQI:
9430 case V4DF_FTYPE_V4SI_V4DF_UQI:
9431 case V8SI_FTYPE_V8SI_V8SI_UQI:
9432 case V8SI_FTYPE_V8HI_V8SI_UQI:
9433 case V8SI_FTYPE_V16QI_V8SI_UQI:
9434 case V8DF_FTYPE_V8SI_V8DF_UQI:
9435 case V8DI_FTYPE_DI_V8DI_UQI:
9436 case V16SF_FTYPE_V8SF_V16SF_UHI:
9437 case V16SI_FTYPE_V8SI_V16SI_UHI:
9438 case V16HI_FTYPE_V16HI_V16HI_UHI:
9439 case V8HI_FTYPE_V16QI_V8HI_UQI:
9440 case V16HI_FTYPE_V16QI_V16HI_UHI:
9441 case V32HI_FTYPE_V32HI_V32HI_USI:
9442 case V32HI_FTYPE_V32QI_V32HI_USI:
9443 case V8DI_FTYPE_V16QI_V8DI_UQI:
9444 case V8DI_FTYPE_V2DI_V8DI_UQI:
9445 case V8DI_FTYPE_V4DI_V8DI_UQI:
9446 case V8DI_FTYPE_V8DI_V8DI_UQI:
9447 case V8DI_FTYPE_V8HI_V8DI_UQI:
9448 case V8DI_FTYPE_V8SI_V8DI_UQI:
9449 case V8HI_FTYPE_V8DI_V8HI_UQI:
9450 case V8SI_FTYPE_V8DI_V8SI_UQI:
9451 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9452 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9453 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9454 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9455 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9456 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9457 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9458 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9459 case V32HI_FTYPE_V16SF_V16SF_USI:
9460 case V16HI_FTYPE_V8SF_V8SF_UHI:
9461 case V8HI_FTYPE_V4SF_V4SF_UQI:
9462 case V16HI_FTYPE_V16SF_V16HI_UHI:
9463 case V8HI_FTYPE_V8SF_V8HI_UQI:
9464 case V8HI_FTYPE_V4SF_V8HI_UQI:
9465 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9466 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9467 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9468 nargs = 3;
9469 break;
9470 case V32QI_FTYPE_V32QI_V32QI_INT:
9471 case V16HI_FTYPE_V16HI_V16HI_INT:
9472 case V16QI_FTYPE_V16QI_V16QI_INT:
9473 case V4DI_FTYPE_V4DI_V4DI_INT:
9474 case V8HI_FTYPE_V8HI_V8HI_INT:
9475 case V8SI_FTYPE_V8SI_V8SI_INT:
9476 case V8SI_FTYPE_V8SI_V4SI_INT:
9477 case V8SF_FTYPE_V8SF_V8SF_INT:
9478 case V8SF_FTYPE_V8SF_V4SF_INT:
9479 case V4SI_FTYPE_V4SI_V4SI_INT:
9480 case V4DF_FTYPE_V4DF_V4DF_INT:
9481 case V16SF_FTYPE_V16SF_V16SF_INT:
9482 case V16SF_FTYPE_V16SF_V4SF_INT:
9483 case V16SI_FTYPE_V16SI_V4SI_INT:
9484 case V4DF_FTYPE_V4DF_V2DF_INT:
9485 case V4SF_FTYPE_V4SF_V4SF_INT:
9486 case V2DI_FTYPE_V2DI_V2DI_INT:
9487 case V4DI_FTYPE_V4DI_V2DI_INT:
9488 case V2DF_FTYPE_V2DF_V2DF_INT:
9489 case UQI_FTYPE_V8DI_V8UDI_INT:
9490 case UQI_FTYPE_V8DF_V8DF_INT:
9491 case UQI_FTYPE_V2DF_V2DF_INT:
9492 case UQI_FTYPE_V4SF_V4SF_INT:
9493 case UHI_FTYPE_V16SI_V16SI_INT:
9494 case UHI_FTYPE_V16SF_V16SF_INT:
9495 case V64QI_FTYPE_V64QI_V64QI_INT:
9496 case V32HI_FTYPE_V32HI_V32HI_INT:
9497 case V16SI_FTYPE_V16SI_V16SI_INT:
9498 case V8DI_FTYPE_V8DI_V8DI_INT:
9499 nargs = 3;
9500 nargs_constant = 1;
9501 break;
9502 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9503 nargs = 3;
9504 rmode = V4DImode;
9505 nargs_constant = 1;
9506 break;
9507 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9508 nargs = 3;
9509 rmode = V2DImode;
9510 nargs_constant = 1;
9511 break;
9512 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9513 nargs = 3;
9514 rmode = DImode;
9515 nargs_constant = 1;
9516 break;
9517 case V2DI_FTYPE_V2DI_UINT_UINT:
9518 nargs = 3;
9519 nargs_constant = 2;
9520 break;
9521 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9522 nargs = 3;
9523 rmode = V8DImode;
9524 nargs_constant = 1;
9525 break;
9526 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9527 nargs = 5;
9528 rmode = V8DImode;
9529 mask_pos = 2;
9530 nargs_constant = 1;
9531 break;
9532 case QI_FTYPE_V8DF_INT_UQI:
9533 case QI_FTYPE_V4DF_INT_UQI:
9534 case QI_FTYPE_V2DF_INT_UQI:
9535 case HI_FTYPE_V16SF_INT_UHI:
9536 case QI_FTYPE_V8SF_INT_UQI:
9537 case QI_FTYPE_V4SF_INT_UQI:
9538 case V4SI_FTYPE_V4SI_V4SI_UHI:
9539 case V8SI_FTYPE_V8SI_V8SI_UHI:
9540 nargs = 3;
9541 mask_pos = 1;
9542 nargs_constant = 1;
9543 break;
9544 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9545 nargs = 5;
9546 rmode = V4DImode;
9547 mask_pos = 2;
9548 nargs_constant = 1;
9549 break;
9550 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9551 nargs = 5;
9552 rmode = V2DImode;
9553 mask_pos = 2;
9554 nargs_constant = 1;
9555 break;
9556 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9557 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9558 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9559 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9560 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9561 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9562 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9563 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9564 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9565 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9566 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9567 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9568 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9569 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9570 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9571 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9572 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9573 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9574 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9575 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9576 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9577 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9578 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9579 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9580 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9581 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9582 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9583 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9584 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9585 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9586 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9587 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9588 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9589 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9590 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9591 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9592 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9593 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9594 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9595 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9596 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9597 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9598 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9599 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9600 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9601 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9602 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9603 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9604 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9605 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9606 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9607 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9608 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9609 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9610 nargs = 4;
9611 break;
9612 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9613 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9614 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9615 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9616 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9617 nargs = 4;
9618 nargs_constant = 1;
9619 break;
9620 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9621 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9622 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9623 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9624 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9625 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9626 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9627 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9628 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9629 case USI_FTYPE_V32QI_V32QI_INT_USI:
9630 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9631 case USI_FTYPE_V32HI_V32HI_INT_USI:
9632 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9633 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9634 nargs = 4;
9635 mask_pos = 1;
9636 nargs_constant = 1;
9637 break;
9638 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9639 nargs = 4;
9640 nargs_constant = 2;
9641 break;
9642 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9643 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9644 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9645 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9646 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9647 nargs = 4;
9648 break;
9649 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9650 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9651 mask_pos = 1;
9652 nargs = 4;
9653 nargs_constant = 1;
9654 break;
9655 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9656 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9657 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9658 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9659 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9660 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9661 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9662 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9663 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9664 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9665 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9666 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9667 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9668 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9669 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9670 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9671 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9672 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9673 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9674 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9675 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9676 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9677 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9678 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9679 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9680 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9681 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9682 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9683 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9684 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9685 nargs = 4;
9686 mask_pos = 2;
9687 nargs_constant = 1;
9688 break;
9689 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9690 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9691 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9692 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9693 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9694 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9695 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9696 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9697 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9698 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9699 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9700 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9701 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9702 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9703 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9704 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9705 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9706 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9707 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9708 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9709 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9710 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9711 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9712 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9713 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9714 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9715 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9716 nargs = 5;
9717 mask_pos = 2;
9718 nargs_constant = 1;
9719 break;
9720 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9721 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9722 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9723 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9724 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9725 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9726 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9727 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9728 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9729 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9730 nargs = 5;
9731 mask_pos = 1;
9732 nargs_constant = 1;
9733 break;
9734 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9735 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9736 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9737 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9738 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9739 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9740 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9741 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9742 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9743 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9744 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9745 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9746 nargs = 5;
9747 mask_pos = 1;
9748 nargs_constant = 2;
9749 break;
9750
9751 default:
9752 gcc_unreachable ();
9753 }
9754
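/* Worked example (illustrative sketch, not used by the compiler): a call
   such as the one below is assumed to reach this expander through a
   builtin of shape V4DF_FTYPE_V4DF_V4DF_INT, i.e. nargs = 3 and
   nargs_constant = 1 above, so the last call argument must be a
   compile-time immediate (8 bits by default, narrower for the icodes
   special-cased further down):

     #include <immintrin.h>

     __m256d
     shuffle_example (__m256d a, __m256d b)
     {
       // 0x5 is the required constant selector; a non-constant value
       // would trigger one of the "last argument must be ..." errors
       // emitted below.
       return _mm256_shuffle_pd (a, b, 0x5);
     }
*/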
9755 gcc_assert (nargs <= ARRAY_SIZE (xops));
9756
9757 if (comparison != UNKNOWN)
9758 {
9759 gcc_assert (nargs == 2);
9760 return ix86_expand_sse_compare (d, exp, target, swap);
9761 }
9762
9763 if (rmode == VOIDmode || rmode == tmode)
9764 {
9765 if (optimize
9766 || target == 0
9767 || GET_MODE (target) != tmode
9768 || !insn_p->operand[0].predicate (target, tmode))
9769 target = gen_reg_rtx (tmode);
9770 else if (memory_operand (target, tmode))
9771 num_memory++;
9772 real_target = target;
9773 }
9774 else
9775 {
9776 real_target = gen_reg_rtx (tmode);
9777 target = lowpart_subreg (rmode, real_target, tmode);
9778 }
9779
9780 for (i = 0; i < nargs; i++)
9781 {
9782 tree arg = CALL_EXPR_ARG (exp, i);
9783 rtx op = expand_normal (arg);
9784 machine_mode mode = insn_p->operand[i + 1].mode;
9785 bool match = insn_p->operand[i + 1].predicate (op, mode);
9786
9787 if (second_arg_count && i == 1)
9788 {
9789 /* SIMD shift insns take either an 8-bit immediate or a
9790 register as the shift count.  But the builtin functions take an
9791 int as the count.  If the count doesn't match, put it in a
9792 register.  The instructions use a 64-bit count; if op is only
9793 32-bit, zero-extend it, since negative shift counts are
9794 undefined behavior and zero extension is more
9795 efficient. */
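/* Illustrative sketch (assumed user code; assumes the builtin behind
   _mm_slli_epi32 is one of the *_COUNT types that set second_arg_count):

     #include <immintrin.h>

     __m128i
     shift_example (__m128i v, int n)
     {
       return _mm_slli_epi32 (v, n);   // n is not a constant here
     }

   The 32-bit count fails the operand predicate and is zero-extended
   (and forced into a register if necessary) by the code below. */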
9796 if (!match)
9797 {
9798 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9799 op = convert_modes (mode, GET_MODE (op), op, 1);
9800 else
9801 op = lowpart_subreg (mode, op, GET_MODE (op));
9802 if (!insn_p->operand[i + 1].predicate (op, mode))
9803 op = copy_to_reg (op);
9804 }
9805 }
9806 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9807 (!mask_pos && (nargs - i) <= nargs_constant))
9808 {
9809 if (!match)
9810 switch (icode)
9811 {
9812 case CODE_FOR_avx_vinsertf128v4di:
9813 case CODE_FOR_avx_vextractf128v4di:
9814 error ("the last argument must be a 1-bit immediate");
9815 return const0_rtx;
9816
9817 case CODE_FOR_avx512f_cmpv8di3_mask:
9818 case CODE_FOR_avx512f_cmpv16si3_mask:
9819 case CODE_FOR_avx512f_ucmpv8di3_mask:
9820 case CODE_FOR_avx512f_ucmpv16si3_mask:
9821 case CODE_FOR_avx512vl_cmpv4di3_mask:
9822 case CODE_FOR_avx512vl_cmpv8si3_mask:
9823 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9824 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9825 case CODE_FOR_avx512vl_cmpv2di3_mask:
9826 case CODE_FOR_avx512vl_cmpv4si3_mask:
9827 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9828 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9829 error ("the last argument must be a 3-bit immediate");
9830 return const0_rtx;
9831
9832 case CODE_FOR_sse4_1_roundsd:
9833 case CODE_FOR_sse4_1_roundss:
9834
9835 case CODE_FOR_sse4_1_roundpd:
9836 case CODE_FOR_sse4_1_roundps:
9837 case CODE_FOR_avx_roundpd256:
9838 case CODE_FOR_avx_roundps256:
9839
9840 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9841 case CODE_FOR_sse4_1_roundps_sfix:
9842 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9843 case CODE_FOR_avx_roundps_sfix256:
9844
9845 case CODE_FOR_sse4_1_blendps:
9846 case CODE_FOR_avx_blendpd256:
9847 case CODE_FOR_avx_vpermilv4df:
9848 case CODE_FOR_avx_vpermilv4df_mask:
9849 case CODE_FOR_avx512f_getmantv8df_mask:
9850 case CODE_FOR_avx512f_getmantv16sf_mask:
9851 case CODE_FOR_avx512vl_getmantv8sf_mask:
9852 case CODE_FOR_avx512vl_getmantv4df_mask:
9853 case CODE_FOR_avx512vl_getmantv4sf_mask:
9854 case CODE_FOR_avx512vl_getmantv2df_mask:
9855 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9856 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9857 case CODE_FOR_avx512dq_rangepv4df_mask:
9858 case CODE_FOR_avx512dq_rangepv8sf_mask:
9859 case CODE_FOR_avx512dq_rangepv2df_mask:
9860 case CODE_FOR_avx512dq_rangepv4sf_mask:
9861 case CODE_FOR_avx_shufpd256_mask:
9862 error ("the last argument must be a 4-bit immediate");
9863 return const0_rtx;
9864
9865 case CODE_FOR_sha1rnds4:
9866 case CODE_FOR_sse4_1_blendpd:
9867 case CODE_FOR_avx_vpermilv2df:
9868 case CODE_FOR_avx_vpermilv2df_mask:
9869 case CODE_FOR_xop_vpermil2v2df3:
9870 case CODE_FOR_xop_vpermil2v4sf3:
9871 case CODE_FOR_xop_vpermil2v4df3:
9872 case CODE_FOR_xop_vpermil2v8sf3:
9873 case CODE_FOR_avx512f_vinsertf32x4_mask:
9874 case CODE_FOR_avx512f_vinserti32x4_mask:
9875 case CODE_FOR_avx512f_vextractf32x4_mask:
9876 case CODE_FOR_avx512f_vextracti32x4_mask:
9877 case CODE_FOR_sse2_shufpd:
9878 case CODE_FOR_sse2_shufpd_mask:
9879 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9880 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9881 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9882 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9883 error ("the last argument must be a 2-bit immediate");
9884 return const0_rtx;
9885
9886 case CODE_FOR_avx_vextractf128v4df:
9887 case CODE_FOR_avx_vextractf128v8sf:
9888 case CODE_FOR_avx_vextractf128v8si:
9889 case CODE_FOR_avx_vinsertf128v4df:
9890 case CODE_FOR_avx_vinsertf128v8sf:
9891 case CODE_FOR_avx_vinsertf128v8si:
9892 case CODE_FOR_avx512f_vinsertf64x4_mask:
9893 case CODE_FOR_avx512f_vinserti64x4_mask:
9894 case CODE_FOR_avx512f_vextractf64x4_mask:
9895 case CODE_FOR_avx512f_vextracti64x4_mask:
9896 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9897 case CODE_FOR_avx512dq_vinserti32x8_mask:
9898 case CODE_FOR_avx512vl_vinsertv4df:
9899 case CODE_FOR_avx512vl_vinsertv4di:
9900 case CODE_FOR_avx512vl_vinsertv8sf:
9901 case CODE_FOR_avx512vl_vinsertv8si:
9902 error ("the last argument must be a 1-bit immediate");
9903 return const0_rtx;
9904
9905 case CODE_FOR_avx_vmcmpv2df3:
9906 case CODE_FOR_avx_vmcmpv4sf3:
9907 case CODE_FOR_avx_cmpv2df3:
9908 case CODE_FOR_avx_cmpv4sf3:
9909 case CODE_FOR_avx_cmpv4df3:
9910 case CODE_FOR_avx_cmpv8sf3:
9911 case CODE_FOR_avx512f_cmpv8df3_mask:
9912 case CODE_FOR_avx512f_cmpv16sf3_mask:
9913 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9914 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9915 error ("the last argument must be a 5-bit immediate");
9916 return const0_rtx;
9917
9918 default:
9919 switch (nargs_constant)
9920 {
9921 case 2:
9922 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9923 (!mask_pos && (nargs - i) == nargs_constant))
9924 {
9925 error ("the next to last argument must be an 8-bit immediate");
9926 break;
9927 }
9928 /* FALLTHRU */
9929 case 1:
9930 error ("the last argument must be an 8-bit immediate");
9931 break;
9932 default:
9933 gcc_unreachable ();
9934 }
9935 return const0_rtx;
9936 }
9937 }
9938 else
9939 {
9940 if (VECTOR_MODE_P (mode))
9941 op = safe_vector_operand (op, mode);
9942
9943 /* If we aren't optimizing, only allow one memory operand to
9944 be generated. */
9945 if (memory_operand (op, mode))
9946 num_memory++;
9947
9948 op = fixup_modeless_constant (op, mode);
9949
9950 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9951 {
9952 if (optimize || !match || num_memory > 1)
9953 op = copy_to_mode_reg (mode, op);
9954 }
9955 else
9956 {
9957 op = copy_to_reg (op);
9958 op = lowpart_subreg (mode, op, GET_MODE (op));
9959 }
9960 }
9961
9962 xops[i] = op;
9963 }
9964
9965 switch (nargs)
9966 {
9967 case 1:
9968 pat = GEN_FCN (icode) (real_target, xops[0]);
9969 break;
9970 case 2:
9971 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
9972 break;
9973 case 3:
9974 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
9975 break;
9976 case 4:
9977 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
9978 xops[2], xops[3]);
9979 break;
9980 case 5:
9981 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
9982 xops[2], xops[3], xops[4]);
9983 break;
9984 case 6:
9985 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
9986 xops[2], xops[3], xops[4], xops[5]);
9987 break;
9988 default:
9989 gcc_unreachable ();
9990 }
9991
9992 if (! pat)
9993 return 0;
9994
9995 emit_insn (pat);
9996 return target;
9997 }
9998
9999 /* Transform a pattern of the following layout:
10000 (set A
10001 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
10002 into:
10003 (set A B)
10004 i.e. strip the embedded rounding unspec and keep the plain set. */
10005
10006 static rtx
10007 ix86_erase_embedded_rounding (rtx pat)
10008 {
10009 if (GET_CODE (pat) == INSN)
10010 pat = PATTERN (pat);
10011
10012 gcc_assert (GET_CODE (pat) == SET);
10013 rtx src = SET_SRC (pat);
10014 gcc_assert (XVECLEN (src, 0) == 2);
10015 rtx p0 = XVECEXP (src, 0, 0);
10016 gcc_assert (GET_CODE (src) == UNSPEC
10017 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
10018 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
10019 return res;
10020 }
10021
10022 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
10023 with rounding. */
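/* Illustrative sketch (assumed user code, not part of GCC): the kind of
   call that ends up here is an AVX-512 COMI intrinsic taking an explicit
   comparison predicate plus an SAE control:

     #include <immintrin.h>

     int
     comi_example (__m128d a, __m128d b)
     {
       // The predicate (0..31) indexes the tables inside this function;
       // _MM_FROUND_NO_EXC selects the suppress-all-exceptions form.
       return _mm_comi_round_sd (a, b, _CMP_GE_OQ, _MM_FROUND_NO_EXC);
     }
*/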
10024 static rtx
10025 ix86_expand_sse_comi_round (const struct builtin_description *d,
10026 tree exp, rtx target)
10027 {
10028 rtx pat, set_dst;
10029 tree arg0 = CALL_EXPR_ARG (exp, 0);
10030 tree arg1 = CALL_EXPR_ARG (exp, 1);
10031 tree arg2 = CALL_EXPR_ARG (exp, 2);
10032 tree arg3 = CALL_EXPR_ARG (exp, 3);
10033 rtx op0 = expand_normal (arg0);
10034 rtx op1 = expand_normal (arg1);
10035 rtx op2 = expand_normal (arg2);
10036 rtx op3 = expand_normal (arg3);
10037 enum insn_code icode = d->icode;
10038 const struct insn_data_d *insn_p = &insn_data[icode];
10039 machine_mode mode0 = insn_p->operand[0].mode;
10040 machine_mode mode1 = insn_p->operand[1].mode;
10041
10042 /* See avxintrin.h for values. */
10043 static const enum rtx_code comparisons[32] =
10044 {
10045 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10046 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
10047 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10048 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
10049 };
10050 static const bool ordereds[32] =
10051 {
10052 true, true, true, false, false, false, false, true,
10053 false, false, false, true, true, true, true, false,
10054 true, true, true, false, false, false, false, true,
10055 false, false, false, true, true, true, true, false
10056 };
10057 static const bool non_signalings[32] =
10058 {
10059 true, false, false, true, true, false, false, true,
10060 true, false, false, true, true, false, false, true,
10061 false, true, true, false, false, true, true, false,
10062 false, true, true, false, false, true, true, false
10063 };
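  /* Worked example derived from the tables above: _CMP_GE_OQ is 29, so
     comparisons[29] == GE, ordereds[29] == true and non_signalings[29]
     == true; the GE case in the switch below then flips ordered to false
     (since ordered == non_signaling), and the non-signaling flag later
     selects the UCOMI form of the instruction.  */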
10064
10065 if (!CONST_INT_P (op2))
10066 {
10067 error ("the third argument must be a comparison constant");
10068 return const0_rtx;
10069 }
10070 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10071 {
10072 error ("incorrect comparison mode");
10073 return const0_rtx;
10074 }
10075
10076 if (!insn_p->operand[2].predicate (op3, SImode))
10077 {
10078 error ("incorrect rounding operand");
10079 return const0_rtx;
10080 }
10081
10082 if (VECTOR_MODE_P (mode0))
10083 op0 = safe_vector_operand (op0, mode0);
10084 if (VECTOR_MODE_P (mode1))
10085 op1 = safe_vector_operand (op1, mode1);
10086
10087 enum rtx_code comparison = comparisons[INTVAL (op2)];
10088 bool ordered = ordereds[INTVAL (op2)];
10089 bool non_signaling = non_signalings[INTVAL (op2)];
10090 rtx const_val = const0_rtx;
10091
10092 bool check_unordered = false;
10093 machine_mode mode = CCFPmode;
10094 switch (comparison)
10095 {
10096 case ORDERED:
10097 if (!ordered)
10098 {
10099 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10100 if (!non_signaling)
10101 ordered = true;
10102 mode = CCSmode;
10103 }
10104 else
10105 {
10106 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10107 if (non_signaling)
10108 ordered = false;
10109 mode = CCPmode;
10110 }
10111 comparison = NE;
10112 break;
10113 case UNORDERED:
10114 if (ordered)
10115 {
10116 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10117 if (non_signaling)
10118 ordered = false;
10119 mode = CCSmode;
10120 }
10121 else
10122 {
10123 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10124 if (!non_signaling)
10125 ordered = true;
10126 mode = CCPmode;
10127 }
10128 comparison = EQ;
10129 break;
10130
10131 case LE: /* -> GE */
10132 case LT: /* -> GT */
10133 case UNGE: /* -> UNLE */
10134 case UNGT: /* -> UNLT */
10135 std::swap (op0, op1);
10136 comparison = swap_condition (comparison);
10137 /* FALLTHRU */
10138 case GT:
10139 case GE:
10140 case UNEQ:
10141 case UNLT:
10142 case UNLE:
10143 case LTGT:
10144 /* These are supported by CCFPmode. NB: Use ordered/signaling
10145 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10146 with NAN operands. */
10147 if (ordered == non_signaling)
10148 ordered = !ordered;
10149 break;
10150 case EQ:
10151 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10152 _CMP_EQ_OQ/_CMP_EQ_OS. */
10153 check_unordered = true;
10154 mode = CCZmode;
10155 break;
10156 case NE:
10157 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10158 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10159 gcc_assert (!ordered);
10160 check_unordered = true;
10161 mode = CCZmode;
10162 const_val = const1_rtx;
10163 break;
10164 default:
10165 gcc_unreachable ();
10166 }
10167
10168 target = gen_reg_rtx (SImode);
10169 emit_move_insn (target, const_val);
10170 target = gen_rtx_SUBREG (QImode, target, 0);
10171
10172 if ((optimize && !register_operand (op0, mode0))
10173 || !insn_p->operand[0].predicate (op0, mode0))
10174 op0 = copy_to_mode_reg (mode0, op0);
10175 if ((optimize && !register_operand (op1, mode1))
10176 || !insn_p->operand[1].predicate (op1, mode1))
10177 op1 = copy_to_mode_reg (mode1, op1);
10178
10179 /*
10180 1. COMI: ordered and signaling.
10181 2. UCOMI: unordered and non-signaling.
10182 */
10183 if (non_signaling)
10184 icode = (icode == CODE_FOR_sse_comi_round
10185 ? CODE_FOR_sse_ucomi_round
10186 : CODE_FOR_sse2_ucomi_round);
10187
10188 pat = GEN_FCN (icode) (op0, op1, op3);
10189 if (! pat)
10190 return 0;
10191
10192 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10193 if (INTVAL (op3) == NO_ROUND)
10194 {
10195 pat = ix86_erase_embedded_rounding (pat);
10196 if (! pat)
10197 return 0;
10198
10199 set_dst = SET_DEST (pat);
10200 }
10201 else
10202 {
10203 gcc_assert (GET_CODE (pat) == SET);
10204 set_dst = SET_DEST (pat);
10205 }
10206
10207 emit_insn (pat);
10208
10209 rtx_code_label *label = NULL;
10210
10211 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10212 sufficient with NAN operands. */
10213 if (check_unordered)
10214 {
10215 gcc_assert (comparison == EQ || comparison == NE);
10216
10217 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10218 label = gen_label_rtx ();
10219 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10220 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10221 gen_rtx_LABEL_REF (VOIDmode, label),
10222 pc_rtx);
10223 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10224 }
10225
10226 /* NB: The comparison sets CCFPmode; check a different CCmode that is
10227 a subset of CCFPmode. */
10228 if (GET_MODE (set_dst) != mode)
10229 {
10230 gcc_assert (mode == CCAmode || mode == CCCmode
10231 || mode == CCOmode || mode == CCPmode
10232 || mode == CCSmode || mode == CCZmode);
10233 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10234 }
10235
10236 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10237 gen_rtx_fmt_ee (comparison, QImode,
10238 set_dst,
10239 const0_rtx)));
10240
10241 if (label)
10242 emit_label (label);
10243
10244 return SUBREG_REG (target);
10245 }
10246
10247 static rtx
10248 ix86_expand_round_builtin (const struct builtin_description *d,
10249 tree exp, rtx target)
10250 {
10251 rtx pat;
10252 unsigned int i, nargs;
10253 rtx xops[6];
10254 enum insn_code icode = d->icode;
10255 const struct insn_data_d *insn_p = &insn_data[icode];
10256 machine_mode tmode = insn_p->operand[0].mode;
10257 unsigned int nargs_constant = 0;
10258 unsigned int redundant_embed_rnd = 0;
10259
10260 switch ((enum ix86_builtin_func_type) d->flag)
10261 {
10262 case UINT64_FTYPE_V2DF_INT:
10263 case UINT64_FTYPE_V4SF_INT:
10264 case UINT_FTYPE_V2DF_INT:
10265 case UINT_FTYPE_V4SF_INT:
10266 case INT64_FTYPE_V2DF_INT:
10267 case INT64_FTYPE_V4SF_INT:
10268 case INT_FTYPE_V2DF_INT:
10269 case INT_FTYPE_V4SF_INT:
10270 nargs = 2;
10271 break;
10272 case V4SF_FTYPE_V4SF_UINT_INT:
10273 case V4SF_FTYPE_V4SF_UINT64_INT:
10274 case V2DF_FTYPE_V2DF_UINT64_INT:
10275 case V4SF_FTYPE_V4SF_INT_INT:
10276 case V4SF_FTYPE_V4SF_INT64_INT:
10277 case V2DF_FTYPE_V2DF_INT64_INT:
10278 case V4SF_FTYPE_V4SF_V4SF_INT:
10279 case V2DF_FTYPE_V2DF_V2DF_INT:
10280 case V4SF_FTYPE_V4SF_V2DF_INT:
10281 case V2DF_FTYPE_V2DF_V4SF_INT:
10282 nargs = 3;
10283 break;
10284 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10285 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10286 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10287 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10288 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10289 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10290 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10291 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10292 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10293 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10294 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10295 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10296 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10297 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10298 nargs = 4;
10299 break;
10300 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10301 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10302 nargs_constant = 2;
10303 nargs = 4;
10304 break;
10305 case INT_FTYPE_V4SF_V4SF_INT_INT:
10306 case INT_FTYPE_V2DF_V2DF_INT_INT:
10307 return ix86_expand_sse_comi_round (d, exp, target);
10308 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10309 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10310 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10311 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10312 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10313 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10314 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
10315 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10316 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10317 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
10318 nargs = 5;
10319 break;
10320 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10321 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10322 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
10323 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
10324 nargs_constant = 4;
10325 nargs = 5;
10326 break;
10327 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10328 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10329 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10330 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10331 nargs_constant = 3;
10332 nargs = 5;
10333 break;
10334 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10335 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10336 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10337 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10338 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10339 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10340 nargs = 6;
10341 nargs_constant = 4;
10342 break;
10343 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10344 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10345 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10346 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10347 nargs = 6;
10348 nargs_constant = 3;
10349 break;
10350 default:
10351 gcc_unreachable ();
10352 }
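  /* Illustrative sketch (assumed user-level mapping): an explicitly
     rounded intrinsic such as

	_mm512_add_round_pd (a, b, _MM_FROUND_TO_NEAREST_INT
				   | _MM_FROUND_NO_EXC)

     is expected to arrive here with the rounding control as its last
     argument.  Passing _MM_FROUND_CUR_DIRECTION instead makes that
     operand NO_ROUND, and the embedded-rounding unspec is erased again
     before the insn is emitted (see redundant_embed_rnd below).  */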
10353 gcc_assert (nargs <= ARRAY_SIZE (xops));
10354
10355 if (optimize
10356 || target == 0
10357 || GET_MODE (target) != tmode
10358 || !insn_p->operand[0].predicate (target, tmode))
10359 target = gen_reg_rtx (tmode);
10360
10361 for (i = 0; i < nargs; i++)
10362 {
10363 tree arg = CALL_EXPR_ARG (exp, i);
10364 rtx op = expand_normal (arg);
10365 machine_mode mode = insn_p->operand[i + 1].mode;
10366 bool match = insn_p->operand[i + 1].predicate (op, mode);
10367
10368 if (i == nargs - nargs_constant)
10369 {
10370 if (!match)
10371 {
10372 switch (icode)
10373 {
10374 case CODE_FOR_avx512f_getmantv8df_mask_round:
10375 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10376 case CODE_FOR_avx512f_vgetmantv2df_round:
10377 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10378 case CODE_FOR_avx512f_vgetmantv4sf_round:
10379 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10380 error ("the immediate argument must be a 4-bit immediate");
10381 return const0_rtx;
10382 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10383 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10384 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10385 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10386 error ("the immediate argument must be a 5-bit immediate");
10387 return const0_rtx;
10388 default:
10389 error ("the immediate argument must be an 8-bit immediate");
10390 return const0_rtx;
10391 }
10392 }
10393 }
10394 else if (i == nargs - 1)
10395 {
10396 if (!insn_p->operand[nargs].predicate (op, SImode))
10397 {
10398 error ("incorrect rounding operand");
10399 return const0_rtx;
10400 }
10401
10402 /* If there is no rounding, use the normal version of the pattern. */
10403 if (INTVAL (op) == NO_ROUND)
10404 redundant_embed_rnd = 1;
10405 }
10406 else
10407 {
10408 if (VECTOR_MODE_P (mode))
10409 op = safe_vector_operand (op, mode);
10410
10411 op = fixup_modeless_constant (op, mode);
10412
10413 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10414 {
10415 if (optimize || !match)
10416 op = copy_to_mode_reg (mode, op);
10417 }
10418 else
10419 {
10420 op = copy_to_reg (op);
10421 op = lowpart_subreg (mode, op, GET_MODE (op));
10422 }
10423 }
10424
10425 xops[i] = op;
10426 }
10427
10428 switch (nargs)
10429 {
10430 case 1:
10431 pat = GEN_FCN (icode) (target, xops[0]);
10432 break;
10433 case 2:
10434 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10435 break;
10436 case 3:
10437 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10438 break;
10439 case 4:
10440 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10441 xops[2], xops[3]);
10442 break;
10443 case 5:
10444 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10445 xops[2], xops[3], xops[4]);
10446 break;
10447 case 6:
10448 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10449 xops[2], xops[3], xops[4], xops[5]);
10450 break;
10451 default:
10452 gcc_unreachable ();
10453 }
10454
10455 if (!pat)
10456 return 0;
10457
10458 if (redundant_embed_rnd)
10459 pat = ix86_erase_embedded_rounding (pat);
10460
10461 emit_insn (pat);
10462 return target;
10463 }
10464
10465 /* Subroutine of ix86_expand_builtin to take care of special insns
10466 with variable number of operands. */
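/* Illustrative sketch (assumed user code, not part of GCC): the builtins
   handled here are mostly loads and stores with unusual operand shapes,
   e.g. a non-temporal store whose memory operand must keep its natural
   alignment:

     #include <immintrin.h>

     void
     stream_example (double *p, __m256d v)
     {
       // Assumed to map to VOID_FTYPE_PDOUBLE_V4DF below, with
       // aligned_mem set for CODE_FOR_avx_movntv4df; p must be
       // 32-byte aligned.
       _mm256_stream_pd (p, v);
     }
*/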
10467
10468 static rtx
10469 ix86_expand_special_args_builtin (const struct builtin_description *d,
10470 tree exp, rtx target)
10471 {
10472 tree arg;
10473 rtx pat, op;
10474 unsigned int i, nargs, arg_adjust, memory;
10475 bool aligned_mem = false;
10476 rtx xops[3];
10477 enum insn_code icode = d->icode;
10478 const struct insn_data_d *insn_p = &insn_data[icode];
10479 machine_mode tmode = insn_p->operand[0].mode;
10480 enum { load, store } klass;
10481
10482 switch ((enum ix86_builtin_func_type) d->flag)
10483 {
10484 case VOID_FTYPE_VOID:
10485 emit_insn (GEN_FCN (icode) (target));
10486 return 0;
10487 case VOID_FTYPE_UINT64:
10488 case VOID_FTYPE_UNSIGNED:
10489 nargs = 0;
10490 klass = store;
10491 memory = 0;
10492 break;
10493
10494 case INT_FTYPE_VOID:
10495 case USHORT_FTYPE_VOID:
10496 case UINT64_FTYPE_VOID:
10497 case UINT_FTYPE_VOID:
10498 case UINT8_FTYPE_VOID:
10499 case UNSIGNED_FTYPE_VOID:
10500 nargs = 0;
10501 klass = load;
10502 memory = 0;
10503 break;
10504 case UINT64_FTYPE_PUNSIGNED:
10505 case V2DI_FTYPE_PV2DI:
10506 case V4DI_FTYPE_PV4DI:
10507 case V32QI_FTYPE_PCCHAR:
10508 case V16QI_FTYPE_PCCHAR:
10509 case V8SF_FTYPE_PCV4SF:
10510 case V8SF_FTYPE_PCFLOAT:
10511 case V4SF_FTYPE_PCFLOAT:
10512 case V4DF_FTYPE_PCV2DF:
10513 case V4DF_FTYPE_PCDOUBLE:
10514 case V2DF_FTYPE_PCDOUBLE:
10515 case VOID_FTYPE_PVOID:
10516 case V8DI_FTYPE_PV8DI:
10517 nargs = 1;
10518 klass = load;
10519 memory = 0;
10520 switch (icode)
10521 {
10522 case CODE_FOR_sse4_1_movntdqa:
10523 case CODE_FOR_avx2_movntdqa:
10524 case CODE_FOR_avx512f_movntdqa:
10525 aligned_mem = true;
10526 break;
10527 default:
10528 break;
10529 }
10530 break;
10531 case VOID_FTYPE_PV2SF_V4SF:
10532 case VOID_FTYPE_PV8DI_V8DI:
10533 case VOID_FTYPE_PV4DI_V4DI:
10534 case VOID_FTYPE_PV2DI_V2DI:
10535 case VOID_FTYPE_PCHAR_V32QI:
10536 case VOID_FTYPE_PCHAR_V16QI:
10537 case VOID_FTYPE_PFLOAT_V16SF:
10538 case VOID_FTYPE_PFLOAT_V8SF:
10539 case VOID_FTYPE_PFLOAT_V4SF:
10540 case VOID_FTYPE_PDOUBLE_V8DF:
10541 case VOID_FTYPE_PDOUBLE_V4DF:
10542 case VOID_FTYPE_PDOUBLE_V2DF:
10543 case VOID_FTYPE_PLONGLONG_LONGLONG:
10544 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10545 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10546 case VOID_FTYPE_PINT_INT:
10547 nargs = 1;
10548 klass = store;
10549 /* Reserve memory operand for target. */
10550 memory = ARRAY_SIZE (xops);
10551 switch (icode)
10552 {
10553 /* These builtins and instructions require the memory
10554 to be properly aligned. */
10555 case CODE_FOR_avx_movntv4di:
10556 case CODE_FOR_sse2_movntv2di:
10557 case CODE_FOR_avx_movntv8sf:
10558 case CODE_FOR_sse_movntv4sf:
10559 case CODE_FOR_sse4a_vmmovntv4sf:
10560 case CODE_FOR_avx_movntv4df:
10561 case CODE_FOR_sse2_movntv2df:
10562 case CODE_FOR_sse4a_vmmovntv2df:
10563 case CODE_FOR_sse2_movntidi:
10564 case CODE_FOR_sse_movntq:
10565 case CODE_FOR_sse2_movntisi:
10566 case CODE_FOR_avx512f_movntv16sf:
10567 case CODE_FOR_avx512f_movntv8df:
10568 case CODE_FOR_avx512f_movntv8di:
10569 aligned_mem = true;
10570 break;
10571 default:
10572 break;
10573 }
10574 break;
10575 case VOID_FTYPE_PVOID_PCVOID:
10576 nargs = 1;
10577 klass = store;
10578 memory = 0;
10580 break;
10581 case V4SF_FTYPE_V4SF_PCV2SF:
10582 case V2DF_FTYPE_V2DF_PCDOUBLE:
10583 nargs = 2;
10584 klass = load;
10585 memory = 1;
10586 break;
10587 case V8SF_FTYPE_PCV8SF_V8SI:
10588 case V4DF_FTYPE_PCV4DF_V4DI:
10589 case V4SF_FTYPE_PCV4SF_V4SI:
10590 case V2DF_FTYPE_PCV2DF_V2DI:
10591 case V8SI_FTYPE_PCV8SI_V8SI:
10592 case V4DI_FTYPE_PCV4DI_V4DI:
10593 case V4SI_FTYPE_PCV4SI_V4SI:
10594 case V2DI_FTYPE_PCV2DI_V2DI:
10595 case VOID_FTYPE_INT_INT64:
10596 nargs = 2;
10597 klass = load;
10598 memory = 0;
10599 break;
10600 case VOID_FTYPE_PV8DF_V8DF_UQI:
10601 case VOID_FTYPE_PV4DF_V4DF_UQI:
10602 case VOID_FTYPE_PV2DF_V2DF_UQI:
10603 case VOID_FTYPE_PV16SF_V16SF_UHI:
10604 case VOID_FTYPE_PV8SF_V8SF_UQI:
10605 case VOID_FTYPE_PV4SF_V4SF_UQI:
10606 case VOID_FTYPE_PV8DI_V8DI_UQI:
10607 case VOID_FTYPE_PV4DI_V4DI_UQI:
10608 case VOID_FTYPE_PV2DI_V2DI_UQI:
10609 case VOID_FTYPE_PV16SI_V16SI_UHI:
10610 case VOID_FTYPE_PV8SI_V8SI_UQI:
10611 case VOID_FTYPE_PV4SI_V4SI_UQI:
10612 case VOID_FTYPE_PV64QI_V64QI_UDI:
10613 case VOID_FTYPE_PV32HI_V32HI_USI:
10614 case VOID_FTYPE_PV32QI_V32QI_USI:
10615 case VOID_FTYPE_PV16QI_V16QI_UHI:
10616 case VOID_FTYPE_PV16HI_V16HI_UHI:
10617 case VOID_FTYPE_PV8HI_V8HI_UQI:
10618 switch (icode)
10619 {
10620 /* These builtins and instructions require the memory
10621 to be properly aligned. */
10622 case CODE_FOR_avx512f_storev16sf_mask:
10623 case CODE_FOR_avx512f_storev16si_mask:
10624 case CODE_FOR_avx512f_storev8df_mask:
10625 case CODE_FOR_avx512f_storev8di_mask:
10626 case CODE_FOR_avx512vl_storev8sf_mask:
10627 case CODE_FOR_avx512vl_storev8si_mask:
10628 case CODE_FOR_avx512vl_storev4df_mask:
10629 case CODE_FOR_avx512vl_storev4di_mask:
10630 case CODE_FOR_avx512vl_storev4sf_mask:
10631 case CODE_FOR_avx512vl_storev4si_mask:
10632 case CODE_FOR_avx512vl_storev2df_mask:
10633 case CODE_FOR_avx512vl_storev2di_mask:
10634 aligned_mem = true;
10635 break;
10636 default:
10637 break;
10638 }
10639 /* FALLTHRU */
10640 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10641 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10642 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10643 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10644 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10645 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10646 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10647 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10648 case VOID_FTYPE_PV8SI_V8DI_UQI:
10649 case VOID_FTYPE_PV8HI_V8DI_UQI:
10650 case VOID_FTYPE_PV16HI_V16SI_UHI:
10651 case VOID_FTYPE_PUDI_V8DI_UQI:
10652 case VOID_FTYPE_PV16QI_V16SI_UHI:
10653 case VOID_FTYPE_PV4SI_V4DI_UQI:
10654 case VOID_FTYPE_PUDI_V2DI_UQI:
10655 case VOID_FTYPE_PUDI_V4DI_UQI:
10656 case VOID_FTYPE_PUSI_V2DI_UQI:
10657 case VOID_FTYPE_PV8HI_V8SI_UQI:
10658 case VOID_FTYPE_PUDI_V4SI_UQI:
10659 case VOID_FTYPE_PUSI_V4DI_UQI:
10660 case VOID_FTYPE_PUHI_V2DI_UQI:
10661 case VOID_FTYPE_PUDI_V8SI_UQI:
10662 case VOID_FTYPE_PUSI_V4SI_UQI:
10663 case VOID_FTYPE_PCHAR_V64QI_UDI:
10664 case VOID_FTYPE_PCHAR_V32QI_USI:
10665 case VOID_FTYPE_PCHAR_V16QI_UHI:
10666 case VOID_FTYPE_PSHORT_V32HI_USI:
10667 case VOID_FTYPE_PSHORT_V16HI_UHI:
10668 case VOID_FTYPE_PSHORT_V8HI_UQI:
10669 case VOID_FTYPE_PINT_V16SI_UHI:
10670 case VOID_FTYPE_PINT_V8SI_UQI:
10671 case VOID_FTYPE_PINT_V4SI_UQI:
10672 case VOID_FTYPE_PINT64_V8DI_UQI:
10673 case VOID_FTYPE_PINT64_V4DI_UQI:
10674 case VOID_FTYPE_PINT64_V2DI_UQI:
10675 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10676 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10677 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10678 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10679 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10680 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10681 case VOID_FTYPE_PV32QI_V32HI_USI:
10682 case VOID_FTYPE_PV16QI_V16HI_UHI:
10683 case VOID_FTYPE_PUDI_V8HI_UQI:
10684 nargs = 2;
10685 klass = store;
10686 /* Reserve memory operand for target. */
10687 memory = ARRAY_SIZE (xops);
10688 break;
10689 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10690 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10691 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10692 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10693 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10694 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10695 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10696 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10697 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10698 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10699 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10700 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10701 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10702 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10703 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10704 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10705 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10706 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10707 switch (icode)
10708 {
10709 /* These builtins and instructions require the memory
10710 to be properly aligned. */
10711 case CODE_FOR_avx512f_loadv16sf_mask:
10712 case CODE_FOR_avx512f_loadv16si_mask:
10713 case CODE_FOR_avx512f_loadv8df_mask:
10714 case CODE_FOR_avx512f_loadv8di_mask:
10715 case CODE_FOR_avx512vl_loadv8sf_mask:
10716 case CODE_FOR_avx512vl_loadv8si_mask:
10717 case CODE_FOR_avx512vl_loadv4df_mask:
10718 case CODE_FOR_avx512vl_loadv4di_mask:
10719 case CODE_FOR_avx512vl_loadv4sf_mask:
10720 case CODE_FOR_avx512vl_loadv4si_mask:
10721 case CODE_FOR_avx512vl_loadv2df_mask:
10722 case CODE_FOR_avx512vl_loadv2di_mask:
10723 case CODE_FOR_avx512bw_loadv64qi_mask:
10724 case CODE_FOR_avx512vl_loadv32qi_mask:
10725 case CODE_FOR_avx512vl_loadv16qi_mask:
10726 case CODE_FOR_avx512bw_loadv32hi_mask:
10727 case CODE_FOR_avx512vl_loadv16hi_mask:
10728 case CODE_FOR_avx512vl_loadv8hi_mask:
10729 aligned_mem = true;
10730 break;
10731 default:
10732 break;
10733 }
10734 /* FALLTHRU */
10735 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10736 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10737 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10738 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10739 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10740 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10741 case V16SI_FTYPE_PCINT_V16SI_UHI:
10742 case V8SI_FTYPE_PCINT_V8SI_UQI:
10743 case V4SI_FTYPE_PCINT_V4SI_UQI:
10744 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10745 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10746 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10747 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10748 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10749 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10750 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10751 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10752 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10753 nargs = 3;
10754 klass = load;
10755 memory = 0;
10756 break;
10757 default:
10758 gcc_unreachable ();
10759 }
10760
10761 gcc_assert (nargs <= ARRAY_SIZE (xops));
10762
10763 if (klass == store)
10764 {
10765 arg = CALL_EXPR_ARG (exp, 0);
10766 op = expand_normal (arg);
10767 gcc_assert (target == 0);
10768 if (memory)
10769 {
10770 op = ix86_zero_extend_to_Pmode (op);
10771 target = gen_rtx_MEM (tmode, op);
10772 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10773 on it. Try to improve it using get_pointer_alignment,
10774 and if the special builtin is one that requires strict
10775 mode alignment, also from its GET_MODE_ALIGNMENT.
10776 Failure to do so could lead to ix86_legitimate_combined_insn
10777 rejecting all changes to such insns. */
10778 unsigned int align = get_pointer_alignment (arg);
10779 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10780 align = GET_MODE_ALIGNMENT (tmode);
10781 if (MEM_ALIGN (target) < align)
10782 set_mem_align (target, align);
10783 }
10784 else
10785 target = force_reg (tmode, op);
10786 arg_adjust = 1;
10787 }
10788 else
10789 {
10790 arg_adjust = 0;
10791 if (optimize
10792 || target == 0
10793 || !register_operand (target, tmode)
10794 || GET_MODE (target) != tmode)
10795 target = gen_reg_rtx (tmode);
10796 }
10797
10798 for (i = 0; i < nargs; i++)
10799 {
10800 machine_mode mode = insn_p->operand[i + 1].mode;
10801
10802 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10803 op = expand_normal (arg);
10804
10805 if (i == memory)
10806 {
10807 /* This must be the memory operand. */
10808 op = ix86_zero_extend_to_Pmode (op);
10809 op = gen_rtx_MEM (mode, op);
10810 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10811 on it. Try to improve it using get_pointer_alignment,
10812 and if the special builtin is one that requires strict
10813 mode alignment, also from its GET_MODE_ALIGNMENT.
10814 Failure to do so could lead to ix86_legitimate_combined_insn
10815 rejecting all changes to such insns. */
10816 unsigned int align = get_pointer_alignment (arg);
10817 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10818 align = GET_MODE_ALIGNMENT (mode);
10819 if (MEM_ALIGN (op) < align)
10820 set_mem_align (op, align);
10821 }
10822 else
10823 {
10824 /* This must be register. */
10825 if (VECTOR_MODE_P (mode))
10826 op = safe_vector_operand (op, mode);
10827
10828 op = fixup_modeless_constant (op, mode);
10829
10830 /* NB: A 3-operand load implies a mask load, and the mask
10831 operand should be the last one.
10832 Keep an all-ones mask, which will be simplified by the expander. */
10833 if (nargs == 3 && i == 2 && klass == load
10834 && constm1_operand (op, mode))
10835 ;
10836 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10837 op = copy_to_mode_reg (mode, op);
10838 else
10839 {
10840 op = copy_to_reg (op);
10841 op = lowpart_subreg (mode, op, GET_MODE (op));
10842 }
10843 }
10844
10845 xops[i] = op;
10846 }
10847
10848 switch (nargs)
10849 {
10850 case 0:
10851 pat = GEN_FCN (icode) (target);
10852 break;
10853 case 1:
10854 pat = GEN_FCN (icode) (target, xops[0]);
10855 break;
10856 case 2:
10857 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10858 break;
10859 case 3:
10860 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10861 break;
10862 default:
10863 gcc_unreachable ();
10864 }
10865
10866 if (! pat)
10867 return 0;
10868
10869 emit_insn (pat);
10870 return klass == store ? 0 : target;
10871 }
10872
10873 /* Return the integer constant in ARG. Constrain it to be in the range
10874 of the subparts of VEC_TYPE; issue an error if not. */
10875
10876 static int
10877 get_element_number (tree vec_type, tree arg)
10878 {
10879 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10880
10881 if (!tree_fits_uhwi_p (arg)
10882 || (elt = tree_to_uhwi (arg), elt > max))
10883 {
10884 error ("selector must be an integer constant in the range "
10885 "[0, %wi]", max);
10886 return 0;
10887 }
10888
10889 return elt;
10890 }
10891
10892 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10893 ix86_expand_vector_init. We DO have language-level syntax for this, in
10894 the form of (type){ init-list }. Except that since we can't place emms
10895 instructions from inside the compiler, we can't allow the use of MMX
10896 registers unless the user explicitly asks for it. So we do *not* define
10897 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10898 we have builtins invoked by mmintrin.h that give us license to emit
10899 these sorts of instructions. */
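/* Illustrative sketch (hypothetical user code; the exact builtin each
   wrapper expands to is defined by the intrinsics headers):

     #include <mmintrin.h>

     __m64
     init_example (int hi, int lo)
     {
       // Assumed to expand to a vec_init builtin routed through
       // ix86_expand_vec_init_builtin.
       return _mm_set_pi32 (hi, lo);
     }
*/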
10900
10901 static rtx
10902 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10903 {
10904 machine_mode tmode = TYPE_MODE (type);
10905 machine_mode inner_mode = GET_MODE_INNER (tmode);
10906 int i, n_elt = GET_MODE_NUNITS (tmode);
10907 rtvec v = rtvec_alloc (n_elt);
10908
10909 gcc_assert (VECTOR_MODE_P (tmode));
10910 gcc_assert (call_expr_nargs (exp) == n_elt);
10911
10912 for (i = 0; i < n_elt; ++i)
10913 {
10914 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10915 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10916 }
10917
10918 if (!target || !register_operand (target, tmode))
10919 target = gen_reg_rtx (tmode);
10920
10921 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10922 return target;
10923 }
10924
10925 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10926 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10927 had a language-level syntax for referencing vector elements. */
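/* Illustrative sketch (hypothetical user code; the mapping to a vec_ext
   builtin is assumed):

     #include <xmmintrin.h>

     int
     extract_example (__m64 v)
     {
       return _mm_extract_pi16 (v, 2);
     }
*/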
10928
10929 static rtx
10930 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10931 {
10932 machine_mode tmode, mode0;
10933 tree arg0, arg1;
10934 int elt;
10935 rtx op0;
10936
10937 arg0 = CALL_EXPR_ARG (exp, 0);
10938 arg1 = CALL_EXPR_ARG (exp, 1);
10939
10940 op0 = expand_normal (arg0);
10941 elt = get_element_number (TREE_TYPE (arg0), arg1);
10942
10943 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10944 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10945 gcc_assert (VECTOR_MODE_P (mode0));
10946
10947 op0 = force_reg (mode0, op0);
10948
10949 if (optimize || !target || !register_operand (target, tmode))
10950 target = gen_reg_rtx (tmode);
10951
10952 ix86_expand_vector_extract (true, target, op0, elt);
10953
10954 return target;
10955 }
10956
10957 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10958 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10959 a language-level syntax for referencing vector elements. */
10960
10961 static rtx
10962 ix86_expand_vec_set_builtin (tree exp)
10963 {
10964 machine_mode tmode, mode1;
10965 tree arg0, arg1, arg2;
10966 int elt;
10967 rtx op0, op1, target;
10968
10969 arg0 = CALL_EXPR_ARG (exp, 0);
10970 arg1 = CALL_EXPR_ARG (exp, 1);
10971 arg2 = CALL_EXPR_ARG (exp, 2);
10972
10973 tmode = TYPE_MODE (TREE_TYPE (arg0));
10974 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10975 gcc_assert (VECTOR_MODE_P (tmode));
10976
10977 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10978 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10979 elt = get_element_number (TREE_TYPE (arg0), arg2);
10980
10981 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
10982 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10983
10984 op0 = force_reg (tmode, op0);
10985 op1 = force_reg (mode1, op1);
10986
10987 /* OP0 is the source of these builtin functions and shouldn't be
10988 modified. Create a copy, use it and return it as target. */
10989 target = gen_reg_rtx (tmode);
10990 emit_move_insn (target, op0);
10991 ix86_expand_vector_set (true, target, op1, elt);
10992
10993 return target;
10994 }
10995
10996 /* Expand an expression EXP that calls a built-in function,
10997 with result going to TARGET if that's convenient
10998 (and in mode MODE if that's convenient).
10999 SUBTARGET may be used as the target for computing one of EXP's operands.
11000 IGNORE is nonzero if the value is to be ignored. */
11001
11002 rtx
11003 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
11004 machine_mode mode, int ignore)
11005 {
11006 size_t i;
11007 enum insn_code icode, icode2;
11008 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
11009 tree arg0, arg1, arg2, arg3, arg4;
11010 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
11011 machine_mode mode0, mode1, mode2, mode3, mode4;
11012 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
11013
11014 /* For CPU builtins that can be folded, fold first and expand the fold. */
11015 switch (fcode)
11016 {
11017 case IX86_BUILTIN_CPU_INIT:
11018 {
11019 /* Make it call __cpu_indicator_init in libgcc. */
11020 tree call_expr, fndecl, type;
11021 type = build_function_type_list (integer_type_node, NULL_TREE);
11022 fndecl = build_fn_decl ("__cpu_indicator_init", type);
11023 call_expr = build_call_expr (fndecl, 0);
11024 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
11025 }
11026 case IX86_BUILTIN_CPU_IS:
11027 case IX86_BUILTIN_CPU_SUPPORTS:
11028 {
11029 tree arg0 = CALL_EXPR_ARG (exp, 0);
11030 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11031 gcc_assert (fold_expr != NULL_TREE);
11032 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11033 }
11034 }
11035
11036 HOST_WIDE_INT isa = ix86_isa_flags;
11037 HOST_WIDE_INT isa2 = ix86_isa_flags2;
11038 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11039 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11040 /* The general case is we require all the ISAs specified in bisa{,2}
11041 to be enabled.
11042 The exceptions are:
11043 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11044 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11045 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
11046 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
11047 OPTION_MASK_ISA2_AVXVNNI
11048 where for each such pair it is sufficient if either of the ISAs is
11049 enabled; any other options ored with the pair must still be enabled too.
11050 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
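  /* Worked example of the pairing rule: a builtin whose bisa contains
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 is accepted when either
     FMA or FMA4 is enabled; the fix-ups below add the missing half of
     such a pair to the local ISA copies so that the final
     "(bisa & isa) != bisa" check does not reject the call.  */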
11051 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11052 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11053 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11054 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
11055
11056 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11057 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11058 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11059 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
11060
11061 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11062 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11063 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11064 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
11065
11066 if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11067 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11068 || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
11069 && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11070 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11071 || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
11072 {
11073 isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
11074 isa2 |= OPTION_MASK_ISA2_AVXVNNI;
11075 }
11076
11077 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
11078 /* __builtin_ia32_maskmovq requires MMX registers. */
11079 && fcode != IX86_BUILTIN_MASKMOVQ)
11080 {
11081 bisa &= ~OPTION_MASK_ISA_MMX;
11082 bisa |= OPTION_MASK_ISA_SSE2;
11083 }
11084
11085 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11086 {
11087 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11088 if (TARGET_ABI_X32)
11089 bisa |= OPTION_MASK_ABI_X32;
11090 else
11091 bisa |= OPTION_MASK_ABI_64;
11092 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
11093 (enum fpmath_unit) 0,
11094 (enum prefer_vector_width) 0,
11095 false, add_abi_p);
11096 if (!opts)
11097 error ("%qE needs unknown isa option", fndecl);
11098 else
11099 {
11100 gcc_assert (opts != NULL);
11101 error ("%qE needs isa option %s", fndecl, opts);
11102 free (opts);
11103 }
11104 return expand_call (exp, target, ignore);
11105 }
11106
11107 switch (fcode)
11108 {
11109 case IX86_BUILTIN_MASKMOVQ:
11110 case IX86_BUILTIN_MASKMOVDQU:
11111 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11112 ? CODE_FOR_mmx_maskmovq
11113 : CODE_FOR_sse2_maskmovdqu);
11114 /* Note the arg order is different from the operand order. */
11115 arg1 = CALL_EXPR_ARG (exp, 0);
11116 arg2 = CALL_EXPR_ARG (exp, 1);
11117 arg0 = CALL_EXPR_ARG (exp, 2);
11118 op0 = expand_normal (arg0);
11119 op1 = expand_normal (arg1);
11120 op2 = expand_normal (arg2);
11121 mode0 = insn_data[icode].operand[0].mode;
11122 mode1 = insn_data[icode].operand[1].mode;
11123 mode2 = insn_data[icode].operand[2].mode;
11124
11125 op0 = ix86_zero_extend_to_Pmode (op0);
11126 op0 = gen_rtx_MEM (mode1, op0);
11127
11128 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11129 op0 = copy_to_mode_reg (mode0, op0);
11130 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11131 op1 = copy_to_mode_reg (mode1, op1);
11132 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11133 op2 = copy_to_mode_reg (mode2, op2);
11134 pat = GEN_FCN (icode) (op0, op1, op2);
11135 if (! pat)
11136 return 0;
11137 emit_insn (pat);
11138 return 0;
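      /* Illustrative sketch (assumed user code): the intrinsic puts the
         destination pointer last, which is why the args are reordered
         above:

           #include <emmintrin.h>

           void
           masked_store (char *p, __m128i data, __m128i mask)
           {
             _mm_maskmoveu_si128 (data, mask, p);
           }
      */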
11139
11140 case IX86_BUILTIN_LDMXCSR:
11141 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11142 target = assign_386_stack_local (SImode, SLOT_TEMP);
11143 emit_move_insn (target, op0);
11144 emit_insn (gen_sse_ldmxcsr (target));
11145 return 0;
11146
11147 case IX86_BUILTIN_STMXCSR:
11148 target = assign_386_stack_local (SImode, SLOT_TEMP);
11149 emit_insn (gen_sse_stmxcsr (target));
11150 return copy_to_mode_reg (SImode, target);
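      /* Illustrative sketch (assumed user code) for the two MXCSR
         builtins expanded above:

           #include <xmmintrin.h>

           void
           set_ftz (void)
           {
             // _mm_getcsr/_mm_setcsr are assumed to use these builtins;
             // the expansion above goes through a stack slot.
             _mm_setcsr (_mm_getcsr () | _MM_FLUSH_ZERO_ON);
           }
      */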
11151
11152 case IX86_BUILTIN_CLFLUSH:
11153 arg0 = CALL_EXPR_ARG (exp, 0);
11154 op0 = expand_normal (arg0);
11155 icode = CODE_FOR_sse2_clflush;
11156 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11157 op0 = ix86_zero_extend_to_Pmode (op0);
11158
11159 emit_insn (gen_sse2_clflush (op0));
11160 return 0;
11161
11162 case IX86_BUILTIN_CLWB:
11163 arg0 = CALL_EXPR_ARG (exp, 0);
11164 op0 = expand_normal (arg0);
11165 icode = CODE_FOR_clwb;
11166 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11167 op0 = ix86_zero_extend_to_Pmode (op0);
11168
11169 emit_insn (gen_clwb (op0));
11170 return 0;
11171
11172 case IX86_BUILTIN_CLFLUSHOPT:
11173 arg0 = CALL_EXPR_ARG (exp, 0);
11174 op0 = expand_normal (arg0);
11175 icode = CODE_FOR_clflushopt;
11176 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11177 op0 = ix86_zero_extend_to_Pmode (op0);
11178
11179 emit_insn (gen_clflushopt (op0));
11180 return 0;
11181
11182 case IX86_BUILTIN_MONITOR:
11183 case IX86_BUILTIN_MONITORX:
11184 arg0 = CALL_EXPR_ARG (exp, 0);
11185 arg1 = CALL_EXPR_ARG (exp, 1);
11186 arg2 = CALL_EXPR_ARG (exp, 2);
11187 op0 = expand_normal (arg0);
11188 op1 = expand_normal (arg1);
11189 op2 = expand_normal (arg2);
11190 if (!REG_P (op0))
11191 op0 = ix86_zero_extend_to_Pmode (op0);
11192 if (!REG_P (op1))
11193 op1 = copy_to_mode_reg (SImode, op1);
11194 if (!REG_P (op2))
11195 op2 = copy_to_mode_reg (SImode, op2);
11196
11197 emit_insn (fcode == IX86_BUILTIN_MONITOR
11198 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11199 : gen_monitorx (Pmode, op0, op1, op2));
11200 return 0;
11201
11202 case IX86_BUILTIN_MWAIT:
11203 arg0 = CALL_EXPR_ARG (exp, 0);
11204 arg1 = CALL_EXPR_ARG (exp, 1);
11205 op0 = expand_normal (arg0);
11206 op1 = expand_normal (arg1);
11207 if (!REG_P (op0))
11208 op0 = copy_to_mode_reg (SImode, op0);
11209 if (!REG_P (op1))
11210 op1 = copy_to_mode_reg (SImode, op1);
11211 emit_insn (gen_sse3_mwait (op0, op1));
11212 return 0;
11213
11214 case IX86_BUILTIN_MWAITX:
11215 arg0 = CALL_EXPR_ARG (exp, 0);
11216 arg1 = CALL_EXPR_ARG (exp, 1);
11217 arg2 = CALL_EXPR_ARG (exp, 2);
11218 op0 = expand_normal (arg0);
11219 op1 = expand_normal (arg1);
11220 op2 = expand_normal (arg2);
11221 if (!REG_P (op0))
11222 op0 = copy_to_mode_reg (SImode, op0);
11223 if (!REG_P (op1))
11224 op1 = copy_to_mode_reg (SImode, op1);
11225 if (!REG_P (op2))
11226 op2 = copy_to_mode_reg (SImode, op2);
11227 emit_insn (gen_mwaitx (op0, op1, op2));
11228 return 0;
11229
11230 case IX86_BUILTIN_UMONITOR:
11231 arg0 = CALL_EXPR_ARG (exp, 0);
11232 op0 = expand_normal (arg0);
11233
11234 op0 = ix86_zero_extend_to_Pmode (op0);
11235 emit_insn (gen_umonitor (Pmode, op0));
11236 return 0;
11237
11238 case IX86_BUILTIN_UMWAIT:
11239 case IX86_BUILTIN_TPAUSE:
11240 arg0 = CALL_EXPR_ARG (exp, 0);
11241 arg1 = CALL_EXPR_ARG (exp, 1);
11242 op0 = expand_normal (arg0);
11243 op1 = expand_normal (arg1);
11244
11245 if (!REG_P (op0))
11246 op0 = copy_to_mode_reg (SImode, op0);
11247
11248 op1 = force_reg (DImode, op1);
11249
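/* The 64-bit TSC deadline is in op1. The 64-bit patterns want it as
two SImode halves, so shift the upper half into op2 below; the
32-bit patterns consume the DImode value directly. */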
11250 if (TARGET_64BIT)
11251 {
11252 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11253 NULL, 1, OPTAB_DIRECT);
11254 switch (fcode)
11255 {
11256 case IX86_BUILTIN_UMWAIT:
11257 icode = CODE_FOR_umwait_rex64;
11258 break;
11259 case IX86_BUILTIN_TPAUSE:
11260 icode = CODE_FOR_tpause_rex64;
11261 break;
11262 default:
11263 gcc_unreachable ();
11264 }
11265
11266 op2 = gen_lowpart (SImode, op2);
11267 op1 = gen_lowpart (SImode, op1);
11268 pat = GEN_FCN (icode) (op0, op1, op2);
11269 }
11270 else
11271 {
11272 switch (fcode)
11273 {
11274 case IX86_BUILTIN_UMWAIT:
11275 icode = CODE_FOR_umwait;
11276 break;
11277 case IX86_BUILTIN_TPAUSE:
11278 icode = CODE_FOR_tpause;
11279 break;
11280 default:
11281 gcc_unreachable ();
11282 }
11283 pat = GEN_FCN (icode) (op0, op1);
11284 }
11285
11286 if (!pat)
11287 return 0;
11288
11289 emit_insn (pat);
11290
11291 if (target == 0
11292 || !register_operand (target, QImode))
11293 target = gen_reg_rtx (QImode);
11294
11295 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11296 const0_rtx);
11297 emit_insn (gen_rtx_SET (target, pat));
11298
11299 return target;
11300
11301 case IX86_BUILTIN_TESTUI:
11302 emit_insn (gen_testui ());
11303
11304 if (target == 0
11305 || !register_operand (target, QImode))
11306 target = gen_reg_rtx (QImode);
11307
11308 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11309 const0_rtx);
11310 emit_insn (gen_rtx_SET (target, pat));
11311
11312 return target;
11313
11314 case IX86_BUILTIN_CLZERO:
11315 arg0 = CALL_EXPR_ARG (exp, 0);
11316 op0 = expand_normal (arg0);
11317 if (!REG_P (op0))
11318 op0 = ix86_zero_extend_to_Pmode (op0);
11319 emit_insn (gen_clzero (Pmode, op0));
11320 return 0;
11321
11322 case IX86_BUILTIN_CLDEMOTE:
11323 arg0 = CALL_EXPR_ARG (exp, 0);
11324 op0 = expand_normal (arg0);
11325 icode = CODE_FOR_cldemote;
11326 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11327 op0 = ix86_zero_extend_to_Pmode (op0);
11328
11329 emit_insn (gen_cldemote (op0));
11330 return 0;
11331
11332 case IX86_BUILTIN_LOADIWKEY:
11333 {
11334 arg0 = CALL_EXPR_ARG (exp, 0);
11335 arg1 = CALL_EXPR_ARG (exp, 1);
11336 arg2 = CALL_EXPR_ARG (exp, 2);
11337 arg3 = CALL_EXPR_ARG (exp, 3);
11338
11339 op0 = expand_normal (arg0);
11340 op1 = expand_normal (arg1);
11341 op2 = expand_normal (arg2);
11342 op3 = expand_normal (arg3);
11343
11344 if (!REG_P (op0))
11345 op0 = copy_to_mode_reg (V2DImode, op0);
11346 if (!REG_P (op1))
11347 op1 = copy_to_mode_reg (V2DImode, op1);
11348 if (!REG_P (op2))
11349 op2 = copy_to_mode_reg (V2DImode, op2);
11350 if (!REG_P (op3))
11351 op3 = copy_to_mode_reg (SImode, op3);
11352
11353 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
11354
11355 return 0;
11356 }
11357
11358 case IX86_BUILTIN_AESDEC128KLU8:
11359 icode = CODE_FOR_aesdec128klu8;
11360 goto aesdecenc_expand;
11361
11362 case IX86_BUILTIN_AESDEC256KLU8:
11363 icode = CODE_FOR_aesdec256klu8;
11364 goto aesdecenc_expand;
11365
11366 case IX86_BUILTIN_AESENC128KLU8:
11367 icode = CODE_FOR_aesenc128klu8;
11368 goto aesdecenc_expand;
11369
11370 case IX86_BUILTIN_AESENC256KLU8:
11371 icode = CODE_FOR_aesenc256klu8;
11372
11373 aesdecenc_expand:
11374
11375 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
11376 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
11377 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11378
11379 op0 = expand_normal (arg0);
11380 op1 = expand_normal (arg1);
11381 op2 = expand_normal (arg2);
11382
11383 if (!address_operand (op0, V2DImode))
11384 {
11385 op0 = convert_memory_address (Pmode, op0);
11386 op0 = copy_addr_to_reg (op0);
11387 }
11388 op0 = gen_rtx_MEM (V2DImode, op0);
11389
11390 if (!REG_P (op1))
11391 op1 = copy_to_mode_reg (V2DImode, op1);
11392
11393 if (!address_operand (op2, VOIDmode))
11394 {
11395 op2 = convert_memory_address (Pmode, op2);
11396 op2 = copy_addr_to_reg (op2);
11397 }
11398 op2 = gen_rtx_MEM (BLKmode, op2);
11399
11400 emit_insn (GEN_FCN (icode) (op1, op1, op2));
11401
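/* The keylocker pattern updates op1 in place and leaves its status in
the flags register. Materialize that status as the QImode return value
and copy the transformed data back through the output pointer. */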
11402 if (target == 0)
11403 target = gen_reg_rtx (QImode);
11404
11405 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
11406 const0_rtx);
11407 emit_insn (gen_rtx_SET (target, pat));
11408
11409 emit_insn (gen_rtx_SET (op0, op1));
11410
11411 return target;
11412
11413 case IX86_BUILTIN_AESDECWIDE128KLU8:
11414 icode = CODE_FOR_aesdecwide128klu8;
11415 goto wideaesdecenc_expand;
11416
11417 case IX86_BUILTIN_AESDECWIDE256KLU8:
11418 icode = CODE_FOR_aesdecwide256klu8;
11419 goto wideaesdecenc_expand;
11420
11421 case IX86_BUILTIN_AESENCWIDE128KLU8:
11422 icode = CODE_FOR_aesencwide128klu8;
11423 goto wideaesdecenc_expand;
11424
11425 case IX86_BUILTIN_AESENCWIDE256KLU8:
11426 icode = CODE_FOR_aesencwide256klu8;
11427
11428 wideaesdecenc_expand:
11429
11430 rtx xmm_regs[8];
11431 rtx op;
11432
11433 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
11434 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
11435 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11436
11437 op0 = expand_normal (arg0);
11438 op1 = expand_normal (arg1);
11439 op2 = expand_normal (arg2);
11440
11441 if (!address_operand (op2, VOIDmode))
11442 {
11443 op2 = convert_memory_address (Pmode, op2);
11444 op2 = copy_addr_to_reg (op2);
11445 }
11446 op2 = gen_rtx_MEM (BLKmode, op2);
11447
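/* The wide keylocker patterns operate implicitly on %xmm0..%xmm7, so
load the eight 128-bit input blocks from *idata into those hard
registers first and copy them back to *odata afterwards. */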
11448 for (i = 0; i < 8; i++)
11449 {
11450 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11451
11452 op = gen_rtx_MEM (V2DImode,
11453 plus_constant (Pmode, op1, (i * 16)));
11454
11455 emit_move_insn (xmm_regs[i], op);
11456 }
11457
11458 emit_insn (GEN_FCN (icode) (op2));
11459
11460 if (target == 0)
11461 target = gen_reg_rtx (QImode);
11462
11463 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
11464 const0_rtx);
11465 emit_insn (gen_rtx_SET (target, pat));
11466
11467 for (i = 0; i < 8; i++)
11468 {
11469 op = gen_rtx_MEM (V2DImode,
11470 plus_constant (Pmode, op0, (i * 16)));
11471 emit_move_insn (op, xmm_regs[i]);
11472 }
11473
11474 return target;
11475
11476 case IX86_BUILTIN_ENCODEKEY128U32:
11477 {
11478 rtx op, xmm_regs[7];
11479
11480 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11481 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
11482 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
11483
11484 op0 = expand_normal (arg0);
11485 op1 = expand_normal (arg1);
11486 op2 = expand_normal (arg2);
11487
11488 if (!REG_P (op0))
11489 op0 = copy_to_mode_reg (SImode, op0);
11490
11491 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
11492 emit_move_insn (op, op1);
11493
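/* encodekey128 takes the input key implicitly in %xmm0 and produces
the handle in %xmm0..%xmm2; run the pattern and then spill the three
result registers to *h. */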
11494 for (i = 0; i < 3; i++)
11495 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11496
11497 if (target == 0)
11498 target = gen_reg_rtx (SImode);
11499
11500 emit_insn (gen_encodekey128u32 (target, op0));
11501
11502 for (i = 0; i < 3; i++)
11503 {
11504 op = gen_rtx_MEM (V2DImode,
11505 plus_constant (Pmode, op2, (i * 16)));
11506 emit_move_insn (op, xmm_regs[i]);
11507 }
11508
11509 return target;
11510 }
11511 case IX86_BUILTIN_ENCODEKEY256U32:
11512 {
11513 rtx op, xmm_regs[7];
11514
11515 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11516 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
11517 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
11518 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
11519
11520 op0 = expand_normal (arg0);
11521 op1 = expand_normal (arg1);
11522 op2 = expand_normal (arg2);
11523 op3 = expand_normal (arg3);
11524
11525 if (!REG_P (op0))
11526 op0 = copy_to_mode_reg (SImode, op0);
11527
11528 /* Force keylow and keyhi into xmm0 and xmm1, as the pattern expects. */
11529 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
11530 emit_move_insn (op, op1);
11531 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
11532 emit_move_insn (op, op2);
11533
11534 for (i = 0; i < 4; i++)
11535 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11536
11537 if (target == 0)
11538 target = gen_reg_rtx (SImode);
11539
11540 emit_insn (gen_encodekey256u32 (target, op0));
11541
11542 for (i = 0; i < 4; i++)
11543 {
11544 op = gen_rtx_MEM (V2DImode,
11545 plus_constant (Pmode, op3, (i * 16)));
11546 emit_move_insn (op, xmm_regs[i]);
11547 }
11548
11549 return target;
11550 }
11551
11552 case IX86_BUILTIN_VEC_INIT_V2SI:
11553 case IX86_BUILTIN_VEC_INIT_V4HI:
11554 case IX86_BUILTIN_VEC_INIT_V8QI:
11555 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11556
11557 case IX86_BUILTIN_VEC_EXT_V2DF:
11558 case IX86_BUILTIN_VEC_EXT_V2DI:
11559 case IX86_BUILTIN_VEC_EXT_V4SF:
11560 case IX86_BUILTIN_VEC_EXT_V4SI:
11561 case IX86_BUILTIN_VEC_EXT_V8HI:
11562 case IX86_BUILTIN_VEC_EXT_V2SI:
11563 case IX86_BUILTIN_VEC_EXT_V4HI:
11564 case IX86_BUILTIN_VEC_EXT_V16QI:
11565 return ix86_expand_vec_ext_builtin (exp, target);
11566
11567 case IX86_BUILTIN_VEC_SET_V2DI:
11568 case IX86_BUILTIN_VEC_SET_V4SF:
11569 case IX86_BUILTIN_VEC_SET_V4SI:
11570 case IX86_BUILTIN_VEC_SET_V8HI:
11571 case IX86_BUILTIN_VEC_SET_V4HI:
11572 case IX86_BUILTIN_VEC_SET_V16QI:
11573 return ix86_expand_vec_set_builtin (exp);
11574
11575 case IX86_BUILTIN_NANQ:
11576 case IX86_BUILTIN_NANSQ:
11577 return expand_call (exp, target, ignore);
11578
11579 case IX86_BUILTIN_RDPID:
11580
11581 op0 = gen_reg_rtx (word_mode);
11582
11583 if (TARGET_64BIT)
11584 {
11585 insn = gen_rdpid_rex64 (op0);
11586 op0 = convert_to_mode (SImode, op0, 1);
11587 }
11588 else
11589 insn = gen_rdpid (op0);
11590
11591 emit_insn (insn);
11592
11593 if (target == 0
11594 || !register_operand (target, SImode))
11595 target = gen_reg_rtx (SImode);
11596
11597 emit_move_insn (target, op0);
11598 return target;
11599
11600 case IX86_BUILTIN_2INTERSECTD512:
11601 case IX86_BUILTIN_2INTERSECTQ512:
11602 case IX86_BUILTIN_2INTERSECTD256:
11603 case IX86_BUILTIN_2INTERSECTQ256:
11604 case IX86_BUILTIN_2INTERSECTD128:
11605 case IX86_BUILTIN_2INTERSECTQ128:
11606 arg0 = CALL_EXPR_ARG (exp, 0);
11607 arg1 = CALL_EXPR_ARG (exp, 1);
11608 arg2 = CALL_EXPR_ARG (exp, 2);
11609 arg3 = CALL_EXPR_ARG (exp, 3);
11610 op0 = expand_normal (arg0);
11611 op1 = expand_normal (arg1);
11612 op2 = expand_normal (arg2);
11613 op3 = expand_normal (arg3);
11614
11615 if (!address_operand (op0, VOIDmode))
11616 {
11617 op0 = convert_memory_address (Pmode, op0);
11618 op0 = copy_addr_to_reg (op0);
11619 }
11620 if (!address_operand (op1, VOIDmode))
11621 {
11622 op1 = convert_memory_address (Pmode, op1);
11623 op1 = copy_addr_to_reg (op1);
11624 }
11625
11626 switch (fcode)
11627 {
11628 case IX86_BUILTIN_2INTERSECTD512:
11629 mode4 = P2HImode;
11630 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11631 break;
11632 case IX86_BUILTIN_2INTERSECTQ512:
11633 mode4 = P2QImode;
11634 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11635 break;
11636 case IX86_BUILTIN_2INTERSECTD256:
11637 mode4 = P2QImode;
11638 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11639 break;
11640 case IX86_BUILTIN_2INTERSECTQ256:
11641 mode4 = P2QImode;
11642 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11643 break;
11644 case IX86_BUILTIN_2INTERSECTD128:
11645 mode4 = P2QImode;
11646 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11647 break;
11648 case IX86_BUILTIN_2INTERSECTQ128:
11649 mode4 = P2QImode;
11650 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11651 break;
11652 default:
11653 gcc_unreachable ();
11654 }
11655
11656 mode2 = insn_data[icode].operand[1].mode;
11657 mode3 = insn_data[icode].operand[2].mode;
11658 if (!insn_data[icode].operand[1].predicate (op2, mode2))
11659 op2 = copy_to_mode_reg (mode2, op2);
11660 if (!insn_data[icode].operand[2].predicate (op3, mode3))
11661 op3 = copy_to_mode_reg (mode3, op3);
11662
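/* The 2intersect pattern produces a pair of mask registers in op4 (a
P2HImode or P2QImode pseudo); store its low part through the first
pointer argument and its high part through the second. */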
11663 op4 = gen_reg_rtx (mode4);
11664 emit_insn (GEN_FCN (icode) (op4, op2, op3));
11665 mode0 = mode4 == P2HImode ? HImode : QImode;
11666 emit_move_insn (gen_rtx_MEM (mode0, op0),
11667 gen_lowpart (mode0, op4));
11668 emit_move_insn (gen_rtx_MEM (mode0, op1),
11669 gen_highpart (mode0, op4));
11670
11671 return 0;
11672
11673 case IX86_BUILTIN_RDPMC:
11674 case IX86_BUILTIN_RDTSC:
11675 case IX86_BUILTIN_RDTSCP:
11676 case IX86_BUILTIN_XGETBV:
11677
11678 op0 = gen_reg_rtx (DImode);
11679 op1 = gen_reg_rtx (DImode);
11680
11681 if (fcode == IX86_BUILTIN_RDPMC)
11682 {
11683 arg0 = CALL_EXPR_ARG (exp, 0);
11684 op2 = expand_normal (arg0);
11685 if (!register_operand (op2, SImode))
11686 op2 = copy_to_mode_reg (SImode, op2);
11687
11688 insn = (TARGET_64BIT
11689 ? gen_rdpmc_rex64 (op0, op1, op2)
11690 : gen_rdpmc (op0, op2));
11691 emit_insn (insn);
11692 }
11693 else if (fcode == IX86_BUILTIN_XGETBV)
11694 {
11695 arg0 = CALL_EXPR_ARG (exp, 0);
11696 op2 = expand_normal (arg0);
11697 if (!register_operand (op2, SImode))
11698 op2 = copy_to_mode_reg (SImode, op2);
11699
11700 insn = (TARGET_64BIT
11701 ? gen_xgetbv_rex64 (op0, op1, op2)
11702 : gen_xgetbv (op0, op2));
11703 emit_insn (insn);
11704 }
11705 else if (fcode == IX86_BUILTIN_RDTSC)
11706 {
11707 insn = (TARGET_64BIT
11708 ? gen_rdtsc_rex64 (op0, op1)
11709 : gen_rdtsc (op0));
11710 emit_insn (insn);
11711 }
11712 else
11713 {
11714 op2 = gen_reg_rtx (SImode);
11715
11716 insn = (TARGET_64BIT
11717 ? gen_rdtscp_rex64 (op0, op1, op2)
11718 : gen_rdtscp (op0, op2));
11719 emit_insn (insn);
11720
11721 arg0 = CALL_EXPR_ARG (exp, 0);
11722 op4 = expand_normal (arg0);
11723 if (!address_operand (op4, VOIDmode))
11724 {
11725 op4 = convert_memory_address (Pmode, op4);
11726 op4 = copy_addr_to_reg (op4);
11727 }
11728 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11729 }
11730
11731 if (target == 0
11732 || !register_operand (target, DImode))
11733 target = gen_reg_rtx (DImode);
11734
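/* Each of these builtins returns a 64-bit value. On 64-bit targets
the instruction delivers it as two 32-bit halves in op0 and op1, so
merge them into a single DImode value before copying to the target. */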
11735 if (TARGET_64BIT)
11736 {
11737 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11738 op1, 1, OPTAB_DIRECT);
11739 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11740 op0, 1, OPTAB_DIRECT);
11741 }
11742
11743 emit_move_insn (target, op0);
11744 return target;
11745
11746 case IX86_BUILTIN_ENQCMD:
11747 case IX86_BUILTIN_ENQCMDS:
11748 case IX86_BUILTIN_MOVDIR64B:
11749
11750 arg0 = CALL_EXPR_ARG (exp, 0);
11751 arg1 = CALL_EXPR_ARG (exp, 1);
11752 op0 = expand_normal (arg0);
11753 op1 = expand_normal (arg1);
11754
11755 op0 = ix86_zero_extend_to_Pmode (op0);
11756 if (!address_operand (op1, VOIDmode))
11757 {
11758 op1 = convert_memory_address (Pmode, op1);
11759 op1 = copy_addr_to_reg (op1);
11760 }
11761 op1 = gen_rtx_MEM (XImode, op1);
11762
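/* op0 is the destination address and op1 the 64-byte (XImode) source
memory operand shared by movdir64b, enqcmd and enqcmds. */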
11763 if (fcode == IX86_BUILTIN_MOVDIR64B)
11764 {
11765 emit_insn (gen_movdir64b (Pmode, op0, op1));
11766 return 0;
11767 }
11768 else
11769 {
11770 if (target == 0
11771 || !register_operand (target, SImode))
11772 target = gen_reg_rtx (SImode);
11773
11774 emit_move_insn (target, const0_rtx);
11775 target = gen_rtx_SUBREG (QImode, target, 0);
11776
11777 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
11778 ? UNSPECV_ENQCMD
11779 : UNSPECV_ENQCMDS);
11780 icode = code_for_enqcmd (unspecv, Pmode);
11781 emit_insn (GEN_FCN (icode) (op0, op1));
11782
11783 emit_insn
11784 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11785 gen_rtx_fmt_ee (EQ, QImode,
11786 gen_rtx_REG (CCZmode, FLAGS_REG),
11787 const0_rtx)));
11788 return SUBREG_REG (target);
11789 }
11790
11791 case IX86_BUILTIN_FXSAVE:
11792 case IX86_BUILTIN_FXRSTOR:
11793 case IX86_BUILTIN_FXSAVE64:
11794 case IX86_BUILTIN_FXRSTOR64:
11795 case IX86_BUILTIN_FNSTENV:
11796 case IX86_BUILTIN_FLDENV:
11797 mode0 = BLKmode;
11798 switch (fcode)
11799 {
11800 case IX86_BUILTIN_FXSAVE:
11801 icode = CODE_FOR_fxsave;
11802 break;
11803 case IX86_BUILTIN_FXRSTOR:
11804 icode = CODE_FOR_fxrstor;
11805 break;
11806 case IX86_BUILTIN_FXSAVE64:
11807 icode = CODE_FOR_fxsave64;
11808 break;
11809 case IX86_BUILTIN_FXRSTOR64:
11810 icode = CODE_FOR_fxrstor64;
11811 break;
11812 case IX86_BUILTIN_FNSTENV:
11813 icode = CODE_FOR_fnstenv;
11814 break;
11815 case IX86_BUILTIN_FLDENV:
11816 icode = CODE_FOR_fldenv;
11817 break;
11818 default:
11819 gcc_unreachable ();
11820 }
11821
11822 arg0 = CALL_EXPR_ARG (exp, 0);
11823 op0 = expand_normal (arg0);
11824
11825 if (!address_operand (op0, VOIDmode))
11826 {
11827 op0 = convert_memory_address (Pmode, op0);
11828 op0 = copy_addr_to_reg (op0);
11829 }
11830 op0 = gen_rtx_MEM (mode0, op0);
11831
11832 pat = GEN_FCN (icode) (op0);
11833 if (pat)
11834 emit_insn (pat);
11835 return 0;
11836
11837 case IX86_BUILTIN_XSETBV:
11838 arg0 = CALL_EXPR_ARG (exp, 0);
11839 arg1 = CALL_EXPR_ARG (exp, 1);
11840 op0 = expand_normal (arg0);
11841 op1 = expand_normal (arg1);
11842
11843 if (!REG_P (op0))
11844 op0 = copy_to_mode_reg (SImode, op0);
11845
11846 op1 = force_reg (DImode, op1);
11847
11848 if (TARGET_64BIT)
11849 {
11850 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11851 NULL, 1, OPTAB_DIRECT);
11852
11853 icode = CODE_FOR_xsetbv_rex64;
11854
11855 op2 = gen_lowpart (SImode, op2);
11856 op1 = gen_lowpart (SImode, op1);
11857 pat = GEN_FCN (icode) (op0, op1, op2);
11858 }
11859 else
11860 {
11861 icode = CODE_FOR_xsetbv;
11862
11863 pat = GEN_FCN (icode) (op0, op1);
11864 }
11865 if (pat)
11866 emit_insn (pat);
11867 return 0;
11868
11869 case IX86_BUILTIN_XSAVE:
11870 case IX86_BUILTIN_XRSTOR:
11871 case IX86_BUILTIN_XSAVE64:
11872 case IX86_BUILTIN_XRSTOR64:
11873 case IX86_BUILTIN_XSAVEOPT:
11874 case IX86_BUILTIN_XSAVEOPT64:
11875 case IX86_BUILTIN_XSAVES:
11876 case IX86_BUILTIN_XRSTORS:
11877 case IX86_BUILTIN_XSAVES64:
11878 case IX86_BUILTIN_XRSTORS64:
11879 case IX86_BUILTIN_XSAVEC:
11880 case IX86_BUILTIN_XSAVEC64:
11881 arg0 = CALL_EXPR_ARG (exp, 0);
11882 arg1 = CALL_EXPR_ARG (exp, 1);
11883 op0 = expand_normal (arg0);
11884 op1 = expand_normal (arg1);
11885
11886 if (!address_operand (op0, VOIDmode))
11887 {
11888 op0 = convert_memory_address (Pmode, op0);
11889 op0 = copy_addr_to_reg (op0);
11890 }
11891 op0 = gen_rtx_MEM (BLKmode, op0);
11892
11893 op1 = force_reg (DImode, op1);
11894
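/* The requested-feature mask is a 64-bit value in op1. The 64-bit
patterns take it as two SImode halves (the EDX:EAX convention), so
split it below; the 32-bit patterns consume the DImode value. */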
11895 if (TARGET_64BIT)
11896 {
11897 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11898 NULL, 1, OPTAB_DIRECT);
11899 switch (fcode)
11900 {
11901 case IX86_BUILTIN_XSAVE:
11902 icode = CODE_FOR_xsave_rex64;
11903 break;
11904 case IX86_BUILTIN_XRSTOR:
11905 icode = CODE_FOR_xrstor_rex64;
11906 break;
11907 case IX86_BUILTIN_XSAVE64:
11908 icode = CODE_FOR_xsave64;
11909 break;
11910 case IX86_BUILTIN_XRSTOR64:
11911 icode = CODE_FOR_xrstor64;
11912 break;
11913 case IX86_BUILTIN_XSAVEOPT:
11914 icode = CODE_FOR_xsaveopt_rex64;
11915 break;
11916 case IX86_BUILTIN_XSAVEOPT64:
11917 icode = CODE_FOR_xsaveopt64;
11918 break;
11919 case IX86_BUILTIN_XSAVES:
11920 icode = CODE_FOR_xsaves_rex64;
11921 break;
11922 case IX86_BUILTIN_XRSTORS:
11923 icode = CODE_FOR_xrstors_rex64;
11924 break;
11925 case IX86_BUILTIN_XSAVES64:
11926 icode = CODE_FOR_xsaves64;
11927 break;
11928 case IX86_BUILTIN_XRSTORS64:
11929 icode = CODE_FOR_xrstors64;
11930 break;
11931 case IX86_BUILTIN_XSAVEC:
11932 icode = CODE_FOR_xsavec_rex64;
11933 break;
11934 case IX86_BUILTIN_XSAVEC64:
11935 icode = CODE_FOR_xsavec64;
11936 break;
11937 default:
11938 gcc_unreachable ();
11939 }
11940
11941 op2 = gen_lowpart (SImode, op2);
11942 op1 = gen_lowpart (SImode, op1);
11943 pat = GEN_FCN (icode) (op0, op1, op2);
11944 }
11945 else
11946 {
11947 switch (fcode)
11948 {
11949 case IX86_BUILTIN_XSAVE:
11950 icode = CODE_FOR_xsave;
11951 break;
11952 case IX86_BUILTIN_XRSTOR:
11953 icode = CODE_FOR_xrstor;
11954 break;
11955 case IX86_BUILTIN_XSAVEOPT:
11956 icode = CODE_FOR_xsaveopt;
11957 break;
11958 case IX86_BUILTIN_XSAVES:
11959 icode = CODE_FOR_xsaves;
11960 break;
11961 case IX86_BUILTIN_XRSTORS:
11962 icode = CODE_FOR_xrstors;
11963 break;
11964 case IX86_BUILTIN_XSAVEC:
11965 icode = CODE_FOR_xsavec;
11966 break;
11967 default:
11968 gcc_unreachable ();
11969 }
11970 pat = GEN_FCN (icode) (op0, op1);
11971 }
11972
11973 if (pat)
11974 emit_insn (pat);
11975 return 0;
11976
11977 case IX86_BUILTIN_LLWPCB:
11978 arg0 = CALL_EXPR_ARG (exp, 0);
11979 op0 = expand_normal (arg0);
11980
11981 if (!register_operand (op0, Pmode))
11982 op0 = ix86_zero_extend_to_Pmode (op0);
11983 emit_insn (gen_lwp_llwpcb (Pmode, op0));
11984 return 0;
11985
11986 case IX86_BUILTIN_SLWPCB:
11987 if (!target
11988 || !register_operand (target, Pmode))
11989 target = gen_reg_rtx (Pmode);
11990 emit_insn (gen_lwp_slwpcb (Pmode, target));
11991 return target;
11992
11993 case IX86_BUILTIN_LWPVAL32:
11994 case IX86_BUILTIN_LWPVAL64:
11995 case IX86_BUILTIN_LWPINS32:
11996 case IX86_BUILTIN_LWPINS64:
11997 mode = ((fcode == IX86_BUILTIN_LWPVAL32
11998 || fcode == IX86_BUILTIN_LWPINS32)
11999 ? SImode : DImode);
12000
12001 if (fcode == IX86_BUILTIN_LWPVAL32
12002 || fcode == IX86_BUILTIN_LWPVAL64)
12003 icode = code_for_lwp_lwpval (mode);
12004 else
12005 icode = code_for_lwp_lwpins (mode);
12006
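/* lwpval has no result; lwpins sets the flags, and the expansion
below turns them into a QImode return value. */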
12007 arg0 = CALL_EXPR_ARG (exp, 0);
12008 arg1 = CALL_EXPR_ARG (exp, 1);
12009 arg2 = CALL_EXPR_ARG (exp, 2);
12010 op0 = expand_normal (arg0);
12011 op1 = expand_normal (arg1);
12012 op2 = expand_normal (arg2);
12013 mode0 = insn_data[icode].operand[0].mode;
12014
12015 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12016 op0 = copy_to_mode_reg (mode0, op0);
12017 if (!insn_data[icode].operand[1].predicate (op1, SImode))
12018 op1 = copy_to_mode_reg (SImode, op1);
12019
12020 if (!CONST_INT_P (op2))
12021 {
12022 error ("the last argument must be a 32-bit immediate");
12023 return const0_rtx;
12024 }
12025
12026 emit_insn (GEN_FCN (icode) (op0, op1, op2));
12027
12028 if (fcode == IX86_BUILTIN_LWPINS32
12029 || fcode == IX86_BUILTIN_LWPINS64)
12030 {
12031 if (target == 0
12032 || !nonimmediate_operand (target, QImode))
12033 target = gen_reg_rtx (QImode);
12034
12035 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12036 const0_rtx);
12037 emit_insn (gen_rtx_SET (target, pat));
12038
12039 return target;
12040 }
12041 else
12042 return 0;
12043
12044 case IX86_BUILTIN_BEXTRI32:
12045 case IX86_BUILTIN_BEXTRI64:
12046 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
12047
12048 arg0 = CALL_EXPR_ARG (exp, 0);
12049 arg1 = CALL_EXPR_ARG (exp, 1);
12050 op0 = expand_normal (arg0);
12051 op1 = expand_normal (arg1);
12052
12053 if (!CONST_INT_P (op1))
12054 {
12055 error ("last argument must be an immediate");
12056 return const0_rtx;
12057 }
12058 else
12059 {
12060 unsigned char lsb_index = UINTVAL (op1);
12061 unsigned char length = UINTVAL (op1) >> 8;
12062
12063 unsigned char bitsize = GET_MODE_BITSIZE (mode);
12064
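/* The bextri immediate encodes the start bit in its low byte and the
field length in the next byte. Extractions that are empty or start
beyond the operand fold to zero at expand time, and over-long fields
are clamped to the operand size. */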
12065 icode = code_for_tbm_bextri (mode);
12066
12067 mode1 = insn_data[icode].operand[1].mode;
12068 if (!insn_data[icode].operand[1].predicate (op0, mode1))
12069 op0 = copy_to_mode_reg (mode1, op0);
12070
12071 mode0 = insn_data[icode].operand[0].mode;
12072 if (target == 0
12073 || !register_operand (target, mode0))
12074 target = gen_reg_rtx (mode0);
12075
12076 if (length == 0 || lsb_index >= bitsize)
12077 {
12078 emit_move_insn (target, const0_rtx);
12079 return target;
12080 }
12081
12082 if (length + lsb_index > bitsize)
12083 length = bitsize - lsb_index;
12084
12085 op1 = GEN_INT (length);
12086 op2 = GEN_INT (lsb_index);
12087
12088 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
12089 return target;
12090 }
12091
12092 case IX86_BUILTIN_RDRAND16_STEP:
12093 mode = HImode;
12094 goto rdrand_step;
12095
12096 case IX86_BUILTIN_RDRAND32_STEP:
12097 mode = SImode;
12098 goto rdrand_step;
12099
12100 case IX86_BUILTIN_RDRAND64_STEP:
12101 mode = DImode;
12102
12103 rdrand_step:
12104 arg0 = CALL_EXPR_ARG (exp, 0);
12105 op1 = expand_normal (arg0);
12106 if (!address_operand (op1, VOIDmode))
12107 {
12108 op1 = convert_memory_address (Pmode, op1);
12109 op1 = copy_addr_to_reg (op1);
12110 }
12111
12112 op0 = gen_reg_rtx (mode);
12113 emit_insn (gen_rdrand (mode, op0));
12114
12115 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
12116
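/* Build the 0/1 success value: the conditional move below selects the
constant 1 when the carry flag signals success and the zero-extended
destination otherwise (rdrand zeroes its destination on failure, so
that path yields 0). */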
12117 op1 = force_reg (SImode, const1_rtx);
12118
12119 /* Emit SImode conditional move. */
12120 if (mode == HImode)
12121 {
12122 if (TARGET_ZERO_EXTEND_WITH_AND
12123 && optimize_function_for_speed_p (cfun))
12124 {
12125 op2 = force_reg (SImode, const0_rtx);
12126
12127 emit_insn (gen_movstricthi
12128 (gen_lowpart (HImode, op2), op0));
12129 }
12130 else
12131 {
12132 op2 = gen_reg_rtx (SImode);
12133
12134 emit_insn (gen_zero_extendhisi2 (op2, op0));
12135 }
12136 }
12137 else if (mode == SImode)
12138 op2 = op0;
12139 else
12140 op2 = gen_rtx_SUBREG (SImode, op0, 0);
12141
12142 if (target == 0
12143 || !register_operand (target, SImode))
12144 target = gen_reg_rtx (SImode);
12145
12146 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
12147 const0_rtx);
12148 emit_insn (gen_rtx_SET (target,
12149 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
12150 return target;
12151
12152 case IX86_BUILTIN_RDSEED16_STEP:
12153 mode = HImode;
12154 goto rdseed_step;
12155
12156 case IX86_BUILTIN_RDSEED32_STEP:
12157 mode = SImode;
12158 goto rdseed_step;
12159
12160 case IX86_BUILTIN_RDSEED64_STEP:
12161 mode = DImode;
12162
12163 rdseed_step:
12164 arg0 = CALL_EXPR_ARG (exp, 0);
12165 op1 = expand_normal (arg0);
12166 if (!address_operand (op1, VOIDmode))
12167 {
12168 op1 = convert_memory_address (Pmode, op1);
12169 op1 = copy_addr_to_reg (op1);
12170 }
12171
12172 op0 = gen_reg_rtx (mode);
12173 emit_insn (gen_rdseed (mode, op0));
12174
12175 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
12176
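/* For rdseed the return value is simply the carry flag: capture it in
a QImode temporary and zero-extend it to the SImode result. */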
12177 op2 = gen_reg_rtx (QImode);
12178
12179 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12180 const0_rtx);
12181 emit_insn (gen_rtx_SET (op2, pat));
12182
12183 if (target == 0
12184 || !register_operand (target, SImode))
12185 target = gen_reg_rtx (SImode);
12186
12187 emit_insn (gen_zero_extendqisi2 (target, op2));
12188 return target;
12189
12190 case IX86_BUILTIN_SBB32:
12191 icode = CODE_FOR_subborrowsi;
12192 icode2 = CODE_FOR_subborrowsi_0;
12193 mode0 = SImode;
12194 mode1 = DImode;
12195 mode2 = CCmode;
12196 goto handlecarry;
12197
12198 case IX86_BUILTIN_SBB64:
12199 icode = CODE_FOR_subborrowdi;
12200 icode2 = CODE_FOR_subborrowdi_0;
12201 mode0 = DImode;
12202 mode1 = TImode;
12203 mode2 = CCmode;
12204 goto handlecarry;
12205
12206 case IX86_BUILTIN_ADDCARRYX32:
12207 icode = CODE_FOR_addcarrysi;
12208 icode2 = CODE_FOR_addcarrysi_0;
12209 mode0 = SImode;
12210 mode1 = DImode;
12211 mode2 = CCCmode;
12212 goto handlecarry;
12213
12214 case IX86_BUILTIN_ADDCARRYX64:
12215 icode = CODE_FOR_addcarrydi;
12216 icode2 = CODE_FOR_addcarrydi_0;
12217 mode0 = DImode;
12218 mode1 = TImode;
12219 mode2 = CCCmode;
12220
12221 handlecarry:
12222 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
12223 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
12224 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
12225 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
12226
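/* Illustrative use, e.g. via an _addcarry_u32-style wrapper:
unsigned int sum;
unsigned char c_out = _addcarry_u32 (c_in, a, b, &sum);
c_in is materialized into the carry flag (unless it is known to be
zero), the flag-consuming add/sub is emitted, the word result is
stored through sum_out and the new carry/borrow becomes the return
value. */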
12227 op1 = expand_normal (arg0);
12228 if (!integer_zerop (arg0))
12229 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
12230
12231 op2 = expand_normal (arg1);
12232 if (!register_operand (op2, mode0))
12233 op2 = copy_to_mode_reg (mode0, op2);
12234
12235 op3 = expand_normal (arg2);
12236 if (!register_operand (op3, mode0))
12237 op3 = copy_to_mode_reg (mode0, op3);
12238
12239 op4 = expand_normal (arg3);
12240 if (!address_operand (op4, VOIDmode))
12241 {
12242 op4 = convert_memory_address (Pmode, op4);
12243 op4 = copy_addr_to_reg (op4);
12244 }
12245
12246 op0 = gen_reg_rtx (mode0);
12247 if (integer_zerop (arg0))
12248 {
12249 /* If arg0 is 0, optimize right away into an add or sub
12250 instruction that sets the CCCmode flags. */
12251 op1 = gen_rtx_REG (mode2, FLAGS_REG);
12252 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
12253 }
12254 else
12255 {
12256 /* Generate CF from input operand. */
12257 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
12258
12259 /* Generate instruction that consumes CF. */
12260 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
12261 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
12262 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
12263 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
12264 }
12265
12266 /* Return current CF value. */
12267 if (target == 0)
12268 target = gen_reg_rtx (QImode);
12269
12270 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
12271 emit_insn (gen_rtx_SET (target, pat));
12272
12273 /* Store the result. */
12274 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
12275
12276 return target;
12277
12278 case IX86_BUILTIN_READ_FLAGS:
12279 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
12280
12281 if (optimize
12282 || target == NULL_RTX
12283 || !nonimmediate_operand (target, word_mode)
12284 || GET_MODE (target) != word_mode)
12285 target = gen_reg_rtx (word_mode);
12286
12287 emit_insn (gen_pop (target));
12288 return target;
12289
12290 case IX86_BUILTIN_WRITE_FLAGS:
12291
12292 arg0 = CALL_EXPR_ARG (exp, 0);
12293 op0 = expand_normal (arg0);
12294 if (!general_no_elim_operand (op0, word_mode))
12295 op0 = copy_to_mode_reg (word_mode, op0);
12296
12297 emit_insn (gen_push (op0));
12298 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
12299 return 0;
12300
12301 case IX86_BUILTIN_KTESTC8:
12302 icode = CODE_FOR_ktestqi;
12303 mode3 = CCCmode;
12304 goto kortest;
12305
12306 case IX86_BUILTIN_KTESTZ8:
12307 icode = CODE_FOR_ktestqi;
12308 mode3 = CCZmode;
12309 goto kortest;
12310
12311 case IX86_BUILTIN_KTESTC16:
12312 icode = CODE_FOR_ktesthi;
12313 mode3 = CCCmode;
12314 goto kortest;
12315
12316 case IX86_BUILTIN_KTESTZ16:
12317 icode = CODE_FOR_ktesthi;
12318 mode3 = CCZmode;
12319 goto kortest;
12320
12321 case IX86_BUILTIN_KTESTC32:
12322 icode = CODE_FOR_ktestsi;
12323 mode3 = CCCmode;
12324 goto kortest;
12325
12326 case IX86_BUILTIN_KTESTZ32:
12327 icode = CODE_FOR_ktestsi;
12328 mode3 = CCZmode;
12329 goto kortest;
12330
12331 case IX86_BUILTIN_KTESTC64:
12332 icode = CODE_FOR_ktestdi;
12333 mode3 = CCCmode;
12334 goto kortest;
12335
12336 case IX86_BUILTIN_KTESTZ64:
12337 icode = CODE_FOR_ktestdi;
12338 mode3 = CCZmode;
12339 goto kortest;
12340
12341 case IX86_BUILTIN_KORTESTC8:
12342 icode = CODE_FOR_kortestqi;
12343 mode3 = CCCmode;
12344 goto kortest;
12345
12346 case IX86_BUILTIN_KORTESTZ8:
12347 icode = CODE_FOR_kortestqi;
12348 mode3 = CCZmode;
12349 goto kortest;
12350
12351 case IX86_BUILTIN_KORTESTC16:
12352 icode = CODE_FOR_kortesthi;
12353 mode3 = CCCmode;
12354 goto kortest;
12355
12356 case IX86_BUILTIN_KORTESTZ16:
12357 icode = CODE_FOR_kortesthi;
12358 mode3 = CCZmode;
12359 goto kortest;
12360
12361 case IX86_BUILTIN_KORTESTC32:
12362 icode = CODE_FOR_kortestsi;
12363 mode3 = CCCmode;
12364 goto kortest;
12365
12366 case IX86_BUILTIN_KORTESTZ32:
12367 icode = CODE_FOR_kortestsi;
12368 mode3 = CCZmode;
12369 goto kortest;
12370
12371 case IX86_BUILTIN_KORTESTC64:
12372 icode = CODE_FOR_kortestdi;
12373 mode3 = CCCmode;
12374 goto kortest;
12375
12376 case IX86_BUILTIN_KORTESTZ64:
12377 icode = CODE_FOR_kortestdi;
12378 mode3 = CCZmode;
12379
12380 kortest:
12381 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
12382 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
12383 op0 = expand_normal (arg0);
12384 op1 = expand_normal (arg1);
12385
12386 mode0 = insn_data[icode].operand[0].mode;
12387 mode1 = insn_data[icode].operand[1].mode;
12388
12389 if (GET_MODE (op0) != VOIDmode)
12390 op0 = force_reg (GET_MODE (op0), op0);
12391
12392 op0 = gen_lowpart (mode0, op0);
12393
12394 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12395 op0 = copy_to_mode_reg (mode0, op0);
12396
12397 if (GET_MODE (op1) != VOIDmode)
12398 op1 = force_reg (GET_MODE (op1), op1);
12399
12400 op1 = gen_lowpart (mode1, op1);
12401
12402 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12403 op1 = copy_to_mode_reg (mode1, op1);
12404
12405 target = gen_reg_rtx (QImode);
12406
12407 /* Emit kortest. */
12408 emit_insn (GEN_FCN (icode) (op0, op1));
12409 /* And use setcc to return result from flags. */
12410 ix86_expand_setcc (target, EQ,
12411 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12412 return target;
12413
12414 case IX86_BUILTIN_GATHERSIV2DF:
12415 icode = CODE_FOR_avx2_gathersiv2df;
12416 goto gather_gen;
12417 case IX86_BUILTIN_GATHERSIV4DF:
12418 icode = CODE_FOR_avx2_gathersiv4df;
12419 goto gather_gen;
12420 case IX86_BUILTIN_GATHERDIV2DF:
12421 icode = CODE_FOR_avx2_gatherdiv2df;
12422 goto gather_gen;
12423 case IX86_BUILTIN_GATHERDIV4DF:
12424 icode = CODE_FOR_avx2_gatherdiv4df;
12425 goto gather_gen;
12426 case IX86_BUILTIN_GATHERSIV4SF:
12427 icode = CODE_FOR_avx2_gathersiv4sf;
12428 goto gather_gen;
12429 case IX86_BUILTIN_GATHERSIV8SF:
12430 icode = CODE_FOR_avx2_gathersiv8sf;
12431 goto gather_gen;
12432 case IX86_BUILTIN_GATHERDIV4SF:
12433 icode = CODE_FOR_avx2_gatherdiv4sf;
12434 goto gather_gen;
12435 case IX86_BUILTIN_GATHERDIV8SF:
12436 icode = CODE_FOR_avx2_gatherdiv8sf;
12437 goto gather_gen;
12438 case IX86_BUILTIN_GATHERSIV2DI:
12439 icode = CODE_FOR_avx2_gathersiv2di;
12440 goto gather_gen;
12441 case IX86_BUILTIN_GATHERSIV4DI:
12442 icode = CODE_FOR_avx2_gathersiv4di;
12443 goto gather_gen;
12444 case IX86_BUILTIN_GATHERDIV2DI:
12445 icode = CODE_FOR_avx2_gatherdiv2di;
12446 goto gather_gen;
12447 case IX86_BUILTIN_GATHERDIV4DI:
12448 icode = CODE_FOR_avx2_gatherdiv4di;
12449 goto gather_gen;
12450 case IX86_BUILTIN_GATHERSIV4SI:
12451 icode = CODE_FOR_avx2_gathersiv4si;
12452 goto gather_gen;
12453 case IX86_BUILTIN_GATHERSIV8SI:
12454 icode = CODE_FOR_avx2_gathersiv8si;
12455 goto gather_gen;
12456 case IX86_BUILTIN_GATHERDIV4SI:
12457 icode = CODE_FOR_avx2_gatherdiv4si;
12458 goto gather_gen;
12459 case IX86_BUILTIN_GATHERDIV8SI:
12460 icode = CODE_FOR_avx2_gatherdiv8si;
12461 goto gather_gen;
12462 case IX86_BUILTIN_GATHERALTSIV4DF:
12463 icode = CODE_FOR_avx2_gathersiv4df;
12464 goto gather_gen;
12465 case IX86_BUILTIN_GATHERALTDIV8SF:
12466 icode = CODE_FOR_avx2_gatherdiv8sf;
12467 goto gather_gen;
12468 case IX86_BUILTIN_GATHERALTSIV4DI:
12469 icode = CODE_FOR_avx2_gathersiv4di;
12470 goto gather_gen;
12471 case IX86_BUILTIN_GATHERALTDIV8SI:
12472 icode = CODE_FOR_avx2_gatherdiv8si;
12473 goto gather_gen;
12474 case IX86_BUILTIN_GATHER3SIV16SF:
12475 icode = CODE_FOR_avx512f_gathersiv16sf;
12476 goto gather_gen;
12477 case IX86_BUILTIN_GATHER3SIV8DF:
12478 icode = CODE_FOR_avx512f_gathersiv8df;
12479 goto gather_gen;
12480 case IX86_BUILTIN_GATHER3DIV16SF:
12481 icode = CODE_FOR_avx512f_gatherdiv16sf;
12482 goto gather_gen;
12483 case IX86_BUILTIN_GATHER3DIV8DF:
12484 icode = CODE_FOR_avx512f_gatherdiv8df;
12485 goto gather_gen;
12486 case IX86_BUILTIN_GATHER3SIV16SI:
12487 icode = CODE_FOR_avx512f_gathersiv16si;
12488 goto gather_gen;
12489 case IX86_BUILTIN_GATHER3SIV8DI:
12490 icode = CODE_FOR_avx512f_gathersiv8di;
12491 goto gather_gen;
12492 case IX86_BUILTIN_GATHER3DIV16SI:
12493 icode = CODE_FOR_avx512f_gatherdiv16si;
12494 goto gather_gen;
12495 case IX86_BUILTIN_GATHER3DIV8DI:
12496 icode = CODE_FOR_avx512f_gatherdiv8di;
12497 goto gather_gen;
12498 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12499 icode = CODE_FOR_avx512f_gathersiv8df;
12500 goto gather_gen;
12501 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12502 icode = CODE_FOR_avx512f_gatherdiv16sf;
12503 goto gather_gen;
12504 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12505 icode = CODE_FOR_avx512f_gathersiv8di;
12506 goto gather_gen;
12507 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12508 icode = CODE_FOR_avx512f_gatherdiv16si;
12509 goto gather_gen;
12510 case IX86_BUILTIN_GATHER3SIV2DF:
12511 icode = CODE_FOR_avx512vl_gathersiv2df;
12512 goto gather_gen;
12513 case IX86_BUILTIN_GATHER3SIV4DF:
12514 icode = CODE_FOR_avx512vl_gathersiv4df;
12515 goto gather_gen;
12516 case IX86_BUILTIN_GATHER3DIV2DF:
12517 icode = CODE_FOR_avx512vl_gatherdiv2df;
12518 goto gather_gen;
12519 case IX86_BUILTIN_GATHER3DIV4DF:
12520 icode = CODE_FOR_avx512vl_gatherdiv4df;
12521 goto gather_gen;
12522 case IX86_BUILTIN_GATHER3SIV4SF:
12523 icode = CODE_FOR_avx512vl_gathersiv4sf;
12524 goto gather_gen;
12525 case IX86_BUILTIN_GATHER3SIV8SF:
12526 icode = CODE_FOR_avx512vl_gathersiv8sf;
12527 goto gather_gen;
12528 case IX86_BUILTIN_GATHER3DIV4SF:
12529 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12530 goto gather_gen;
12531 case IX86_BUILTIN_GATHER3DIV8SF:
12532 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12533 goto gather_gen;
12534 case IX86_BUILTIN_GATHER3SIV2DI:
12535 icode = CODE_FOR_avx512vl_gathersiv2di;
12536 goto gather_gen;
12537 case IX86_BUILTIN_GATHER3SIV4DI:
12538 icode = CODE_FOR_avx512vl_gathersiv4di;
12539 goto gather_gen;
12540 case IX86_BUILTIN_GATHER3DIV2DI:
12541 icode = CODE_FOR_avx512vl_gatherdiv2di;
12542 goto gather_gen;
12543 case IX86_BUILTIN_GATHER3DIV4DI:
12544 icode = CODE_FOR_avx512vl_gatherdiv4di;
12545 goto gather_gen;
12546 case IX86_BUILTIN_GATHER3SIV4SI:
12547 icode = CODE_FOR_avx512vl_gathersiv4si;
12548 goto gather_gen;
12549 case IX86_BUILTIN_GATHER3SIV8SI:
12550 icode = CODE_FOR_avx512vl_gathersiv8si;
12551 goto gather_gen;
12552 case IX86_BUILTIN_GATHER3DIV4SI:
12553 icode = CODE_FOR_avx512vl_gatherdiv4si;
12554 goto gather_gen;
12555 case IX86_BUILTIN_GATHER3DIV8SI:
12556 icode = CODE_FOR_avx512vl_gatherdiv8si;
12557 goto gather_gen;
12558 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12559 icode = CODE_FOR_avx512vl_gathersiv4df;
12560 goto gather_gen;
12561 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12562 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12563 goto gather_gen;
12564 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12565 icode = CODE_FOR_avx512vl_gathersiv4di;
12566 goto gather_gen;
12567 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12568 icode = CODE_FOR_avx512vl_gatherdiv8si;
12569 goto gather_gen;
12570 case IX86_BUILTIN_SCATTERSIV16SF:
12571 icode = CODE_FOR_avx512f_scattersiv16sf;
12572 goto scatter_gen;
12573 case IX86_BUILTIN_SCATTERSIV8DF:
12574 icode = CODE_FOR_avx512f_scattersiv8df;
12575 goto scatter_gen;
12576 case IX86_BUILTIN_SCATTERDIV16SF:
12577 icode = CODE_FOR_avx512f_scatterdiv16sf;
12578 goto scatter_gen;
12579 case IX86_BUILTIN_SCATTERDIV8DF:
12580 icode = CODE_FOR_avx512f_scatterdiv8df;
12581 goto scatter_gen;
12582 case IX86_BUILTIN_SCATTERSIV16SI:
12583 icode = CODE_FOR_avx512f_scattersiv16si;
12584 goto scatter_gen;
12585 case IX86_BUILTIN_SCATTERSIV8DI:
12586 icode = CODE_FOR_avx512f_scattersiv8di;
12587 goto scatter_gen;
12588 case IX86_BUILTIN_SCATTERDIV16SI:
12589 icode = CODE_FOR_avx512f_scatterdiv16si;
12590 goto scatter_gen;
12591 case IX86_BUILTIN_SCATTERDIV8DI:
12592 icode = CODE_FOR_avx512f_scatterdiv8di;
12593 goto scatter_gen;
12594 case IX86_BUILTIN_SCATTERSIV8SF:
12595 icode = CODE_FOR_avx512vl_scattersiv8sf;
12596 goto scatter_gen;
12597 case IX86_BUILTIN_SCATTERSIV4SF:
12598 icode = CODE_FOR_avx512vl_scattersiv4sf;
12599 goto scatter_gen;
12600 case IX86_BUILTIN_SCATTERSIV4DF:
12601 icode = CODE_FOR_avx512vl_scattersiv4df;
12602 goto scatter_gen;
12603 case IX86_BUILTIN_SCATTERSIV2DF:
12604 icode = CODE_FOR_avx512vl_scattersiv2df;
12605 goto scatter_gen;
12606 case IX86_BUILTIN_SCATTERDIV8SF:
12607 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12608 goto scatter_gen;
12609 case IX86_BUILTIN_SCATTERDIV4SF:
12610 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12611 goto scatter_gen;
12612 case IX86_BUILTIN_SCATTERDIV4DF:
12613 icode = CODE_FOR_avx512vl_scatterdiv4df;
12614 goto scatter_gen;
12615 case IX86_BUILTIN_SCATTERDIV2DF:
12616 icode = CODE_FOR_avx512vl_scatterdiv2df;
12617 goto scatter_gen;
12618 case IX86_BUILTIN_SCATTERSIV8SI:
12619 icode = CODE_FOR_avx512vl_scattersiv8si;
12620 goto scatter_gen;
12621 case IX86_BUILTIN_SCATTERSIV4SI:
12622 icode = CODE_FOR_avx512vl_scattersiv4si;
12623 goto scatter_gen;
12624 case IX86_BUILTIN_SCATTERSIV4DI:
12625 icode = CODE_FOR_avx512vl_scattersiv4di;
12626 goto scatter_gen;
12627 case IX86_BUILTIN_SCATTERSIV2DI:
12628 icode = CODE_FOR_avx512vl_scattersiv2di;
12629 goto scatter_gen;
12630 case IX86_BUILTIN_SCATTERDIV8SI:
12631 icode = CODE_FOR_avx512vl_scatterdiv8si;
12632 goto scatter_gen;
12633 case IX86_BUILTIN_SCATTERDIV4SI:
12634 icode = CODE_FOR_avx512vl_scatterdiv4si;
12635 goto scatter_gen;
12636 case IX86_BUILTIN_SCATTERDIV4DI:
12637 icode = CODE_FOR_avx512vl_scatterdiv4di;
12638 goto scatter_gen;
12639 case IX86_BUILTIN_SCATTERDIV2DI:
12640 icode = CODE_FOR_avx512vl_scatterdiv2di;
12641 goto scatter_gen;
12642 case IX86_BUILTIN_GATHERPFDPD:
12643 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12644 goto vec_prefetch_gen;
12645 case IX86_BUILTIN_SCATTERALTSIV8DF:
12646 icode = CODE_FOR_avx512f_scattersiv8df;
12647 goto scatter_gen;
12648 case IX86_BUILTIN_SCATTERALTDIV16SF:
12649 icode = CODE_FOR_avx512f_scatterdiv16sf;
12650 goto scatter_gen;
12651 case IX86_BUILTIN_SCATTERALTSIV8DI:
12652 icode = CODE_FOR_avx512f_scattersiv8di;
12653 goto scatter_gen;
12654 case IX86_BUILTIN_SCATTERALTDIV16SI:
12655 icode = CODE_FOR_avx512f_scatterdiv16si;
12656 goto scatter_gen;
12657 case IX86_BUILTIN_SCATTERALTSIV4DF:
12658 icode = CODE_FOR_avx512vl_scattersiv4df;
12659 goto scatter_gen;
12660 case IX86_BUILTIN_SCATTERALTDIV8SF:
12661 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12662 goto scatter_gen;
12663 case IX86_BUILTIN_SCATTERALTSIV4DI:
12664 icode = CODE_FOR_avx512vl_scattersiv4di;
12665 goto scatter_gen;
12666 case IX86_BUILTIN_SCATTERALTDIV8SI:
12667 icode = CODE_FOR_avx512vl_scatterdiv8si;
12668 goto scatter_gen;
12669 case IX86_BUILTIN_SCATTERALTSIV2DF:
12670 icode = CODE_FOR_avx512vl_scattersiv2df;
12671 goto scatter_gen;
12672 case IX86_BUILTIN_SCATTERALTDIV4SF:
12673 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12674 goto scatter_gen;
12675 case IX86_BUILTIN_SCATTERALTSIV2DI:
12676 icode = CODE_FOR_avx512vl_scattersiv2di;
12677 goto scatter_gen;
12678 case IX86_BUILTIN_SCATTERALTDIV4SI:
12679 icode = CODE_FOR_avx512vl_scatterdiv4si;
12680 goto scatter_gen;
12681 case IX86_BUILTIN_GATHERPFDPS:
12682 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12683 goto vec_prefetch_gen;
12684 case IX86_BUILTIN_GATHERPFQPD:
12685 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12686 goto vec_prefetch_gen;
12687 case IX86_BUILTIN_GATHERPFQPS:
12688 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12689 goto vec_prefetch_gen;
12690 case IX86_BUILTIN_SCATTERPFDPD:
12691 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12692 goto vec_prefetch_gen;
12693 case IX86_BUILTIN_SCATTERPFDPS:
12694 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12695 goto vec_prefetch_gen;
12696 case IX86_BUILTIN_SCATTERPFQPD:
12697 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12698 goto vec_prefetch_gen;
12699 case IX86_BUILTIN_SCATTERPFQPS:
12700 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12701 goto vec_prefetch_gen;
12702
12703 gather_gen:
12704 rtx half;
12705 rtx (*gen) (rtx, rtx);
12706
12707 arg0 = CALL_EXPR_ARG (exp, 0);
12708 arg1 = CALL_EXPR_ARG (exp, 1);
12709 arg2 = CALL_EXPR_ARG (exp, 2);
12710 arg3 = CALL_EXPR_ARG (exp, 3);
12711 arg4 = CALL_EXPR_ARG (exp, 4);
12712 op0 = expand_normal (arg0);
12713 op1 = expand_normal (arg1);
12714 op2 = expand_normal (arg2);
12715 op3 = expand_normal (arg3);
12716 op4 = expand_normal (arg4);
12717 /* Note the arg order is different from the operand order. */
12718 mode0 = insn_data[icode].operand[1].mode;
12719 mode2 = insn_data[icode].operand[3].mode;
12720 mode3 = insn_data[icode].operand[4].mode;
12721 mode4 = insn_data[icode].operand[5].mode;
12722
12723 if (target == NULL_RTX
12724 || GET_MODE (target) != insn_data[icode].operand[0].mode
12725 || !insn_data[icode].operand[0].predicate (target,
12726 GET_MODE (target)))
12727 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12728 else
12729 subtarget = target;
12730
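/* The *ALT* gather variants pair an index vector and a data vector
with different element counts; extract the low half of the wider
operand (and narrow the mask where needed) so the operands match what
the underlying gather pattern expects. */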
12731 switch (fcode)
12732 {
12733 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12734 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12735 half = gen_reg_rtx (V8SImode);
12736 if (!nonimmediate_operand (op2, V16SImode))
12737 op2 = copy_to_mode_reg (V16SImode, op2);
12738 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12739 op2 = half;
12740 break;
12741 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12742 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12743 case IX86_BUILTIN_GATHERALTSIV4DF:
12744 case IX86_BUILTIN_GATHERALTSIV4DI:
12745 half = gen_reg_rtx (V4SImode);
12746 if (!nonimmediate_operand (op2, V8SImode))
12747 op2 = copy_to_mode_reg (V8SImode, op2);
12748 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12749 op2 = half;
12750 break;
12751 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12752 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12753 half = gen_reg_rtx (mode0);
12754 if (mode0 == V8SFmode)
12755 gen = gen_vec_extract_lo_v16sf;
12756 else
12757 gen = gen_vec_extract_lo_v16si;
12758 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12759 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12760 emit_insn (gen (half, op0));
12761 op0 = half;
12762 op3 = lowpart_subreg (QImode, op3, HImode);
12763 break;
12764 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12765 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12766 case IX86_BUILTIN_GATHERALTDIV8SF:
12767 case IX86_BUILTIN_GATHERALTDIV8SI:
12768 half = gen_reg_rtx (mode0);
12769 if (mode0 == V4SFmode)
12770 gen = gen_vec_extract_lo_v8sf;
12771 else
12772 gen = gen_vec_extract_lo_v8si;
12773 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12774 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12775 emit_insn (gen (half, op0));
12776 op0 = half;
12777 if (VECTOR_MODE_P (GET_MODE (op3)))
12778 {
12779 half = gen_reg_rtx (mode0);
12780 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12781 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12782 emit_insn (gen (half, op3));
12783 op3 = half;
12784 }
12785 break;
12786 default:
12787 break;
12788 }
12789
12790 /* Force the memory operand to use only a base register here; we
12791 don't want to do that for the memory operands of other builtin
12792 functions. */
12793 op1 = ix86_zero_extend_to_Pmode (op1);
12794
12795 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12796 op0 = copy_to_mode_reg (mode0, op0);
12797 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12798 op1 = copy_to_mode_reg (Pmode, op1);
12799 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12800 op2 = copy_to_mode_reg (mode2, op2);
12801
12802 op3 = fixup_modeless_constant (op3, mode3);
12803
12804 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12805 {
12806 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12807 op3 = copy_to_mode_reg (mode3, op3);
12808 }
12809 else
12810 {
12811 op3 = copy_to_reg (op3);
12812 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12813 }
12814 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12815 {
12816 error ("the last argument must be scale 1, 2, 4, 8");
12817 return const0_rtx;
12818 }
12819
12820 /* Optimize. If mask is known to have all high bits set,
12821 replace op0 with pc_rtx to signal that the instruction
12822 overwrites the whole destination and doesn't use its
12823 previous contents. */
12824 if (optimize)
12825 {
12826 if (TREE_CODE (arg3) == INTEGER_CST)
12827 {
12828 if (integer_all_onesp (arg3))
12829 op0 = pc_rtx;
12830 }
12831 else if (TREE_CODE (arg3) == VECTOR_CST)
12832 {
12833 unsigned int negative = 0;
12834 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12835 {
12836 tree cst = VECTOR_CST_ELT (arg3, i);
12837 if (TREE_CODE (cst) == INTEGER_CST
12838 && tree_int_cst_sign_bit (cst))
12839 negative++;
12840 else if (TREE_CODE (cst) == REAL_CST
12841 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12842 negative++;
12843 }
12844 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12845 op0 = pc_rtx;
12846 }
12847 else if (TREE_CODE (arg3) == SSA_NAME
12848 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12849 {
12850 /* Also recognize when the mask is like:
12851 __v2df src = _mm_setzero_pd ();
12852 __v2df mask = _mm_cmpeq_pd (src, src);
12853 or
12854 __v8sf src = _mm256_setzero_ps ();
12855 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12856 as that is a cheaper way to load all ones into
12857 a register than having to load a constant from
12858 memory. */
12859 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12860 if (is_gimple_call (def_stmt))
12861 {
12862 tree fndecl = gimple_call_fndecl (def_stmt);
12863 if (fndecl
12864 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12865 switch (DECL_MD_FUNCTION_CODE (fndecl))
12866 {
12867 case IX86_BUILTIN_CMPPD:
12868 case IX86_BUILTIN_CMPPS:
12869 case IX86_BUILTIN_CMPPD256:
12870 case IX86_BUILTIN_CMPPS256:
12871 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12872 break;
12873 /* FALLTHRU */
12874 case IX86_BUILTIN_CMPEQPD:
12875 case IX86_BUILTIN_CMPEQPS:
12876 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12877 && initializer_zerop (gimple_call_arg (def_stmt,
12878 1)))
12879 op0 = pc_rtx;
12880 break;
12881 default:
12882 break;
12883 }
12884 }
12885 }
12886 }
12887
12888 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12889 if (! pat)
12890 return const0_rtx;
12891 emit_insn (pat);
12892
12893 switch (fcode)
12894 {
12895 case IX86_BUILTIN_GATHER3DIV16SF:
12896 if (target == NULL_RTX)
12897 target = gen_reg_rtx (V8SFmode);
12898 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12899 break;
12900 case IX86_BUILTIN_GATHER3DIV16SI:
12901 if (target == NULL_RTX)
12902 target = gen_reg_rtx (V8SImode);
12903 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12904 break;
12905 case IX86_BUILTIN_GATHER3DIV8SF:
12906 case IX86_BUILTIN_GATHERDIV8SF:
12907 if (target == NULL_RTX)
12908 target = gen_reg_rtx (V4SFmode);
12909 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12910 break;
12911 case IX86_BUILTIN_GATHER3DIV8SI:
12912 case IX86_BUILTIN_GATHERDIV8SI:
12913 if (target == NULL_RTX)
12914 target = gen_reg_rtx (V4SImode);
12915 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12916 break;
12917 default:
12918 target = subtarget;
12919 break;
12920 }
12921 return target;
12922
12923 scatter_gen:
12924 arg0 = CALL_EXPR_ARG (exp, 0);
12925 arg1 = CALL_EXPR_ARG (exp, 1);
12926 arg2 = CALL_EXPR_ARG (exp, 2);
12927 arg3 = CALL_EXPR_ARG (exp, 3);
12928 arg4 = CALL_EXPR_ARG (exp, 4);
12929 op0 = expand_normal (arg0);
12930 op1 = expand_normal (arg1);
12931 op2 = expand_normal (arg2);
12932 op3 = expand_normal (arg3);
12933 op4 = expand_normal (arg4);
12934 mode1 = insn_data[icode].operand[1].mode;
12935 mode2 = insn_data[icode].operand[2].mode;
12936 mode3 = insn_data[icode].operand[3].mode;
12937 mode4 = insn_data[icode].operand[4].mode;
12938
12939 /* Scatter instruction stores operand op3 to memory with
12940 indices from op2 and scale from op4 under writemask op1.
12941 If index operand op2 has more elements than source operand
12942 op3, we need to use only its low half, and vice versa. */
12943 switch (fcode)
12944 {
12945 case IX86_BUILTIN_SCATTERALTSIV8DF:
12946 case IX86_BUILTIN_SCATTERALTSIV8DI:
12947 half = gen_reg_rtx (V8SImode);
12948 if (!nonimmediate_operand (op2, V16SImode))
12949 op2 = copy_to_mode_reg (V16SImode, op2);
12950 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12951 op2 = half;
12952 break;
12953 case IX86_BUILTIN_SCATTERALTDIV16SF:
12954 case IX86_BUILTIN_SCATTERALTDIV16SI:
12955 half = gen_reg_rtx (mode3);
12956 if (mode3 == V8SFmode)
12957 gen = gen_vec_extract_lo_v16sf;
12958 else
12959 gen = gen_vec_extract_lo_v16si;
12960 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12961 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12962 emit_insn (gen (half, op3));
12963 op3 = half;
12964 break;
12965 case IX86_BUILTIN_SCATTERALTSIV4DF:
12966 case IX86_BUILTIN_SCATTERALTSIV4DI:
12967 half = gen_reg_rtx (V4SImode);
12968 if (!nonimmediate_operand (op2, V8SImode))
12969 op2 = copy_to_mode_reg (V8SImode, op2);
12970 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12971 op2 = half;
12972 break;
12973 case IX86_BUILTIN_SCATTERALTDIV8SF:
12974 case IX86_BUILTIN_SCATTERALTDIV8SI:
12975 half = gen_reg_rtx (mode3);
12976 if (mode3 == V4SFmode)
12977 gen = gen_vec_extract_lo_v8sf;
12978 else
12979 gen = gen_vec_extract_lo_v8si;
12980 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12981 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12982 emit_insn (gen (half, op3));
12983 op3 = half;
12984 break;
12985 case IX86_BUILTIN_SCATTERALTSIV2DF:
12986 case IX86_BUILTIN_SCATTERALTSIV2DI:
12987 if (!nonimmediate_operand (op2, V4SImode))
12988 op2 = copy_to_mode_reg (V4SImode, op2);
12989 break;
12990 case IX86_BUILTIN_SCATTERALTDIV4SF:
12991 case IX86_BUILTIN_SCATTERALTDIV4SI:
12992 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12993 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12994 break;
12995 default:
12996 break;
12997 }
12998
12999 /* Force the memory operand to use only a base register here; we
13000 don't want to do that for the memory operands of other builtin
13001 functions. */
13002 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
13003
13004 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13005 op0 = copy_to_mode_reg (Pmode, op0);
13006
13007 op1 = fixup_modeless_constant (op1, mode1);
13008
13009 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
13010 {
13011 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13012 op1 = copy_to_mode_reg (mode1, op1);
13013 }
13014 else
13015 {
13016 op1 = copy_to_reg (op1);
13017 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
13018 }
13019
13020 if (!insn_data[icode].operand[2].predicate (op2, mode2))
13021 op2 = copy_to_mode_reg (mode2, op2);
13022
13023 if (!insn_data[icode].operand[3].predicate (op3, mode3))
13024 op3 = copy_to_mode_reg (mode3, op3);
13025
13026 if (!insn_data[icode].operand[4].predicate (op4, mode4))
13027 {
13028 error ("the last argument must be scale 1, 2, 4, 8");
13029 return const0_rtx;
13030 }
13031
13032 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13033 if (! pat)
13034 return const0_rtx;
13035
13036 emit_insn (pat);
13037 return 0;
13038
13039 vec_prefetch_gen:
13040 arg0 = CALL_EXPR_ARG (exp, 0);
13041 arg1 = CALL_EXPR_ARG (exp, 1);
13042 arg2 = CALL_EXPR_ARG (exp, 2);
13043 arg3 = CALL_EXPR_ARG (exp, 3);
13044 arg4 = CALL_EXPR_ARG (exp, 4);
13045 op0 = expand_normal (arg0);
13046 op1 = expand_normal (arg1);
13047 op2 = expand_normal (arg2);
13048 op3 = expand_normal (arg3);
13049 op4 = expand_normal (arg4);
13050 mode0 = insn_data[icode].operand[0].mode;
13051 mode1 = insn_data[icode].operand[1].mode;
13052 mode3 = insn_data[icode].operand[3].mode;
13053 mode4 = insn_data[icode].operand[4].mode;
13054
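/* For the gather/scatter prefetch builtins op0 is the mask, op1 the
index vector, op2 the base address, op3 the scale and op4 the
locality hint. */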
13055 op0 = fixup_modeless_constant (op0, mode0);
13056
13057 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
13058 {
13059 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13060 op0 = copy_to_mode_reg (mode0, op0);
13061 }
13062 else
13063 {
13064 op0 = copy_to_reg (op0);
13065 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
13066 }
13067
13068 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13069 op1 = copy_to_mode_reg (mode1, op1);
13070
13071 /* Force the memory operand to use only a base register here; we
13072 don't want to do that for the memory operands of other builtin
13073 functions. */
13074 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
13075
13076 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
13077 op2 = copy_to_mode_reg (Pmode, op2);
13078
13079 if (!insn_data[icode].operand[3].predicate (op3, mode3))
13080 {
13081 error ("the fourth argument must be scale 1, 2, 4, 8");
13082 return const0_rtx;
13083 }
13084
13085 if (!insn_data[icode].operand[4].predicate (op4, mode4))
13086 {
13087 error ("incorrect hint operand");
13088 return const0_rtx;
13089 }
13090
13091 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13092 if (! pat)
13093 return const0_rtx;
13094
13095 emit_insn (pat);
13096
13097 return 0;
13098
13099 case IX86_BUILTIN_XABORT:
13100 icode = CODE_FOR_xabort;
13101 arg0 = CALL_EXPR_ARG (exp, 0);
13102 op0 = expand_normal (arg0);
13103 mode0 = insn_data[icode].operand[0].mode;
13104 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13105 {
13106 error ("the argument to %<xabort%> intrinsic must "
13107 "be an 8-bit immediate");
13108 return const0_rtx;
13109 }
13110 emit_insn (gen_xabort (op0));
13111 return 0;
13112
13113 case IX86_BUILTIN_RDSSPD:
13114 case IX86_BUILTIN_RDSSPQ:
13115 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
13116
13117 if (target == 0
13118 || !register_operand (target, mode))
13119 target = gen_reg_rtx (mode);
13120
13121 op0 = force_reg (mode, const0_rtx);
13122
13123 emit_insn (gen_rdssp (mode, target, op0));
13124 return target;
13125
13126 case IX86_BUILTIN_INCSSPD:
13127 case IX86_BUILTIN_INCSSPQ:
13128 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
13129
13130 arg0 = CALL_EXPR_ARG (exp, 0);
13131 op0 = expand_normal (arg0);
13132
13133 op0 = force_reg (mode, op0);
13134
13135 emit_insn (gen_incssp (mode, op0));
13136 return 0;
13137
13138 case IX86_BUILTIN_HRESET:
13139 icode = CODE_FOR_hreset;
13140 arg0 = CALL_EXPR_ARG (exp, 0);
13141 op0 = expand_normal (arg0);
13142 op0 = force_reg (SImode, op0);
13143 emit_insn (gen_hreset (op0));
13144 return 0;
13145
13146 case IX86_BUILTIN_RSTORSSP:
13147 case IX86_BUILTIN_CLRSSBSY:
13148 arg0 = CALL_EXPR_ARG (exp, 0);
13149 op0 = expand_normal (arg0);
13150 icode = (fcode == IX86_BUILTIN_RSTORSSP
13151 ? CODE_FOR_rstorssp
13152 : CODE_FOR_clrssbsy);
13153
13154 if (!address_operand (op0, VOIDmode))
13155 {
13156 op0 = convert_memory_address (Pmode, op0);
13157 op0 = copy_addr_to_reg (op0);
13158 }
13159 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
13160 return 0;
13161
13162 case IX86_BUILTIN_WRSSD:
13163 case IX86_BUILTIN_WRSSQ:
13164 case IX86_BUILTIN_WRUSSD:
13165 case IX86_BUILTIN_WRUSSQ:
13166 mode = ((fcode == IX86_BUILTIN_WRSSD
13167 || fcode == IX86_BUILTIN_WRUSSD)
13168 ? SImode : DImode);
13169
13170 arg0 = CALL_EXPR_ARG (exp, 0);
13171 op0 = expand_normal (arg0);
13172 arg1 = CALL_EXPR_ARG (exp, 1);
13173 op1 = expand_normal (arg1);
13174
13175 op0 = force_reg (mode, op0);
13176
13177 if (!address_operand (op1, VOIDmode))
13178 {
13179 op1 = convert_memory_address (Pmode, op1);
13180 op1 = copy_addr_to_reg (op1);
13181 }
13182 op1 = gen_rtx_MEM (mode, op1);
13183
13184 icode = ((fcode == IX86_BUILTIN_WRSSD
13185 || fcode == IX86_BUILTIN_WRSSQ)
13186 ? code_for_wrss (mode)
13187 : code_for_wruss (mode));
13188 emit_insn (GEN_FCN (icode) (op0, op1));
13189
13190 return 0;
13191
13192 default:
13193 break;
13194 }
13195
13196 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
13197 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
13198 {
13199 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
13200 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
13201 target);
13202 }
13203
13204 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
13205 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
13206 {
13207 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
13208 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
13209 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
13210 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
13211 int masked = 1;
13212 machine_mode mode, wide_mode, nar_mode;
13213
13214 nar_mode = V4SFmode;
13215 mode = V16SFmode;
13216 wide_mode = V64SFmode;
13217 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
13218 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
13219
13220 switch (fcode)
13221 {
13222 case IX86_BUILTIN_4FMAPS:
13223 fcn = gen_avx5124fmaddps_4fmaddps;
13224 masked = 0;
13225 goto v4fma_expand;
13226
13227 case IX86_BUILTIN_4DPWSSD:
13228 nar_mode = V4SImode;
13229 mode = V16SImode;
13230 wide_mode = V64SImode;
13231 fcn = gen_avx5124vnniw_vp4dpwssd;
13232 masked = 0;
13233 goto v4fma_expand;
13234
13235 case IX86_BUILTIN_4DPWSSDS:
13236 nar_mode = V4SImode;
13237 mode = V16SImode;
13238 wide_mode = V64SImode;
13239 fcn = gen_avx5124vnniw_vp4dpwssds;
13240 masked = 0;
13241 goto v4fma_expand;
13242
13243 case IX86_BUILTIN_4FNMAPS:
13244 fcn = gen_avx5124fmaddps_4fnmaddps;
13245 masked = 0;
13246 goto v4fma_expand;
13247
13248 case IX86_BUILTIN_4FNMAPS_MASK:
13249 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
13250 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
13251 goto v4fma_expand;
13252
13253 case IX86_BUILTIN_4DPWSSD_MASK:
13254 nar_mode = V4SImode;
13255 mode = V16SImode;
13256 wide_mode = V64SImode;
13257 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
13258 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
13259 goto v4fma_expand;
13260
13261 case IX86_BUILTIN_4DPWSSDS_MASK:
13262 nar_mode = V4SImode;
13263 mode = V16SImode;
13264 wide_mode = V64SImode;
13265 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
13266 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
13267 goto v4fma_expand;
13268
13269 case IX86_BUILTIN_4FMAPS_MASK:
13270 {
13271 tree args[4];
13272 rtx ops[4];
13273 rtx wide_reg;
13274 rtx accum;
13275 rtx addr;
13276 rtx mem;
13277
13278 v4fma_expand:
13279 wide_reg = gen_reg_rtx (wide_mode);
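/* Copy the four 512-bit source operands into consecutive 64-byte
   chunks of WIDE_REG. */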
13280 for (i = 0; i < 4; i++)
13281 {
13282 args[i] = CALL_EXPR_ARG (exp, i);
13283 ops[i] = expand_normal (args[i]);
13284
13285 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
13286 ops[i]);
13287 }
13288
13289 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13290 accum = force_reg (mode, accum);
13291
13292 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13293 addr = force_reg (Pmode, addr);
13294
13295 mem = gen_rtx_MEM (nar_mode, addr);
13296
13297 target = gen_reg_rtx (mode);
13298
13299 emit_move_insn (target, accum);
13300
13301 if (! masked)
13302 emit_insn (fcn (target, accum, wide_reg, mem));
13303 else
13304 {
13305 rtx merge, mask;
13306 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13307
13308 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13309
13310 if (CONST_INT_P (mask))
13311 mask = fixup_modeless_constant (mask, HImode);
13312
13313 mask = force_reg (HImode, mask);
13314
13315 if (GET_MODE (mask) != HImode)
13316 mask = gen_rtx_SUBREG (HImode, mask, 0);
13317
13318 /* If merge is 0 then we're about to emit the z-masked variant. */
13319 if (const0_operand (merge, mode))
13320 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13321 /* If merge is the same as accum then emit the merge-masked variant. */
13322 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13323 {
13324 merge = force_reg (mode, merge);
13325 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13326 }
13327 /* Merging with an unknown value can happen if we z-mask at -O0. */
13328 else
13329 {
13330 target = gen_reg_rtx (mode);
13331 emit_move_insn (target, merge);
13332 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13333 }
13334 }
13335 return target;
13336 }
13337
13338 case IX86_BUILTIN_4FNMASS:
13339 fcn = gen_avx5124fmaddps_4fnmaddss;
13340 masked = 0;
13341 goto s4fma_expand;
13342
13343 case IX86_BUILTIN_4FMASS:
13344 fcn = gen_avx5124fmaddps_4fmaddss;
13345 masked = 0;
13346 goto s4fma_expand;
13347
13348 case IX86_BUILTIN_4FNMASS_MASK:
13349 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
13350 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13351 goto s4fma_expand;
13352
13353 case IX86_BUILTIN_4FMASS_MASK:
13354 {
13355 tree args[4];
13356 rtx ops[4];
13357 rtx wide_reg;
13358 rtx accum;
13359 rtx addr;
13360 rtx mem;
13361
13362 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13363 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13364
13365 s4fma_expand:
13366 mode = V4SFmode;
13367 wide_reg = gen_reg_rtx (V64SFmode);
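/* Put the low SFmode element of each source operand at the start of
   consecutive 64-byte chunks of WIDE_REG. */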
13368 for (i = 0; i < 4; i++)
13369 {
13370 rtx tmp;
13371 args[i] = CALL_EXPR_ARG (exp, i);
13372 ops[i] = expand_normal (args[i]);
13373
13374 tmp = gen_reg_rtx (SFmode);
13375 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13376
13377 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13378 gen_rtx_SUBREG (V16SFmode, tmp, 0));
13379 }
13380
13381 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13382 accum = force_reg (V4SFmode, accum);
13383
13384 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13385 addr = force_reg (Pmode, addr);
13386
13387 mem = gen_rtx_MEM (V4SFmode, addr);
13388
13389 target = gen_reg_rtx (V4SFmode);
13390
13391 emit_move_insn (target, accum);
13392
13393 if (! masked)
13394 emit_insn (fcn (target, accum, wide_reg, mem));
13395 else
13396 {
13397 rtx merge, mask;
13398 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13399
13400 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13401
13402 if (CONST_INT_P (mask))
13403 mask = fixup_modeless_constant (mask, QImode);
13404
13405 mask = force_reg (QImode, mask);
13406
13407 if (GET_MODE (mask) != QImode)
13408 mask = gen_rtx_SUBREG (QImode, mask, 0);
13409
13410 /* If merge is 0 then we're about to emit the z-masked variant. */
13411 if (const0_operand (merge, mode))
13412 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13413 /* If merge is the same as accum then emit the merge-masked
13414 variant. */
13415 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13416 {
13417 merge = force_reg (mode, merge);
13418 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13419 }
13420 /* Merging with an unknown value can happen if we z-mask
13421 at -O0. */
13422 else
13423 {
13424 target = gen_reg_rtx (mode);
13425 emit_move_insn (target, merge);
13426 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13427 }
13428 }
13429 return target;
13430 }
13431 case IX86_BUILTIN_RDPID:
13432 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13433 target);
13434 case IX86_BUILTIN_FABSQ:
13435 case IX86_BUILTIN_COPYSIGNQ:
13436 if (!TARGET_SSE)
13437 /* Emit a normal call if SSE isn't available. */
13438 return expand_call (exp, target, ignore);
13439 /* FALLTHRU */
13440 default:
13441 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13442 }
13443 }
13444
13445 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13446 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13447 {
13448 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13449 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13450 }
13451
13452 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13453 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13454 {
13455 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13456 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13457 }
13458
13459 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13460 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13461 {
13462 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13463 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13464 }
13465
13466 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13467 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13468 {
13469 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13470 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13471 }
13472
13473 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13474 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13475 {
13476 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13477 const struct builtin_description *d = bdesc_multi_arg + i;
13478 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13479 (enum ix86_builtin_func_type)
13480 d->flag, d->comparison);
13481 }
13482
13483 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13484 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13485 {
13486 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13487 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13488 target);
13489 }
13490
13491 gcc_unreachable ();
13492 }
13493
13494 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13495 fill target with val via vec_duplicate. */
13496
13497 static bool
13498 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13499 {
13500 bool ok;
13501 rtx_insn *insn;
13502 rtx dup;
13503
13504 /* First attempt to recognize VAL as-is. */
13505 dup = gen_vec_duplicate (mode, val);
13506 insn = emit_insn (gen_rtx_SET (target, dup));
13507 if (recog_memoized (insn) < 0)
13508 {
13509 rtx_insn *seq;
13510 machine_mode innermode = GET_MODE_INNER (mode);
13511 rtx reg;
13512
13513 /* If that fails, force VAL into a register. */
13514
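/* Rewrite the source of the insn emitted above to duplicate the
   register instead, emit the setup sequence just before it, and
   re-recognize. */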
13515 start_sequence ();
13516 reg = force_reg (innermode, val);
13517 if (GET_MODE (reg) != innermode)
13518 reg = gen_lowpart (innermode, reg);
13519 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13520 seq = get_insns ();
13521 end_sequence ();
13522 if (seq)
13523 emit_insn_before (seq, insn);
13524
13525 ok = recog_memoized (insn) >= 0;
13526 gcc_assert (ok);
13527 }
13528 return true;
13529 }
13530
13531 /* Get a vector mode of the same size as the original but with elements
13532 twice as wide. This is only guaranteed to apply to integral vectors. */
13533
13534 static machine_mode
13535 get_mode_wider_vector (machine_mode o)
13536 {
13537 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13538 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13539 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13540 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13541 return n;
13542 }
13543
13544 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13545 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13546
13547 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13548 with all elements equal to VAR. Return true if successful. */
13549
13550 static bool
13551 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13552 rtx target, rtx val)
13553 {
13554 bool ok;
13555
13556 switch (mode)
13557 {
13558 case E_V2SImode:
13559 case E_V2SFmode:
13560 if (!mmx_ok)
13561 return false;
13562 /* FALLTHRU */
13563
13564 case E_V4DFmode:
13565 case E_V4DImode:
13566 case E_V8SFmode:
13567 case E_V8SImode:
13568 case E_V2DFmode:
13569 case E_V2DImode:
13570 case E_V4SFmode:
13571 case E_V4SImode:
13572 case E_V16SImode:
13573 case E_V8DImode:
13574 case E_V16SFmode:
13575 case E_V8DFmode:
13576 return ix86_vector_duplicate_value (mode, target, val);
13577
13578 case E_V4HImode:
13579 if (!mmx_ok)
13580 return false;
13581 if (TARGET_SSE || TARGET_3DNOW_A)
13582 {
13583 rtx x;
13584
13585 val = gen_lowpart (SImode, val);
13586 x = gen_rtx_TRUNCATE (HImode, val);
13587 x = gen_rtx_VEC_DUPLICATE (mode, x);
13588 emit_insn (gen_rtx_SET (target, x));
13589 return true;
13590 }
13591 goto widen;
13592
13593 case E_V8QImode:
13594 if (!mmx_ok)
13595 return false;
13596 goto widen;
13597
13598 case E_V8HImode:
13599 if (TARGET_AVX2)
13600 return ix86_vector_duplicate_value (mode, target, val);
13601
13602 if (TARGET_SSE2)
13603 {
13604 struct expand_vec_perm_d dperm;
13605 rtx tmp1, tmp2;
13606
13607 permute:
13608 memset (&dperm, 0, sizeof (dperm));
13609 dperm.target = target;
13610 dperm.vmode = mode;
13611 dperm.nelt = GET_MODE_NUNITS (mode);
13612 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13613 dperm.one_operand_p = true;
13614
13615 /* Extend to SImode using a paradoxical SUBREG. */
13616 tmp1 = gen_reg_rtx (SImode);
13617 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13618
13619 /* Insert the SImode value as low element of a V4SImode vector. */
13620 tmp2 = gen_reg_rtx (V4SImode);
13621 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13622 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13623
13624 ok = (expand_vec_perm_1 (&dperm)
13625 || expand_vec_perm_broadcast_1 (&dperm));
13626 gcc_assert (ok);
13627 return ok;
13628 }
13629 goto widen;
13630
13631 case E_V16QImode:
13632 if (TARGET_AVX2)
13633 return ix86_vector_duplicate_value (mode, target, val);
13634
13635 if (TARGET_SSE2)
13636 goto permute;
13637 goto widen;
13638
13639 widen:
13640 /* Replicate the value once into the next wider mode and recurse. */
13641 {
13642 machine_mode smode, wsmode, wvmode;
13643 rtx x;
13644
13645 smode = GET_MODE_INNER (mode);
13646 wvmode = get_mode_wider_vector (mode);
13647 wsmode = GET_MODE_INNER (wvmode);
13648
13649 val = convert_modes (wsmode, smode, val, true);
13650 x = expand_simple_binop (wsmode, ASHIFT, val,
13651 GEN_INT (GET_MODE_BITSIZE (smode)),
13652 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13653 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13654
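/* VAL now holds two copies of the original value, one in each half of
   the wider scalar mode. */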
13655 x = gen_reg_rtx (wvmode);
13656 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13657 gcc_assert (ok);
13658 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13659 return ok;
13660 }
13661
13662 case E_V16HImode:
13663 case E_V32QImode:
13664 if (TARGET_AVX2)
13665 return ix86_vector_duplicate_value (mode, target, val);
13666 else
13667 {
13668 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13669 rtx x = gen_reg_rtx (hvmode);
13670
13671 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13672 gcc_assert (ok);
13673
13674 x = gen_rtx_VEC_CONCAT (mode, x, x);
13675 emit_insn (gen_rtx_SET (target, x));
13676 }
13677 return true;
13678
13679 case E_V64QImode:
13680 case E_V32HImode:
13681 if (TARGET_AVX512BW)
13682 return ix86_vector_duplicate_value (mode, target, val);
13683 else
13684 {
13685 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13686 rtx x = gen_reg_rtx (hvmode);
13687
13688 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13689 gcc_assert (ok);
13690
13691 x = gen_rtx_VEC_CONCAT (mode, x, x);
13692 emit_insn (gen_rtx_SET (target, x));
13693 }
13694 return true;
13695
13696 default:
13697 return false;
13698 }
13699 }
13700
13701 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13702 whose ONE_VAR element is VAR, and other elements are zero. Return true
13703 if successful. */
13704
13705 static bool
13706 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13707 rtx target, rtx var, int one_var)
13708 {
13709 machine_mode vsimode;
13710 rtx new_target;
13711 rtx x, tmp;
13712 bool use_vector_set = false;
13713 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13714
13715 switch (mode)
13716 {
13717 case E_V2DImode:
13718 /* For SSE4.1, we normally use vector set. But if the second
13719 element is zero and inter-unit moves are OK, we use movq
13720 instead. */
13721 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13722 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13723 && one_var == 0));
13724 break;
13725 case E_V16QImode:
13726 case E_V4SImode:
13727 case E_V4SFmode:
13728 use_vector_set = TARGET_SSE4_1;
13729 break;
13730 case E_V8HImode:
13731 use_vector_set = TARGET_SSE2;
13732 break;
13733 case E_V8QImode:
13734 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13735 break;
13736 case E_V4HImode:
13737 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13738 break;
13739 case E_V32QImode:
13740 case E_V16HImode:
13741 use_vector_set = TARGET_AVX;
13742 break;
13743 case E_V8SImode:
13744 use_vector_set = TARGET_AVX;
13745 gen_vec_set_0 = gen_vec_setv8si_0;
13746 break;
13747 case E_V8SFmode:
13748 use_vector_set = TARGET_AVX;
13749 gen_vec_set_0 = gen_vec_setv8sf_0;
13750 break;
13751 case E_V4DFmode:
13752 use_vector_set = TARGET_AVX;
13753 gen_vec_set_0 = gen_vec_setv4df_0;
13754 break;
13755 case E_V4DImode:
13756 /* Use ix86_expand_vector_set in 64bit mode only. */
13757 use_vector_set = TARGET_AVX && TARGET_64BIT;
13758 gen_vec_set_0 = gen_vec_setv4di_0;
13759 break;
13760 case E_V16SImode:
13761 use_vector_set = TARGET_AVX512F && one_var == 0;
13762 gen_vec_set_0 = gen_vec_setv16si_0;
13763 break;
13764 case E_V16SFmode:
13765 use_vector_set = TARGET_AVX512F && one_var == 0;
13766 gen_vec_set_0 = gen_vec_setv16sf_0;
13767 break;
13768 case E_V8DFmode:
13769 use_vector_set = TARGET_AVX512F && one_var == 0;
13770 gen_vec_set_0 = gen_vec_setv8df_0;
13771 break;
13772 case E_V8DImode:
13773 /* Use ix86_expand_vector_set in 64bit mode only. */
13774 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13775 gen_vec_set_0 = gen_vec_setv8di_0;
13776 break;
13777 default:
13778 break;
13779 }
13780
13781 if (use_vector_set)
13782 {
13783 if (gen_vec_set_0 && one_var == 0)
13784 {
13785 var = force_reg (GET_MODE_INNER (mode), var);
13786 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13787 return true;
13788 }
13789 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13790 var = force_reg (GET_MODE_INNER (mode), var);
13791 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13792 return true;
13793 }
13794
13795 switch (mode)
13796 {
13797 case E_V2SFmode:
13798 case E_V2SImode:
13799 if (!mmx_ok)
13800 return false;
13801 /* FALLTHRU */
13802
13803 case E_V2DFmode:
13804 case E_V2DImode:
13805 if (one_var != 0)
13806 return false;
13807 var = force_reg (GET_MODE_INNER (mode), var);
13808 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13809 emit_insn (gen_rtx_SET (target, x));
13810 return true;
13811
13812 case E_V4SFmode:
13813 case E_V4SImode:
13814 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13815 new_target = gen_reg_rtx (mode);
13816 else
13817 new_target = target;
13818 var = force_reg (GET_MODE_INNER (mode), var);
13819 x = gen_rtx_VEC_DUPLICATE (mode, var);
13820 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13821 emit_insn (gen_rtx_SET (new_target, x));
13822 if (one_var != 0)
13823 {
13824 /* We need to shuffle the value to the correct position, so
13825 create a new pseudo to store the intermediate result. */
13826
13827 /* With SSE2, we can use the integer shuffle insns. */
13828 if (mode != V4SFmode && TARGET_SSE2)
13829 {
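/* The pshufd below moves element 0 (the variable value) to position
   ONE_VAR and fills the remaining positions with the zero from
   element 1. */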
13830 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13831 const1_rtx,
13832 GEN_INT (one_var == 1 ? 0 : 1),
13833 GEN_INT (one_var == 2 ? 0 : 1),
13834 GEN_INT (one_var == 3 ? 0 : 1)));
13835 if (target != new_target)
13836 emit_move_insn (target, new_target);
13837 return true;
13838 }
13839
13840 /* Otherwise convert the intermediate result to V4SFmode and
13841 use the SSE1 shuffle instructions. */
13842 if (mode != V4SFmode)
13843 {
13844 tmp = gen_reg_rtx (V4SFmode);
13845 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13846 }
13847 else
13848 tmp = new_target;
13849
13850 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13851 const1_rtx,
13852 GEN_INT (one_var == 1 ? 0 : 1),
13853 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13854 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13855
13856 if (mode != V4SFmode)
13857 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13858 else if (tmp != target)
13859 emit_move_insn (target, tmp);
13860 }
13861 else if (target != new_target)
13862 emit_move_insn (target, new_target);
13863 return true;
13864
13865 case E_V8HImode:
13866 case E_V16QImode:
13867 vsimode = V4SImode;
13868 goto widen;
13869 case E_V4HImode:
13870 case E_V8QImode:
13871 if (!mmx_ok)
13872 return false;
13873 vsimode = V2SImode;
13874 goto widen;
13875 widen:
13876 if (one_var != 0)
13877 return false;
13878
13879 /* Zero extend the variable element to SImode and recurse. */
13880 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13881
13882 x = gen_reg_rtx (vsimode);
13883 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13884 var, one_var))
13885 gcc_unreachable ();
13886
13887 emit_move_insn (target, gen_lowpart (mode, x));
13888 return true;
13889
13890 default:
13891 return false;
13892 }
13893 }
13894
13895 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13896 consisting of the values in VALS. It is known that all elements
13897 except ONE_VAR are constants. Return true if successful. */
13898
13899 static bool
13900 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13901 rtx target, rtx vals, int one_var)
13902 {
13903 rtx var = XVECEXP (vals, 0, one_var);
13904 machine_mode wmode;
13905 rtx const_vec, x;
13906
13907 const_vec = copy_rtx (vals);
13908 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13909 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13910
13911 switch (mode)
13912 {
13913 case E_V2DFmode:
13914 case E_V2DImode:
13915 case E_V2SFmode:
13916 case E_V2SImode:
13917 /* For the two element vectors, it's just as easy to use
13918 the general case. */
13919 return false;
13920
13921 case E_V4DImode:
13922 /* Use ix86_expand_vector_set in 64bit mode only. */
13923 if (!TARGET_64BIT)
13924 return false;
13925 /* FALLTHRU */
13926 case E_V4DFmode:
13927 case E_V8SFmode:
13928 case E_V8SImode:
13929 case E_V16HImode:
13930 case E_V32QImode:
13931 case E_V4SFmode:
13932 case E_V4SImode:
13933 case E_V8HImode:
13934 case E_V4HImode:
13935 break;
13936
13937 case E_V16QImode:
13938 if (TARGET_SSE4_1)
13939 break;
13940 wmode = V8HImode;
13941 goto widen;
13942 case E_V8QImode:
13943 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13944 break;
13945 wmode = V4HImode;
13946 goto widen;
13947 widen:
13948 /* There's no way to set one QImode entry easily. Combine
13949 the variable value with its adjacent constant value, and
13950 promote to an HImode set. */
13951 x = XVECEXP (vals, 0, one_var ^ 1);
13952 if (one_var & 1)
13953 {
13954 var = convert_modes (HImode, QImode, var, true);
13955 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13956 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13957 x = GEN_INT (INTVAL (x) & 0xff);
13958 }
13959 else
13960 {
13961 var = convert_modes (HImode, QImode, var, true);
13962 x = gen_int_mode (UINTVAL (x) << 8, HImode);
13963 }
13964 if (x != const0_rtx)
13965 var = expand_simple_binop (HImode, IOR, var, x, var,
13966 1, OPTAB_LIB_WIDEN);
13967
13968 x = gen_reg_rtx (wmode);
13969 emit_move_insn (x, gen_lowpart (wmode, const_vec));
13970 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13971
13972 emit_move_insn (target, gen_lowpart (mode, x));
13973 return true;
13974
13975 default:
13976 return false;
13977 }
13978
13979 emit_move_insn (target, const_vec);
13980 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13981 return true;
13982 }
13983
13984 /* A subroutine of ix86_expand_vector_init_general. Use vector
13985 concatenate to handle the most general case: all values variable,
13986 and none identical. */
13987
13988 static void
13989 ix86_expand_vector_init_concat (machine_mode mode,
13990 rtx target, rtx *ops, int n)
13991 {
13992 machine_mode half_mode = VOIDmode;
13993 rtx half[2];
13994 rtvec v;
13995 int i, j;
13996
13997 switch (n)
13998 {
13999 case 2:
14000 switch (mode)
14001 {
14002 case E_V16SImode:
14003 half_mode = V8SImode;
14004 break;
14005 case E_V16SFmode:
14006 half_mode = V8SFmode;
14007 break;
14008 case E_V8DImode:
14009 half_mode = V4DImode;
14010 break;
14011 case E_V8DFmode:
14012 half_mode = V4DFmode;
14013 break;
14014 case E_V8SImode:
14015 half_mode = V4SImode;
14016 break;
14017 case E_V8SFmode:
14018 half_mode = V4SFmode;
14019 break;
14020 case E_V4DImode:
14021 half_mode = V2DImode;
14022 break;
14023 case E_V4DFmode:
14024 half_mode = V2DFmode;
14025 break;
14026 case E_V4SImode:
14027 half_mode = V2SImode;
14028 break;
14029 case E_V4SFmode:
14030 half_mode = V2SFmode;
14031 break;
14032 case E_V2DImode:
14033 half_mode = DImode;
14034 break;
14035 case E_V2SImode:
14036 half_mode = SImode;
14037 break;
14038 case E_V2DFmode:
14039 half_mode = DFmode;
14040 break;
14041 case E_V2SFmode:
14042 half_mode = SFmode;
14043 break;
14044 default:
14045 gcc_unreachable ();
14046 }
14047
14048 if (!register_operand (ops[1], half_mode))
14049 ops[1] = force_reg (half_mode, ops[1]);
14050 if (!register_operand (ops[0], half_mode))
14051 ops[0] = force_reg (half_mode, ops[0]);
14052 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
14053 ops[1])));
14054 break;
14055
14056 case 4:
14057 switch (mode)
14058 {
14059 case E_V4DImode:
14060 half_mode = V2DImode;
14061 break;
14062 case E_V4DFmode:
14063 half_mode = V2DFmode;
14064 break;
14065 case E_V4SImode:
14066 half_mode = V2SImode;
14067 break;
14068 case E_V4SFmode:
14069 half_mode = V2SFmode;
14070 break;
14071 default:
14072 gcc_unreachable ();
14073 }
14074 goto half;
14075
14076 case 8:
14077 switch (mode)
14078 {
14079 case E_V8DImode:
14080 half_mode = V4DImode;
14081 break;
14082 case E_V8DFmode:
14083 half_mode = V4DFmode;
14084 break;
14085 case E_V8SImode:
14086 half_mode = V4SImode;
14087 break;
14088 case E_V8SFmode:
14089 half_mode = V4SFmode;
14090 break;
14091 default:
14092 gcc_unreachable ();
14093 }
14094 goto half;
14095
14096 case 16:
14097 switch (mode)
14098 {
14099 case E_V16SImode:
14100 half_mode = V8SImode;
14101 break;
14102 case E_V16SFmode:
14103 half_mode = V8SFmode;
14104 break;
14105 default:
14106 gcc_unreachable ();
14107 }
14108 goto half;
14109
14110 half:
14111 /* FIXME: We process inputs backward to help RA. PR 36222. */
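/* Build the upper half from the trailing operands first, then the
   lower half, and finally concatenate the two halves. */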
14112 i = n - 1;
14113 for (j = 1; j != -1; j--)
14114 {
14115 half[j] = gen_reg_rtx (half_mode);
14116 switch (n >> 1)
14117 {
14118 case 2:
14119 v = gen_rtvec (2, ops[i-1], ops[i]);
14120 i -= 2;
14121 break;
14122 case 4:
14123 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
14124 i -= 4;
14125 break;
14126 case 8:
14127 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
14128 ops[i-3], ops[i-2], ops[i-1], ops[i]);
14129 i -= 8;
14130 break;
14131 default:
14132 gcc_unreachable ();
14133 }
14134 ix86_expand_vector_init (false, half[j],
14135 gen_rtx_PARALLEL (half_mode, v));
14136 }
14137
14138 ix86_expand_vector_init_concat (mode, target, half, 2);
14139 break;
14140
14141 default:
14142 gcc_unreachable ();
14143 }
14144 }
14145
14146 /* A subroutine of ix86_expand_vector_init_general. Use vector
14147 interleave to handle the most general case: all values variable,
14148 and none identical. */
14149
14150 static void
14151 ix86_expand_vector_init_interleave (machine_mode mode,
14152 rtx target, rtx *ops, int n)
14153 {
14154 machine_mode first_imode, second_imode, third_imode, inner_mode;
14155 int i, j;
14156 rtx op0, op1;
14157 rtx (*gen_load_even) (rtx, rtx, rtx);
14158 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
14159 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
14160
14161 switch (mode)
14162 {
14163 case E_V8HImode:
14164 gen_load_even = gen_vec_setv8hi;
14165 gen_interleave_first_low = gen_vec_interleave_lowv4si;
14166 gen_interleave_second_low = gen_vec_interleave_lowv2di;
14167 inner_mode = HImode;
14168 first_imode = V4SImode;
14169 second_imode = V2DImode;
14170 third_imode = VOIDmode;
14171 break;
14172 case E_V16QImode:
14173 gen_load_even = gen_vec_setv16qi;
14174 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
14175 gen_interleave_second_low = gen_vec_interleave_lowv4si;
14176 inner_mode = QImode;
14177 first_imode = V8HImode;
14178 second_imode = V4SImode;
14179 third_imode = V2DImode;
14180 break;
14181 default:
14182 gcc_unreachable ();
14183 }
14184
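/* For each pair of inputs, build a vector whose two low elements are
   ops[2*i] and ops[2*i + 1]. */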
14185 for (i = 0; i < n; i++)
14186 {
14187 /* Extend the odd element to SImode using a paradoxical SUBREG. */
14188 op0 = gen_reg_rtx (SImode);
14189 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
14190
14191 /* Insert the SImode value as low element of V4SImode vector. */
14192 op1 = gen_reg_rtx (V4SImode);
14193 op0 = gen_rtx_VEC_MERGE (V4SImode,
14194 gen_rtx_VEC_DUPLICATE (V4SImode,
14195 op0),
14196 CONST0_RTX (V4SImode),
14197 const1_rtx);
14198 emit_insn (gen_rtx_SET (op1, op0));
14199
14200 /* Cast the V4SImode vector back to a vector in the original mode. */
14201 op0 = gen_reg_rtx (mode);
14202 emit_move_insn (op0, gen_lowpart (mode, op1));
14203
14204 /* Load even elements into the second position. */
14205 emit_insn (gen_load_even (op0,
14206 force_reg (inner_mode,
14207 ops [i + i + 1]),
14208 const1_rtx));
14209
14210 /* Cast vector to FIRST_IMODE vector. */
14211 ops[i] = gen_reg_rtx (first_imode);
14212 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
14213 }
14214
14215 /* Interleave low FIRST_IMODE vectors. */
14216 for (i = j = 0; i < n; i += 2, j++)
14217 {
14218 op0 = gen_reg_rtx (first_imode);
14219 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
14220
14221 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
14222 ops[j] = gen_reg_rtx (second_imode);
14223 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
14224 }
14225
14226 /* Interleave low SECOND_IMODE vectors. */
14227 switch (second_imode)
14228 {
14229 case E_V4SImode:
14230 for (i = j = 0; i < n / 2; i += 2, j++)
14231 {
14232 op0 = gen_reg_rtx (second_imode);
14233 emit_insn (gen_interleave_second_low (op0, ops[i],
14234 ops[i + 1]));
14235
14236 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
14237 vector. */
14238 ops[j] = gen_reg_rtx (third_imode);
14239 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
14240 }
14241 second_imode = V2DImode;
14242 gen_interleave_second_low = gen_vec_interleave_lowv2di;
14243 /* FALLTHRU */
14244
14245 case E_V2DImode:
14246 op0 = gen_reg_rtx (second_imode);
14247 emit_insn (gen_interleave_second_low (op0, ops[0],
14248 ops[1]));
14249
14250 /* Cast the SECOND_IMODE vector back to a vector in the original
14251 mode. */
14252 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
14253 break;
14254
14255 default:
14256 gcc_unreachable ();
14257 }
14258 }
14259
14260 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
14261 all values variable, and none identical. */
14262
14263 static void
14264 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
14265 rtx target, rtx vals)
14266 {
14267 rtx ops[64], op0, op1, op2, op3, op4, op5;
14268 machine_mode half_mode = VOIDmode;
14269 machine_mode quarter_mode = VOIDmode;
14270 int n, i;
14271
14272 switch (mode)
14273 {
14274 case E_V2SFmode:
14275 case E_V2SImode:
14276 if (!mmx_ok && !TARGET_SSE)
14277 break;
14278 /* FALLTHRU */
14279
14280 case E_V16SImode:
14281 case E_V16SFmode:
14282 case E_V8DFmode:
14283 case E_V8DImode:
14284 case E_V8SFmode:
14285 case E_V8SImode:
14286 case E_V4DFmode:
14287 case E_V4DImode:
14288 case E_V4SFmode:
14289 case E_V4SImode:
14290 case E_V2DFmode:
14291 case E_V2DImode:
14292 n = GET_MODE_NUNITS (mode);
14293 for (i = 0; i < n; i++)
14294 ops[i] = XVECEXP (vals, 0, i);
14295 ix86_expand_vector_init_concat (mode, target, ops, n);
14296 return;
14297
14298 case E_V2TImode:
14299 for (i = 0; i < 2; i++)
14300 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14301 op0 = gen_reg_rtx (V4DImode);
14302 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
14303 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14304 return;
14305
14306 case E_V4TImode:
14307 for (i = 0; i < 4; i++)
14308 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14309 ops[4] = gen_reg_rtx (V4DImode);
14310 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
14311 ops[5] = gen_reg_rtx (V4DImode);
14312 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
14313 op0 = gen_reg_rtx (V8DImode);
14314 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
14315 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14316 return;
14317
14318 case E_V32QImode:
14319 half_mode = V16QImode;
14320 goto half;
14321
14322 case E_V16HImode:
14323 half_mode = V8HImode;
14324 goto half;
14325
14326 half:
14327 n = GET_MODE_NUNITS (mode);
14328 for (i = 0; i < n; i++)
14329 ops[i] = XVECEXP (vals, 0, i);
14330 op0 = gen_reg_rtx (half_mode);
14331 op1 = gen_reg_rtx (half_mode);
14332 ix86_expand_vector_init_interleave (half_mode, op0, ops,
14333 n >> 2);
14334 ix86_expand_vector_init_interleave (half_mode, op1,
14335 &ops [n >> 1], n >> 2);
14336 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
14337 return;
14338
14339 case E_V64QImode:
14340 quarter_mode = V16QImode;
14341 half_mode = V32QImode;
14342 goto quarter;
14343
14344 case E_V32HImode:
14345 quarter_mode = V8HImode;
14346 half_mode = V16HImode;
14347 goto quarter;
14348
14349 quarter:
14350 n = GET_MODE_NUNITS (mode);
14351 for (i = 0; i < n; i++)
14352 ops[i] = XVECEXP (vals, 0, i);
14353 op0 = gen_reg_rtx (quarter_mode);
14354 op1 = gen_reg_rtx (quarter_mode);
14355 op2 = gen_reg_rtx (quarter_mode);
14356 op3 = gen_reg_rtx (quarter_mode);
14357 op4 = gen_reg_rtx (half_mode);
14358 op5 = gen_reg_rtx (half_mode);
14359 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14360 n >> 3);
14361 ix86_expand_vector_init_interleave (quarter_mode, op1,
14362 &ops [n >> 2], n >> 3);
14363 ix86_expand_vector_init_interleave (quarter_mode, op2,
14364 &ops [n >> 1], n >> 3);
14365 ix86_expand_vector_init_interleave (quarter_mode, op3,
14366 &ops [(n >> 1) | (n >> 2)], n >> 3);
14367 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14368 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14369 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14370 return;
14371
14372 case E_V16QImode:
14373 if (!TARGET_SSE4_1)
14374 break;
14375 /* FALLTHRU */
14376
14377 case E_V8HImode:
14378 if (!TARGET_SSE2)
14379 break;
14380
14381 /* Don't use ix86_expand_vector_init_interleave if we can't
14382 move from GPR to SSE register directly. */
14383 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14384 break;
14385
14386 n = GET_MODE_NUNITS (mode);
14387 for (i = 0; i < n; i++)
14388 ops[i] = XVECEXP (vals, 0, i);
14389 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14390 return;
14391
14392 case E_V4HImode:
14393 case E_V8QImode:
14394 break;
14395
14396 default:
14397 gcc_unreachable ();
14398 }
14399
14400 {
14401 int i, j, n_elts, n_words, n_elt_per_word;
14402 machine_mode inner_mode;
14403 rtx words[4], shift;
14404
14405 inner_mode = GET_MODE_INNER (mode);
14406 n_elts = GET_MODE_NUNITS (mode);
14407 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14408 n_elt_per_word = n_elts / n_words;
14409 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14410
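/* Pack N_ELT_PER_WORD vector elements into each word_mode word; the
   lowest-numbered element of each group ends up in the least
   significant bits. */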
14411 for (i = 0; i < n_words; ++i)
14412 {
14413 rtx word = NULL_RTX;
14414
14415 for (j = 0; j < n_elt_per_word; ++j)
14416 {
14417 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14418 elt = convert_modes (word_mode, inner_mode, elt, true);
14419
14420 if (j == 0)
14421 word = elt;
14422 else
14423 {
14424 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14425 word, 1, OPTAB_LIB_WIDEN);
14426 word = expand_simple_binop (word_mode, IOR, word, elt,
14427 word, 1, OPTAB_LIB_WIDEN);
14428 }
14429 }
14430
14431 words[i] = word;
14432 }
14433
14434 if (n_words == 1)
14435 emit_move_insn (target, gen_lowpart (mode, words[0]));
14436 else if (n_words == 2)
14437 {
14438 rtx tmp = gen_reg_rtx (mode);
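/* Clobber TMP first so that writing it in two word_mode pieces below
   is not treated as a partial update of an uninitialized register. */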
14439 emit_clobber (tmp);
14440 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14441 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14442 emit_move_insn (target, tmp);
14443 }
14444 else if (n_words == 4)
14445 {
14446 rtx tmp = gen_reg_rtx (V4SImode);
14447 gcc_assert (word_mode == SImode);
14448 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14449 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14450 emit_move_insn (target, gen_lowpart (mode, tmp));
14451 }
14452 else
14453 gcc_unreachable ();
14454 }
14455 }
14456
14457 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14458 instructions unless MMX_OK is true. */
14459
14460 void
14461 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14462 {
14463 machine_mode mode = GET_MODE (target);
14464 machine_mode inner_mode = GET_MODE_INNER (mode);
14465 int n_elts = GET_MODE_NUNITS (mode);
14466 int n_var = 0, one_var = -1;
14467 bool all_same = true, all_const_zero = true;
14468 int i;
14469 rtx x;
14470
14471 /* Handle first initialization from vector elts. */
14472 if (n_elts != XVECLEN (vals, 0))
14473 {
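/* VALS holds two half-width vectors rather than N_ELTS scalars;
   concatenate them, going through vectors of SImode elements for
   QImode/HImode element vectors. */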
14474 rtx subtarget = target;
14475 x = XVECEXP (vals, 0, 0);
14476 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14477 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14478 {
14479 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14480 if (inner_mode == QImode || inner_mode == HImode)
14481 {
14482 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14483 mode = mode_for_vector (SImode, n_bits / 4).require ();
14484 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
14485 ops[0] = gen_lowpart (inner_mode, ops[0]);
14486 ops[1] = gen_lowpart (inner_mode, ops[1]);
14487 subtarget = gen_reg_rtx (mode);
14488 }
14489 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14490 if (subtarget != target)
14491 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14492 return;
14493 }
14494 gcc_unreachable ();
14495 }
14496
14497 for (i = 0; i < n_elts; ++i)
14498 {
14499 x = XVECEXP (vals, 0, i);
14500 if (!(CONST_SCALAR_INT_P (x)
14501 || CONST_DOUBLE_P (x)
14502 || CONST_FIXED_P (x)))
14503 n_var++, one_var = i;
14504 else if (x != CONST0_RTX (inner_mode))
14505 all_const_zero = false;
14506 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14507 all_same = false;
14508 }
14509
14510 /* Constants are best loaded from the constant pool. */
14511 if (n_var == 0)
14512 {
14513 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14514 return;
14515 }
14516
14517 /* If all values are identical, broadcast the value. */
14518 if (all_same
14519 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14520 XVECEXP (vals, 0, 0)))
14521 return;
14522
14523 /* Values where only one field is non-constant are best loaded from
14524 the pool and overwritten via move later. */
14525 if (n_var == 1)
14526 {
14527 if (all_const_zero
14528 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14529 XVECEXP (vals, 0, one_var),
14530 one_var))
14531 return;
14532
14533 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14534 return;
14535 }
14536
14537 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14538 }
14539
14540 /* Implemented as
14541 V setg (V v, int idx, T val)
14542 {
14543 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
14544 V valv = (V){val, val, val, val, val, val, val, val};
14545 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
14546 v = (v & ~mask) | (valv & mask);
14547 return v;
14548 }. */
14549 void
14550 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
14551 {
14552 rtx vec[64];
14553 machine_mode mode = GET_MODE (target);
14554 machine_mode cmp_mode = mode;
14555 int n_elts = GET_MODE_NUNITS (mode);
14556 rtx valv, idxv, constv, idx_tmp;
14557 bool ok = false;
14558
14559 /* 512-bit vector byte/word broadcast and comparison are only available
14560 under TARGET_AVX512BW; without it, break the 512-bit vector into
14561 two 256-bit halves. */
14562 if ((mode == V32HImode || mode == V64QImode) && !TARGET_AVX512BW)
14563 {
14564 gcc_assert (TARGET_AVX512F);
14565 rtx vhi, vlo, idx_hi;
14566 machine_mode half_mode;
14567 rtx (*extract_hi)(rtx, rtx);
14568 rtx (*extract_lo)(rtx, rtx);
14569
14570 if (mode == V32HImode)
14571 {
14572 half_mode = V16HImode;
14573 extract_hi = gen_vec_extract_hi_v32hi;
14574 extract_lo = gen_vec_extract_lo_v32hi;
14575 }
14576 else
14577 {
14578 half_mode = V32QImode;
14579 extract_hi = gen_vec_extract_hi_v64qi;
14580 extract_lo = gen_vec_extract_lo_v64qi;
14581 }
14582
14583 vhi = gen_reg_rtx (half_mode);
14584 vlo = gen_reg_rtx (half_mode);
14585 idx_hi = gen_reg_rtx (GET_MODE (idx));
14586 emit_insn (extract_hi (vhi, target));
14587 emit_insn (extract_lo (vlo, target));
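/* Compute idx_hi = idx - n_elts / 2 and update both halves
   recursively; an out-of-range index never matches the per-lane
   constants, so the half that does not contain the element is left
   unchanged. */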
14588 vec[0] = idx_hi;
14589 vec[1] = idx;
14590 vec[2] = GEN_INT (n_elts/2);
14591 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
14592 ix86_expand_vector_set_var (vhi, val, idx_hi);
14593 ix86_expand_vector_set_var (vlo, val, idx);
14594 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
14595 return;
14596 }
14597
14598 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
14599 {
14600 switch (mode)
14601 {
14602 case E_V2DFmode:
14603 cmp_mode = V2DImode;
14604 break;
14605 case E_V4DFmode:
14606 cmp_mode = V4DImode;
14607 break;
14608 case E_V8DFmode:
14609 cmp_mode = V8DImode;
14610 break;
14611 case E_V4SFmode:
14612 cmp_mode = V4SImode;
14613 break;
14614 case E_V8SFmode:
14615 cmp_mode = V8SImode;
14616 break;
14617 case E_V16SFmode:
14618 cmp_mode = V16SImode;
14619 break;
14620 default:
14621 gcc_unreachable ();
14622 }
14623 }
14624
14625 for (int i = 0; i != n_elts; i++)
14626 vec[i] = GEN_INT (i);
14627 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
14628 valv = gen_reg_rtx (mode);
14629 idxv = gen_reg_rtx (cmp_mode);
14630 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
14631
14632 ok = ix86_expand_vector_init_duplicate (false, mode, valv, val);
14633 gcc_assert (ok);
14634 ok = ix86_expand_vector_init_duplicate (false, cmp_mode, idxv, idx_tmp);
14635 gcc_assert (ok);
14636 vec[0] = target;
14637 vec[1] = valv;
14638 vec[2] = target;
14639 vec[3] = gen_rtx_EQ (mode, idxv, constv);
14640 vec[4] = idxv;
14641 vec[5] = constv;
14642 ok = ix86_expand_int_vcond (vec);
14643 gcc_assert (ok);
14644 }
14645
14646 void
14647 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14648 {
14649 machine_mode mode = GET_MODE (target);
14650 machine_mode inner_mode = GET_MODE_INNER (mode);
14651 machine_mode half_mode;
14652 bool use_vec_merge = false;
14653 rtx tmp;
14654 static rtx (*gen_extract[6][2]) (rtx, rtx)
14655 = {
14656 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14657 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14658 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14659 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14660 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14661 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14662 };
14663 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14664 = {
14665 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14666 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14667 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14668 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14669 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14670 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14671 };
14672 int i, j, n;
14673 machine_mode mmode = VOIDmode;
14674 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14675
14676 switch (mode)
14677 {
14678 case E_V2SImode:
14679 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14680 if (use_vec_merge)
14681 break;
14682 /* FALLTHRU */
14683
14684 case E_V2SFmode:
14685 if (mmx_ok)
14686 {
14687 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14688 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14689 if (elt == 0)
14690 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14691 else
14692 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14693 emit_insn (gen_rtx_SET (target, tmp));
14694 return;
14695 }
14696 break;
14697
14698 case E_V2DImode:
14699 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14700 if (use_vec_merge)
14701 break;
14702
14703 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14704 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14705 if (elt == 0)
14706 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14707 else
14708 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14709 emit_insn (gen_rtx_SET (target, tmp));
14710 return;
14711
14712 case E_V2DFmode:
14713 /* NB: For ELT == 0, use standard scalar operation patterns which
14714 preserve the rest of the vector for combiner:
14715
14716 (vec_merge:V2DF
14717 (vec_duplicate:V2DF (reg:DF))
14718 (reg:V2DF)
14719 (const_int 1))
14720 */
14721 if (elt == 0)
14722 goto do_vec_merge;
14723
14724 {
14725 rtx op0, op1;
14726
14727 /* For the two element vectors, we implement a VEC_CONCAT with
14728 the extraction of the other element. */
14729
14730 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14731 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14732
14733 if (elt == 0)
14734 op0 = val, op1 = tmp;
14735 else
14736 op0 = tmp, op1 = val;
14737
14738 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14739 emit_insn (gen_rtx_SET (target, tmp));
14740 }
14741 return;
14742
14743 case E_V4SFmode:
14744 use_vec_merge = TARGET_SSE4_1;
14745 if (use_vec_merge)
14746 break;
14747
14748 switch (elt)
14749 {
14750 case 0:
14751 use_vec_merge = true;
14752 break;
14753
14754 case 1:
14755 /* tmp = target = A B C D */
14756 tmp = copy_to_reg (target);
14757 /* target = A A B B */
14758 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14759 /* target = X A B B */
14760 ix86_expand_vector_set (false, target, val, 0);
14761 /* target = A X C D */
14762 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14763 const1_rtx, const0_rtx,
14764 GEN_INT (2+4), GEN_INT (3+4)));
14765 return;
14766
14767 case 2:
14768 /* tmp = target = A B C D */
14769 tmp = copy_to_reg (target);
14770 /* tmp = X B C D */
14771 ix86_expand_vector_set (false, tmp, val, 0);
14772 /* target = A B X D */
14773 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14774 const0_rtx, const1_rtx,
14775 GEN_INT (0+4), GEN_INT (3+4)));
14776 return;
14777
14778 case 3:
14779 /* tmp = target = A B C D */
14780 tmp = copy_to_reg (target);
14781 /* tmp = X B C D */
14782 ix86_expand_vector_set (false, tmp, val, 0);
14783 /* target = A B C X */
14784 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14785 const0_rtx, const1_rtx,
14786 GEN_INT (2+4), GEN_INT (0+4)));
14787 return;
14788
14789 default:
14790 gcc_unreachable ();
14791 }
14792 break;
14793
14794 case E_V4SImode:
14795 use_vec_merge = TARGET_SSE4_1;
14796 if (use_vec_merge)
14797 break;
14798
14799 /* Element 0 handled by vec_merge below. */
14800 if (elt == 0)
14801 {
14802 use_vec_merge = true;
14803 break;
14804 }
14805
14806 if (TARGET_SSE2)
14807 {
14808 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14809 store into element 0, then shuffle them back. */
14810
14811 rtx order[4];
14812
14813 order[0] = GEN_INT (elt);
14814 order[1] = const1_rtx;
14815 order[2] = const2_rtx;
14816 order[3] = GEN_INT (3);
14817 order[elt] = const0_rtx;
14818
14819 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14820 order[1], order[2], order[3]));
14821
14822 ix86_expand_vector_set (false, target, val, 0);
14823
14824 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14825 order[1], order[2], order[3]));
14826 }
14827 else
14828 {
14829 /* For SSE1, we have to reuse the V4SF code. */
14830 rtx t = gen_reg_rtx (V4SFmode);
14831 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14832 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14833 emit_move_insn (target, gen_lowpart (mode, t));
14834 }
14835 return;
14836
14837 case E_V8HImode:
14838 use_vec_merge = TARGET_SSE2;
14839 break;
14840 case E_V4HImode:
14841 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14842 break;
14843
14844 case E_V16QImode:
14845 use_vec_merge = TARGET_SSE4_1;
14846 break;
14847
14848 case E_V8QImode:
14849 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14850 break;
14851
14852 case E_V32QImode:
14853 half_mode = V16QImode;
14854 j = 0;
14855 n = 16;
14856 goto half;
14857
14858 case E_V16HImode:
14859 half_mode = V8HImode;
14860 j = 1;
14861 n = 8;
14862 goto half;
14863
14864 case E_V8SImode:
14865 half_mode = V4SImode;
14866 j = 2;
14867 n = 4;
14868 goto half;
14869
14870 case E_V4DImode:
14871 half_mode = V2DImode;
14872 j = 3;
14873 n = 2;
14874 goto half;
14875
14876 case E_V8SFmode:
14877 half_mode = V4SFmode;
14878 j = 4;
14879 n = 4;
14880 goto half;
14881
14882 case E_V4DFmode:
14883 half_mode = V2DFmode;
14884 j = 5;
14885 n = 2;
14886 goto half;
14887
14888 half:
14889 /* Compute offset. */
14890 i = elt / n;
14891 elt %= n;
14892
14893 gcc_assert (i <= 1);
14894
14895 /* Extract the half. */
14896 tmp = gen_reg_rtx (half_mode);
14897 emit_insn (gen_extract[j][i] (tmp, target));
14898
14899 /* Put val in tmp at elt. */
14900 ix86_expand_vector_set (false, tmp, val, elt);
14901
14902 /* Put it back. */
14903 emit_insn (gen_insert[j][i] (target, target, tmp));
14904 return;
14905
14906 case E_V8DFmode:
14907 if (TARGET_AVX512F)
14908 {
14909 mmode = QImode;
14910 gen_blendm = gen_avx512f_blendmv8df;
14911 }
14912 break;
14913
14914 case E_V8DImode:
14915 if (TARGET_AVX512F)
14916 {
14917 mmode = QImode;
14918 gen_blendm = gen_avx512f_blendmv8di;
14919 }
14920 break;
14921
14922 case E_V16SFmode:
14923 if (TARGET_AVX512F)
14924 {
14925 mmode = HImode;
14926 gen_blendm = gen_avx512f_blendmv16sf;
14927 }
14928 break;
14929
14930 case E_V16SImode:
14931 if (TARGET_AVX512F)
14932 {
14933 mmode = HImode;
14934 gen_blendm = gen_avx512f_blendmv16si;
14935 }
14936 break;
14937
14938 case E_V32HImode:
14939 if (TARGET_AVX512BW)
14940 {
14941 mmode = SImode;
14942 gen_blendm = gen_avx512bw_blendmv32hi;
14943 }
14944 else if (TARGET_AVX512F)
14945 {
14946 half_mode = E_V8HImode;
14947 n = 8;
14948 goto quarter;
14949 }
14950 break;
14951
14952 case E_V64QImode:
14953 if (TARGET_AVX512BW)
14954 {
14955 mmode = DImode;
14956 gen_blendm = gen_avx512bw_blendmv64qi;
14957 }
14958 else if (TARGET_AVX512F)
14959 {
14960 half_mode = E_V16QImode;
14961 n = 16;
14962 goto quarter;
14963 }
14964 break;
14965
14966 quarter:
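/* Without AVX512BW there is no byte/word blend, so extract the
   128-bit lane that contains the element with vextracti32x4, modify
   it, and put it back with vinserti32x4. */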
14967 /* Compute offset. */
14968 i = elt / n;
14969 elt %= n;
14970
14971 gcc_assert (i <= 3);
14972
14973 {
14974 /* Extract the quarter. */
14975 tmp = gen_reg_rtx (V4SImode);
14976 rtx tmp2 = gen_lowpart (V16SImode, target);
14977 rtx mask = gen_reg_rtx (QImode);
14978
14979 emit_move_insn (mask, constm1_rtx);
14980 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14981 tmp, mask));
14982
14983 tmp2 = gen_reg_rtx (half_mode);
14984 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14985 tmp = tmp2;
14986
14987 /* Put val in tmp at elt. */
14988 ix86_expand_vector_set (false, tmp, val, elt);
14989
14990 /* Put it back. */
14991 tmp2 = gen_reg_rtx (V16SImode);
14992 rtx tmp3 = gen_lowpart (V16SImode, target);
14993 mask = gen_reg_rtx (HImode);
14994 emit_move_insn (mask, constm1_rtx);
14995 tmp = gen_lowpart (V4SImode, tmp);
14996 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
14997 tmp3, mask));
14998 emit_move_insn (target, gen_lowpart (mode, tmp2));
14999 }
15000 return;
15001
15002 default:
15003 break;
15004 }
15005
15006 if (mmode != VOIDmode)
15007 {
15008 tmp = gen_reg_rtx (mode);
15009 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
15010 /* The avx512*_blendm<mode> expanders have different operand order
15011 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
15012 elements where the mask is set and second input operand otherwise,
15013 in {sse,avx}*_*blend* the first input operand is used for elements
15014 where the mask is clear and second input operand otherwise. */
15015 emit_insn (gen_blendm (target, target, tmp,
15016 force_reg (mmode,
15017 gen_int_mode (HOST_WIDE_INT_1U << elt,
15018 mmode))));
15019 }
15020 else if (use_vec_merge)
15021 {
15022 do_vec_merge:
15023 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
15024 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
15025 GEN_INT (HOST_WIDE_INT_1U << elt));
15026 emit_insn (gen_rtx_SET (target, tmp));
15027 }
15028 else
15029 {
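/* No suitable insertion pattern: spill the vector to a stack
   temporary, store the scalar into the right slot in memory, and
   reload the whole vector. */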
15030 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
15031
15032 emit_move_insn (mem, target);
15033
15034 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
15035 emit_move_insn (tmp, val);
15036
15037 emit_move_insn (target, mem);
15038 }
15039 }
15040
15041 void
15042 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
15043 {
15044 machine_mode mode = GET_MODE (vec);
15045 machine_mode inner_mode = GET_MODE_INNER (mode);
15046 bool use_vec_extr = false;
15047 rtx tmp;
15048
15049 switch (mode)
15050 {
15051 case E_V2SImode:
15052 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15053 if (use_vec_extr)
15054 break;
15055 /* FALLTHRU */
15056
15057 case E_V2SFmode:
15058 if (!mmx_ok)
15059 break;
15060 /* FALLTHRU */
15061
15062 case E_V2DFmode:
15063 case E_V2DImode:
15064 case E_V2TImode:
15065 case E_V4TImode:
15066 use_vec_extr = true;
15067 break;
15068
15069 case E_V4SFmode:
15070 use_vec_extr = TARGET_SSE4_1;
15071 if (use_vec_extr)
15072 break;
15073
15074 switch (elt)
15075 {
15076 case 0:
15077 tmp = vec;
15078 break;
15079
15080 case 1:
15081 case 3:
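/* Broadcast the requested element to all four lanes with shufps,
   then extract lane 0 below. */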
15082 tmp = gen_reg_rtx (mode);
15083 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
15084 GEN_INT (elt), GEN_INT (elt),
15085 GEN_INT (elt+4), GEN_INT (elt+4)));
15086 break;
15087
15088 case 2:
15089 tmp = gen_reg_rtx (mode);
15090 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
15091 break;
15092
15093 default:
15094 gcc_unreachable ();
15095 }
15096 vec = tmp;
15097 use_vec_extr = true;
15098 elt = 0;
15099 break;
15100
15101 case E_V4SImode:
15102 use_vec_extr = TARGET_SSE4_1;
15103 if (use_vec_extr)
15104 break;
15105
15106 if (TARGET_SSE2)
15107 {
15108 switch (elt)
15109 {
15110 case 0:
15111 tmp = vec;
15112 break;
15113
15114 case 1:
15115 case 3:
15116 tmp = gen_reg_rtx (mode);
15117 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
15118 GEN_INT (elt), GEN_INT (elt),
15119 GEN_INT (elt), GEN_INT (elt)));
15120 break;
15121
15122 case 2:
15123 tmp = gen_reg_rtx (mode);
15124 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
15125 break;
15126
15127 default:
15128 gcc_unreachable ();
15129 }
15130 vec = tmp;
15131 use_vec_extr = true;
15132 elt = 0;
15133 }
15134 else
15135 {
15136 /* For SSE1, we have to reuse the V4SF code. */
15137 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
15138 gen_lowpart (V4SFmode, vec), elt);
15139 return;
15140 }
15141 break;
15142
15143 case E_V8HImode:
15144 use_vec_extr = TARGET_SSE2;
15145 break;
15146 case E_V4HImode:
15147 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
15148 break;
15149
15150 case E_V16QImode:
15151 use_vec_extr = TARGET_SSE4_1;
15152 if (!use_vec_extr
15153 && TARGET_SSE2
15154 && elt == 0
15155 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
15156 {
15157 tmp = gen_reg_rtx (SImode);
15158 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
15159 0);
15160 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
15161 return;
15162 }
15163 break;
15164
15165 case E_V8SFmode:
15166 if (TARGET_AVX)
15167 {
15168 tmp = gen_reg_rtx (V4SFmode);
15169 if (elt < 4)
15170 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
15171 else
15172 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
15173 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15174 return;
15175 }
15176 break;
15177
15178 case E_V4DFmode:
15179 if (TARGET_AVX)
15180 {
15181 tmp = gen_reg_rtx (V2DFmode);
15182 if (elt < 2)
15183 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
15184 else
15185 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
15186 ix86_expand_vector_extract (false, target, tmp, elt & 1);
15187 return;
15188 }
15189 break;
15190
15191 case E_V32QImode:
15192 if (TARGET_AVX)
15193 {
15194 tmp = gen_reg_rtx (V16QImode);
15195 if (elt < 16)
15196 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
15197 else
15198 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
15199 ix86_expand_vector_extract (false, target, tmp, elt & 15);
15200 return;
15201 }
15202 break;
15203
15204 case E_V16HImode:
15205 if (TARGET_AVX)
15206 {
15207 tmp = gen_reg_rtx (V8HImode);
15208 if (elt < 8)
15209 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
15210 else
15211 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
15212 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15213 return;
15214 }
15215 break;
15216
15217 case E_V8SImode:
15218 if (TARGET_AVX)
15219 {
15220 tmp = gen_reg_rtx (V4SImode);
15221 if (elt < 4)
15222 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
15223 else
15224 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
15225 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15226 return;
15227 }
15228 break;
15229
15230 case E_V4DImode:
15231 if (TARGET_AVX)
15232 {
15233 tmp = gen_reg_rtx (V2DImode);
15234 if (elt < 2)
15235 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
15236 else
15237 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
15238 ix86_expand_vector_extract (false, target, tmp, elt & 1);
15239 return;
15240 }
15241 break;
15242
15243 case E_V32HImode:
15244 if (TARGET_AVX512BW)
15245 {
15246 tmp = gen_reg_rtx (V16HImode);
15247 if (elt < 16)
15248 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
15249 else
15250 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
15251 ix86_expand_vector_extract (false, target, tmp, elt & 15);
15252 return;
15253 }
15254 break;
15255
15256 case E_V64QImode:
15257 if (TARGET_AVX512BW)
15258 {
15259 tmp = gen_reg_rtx (V32QImode);
15260 if (elt < 32)
15261 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
15262 else
15263 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
15264 ix86_expand_vector_extract (false, target, tmp, elt & 31);
15265 return;
15266 }
15267 break;
15268
15269 case E_V16SFmode:
15270 tmp = gen_reg_rtx (V8SFmode);
15271 if (elt < 8)
15272 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
15273 else
15274 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
15275 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15276 return;
15277
15278 case E_V8DFmode:
15279 tmp = gen_reg_rtx (V4DFmode);
15280 if (elt < 4)
15281 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
15282 else
15283 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
15284 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15285 return;
15286
15287 case E_V16SImode:
15288 tmp = gen_reg_rtx (V8SImode);
15289 if (elt < 8)
15290 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
15291 else
15292 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
15293 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15294 return;
15295
15296 case E_V8DImode:
15297 tmp = gen_reg_rtx (V4DImode);
15298 if (elt < 4)
15299 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
15300 else
15301 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
15302 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15303 return;
15304
15305 case E_V8QImode:
15306 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15307 /* ??? Could extract the appropriate HImode element and shift. */
15308 break;
15309
15310 default:
15311 break;
15312 }
15313
15314 if (use_vec_extr)
15315 {
15316 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
15317 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
15318
15319 /* Let the rtl optimizers know about the zero extension performed. */
15320 if (inner_mode == QImode || inner_mode == HImode)
15321 {
15322 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
15323 target = gen_lowpart (SImode, target);
15324 }
15325
15326 emit_insn (gen_rtx_SET (target, tmp));
15327 }
15328 else
15329 {
15330 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
15331
15332 emit_move_insn (mem, vec);
15333
15334 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
15335 emit_move_insn (target, tmp);
15336 }
15337 }
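
/* Editor's note: an illustrative standalone sketch (not GCC code) of the
   generic fallback above; when no extract pattern applies, the vector is
   spilled to a stack temporary and the selected element is loaded back.
   Assumes <string.h>; illustration only.  */
#if 0
#include <string.h>

static float
extract_via_memory (const float vec[4], int elt)
{
  float mem[4];
  memcpy (mem, vec, sizeof mem);   /* emit_move_insn (mem, vec) */
  return mem[elt];                 /* adjust_address + emit_move_insn */
}
#endif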
15338
15339 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
15340 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
15341 The upper bits of DEST are undefined, though they shouldn't cause
15342 exceptions (some bits from src or all zeros are ok). */
15343
15344 static void
15345 emit_reduc_half (rtx dest, rtx src, int i)
15346 {
15347 rtx tem, d = dest;
15348 switch (GET_MODE (src))
15349 {
15350 case E_V4SFmode:
15351 if (i == 128)
15352 tem = gen_sse_movhlps (dest, src, src);
15353 else
15354 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
15355 GEN_INT (1 + 4), GEN_INT (1 + 4));
15356 break;
15357 case E_V2DFmode:
15358 tem = gen_vec_interleave_highv2df (dest, src, src);
15359 break;
15360 case E_V16QImode:
15361 case E_V8HImode:
15362 case E_V4SImode:
15363 case E_V2DImode:
15364 d = gen_reg_rtx (V1TImode);
15365 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
15366 GEN_INT (i / 2));
15367 break;
15368 case E_V8SFmode:
15369 if (i == 256)
15370 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
15371 else
15372 tem = gen_avx_shufps256 (dest, src, src,
15373 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
15374 break;
15375 case E_V4DFmode:
15376 if (i == 256)
15377 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
15378 else
15379 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
15380 break;
15381 case E_V32QImode:
15382 case E_V16HImode:
15383 case E_V8SImode:
15384 case E_V4DImode:
15385 if (i == 256)
15386 {
15387 if (GET_MODE (dest) != V4DImode)
15388 d = gen_reg_rtx (V4DImode);
15389 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
15390 gen_lowpart (V4DImode, src),
15391 const1_rtx);
15392 }
15393 else
15394 {
15395 d = gen_reg_rtx (V2TImode);
15396 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
15397 GEN_INT (i / 2));
15398 }
15399 break;
15400 case E_V64QImode:
15401 case E_V32HImode:
15402 if (i < 64)
15403 {
15404 d = gen_reg_rtx (V4TImode);
15405 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
15406 GEN_INT (i / 2));
15407 break;
15408 }
15409 /* FALLTHRU */
15410 case E_V16SImode:
15411 case E_V16SFmode:
15412 case E_V8DImode:
15413 case E_V8DFmode:
15414 if (i > 128)
15415 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
15416 gen_lowpart (V16SImode, src),
15417 gen_lowpart (V16SImode, src),
15418 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
15419 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
15420 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
15421 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
15422 GEN_INT (0xC), GEN_INT (0xD),
15423 GEN_INT (0xE), GEN_INT (0xF),
15424 GEN_INT (0x10), GEN_INT (0x11),
15425 GEN_INT (0x12), GEN_INT (0x13),
15426 GEN_INT (0x14), GEN_INT (0x15),
15427 GEN_INT (0x16), GEN_INT (0x17));
15428 else
15429 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
15430 gen_lowpart (V16SImode, src),
15431 GEN_INT (i == 128 ? 0x2 : 0x1),
15432 GEN_INT (0x3),
15433 GEN_INT (0x3),
15434 GEN_INT (0x3),
15435 GEN_INT (i == 128 ? 0x6 : 0x5),
15436 GEN_INT (0x7),
15437 GEN_INT (0x7),
15438 GEN_INT (0x7),
15439 GEN_INT (i == 128 ? 0xA : 0x9),
15440 GEN_INT (0xB),
15441 GEN_INT (0xB),
15442 GEN_INT (0xB),
15443 GEN_INT (i == 128 ? 0xE : 0xD),
15444 GEN_INT (0xF),
15445 GEN_INT (0xF),
15446 GEN_INT (0xF));
15447 break;
15448 default:
15449 gcc_unreachable ();
15450 }
15451 emit_insn (tem);
15452 if (d != dest)
15453 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15454 }
15455
15456 /* Expand a vector reduction. FN is the binary pattern to reduce;
15457 DEST is the destination; IN is the input vector. */
15458
15459 void
15460 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15461 {
15462 rtx half, dst, vec = in;
15463 machine_mode mode = GET_MODE (in);
15464 int i;
15465
15466 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15467 if (TARGET_SSE4_1
15468 && mode == V8HImode
15469 && fn == gen_uminv8hi3)
15470 {
15471 emit_insn (gen_sse4_1_phminposuw (dest, in));
15472 return;
15473 }
15474
15475 for (i = GET_MODE_BITSIZE (mode);
15476 i > GET_MODE_UNIT_BITSIZE (mode);
15477 i >>= 1)
15478 {
15479 half = gen_reg_rtx (mode);
15480 emit_reduc_half (half, vec, i);
15481 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15482 dst = dest;
15483 else
15484 dst = gen_reg_rtx (mode);
15485 emit_insn (fn (dst, half, vec));
15486 vec = dst;
15487 }
15488 }
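
/* Editor's note: an illustrative standalone sketch (not GCC code) of the
   reduction loop above: fold the upper half onto the lower half and halve
   the live width until a single element remains in lane 0.  Shown here for
   a plain array whose length is a power of two; illustration only.  */
#if 0
static int
reduce_by_halving (int *v, int n, int (*fn) (int, int))
{
  for (int width = n; width > 1; width /= 2)
    for (int i = 0; i < width / 2; i++)
      v[i] = fn (v[i], v[i + width / 2]);   /* emit_reduc_half, then fn */
  return v[0];
}
#endif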
15489
15490 /* Output code to perform a conditional jump to LABEL if the C2 flag in
15491 the FP status register is set.  */
15492
15493 void
15494 ix86_emit_fp_unordered_jump (rtx label)
15495 {
15496 rtx reg = gen_reg_rtx (HImode);
15497 rtx_insn *insn;
15498 rtx temp;
15499
15500 emit_insn (gen_x86_fnstsw_1 (reg));
15501
15502 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15503 {
15504 emit_insn (gen_x86_sahf_1 (reg));
15505
15506 temp = gen_rtx_REG (CCmode, FLAGS_REG);
15507 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15508 }
15509 else
15510 {
15511 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15512
15513 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15514 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15515 }
15516
15517 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15518 gen_rtx_LABEL_REF (VOIDmode, label),
15519 pc_rtx);
15520 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15521 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15522 JUMP_LABEL (insn) = label;
15523 }
15524
15525 /* Output code to perform a sinh XFmode calculation. */
15526
15527 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15528 {
15529 rtx e1 = gen_reg_rtx (XFmode);
15530 rtx e2 = gen_reg_rtx (XFmode);
15531 rtx scratch = gen_reg_rtx (HImode);
15532 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15533 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15534 rtx cst1, tmp;
15535 rtx_code_label *jump_label = gen_label_rtx ();
15536 rtx_insn *insn;
15537
15538 /* scratch = fxam (op1) */
15539 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15540
15541 /* e1 = expm1 (|op1|) */
15542 emit_insn (gen_absxf2 (e2, op1));
15543 emit_insn (gen_expm1xf2 (e1, e2));
15544
15545 /* e2 = e1 / (e1 + 1.0) + e1 */
15546 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15547 emit_insn (gen_addxf3 (e2, e1, cst1));
15548 emit_insn (gen_divxf3 (e2, e1, e2));
15549 emit_insn (gen_addxf3 (e2, e2, e1));
15550
15551 /* flags = signbit (op1) */
15552 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15553
15554 /* if (flags) then e2 = -e2 */
15555 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15556 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15557 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15558 pc_rtx);
15559 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15560 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15561 JUMP_LABEL (insn) = jump_label;
15562
15563 emit_insn (gen_negxf2 (e2, e2));
15564
15565 emit_label (jump_label);
15566 LABEL_NUSES (jump_label) = 1;
15567
15568 /* op0 = 0.5 * e2 */
15569 half = force_reg (XFmode, half);
15570 emit_insn (gen_mulxf3 (op0, e2, half));
15571 }
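
/* Editor's note: an illustrative standalone sketch (not GCC code) of the
   identity the sequence above implements, assuming C99 <math.h>.  With
   u = expm1 (|x|) = e^|x| - 1, we have e^|x| - e^-|x| = u / (u + 1) + u,
   so sinh (x) = sign (x) * 0.5 * (u / (u + 1) + u).  Illustration only.  */
#if 0
#include <math.h>

static double
sinh_via_expm1 (double x)
{
  double u = expm1 (fabs (x));           /* accurate even for tiny |x| */
  double r = 0.5 * (u / (u + 1.0) + u);
  return signbit (x) ? -r : r;           /* restore the sign of x */
}
#endif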
15572
15573 /* Output code to perform a cosh XFmode calculation. */
15574
15575 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15576 {
15577 rtx e1 = gen_reg_rtx (XFmode);
15578 rtx e2 = gen_reg_rtx (XFmode);
15579 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15580 rtx cst1;
15581
15582 /* e1 = exp (op1) */
15583 emit_insn (gen_expxf2 (e1, op1));
15584
15585 /* e2 = e1 + 1.0 / e1 */
15586 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15587 emit_insn (gen_divxf3 (e2, cst1, e1));
15588 emit_insn (gen_addxf3 (e2, e1, e2));
15589
15590 /* op0 = 0.5 * e2 */
15591 half = force_reg (XFmode, half);
15592 emit_insn (gen_mulxf3 (op0, e2, half));
15593 }
15594
15595 /* Output code to perform a tanh XFmode calculation. */
15596
15597 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15598 {
15599 rtx e1 = gen_reg_rtx (XFmode);
15600 rtx e2 = gen_reg_rtx (XFmode);
15601 rtx scratch = gen_reg_rtx (HImode);
15602 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15603 rtx cst2, tmp;
15604 rtx_code_label *jump_label = gen_label_rtx ();
15605 rtx_insn *insn;
15606
15607 /* scratch = fxam (op1) */
15608 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15609
15610 /* e1 = expm1 (-|2 * op1|) */
15611 emit_insn (gen_addxf3 (e2, op1, op1));
15612 emit_insn (gen_absxf2 (e2, e2));
15613 emit_insn (gen_negxf2 (e2, e2));
15614 emit_insn (gen_expm1xf2 (e1, e2));
15615
15616 /* e2 = e1 / (e1 + 2.0) */
15617 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15618 emit_insn (gen_addxf3 (e2, e1, cst2));
15619 emit_insn (gen_divxf3 (e2, e1, e2));
15620
15621 /* flags = signbit (op1) */
15622 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15623
15624 /* if (!flags) then e2 = -e2 */
15625 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15626 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15627 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15628 pc_rtx);
15629 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15630 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15631 JUMP_LABEL (insn) = jump_label;
15632
15633 emit_insn (gen_negxf2 (e2, e2));
15634
15635 emit_label (jump_label);
15636 LABEL_NUSES (jump_label) = 1;
15637
15638 emit_move_insn (op0, e2);
15639 }
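
/* Editor's note: a standalone sketch (not GCC code) of the identity above,
   assuming C99 <math.h>.  With u = expm1 (-2 * |x|),
   u / (u + 2) = (e^(-2|x|) - 1) / (e^(-2|x|) + 1) = -tanh (|x|), and the
   conditional negation restores the sign of x.  Illustration only.  */
#if 0
#include <math.h>

static double
tanh_via_expm1 (double x)
{
  double u = expm1 (-2.0 * fabs (x));
  double r = u / (u + 2.0);              /* == -tanh (|x|) */
  return signbit (x) ? r : -r;
}
#endif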
15640
15641 /* Output code to perform an asinh XFmode calculation. */
15642
15643 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15644 {
15645 rtx e1 = gen_reg_rtx (XFmode);
15646 rtx e2 = gen_reg_rtx (XFmode);
15647 rtx scratch = gen_reg_rtx (HImode);
15648 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15649 rtx cst1, tmp;
15650 rtx_code_label *jump_label = gen_label_rtx ();
15651 rtx_insn *insn;
15652
15653 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15654 emit_insn (gen_mulxf3 (e1, op1, op1));
15655 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15656 emit_insn (gen_addxf3 (e2, e1, cst1));
15657 emit_insn (gen_sqrtxf2 (e2, e2));
15658 emit_insn (gen_addxf3 (e2, e2, cst1));
15659
15660 /* e1 = e1 / e2 */
15661 emit_insn (gen_divxf3 (e1, e1, e2));
15662
15663 /* scratch = fxam (op1) */
15664 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15665
15666 /* e1 = e1 + |op1| */
15667 emit_insn (gen_absxf2 (e2, op1));
15668 emit_insn (gen_addxf3 (e1, e1, e2));
15669
15670 /* e2 = log1p (e1) */
15671 ix86_emit_i387_log1p (e2, e1);
15672
15673 /* flags = signbit (op1) */
15674 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15675
15676 /* if (flags) then e2 = -e2 */
15677 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15678 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15679 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15680 pc_rtx);
15681 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15682 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15683 JUMP_LABEL (insn) = jump_label;
15684
15685 emit_insn (gen_negxf2 (e2, e2));
15686
15687 emit_label (jump_label);
15688 LABEL_NUSES (jump_label) = 1;
15689
15690 emit_move_insn (op0, e2);
15691 }
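
/* Editor's note: a standalone sketch (not GCC code) of the identity above,
   assuming C99 <math.h>.  Since x^2 / (sqrt (x^2 + 1) + 1) equals
   sqrt (x^2 + 1) - 1, the log1p argument is |x| + sqrt (x^2 + 1) - 1, which
   gives asinh (|x|) without cancellation for small |x|.  Illustration
   only.  */
#if 0
#include <math.h>

static double
asinh_via_log1p (double x)
{
  double a = fabs (x);
  double t = a * a / (sqrt (a * a + 1.0) + 1.0);
  double r = log1p (a + t);              /* == log (a + sqrt (a*a + 1)) */
  return signbit (x) ? -r : r;
}
#endif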
15692
15693 /* Output code to perform an acosh XFmode calculation. */
15694
15695 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15696 {
15697 rtx e1 = gen_reg_rtx (XFmode);
15698 rtx e2 = gen_reg_rtx (XFmode);
15699 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15700
15701 /* e2 = sqrt (op1 + 1.0) */
15702 emit_insn (gen_addxf3 (e2, op1, cst1));
15703 emit_insn (gen_sqrtxf2 (e2, e2));
15704
15705 /* e1 = sqrt (op1 - 1.0) */
15706 emit_insn (gen_subxf3 (e1, op1, cst1));
15707 emit_insn (gen_sqrtxf2 (e1, e1));
15708
15709 /* e1 = e1 * e2 */
15710 emit_insn (gen_mulxf3 (e1, e1, e2));
15711
15712 /* e1 = e1 + op1 */
15713 emit_insn (gen_addxf3 (e1, e1, op1));
15714
15715 /* op0 = log (e1) */
15716 emit_insn (gen_logxf2 (op0, e1));
15717 }
15718
15719 /* Output code to perform an atanh XFmode calculation. */
15720
15721 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15722 {
15723 rtx e1 = gen_reg_rtx (XFmode);
15724 rtx e2 = gen_reg_rtx (XFmode);
15725 rtx scratch = gen_reg_rtx (HImode);
15726 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15727 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15728 rtx cst1, tmp;
15729 rtx_code_label *jump_label = gen_label_rtx ();
15730 rtx_insn *insn;
15731
15732 /* scratch = fxam (op1) */
15733 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15734
15735 /* e2 = |op1| */
15736 emit_insn (gen_absxf2 (e2, op1));
15737
15738 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15739 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15740 emit_insn (gen_addxf3 (e1, e2, cst1));
15741 emit_insn (gen_addxf3 (e2, e2, e2));
15742 emit_insn (gen_negxf2 (e2, e2));
15743 emit_insn (gen_divxf3 (e1, e2, e1));
15744
15745 /* e2 = log1p (e1) */
15746 ix86_emit_i387_log1p (e2, e1);
15747
15748 /* flags = signbit (op1) */
15749 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15750
15751 /* if (!flags) then e2 = -e2 */
15752 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15753 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15754 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15755 pc_rtx);
15756 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15757 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15758 JUMP_LABEL (insn) = jump_label;
15759
15760 emit_insn (gen_negxf2 (e2, e2));
15761
15762 emit_label (jump_label);
15763 LABEL_NUSES (jump_label) = 1;
15764
15765 /* op0 = 0.5 * e2 */
15766 half = force_reg (XFmode, half);
15767 emit_insn (gen_mulxf3 (op0, e2, half));
15768 }
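
/* Editor's note: a standalone sketch (not GCC code) of the identity above,
   assuming C99 <math.h>.  log1p (-2|x| / (|x| + 1)) equals
   log ((1 - |x|) / (1 + |x|)) = -2 * atanh (|x|), so halving and restoring
   the sign of x yields atanh (x).  Illustration only.  */
#if 0
#include <math.h>

static double
atanh_via_log1p (double x)
{
  double a = fabs (x);
  double r = 0.5 * log1p (-2.0 * a / (a + 1.0));   /* == -atanh (a) */
  return signbit (x) ? r : -r;
}
#endif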
15769
15770 /* Output code to perform a log1p XFmode calculation. */
15771
15772 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15773 {
15774 rtx_code_label *label1 = gen_label_rtx ();
15775 rtx_code_label *label2 = gen_label_rtx ();
15776
15777 rtx tmp = gen_reg_rtx (XFmode);
15778 rtx res = gen_reg_rtx (XFmode);
15779 rtx cst, cstln2, cst1;
15780 rtx_insn *insn;
15781
15782 cst = const_double_from_real_value
15783 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15784 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15785
15786 emit_insn (gen_absxf2 (tmp, op1));
15787
15788 cst = force_reg (XFmode, cst);
15789 ix86_expand_branch (GE, tmp, cst, label1);
15790 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15791 insn = get_last_insn ();
15792 JUMP_LABEL (insn) = label1;
15793
15794 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15795 emit_jump (label2);
15796
15797 emit_label (label1);
15798 LABEL_NUSES (label1) = 1;
15799
15800 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15801 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15802 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15803
15804 emit_label (label2);
15805 LABEL_NUSES (label2) = 1;
15806
15807 emit_move_insn (op0, res);
15808 }
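
/* Editor's note: a standalone sketch (not GCC code) of the branch above.
   fyl2xp1 computes y * log2 (x + 1) accurately but is only specified for
   |x| < 1 - sqrt(2)/2 ~= 0.2929; larger arguments fall back to fyl2x on
   1 + x.  log2p1_accurate below is a hypothetical stand-in for fyl2xp1,
   named here purely for illustration.  */
#if 0
#include <math.h>

extern double log2p1_accurate (double x);   /* hypothetical fyl2xp1 model */

static double
log1p_sketch (double x)
{
  const double ln2 = 0.69314718055994530942;   /* the fldln2 constant */
  if (fabs (x) < 0.29289321881345247561)       /* 1 - sqrt(2)/2 */
    return ln2 * log2p1_accurate (x);          /* fyl2xp1 path */
  return ln2 * log2 (1.0 + x);                 /* fyl2x path */
}
#endif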
15809
15810 /* Output code to perform an i387 round calculation: OP0 = round (OP1). */
15811 void ix86_emit_i387_round (rtx op0, rtx op1)
15812 {
15813 machine_mode inmode = GET_MODE (op1);
15814 machine_mode outmode = GET_MODE (op0);
15815 rtx e1 = gen_reg_rtx (XFmode);
15816 rtx e2 = gen_reg_rtx (XFmode);
15817 rtx scratch = gen_reg_rtx (HImode);
15818 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15819 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15820 rtx res = gen_reg_rtx (outmode);
15821 rtx_code_label *jump_label = gen_label_rtx ();
15822 rtx (*floor_insn) (rtx, rtx);
15823 rtx (*neg_insn) (rtx, rtx);
15824 rtx_insn *insn;
15825 rtx tmp;
15826
15827 switch (inmode)
15828 {
15829 case E_SFmode:
15830 case E_DFmode:
15831 tmp = gen_reg_rtx (XFmode);
15832
15833 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15834 op1 = tmp;
15835 break;
15836 case E_XFmode:
15837 break;
15838 default:
15839 gcc_unreachable ();
15840 }
15841
15842 switch (outmode)
15843 {
15844 case E_SFmode:
15845 floor_insn = gen_frndintxf2_floor;
15846 neg_insn = gen_negsf2;
15847 break;
15848 case E_DFmode:
15849 floor_insn = gen_frndintxf2_floor;
15850 neg_insn = gen_negdf2;
15851 break;
15852 case E_XFmode:
15853 floor_insn = gen_frndintxf2_floor;
15854 neg_insn = gen_negxf2;
15855 break;
15856 case E_HImode:
15857 floor_insn = gen_lfloorxfhi2;
15858 neg_insn = gen_neghi2;
15859 break;
15860 case E_SImode:
15861 floor_insn = gen_lfloorxfsi2;
15862 neg_insn = gen_negsi2;
15863 break;
15864 case E_DImode:
15865 floor_insn = gen_lfloorxfdi2;
15866 neg_insn = gen_negdi2;
15867 break;
15868 default:
15869 gcc_unreachable ();
15870 }
15871
15872 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15873
15874 /* scratch = fxam(op1) */
15875 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15876
15877 /* e1 = fabs(op1) */
15878 emit_insn (gen_absxf2 (e1, op1));
15879
15880 /* e2 = e1 + 0.5 */
15881 half = force_reg (XFmode, half);
15882 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15883
15884 /* res = floor(e2) */
15885 switch (outmode)
15886 {
15887 case E_SFmode:
15888 case E_DFmode:
15889 {
15890 tmp = gen_reg_rtx (XFmode);
15891
15892 emit_insn (floor_insn (tmp, e2));
15893 emit_insn (gen_rtx_SET (res,
15894 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15895 UNSPEC_TRUNC_NOOP)));
15896 }
15897 break;
15898 default:
15899 emit_insn (floor_insn (res, e2));
15900 }
15901
15902 /* flags = signbit(a) */
15903 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15904
15905 /* if (flags) then res = -res */
15906 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15907 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15908 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15909 pc_rtx);
15910 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15911 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15912 JUMP_LABEL (insn) = jump_label;
15913
15914 emit_insn (neg_insn (res, res));
15915
15916 emit_label (jump_label);
15917 LABEL_NUSES (jump_label) = 1;
15918
15919 emit_move_insn (op0, res);
15920 }
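
/* Editor's note: a standalone sketch (not GCC code) of the identity above,
   round (a) = sgn (a) * floor (|a| + 0.5), assuming C99 <math.h>.  The real
   sequence does the addition in XFmode and chooses floor/negate insns to
   match OUTMODE; illustration only.  */
#if 0
#include <math.h>

static double
round_sketch (double x)
{
  double r = floor (fabs (x) + 0.5);
  return signbit (x) ? -r : r;
}
#endif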
15921
15922 /* Output code to perform a Newton-Raphson approximation of a single precision
15923 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
15924
15925 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15926 {
15927 rtx x0, x1, e0, e1;
15928
15929 x0 = gen_reg_rtx (mode);
15930 e0 = gen_reg_rtx (mode);
15931 e1 = gen_reg_rtx (mode);
15932 x1 = gen_reg_rtx (mode);
15933
15934 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp(b))) */
15935
15936 b = force_reg (mode, b);
15937
15938 /* x0 = rcp(b) estimate */
15939 if (mode == V16SFmode || mode == V8DFmode)
15940 {
15941 if (TARGET_AVX512ER)
15942 {
15943 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15944 UNSPEC_RCP28)));
15945 /* res = a * x0 */
15946 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15947 return;
15948 }
15949 else
15950 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15951 UNSPEC_RCP14)));
15952 }
15953 else
15954 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15955 UNSPEC_RCP)));
15956
15957 /* e0 = x0 * b */
15958 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15959
15960 /* e0 = x0 * e0 */
15961 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15962
15963 /* e1 = x0 + x0 */
15964 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15965
15966 /* x1 = e1 - e0 */
15967 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15968
15969 /* res = a * x1 */
15970 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15971 }
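
/* Editor's note: a standalone sketch (not GCC code) of the refinement
   above.  x0 stands in for the low-precision rcpps/rcp14 estimate of 1/b;
   one Newton-Raphson step, x1 = x0 * (2 - b * x0) = 2*x0 - b*x0*x0, roughly
   doubles the number of correct bits before multiplying by a.  Illustration
   only.  */
#if 0
static float
swdiv_sketch (float a, float b)
{
  float x0 = 1.0f / b;               /* stands in for the rcp estimate */
  float e0 = b * x0 * x0;            /* e0 = x0 * b;  e0 = x0 * e0 */
  float e1 = x0 + x0;                /* e1 = 2 * x0 */
  return a * (e1 - e0);              /* res = a * x1 */
}
#endif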
15972
15973 /* Output code to perform a Newton-Raphson approximation of a
15974 single precision floating point [reciprocal] square root. */
15975
15976 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15977 {
15978 rtx x0, e0, e1, e2, e3, mthree, mhalf;
15979 REAL_VALUE_TYPE r;
15980 int unspec;
15981
15982 x0 = gen_reg_rtx (mode);
15983 e0 = gen_reg_rtx (mode);
15984 e1 = gen_reg_rtx (mode);
15985 e2 = gen_reg_rtx (mode);
15986 e3 = gen_reg_rtx (mode);
15987
15988 if (TARGET_AVX512ER && mode == V16SFmode)
15989 {
15990 if (recip)
15991 /* res = rsqrt28(a) estimate */
15992 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15993 UNSPEC_RSQRT28)));
15994 else
15995 {
15996 /* x0 = rsqrt28(a) estimate */
15997 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15998 UNSPEC_RSQRT28)));
15999 /* res = rcp28(x0) estimate */
16000 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
16001 UNSPEC_RCP28)));
16002 }
16003 return;
16004 }
16005
16006 real_from_integer (&r, VOIDmode, -3, SIGNED);
16007 mthree = const_double_from_real_value (r, SFmode);
16008
16009 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
16010 mhalf = const_double_from_real_value (r, SFmode);
16011 unspec = UNSPEC_RSQRT;
16012
16013 if (VECTOR_MODE_P (mode))
16014 {
16015 mthree = ix86_build_const_vector (mode, true, mthree);
16016 mhalf = ix86_build_const_vector (mode, true, mhalf);
16017 /* There is no 512-bit rsqrt.  There is, however, rsqrt14. */
16018 if (GET_MODE_SIZE (mode) == 64)
16019 unspec = UNSPEC_RSQRT14;
16020 }
16021
16022 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
16023 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
16024
16025 a = force_reg (mode, a);
16026
16027 /* x0 = rsqrt(a) estimate */
16028 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16029 unspec)));
16030
16031 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent a NaN for sqrt (0.0). */
16032 if (!recip)
16033 {
16034 rtx zero = force_reg (mode, CONST0_RTX(mode));
16035 rtx mask;
16036
16037 /* Handle masked compare. */
16038 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
16039 {
16040 mask = gen_reg_rtx (HImode);
16041 /* Imm value 0x4 corresponds to not-equal comparison. */
16042 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
16043 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
16044 }
16045 else
16046 {
16047 mask = gen_reg_rtx (mode);
16048 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
16049 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
16050 }
16051 }
16052
16053 mthree = force_reg (mode, mthree);
16054
16055 /* e0 = x0 * a */
16056 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
16057
16058 unsigned vector_size = GET_MODE_SIZE (mode);
16059 if (TARGET_FMA
16060 || (TARGET_AVX512F && vector_size == 64)
16061 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
16062 emit_insn (gen_rtx_SET (e2,
16063 gen_rtx_FMA (mode, e0, x0, mthree)));
16064 else
16065 {
16066 /* e1 = e0 * x0 */
16067 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
16068
16069 /* e2 = e1 - 3.0 (mthree holds -3.0, hence the PLUS). */
16070 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
16071 }
16072
16073 mhalf = force_reg (mode, mhalf);
16074 if (recip)
16075 /* e3 = -.5 * x0 */
16076 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
16077 else
16078 /* e3 = -.5 * e0 */
16079 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
16080 /* ret = e2 * e3 */
16081 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
16082 }
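
/* Editor's note: a standalone sketch (not GCC code) of the refinement
   above, assuming <math.h>.  x0 stands in for the rsqrtps/rsqrt14 estimate
   of 1/sqrt(a); one Newton-Raphson step gives
     rsqrt (a) ~= -0.5 * x0 * (a * x0 * x0 - 3)
     sqrt (a)  ~= -0.5 * (a * x0) * (a * x0 * x0 - 3)
   matching the recip/!recip choice of e3 above.  The real sequence also
   zeroes the estimate when a == 0.0 so that sqrt (0.0) does not become NaN;
   illustration only.  */
#if 0
#include <math.h>

static float
swsqrt_sketch (float a, int recip)
{
  float x0 = 1.0f / sqrtf (a);       /* stands in for the rsqrt estimate */
  float e0 = x0 * a;
  float e2 = e0 * x0 - 3.0f;         /* a * x0 * x0 - 3 */
  float e3 = -0.5f * (recip ? x0 : e0);
  return e2 * e3;                    /* rsqrt (a) or sqrt (a), refined */
}
#endif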
16083
16084 /* Expand fabs (OP0) and return a new rtx that holds the result. The
16085 mask for masking out the sign-bit is stored in *SMASK, if that is
16086 non-null. */
16087
16088 static rtx
16089 ix86_expand_sse_fabs (rtx op0, rtx *smask)
16090 {
16091 machine_mode vmode, mode = GET_MODE (op0);
16092 rtx xa, mask;
16093
16094 xa = gen_reg_rtx (mode);
16095 if (mode == SFmode)
16096 vmode = V4SFmode;
16097 else if (mode == DFmode)
16098 vmode = V2DFmode;
16099 else
16100 vmode = mode;
16101 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
16102 if (!VECTOR_MODE_P (mode))
16103 {
16104 /* We need to generate a scalar mode mask in this case. */
16105 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16106 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16107 mask = gen_reg_rtx (mode);
16108 emit_insn (gen_rtx_SET (mask, tmp));
16109 }
16110 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
16111
16112 if (smask)
16113 *smask = mask;
16114
16115 return xa;
16116 }
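
/* Editor's note: a standalone sketch (not GCC code) of what the mask-and
   above computes at the bit level, assuming <stdint.h> and <string.h>.
   Illustration only.  */
#if 0
#include <stdint.h>
#include <string.h>

static double
fabs_by_mask (double x)
{
  uint64_t bits, mask = ~(UINT64_C (1) << 63);   /* all bits but the sign */
  memcpy (&bits, &x, sizeof bits);
  bits &= mask;                                  /* clear the sign bit */
  memcpy (&x, &bits, sizeof x);
  return x;
}
#endif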
16117
16118 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
16119 swapping the operands if SWAP_OPERANDS is true. The expanded
16120 code is a forward jump to a newly created label in case the
16121 comparison is true. The generated label rtx is returned. */
16122 static rtx_code_label *
16123 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
16124 bool swap_operands)
16125 {
16126 bool unordered_compare = ix86_unordered_fp_compare (code);
16127 rtx_code_label *label;
16128 rtx tmp, reg;
16129
16130 if (swap_operands)
16131 std::swap (op0, op1);
16132
16133 label = gen_label_rtx ();
16134 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
16135 if (unordered_compare)
16136 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
16137 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
16138 emit_insn (gen_rtx_SET (reg, tmp));
16139 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
16140 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
16141 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
16142 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
16143 JUMP_LABEL (tmp) = label;
16144
16145 return label;
16146 }
16147
16148 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
16149 using comparison code CODE. Operands are swapped for the comparison if
16150 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
16151 static rtx
16152 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
16153 bool swap_operands)
16154 {
16155 rtx (*insn)(rtx, rtx, rtx, rtx);
16156 machine_mode mode = GET_MODE (op0);
16157 rtx mask = gen_reg_rtx (mode);
16158
16159 if (swap_operands)
16160 std::swap (op0, op1);
16161
16162 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
16163
16164 emit_insn (insn (mask, op0, op1,
16165 gen_rtx_fmt_ee (code, mode, op0, op1)));
16166 return mask;
16167 }
16168
16169 /* Copy the sign of SIGN onto the non-negative value ABS_VALUE, storing
16170 the result in RESULT.  If MASK is non-null, it must be a mask that clears
16171 the sign bit (i.e. the inverted sign-bit mask).  */
16172
16173 static void
16174 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
16175 {
16176 machine_mode mode = GET_MODE (sign);
16177 rtx sgn = gen_reg_rtx (mode);
16178 if (mask == NULL_RTX)
16179 {
16180 machine_mode vmode;
16181
16182 if (mode == SFmode)
16183 vmode = V4SFmode;
16184 else if (mode == DFmode)
16185 vmode = V2DFmode;
16186 else
16187 vmode = mode;
16188
16189 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
16190 if (!VECTOR_MODE_P (mode))
16191 {
16192 /* We need to generate a scalar mode mask in this case. */
16193 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16194 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16195 mask = gen_reg_rtx (mode);
16196 emit_insn (gen_rtx_SET (mask, tmp));
16197 }
16198 }
16199 else
16200 mask = gen_rtx_NOT (mode, mask);
16201 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
16202 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
16203 }
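
/* Editor's note: a standalone sketch (not GCC code) of the and/or sequence
   above, assuming <stdint.h> and <string.h>.  ABS_VALUE is already
   non-negative, so only the sign bit of SIGN has to be OR-ed in.
   Illustration only.  */
#if 0
#include <stdint.h>
#include <string.h>

static double
copysign_to_positive_sketch (double abs_value, double sign)
{
  uint64_t a, s, signbit_mask = UINT64_C (1) << 63;
  memcpy (&a, &abs_value, sizeof a);
  memcpy (&s, &sign, sizeof s);
  a |= s & signbit_mask;            /* sgn = mask & sign;  result |= sgn */
  memcpy (&abs_value, &a, sizeof abs_value);
  return abs_value;
}
#endif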
16204
16205 /* Expand SSE sequence for computing lround from OP1 storing
16206 into OP0. */
16207
16208 void
16209 ix86_expand_lround (rtx op0, rtx op1)
16210 {
16211 /* C code for the stuff we're doing below:
16212 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
16213 return (long)tmp;
16214 */
16215 machine_mode mode = GET_MODE (op1);
16216 const struct real_format *fmt;
16217 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16218 rtx adj;
16219
16220 /* load nextafter (0.5, 0.0) */
16221 fmt = REAL_MODE_FORMAT (mode);
16222 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16223 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16224
16225 /* adj = copysign (0.5, op1) */
16226 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
16227 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
16228
16229 /* adj = op1 + adj */
16230 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
16231
16232 /* op0 = (imode)adj */
16233 expand_fix (op0, adj, 0);
16234 }
16235
16236 /* Expand SSE2 sequence for computing lfloor or lceil
16237 from OPERAND1 storing into OPERAND0. */
16238
16239 void
16240 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
16241 {
16242 /* C code for the stuff we're doing below (for do_floor):
16243 xi = (long)op1;
16244 xi -= (double)xi > op1 ? 1 : 0;
16245 return xi;
16246 */
16247 machine_mode fmode = GET_MODE (op1);
16248 machine_mode imode = GET_MODE (op0);
16249 rtx ireg, freg, tmp;
16250 rtx_code_label *label;
16251
16252 /* reg = (long)op1 */
16253 ireg = gen_reg_rtx (imode);
16254 expand_fix (ireg, op1, 0);
16255
16256 /* freg = (double)reg */
16257 freg = gen_reg_rtx (fmode);
16258 expand_float (freg, ireg, 0);
16259
16260 /* ireg = (freg > op1) ? ireg - 1 : ireg */
16261 label = ix86_expand_sse_compare_and_jump (UNLE,
16262 freg, op1, !do_floor);
16263 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
16264 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
16265 emit_move_insn (ireg, tmp);
16266
16267 emit_label (label);
16268 LABEL_NUSES (label) = 1;
16269
16270 emit_move_insn (op0, ireg);
16271 }
16272
16273 /* Generate and return a rtx of mode MODE for 2**n where n is the number
16274 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
16275
16276 static rtx
16277 ix86_gen_TWO52 (machine_mode mode)
16278 {
16279 const struct real_format *fmt;
16280 REAL_VALUE_TYPE TWO52r;
16281 rtx TWO52;
16282
16283 fmt = REAL_MODE_FORMAT (mode);
16284 real_2expN (&TWO52r, fmt->p - 1, mode);
16285 TWO52 = const_double_from_real_value (TWO52r, mode);
16286 TWO52 = force_reg (mode, TWO52);
16287
16288 return TWO52;
16289 }
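
/* Editor's note: a standalone sketch (not GCC code) of why the TWO52
   constant is useful.  For 0 <= x < 2**52 the sum x + 2**52 has a unit in
   the last place of 1.0, so the addition discards every fraction bit of x
   (rounding in the current mode); subtracting 2**52 then leaves the rounded
   integer.  The volatile only keeps a compiler from folding the arithmetic
   away in this sketch; the expanders below emit the addition and
   subtraction as explicit insns.  Illustration only.  */
#if 0
static double
round_to_integer_two52 (double x)   /* valid for 0 <= x < 2**52 */
{
  volatile double t = x + 4503599627370496.0;    /* 2**52 */
  return t - 4503599627370496.0;
}
#endif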
16290
16291 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
16292
16293 void
16294 ix86_expand_rint (rtx operand0, rtx operand1)
16295 {
16296 /* C code for the stuff we're doing below:
16297 xa = fabs (operand1);
16298 if (!isless (xa, 2**52))
16299 return operand1;
16300 two52 = 2**52;
16301 if (flag_rounding_math)
16302 {
16303 two52 = copysign (two52, operand1);
16304 xa = operand1;
16305 }
16306 xa = xa + two52 - two52;
16307 return copysign (xa, operand1);
16308 */
16309 machine_mode mode = GET_MODE (operand0);
16310 rtx res, xa, TWO52, mask;
16311 rtx_code_label *label;
16312
16313 TWO52 = ix86_gen_TWO52 (mode);
16314
16315 /* Temporary for holding the result, initialized to the input
16316 operand to ease control flow. */
16317 res = copy_to_reg (operand1);
16318
16319 /* xa = abs (operand1) */
16320 xa = ix86_expand_sse_fabs (res, &mask);
16321
16322 /* if (!isless (xa, TWO52)) goto label; */
16323 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16324
16325 if (flag_rounding_math)
16326 {
16327 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
16328 xa = res;
16329 }
16330
16331 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16332 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16333
16334 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16335 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
16336 xa = ix86_expand_sse_fabs (xa, NULL);
16337
16338 ix86_sse_copysign_to_positive (res, xa, res, mask);
16339
16340 emit_label (label);
16341 LABEL_NUSES (label) = 1;
16342
16343 emit_move_insn (operand0, res);
16344 }
16345
16346 /* Expand SSE2 sequence for computing floor or ceil
16347 from OPERAND1 storing into OPERAND0. */
16348 void
16349 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
16350 {
16351 /* C code for the stuff we expand below.
16352 double xa = fabs (x), x2;
16353 if (!isless (xa, TWO52))
16354 return x;
16355 x2 = (double)(long)x;
16356
16357 Compensate. Floor:
16358 if (x2 > x)
16359 x2 -= 1;
16360 Compensate. Ceil:
16361 if (x2 < x)
16362 x2 += 1;
16363
16364 if (HONOR_SIGNED_ZEROS (mode))
16365 return copysign (x2, x);
16366 return x2;
16367 */
16368 machine_mode mode = GET_MODE (operand0);
16369 rtx xa, xi, TWO52, tmp, one, res, mask;
16370 rtx_code_label *label;
16371
16372 TWO52 = ix86_gen_TWO52 (mode);
16373
16374 /* Temporary for holding the result, initialized to the input
16375 operand to ease control flow. */
16376 res = copy_to_reg (operand1);
16377
16378 /* xa = abs (operand1) */
16379 xa = ix86_expand_sse_fabs (res, &mask);
16380
16381 /* if (!isless (xa, TWO52)) goto label; */
16382 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16383
16384 /* xa = (double)(long)x */
16385 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16386 expand_fix (xi, res, 0);
16387 expand_float (xa, xi, 0);
16388
16389 /* generate 1.0 */
16390 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16391
16392 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16393 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16394 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16395 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16396 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16397 if (HONOR_SIGNED_ZEROS (mode))
16398 {
16399 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16400 if (do_floor && flag_rounding_math)
16401 tmp = ix86_expand_sse_fabs (tmp, NULL);
16402
16403 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16404 }
16405 emit_move_insn (res, tmp);
16406
16407 emit_label (label);
16408 LABEL_NUSES (label) = 1;
16409
16410 emit_move_insn (operand0, res);
16411 }
16412
16413 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
16414 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16415 that is only available on 64bit targets. */
16416 void
16417 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
16418 {
16419 /* C code for the stuff we expand below.
16420 double xa = fabs (x), x2;
16421 if (!isless (xa, TWO52))
16422 return x;
16423 xa = xa + TWO52 - TWO52;
16424 x2 = copysign (xa, x);
16425
16426 Compensate. Floor:
16427 if (x2 > x)
16428 x2 -= 1;
16429 Compensate. Ceil:
16430 if (x2 < x)
16431 x2 += 1;
16432
16433 if (HONOR_SIGNED_ZEROS (mode))
16434 x2 = copysign (x2, x);
16435 return x2;
16436 */
16437 machine_mode mode = GET_MODE (operand0);
16438 rtx xa, TWO52, tmp, one, res, mask;
16439 rtx_code_label *label;
16440
16441 TWO52 = ix86_gen_TWO52 (mode);
16442
16443 /* Temporary for holding the result, initialized to the input
16444 operand to ease control flow. */
16445 res = copy_to_reg (operand1);
16446
16447 /* xa = abs (operand1) */
16448 xa = ix86_expand_sse_fabs (res, &mask);
16449
16450 /* if (!isless (xa, TWO52)) goto label; */
16451 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16452
16453 /* xa = xa + TWO52 - TWO52; */
16454 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16455 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16456
16457 /* xa = copysign (xa, operand1) */
16458 ix86_sse_copysign_to_positive (xa, xa, res, mask);
16459
16460 /* generate 1.0 */
16461 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16462
16463 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16464 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16465 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16466 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16467 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16468 if (HONOR_SIGNED_ZEROS (mode))
16469 {
16470 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16471 if (do_floor && flag_rounding_math)
16472 tmp = ix86_expand_sse_fabs (tmp, NULL);
16473
16474 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16475 }
16476 emit_move_insn (res, tmp);
16477
16478 emit_label (label);
16479 LABEL_NUSES (label) = 1;
16480
16481 emit_move_insn (operand0, res);
16482 }
16483
16484 /* Expand SSE sequence for computing trunc
16485 from OPERAND1 storing into OPERAND0. */
16486 void
16487 ix86_expand_trunc (rtx operand0, rtx operand1)
16488 {
16489 /* C code for SSE variant we expand below.
16490 double xa = fabs (x), x2;
16491 if (!isless (xa, TWO52))
16492 return x;
16493 x2 = (double)(long)x;
16494 if (HONOR_SIGNED_ZEROS (mode))
16495 return copysign (x2, x);
16496 return x2;
16497 */
16498 machine_mode mode = GET_MODE (operand0);
16499 rtx xa, xi, TWO52, res, mask;
16500 rtx_code_label *label;
16501
16502 TWO52 = ix86_gen_TWO52 (mode);
16503
16504 /* Temporary for holding the result, initialized to the input
16505 operand to ease control flow. */
16506 res = copy_to_reg (operand1);
16507
16508 /* xa = abs (operand1) */
16509 xa = ix86_expand_sse_fabs (res, &mask);
16510
16511 /* if (!isless (xa, TWO52)) goto label; */
16512 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16513
16514 /* xa = (double)(long)x */
16515 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16516 expand_fix (xi, res, 0);
16517 expand_float (xa, xi, 0);
16518
16519 if (HONOR_SIGNED_ZEROS (mode))
16520 ix86_sse_copysign_to_positive (xa, xa, res, mask);
16521
16522 emit_move_insn (res, xa);
16523
16524 emit_label (label);
16525 LABEL_NUSES (label) = 1;
16526
16527 emit_move_insn (operand0, res);
16528 }
16529
16530 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16531 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16532 that is only available on 64bit targets. */
16533 void
16534 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16535 {
16536 machine_mode mode = GET_MODE (operand0);
16537 rtx xa, xa2, TWO52, tmp, one, res, mask;
16538 rtx_code_label *label;
16539
16540 /* C code for SSE variant we expand below.
16541 double xa = fabs (x), x2;
16542 if (!isless (xa, TWO52))
16543 return x;
16544 xa2 = xa + TWO52 - TWO52;
16545 Compensate:
16546 if (xa2 > xa)
16547 xa2 -= 1.0;
16548 x2 = copysign (xa2, x);
16549 return x2;
16550 */
16551
16552 TWO52 = ix86_gen_TWO52 (mode);
16553
16554 /* Temporary for holding the result, initialized to the input
16555 operand to ease control flow. */
16556 res = copy_to_reg (operand1);
16557
16558 /* xa = abs (operand1) */
16559 xa = ix86_expand_sse_fabs (res, &mask);
16560
16561 /* if (!isless (xa, TWO52)) goto label; */
16562 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16563
16564 /* xa2 = xa + TWO52 - TWO52; */
16565 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16566 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16567
16568 /* generate 1.0 */
16569 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16570
16571 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
16572 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
16573 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16574 tmp = expand_simple_binop (mode, MINUS,
16575 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16576 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16577 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
16578 tmp = ix86_expand_sse_fabs (tmp, NULL);
16579
16580 /* res = copysign (xa2, operand1) */
16581 ix86_sse_copysign_to_positive (res, tmp, res, mask);
16582
16583 emit_label (label);
16584 LABEL_NUSES (label) = 1;
16585
16586 emit_move_insn (operand0, res);
16587 }
16588
16589 /* Expand SSE sequence for computing round
16590 from OPERAND1 storing into OPERAND0. */
16591 void
16592 ix86_expand_round (rtx operand0, rtx operand1)
16593 {
16594 /* C code for the stuff we're doing below:
16595 double xa = fabs (x);
16596 if (!isless (xa, TWO52))
16597 return x;
16598 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16599 return copysign (xa, x);
16600 */
16601 machine_mode mode = GET_MODE (operand0);
16602 rtx res, TWO52, xa, xi, half, mask;
16603 rtx_code_label *label;
16604 const struct real_format *fmt;
16605 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16606
16607 /* Temporary for holding the result, initialized to the input
16608 operand to ease control flow. */
16609 res = copy_to_reg (operand1);
16610
16611 TWO52 = ix86_gen_TWO52 (mode);
16612 xa = ix86_expand_sse_fabs (res, &mask);
16613 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16614
16615 /* load nextafter (0.5, 0.0) */
16616 fmt = REAL_MODE_FORMAT (mode);
16617 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16618 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16619
16620 /* xa = xa + 0.5 */
16621 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16622 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16623
16624 /* xa = (double)(int64_t)xa */
16625 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16626 expand_fix (xi, xa, 0);
16627 expand_float (xa, xi, 0);
16628
16629 /* res = copysign (xa, operand1) */
16630 ix86_sse_copysign_to_positive (res, xa, res, mask);
16631
16632 emit_label (label);
16633 LABEL_NUSES (label) = 1;
16634
16635 emit_move_insn (operand0, res);
16636 }
16637
16638 /* Expand SSE sequence for computing round from OPERAND1 storing
16639 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16640 that is only available on 64bit targets. */
16641 void
16642 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16643 {
16644 /* C code for the stuff we expand below.
16645 double xa = fabs (x), xa2, x2;
16646 if (!isless (xa, TWO52))
16647 return x;
16648 Using the absolute value and copying back sign makes
16649 -0.0 -> -0.0 correct.
16650 xa2 = xa + TWO52 - TWO52;
16651 Compensate.
16652 dxa = xa2 - xa;
16653 if (dxa <= -0.5)
16654 xa2 += 1;
16655 else if (dxa > 0.5)
16656 xa2 -= 1;
16657 x2 = copysign (xa2, x);
16658 return x2;
16659 */
16660 machine_mode mode = GET_MODE (operand0);
16661 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16662 rtx_code_label *label;
16663
16664 TWO52 = ix86_gen_TWO52 (mode);
16665
16666 /* Temporary for holding the result, initialized to the input
16667 operand to ease control flow. */
16668 res = copy_to_reg (operand1);
16669
16670 /* xa = abs (operand1) */
16671 xa = ix86_expand_sse_fabs (res, &mask);
16672
16673 /* if (!isless (xa, TWO52)) goto label; */
16674 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16675
16676 /* xa2 = xa + TWO52 - TWO52; */
16677 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16678 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16679
16680 /* dxa = xa2 - xa; */
16681 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16682
16683 /* generate 0.5, 1.0 and -0.5 */
16684 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16685 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16686 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16687 0, OPTAB_DIRECT);
16688
16689 /* Compensate. */
16690 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16691 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16692 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16693 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16694 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16695 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16696 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16697 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16698
16699 /* res = copysign (xa2, operand1) */
16700 ix86_sse_copysign_to_positive (res, xa2, res, mask);
16701
16702 emit_label (label);
16703 LABEL_NUSES (label) = 1;
16704
16705 emit_move_insn (operand0, res);
16706 }
16707
16708 /* Expand SSE sequence for computing round
16709 from OP1 storing into OP0 using sse4 round insn. */
16710 void
16711 ix86_expand_round_sse4 (rtx op0, rtx op1)
16712 {
16713 machine_mode mode = GET_MODE (op0);
16714 rtx e1, e2, res, half;
16715 const struct real_format *fmt;
16716 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16717 rtx (*gen_copysign) (rtx, rtx, rtx);
16718 rtx (*gen_round) (rtx, rtx, rtx);
16719
16720 switch (mode)
16721 {
16722 case E_SFmode:
16723 gen_copysign = gen_copysignsf3;
16724 gen_round = gen_sse4_1_roundsf2;
16725 break;
16726 case E_DFmode:
16727 gen_copysign = gen_copysigndf3;
16728 gen_round = gen_sse4_1_rounddf2;
16729 break;
16730 default:
16731 gcc_unreachable ();
16732 }
16733
16734 /* round (a) = trunc (a + copysign (0.5, a)) */
16735
16736 /* load nextafter (0.5, 0.0) */
16737 fmt = REAL_MODE_FORMAT (mode);
16738 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16739 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16740 half = const_double_from_real_value (pred_half, mode);
16741
16742 /* e1 = copysign (0.5, op1) */
16743 e1 = gen_reg_rtx (mode);
16744 emit_insn (gen_copysign (e1, half, op1));
16745
16746 /* e2 = op1 + e1 */
16747 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16748
16749 /* res = trunc (e2) */
16750 res = gen_reg_rtx (mode);
16751 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16752
16753 emit_move_insn (op0, res);
16754 }
16755
16756 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16757 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16758 insn every time. */
16759
16760 static GTY(()) rtx_insn *vselect_insn;
16761
16762 /* Initialize vselect_insn. */
16763
16764 static void
16765 init_vselect_insn (void)
16766 {
16767 unsigned i;
16768 rtx x;
16769
16770 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16771 for (i = 0; i < MAX_VECT_LEN; ++i)
16772 XVECEXP (x, 0, i) = const0_rtx;
16773 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16774 const0_rtx), x);
16775 x = gen_rtx_SET (const0_rtx, x);
16776 start_sequence ();
16777 vselect_insn = emit_insn (x);
16778 end_sequence ();
16779 }
16780
16781 /* Construct (set target (vec_select op0 (parallel perm))) and
16782 return true if that's a valid instruction in the active ISA. */
16783
16784 static bool
16785 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16786 unsigned nelt, bool testing_p)
16787 {
16788 unsigned int i;
16789 rtx x, save_vconcat;
16790 int icode;
16791
16792 if (vselect_insn == NULL_RTX)
16793 init_vselect_insn ();
16794
16795 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16796 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16797 for (i = 0; i < nelt; ++i)
16798 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16799 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16800 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16801 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16802 SET_DEST (PATTERN (vselect_insn)) = target;
16803 icode = recog_memoized (vselect_insn);
16804
16805 if (icode >= 0 && !testing_p)
16806 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16807
16808 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16809 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16810 INSN_CODE (vselect_insn) = -1;
16811
16812 return icode >= 0;
16813 }
16814
16815 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16816
16817 static bool
16818 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16819 const unsigned char *perm, unsigned nelt,
16820 bool testing_p)
16821 {
16822 machine_mode v2mode;
16823 rtx x;
16824 bool ok;
16825
16826 if (vselect_insn == NULL_RTX)
16827 init_vselect_insn ();
16828
16829 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16830 return false;
16831 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16832 PUT_MODE (x, v2mode);
16833 XEXP (x, 0) = op0;
16834 XEXP (x, 1) = op1;
16835 ok = expand_vselect (target, x, perm, nelt, testing_p);
16836 XEXP (x, 0) = const0_rtx;
16837 XEXP (x, 1) = const0_rtx;
16838 return ok;
16839 }
16840
16841 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16842 using movss or movsd. */
16843 static bool
16844 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16845 {
16846 machine_mode vmode = d->vmode;
16847 unsigned i, nelt = d->nelt;
16848 rtx x;
16849
16850 if (d->one_operand_p)
16851 return false;
16852
16853 if (!(TARGET_SSE && vmode == V4SFmode)
16854 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
16855 && !(TARGET_SSE2 && vmode == V2DFmode))
16856 return false;
16857
16858 /* Only the first element is changed. */
16859 if (d->perm[0] != nelt && d->perm[0] != 0)
16860 return false;
16861 for (i = 1; i < nelt; ++i)
16862 if (d->perm[i] != i + nelt - d->perm[0])
16863 return false;
16864
16865 if (d->testing_p)
16866 return true;
16867
16868 if (d->perm[0] == nelt)
16869 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16870 else
16871 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16872
16873 emit_insn (gen_rtx_SET (d->target, x));
16874
16875 return true;
16876 }
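
/* Editor's note: a standalone sketch (not GCC code) of the permutation
   convention used by these routines: PERM[i] < NELT selects element PERM[i]
   of OP0, otherwise element PERM[i] - NELT of OP1.  The movs{s,d} case
   above is the permutation that replaces only element 0, e.g.
   { 4, 1, 2, 3 } for V4SF.  Illustration only.  */
#if 0
static void
vec_perm_model (const float *op0, const float *op1,
		const unsigned char *perm, unsigned nelt, float *out)
{
  for (unsigned i = 0; i < nelt; i++)
    out[i] = perm[i] < nelt ? op0[perm[i]] : op1[perm[i] - nelt];
}
#endif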
16877
16878 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16879 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16880
16881 static bool
16882 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16883 {
16884 machine_mode mmode, vmode = d->vmode;
16885 unsigned i, nelt = d->nelt;
16886 unsigned HOST_WIDE_INT mask;
16887 rtx target, op0, op1, maskop, x;
16888 rtx rperm[32], vperm;
16889
16890 if (d->one_operand_p)
16891 return false;
16892 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16893 && (TARGET_AVX512BW
16894 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16895 ;
16896 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16897 ;
16898 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16899 ;
16900 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16901 ;
16902 else
16903 return false;
16904
16905 /* This is a blend, not a permute. Elements must stay in their
16906 respective lanes. */
16907 for (i = 0; i < nelt; ++i)
16908 {
16909 unsigned e = d->perm[i];
16910 if (!(e == i || e == i + nelt))
16911 return false;
16912 }
16913
16914 if (d->testing_p)
16915 return true;
16916
16917 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16918 decision should be extracted elsewhere, so that we only try that
16919 sequence once all budget==3 options have been tried. */
16920 target = d->target;
16921 op0 = d->op0;
16922 op1 = d->op1;
16923 mask = 0;
16924
16925 switch (vmode)
16926 {
16927 case E_V8DFmode:
16928 case E_V16SFmode:
16929 case E_V4DFmode:
16930 case E_V8SFmode:
16931 case E_V2DFmode:
16932 case E_V4SFmode:
16933 case E_V8HImode:
16934 case E_V8SImode:
16935 case E_V32HImode:
16936 case E_V64QImode:
16937 case E_V16SImode:
16938 case E_V8DImode:
16939 for (i = 0; i < nelt; ++i)
16940 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16941 break;
16942
16943 case E_V2DImode:
16944 for (i = 0; i < 2; ++i)
16945 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16946 vmode = V8HImode;
16947 goto do_subreg;
16948
16949 case E_V4SImode:
16950 for (i = 0; i < 4; ++i)
16951 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16952 vmode = V8HImode;
16953 goto do_subreg;
16954
16955 case E_V16QImode:
16956 /* See if bytes move in pairs so we can use pblendw with
16957 an immediate argument, rather than pblendvb with a vector
16958 argument. */
16959 for (i = 0; i < 16; i += 2)
16960 if (d->perm[i] + 1 != d->perm[i + 1])
16961 {
16962 use_pblendvb:
16963 for (i = 0; i < nelt; ++i)
16964 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16965
16966 finish_pblendvb:
16967 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16968 vperm = force_reg (vmode, vperm);
16969
16970 if (GET_MODE_SIZE (vmode) == 16)
16971 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16972 else
16973 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16974 if (target != d->target)
16975 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16976 return true;
16977 }
16978
16979 for (i = 0; i < 8; ++i)
16980 mask |= (d->perm[i * 2] >= 16) << i;
16981 vmode = V8HImode;
16982 /* FALLTHRU */
16983
16984 do_subreg:
16985 target = gen_reg_rtx (vmode);
16986 op0 = gen_lowpart (vmode, op0);
16987 op1 = gen_lowpart (vmode, op1);
16988 break;
16989
16990 case E_V32QImode:
16991 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16992 for (i = 0; i < 32; i += 2)
16993 if (d->perm[i] + 1 != d->perm[i + 1])
16994 goto use_pblendvb;
16995 /* See if bytes move in quadruplets. If yes, vpblendd
16996 with immediate can be used. */
16997 for (i = 0; i < 32; i += 4)
16998 if (d->perm[i] + 2 != d->perm[i + 2])
16999 break;
17000 if (i < 32)
17001 {
17002 /* See if bytes move the same in both lanes. If yes,
17003 vpblendw with immediate can be used. */
17004 for (i = 0; i < 16; i += 2)
17005 if (d->perm[i] + 16 != d->perm[i + 16])
17006 goto use_pblendvb;
17007
17008 /* Use vpblendw. */
17009 for (i = 0; i < 16; ++i)
17010 mask |= (d->perm[i * 2] >= 32) << i;
17011 vmode = V16HImode;
17012 goto do_subreg;
17013 }
17014
17015 /* Use vpblendd. */
17016 for (i = 0; i < 8; ++i)
17017 mask |= (d->perm[i * 4] >= 32) << i;
17018 vmode = V8SImode;
17019 goto do_subreg;
17020
17021 case E_V16HImode:
17022 /* See if words move in pairs. If yes, vpblendd can be used. */
17023 for (i = 0; i < 16; i += 2)
17024 if (d->perm[i] + 1 != d->perm[i + 1])
17025 break;
17026 if (i < 16)
17027 {
17028 /* See if words move the same in both lanes. If not,
17029 vpblendvb must be used. */
17030 for (i = 0; i < 8; i++)
17031 if (d->perm[i] + 8 != d->perm[i + 8])
17032 {
17033 /* Use vpblendvb. */
17034 for (i = 0; i < 32; ++i)
17035 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
17036
17037 vmode = V32QImode;
17038 nelt = 32;
17039 target = gen_reg_rtx (vmode);
17040 op0 = gen_lowpart (vmode, op0);
17041 op1 = gen_lowpart (vmode, op1);
17042 goto finish_pblendvb;
17043 }
17044
17045 /* Use vpblendw. */
17046 for (i = 0; i < 16; ++i)
17047 mask |= (d->perm[i] >= 16) << i;
17048 break;
17049 }
17050
17051 /* Use vpblendd. */
17052 for (i = 0; i < 8; ++i)
17053 mask |= (d->perm[i * 2] >= 16) << i;
17054 vmode = V8SImode;
17055 goto do_subreg;
17056
17057 case E_V4DImode:
17058 /* Use vpblendd. */
17059 for (i = 0; i < 4; ++i)
17060 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
17061 vmode = V8SImode;
17062 goto do_subreg;
17063
17064 default:
17065 gcc_unreachable ();
17066 }
17067
17068 switch (vmode)
17069 {
17070 case E_V8DFmode:
17071 case E_V8DImode:
17072 mmode = QImode;
17073 break;
17074 case E_V16SFmode:
17075 case E_V16SImode:
17076 mmode = HImode;
17077 break;
17078 case E_V32HImode:
17079 mmode = SImode;
17080 break;
17081 case E_V64QImode:
17082 mmode = DImode;
17083 break;
17084 default:
17085 mmode = VOIDmode;
17086 }
17087
17088 if (mmode != VOIDmode)
17089 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
17090 else
17091 maskop = GEN_INT (mask);
17092
17093 /* This matches five different patterns with the different modes. */
17094 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
17095 x = gen_rtx_SET (target, x);
17096 emit_insn (x);
17097 if (target != d->target)
17098 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17099
17100 return true;
17101 }
17102
17103 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17104 in terms of the variable form of vpermilps.
17105
17106 Note that we will have already failed the immediate input vpermilps,
17107 which requires that the high and low part shuffle be identical; the
17108 variable form doesn't require that. */
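/* E.g. for V8SFmode and perm = { 1, 0, 3, 2, 6, 7, 4, 5 } the control
   vector built below is { 1, 0, 3, 2, 2, 3, 0, 1 }, since within each
   128-bit lane the elements are renumbered from 0.  */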
17109
17110 static bool
17111 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
17112 {
17113 rtx rperm[8], vperm;
17114 unsigned i;
17115
17116 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
17117 return false;
17118
17119 /* We can only permute within the 128-bit lane. */
17120 for (i = 0; i < 8; ++i)
17121 {
17122 unsigned e = d->perm[i];
17123 if (i < 4 ? e >= 4 : e < 4)
17124 return false;
17125 }
17126
17127 if (d->testing_p)
17128 return true;
17129
17130 for (i = 0; i < 8; ++i)
17131 {
17132 unsigned e = d->perm[i];
17133
17134 /* Within each 128-bit lane, the elements of op0 are numbered
17135 from 0 and the elements of op1 are numbered from 4. */
17136 if (e >= 8 + 4)
17137 e -= 8;
17138 else if (e >= 4)
17139 e -= 4;
17140
17141 rperm[i] = GEN_INT (e);
17142 }
17143
17144 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
17145 vperm = force_reg (V8SImode, vperm);
17146 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
17147
17148 return true;
17149 }
17150
17151 /* Return true if permutation D can be performed as VMODE permutation
17152 instead. */
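/* For example, a V16QImode permutation is valid as a V8HImode
   permutation when the bytes move in aligned, consecutive pairs
   (chunk == 2): { 2, 3, 0, 1, 6, 7, 4, 5, ... } qualifies, while
   { 1, 0, ... } does not.  */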
17153
17154 static bool
17155 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
17156 {
17157 unsigned int i, j, chunk;
17158
17159 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
17160 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
17161 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
17162 return false;
17163
17164 if (GET_MODE_NUNITS (vmode) >= d->nelt)
17165 return true;
17166
17167 chunk = d->nelt / GET_MODE_NUNITS (vmode);
17168 for (i = 0; i < d->nelt; i += chunk)
17169 if (d->perm[i] & (chunk - 1))
17170 return false;
17171 else
17172 for (j = 1; j < chunk; ++j)
17173 if (d->perm[i] + j != d->perm[i + j])
17174 return false;
17175
17176 return true;
17177 }
17178
17179 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17180 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
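/* In the simplest case, a one-operand V16QImode permutation such as the
   byte reversal { 15, 14, ..., 1, 0 } is emitted below as a single
   pshufb whose control vector is the permutation itself (each index
   masked with nelt - 1).  */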
17181
17182 static bool
17183 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
17184 {
17185 unsigned i, nelt, eltsz, mask;
17186 unsigned char perm[64];
17187 machine_mode vmode = V16QImode;
17188 rtx rperm[64], vperm, target, op0, op1;
17189
17190 nelt = d->nelt;
17191
17192 if (!d->one_operand_p)
17193 {
17194 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
17195 {
17196 if (TARGET_AVX2
17197 && valid_perm_using_mode_p (V2TImode, d))
17198 {
17199 if (d->testing_p)
17200 return true;
17201
17202 /* Use vperm2i128 insn. The pattern uses
17203 V4DImode instead of V2TImode. */
17204 target = d->target;
17205 if (d->vmode != V4DImode)
17206 target = gen_reg_rtx (V4DImode);
17207 op0 = gen_lowpart (V4DImode, d->op0);
17208 op1 = gen_lowpart (V4DImode, d->op1);
17209 rperm[0]
17210 = GEN_INT ((d->perm[0] / (nelt / 2))
17211 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
17212 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
17213 if (target != d->target)
17214 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17215 return true;
17216 }
17217 return false;
17218 }
17219 }
17220 else
17221 {
17222 if (GET_MODE_SIZE (d->vmode) == 16)
17223 {
17224 if (!TARGET_SSSE3)
17225 return false;
17226 }
17227 else if (GET_MODE_SIZE (d->vmode) == 32)
17228 {
17229 if (!TARGET_AVX2)
17230 return false;
17231
17232 /* V4DImode should be already handled through
17233 expand_vselect by vpermq instruction. */
17234 gcc_assert (d->vmode != V4DImode);
17235
17236 vmode = V32QImode;
17237 if (d->vmode == V8SImode
17238 || d->vmode == V16HImode
17239 || d->vmode == V32QImode)
17240 {
17241 /* First see if vpermq can be used for
17242 V8SImode/V16HImode/V32QImode. */
17243 if (valid_perm_using_mode_p (V4DImode, d))
17244 {
17245 for (i = 0; i < 4; i++)
17246 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
17247 if (d->testing_p)
17248 return true;
17249 target = gen_reg_rtx (V4DImode);
17250 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
17251 perm, 4, false))
17252 {
17253 emit_move_insn (d->target,
17254 gen_lowpart (d->vmode, target));
17255 return true;
17256 }
17257 return false;
17258 }
17259
17260 /* Next see if vpermd can be used. */
17261 if (valid_perm_using_mode_p (V8SImode, d))
17262 vmode = V8SImode;
17263 }
17264 /* Or if vpermps can be used. */
17265 else if (d->vmode == V8SFmode)
17266 vmode = V8SImode;
17267
17268 if (vmode == V32QImode)
17269 {
17270 /* vpshufb only works intra lanes; it is not
17271 possible to shuffle bytes between the lanes. */
17272 for (i = 0; i < nelt; ++i)
17273 if ((d->perm[i] ^ i) & (nelt / 2))
17274 return false;
17275 }
17276 }
17277 else if (GET_MODE_SIZE (d->vmode) == 64)
17278 {
17279 if (!TARGET_AVX512BW)
17280 return false;
17281
17282 /* If vpermq didn't work, vpshufb won't work either. */
17283 if (d->vmode == V8DFmode || d->vmode == V8DImode)
17284 return false;
17285
17286 vmode = V64QImode;
17287 if (d->vmode == V16SImode
17288 || d->vmode == V32HImode
17289 || d->vmode == V64QImode)
17290 {
17291 /* First see if vpermq can be used for
17292 V16SImode/V32HImode/V64QImode. */
17293 if (valid_perm_using_mode_p (V8DImode, d))
17294 {
17295 for (i = 0; i < 8; i++)
17296 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
17297 if (d->testing_p)
17298 return true;
17299 target = gen_reg_rtx (V8DImode);
17300 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
17301 perm, 8, false))
17302 {
17303 emit_move_insn (d->target,
17304 gen_lowpart (d->vmode, target));
17305 return true;
17306 }
17307 return false;
17308 }
17309
17310 /* Next see if vpermd can be used. */
17311 if (valid_perm_using_mode_p (V16SImode, d))
17312 vmode = V16SImode;
17313 }
17314 /* Or if vpermps can be used. */
17315 else if (d->vmode == V16SFmode)
17316 vmode = V16SImode;
17317 if (vmode == V64QImode)
17318 {
17319 /* vpshufb only works intra lanes; it is not
17320 possible to shuffle bytes between the lanes. */
17321 for (i = 0; i < nelt; ++i)
17322 if ((d->perm[i] ^ i) & (3 * nelt / 4))
17323 return false;
17324 }
17325 }
17326 else
17327 return false;
17328 }
17329
17330 if (d->testing_p)
17331 return true;
17332
17333 if (vmode == V8SImode)
17334 for (i = 0; i < 8; ++i)
17335 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
17336 else if (vmode == V16SImode)
17337 for (i = 0; i < 16; ++i)
17338 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
17339 else
17340 {
17341 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
17342 if (!d->one_operand_p)
17343 mask = 2 * nelt - 1;
17344 else if (vmode == V16QImode)
17345 mask = nelt - 1;
17346 else if (vmode == V64QImode)
17347 mask = nelt / 4 - 1;
17348 else
17349 mask = nelt / 2 - 1;
17350
17351 for (i = 0; i < nelt; ++i)
17352 {
17353 unsigned j, e = d->perm[i] & mask;
17354 for (j = 0; j < eltsz; ++j)
17355 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
17356 }
17357 }
17358
17359 vperm = gen_rtx_CONST_VECTOR (vmode,
17360 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
17361 vperm = force_reg (vmode, vperm);
17362
17363 target = d->target;
17364 if (d->vmode != vmode)
17365 target = gen_reg_rtx (vmode);
17366 op0 = gen_lowpart (vmode, d->op0);
17367 if (d->one_operand_p)
17368 {
17369 if (vmode == V16QImode)
17370 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
17371 else if (vmode == V32QImode)
17372 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
17373 else if (vmode == V64QImode)
17374 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
17375 else if (vmode == V8SFmode)
17376 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
17377 else if (vmode == V8SImode)
17378 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
17379 else if (vmode == V16SFmode)
17380 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
17381 else if (vmode == V16SImode)
17382 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
17383 else
17384 gcc_unreachable ();
17385 }
17386 else
17387 {
17388 op1 = gen_lowpart (vmode, d->op1);
17389 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
17390 }
17391 if (target != d->target)
17392 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17393
17394 return true;
17395 }
17396
17397 /* For V*[QHS]Imode permutations, check whether the same permutation
17398 can be performed in a 2x, 4x or 8x wider inner mode. */
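/* For example, the V16QImode permutation
   { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 }
   moves bytes in aligned pairs and is rewritten as the V8HImode
   permutation { 1, 0, 3, 2, 5, 4, 7, 6 }; the recursive call then
   stops because the halfwords no longer move in aligned pairs.  */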
17399
17400 static bool
17401 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
17402 struct expand_vec_perm_d *nd)
17403 {
17404 int i;
17405 machine_mode mode = VOIDmode;
17406
17407 switch (d->vmode)
17408 {
17409 case E_V16QImode: mode = V8HImode; break;
17410 case E_V32QImode: mode = V16HImode; break;
17411 case E_V64QImode: mode = V32HImode; break;
17412 case E_V8HImode: mode = V4SImode; break;
17413 case E_V16HImode: mode = V8SImode; break;
17414 case E_V32HImode: mode = V16SImode; break;
17415 case E_V4SImode: mode = V2DImode; break;
17416 case E_V8SImode: mode = V4DImode; break;
17417 case E_V16SImode: mode = V8DImode; break;
17418 default: return false;
17419 }
17420 for (i = 0; i < d->nelt; i += 2)
17421 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
17422 return false;
17423 nd->vmode = mode;
17424 nd->nelt = d->nelt / 2;
17425 for (i = 0; i < nd->nelt; i++)
17426 nd->perm[i] = d->perm[2 * i] / 2;
17427 if (GET_MODE_INNER (mode) != DImode)
17428 canonicalize_vector_int_perm (nd, nd);
17429 if (nd != d)
17430 {
17431 nd->one_operand_p = d->one_operand_p;
17432 nd->testing_p = d->testing_p;
17433 if (d->op0 == d->op1)
17434 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
17435 else
17436 {
17437 nd->op0 = gen_lowpart (nd->vmode, d->op0);
17438 nd->op1 = gen_lowpart (nd->vmode, d->op1);
17439 }
17440 if (d->testing_p)
17441 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
17442 else
17443 nd->target = gen_reg_rtx (nd->vmode);
17444 }
17445 return true;
17446 }
17447
17448 /* Try to expand one-operand permutation with constant mask. */
17449
17450 static bool
17451 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
17452 {
17453 machine_mode mode = GET_MODE (d->op0);
17454 machine_mode maskmode = mode;
17455 rtx (*gen) (rtx, rtx, rtx) = NULL;
17456 rtx target, op0, mask;
17457 rtx vec[64];
17458
17459 if (!rtx_equal_p (d->op0, d->op1))
17460 return false;
17461
17462 if (!TARGET_AVX512F)
17463 return false;
17464
17465 switch (mode)
17466 {
17467 case E_V16SImode:
17468 gen = gen_avx512f_permvarv16si;
17469 break;
17470 case E_V16SFmode:
17471 gen = gen_avx512f_permvarv16sf;
17472 maskmode = V16SImode;
17473 break;
17474 case E_V8DImode:
17475 gen = gen_avx512f_permvarv8di;
17476 break;
17477 case E_V8DFmode:
17478 gen = gen_avx512f_permvarv8df;
17479 maskmode = V8DImode;
17480 break;
17481 default:
17482 return false;
17483 }
17484
17485 target = d->target;
17486 op0 = d->op0;
17487 for (int i = 0; i < d->nelt; ++i)
17488 vec[i] = GEN_INT (d->perm[i]);
17489 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17490 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17491 return true;
17492 }
17493
17494 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17495
17496 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
17497 in a single instruction. */
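/* E.g. an identity permutation degenerates to a plain move below, and a
   one-operand permutation whose masked indices are all zero, such as
   { 0, 0, 0, 0, 0, 0, 0, 0 } for V8SImode, is handled with a single
   vpbroadcastd when TARGET_AVX2.  */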
17498
17499 static bool
17500 expand_vec_perm_1 (struct expand_vec_perm_d *d)
17501 {
17502 unsigned i, nelt = d->nelt;
17503 struct expand_vec_perm_d nd;
17504
17505 /* Check plain VEC_SELECT first, because AVX has instructions that could
17506 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17507 input where SEL+CONCAT may not. */
17508 if (d->one_operand_p)
17509 {
17510 int mask = nelt - 1;
17511 bool identity_perm = true;
17512 bool broadcast_perm = true;
17513
17514 for (i = 0; i < nelt; i++)
17515 {
17516 nd.perm[i] = d->perm[i] & mask;
17517 if (nd.perm[i] != i)
17518 identity_perm = false;
17519 if (nd.perm[i])
17520 broadcast_perm = false;
17521 }
17522
17523 if (identity_perm)
17524 {
17525 if (!d->testing_p)
17526 emit_move_insn (d->target, d->op0);
17527 return true;
17528 }
17529 else if (broadcast_perm && TARGET_AVX2)
17530 {
17531 /* Use vpbroadcast{b,w,d}. */
17532 rtx (*gen) (rtx, rtx) = NULL;
17533 switch (d->vmode)
17534 {
17535 case E_V64QImode:
17536 if (TARGET_AVX512BW)
17537 gen = gen_avx512bw_vec_dupv64qi_1;
17538 break;
17539 case E_V32QImode:
17540 gen = gen_avx2_pbroadcastv32qi_1;
17541 break;
17542 case E_V32HImode:
17543 if (TARGET_AVX512BW)
17544 gen = gen_avx512bw_vec_dupv32hi_1;
17545 break;
17546 case E_V16HImode:
17547 gen = gen_avx2_pbroadcastv16hi_1;
17548 break;
17549 case E_V16SImode:
17550 if (TARGET_AVX512F)
17551 gen = gen_avx512f_vec_dupv16si_1;
17552 break;
17553 case E_V8SImode:
17554 gen = gen_avx2_pbroadcastv8si_1;
17555 break;
17556 case E_V16QImode:
17557 gen = gen_avx2_pbroadcastv16qi;
17558 break;
17559 case E_V8HImode:
17560 gen = gen_avx2_pbroadcastv8hi;
17561 break;
17562 case E_V16SFmode:
17563 if (TARGET_AVX512F)
17564 gen = gen_avx512f_vec_dupv16sf_1;
17565 break;
17566 case E_V8SFmode:
17567 gen = gen_avx2_vec_dupv8sf_1;
17568 break;
17569 case E_V8DFmode:
17570 if (TARGET_AVX512F)
17571 gen = gen_avx512f_vec_dupv8df_1;
17572 break;
17573 case E_V8DImode:
17574 if (TARGET_AVX512F)
17575 gen = gen_avx512f_vec_dupv8di_1;
17576 break;
17577 /* For other modes prefer other shuffles this function creates. */
17578 default: break;
17579 }
17580 if (gen != NULL)
17581 {
17582 if (!d->testing_p)
17583 emit_insn (gen (d->target, d->op0));
17584 return true;
17585 }
17586 }
17587
17588 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17589 return true;
17590
17591 /* There are plenty of patterns in sse.md that are written for
17592 SEL+CONCAT and are not replicated for a single op. Perhaps
17593 that should be changed, to avoid the nastiness here. */
17594
17595 /* Recognize interleave style patterns, which means incrementing
17596 every other permutation operand. */
17597 for (i = 0; i < nelt; i += 2)
17598 {
17599 nd.perm[i] = d->perm[i] & mask;
17600 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17601 }
17602 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17603 d->testing_p))
17604 return true;
17605
17606 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17607 if (nelt >= 4)
17608 {
17609 for (i = 0; i < nelt; i += 4)
17610 {
17611 nd.perm[i + 0] = d->perm[i + 0] & mask;
17612 nd.perm[i + 1] = d->perm[i + 1] & mask;
17613 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17614 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17615 }
17616
17617 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17618 d->testing_p))
17619 return true;
17620 }
17621 }
17622
17623 /* Try movss/movsd instructions. */
17624 if (expand_vec_perm_movs (d))
17625 return true;
17626
17627 /* Finally, try the fully general two operand permute. */
17628 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17629 d->testing_p))
17630 return true;
17631
17632 /* Recognize interleave style patterns with reversed operands. */
17633 if (!d->one_operand_p)
17634 {
17635 for (i = 0; i < nelt; ++i)
17636 {
17637 unsigned e = d->perm[i];
17638 if (e >= nelt)
17639 e -= nelt;
17640 else
17641 e += nelt;
17642 nd.perm[i] = e;
17643 }
17644
17645 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17646 d->testing_p))
17647 return true;
17648 }
17649
17650 /* Try the SSE4.1 blend variable merge instructions. */
17651 if (expand_vec_perm_blend (d))
17652 return true;
17653
17654 /* Try one of the AVX vpermil variable permutations. */
17655 if (expand_vec_perm_vpermil (d))
17656 return true;
17657
17658 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17659 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17660 if (expand_vec_perm_pshufb (d))
17661 return true;
17662
17663 /* Try the AVX2 vpalignr instruction. */
17664 if (expand_vec_perm_palignr (d, true))
17665 return true;
17666
17667 /* Try the AVX512F vperm{s,d} instructions. */
17668 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17669 return true;
17670
17671 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17672 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17673 return true;
17674
17675 /* See if we can get the same permutation in different vector integer
17676 mode. */
17677 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17678 {
17679 if (!d->testing_p)
17680 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17681 return true;
17682 }
17683 return false;
17684 }
17685
17686 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17687 in terms of a pair of pshuflw + pshufhw instructions. */
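/* For example, the V8HImode permutation { 3, 2, 1, 0, 7, 6, 5, 4 }
   stays within the two 64-bit halves and is emitted as
   pshuflw { 3, 2, 1, 0, 4, 5, 6, 7 } followed by
   pshufhw { 0, 1, 2, 3, 7, 6, 5, 4 }.  */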
17688
17689 static bool
17690 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17691 {
17692 unsigned char perm2[MAX_VECT_LEN];
17693 unsigned i;
17694 bool ok;
17695
17696 if (d->vmode != V8HImode || !d->one_operand_p)
17697 return false;
17698
17699 /* The two permutations only operate in 64-bit lanes. */
17700 for (i = 0; i < 4; ++i)
17701 if (d->perm[i] >= 4)
17702 return false;
17703 for (i = 4; i < 8; ++i)
17704 if (d->perm[i] < 4)
17705 return false;
17706
17707 if (d->testing_p)
17708 return true;
17709
17710 /* Emit the pshuflw. */
17711 memcpy (perm2, d->perm, 4);
17712 for (i = 4; i < 8; ++i)
17713 perm2[i] = i;
17714 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17715 gcc_assert (ok);
17716
17717 /* Emit the pshufhw. */
17718 memcpy (perm2 + 4, d->perm + 4, 4);
17719 for (i = 0; i < 4; ++i)
17720 perm2[i] = i;
17721 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17722 gcc_assert (ok);
17723
17724 return true;
17725 }
17726
17727 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17728 the permutation using the SSSE3 palignr instruction. This succeeds
17729 when all of the elements in PERM fit within one vector and we merely
17730 need to shift them down so that a single vector permutation has a
17731 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
17732 the vpalignr instruction itself can perform the requested permutation. */
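/* For example, the two-operand V16QImode permutation
   { 3, 4, ..., 17, 18 } selects a contiguous 16-byte window of the
   op1:op0 concatenation; min == 3, the remaining permutation is the
   identity, and a single palignr with a 3-byte shift suffices.  */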
17733
17734 static bool
17735 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17736 {
17737 unsigned i, nelt = d->nelt;
17738 unsigned min, max, minswap, maxswap;
17739 bool in_order, ok, swap = false;
17740 rtx shift, target;
17741 struct expand_vec_perm_d dcopy;
17742
17743 /* Even with AVX, palignr only operates on 128-bit vectors;
17744 with AVX2, palignr operates on both 128-bit lanes. */
17745 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17746 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17747 return false;
17748
17749 min = 2 * nelt;
17750 max = 0;
17751 minswap = 2 * nelt;
17752 maxswap = 0;
17753 for (i = 0; i < nelt; ++i)
17754 {
17755 unsigned e = d->perm[i];
17756 unsigned eswap = d->perm[i] ^ nelt;
17757 if (GET_MODE_SIZE (d->vmode) == 32)
17758 {
17759 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17760 eswap = e ^ (nelt / 2);
17761 }
17762 if (e < min)
17763 min = e;
17764 if (e > max)
17765 max = e;
17766 if (eswap < minswap)
17767 minswap = eswap;
17768 if (eswap > maxswap)
17769 maxswap = eswap;
17770 }
17771 if (min == 0
17772 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17773 {
17774 if (d->one_operand_p
17775 || minswap == 0
17776 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17777 ? nelt / 2 : nelt))
17778 return false;
17779 swap = true;
17780 min = minswap;
17781 max = maxswap;
17782 }
17783
17784 /* Given that we have SSSE3, we know we'll be able to implement the
17785 single operand permutation after the palignr with pshufb for
17786 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17787 first. */
17788 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17789 return true;
17790
17791 dcopy = *d;
17792 if (swap)
17793 {
17794 dcopy.op0 = d->op1;
17795 dcopy.op1 = d->op0;
17796 for (i = 0; i < nelt; ++i)
17797 dcopy.perm[i] ^= nelt;
17798 }
17799
17800 in_order = true;
17801 for (i = 0; i < nelt; ++i)
17802 {
17803 unsigned e = dcopy.perm[i];
17804 if (GET_MODE_SIZE (d->vmode) == 32
17805 && e >= nelt
17806 && (e & (nelt / 2 - 1)) < min)
17807 e = e - min - (nelt / 2);
17808 else
17809 e = e - min;
17810 if (e != i)
17811 in_order = false;
17812 dcopy.perm[i] = e;
17813 }
17814 dcopy.one_operand_p = true;
17815
17816 if (single_insn_only_p && !in_order)
17817 return false;
17818
17819 /* For AVX2, test whether we can permute the result in one instruction. */
17820 if (d->testing_p)
17821 {
17822 if (in_order)
17823 return true;
17824 dcopy.op1 = dcopy.op0;
17825 return expand_vec_perm_1 (&dcopy);
17826 }
17827
17828 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17829 if (GET_MODE_SIZE (d->vmode) == 16)
17830 {
17831 target = gen_reg_rtx (TImode);
17832 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17833 gen_lowpart (TImode, dcopy.op0), shift));
17834 }
17835 else
17836 {
17837 target = gen_reg_rtx (V2TImode);
17838 emit_insn (gen_avx2_palignrv2ti (target,
17839 gen_lowpart (V2TImode, dcopy.op1),
17840 gen_lowpart (V2TImode, dcopy.op0),
17841 shift));
17842 }
17843
17844 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17845
17846 /* Test for the degenerate case where the alignment by itself
17847 produces the desired permutation. */
17848 if (in_order)
17849 {
17850 emit_move_insn (d->target, dcopy.op0);
17851 return true;
17852 }
17853
17854 ok = expand_vec_perm_1 (&dcopy);
17855 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17856
17857 return ok;
17858 }
17859
17860 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17861 the permutation using the SSE4_1 pblendv instruction. Potentially
17862 reduces the permutation from two pshufb insns and an ior to one pshufb and one pblendv. */
17863
17864 static bool
17865 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17866 {
17867 unsigned i, which, nelt = d->nelt;
17868 struct expand_vec_perm_d dcopy, dcopy1;
17869 machine_mode vmode = d->vmode;
17870 bool ok;
17871
17872 /* Use the same checks as in expand_vec_perm_blend. */
17873 if (d->one_operand_p)
17874 return false;
17875 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17876 ;
17877 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17878 ;
17879 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17880 ;
17881 else
17882 return false;
17883
17884 /* Figure out which permutation elements do not stay in their
17885 respective lanes. */
17886 for (i = 0, which = 0; i < nelt; ++i)
17887 {
17888 unsigned e = d->perm[i];
17889 if (e != i)
17890 which |= (e < nelt ? 1 : 2);
17891 }
17892 /* We can pblend the part where elements do not stay in their
17893 respective lanes only when these elements all come from one
17894 half of the permutation.
17895 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
17896 lanes, but both 8 and 9 are >= 8.
17897 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
17898 respective lanes, and 8 >= 8 but 2 is not. */
17899 if (which != 1 && which != 2)
17900 return false;
17901 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17902 return true;
17903
17904 /* First we apply a one-operand permutation to the part where
17905 elements do not stay in their respective lanes. */
17906 dcopy = *d;
17907 if (which == 2)
17908 dcopy.op0 = dcopy.op1 = d->op1;
17909 else
17910 dcopy.op0 = dcopy.op1 = d->op0;
17911 if (!d->testing_p)
17912 dcopy.target = gen_reg_rtx (vmode);
17913 dcopy.one_operand_p = true;
17914
17915 for (i = 0; i < nelt; ++i)
17916 dcopy.perm[i] = d->perm[i] & (nelt - 1);
17917
17918 ok = expand_vec_perm_1 (&dcopy);
17919 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17920 return false;
17921 else
17922 gcc_assert (ok);
17923 if (d->testing_p)
17924 return true;
17925
17926 /* Next we put permuted elements into their positions. */
17927 dcopy1 = *d;
17928 if (which == 2)
17929 dcopy1.op1 = dcopy.target;
17930 else
17931 dcopy1.op0 = dcopy.target;
17932
17933 for (i = 0; i < nelt; ++i)
17934 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17935
17936 ok = expand_vec_perm_blend (&dcopy1);
17937 gcc_assert (ok);
17938
17939 return true;
17940 }
17941
17942 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17943
17944 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17945 a two vector permutation into a single vector permutation by using
17946 an interleave operation to merge the vectors. */
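/* For example, the two-operand V4SImode permutation { 1, 5, 0, 4 }
   draws only on the low halves of both inputs, so it is remapped to an
   interleave low (punpckldq, giving { 0, 4, 1, 5 }) followed by the
   single-insn shuffle { 2, 3, 0, 1 } of the interleaved result.  */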
17947
17948 static bool
17949 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17950 {
17951 struct expand_vec_perm_d dremap, dfinal;
17952 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17953 unsigned HOST_WIDE_INT contents;
17954 unsigned char remap[2 * MAX_VECT_LEN];
17955 rtx_insn *seq;
17956 bool ok, same_halves = false;
17957
17958 if (GET_MODE_SIZE (d->vmode) == 16)
17959 {
17960 if (d->one_operand_p)
17961 return false;
17962 }
17963 else if (GET_MODE_SIZE (d->vmode) == 32)
17964 {
17965 if (!TARGET_AVX)
17966 return false;
17967 /* For 32-byte modes allow even d->one_operand_p.
17968 The lack of cross-lane shuffling in some instructions
17969 might prevent a single insn shuffle. */
17970 dfinal = *d;
17971 dfinal.testing_p = true;
17972 /* If expand_vec_perm_interleave3 can expand this into
17973 a 3 insn sequence, give up and let it be expanded as
17974 3 insn sequence. While that is one insn longer,
17975 it doesn't need a memory operand, and in the common
17976 case where both the interleave low and the interleave high
17977 permutations with the same operands are adjacent, the two
17978 together need 4 insns after CSE. */
17979 if (expand_vec_perm_interleave3 (&dfinal))
17980 return false;
17981 }
17982 else
17983 return false;
17984
17985 /* Examine from whence the elements come. */
17986 contents = 0;
17987 for (i = 0; i < nelt; ++i)
17988 contents |= HOST_WIDE_INT_1U << d->perm[i];
17989
17990 memset (remap, 0xff, sizeof (remap));
17991 dremap = *d;
17992
17993 if (GET_MODE_SIZE (d->vmode) == 16)
17994 {
17995 unsigned HOST_WIDE_INT h1, h2, h3, h4;
17996
17997 /* Split the two input vectors into 4 halves. */
17998 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17999 h2 = h1 << nelt2;
18000 h3 = h2 << nelt2;
18001 h4 = h3 << nelt2;
18002
18003 /* If the elements come from the low halves, use interleave low; similarly
18004 use interleave high for the high halves. If the elements are from
18005 mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
18006 if ((contents & (h1 | h3)) == contents)
18007 {
18008 /* punpckl* */
18009 for (i = 0; i < nelt2; ++i)
18010 {
18011 remap[i] = i * 2;
18012 remap[i + nelt] = i * 2 + 1;
18013 dremap.perm[i * 2] = i;
18014 dremap.perm[i * 2 + 1] = i + nelt;
18015 }
18016 if (!TARGET_SSE2 && d->vmode == V4SImode)
18017 dremap.vmode = V4SFmode;
18018 }
18019 else if ((contents & (h2 | h4)) == contents)
18020 {
18021 /* punpckh* */
18022 for (i = 0; i < nelt2; ++i)
18023 {
18024 remap[i + nelt2] = i * 2;
18025 remap[i + nelt + nelt2] = i * 2 + 1;
18026 dremap.perm[i * 2] = i + nelt2;
18027 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
18028 }
18029 if (!TARGET_SSE2 && d->vmode == V4SImode)
18030 dremap.vmode = V4SFmode;
18031 }
18032 else if ((contents & (h1 | h4)) == contents)
18033 {
18034 /* shufps */
18035 for (i = 0; i < nelt2; ++i)
18036 {
18037 remap[i] = i;
18038 remap[i + nelt + nelt2] = i + nelt2;
18039 dremap.perm[i] = i;
18040 dremap.perm[i + nelt2] = i + nelt + nelt2;
18041 }
18042 if (nelt != 4)
18043 {
18044 /* shufpd */
18045 dremap.vmode = V2DImode;
18046 dremap.nelt = 2;
18047 dremap.perm[0] = 0;
18048 dremap.perm[1] = 3;
18049 }
18050 }
18051 else if ((contents & (h2 | h3)) == contents)
18052 {
18053 /* shufps */
18054 for (i = 0; i < nelt2; ++i)
18055 {
18056 remap[i + nelt2] = i;
18057 remap[i + nelt] = i + nelt2;
18058 dremap.perm[i] = i + nelt2;
18059 dremap.perm[i + nelt2] = i + nelt;
18060 }
18061 if (nelt != 4)
18062 {
18063 /* shufpd */
18064 dremap.vmode = V2DImode;
18065 dremap.nelt = 2;
18066 dremap.perm[0] = 1;
18067 dremap.perm[1] = 2;
18068 }
18069 }
18070 else
18071 return false;
18072 }
18073 else
18074 {
18075 unsigned int nelt4 = nelt / 4, nzcnt = 0;
18076 unsigned HOST_WIDE_INT q[8];
18077 unsigned int nonzero_halves[4];
18078
18079 /* Split the two input vectors into 8 quarters. */
18080 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
18081 for (i = 1; i < 8; ++i)
18082 q[i] = q[0] << (nelt4 * i);
18083 for (i = 0; i < 4; ++i)
18084 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
18085 {
18086 nonzero_halves[nzcnt] = i;
18087 ++nzcnt;
18088 }
18089
18090 if (nzcnt == 1)
18091 {
18092 gcc_assert (d->one_operand_p);
18093 nonzero_halves[1] = nonzero_halves[0];
18094 same_halves = true;
18095 }
18096 else if (d->one_operand_p)
18097 {
18098 gcc_assert (nonzero_halves[0] == 0);
18099 gcc_assert (nonzero_halves[1] == 1);
18100 }
18101
18102 if (nzcnt <= 2)
18103 {
18104 if (d->perm[0] / nelt2 == nonzero_halves[1])
18105 {
18106 /* Attempt to increase the likelihood that dfinal
18107 shuffle will be intra-lane. */
18108 std::swap (nonzero_halves[0], nonzero_halves[1]);
18109 }
18110
18111 /* vperm2f128 or vperm2i128. */
18112 for (i = 0; i < nelt2; ++i)
18113 {
18114 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
18115 remap[i + nonzero_halves[0] * nelt2] = i;
18116 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
18117 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
18118 }
18119
18120 if (d->vmode != V8SFmode
18121 && d->vmode != V4DFmode
18122 && d->vmode != V8SImode)
18123 {
18124 dremap.vmode = V8SImode;
18125 dremap.nelt = 8;
18126 for (i = 0; i < 4; ++i)
18127 {
18128 dremap.perm[i] = i + nonzero_halves[0] * 4;
18129 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
18130 }
18131 }
18132 }
18133 else if (d->one_operand_p)
18134 return false;
18135 else if (TARGET_AVX2
18136 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
18137 {
18138 /* vpunpckl* */
18139 for (i = 0; i < nelt4; ++i)
18140 {
18141 remap[i] = i * 2;
18142 remap[i + nelt] = i * 2 + 1;
18143 remap[i + nelt2] = i * 2 + nelt2;
18144 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
18145 dremap.perm[i * 2] = i;
18146 dremap.perm[i * 2 + 1] = i + nelt;
18147 dremap.perm[i * 2 + nelt2] = i + nelt2;
18148 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
18149 }
18150 }
18151 else if (TARGET_AVX2
18152 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
18153 {
18154 /* vpunpckh* */
18155 for (i = 0; i < nelt4; ++i)
18156 {
18157 remap[i + nelt4] = i * 2;
18158 remap[i + nelt + nelt4] = i * 2 + 1;
18159 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
18160 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
18161 dremap.perm[i * 2] = i + nelt4;
18162 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
18163 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
18164 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
18165 }
18166 }
18167 else
18168 return false;
18169 }
18170
18171 /* Use the remapping array set up above to move the elements from their
18172 swizzled locations into their final destinations. */
18173 dfinal = *d;
18174 for (i = 0; i < nelt; ++i)
18175 {
18176 unsigned e = remap[d->perm[i]];
18177 gcc_assert (e < nelt);
18178 /* If same_halves is true, both halves of the remapped vector are the
18179 same. Avoid cross-lane accesses if possible. */
18180 if (same_halves && i >= nelt2)
18181 {
18182 gcc_assert (e < nelt2);
18183 dfinal.perm[i] = e + nelt2;
18184 }
18185 else
18186 dfinal.perm[i] = e;
18187 }
18188 if (!d->testing_p)
18189 {
18190 dremap.target = gen_reg_rtx (dremap.vmode);
18191 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
18192 }
18193 dfinal.op1 = dfinal.op0;
18194 dfinal.one_operand_p = true;
18195
18196 /* Test if the final remap can be done with a single insn. For V4SFmode or
18197 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
18198 start_sequence ();
18199 ok = expand_vec_perm_1 (&dfinal);
18200 seq = get_insns ();
18201 end_sequence ();
18202
18203 if (!ok)
18204 return false;
18205
18206 if (d->testing_p)
18207 return true;
18208
18209 if (dremap.vmode != dfinal.vmode)
18210 {
18211 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
18212 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
18213 }
18214
18215 ok = expand_vec_perm_1 (&dremap);
18216 gcc_assert (ok);
18217
18218 emit_insn (seq);
18219 return true;
18220 }
18221
18222 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18223 a single vector cross-lane permutation into vpermq followed
18224 by any of the single insn permutations. */
18225
18226 static bool
18227 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
18228 {
18229 struct expand_vec_perm_d dremap, dfinal;
18230 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
18231 unsigned contents[2];
18232 bool ok;
18233
18234 if (!(TARGET_AVX2
18235 && (d->vmode == V32QImode || d->vmode == V16HImode)
18236 && d->one_operand_p))
18237 return false;
18238
18239 contents[0] = 0;
18240 contents[1] = 0;
18241 for (i = 0; i < nelt2; ++i)
18242 {
18243 contents[0] |= 1u << (d->perm[i] / nelt4);
18244 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
18245 }
18246
18247 for (i = 0; i < 2; ++i)
18248 {
18249 unsigned int cnt = 0;
18250 for (j = 0; j < 4; ++j)
18251 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
18252 return false;
18253 }
18254
18255 if (d->testing_p)
18256 return true;
18257
18258 dremap = *d;
18259 dremap.vmode = V4DImode;
18260 dremap.nelt = 4;
18261 dremap.target = gen_reg_rtx (V4DImode);
18262 dremap.op0 = gen_lowpart (V4DImode, d->op0);
18263 dremap.op1 = dremap.op0;
18264 dremap.one_operand_p = true;
18265 for (i = 0; i < 2; ++i)
18266 {
18267 unsigned int cnt = 0;
18268 for (j = 0; j < 4; ++j)
18269 if ((contents[i] & (1u << j)) != 0)
18270 dremap.perm[2 * i + cnt++] = j;
18271 for (; cnt < 2; ++cnt)
18272 dremap.perm[2 * i + cnt] = 0;
18273 }
18274
18275 dfinal = *d;
18276 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
18277 dfinal.op1 = dfinal.op0;
18278 dfinal.one_operand_p = true;
18279 for (i = 0, j = 0; i < nelt; ++i)
18280 {
18281 if (i == nelt2)
18282 j = 2;
18283 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
18284 if ((d->perm[i] / nelt4) == dremap.perm[j])
18285 ;
18286 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
18287 dfinal.perm[i] |= nelt4;
18288 else
18289 gcc_unreachable ();
18290 }
18291
18292 ok = expand_vec_perm_1 (&dremap);
18293 gcc_assert (ok);
18294
18295 ok = expand_vec_perm_1 (&dfinal);
18296 gcc_assert (ok);
18297
18298 return true;
18299 }
18300
18301 static bool canonicalize_perm (struct expand_vec_perm_d *d);
18302
18303 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
18304 a vector permutation using two instructions, vperm2f128 resp.
18305 vperm2i128 followed by any single in-lane permutation. */
18306
18307 static bool
18308 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
18309 {
18310 struct expand_vec_perm_d dfirst, dsecond;
18311 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
18312 bool ok;
18313
18314 if (!TARGET_AVX
18315 || GET_MODE_SIZE (d->vmode) != 32
18316 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
18317 return false;
18318
18319 dsecond = *d;
18320 dsecond.one_operand_p = false;
18321 dsecond.testing_p = true;
18322
18323 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
18324 immediate. For perm < 16 the second permutation uses
18325 d->op0 as first operand, for perm >= 16 it uses d->op1
18326 as first operand. The second operand is the result of
18327 vperm2[fi]128. */
18328 for (perm = 0; perm < 32; perm++)
18329 {
18330 /* Ignore permutations which do not move anything cross-lane. */
18331 if (perm < 16)
18332 {
18333 /* The second shuffle for e.g. V4DFmode has
18334 0123 and ABCD operands.
18335 Ignore AB23, as 23 is already in the second lane
18336 of the first operand. */
18337 if ((perm & 0xc) == (1 << 2)) continue;
18338 /* And 01CD, as 01 is in the first lane of the first
18339 operand. */
18340 if ((perm & 3) == 0) continue;
18341 /* And 4567, as then the vperm2[fi]128 doesn't change
18342 anything on the original 4567 second operand. */
18343 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
18344 }
18345 else
18346 {
18347 /* The second shuffle for e.g. V4DFmode has
18348 4567 and ABCD operands.
18349 Ignore AB67, as 67 is already in the second lane
18350 of the first operand. */
18351 if ((perm & 0xc) == (3 << 2)) continue;
18352 /* And 45CD, as 45 is in the first lane of the first
18353 operand. */
18354 if ((perm & 3) == 2) continue;
18355 /* And 0123, as then the vperm2[fi]128 doesn't change
18356 anything on the original 0123 first operand. */
18357 if ((perm & 0xf) == (1 << 2)) continue;
18358 }
18359
18360 for (i = 0; i < nelt; i++)
18361 {
18362 j = d->perm[i] / nelt2;
18363 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
18364 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
18365 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
18366 dsecond.perm[i] = d->perm[i] & (nelt - 1);
18367 else
18368 break;
18369 }
18370
18371 if (i == nelt)
18372 {
18373 start_sequence ();
18374 ok = expand_vec_perm_1 (&dsecond);
18375 end_sequence ();
18376 }
18377 else
18378 ok = false;
18379
18380 if (ok)
18381 {
18382 if (d->testing_p)
18383 return true;
18384
18385 /* Found a usable second shuffle. dfirst will be
18386 vperm2f128 on d->op0 and d->op1. */
18387 dsecond.testing_p = false;
18388 dfirst = *d;
18389 dfirst.target = gen_reg_rtx (d->vmode);
18390 for (i = 0; i < nelt; i++)
18391 dfirst.perm[i] = (i & (nelt2 - 1))
18392 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
18393
18394 canonicalize_perm (&dfirst);
18395 ok = expand_vec_perm_1 (&dfirst);
18396 gcc_assert (ok);
18397
18398 /* And dsecond is some single insn shuffle, taking
18399 d->op0 and result of vperm2f128 (if perm < 16) or
18400 d->op1 and result of vperm2f128 (otherwise). */
18401 if (perm >= 16)
18402 dsecond.op0 = dsecond.op1;
18403 dsecond.op1 = dfirst.target;
18404
18405 ok = expand_vec_perm_1 (&dsecond);
18406 gcc_assert (ok);
18407
18408 return true;
18409 }
18410
18411 /* For one operand, the only useful vperm2f128 permutation is 0x01
18412 aka lanes swap. */
18413 if (d->one_operand_p)
18414 return false;
18415 }
18416
18417 return false;
18418 }
18419
18420 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18421 a two vector permutation using 2 intra-lane interleave insns
18422 and cross-lane shuffle for 32-byte vectors. */
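/* For instance, the V8SFmode permutation { 0, 8, 1, 9, 2, 10, 3, 11 }
   passes the check below with d->perm[0] == 0 and is expanded through
   the vec_interleave_lowv8sf expander, which handles the required
   cross-lane shuffling for the 32-byte vector.  */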
18423
18424 static bool
18425 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
18426 {
18427 unsigned i, nelt;
18428 rtx (*gen) (rtx, rtx, rtx);
18429
18430 if (d->one_operand_p)
18431 return false;
18432 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
18433 ;
18434 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
18435 ;
18436 else
18437 return false;
18438
18439 nelt = d->nelt;
18440 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
18441 return false;
18442 for (i = 0; i < nelt; i += 2)
18443 if (d->perm[i] != d->perm[0] + i / 2
18444 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
18445 return false;
18446
18447 if (d->testing_p)
18448 return true;
18449
18450 switch (d->vmode)
18451 {
18452 case E_V32QImode:
18453 if (d->perm[0])
18454 gen = gen_vec_interleave_highv32qi;
18455 else
18456 gen = gen_vec_interleave_lowv32qi;
18457 break;
18458 case E_V16HImode:
18459 if (d->perm[0])
18460 gen = gen_vec_interleave_highv16hi;
18461 else
18462 gen = gen_vec_interleave_lowv16hi;
18463 break;
18464 case E_V8SImode:
18465 if (d->perm[0])
18466 gen = gen_vec_interleave_highv8si;
18467 else
18468 gen = gen_vec_interleave_lowv8si;
18469 break;
18470 case E_V4DImode:
18471 if (d->perm[0])
18472 gen = gen_vec_interleave_highv4di;
18473 else
18474 gen = gen_vec_interleave_lowv4di;
18475 break;
18476 case E_V8SFmode:
18477 if (d->perm[0])
18478 gen = gen_vec_interleave_highv8sf;
18479 else
18480 gen = gen_vec_interleave_lowv8sf;
18481 break;
18482 case E_V4DFmode:
18483 if (d->perm[0])
18484 gen = gen_vec_interleave_highv4df;
18485 else
18486 gen = gen_vec_interleave_lowv4df;
18487 break;
18488 default:
18489 gcc_unreachable ();
18490 }
18491
18492 emit_insn (gen (d->target, d->op0, d->op1));
18493 return true;
18494 }
18495
18496 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18497 a single vector permutation using a single intra-lane vector
18498 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18499 the non-swapped and swapped vectors together. */
18500
18501 static bool
18502 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18503 {
18504 struct expand_vec_perm_d dfirst, dsecond;
18505 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18506 rtx_insn *seq;
18507 bool ok;
18508 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18509
18510 if (!TARGET_AVX
18511 || TARGET_AVX2
18512 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18513 || !d->one_operand_p)
18514 return false;
18515
18516 dfirst = *d;
18517 for (i = 0; i < nelt; i++)
18518 dfirst.perm[i] = 0xff;
18519 for (i = 0, msk = 0; i < nelt; i++)
18520 {
18521 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18522 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18523 return false;
18524 dfirst.perm[j] = d->perm[i];
18525 if (j != i)
18526 msk |= (1 << i);
18527 }
18528 for (i = 0; i < nelt; i++)
18529 if (dfirst.perm[i] == 0xff)
18530 dfirst.perm[i] = i;
18531
18532 if (!d->testing_p)
18533 dfirst.target = gen_reg_rtx (dfirst.vmode);
18534
18535 start_sequence ();
18536 ok = expand_vec_perm_1 (&dfirst);
18537 seq = get_insns ();
18538 end_sequence ();
18539
18540 if (!ok)
18541 return false;
18542
18543 if (d->testing_p)
18544 return true;
18545
18546 emit_insn (seq);
18547
18548 dsecond = *d;
18549 dsecond.op0 = dfirst.target;
18550 dsecond.op1 = dfirst.target;
18551 dsecond.one_operand_p = true;
18552 dsecond.target = gen_reg_rtx (dsecond.vmode);
18553 for (i = 0; i < nelt; i++)
18554 dsecond.perm[i] = i ^ nelt2;
18555
18556 ok = expand_vec_perm_1 (&dsecond);
18557 gcc_assert (ok);
18558
18559 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18560 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18561 return true;
18562 }
18563
18564 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18565 permutation using two vperm2f128, followed by a vshufpd insn blending
18566 the two vectors together. */
18567
18568 static bool
18569 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18570 {
18571 struct expand_vec_perm_d dfirst, dsecond, dthird;
18572 bool ok;
18573
18574 if (!TARGET_AVX || (d->vmode != V4DFmode))
18575 return false;
18576
18577 if (d->testing_p)
18578 return true;
18579
18580 dfirst = *d;
18581 dsecond = *d;
18582 dthird = *d;
18583
18584 dfirst.perm[0] = (d->perm[0] & ~1);
18585 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18586 dfirst.perm[2] = (d->perm[2] & ~1);
18587 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18588 dsecond.perm[0] = (d->perm[1] & ~1);
18589 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18590 dsecond.perm[2] = (d->perm[3] & ~1);
18591 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18592 dthird.perm[0] = (d->perm[0] % 2);
18593 dthird.perm[1] = (d->perm[1] % 2) + 4;
18594 dthird.perm[2] = (d->perm[2] % 2) + 2;
18595 dthird.perm[3] = (d->perm[3] % 2) + 6;
18596
18597 dfirst.target = gen_reg_rtx (dfirst.vmode);
18598 dsecond.target = gen_reg_rtx (dsecond.vmode);
18599 dthird.op0 = dfirst.target;
18600 dthird.op1 = dsecond.target;
18601 dthird.one_operand_p = false;
18602
18603 canonicalize_perm (&dfirst);
18604 canonicalize_perm (&dsecond);
18605
18606 ok = expand_vec_perm_1 (&dfirst)
18607 && expand_vec_perm_1 (&dsecond)
18608 && expand_vec_perm_1 (&dthird);
18609
18610 gcc_assert (ok);
18611
18612 return true;
18613 }
18614
18615 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18616
18617 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18618 a two vector permutation using two intra-lane vector
18619 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18620 the non-swapped and swapped vectors together. */
18621
18622 static bool
18623 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18624 {
18625 struct expand_vec_perm_d dfirst, dsecond, dthird;
18626 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18627 rtx_insn *seq1, *seq2;
18628 bool ok;
18629 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18630
18631 if (!TARGET_AVX
18632 || TARGET_AVX2
18633 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18634 || d->one_operand_p)
18635 return false;
18636
18637 dfirst = *d;
18638 dsecond = *d;
18639 for (i = 0; i < nelt; i++)
18640 {
18641 dfirst.perm[i] = 0xff;
18642 dsecond.perm[i] = 0xff;
18643 }
18644 for (i = 0, msk = 0; i < nelt; i++)
18645 {
18646 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18647 if (j == i)
18648 {
18649 dfirst.perm[j] = d->perm[i];
18650 which1 |= (d->perm[i] < nelt ? 1 : 2);
18651 }
18652 else
18653 {
18654 dsecond.perm[j] = d->perm[i];
18655 which2 |= (d->perm[i] < nelt ? 1 : 2);
18656 msk |= (1U << i);
18657 }
18658 }
18659 if (msk == 0 || msk == (1U << nelt) - 1)
18660 return false;
18661
18662 if (!d->testing_p)
18663 {
18664 dfirst.target = gen_reg_rtx (dfirst.vmode);
18665 dsecond.target = gen_reg_rtx (dsecond.vmode);
18666 }
18667
18668 for (i = 0; i < nelt; i++)
18669 {
18670 if (dfirst.perm[i] == 0xff)
18671 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18672 if (dsecond.perm[i] == 0xff)
18673 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18674 }
18675 canonicalize_perm (&dfirst);
18676 start_sequence ();
18677 ok = ix86_expand_vec_perm_const_1 (&dfirst);
18678 seq1 = get_insns ();
18679 end_sequence ();
18680
18681 if (!ok)
18682 return false;
18683
18684 canonicalize_perm (&dsecond);
18685 start_sequence ();
18686 ok = ix86_expand_vec_perm_const_1 (&dsecond);
18687 seq2 = get_insns ();
18688 end_sequence ();
18689
18690 if (!ok)
18691 return false;
18692
18693 if (d->testing_p)
18694 return true;
18695
18696 emit_insn (seq1);
18697 emit_insn (seq2);
18698
18699 dthird = *d;
18700 dthird.op0 = dsecond.target;
18701 dthird.op1 = dsecond.target;
18702 dthird.one_operand_p = true;
18703 dthird.target = gen_reg_rtx (dthird.vmode);
18704 for (i = 0; i < nelt; i++)
18705 dthird.perm[i] = i ^ nelt2;
18706
18707 ok = expand_vec_perm_1 (&dthird);
18708 gcc_assert (ok);
18709
18710 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18711 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18712 return true;
18713 }
18714
18715 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18716 permutation with two pshufb insns and an ior. We should have already
18717 failed all two instruction sequences. */
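/* E.g. for a permutation whose even positions come from op0 and odd
   positions from op1, say { 0, 16, 1, 17, ... } for V16QImode, the
   first mask built below is { 0, -128, 1, -128, ... } and the second
   is { -128, 0, -128, 1, ... }; each pshufb then contributes its own
   elements and zeros elsewhere, and the ior combines the results.  */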
18718
18719 static bool
18720 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18721 {
18722 rtx rperm[2][16], vperm, l, h, op, m128;
18723 unsigned int i, nelt, eltsz;
18724
18725 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18726 return false;
18727 gcc_assert (!d->one_operand_p);
18728
18729 if (d->testing_p)
18730 return true;
18731
18732 nelt = d->nelt;
18733 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18734
18735 /* Generate two permutation masks. If the required element is within
18736 the given vector it is shuffled into the proper lane. If the required
18737 element is in the other vector, force a zero into the lane by setting
18738 bit 7 in the permutation mask. */
18739 m128 = GEN_INT (-128);
18740 for (i = 0; i < nelt; ++i)
18741 {
18742 unsigned j, e = d->perm[i];
18743 unsigned which = (e >= nelt);
18744 if (e >= nelt)
18745 e -= nelt;
18746
18747 for (j = 0; j < eltsz; ++j)
18748 {
18749 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18750 rperm[1-which][i*eltsz + j] = m128;
18751 }
18752 }
18753
18754 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18755 vperm = force_reg (V16QImode, vperm);
18756
18757 l = gen_reg_rtx (V16QImode);
18758 op = gen_lowpart (V16QImode, d->op0);
18759 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18760
18761 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18762 vperm = force_reg (V16QImode, vperm);
18763
18764 h = gen_reg_rtx (V16QImode);
18765 op = gen_lowpart (V16QImode, d->op1);
18766 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18767
18768 op = d->target;
18769 if (d->vmode != V16QImode)
18770 op = gen_reg_rtx (V16QImode);
18771 emit_insn (gen_iorv16qi3 (op, l, h));
18772 if (op != d->target)
18773 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18774
18775 return true;
18776 }
18777
18778 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
18779 with two vpshufb insns, vpermq and vpor. We should have already failed
18780 all two or three instruction sequences. */
18781
18782 static bool
18783 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18784 {
18785 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18786 unsigned int i, nelt, eltsz;
18787
18788 if (!TARGET_AVX2
18789 || !d->one_operand_p
18790 || (d->vmode != V32QImode && d->vmode != V16HImode))
18791 return false;
18792
18793 if (d->testing_p)
18794 return true;
18795
18796 nelt = d->nelt;
18797 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18798
18799 /* Generate two permutation masks. If the required element is within
18800 the same lane, it is shuffled in. If the required element is from the
18801 other lane, force a zero by setting bit 7 in the permutation mask.
18802 The other mask has a non-negative element where the element is
18803 requested from the other lane, but it is also moved to the other lane,
18804 so that the result of vpshufb can have the two V2TImode halves
18805 swapped. */
18806 m128 = GEN_INT (-128);
18807 for (i = 0; i < nelt; ++i)
18808 {
18809 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18810 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18811
18812 for (j = 0; j < eltsz; ++j)
18813 {
18814 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18815 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18816 }
18817 }
18818
18819 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18820 vperm = force_reg (V32QImode, vperm);
18821
18822 h = gen_reg_rtx (V32QImode);
18823 op = gen_lowpart (V32QImode, d->op0);
18824 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18825
18826 /* Swap the two 128-bit lanes of h into hp. */
18827 hp = gen_reg_rtx (V4DImode);
18828 op = gen_lowpart (V4DImode, h);
18829 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18830 const1_rtx));
18831
18832 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18833 vperm = force_reg (V32QImode, vperm);
18834
18835 l = gen_reg_rtx (V32QImode);
18836 op = gen_lowpart (V32QImode, d->op0);
18837 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18838
18839 op = d->target;
18840 if (d->vmode != V32QImode)
18841 op = gen_reg_rtx (V32QImode);
18842 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18843 if (op != d->target)
18844 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18845
18846 return true;
18847 }
18848
18849 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18850 and extract-odd permutations of two V32QImode or V16HImode operands
18851 with two vpshufb insns, vpor and vpermq. We should have already
18852 failed all two or three instruction sequences. */
18853
18854 static bool
18855 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18856 {
18857 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18858 unsigned int i, nelt, eltsz;
18859
18860 if (!TARGET_AVX2
18861 || d->one_operand_p
18862 || (d->vmode != V32QImode && d->vmode != V16HImode))
18863 return false;
18864
18865 for (i = 0; i < d->nelt; ++i)
18866 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18867 return false;
18868
18869 if (d->testing_p)
18870 return true;
18871
18872 nelt = d->nelt;
18873 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18874
18875 /* Generate two permutation masks. In the first permutation mask
18876 the first quarter will contain indexes for the first half
18877 of the op0, the second quarter will contain bit 7 set, third quarter
18878 will contain indexes for the second half of the op0 and the
18879 last quarter bit 7 set. In the second permutation mask
18880 the first quarter will contain bit 7 set, the second quarter
18881 indexes for the first half of the op1, the third quarter bit 7 set
18882 and last quarter indexes for the second half of the op1.
18883 I.e. the first mask e.g. for V32QImode extract even will be:
18884 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18885 (all values masked with 0xf except for -128) and second mask
18886 for extract even will be
18887 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18888 m128 = GEN_INT (-128);
18889 for (i = 0; i < nelt; ++i)
18890 {
18891 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18892 unsigned which = d->perm[i] >= nelt;
18893 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18894
18895 for (j = 0; j < eltsz; ++j)
18896 {
18897 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18898 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18899 }
18900 }
18901
18902 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18903 vperm = force_reg (V32QImode, vperm);
18904
18905 l = gen_reg_rtx (V32QImode);
18906 op = gen_lowpart (V32QImode, d->op0);
18907 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18908
18909 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18910 vperm = force_reg (V32QImode, vperm);
18911
18912 h = gen_reg_rtx (V32QImode);
18913 op = gen_lowpart (V32QImode, d->op1);
18914 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18915
18916 ior = gen_reg_rtx (V32QImode);
18917 emit_insn (gen_iorv32qi3 (ior, l, h));
18918
18919 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18920 op = gen_reg_rtx (V4DImode);
18921 ior = gen_lowpart (V4DImode, ior);
18922 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18923 const1_rtx, GEN_INT (3)));
18924 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18925
18926 return true;
18927 }
18928
18929 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18930 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18931 with two "and" and "pack" or two "shift" and "pack" insns. We should
18932 have already failed all two instruction sequences. */
18933
18934 static bool
18935 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18936 {
18937 rtx op, dop0, dop1, t;
18938 unsigned i, odd, c, s, nelt = d->nelt;
18939 bool end_perm = false;
18940 machine_mode half_mode;
18941 rtx (*gen_and) (rtx, rtx, rtx);
18942 rtx (*gen_pack) (rtx, rtx, rtx);
18943 rtx (*gen_shift) (rtx, rtx, rtx);
18944
18945 if (d->one_operand_p)
18946 return false;
18947
18948 switch (d->vmode)
18949 {
18950 case E_V8HImode:
18951 /* Required for "pack". */
18952 if (!TARGET_SSE4_1)
18953 return false;
18954 c = 0xffff;
18955 s = 16;
18956 half_mode = V4SImode;
18957 gen_and = gen_andv4si3;
18958 gen_pack = gen_sse4_1_packusdw;
18959 gen_shift = gen_lshrv4si3;
18960 break;
18961 case E_V16QImode:
18962 /* No check as all instructions are SSE2. */
18963 c = 0xff;
18964 s = 8;
18965 half_mode = V8HImode;
18966 gen_and = gen_andv8hi3;
18967 gen_pack = gen_sse2_packuswb;
18968 gen_shift = gen_lshrv8hi3;
18969 break;
18970 case E_V16HImode:
18971 if (!TARGET_AVX2)
18972 return false;
18973 c = 0xffff;
18974 s = 16;
18975 half_mode = V8SImode;
18976 gen_and = gen_andv8si3;
18977 gen_pack = gen_avx2_packusdw;
18978 gen_shift = gen_lshrv8si3;
18979 end_perm = true;
18980 break;
18981 case E_V32QImode:
18982 if (!TARGET_AVX2)
18983 return false;
18984 c = 0xff;
18985 s = 8;
18986 half_mode = V16HImode;
18987 gen_and = gen_andv16hi3;
18988 gen_pack = gen_avx2_packuswb;
18989 gen_shift = gen_lshrv16hi3;
18990 end_perm = true;
18991 break;
18992 default:
18993 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
18994 general shuffles. */
18995 return false;
18996 }
18997
18998 /* Check that permutation is even or odd. */
18999 odd = d->perm[0];
19000 if (odd > 1)
19001 return false;
19002
19003 for (i = 1; i < nelt; ++i)
19004 if (d->perm[i] != 2 * i + odd)
19005 return false;
19006
19007 if (d->testing_p)
19008 return true;
19009
19010 dop0 = gen_reg_rtx (half_mode);
19011 dop1 = gen_reg_rtx (half_mode);
19012 if (odd == 0)
19013 {
19014 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
19015 t = force_reg (half_mode, t);
19016 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
19017 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
19018 }
19019 else
19020 {
19021 emit_insn (gen_shift (dop0,
19022 gen_lowpart (half_mode, d->op0),
19023 GEN_INT (s)));
19024 emit_insn (gen_shift (dop1,
19025 gen_lowpart (half_mode, d->op1),
19026 GEN_INT (s)));
19027 }
19028   /* For the AVX2 256-bit case we need to permute the pack result.  */
19029 if (TARGET_AVX2 && end_perm)
19030 {
19031 op = gen_reg_rtx (d->vmode);
19032 t = gen_reg_rtx (V4DImode);
19033 emit_insn (gen_pack (op, dop0, dop1));
19034 emit_insn (gen_avx2_permv4di_1 (t,
19035 gen_lowpart (V4DImode, op),
19036 const0_rtx,
19037 const2_rtx,
19038 const1_rtx,
19039 GEN_INT (3)));
19040 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
19041 }
19042 else
19043 emit_insn (gen_pack (d->target, dop0, dop1));
19044
19045 return true;
19046 }
19047
19048 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
19049 and extract-odd permutations of two V64QI operands
19050 with two "shifts", two "truncs" and one "concat" insns for "odd"
19051    and two "truncs" and one "concat" insn for "even".
19052    We should have already failed all two-instruction sequences.  */
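/* E.g. for the even extraction it is enough to view each operand as
   V32HImode and truncate every 16-bit element to its low byte; for the
   odd extraction, a prior logical right shift by 8 first moves the odd
   bytes into those low positions.  */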
19053
19054 static bool
19055 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
19056 {
19057 rtx t1, t2, t3, t4;
19058 unsigned i, odd, nelt = d->nelt;
19059
19060 if (!TARGET_AVX512BW
19061 || d->one_operand_p
19062 || d->vmode != V64QImode)
19063 return false;
19064
19065 /* Check that permutation is even or odd. */
19066 odd = d->perm[0];
19067 if (odd > 1)
19068 return false;
19069
19070 for (i = 1; i < nelt; ++i)
19071 if (d->perm[i] != 2 * i + odd)
19072 return false;
19073
19074 if (d->testing_p)
19075 return true;
19076
19077
19078 if (odd)
19079 {
19080 t1 = gen_reg_rtx (V32HImode);
19081 t2 = gen_reg_rtx (V32HImode);
19082 emit_insn (gen_lshrv32hi3 (t1,
19083 gen_lowpart (V32HImode, d->op0),
19084 GEN_INT (8)));
19085 emit_insn (gen_lshrv32hi3 (t2,
19086 gen_lowpart (V32HImode, d->op1),
19087 GEN_INT (8)));
19088 }
19089 else
19090 {
19091 t1 = gen_lowpart (V32HImode, d->op0);
19092 t2 = gen_lowpart (V32HImode, d->op1);
19093 }
19094
19095 t3 = gen_reg_rtx (V32QImode);
19096 t4 = gen_reg_rtx (V32QImode);
19097 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
19098 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
19099 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
19100
19101 return true;
19102 }
19103
19104 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
19105 and extract-odd permutations. */
19106
19107 static bool
19108 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
19109 {
19110 rtx t1, t2, t3, t4, t5;
19111
19112 switch (d->vmode)
19113 {
19114 case E_V4DFmode:
19115 if (d->testing_p)
19116 break;
19117 t1 = gen_reg_rtx (V4DFmode);
19118 t2 = gen_reg_rtx (V4DFmode);
19119
19120 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19121 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
19122 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
19123
19124 /* Now an unpck[lh]pd will produce the result required. */
19125 if (odd)
19126 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
19127 else
19128 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
19129 emit_insn (t3);
19130 break;
19131
19132 case E_V8SFmode:
19133 {
19134 int mask = odd ? 0xdd : 0x88;
19135
19136 if (d->testing_p)
19137 break;
19138 t1 = gen_reg_rtx (V8SFmode);
19139 t2 = gen_reg_rtx (V8SFmode);
19140 t3 = gen_reg_rtx (V8SFmode);
19141
19142 /* Shuffle within the 128-bit lanes to produce:
19143 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
19144 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
19145 GEN_INT (mask)));
19146
19147 /* Shuffle the lanes around to produce:
19148 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
19149 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
19150 GEN_INT (0x3)));
19151
19152 /* Shuffle within the 128-bit lanes to produce:
19153 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
19154 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
19155
19156 /* Shuffle within the 128-bit lanes to produce:
19157 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
19158 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
19159
19160 /* Shuffle the lanes around to produce:
19161 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
19162 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
19163 GEN_INT (0x20)));
19164 }
19165 break;
19166
19167 case E_V2DFmode:
19168 case E_V4SFmode:
19169 case E_V2DImode:
19170 case E_V2SImode:
19171 case E_V4SImode:
19172 /* These are always directly implementable by expand_vec_perm_1. */
19173 gcc_unreachable ();
19174
19175 case E_V2SFmode:
19176 gcc_assert (TARGET_MMX_WITH_SSE);
19177 /* We have no suitable instructions. */
19178 if (d->testing_p)
19179 return false;
19180 break;
19181
19182 case E_V4HImode:
19183 if (d->testing_p)
19184 break;
19185 /* We need 2*log2(N)-1 operations to achieve odd/even
19186 with interleave. */
19187 t1 = gen_reg_rtx (V4HImode);
19188 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
19189 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
19190 if (odd)
19191 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
19192 else
19193 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
19194 emit_insn (t2);
19195 break;
19196
19197 case E_V8HImode:
19198 if (TARGET_SSE4_1)
19199 return expand_vec_perm_even_odd_pack (d);
19200 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
19201 return expand_vec_perm_pshufb2 (d);
19202 else
19203 {
19204 if (d->testing_p)
19205 break;
19206 /* We need 2*log2(N)-1 operations to achieve odd/even
19207 with interleave. */
19208 t1 = gen_reg_rtx (V8HImode);
19209 t2 = gen_reg_rtx (V8HImode);
19210 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
19211 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
19212 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
19213 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
19214 if (odd)
19215 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
19216 else
19217 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
19218 emit_insn (t3);
19219 }
19220 break;
19221
19222 case E_V16QImode:
19223 return expand_vec_perm_even_odd_pack (d);
19224
19225 case E_V16HImode:
19226 case E_V32QImode:
19227 return expand_vec_perm_even_odd_pack (d);
19228
19229 case E_V64QImode:
19230 return expand_vec_perm_even_odd_trunc (d);
19231
19232 case E_V4DImode:
19233 if (!TARGET_AVX2)
19234 {
19235 struct expand_vec_perm_d d_copy = *d;
19236 d_copy.vmode = V4DFmode;
19237 if (d->testing_p)
19238 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
19239 else
19240 d_copy.target = gen_reg_rtx (V4DFmode);
19241 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
19242 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
19243 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
19244 {
19245 if (!d->testing_p)
19246 emit_move_insn (d->target,
19247 gen_lowpart (V4DImode, d_copy.target));
19248 return true;
19249 }
19250 return false;
19251 }
19252
19253 if (d->testing_p)
19254 break;
19255
19256 t1 = gen_reg_rtx (V4DImode);
19257 t2 = gen_reg_rtx (V4DImode);
19258
19259 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19260 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
19261 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
19262
19263   /* Now a vpunpck[lh]qdq will produce the result required.  */
19264 if (odd)
19265 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
19266 else
19267 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
19268 emit_insn (t3);
19269 break;
19270
19271 case E_V8SImode:
19272 if (!TARGET_AVX2)
19273 {
19274 struct expand_vec_perm_d d_copy = *d;
19275 d_copy.vmode = V8SFmode;
19276 if (d->testing_p)
19277 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
19278 else
19279 d_copy.target = gen_reg_rtx (V8SFmode);
19280 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
19281 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
19282 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
19283 {
19284 if (!d->testing_p)
19285 emit_move_insn (d->target,
19286 gen_lowpart (V8SImode, d_copy.target));
19287 return true;
19288 }
19289 return false;
19290 }
19291
19292 if (d->testing_p)
19293 break;
19294
19295 t1 = gen_reg_rtx (V8SImode);
19296 t2 = gen_reg_rtx (V8SImode);
19297 t3 = gen_reg_rtx (V4DImode);
19298 t4 = gen_reg_rtx (V4DImode);
19299 t5 = gen_reg_rtx (V4DImode);
19300
19301 /* Shuffle the lanes around into
19302 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
19303 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
19304 gen_lowpart (V4DImode, d->op1),
19305 GEN_INT (0x20)));
19306 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
19307 gen_lowpart (V4DImode, d->op1),
19308 GEN_INT (0x31)));
19309
19310 /* Swap the 2nd and 3rd position in each lane into
19311 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
19312 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
19313 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19314 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
19315 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19316
19317   /* Now a vpunpck[lh]qdq will produce
19318 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
19319 if (odd)
19320 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
19321 gen_lowpart (V4DImode, t2));
19322 else
19323 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
19324 gen_lowpart (V4DImode, t2));
19325 emit_insn (t3);
19326 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
19327 break;
19328
19329 default:
19330 gcc_unreachable ();
19331 }
19332
19333 return true;
19334 }
19335
19336 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
19337 extract-even and extract-odd permutations. */
19338
19339 static bool
19340 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
19341 {
19342 unsigned i, odd, nelt = d->nelt;
19343
19344 odd = d->perm[0];
19345 if (odd != 0 && odd != 1)
19346 return false;
19347
19348 for (i = 1; i < nelt; ++i)
19349 if (d->perm[i] != 2 * i + odd)
19350 return false;
19351
19352 return expand_vec_perm_even_odd_1 (d, odd);
19353 }
19354
19355 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
19356 permutations. We assume that expand_vec_perm_1 has already failed. */
19357
19358 static bool
19359 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
19360 {
19361 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
19362 machine_mode vmode = d->vmode;
19363 unsigned char perm2[4];
19364 rtx op0 = d->op0, dest;
19365 bool ok;
19366
19367 switch (vmode)
19368 {
19369 case E_V4DFmode:
19370 case E_V8SFmode:
19371 /* These are special-cased in sse.md so that we can optionally
19372 use the vbroadcast instruction. They expand to two insns
19373 if the input happens to be in a register. */
19374 gcc_unreachable ();
19375
19376 case E_V2DFmode:
19377 case E_V2SFmode:
19378 case E_V4SFmode:
19379 case E_V2DImode:
19380 case E_V2SImode:
19381 case E_V4SImode:
19382 /* These are always implementable using standard shuffle patterns. */
19383 gcc_unreachable ();
19384
19385 case E_V8HImode:
19386 case E_V16QImode:
19387 /* These can be implemented via interleave. We save one insn by
19388 stopping once we have promoted to V4SImode and then use pshufd. */
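      /* E.g. to broadcast byte 5 of a V16QImode vector, punpcklbw with
	 itself duplicates it into word 5, punpckhwd then duplicates that
	 word into dword 1, and a final pshufd replicates dword 1 across
	 the whole vector.  */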
19389 if (d->testing_p)
19390 return true;
19391 do
19392 {
19393 rtx dest;
19394 rtx (*gen) (rtx, rtx, rtx)
19395 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
19396 : gen_vec_interleave_lowv8hi;
19397
19398 if (elt >= nelt2)
19399 {
19400 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
19401 : gen_vec_interleave_highv8hi;
19402 elt -= nelt2;
19403 }
19404 nelt2 /= 2;
19405
19406 dest = gen_reg_rtx (vmode);
19407 emit_insn (gen (dest, op0, op0));
19408 vmode = get_mode_wider_vector (vmode);
19409 op0 = gen_lowpart (vmode, dest);
19410 }
19411 while (vmode != V4SImode);
19412
19413 memset (perm2, elt, 4);
19414 dest = gen_reg_rtx (V4SImode);
19415 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
19416 gcc_assert (ok);
19417 if (!d->testing_p)
19418 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
19419 return true;
19420
19421 case E_V64QImode:
19422 case E_V32QImode:
19423 case E_V16HImode:
19424 case E_V8SImode:
19425 case E_V4DImode:
19426 /* For AVX2 broadcasts of the first element vpbroadcast* or
19427 vpermq should be used by expand_vec_perm_1. */
19428 gcc_assert (!TARGET_AVX2 || d->perm[0]);
19429 return false;
19430
19431 default:
19432 gcc_unreachable ();
19433 }
19434 }
19435
19436 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
19437 broadcast permutations. */
19438
19439 static bool
19440 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
19441 {
19442 unsigned i, elt, nelt = d->nelt;
19443
19444 if (!d->one_operand_p)
19445 return false;
19446
19447 elt = d->perm[0];
19448 for (i = 1; i < nelt; ++i)
19449 if (d->perm[i] != elt)
19450 return false;
19451
19452 return expand_vec_perm_broadcast_1 (d);
19453 }
19454
19455 /* Implement arbitrary permutations of two V64QImode operands
19456 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
19457 static bool
19458 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
19459 {
19460   if (!TARGET_AVX512BW || d->vmode != V64QImode)
19461 return false;
19462
19463 if (d->testing_p)
19464 return true;
19465
19466 struct expand_vec_perm_d ds[2];
19467 rtx rperm[128], vperm, target0, target1;
19468 unsigned int i, nelt;
19469 machine_mode vmode;
19470
19471 nelt = d->nelt;
19472 vmode = V64QImode;
19473
19474 for (i = 0; i < 2; i++)
19475 {
19476 ds[i] = *d;
19477 ds[i].vmode = V32HImode;
19478 ds[i].nelt = 32;
19479 ds[i].target = gen_reg_rtx (V32HImode);
19480 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
19481 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
19482 }
19483
19484 /* Prepare permutations such that the first one takes care of
19485 putting the even bytes into the right positions or one higher
19486 positions (ds[0]) and the second one takes care of
19487 putting the odd bytes into the right positions or one below
19488 (ds[1]). */
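  /* E.g. to place source byte 7 at destination byte 4, the even-byte word
     permutation ds[0] moves the containing word 3 (source bytes 6-7) to
     destination word 2, and the following vpshufb picks its high byte via
     rperm[4] = (4 & 14) + (7 & 1) = 5, while rperm[4 + 64] is -1 so the
     odd-byte path contributes zero at that position before the final
     vpor.  */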
19489
19490 for (i = 0; i < nelt; i++)
19491 {
19492 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
19493 if (i & 1)
19494 {
19495 rperm[i] = constm1_rtx;
19496 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19497 }
19498 else
19499 {
19500 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19501 rperm[i + 64] = constm1_rtx;
19502 }
19503 }
19504
19505 bool ok = expand_vec_perm_1 (&ds[0]);
19506 gcc_assert (ok);
19507 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
19508
19509 ok = expand_vec_perm_1 (&ds[1]);
19510 gcc_assert (ok);
19511 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
19512
19513 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
19514 vperm = force_reg (vmode, vperm);
19515 target0 = gen_reg_rtx (V64QImode);
19516 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
19517
19518 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
19519 vperm = force_reg (vmode, vperm);
19520 target1 = gen_reg_rtx (V64QImode);
19521 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
19522
19523 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
19524 return true;
19525 }
19526
19527 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
19528 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
19529 all the shorter instruction sequences. */
19530
19531 static bool
19532 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19533 {
19534 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19535 unsigned int i, nelt, eltsz;
19536 bool used[4];
19537
19538 if (!TARGET_AVX2
19539 || d->one_operand_p
19540 || (d->vmode != V32QImode && d->vmode != V16HImode))
19541 return false;
19542
19543 if (d->testing_p)
19544 return true;
19545
19546 nelt = d->nelt;
19547 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19548
19549   /* Generate 4 permutation masks.  If the required element is within
19550      the same lane, it is shuffled in.  If the required element is from
19551      the other lane, force a zero by setting bit 7 in the permutation
19552      mask.  In the other mask an element is non-negative only if it is
19553      requested from the other lane, but it is also moved to the other
19554      lane, so that the result of vpshufb can have the two V2TImode
19555      halves swapped.  */
19556 m128 = GEN_INT (-128);
19557 for (i = 0; i < 32; ++i)
19558 {
19559 rperm[0][i] = m128;
19560 rperm[1][i] = m128;
19561 rperm[2][i] = m128;
19562 rperm[3][i] = m128;
19563 }
19564 used[0] = false;
19565 used[1] = false;
19566 used[2] = false;
19567 used[3] = false;
19568 for (i = 0; i < nelt; ++i)
19569 {
19570 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19571 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19572 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19573
19574 for (j = 0; j < eltsz; ++j)
19575 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19576 used[which] = true;
19577 }
19578
19579 for (i = 0; i < 2; ++i)
19580 {
19581 if (!used[2 * i + 1])
19582 {
19583 h[i] = NULL_RTX;
19584 continue;
19585 }
19586 vperm = gen_rtx_CONST_VECTOR (V32QImode,
19587 gen_rtvec_v (32, rperm[2 * i + 1]));
19588 vperm = force_reg (V32QImode, vperm);
19589 h[i] = gen_reg_rtx (V32QImode);
19590 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19591 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19592 }
19593
19594   /* Swap the 128-bit lanes of h[X].  */
19595 for (i = 0; i < 2; ++i)
19596 {
19597 if (h[i] == NULL_RTX)
19598 continue;
19599 op = gen_reg_rtx (V4DImode);
19600 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19601 const2_rtx, GEN_INT (3), const0_rtx,
19602 const1_rtx));
19603 h[i] = gen_lowpart (V32QImode, op);
19604 }
19605
19606 for (i = 0; i < 2; ++i)
19607 {
19608 if (!used[2 * i])
19609 {
19610 l[i] = NULL_RTX;
19611 continue;
19612 }
19613 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19614 vperm = force_reg (V32QImode, vperm);
19615 l[i] = gen_reg_rtx (V32QImode);
19616 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19617 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19618 }
19619
19620 for (i = 0; i < 2; ++i)
19621 {
19622 if (h[i] && l[i])
19623 {
19624 op = gen_reg_rtx (V32QImode);
19625 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19626 l[i] = op;
19627 }
19628 else if (h[i])
19629 l[i] = h[i];
19630 }
19631
19632 gcc_assert (l[0] && l[1]);
19633 op = d->target;
19634 if (d->vmode != V32QImode)
19635 op = gen_reg_rtx (V32QImode);
19636 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19637 if (op != d->target)
19638 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19639 return true;
19640 }
19641
19642 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19643 taken care of, perform the expansion in D and return true on success. */
19644
19645 static bool
19646 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19647 {
19648 /* Try a single instruction expansion. */
19649 if (expand_vec_perm_1 (d))
19650 return true;
19651
19652 /* Try sequences of two instructions. */
19653
19654 if (expand_vec_perm_pshuflw_pshufhw (d))
19655 return true;
19656
19657 if (expand_vec_perm_palignr (d, false))
19658 return true;
19659
19660 if (expand_vec_perm_interleave2 (d))
19661 return true;
19662
19663 if (expand_vec_perm_broadcast (d))
19664 return true;
19665
19666 if (expand_vec_perm_vpermq_perm_1 (d))
19667 return true;
19668
19669 if (expand_vec_perm_vperm2f128 (d))
19670 return true;
19671
19672 if (expand_vec_perm_pblendv (d))
19673 return true;
19674
19675 /* Try sequences of three instructions. */
19676
19677 if (expand_vec_perm_even_odd_pack (d))
19678 return true;
19679
19680 if (expand_vec_perm_2vperm2f128_vshuf (d))
19681 return true;
19682
19683 if (expand_vec_perm_pshufb2 (d))
19684 return true;
19685
19686 if (expand_vec_perm_interleave3 (d))
19687 return true;
19688
19689 if (expand_vec_perm_vperm2f128_vblend (d))
19690 return true;
19691
19692 /* Try sequences of four instructions. */
19693
19694 if (expand_vec_perm_even_odd_trunc (d))
19695 return true;
19696 if (expand_vec_perm_vpshufb2_vpermq (d))
19697 return true;
19698
19699 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19700 return true;
19701
19702 if (expand_vec_perm_vpermt2_vpshub2 (d))
19703 return true;
19704
19705 /* ??? Look for narrow permutations whose element orderings would
19706 allow the promotion to a wider mode. */
19707
19708 /* ??? Look for sequences of interleave or a wider permute that place
19709 the data into the correct lanes for a half-vector shuffle like
19710 pshuf[lh]w or vpermilps. */
19711
19712 /* ??? Look for sequences of interleave that produce the desired results.
19713 The combinatorics of punpck[lh] get pretty ugly... */
19714
19715 if (expand_vec_perm_even_odd (d))
19716 return true;
19717
19718 /* Even longer sequences. */
19719 if (expand_vec_perm_vpshufb4_vpermq2 (d))
19720 return true;
19721
19722 /* See if we can get the same permutation in different vector integer
19723 mode. */
19724 struct expand_vec_perm_d nd;
19725 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19726 {
19727 if (!d->testing_p)
19728 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19729 return true;
19730 }
19731
19732 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
19733 if (expand_vec_perm2_vperm2f128_vblend (d))
19734 return true;
19735
19736 return false;
19737 }
19738
19739 /* If a permutation only uses one operand, make it clear. Returns true
19740 if the permutation references both operands. */
19741
19742 static bool
19743 canonicalize_perm (struct expand_vec_perm_d *d)
19744 {
19745 int i, which, nelt = d->nelt;
19746
19747 for (i = which = 0; i < nelt; ++i)
19748 which |= (d->perm[i] < nelt ? 1 : 2);
19749
19750 d->one_operand_p = true;
19751 switch (which)
19752 {
19753 default:
19754 gcc_unreachable();
19755
19756 case 3:
19757 if (!rtx_equal_p (d->op0, d->op1))
19758 {
19759 d->one_operand_p = false;
19760 break;
19761 }
19762 /* The elements of PERM do not suggest that only the first operand
19763 is used, but both operands are identical. Allow easier matching
19764 of the permutation by folding the permutation into the single
19765 input vector. */
19766 /* FALLTHRU */
19767
19768 case 2:
19769 for (i = 0; i < nelt; ++i)
19770 d->perm[i] &= nelt - 1;
19771 d->op0 = d->op1;
19772 break;
19773
19774 case 1:
19775 d->op1 = d->op0;
19776 break;
19777 }
19778
19779 return (which == 3);
19780 }
19781
19782 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19783
19784 bool
19785 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19786 rtx op1, const vec_perm_indices &sel)
19787 {
19788 struct expand_vec_perm_d d;
19789 unsigned char perm[MAX_VECT_LEN];
19790 unsigned int i, nelt, which;
19791 bool two_args;
19792
19793 d.target = target;
19794 d.op0 = op0;
19795 d.op1 = op1;
19796
19797 d.vmode = vmode;
19798 gcc_assert (VECTOR_MODE_P (d.vmode));
19799 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19800 d.testing_p = !target;
19801
19802 gcc_assert (sel.length () == nelt);
19803 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19804
19805 /* Given sufficient ISA support we can just return true here
19806 for selected vector modes. */
19807 switch (d.vmode)
19808 {
19809 case E_V16SFmode:
19810 case E_V16SImode:
19811 case E_V8DImode:
19812 case E_V8DFmode:
19813 if (!TARGET_AVX512F)
19814 return false;
19815 /* All implementable with a single vperm[it]2 insn. */
19816 if (d.testing_p)
19817 return true;
19818 break;
19819 case E_V32HImode:
19820 if (!TARGET_AVX512BW)
19821 return false;
19822 if (d.testing_p)
19823 /* All implementable with a single vperm[it]2 insn. */
19824 return true;
19825 break;
19826 case E_V64QImode:
19827 if (!TARGET_AVX512BW)
19828 return false;
19829 if (d.testing_p)
19830 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19831 return true;
19832 break;
19833 case E_V8SImode:
19834 case E_V8SFmode:
19835 case E_V4DFmode:
19836 case E_V4DImode:
19837 if (!TARGET_AVX)
19838 return false;
19839 if (d.testing_p && TARGET_AVX512VL)
19840 /* All implementable with a single vperm[it]2 insn. */
19841 return true;
19842 break;
19843 case E_V16HImode:
19844 if (!TARGET_SSE2)
19845 return false;
19846 if (d.testing_p && TARGET_AVX2)
19847 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19848 return true;
19849 break;
19850 case E_V32QImode:
19851 if (!TARGET_SSE2)
19852 return false;
19853 if (d.testing_p && TARGET_AVX2)
19854 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19855 return true;
19856 break;
19857 case E_V8HImode:
19858 case E_V16QImode:
19859 if (!TARGET_SSE2)
19860 return false;
19861 /* Fall through. */
19862 case E_V4SImode:
19863 case E_V4SFmode:
19864 if (!TARGET_SSE)
19865 return false;
19866 /* All implementable with a single vpperm insn. */
19867 if (d.testing_p && TARGET_XOP)
19868 return true;
19869 /* All implementable with 2 pshufb + 1 ior. */
19870 if (d.testing_p && TARGET_SSSE3)
19871 return true;
19872 break;
19873 case E_V2SFmode:
19874 case E_V2SImode:
19875 case E_V4HImode:
19876 if (!TARGET_MMX_WITH_SSE)
19877 return false;
19878 break;
19879 case E_V2DImode:
19880 case E_V2DFmode:
19881 if (!TARGET_SSE)
19882 return false;
19883 /* All implementable with shufpd or unpck[lh]pd. */
19884 if (d.testing_p)
19885 return true;
19886 break;
19887 default:
19888 return false;
19889 }
19890
19891 for (i = which = 0; i < nelt; ++i)
19892 {
19893 unsigned char e = sel[i];
19894 gcc_assert (e < 2 * nelt);
19895 d.perm[i] = e;
19896 perm[i] = e;
19897 which |= (e < nelt ? 1 : 2);
19898 }
19899
19900 if (d.testing_p)
19901 {
19902 /* For all elements from second vector, fold the elements to first. */
19903 if (which == 2)
19904 for (i = 0; i < nelt; ++i)
19905 d.perm[i] -= nelt;
19906
19907 /* Check whether the mask can be applied to the vector type. */
19908 d.one_operand_p = (which != 3);
19909
19910 /* Implementable with shufps or pshufd. */
19911 if (d.one_operand_p
19912 && (d.vmode == V4SFmode || d.vmode == V2SFmode
19913 || d.vmode == V4SImode || d.vmode == V2SImode))
19914 return true;
19915
19916 /* Otherwise we have to go through the motions and see if we can
19917 figure out how to generate the requested permutation. */
19918 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19919 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19920 if (!d.one_operand_p)
19921 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19922
19923 start_sequence ();
19924 bool ret = ix86_expand_vec_perm_const_1 (&d);
19925 end_sequence ();
19926
19927 return ret;
19928 }
19929
19930 two_args = canonicalize_perm (&d);
19931
19932 /* If one of the operands is a zero vector, try to match pmovzx. */
19933 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
19934 {
19935 struct expand_vec_perm_d dzero = d;
19936 if (d.op0 == CONST0_RTX (vmode))
19937 {
19938 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
19939 std::swap (dzero.op0, dzero.op1);
19940 for (i = 0; i < nelt; ++i)
19941 dzero.perm[i] ^= nelt;
19942 }
19943 else
19944 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
19945
19946 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
19947 dzero.perm, nelt, dzero.testing_p))
19948 return true;
19949 }
19950
19951 /* Force operands into registers. */
19952 rtx nop0 = force_reg (vmode, d.op0);
19953 if (d.op0 == d.op1)
19954 d.op1 = nop0;
19955 d.op0 = nop0;
19956 d.op1 = force_reg (vmode, d.op1);
19957
19958 if (ix86_expand_vec_perm_const_1 (&d))
19959 return true;
19960
19961 /* If the selector says both arguments are needed, but the operands are the
19962 same, the above tried to expand with one_operand_p and flattened selector.
19963 If that didn't work, retry without one_operand_p; we succeeded with that
19964 during testing. */
19965 if (two_args && d.one_operand_p)
19966 {
19967 d.one_operand_p = false;
19968 memcpy (d.perm, perm, sizeof (perm));
19969 return ix86_expand_vec_perm_const_1 (&d);
19970 }
19971
19972 return false;
19973 }
19974
19975 void
19976 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19977 {
19978 struct expand_vec_perm_d d;
19979 unsigned i, nelt;
19980
19981 d.target = targ;
19982 d.op0 = op0;
19983 d.op1 = op1;
19984 d.vmode = GET_MODE (targ);
19985 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19986 d.one_operand_p = false;
19987 d.testing_p = false;
19988
19989 for (i = 0; i < nelt; ++i)
19990 d.perm[i] = i * 2 + odd;
19991
19992 /* We'll either be able to implement the permutation directly... */
19993 if (expand_vec_perm_1 (&d))
19994 return;
19995
19996 /* ... or we use the special-case patterns. */
19997 expand_vec_perm_even_odd_1 (&d, odd);
19998 }
19999
20000 static void
20001 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
20002 {
20003 struct expand_vec_perm_d d;
20004 unsigned i, nelt, base;
20005 bool ok;
20006
20007 d.target = targ;
20008 d.op0 = op0;
20009 d.op1 = op1;
20010 d.vmode = GET_MODE (targ);
20011 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
20012 d.one_operand_p = false;
20013 d.testing_p = false;
20014
20015 base = high_p ? nelt / 2 : 0;
20016 for (i = 0; i < nelt / 2; ++i)
20017 {
20018 d.perm[i * 2] = i + base;
20019 d.perm[i * 2 + 1] = i + base + nelt;
20020 }
20021
20022 /* Note that for AVX this isn't one instruction. */
20023 ok = ix86_expand_vec_perm_const_1 (&d);
20024 gcc_assert (ok);
20025 }
20026
20027 /* Optimize vector MUL generation for V8QI, V16QI and V32QI
20028    under TARGET_AVX512BW.  E.g. for a V16QImode a * b it emits
20029
20030 vpmovzxbw ymm2, xmm0
20031 vpmovzxbw ymm3, xmm1
20032 vpmullw ymm4, ymm2, ymm3
20033 vpmovwb xmm0, ymm4
20034
20035    which takes fewer instructions than ix86_expand_vecop_qihi.
20036    Return true on success.  */
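/* Roughly, the V16QImode case corresponds to the following intrinsics
   sketch (illustrative only; it assumes AVX512BW + AVX512VL and
   <immintrin.h>):

     __m128i
     mul_v16qi (__m128i a, __m128i b)
     {
       __m256i wa = _mm256_cvtepu8_epi16 (a);    // vpmovzxbw
       __m256i wb = _mm256_cvtepu8_epi16 (b);    // vpmovzxbw
       __m256i wr = _mm256_mullo_epi16 (wa, wb); // vpmullw
       return _mm256_cvtepi16_epi8 (wr);         // vpmovwb
     }  */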
20037
20038 bool
20039 ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
20040 {
20041 machine_mode himode, qimode = GET_MODE (dest);
20042 rtx hop1, hop2, hdest;
20043 rtx (*gen_extend)(rtx, rtx);
20044 rtx (*gen_truncate)(rtx, rtx);
20045
20046 /* There's no V64HImode multiplication instruction. */
20047 if (qimode == E_V64QImode)
20048 return false;
20049
20050 /* vpmovwb only available under AVX512BW. */
20051 if (!TARGET_AVX512BW)
20052 return false;
20053 if ((qimode == V8QImode || qimode == V16QImode)
20054 && !TARGET_AVX512VL)
20055 return false;
20056   /* Do not generate a zmm instruction when a 128/256-bit vector width is preferred.  */
20057 if (qimode == V32QImode
20058 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
20059 return false;
20060
20061 switch (qimode)
20062 {
20063 case E_V8QImode:
20064 himode = V8HImode;
20065 gen_extend = gen_zero_extendv8qiv8hi2;
20066 gen_truncate = gen_truncv8hiv8qi2;
20067 break;
20068 case E_V16QImode:
20069 himode = V16HImode;
20070 gen_extend = gen_zero_extendv16qiv16hi2;
20071 gen_truncate = gen_truncv16hiv16qi2;
20072 break;
20073 case E_V32QImode:
20074 himode = V32HImode;
20075 gen_extend = gen_zero_extendv32qiv32hi2;
20076 gen_truncate = gen_truncv32hiv32qi2;
20077 break;
20078 default:
20079 gcc_unreachable ();
20080 }
20081
20082 hop1 = gen_reg_rtx (himode);
20083 hop2 = gen_reg_rtx (himode);
20084 hdest = gen_reg_rtx (himode);
20085 emit_insn (gen_extend (hop1, op1));
20086 emit_insn (gen_extend (hop2, op2));
20087 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
20088 hop1, hop2)));
20089 emit_insn (gen_truncate (dest, hdest));
20090 return true;
20091 }
20092
20093 /* Expand a vector shift by a constant for a V*QImode in terms of the
20094    same operation on V*HImode.  Return true on success.  */
20095 bool
20096 ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
20097 {
20098 machine_mode qimode, himode;
20099 HOST_WIDE_INT and_constant, xor_constant;
20100 HOST_WIDE_INT shift_amount;
20101 rtx vec_const_and, vec_const_xor;
20102 rtx tmp, op1_subreg;
20103 rtx (*gen_shift) (rtx, rtx, rtx);
20104 rtx (*gen_and) (rtx, rtx, rtx);
20105 rtx (*gen_xor) (rtx, rtx, rtx);
20106 rtx (*gen_sub) (rtx, rtx, rtx);
20107
20108 /* Only optimize shift by constant. */
20109 if (!CONST_INT_P (op2))
20110 return false;
20111
20112 qimode = GET_MODE (dest);
20113 shift_amount = INTVAL (op2);
20114   /* Do nothing when the shift amount is greater than or equal to 8.  */
20115 if (shift_amount > 7)
20116 return false;
20117
20118 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
20119 /* Record sign bit. */
20120 xor_constant = 1 << (8 - shift_amount - 1);
20121
20122   /* Mask that zeroes the bits shifted in from the neighbouring byte:
	   low bits for a left shift, high bits for a right shift.  */
20123 and_constant
20124 = (code == ASHIFT ? 256 - (1 << shift_amount)
20125 : (1 << (8 - shift_amount)) - 1);
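  /* For example, an arithmetic right shift of each byte by 2 becomes
     (((x >> 2) & 0x3f) ^ 0x20) - 0x20: and_constant == 0x3f clears the
     bits shifted in from the neighbouring byte, and xor_constant == 0x20
     marks the shifted sign bit so the final subtraction sign-extends it
     into the cleared high bits.  */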
20126
20127 switch (qimode)
20128 {
20129 case V16QImode:
20130 himode = V8HImode;
20131 gen_shift =
20132 ((code == ASHIFT)
20133 ? gen_ashlv8hi3
20134 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
20135 gen_and = gen_andv16qi3;
20136 gen_xor = gen_xorv16qi3;
20137 gen_sub = gen_subv16qi3;
20138 break;
20139 case V32QImode:
20140 himode = V16HImode;
20141 gen_shift =
20142 ((code == ASHIFT)
20143 ? gen_ashlv16hi3
20144 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
20145 gen_and = gen_andv32qi3;
20146 gen_xor = gen_xorv32qi3;
20147 gen_sub = gen_subv32qi3;
20148 break;
20149 case V64QImode:
20150 himode = V32HImode;
20151 gen_shift =
20152 ((code == ASHIFT)
20153 ? gen_ashlv32hi3
20154 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
20155 gen_and = gen_andv64qi3;
20156 gen_xor = gen_xorv64qi3;
20157 gen_sub = gen_subv64qi3;
20158 break;
20159 default:
20160 gcc_unreachable ();
20161 }
20162
20163 tmp = gen_reg_rtx (himode);
20164 vec_const_and = gen_reg_rtx (qimode);
20165 op1_subreg = lowpart_subreg (himode, op1, qimode);
20166
20167 /* For ASHIFT and LSHIFTRT, perform operation like
20168 vpsllw/vpsrlw $shift_amount, %op1, %dest.
20169 vpand %vec_const_and, %dest. */
20170 emit_insn (gen_shift (tmp, op1_subreg, op2));
20171 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
20172 emit_move_insn (vec_const_and,
20173 ix86_build_const_vector (qimode, true,
20174 gen_int_mode (and_constant, QImode)));
20175 emit_insn (gen_and (dest, dest, vec_const_and));
20176
20177 /* For ASHIFTRT, perform extra operation like
20178 vpxor %vec_const_xor, %dest, %dest
20179 vpsubb %vec_const_xor, %dest, %dest */
20180 if (code == ASHIFTRT)
20181 {
20182 vec_const_xor = gen_reg_rtx (qimode);
20183 emit_move_insn (vec_const_xor,
20184 ix86_build_const_vector (qimode, true,
20185 gen_int_mode (xor_constant, QImode)));
20186 emit_insn (gen_xor (dest, dest, vec_const_xor));
20187 emit_insn (gen_sub (dest, dest, vec_const_xor));
20188 }
20189 return true;
20190 }
20191
20192 /* Expand a vector operation CODE for a V*QImode in terms of the
20193 same operation on V*HImode. */
20194
20195 void
20196 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
20197 {
20198 machine_mode qimode = GET_MODE (dest);
20199 machine_mode himode;
20200 rtx (*gen_il) (rtx, rtx, rtx);
20201 rtx (*gen_ih) (rtx, rtx, rtx);
20202 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
20203 struct expand_vec_perm_d d;
20204 bool ok, full_interleave;
20205 bool uns_p = false;
20206 int i;
20207
20208 switch (qimode)
20209 {
20210 case E_V16QImode:
20211 himode = V8HImode;
20212 gen_il = gen_vec_interleave_lowv16qi;
20213 gen_ih = gen_vec_interleave_highv16qi;
20214 break;
20215 case E_V32QImode:
20216 himode = V16HImode;
20217 gen_il = gen_avx2_interleave_lowv32qi;
20218 gen_ih = gen_avx2_interleave_highv32qi;
20219 break;
20220 case E_V64QImode:
20221 himode = V32HImode;
20222 gen_il = gen_avx512bw_interleave_lowv64qi;
20223 gen_ih = gen_avx512bw_interleave_highv64qi;
20224 break;
20225 default:
20226 gcc_unreachable ();
20227 }
20228
20229 op2_l = op2_h = op2;
20230 switch (code)
20231 {
20232 case MULT:
20233 /* Unpack data such that we've got a source byte in each low byte of
20234 each word. We don't care what goes into the high byte of each word.
20235      Rather than trying to get zero in there, it is most convenient to let
20236 it be a copy of the low byte. */
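      /* (The high bytes are irrelevant because only the low byte of each
	 16-bit product is kept in the end, and (a + 256*x) * (b + 256*y)
	 is congruent to a * b modulo 256 for any x and y.)  */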
20237 op2_l = gen_reg_rtx (qimode);
20238 op2_h = gen_reg_rtx (qimode);
20239 emit_insn (gen_il (op2_l, op2, op2));
20240 emit_insn (gen_ih (op2_h, op2, op2));
20241
20242 op1_l = gen_reg_rtx (qimode);
20243 op1_h = gen_reg_rtx (qimode);
20244 emit_insn (gen_il (op1_l, op1, op1));
20245 emit_insn (gen_ih (op1_h, op1, op1));
20246 full_interleave = qimode == V16QImode;
20247 break;
20248
20249 case ASHIFT:
20250 case LSHIFTRT:
20251 uns_p = true;
20252 /* FALLTHRU */
20253 case ASHIFTRT:
20254 op1_l = gen_reg_rtx (himode);
20255 op1_h = gen_reg_rtx (himode);
20256 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
20257 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
20258 full_interleave = true;
20259 break;
20260 default:
20261 gcc_unreachable ();
20262 }
20263
20264 /* Perform the operation. */
20265 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
20266 1, OPTAB_DIRECT);
20267 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
20268 1, OPTAB_DIRECT);
20269 gcc_assert (res_l && res_h);
20270
20271 /* Merge the data back into the right place. */
20272 d.target = dest;
20273 d.op0 = gen_lowpart (qimode, res_l);
20274 d.op1 = gen_lowpart (qimode, res_h);
20275 d.vmode = qimode;
20276 d.nelt = GET_MODE_NUNITS (qimode);
20277 d.one_operand_p = false;
20278 d.testing_p = false;
20279
20280 if (full_interleave)
20281 {
20282       /* For SSE2, we used a full interleave, so the desired
20283 results are in the even elements. */
20284 for (i = 0; i < d.nelt; ++i)
20285 d.perm[i] = i * 2;
20286 }
20287 else
20288 {
20289 /* For AVX, the interleave used above was not cross-lane. So the
20290	 extraction picks the evens, but with the second and third quarters swapped.
20291 Happily, that is even one insn shorter than even extraction.
20292 For AVX512BW we have 4 lanes. We extract evens from within a lane,
20293 always first from the first and then from the second source operand,
20294	 the index bits above the low 4 bits remain the same.
20295 Thus, for d.nelt == 32 we want permutation
20296 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
20297 and for d.nelt == 64 we want permutation
20298 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
20299 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
20300 for (i = 0; i < d.nelt; ++i)
20301 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
20302 }
20303
20304 ok = ix86_expand_vec_perm_const_1 (&d);
20305 gcc_assert (ok);
20306
20307 set_unique_reg_note (get_last_insn (), REG_EQUAL,
20308 gen_rtx_fmt_ee (code, qimode, op1, op2));
20309 }
20310
20311 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
20312 if op is CONST_VECTOR with all odd elements equal to their
20313 preceding element. */
20314
20315 static bool
20316 const_vector_equal_evenodd_p (rtx op)
20317 {
20318 machine_mode mode = GET_MODE (op);
20319 int i, nunits = GET_MODE_NUNITS (mode);
20320 if (GET_CODE (op) != CONST_VECTOR
20321 || nunits != CONST_VECTOR_NUNITS (op))
20322 return false;
20323 for (i = 0; i < nunits; i += 2)
20324 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
20325 return false;
20326 return true;
20327 }
20328
20329 void
20330 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
20331 bool uns_p, bool odd_p)
20332 {
20333 machine_mode mode = GET_MODE (op1);
20334 machine_mode wmode = GET_MODE (dest);
20335 rtx x;
20336 rtx orig_op1 = op1, orig_op2 = op2;
20337
20338 if (!nonimmediate_operand (op1, mode))
20339 op1 = force_reg (mode, op1);
20340 if (!nonimmediate_operand (op2, mode))
20341 op2 = force_reg (mode, op2);
20342
20343 /* We only play even/odd games with vectors of SImode. */
20344 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
20345
20346 /* If we're looking for the odd results, shift those members down to
20347 the even slots. For some cpus this is faster than a PSHUFD. */
20348 if (odd_p)
20349 {
20350 /* For XOP use vpmacsdqh, but only for smult, as it is only
20351 signed. */
20352 if (TARGET_XOP && mode == V4SImode && !uns_p)
20353 {
20354 x = force_reg (wmode, CONST0_RTX (wmode));
20355 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
20356 return;
20357 }
20358
20359 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
20360 if (!const_vector_equal_evenodd_p (orig_op1))
20361 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
20362 x, NULL, 1, OPTAB_DIRECT);
20363 if (!const_vector_equal_evenodd_p (orig_op2))
20364 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
20365 x, NULL, 1, OPTAB_DIRECT);
20366 op1 = gen_lowpart (mode, op1);
20367 op2 = gen_lowpart (mode, op2);
20368 }
20369
20370 if (mode == V16SImode)
20371 {
20372 if (uns_p)
20373 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
20374 else
20375 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
20376 }
20377 else if (mode == V8SImode)
20378 {
20379 if (uns_p)
20380 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
20381 else
20382 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
20383 }
20384 else if (uns_p)
20385 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
20386 else if (TARGET_SSE4_1)
20387 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
20388 else
20389 {
20390 rtx s1, s2, t0, t1, t2;
20391
20392 /* The easiest way to implement this without PMULDQ is to go through
20393	 the motions as if we are performing a full 64-bit multiply, except
20394	 that we need to do less shuffling of the elements.  */
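      /* The identity used is essentially: writing the signed inputs as
	 a = ua - 2**32 * [a < 0] and b = ub - 2**32 * [b < 0], with ua and
	 ub their unsigned bit patterns, gives modulo 2**64
	   a * b = ua * ub - 2**32 * (ub * [a < 0] + ua * [b < 0]).
	 The all-ones compare masks s1/s2 below, multiplied by the other
	 operand, summed and shifted left by 32 bits, contribute exactly
	 that correction term.  */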
20395
20396 /* Compute the sign-extension, aka highparts, of the two operands. */
20397 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
20398 op1, pc_rtx, pc_rtx);
20399 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
20400 op2, pc_rtx, pc_rtx);
20401
20402 /* Multiply LO(A) * HI(B), and vice-versa. */
20403 t1 = gen_reg_rtx (wmode);
20404 t2 = gen_reg_rtx (wmode);
20405 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
20406 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
20407
20408 /* Multiply LO(A) * LO(B). */
20409 t0 = gen_reg_rtx (wmode);
20410 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
20411
20412 /* Combine and shift the highparts into place. */
20413 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
20414 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
20415 1, OPTAB_DIRECT);
20416
20417 /* Combine high and low parts. */
20418 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
20419 return;
20420 }
20421 emit_insn (x);
20422 }
20423
20424 void
20425 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
20426 bool uns_p, bool high_p)
20427 {
20428 machine_mode wmode = GET_MODE (dest);
20429 machine_mode mode = GET_MODE (op1);
20430 rtx t1, t2, t3, t4, mask;
20431
20432 switch (mode)
20433 {
20434 case E_V4SImode:
20435 t1 = gen_reg_rtx (mode);
20436 t2 = gen_reg_rtx (mode);
20437 if (TARGET_XOP && !uns_p)
20438 {
20439 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
20440 shuffle the elements once so that all elements are in the right
20441 place for immediate use: { A C B D }. */
20442 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
20443 const1_rtx, GEN_INT (3)));
20444 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
20445 const1_rtx, GEN_INT (3)));
20446 }
20447 else
20448 {
20449 /* Put the elements into place for the multiply. */
20450 ix86_expand_vec_interleave (t1, op1, op1, high_p);
20451 ix86_expand_vec_interleave (t2, op2, op2, high_p);
20452 high_p = false;
20453 }
20454 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
20455 break;
20456
20457 case E_V8SImode:
20458 /* Shuffle the elements between the lanes. After this we
20459 have { A B E F | C D G H } for each operand. */
20460 t1 = gen_reg_rtx (V4DImode);
20461 t2 = gen_reg_rtx (V4DImode);
20462 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
20463 const0_rtx, const2_rtx,
20464 const1_rtx, GEN_INT (3)));
20465 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
20466 const0_rtx, const2_rtx,
20467 const1_rtx, GEN_INT (3)));
20468
20469 /* Shuffle the elements within the lanes. After this we
20470 have { A A B B | C C D D } or { E E F F | G G H H }. */
20471 t3 = gen_reg_rtx (V8SImode);
20472 t4 = gen_reg_rtx (V8SImode);
20473 mask = GEN_INT (high_p
20474 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
20475 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
20476 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
20477 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
20478
20479 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
20480 break;
20481
20482 case E_V8HImode:
20483 case E_V16HImode:
20484 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
20485 uns_p, OPTAB_DIRECT);
20486 t2 = expand_binop (mode,
20487 uns_p ? umul_highpart_optab : smul_highpart_optab,
20488 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
20489 gcc_assert (t1 && t2);
20490
20491 t3 = gen_reg_rtx (mode);
20492 ix86_expand_vec_interleave (t3, t1, t2, high_p);
20493 emit_move_insn (dest, gen_lowpart (wmode, t3));
20494 break;
20495
20496 case E_V16QImode:
20497 case E_V32QImode:
20498 case E_V32HImode:
20499 case E_V16SImode:
20500 case E_V64QImode:
20501 t1 = gen_reg_rtx (wmode);
20502 t2 = gen_reg_rtx (wmode);
20503 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
20504 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
20505
20506 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
20507 break;
20508
20509 default:
20510 gcc_unreachable ();
20511 }
20512 }
20513
20514 void
20515 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
20516 {
20517 rtx res_1, res_2, res_3, res_4;
20518
20519 res_1 = gen_reg_rtx (V4SImode);
20520 res_2 = gen_reg_rtx (V4SImode);
20521 res_3 = gen_reg_rtx (V2DImode);
20522 res_4 = gen_reg_rtx (V2DImode);
20523 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
20524 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
20525
20526 /* Move the results in element 2 down to element 1; we don't care
20527 what goes in elements 2 and 3. Then we can merge the parts
20528 back together with an interleave.
20529
20530 Note that two other sequences were tried:
20531 (1) Use interleaves at the start instead of psrldq, which allows
20532 us to use a single shufps to merge things back at the end.
20533 (2) Use shufps here to combine the two vectors, then pshufd to
20534 put the elements in the correct order.
20535 In both cases the cost of the reformatting stall was too high
20536 and the overall sequence slower. */
20537
20538 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
20539 const0_rtx, const2_rtx,
20540 const0_rtx, const0_rtx));
20541 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
20542 const0_rtx, const2_rtx,
20543 const0_rtx, const0_rtx));
20544 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
20545
20546 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
20547 }
20548
20549 void
20550 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
20551 {
20552 machine_mode mode = GET_MODE (op0);
20553 rtx t1, t2, t3, t4, t5, t6;
20554
20555 if (TARGET_AVX512DQ && mode == V8DImode)
20556 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
20557 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
20558 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
20559 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
20560 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
20561 else if (TARGET_XOP && mode == V2DImode)
20562 {
20563 /* op1: A,B,C,D, op2: E,F,G,H */
20564 op1 = gen_lowpart (V4SImode, op1);
20565 op2 = gen_lowpart (V4SImode, op2);
20566
20567 t1 = gen_reg_rtx (V4SImode);
20568 t2 = gen_reg_rtx (V4SImode);
20569 t3 = gen_reg_rtx (V2DImode);
20570 t4 = gen_reg_rtx (V2DImode);
20571
20572 /* t1: B,A,D,C */
20573 emit_insn (gen_sse2_pshufd_1 (t1, op1,
20574 GEN_INT (1),
20575 GEN_INT (0),
20576 GEN_INT (3),
20577 GEN_INT (2)));
20578
20579 /* t2: (B*E),(A*F),(D*G),(C*H) */
20580 emit_insn (gen_mulv4si3 (t2, t1, op2));
20581
20582 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
20583 emit_insn (gen_xop_phadddq (t3, t2));
20584
20585 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
20586 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
20587
20588       /* Multiply the lower parts and add everything together.  */
20589 t5 = gen_reg_rtx (V2DImode);
20590 emit_insn (gen_vec_widen_umult_even_v4si (t5,
20591 gen_lowpart (V4SImode, op1),
20592 gen_lowpart (V4SImode, op2)));
20593 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
20594 }
20595 else
20596 {
20597 machine_mode nmode;
20598 rtx (*umul) (rtx, rtx, rtx);
20599
20600 if (mode == V2DImode)
20601 {
20602 umul = gen_vec_widen_umult_even_v4si;
20603 nmode = V4SImode;
20604 }
20605 else if (mode == V4DImode)
20606 {
20607 umul = gen_vec_widen_umult_even_v8si;
20608 nmode = V8SImode;
20609 }
20610 else if (mode == V8DImode)
20611 {
20612 umul = gen_vec_widen_umult_even_v16si;
20613 nmode = V16SImode;
20614 }
20615 else
20616 gcc_unreachable ();
20617
20618
20619 /* Multiply low parts. */
20620 t1 = gen_reg_rtx (mode);
20621 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
20622
20623 /* Shift input vectors right 32 bits so we can multiply high parts. */
20624 t6 = GEN_INT (32);
20625 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
20626 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
20627
20628 /* Multiply high parts by low parts. */
20629 t4 = gen_reg_rtx (mode);
20630 t5 = gen_reg_rtx (mode);
20631 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
20632 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
20633
20634 /* Combine and shift the highparts back. */
20635 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
20636 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
20637
20638 /* Combine high and low parts. */
20639 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
20640 }
20641
20642 set_unique_reg_note (get_last_insn (), REG_EQUAL,
20643 gen_rtx_MULT (mode, op1, op2));
20644 }
20645
20646 /* Return true if control transfer instruction INSN
20647    should be encoded with the notrack prefix.  */
20648
20649 bool
20650 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
20651 {
20652 if (!insn || !((flag_cf_protection & CF_BRANCH)))
20653 return false;
20654
20655 if (CALL_P (insn))
20656 {
20657 rtx call = get_call_rtx_from (insn);
20658 gcc_assert (call != NULL_RTX);
20659 rtx addr = XEXP (call, 0);
20660
20661 /* Do not emit 'notrack' if it's not an indirect call. */
20662 if (MEM_P (addr)
20663 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
20664 return false;
20665 else
20666 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
20667 }
20668
20669 if (JUMP_P (insn) && !flag_cet_switch)
20670 {
20671 rtx target = JUMP_LABEL (insn);
20672 if (target == NULL_RTX || ANY_RETURN_P (target))
20673 return false;
20674
20675       /* Check whether the jump targets a switch table.  */
20676 rtx_insn *label = as_a<rtx_insn *> (target);
20677 rtx_insn *table = next_insn (label);
20678 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
20679 return false;
20680 else
20681 return true;
20682 }
20683 return false;
20684 }
20685
20686 /* Calculate integer abs() using only SSE2 instructions. */
20687
20688 void
20689 ix86_expand_sse2_abs (rtx target, rtx input)
20690 {
20691 machine_mode mode = GET_MODE (target);
20692 rtx tmp0, tmp1, x;
20693
20694 switch (mode)
20695 {
20696 case E_V2DImode:
20697 case E_V4DImode:
20698 /* For 64-bit signed integer X, with SSE4.2 use
20699    pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20700    Otherwise handle it similarly to V4SImode, except use 64 as W instead
20701    of 32 and, since a 64-bit arithmetic right shift is not implemented,
20702    derive the sign mask with a logical right shift followed by negation. */
20703 if (TARGET_SSE4_2)
20704 {
20705 tmp0 = gen_reg_rtx (mode);
20706 tmp1 = gen_reg_rtx (mode);
20707 emit_move_insn (tmp1, CONST0_RTX (mode));
20708 if (mode == E_V2DImode)
20709 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
20710 else
20711 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
20712 }
20713 else
20714 {
20715 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
20716 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
20717 - 1), NULL, 0, OPTAB_DIRECT);
20718 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
20719 }
20720
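      /* In either case tmp0 is now all-ones in lanes where the input is
         negative and zero elsewhere, so (input ^ tmp0) - tmp0 below yields
         the absolute value: ~input + 1 for negative lanes, input unchanged
         otherwise. */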
20721 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20722 NULL, 0, OPTAB_DIRECT);
20723 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20724 target, 0, OPTAB_DIRECT);
20725 break;
20726
20727 case E_V4SImode:
20728 /* For 32-bit signed integer X, the best way to calculate the absolute
20729 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
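      /* For example, X = -5 gives a shift result of -1 (all ones), and
         (X ^ -1) - (-1) = ~X + 1 = 5; for non-negative X the shift result
         is 0 and X is returned unchanged. */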
20730 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20731 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20732 NULL, 0, OPTAB_DIRECT);
20733 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20734 NULL, 0, OPTAB_DIRECT);
20735 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20736 target, 0, OPTAB_DIRECT);
20737 break;
20738
20739 case E_V8HImode:
20740 /* For 16-bit signed integer X, the best way to calculate the absolute
20741 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
20742 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20743
20744 x = expand_simple_binop (mode, SMAX, tmp0, input,
20745 target, 0, OPTAB_DIRECT);
20746 break;
20747
20748 case E_V16QImode:
20749 /* For 8-bit signed integer X, the best way to calculate the absolute
20750 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20751 as SSE2 provides the PMINUB insn. */
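      /* E.g. for X = -5 the two unsigned byte values are 251 and 5, and
         the unsigned minimum 5 is the absolute value. */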
20752 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20753
20754 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20755 target, 0, OPTAB_DIRECT);
20756 break;
20757
20758 default:
20759 gcc_unreachable ();
20760 }
20761
20762 if (x != target)
20763 emit_move_insn (target, x);
20764 }
20765
20766 /* Expand an extract from a vector register through pextr insn.
20767 Return true if successful. */
20768
20769 bool
20770 ix86_expand_pextr (rtx *operands)
20771 {
20772 rtx dst = operands[0];
20773 rtx src = operands[1];
20774
20775 unsigned int size = INTVAL (operands[2]);
20776 unsigned int pos = INTVAL (operands[3]);
20777
20778 if (SUBREG_P (dst))
20779 {
20780 /* Reject non-lowpart subregs. */
20781 if (SUBREG_BYTE (dst) > 0)
20782 return false;
20783 dst = SUBREG_REG (dst);
20784 }
20785
20786 if (SUBREG_P (src))
20787 {
20788 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20789 src = SUBREG_REG (src);
20790 }
20791
20792 switch (GET_MODE (src))
20793 {
20794 case E_V16QImode:
20795 case E_V8HImode:
20796 case E_V4SImode:
20797 case E_V2DImode:
20798 case E_V1TImode:
20799 {
20800 machine_mode srcmode, dstmode;
20801 rtx d, pat;
20802
20803 if (!int_mode_for_size (size, 0).exists (&dstmode))
20804 return false;
20805
20806 switch (dstmode)
20807 {
20808 case E_QImode:
20809 if (!TARGET_SSE4_1)
20810 return false;
20811 srcmode = V16QImode;
20812 break;
20813
20814 case E_HImode:
20815 if (!TARGET_SSE2)
20816 return false;
20817 srcmode = V8HImode;
20818 break;
20819
20820 case E_SImode:
20821 if (!TARGET_SSE4_1)
20822 return false;
20823 srcmode = V4SImode;
20824 break;
20825
20826 case E_DImode:
20827 gcc_assert (TARGET_64BIT);
20828 if (!TARGET_SSE4_1)
20829 return false;
20830 srcmode = V2DImode;
20831 break;
20832
20833 default:
20834 return false;
20835 }
20836
20837 /* Reject extractions from misaligned positions. */
20838 if (pos & (size-1))
20839 return false;
20840
20841 if (GET_MODE (dst) == dstmode)
20842 d = dst;
20843 else
20844 d = gen_reg_rtx (dstmode);
20845
20846 /* Construct insn pattern. */
20847 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20848 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
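	/* The resulting vec_select of element POS / SIZE is what the
	   vec_extract patterns for pextrb/pextrw/pextrd/pextrq match. */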
20849
20850 /* Let the rtl optimizers know about the zero extension performed. */
20851 if (dstmode == QImode || dstmode == HImode)
20852 {
20853 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20854 d = gen_lowpart (SImode, d);
20855 }
20856
20857 emit_insn (gen_rtx_SET (d, pat));
20858
20859 if (d != dst)
20860 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20861 return true;
20862 }
20863
20864 default:
20865 return false;
20866 }
20867 }
20868
20869 /* Expand an insert into a vector register through pinsr insn.
20870 Return true if successful. */
20871
20872 bool
20873 ix86_expand_pinsr (rtx *operands)
20874 {
20875 rtx dst = operands[0];
20876 rtx src = operands[3];
20877
20878 unsigned int size = INTVAL (operands[1]);
20879 unsigned int pos = INTVAL (operands[2]);
20880
20881 if (SUBREG_P (dst))
20882 {
20883 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20884 dst = SUBREG_REG (dst);
20885 }
20886
20887 switch (GET_MODE (dst))
20888 {
20889 case E_V16QImode:
20890 case E_V8HImode:
20891 case E_V4SImode:
20892 case E_V2DImode:
20893 case E_V1TImode:
20894 {
20895 machine_mode srcmode, dstmode;
20896 rtx (*pinsr)(rtx, rtx, rtx, rtx);
20897 rtx d;
20898
20899 if (!int_mode_for_size (size, 0).exists (&srcmode))
20900 return false;
20901
20902 switch (srcmode)
20903 {
20904 case E_QImode:
20905 if (!TARGET_SSE4_1)
20906 return false;
20907 dstmode = V16QImode;
20908 pinsr = gen_sse4_1_pinsrb;
20909 break;
20910
20911 case E_HImode:
20912 if (!TARGET_SSE2)
20913 return false;
20914 dstmode = V8HImode;
20915 pinsr = gen_sse2_pinsrw;
20916 break;
20917
20918 case E_SImode:
20919 if (!TARGET_SSE4_1)
20920 return false;
20921 dstmode = V4SImode;
20922 pinsr = gen_sse4_1_pinsrd;
20923 break;
20924
20925 case E_DImode:
20926 gcc_assert (TARGET_64BIT);
20927 if (!TARGET_SSE4_1)
20928 return false;
20929 dstmode = V2DImode;
20930 pinsr = gen_sse4_1_pinsrq;
20931 break;
20932
20933 default:
20934 return false;
20935 }
20936
20937 /* Reject insertions to misaligned positions. */
20938 if (pos & (size-1))
20939 return false;
20940
20941 if (SUBREG_P (src))
20942 {
20943 unsigned int srcpos = SUBREG_BYTE (src);
20944
20945 if (srcpos > 0)
20946 {
20947 rtx extr_ops[4];
20948
20949 extr_ops[0] = gen_reg_rtx (srcmode);
20950 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20951 extr_ops[2] = GEN_INT (size);
20952 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20953
20954 if (!ix86_expand_pextr (extr_ops))
20955 return false;
20956
20957 src = extr_ops[0];
20958 }
20959 else
20960 src = gen_lowpart (srcmode, SUBREG_REG (src));
20961 }
20962
20963 if (GET_MODE (dst) == dstmode)
20964 d = dst;
20965 else
20966 d = gen_reg_rtx (dstmode);
20967
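	/* The pinsr patterns express the destination lane as a vec_merge
	   with a one-hot mask, so the lane index pos / size is passed as
	   1 << (pos / size). */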
20968 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
20969 gen_lowpart (srcmode, src),
20970 GEN_INT (1 << (pos / size))));
20971 if (d != dst)
20972 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20973 return true;
20974 }
20975
20976 default:
20977 return false;
20978 }
20979 }
20980
20981 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
20982    of the upper against the lower halves down to SSE register size. */
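/* For example, a V8SImode reduction is first reduced to V4SImode by
   combining its upper and lower 128-bit halves, and the remaining steps
   then stay within a single SSE register. */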
20983
20984 machine_mode
20985 ix86_split_reduction (machine_mode mode)
20986 {
20987 /* Reduce lowpart against highpart until we reach SSE reg width to
20988 avoid cross-lane operations. */
20989 switch (mode)
20990 {
20991 case E_V8DImode:
20992 case E_V4DImode:
20993 return V2DImode;
20994 case E_V16SImode:
20995 case E_V8SImode:
20996 return V4SImode;
20997 case E_V32HImode:
20998 case E_V16HImode:
20999 return V8HImode;
21000 case E_V64QImode:
21001 case E_V32QImode:
21002 return V16QImode;
21003 case E_V16SFmode:
21004 case E_V8SFmode:
21005 return V4SFmode;
21006 case E_V8DFmode:
21007 case E_V4DFmode:
21008 return V2DFmode;
21009 default:
21010 return mode;
21011 }
21012 }
21013
21014 /* Generate call to __divmoddi4. */
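/* The library routine returns the quotient and stores the remainder
   through its third, pointer argument, for which a stack temporary is
   used here. */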
21015
21016 void
21017 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
21018 rtx op0, rtx op1,
21019 rtx *quot_p, rtx *rem_p)
21020 {
21021 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
21022
21023 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
21024 mode, op0, mode, op1, mode,
21025 XEXP (rem, 0), Pmode);
21026 *quot_p = quot;
21027 *rem_p = rem;
21028 }
21029
21030 #include "gt-i386-expand.h"