1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53
54 /*************************************************************************
55 Simple Loop Peeling Utilities
56
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
59
60
61 /* Renames the use *OP_P. */
62
63 static void
64 rename_use_op (use_operand_p op_p)
65 {
66 tree new_name;
67
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
70
71 new_name = get_current_def (USE_FROM_PTR (op_p));
72
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
76
77 /* An ordinary ssa name defined in the loop. */
78
79 SET_USE (op_p, new_name);
80 }
81
82
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
85 true. */
86
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
89 {
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
97
98 if (rename_from_outer_loop)
99 {
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
102 }
103
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
106 {
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
110 }
111
112 FOR_EACH_EDGE (e, ei, bb->preds)
113 {
114 if (!flow_bb_inside_loop_p (loop, e->src))
115 {
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
119 {
120 if (outer_loop->inner->next)
121 {
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
128 }
129 }
130 }
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
134 }
135 }
136
137
138 struct adjust_info
139 {
140 tree from, to;
141 basic_block bb;
142 };
143
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
151
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
155
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
158 {
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
165
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
167
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
171 {
172 use_operand_p use_p;
173 basic_block bbuse;
174
175 if (!is_gimple_debug (stmt))
176 continue;
177
178 gcc_assert (gimple_debug_bind_p (stmt));
179
180 bbuse = gimple_bb (stmt);
181
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
186 {
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
191 {
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
194 }
195 }
196 }
197 }
198
199 /* Adjust debug stmts as scheduled before. */
200
201 static void
202 adjust_vec_debug_stmts (void)
203 {
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
206
207 gcc_assert (adjust_vec.exists ());
208
209 while (!adjust_vec.is_empty ())
210 {
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
213 }
214 }
215
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
220
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
223 {
224 adjust_info ai;
225
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
230 {
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
234
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
239 }
240 }
241
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
246
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
249 {
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
251
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
253
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
257 }
258
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
262
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
266 {
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
270 }
271
272 /* Add SEQ to the end of LOOP's preheader block. */
273
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
276 {
277 if (seq)
278 {
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
282 }
283 }
284
285 /* Add SEQ to the beginning of LOOP's header block. */
286
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
289 {
290 if (seq)
291 {
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294 }
295 }
296
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
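/* As an illustration of the encoding built below (lane counts chosen only
   for this example): for an 8-element vector type, OFFSET 0 selects the
   permutation { 0, 8, 1, 9, 2, 10, 3, 11 }, interleaving the first halves
   of the two inputs, while OFFSET 1 selects { 4, 12, 5, 13, 6, 14, 7, 15 },
   interleaving the second halves.  */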
301
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
305 {
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
310 {
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
313 }
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
316 }
317
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
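/* A small illustration of the interleaving strategy used when DEST and SRC
   share the same mask type (lane values are only an example): given a
   source mask { s0, s1, ..., s7 }, the two destination masks become
   { s0, s0, s1, s1, s2, s2, s3, s3 } and { s4, s4, s5, s5, s6, s6, s7, s7 },
   i.e. each source bit controls two adjacent destination lanes.  */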
321
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
324 rgroup_controls *src_rgm)
325 {
326 tree src_masktype = src_rgm->type;
327 tree dest_masktype = dest_rgm->type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
335 {
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
342 {
343 tree src = src_rgm->controls[i / 2];
344 tree dest = dest_rgm->controls[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
352 {
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
359 }
360 gimple_seq_add_stmt (seq, stmt);
361 }
362 return true;
363 }
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
368 {
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
375 {
376 tree src = src_rgm->controls[i / 2];
377 tree dest = dest_rgm->controls[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
381 }
382 return true;
383 }
384 return false;
385 }
386
387 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
388 for all the rgroup controls in RGC and return a control that is nonzero
389 when the loop needs to iterate. Add any new preheader statements to
390 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
391
392 RGC belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
394
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
399
400 It is known that:
401
402 NITERS * RGC->max_nscalars_per_iter * RGC->factor
403
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
406
407 VF * RGC->max_nscalars_per_iter * RGC->factor
408
409 might overflow before hitting a value above:
410
411 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
412
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks or zero
415 lengths for RGC.
416
417 Note: the cost of the code generated by this function is modeled
418 by vect_estimate_min_profitable_iters, so changes here may need
419 corresponding changes there. */
420
421 static tree
422 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
423 gimple_seq *preheader_seq,
424 gimple_stmt_iterator loop_cond_gsi,
425 rgroup_controls *rgc, tree niters,
426 tree niters_skip, bool might_wrap_p)
427 {
428 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
429 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
430 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
431
432 tree ctrl_type = rgc->type;
433 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
434 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
435 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
436 tree length_limit = NULL_TREE;
437 /* For length-based controls, we need length_limit to keep the lengths in range. */
438 if (!use_masks_p)
439 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
440
441 /* Calculate the maximum number of item values that the rgroup
442 handles in total, the number that it handles for each iteration
443 of the vector loop, and the number that it should skip during the
444 first iteration of the vector loop. */
445 tree nitems_total = niters;
446 tree nitems_step = build_int_cst (iv_type, vf);
447 tree nitems_skip = niters_skip;
448 if (nitems_per_iter != 1)
449 {
450 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
451 these multiplications don't overflow. */
452 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
453 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
454 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
455 nitems_total, compare_factor);
456 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
457 nitems_step, iv_factor);
458 if (nitems_skip)
459 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
460 nitems_skip, compare_factor);
461 }
462
463 /* Create an induction variable that counts the number of items
464 processed. */
465 tree index_before_incr, index_after_incr;
466 gimple_stmt_iterator incr_gsi;
467 bool insert_after;
468 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
469 create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
470 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
471
472 tree zero_index = build_int_cst (compare_type, 0);
473 tree test_index, test_limit, first_limit;
474 gimple_stmt_iterator *test_gsi;
475 if (might_wrap_p)
476 {
477 /* In principle the loop should stop iterating once the incremented
478 IV reaches a value greater than or equal to:
479
480 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
481
482 However, there's no guarantee that this addition doesn't overflow
483 the comparison type, or that the IV hits a value above it before
484 wrapping around. We therefore adjust the limit down by one
485 IV step:
486
487 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
488 -[infinite-prec] NITEMS_STEP
489
490 and compare the IV against this limit _before_ incrementing it.
491 Since the comparison type is unsigned, we actually want the
492 subtraction to saturate at zero:
493
494 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
495 -[sat] NITEMS_STEP
496
497 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
498
499 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
500
501 where the rightmost subtraction can be done directly in
502 COMPARE_TYPE. */
503 test_index = index_before_incr;
504 tree adjust = gimple_convert (preheader_seq, compare_type,
505 nitems_step);
506 if (nitems_skip)
507 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
508 adjust, nitems_skip);
509 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
510 nitems_total, adjust);
511 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
512 test_limit, adjust);
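      /* A worked example of the saturating limit (values illustrative only):
	 with NITEMS_TOTAL == 3, NITEMS_STEP == 8 and no NITEMS_SKIP, ADJUST
	 is 8 and TEST_LIMIT becomes MAX (3, 8) - 8 == 0, the saturated value
	 of 3 - 8.  The pre-increment IV value 0 is then not below the limit,
	 so the next control is all-false (or zero length) and the loop runs
	 exactly one partial iteration.  */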
513 test_gsi = &incr_gsi;
514
515 /* Get a safe limit for the first iteration. */
516 if (nitems_skip)
517 {
518 /* The first vector iteration can handle at most NITEMS_STEP
519 items. NITEMS_STEP <= CONST_LIMIT, and adding
520 NITEMS_SKIP to that cannot overflow. */
521 tree const_limit = build_int_cst (compare_type,
522 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
523 * nitems_per_iter);
524 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
525 nitems_total, const_limit);
526 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
527 first_limit, nitems_skip);
528 }
529 else
530 /* For the first iteration it doesn't matter whether the IV hits
531 a value above NITEMS_TOTAL. That only matters for the latch
532 condition. */
533 first_limit = nitems_total;
534 }
535 else
536 {
537 /* Test the incremented IV, which will always hit a value above
538 the bound before wrapping. */
539 test_index = index_after_incr;
540 test_limit = nitems_total;
541 if (nitems_skip)
542 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
543 test_limit, nitems_skip);
544 test_gsi = &loop_cond_gsi;
545
546 first_limit = test_limit;
547 }
548
549 /* Convert the IV value to the comparison type (either a no-op or
550 a demotion). */
551 gimple_seq test_seq = NULL;
552 test_index = gimple_convert (&test_seq, compare_type, test_index);
553 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
554
555 /* Provide a definition of each control in the group. */
556 tree next_ctrl = NULL_TREE;
557 tree ctrl;
558 unsigned int i;
559 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
560 {
561 /* Previous controls will cover BIAS items. This control covers the
562 next batch. */
563 poly_uint64 bias = nitems_per_ctrl * i;
564 tree bias_tree = build_int_cst (compare_type, bias);
565
566 /* See whether the first iteration of the vector loop is known
567 to have a full control. */
568 poly_uint64 const_limit;
569 bool first_iteration_full
570 = (poly_int_tree_p (first_limit, &const_limit)
571 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
572
573 /* Rather than have a new IV that starts at BIAS and goes up to
574 TEST_LIMIT, prefer to use the same 0-based IV for each control
575 and adjust the bound down by BIAS. */
576 tree this_test_limit = test_limit;
577 if (i != 0)
578 {
579 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
580 compare_type, this_test_limit,
581 bias_tree);
582 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
583 compare_type, this_test_limit,
584 bias_tree);
585 }
586
587 /* Create the initial control. First include all items that
588 are within the loop limit. */
589 tree init_ctrl = NULL_TREE;
590 if (!first_iteration_full)
591 {
592 tree start, end;
593 if (first_limit == test_limit)
594 {
595 /* Use a natural test between zero (the initial IV value)
596 and the loop limit. The "else" block would be valid too,
597 but this choice can avoid the need to load BIAS_TREE into
598 a register. */
599 start = zero_index;
600 end = this_test_limit;
601 }
602 else
603 {
604 /* FIRST_LIMIT is the maximum number of items handled by the
605 first iteration of the vector loop. Test the portion
606 associated with this control. */
607 start = bias_tree;
608 end = first_limit;
609 }
610
611 if (use_masks_p)
612 {
613 init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask");
614 gimple *tmp_stmt = vect_gen_while (init_ctrl, start, end);
615 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
616 }
617 else
618 {
619 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
620 gimple_seq seq = vect_gen_len (init_ctrl, start,
621 end, length_limit);
622 gimple_seq_add_seq (preheader_seq, seq);
623 }
624 }
625
626 /* Now AND out the bits that are within the number of skipped
627 items. */
628 poly_uint64 const_skip;
629 if (nitems_skip
630 && !(poly_int_tree_p (nitems_skip, &const_skip)
631 && known_le (const_skip, bias)))
632 {
633 gcc_assert (use_masks_p);
634 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
635 bias_tree, nitems_skip);
636 if (init_ctrl)
637 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
638 init_ctrl, unskipped_mask);
639 else
640 init_ctrl = unskipped_mask;
641 }
642
643 if (!init_ctrl)
644 {
645 /* First iteration is full. */
646 if (use_masks_p)
647 init_ctrl = build_minus_one_cst (ctrl_type);
648 else
649 init_ctrl = length_limit;
650 }
651
652 /* Get the control value for the next iteration of the loop. */
653 if (use_masks_p)
654 {
655 next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask");
656 gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit);
657 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
658 }
659 else
660 {
661 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
662 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
663 length_limit);
664 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
665 }
666
667 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
668 }
669 return next_ctrl;
670 }
671
672 /* Set up the iteration condition and rgroup controls for LOOP, given
673 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
674 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
675 the number of iterations of the original scalar loop that should be
676 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
677 for vect_set_loop_condition.
678
679 Insert the branch-back condition before LOOP_COND_GSI and return the
680 final gcond. */
681
682 static gcond *
683 vect_set_loop_condition_partial_vectors (class loop *loop,
684 loop_vec_info loop_vinfo, tree niters,
685 tree final_iv, bool niters_maybe_zero,
686 gimple_stmt_iterator loop_cond_gsi)
687 {
688 gimple_seq preheader_seq = NULL;
689 gimple_seq header_seq = NULL;
690
691 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
692 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
693 unsigned int compare_precision = TYPE_PRECISION (compare_type);
694 tree orig_niters = niters;
695
696 /* Type of the initial value of NITERS. */
697 tree ni_actual_type = TREE_TYPE (niters);
698 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
699 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
700
701 /* Convert NITERS to the same size as the compare. */
702 if (compare_precision > ni_actual_precision
703 && niters_maybe_zero)
704 {
705 /* We know that there is always at least one iteration, so if the
706 count is zero then it must have wrapped. Cope with this by
707 subtracting 1 before the conversion and adding 1 to the result. */
708 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
709 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
710 niters, build_minus_one_cst (ni_actual_type));
711 niters = gimple_convert (&preheader_seq, compare_type, niters);
712 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
713 niters, build_one_cst (compare_type));
714 }
715 else
716 niters = gimple_convert (&preheader_seq, compare_type, niters);
717
718 /* Iterate over all the rgroups and fill in their controls. We could use
719 the first control from any rgroup for the loop condition; here we
720 arbitrarily pick the last. */
721 tree test_ctrl = NULL_TREE;
722 rgroup_controls *rgc;
723 unsigned int i;
724 auto_vec<rgroup_controls> *controls = use_masks_p
725 ? &LOOP_VINFO_MASKS (loop_vinfo)
726 : &LOOP_VINFO_LENS (loop_vinfo);
727 FOR_EACH_VEC_ELT (*controls, i, rgc)
728 if (!rgc->controls.is_empty ())
729 {
730 /* First try using permutes. This adds a single vector
731 instruction to the loop for each mask, but needs no extra
732 loop invariants or IVs. */
733 unsigned int nmasks = i + 1;
734 if (use_masks_p && (nmasks & 1) == 0)
735 {
736 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
737 if (!half_rgc->controls.is_empty ()
738 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
739 continue;
740 }
741
742 /* See whether zero-based IV would ever generate all-false masks
743 or zero length before wrapping around. */
744 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
745
746 /* Set up all controls for this group. */
747 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
748 &preheader_seq,
749 loop_cond_gsi, rgc,
750 niters, niters_skip,
751 might_wrap_p);
752 }
753
754 /* Emit all accumulated statements. */
755 add_preheader_seq (loop, preheader_seq);
756 add_header_seq (loop, header_seq);
757
758 /* Get a boolean result that tells us whether to iterate. */
759 edge exit_edge = single_exit (loop);
760 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
761 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
762 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
763 NULL_TREE, NULL_TREE);
764 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
765
766 /* The loop iterates (NITERS - 1) / VF + 1 times.
767 Subtract one from this to get the latch count. */
768 tree step = build_int_cst (compare_type,
769 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
770 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
771 build_minus_one_cst (compare_type));
772 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
773 niters_minus_one, step);
774
775 if (final_iv)
776 {
777 gassign *assign = gimple_build_assign (final_iv, orig_niters);
778 gsi_insert_on_edge_immediate (single_exit (loop), assign);
779 }
780
781 return cond_stmt;
782 }
783
784 /* Like vect_set_loop_condition, but handle the case in which the vector
785 loop handles exactly VF scalars per iteration. */
786
787 static gcond *
788 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
789 tree final_iv, bool niters_maybe_zero,
790 gimple_stmt_iterator loop_cond_gsi)
791 {
792 tree indx_before_incr, indx_after_incr;
793 gcond *cond_stmt;
794 gcond *orig_cond;
795 edge pe = loop_preheader_edge (loop);
796 edge exit_edge = single_exit (loop);
797 gimple_stmt_iterator incr_gsi;
798 bool insert_after;
799 enum tree_code code;
800 tree niters_type = TREE_TYPE (niters);
801
802 orig_cond = get_loop_exit_condition (loop);
803 gcc_assert (orig_cond);
804 loop_cond_gsi = gsi_for_stmt (orig_cond);
805
806 tree init, limit;
807 if (!niters_maybe_zero && integer_onep (step))
808 {
809 /* In this case we can use a simple 0-based IV:
810
811 A:
812 x = 0;
813 do
814 {
815 ...
816 x += 1;
817 }
818 while (x < NITERS); */
819 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
820 init = build_zero_cst (niters_type);
821 limit = niters;
822 }
823 else
824 {
825 /* The following works for all values of NITERS except 0:
826
827 B:
828 x = 0;
829 do
830 {
831 ...
832 x += STEP;
833 }
834 while (x <= NITERS - STEP);
835
836 so that the loop continues to iterate if x + STEP - 1 < NITERS
837 but stops if x + STEP - 1 >= NITERS.
838
839 However, if NITERS is zero, x never hits a value above NITERS - STEP
840 before wrapping around. There are two obvious ways of dealing with
841 this:
842
843 - start at STEP - 1 and compare x before incrementing it
844 - start at -1 and compare x after incrementing it
845
846 The latter is simpler and is what we use. The loop in this case
847 looks like:
848
849 C:
850 x = -1;
851 do
852 {
853 ...
854 x += STEP;
855 }
856 while (x < NITERS - STEP);
857
858 In both cases the loop limit is NITERS - STEP. */
859 gimple_seq seq = NULL;
860 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
861 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
862 if (seq)
863 {
864 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
865 gcc_assert (!new_bb);
866 }
867 if (niters_maybe_zero)
868 {
869 /* Case C. */
870 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
871 init = build_all_ones_cst (niters_type);
872 }
873 else
874 {
875 /* Case B. */
876 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
877 init = build_zero_cst (niters_type);
878 }
879 }
880
881 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
882 create_iv (init, step, NULL_TREE, loop,
883 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
884 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
885 true, NULL_TREE, true,
886 GSI_SAME_STMT);
887 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
888 true, GSI_SAME_STMT);
889
890 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
891 NULL_TREE);
892
893 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
894
895 /* Record the number of latch iterations. */
896 if (limit == niters)
897 /* Case A: the loop iterates NITERS times. Subtract one to get the
898 latch count. */
899 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
900 build_int_cst (niters_type, 1));
901 else
902 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
903 Subtract one from this to get the latch count. */
904 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
905 limit, step);
906
907 if (final_iv)
908 {
909 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
910 indx_after_incr, init);
911 gsi_insert_on_edge_immediate (single_exit (loop), assign);
912 }
913
914 return cond_stmt;
915 }
916
917 /* If we're using fully-masked loops, make LOOP iterate:
918
919 N == (NITERS - 1) / STEP + 1
920
921 times. When NITERS is zero, this is equivalent to making the loop
922 execute (1 << M) / STEP times, where M is the precision of NITERS.
923 NITERS_MAYBE_ZERO is true if this last case might occur.
924
925 If we're not using fully-masked loops, make LOOP iterate:
926
927 N == (NITERS - STEP) / STEP + 1
928
929 times, where NITERS is known to be outside the range [1, STEP - 1].
930 This is equivalent to making the loop execute NITERS / STEP times
931 when NITERS is nonzero and (1 << M) / STEP times otherwise.
932 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
933
934 If FINAL_IV is nonnull, it is an SSA name that should be set to
935 N * STEP on exit from the loop.
936
937 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
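/* To make the two formulas concrete (numbers chosen only as an example):
   with STEP == 4, a fully-masked loop with NITERS == 10 iterates
   (10 - 1) / 4 + 1 == 3 times, the last time under a partial mask, while a
   normal vector loop with NITERS == 8 iterates (8 - 4) / 4 + 1 == 2 times.  */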
938
939 void
940 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
941 tree niters, tree step, tree final_iv,
942 bool niters_maybe_zero)
943 {
944 gcond *cond_stmt;
945 gcond *orig_cond = get_loop_exit_condition (loop);
946 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
947
948 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
949 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
950 niters, final_iv,
951 niters_maybe_zero,
952 loop_cond_gsi);
953 else
954 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
955 niters_maybe_zero,
956 loop_cond_gsi);
957
958 /* Remove old loop exit test. */
959 stmt_vec_info orig_cond_info;
960 if (loop_vinfo
961 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
962 loop_vinfo->remove_stmt (orig_cond_info);
963 else
964 gsi_remove (&loop_cond_gsi, true);
965
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
968 cond_stmt);
969 }
970
971 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
972 For all PHI arguments in FROM->dest and TO->dest coming from those
973 edges, ensure that each TO->dest PHI argument has its current_def
974 set to the corresponding value from FROM. */
975
976 static void
977 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
978 {
979 gimple_stmt_iterator gsi_from, gsi_to;
980
981 for (gsi_from = gsi_start_phis (from->dest),
982 gsi_to = gsi_start_phis (to->dest);
983 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
984 {
985 gimple *from_phi = gsi_stmt (gsi_from);
986 gimple *to_phi = gsi_stmt (gsi_to);
987 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
988 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
989 if (virtual_operand_p (from_arg))
990 {
991 gsi_next (&gsi_from);
992 continue;
993 }
994 if (virtual_operand_p (to_arg))
995 {
996 gsi_next (&gsi_to);
997 continue;
998 }
999 if (TREE_CODE (from_arg) != SSA_NAME)
1000 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1001 else if (TREE_CODE (to_arg) == SSA_NAME
1002 && from_arg != to_arg)
1003 {
1004 if (get_current_def (to_arg) == NULL_TREE)
1005 {
1006 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1007 TREE_TYPE (get_current_def
1008 (from_arg))));
1009 set_current_def (to_arg, get_current_def (from_arg));
1010 }
1011 }
1012 gsi_next (&gsi_from);
1013 gsi_next (&gsi_to);
1014 }
1015
1016 gphi *from_phi = get_virtual_phi (from->dest);
1017 gphi *to_phi = get_virtual_phi (to->dest);
1018 if (from_phi)
1019 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1020 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1021 }
1022
1023
1024 /* Given LOOP this function generates a new copy of it and puts it
1025 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1026 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1027 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1028 entry or exit of LOOP. */
1029
1030 class loop *
1031 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1032 class loop *scalar_loop, edge e)
1033 {
1034 class loop *new_loop;
1035 basic_block *new_bbs, *bbs, *pbbs;
1036 bool at_exit;
1037 bool was_imm_dom;
1038 basic_block exit_dest;
1039 edge exit, new_exit;
1040 bool duplicate_outer_loop = false;
1041
1042 exit = single_exit (loop);
1043 at_exit = (e == exit);
1044 if (!at_exit && e != loop_preheader_edge (loop))
1045 return NULL;
1046
1047 if (scalar_loop == NULL)
1048 scalar_loop = loop;
1049
1050 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1051 pbbs = bbs + 1;
1052 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1053 /* Allow duplication of outer loops. */
1054 if (scalar_loop->inner)
1055 duplicate_outer_loop = true;
1056 /* Check whether duplication is possible. */
1057 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1058 {
1059 free (bbs);
1060 return NULL;
1061 }
1062
1063 /* Generate new loop structure. */
1064 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1065 duplicate_subloops (scalar_loop, new_loop);
1066
1067 exit_dest = exit->dest;
1068 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1069 exit_dest) == loop->header ?
1070 true : false);
1071
1072 /* Also copy the pre-header, this avoids jumping through hoops to
1073 duplicate the loop entry PHI arguments. Create an empty
1074 pre-header unconditionally for this. */
1075 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1076 edge entry_e = single_pred_edge (preheader);
1077 bbs[0] = preheader;
1078 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1079
1080 exit = single_exit (scalar_loop);
1081 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1082 &exit, 1, &new_exit, NULL,
1083 at_exit ? loop->latch : e->src, true);
1084 exit = single_exit (loop);
1085 basic_block new_preheader = new_bbs[0];
1086
1087 /* Before installing PHI arguments make sure that the edges
1088 into them match that of the scalar loop we analyzed. This
1089 makes sure the SLP tree matches up between the main vectorized
1090 loop and the epilogue vectorized copies. */
1091 if (single_succ_edge (preheader)->dest_idx
1092 != single_succ_edge (new_bbs[0])->dest_idx)
1093 {
1094 basic_block swap_bb = new_bbs[1];
1095 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1096 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1097 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1098 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1099 }
1100 if (duplicate_outer_loop)
1101 {
1102 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1103 if (loop_preheader_edge (scalar_loop)->dest_idx
1104 != loop_preheader_edge (new_inner_loop)->dest_idx)
1105 {
1106 basic_block swap_bb = new_inner_loop->header;
1107 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1108 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1109 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1110 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1111 }
1112 }
1113
1114 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1115
1116 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1117 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1118 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1119
1120 if (scalar_loop != loop)
1121 {
1122 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1123 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1124 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1125 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1126 header) to have current_def set, so copy them over. */
1127 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1128 exit);
1129 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1130 0),
1131 EDGE_SUCC (loop->latch, 0));
1132 }
1133
1134 if (at_exit) /* Add the loop copy at exit. */
1135 {
1136 if (scalar_loop != loop)
1137 {
1138 gphi_iterator gsi;
1139 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1140
1141 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1142 gsi_next (&gsi))
1143 {
1144 gphi *phi = gsi.phi ();
1145 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1146 location_t orig_locus
1147 = gimple_phi_arg_location_from_edge (phi, e);
1148
1149 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1150 }
1151 }
1152 redirect_edge_and_branch_force (e, new_preheader);
1153 flush_pending_stmts (e);
1154 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1155 if (was_imm_dom || duplicate_outer_loop)
1156 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1157
1158 /* And remove the unnecessary forwarder again. Keep the other
1159 one so we have a proper pre-header for the loop at the exit edge. */
1160 redirect_edge_pred (single_succ_edge (preheader),
1161 single_pred (preheader));
1162 delete_basic_block (preheader);
1163 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1164 loop_preheader_edge (scalar_loop)->src);
1165 }
1166 else /* Add the copy at entry. */
1167 {
1168 if (scalar_loop != loop)
1169 {
1170 /* Remove the unnecessary forwarder of scalar_loop again. */
1171 redirect_edge_pred (single_succ_edge (preheader),
1172 single_pred (preheader));
1173 delete_basic_block (preheader);
1174 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1175 loop_preheader_edge (scalar_loop)->src);
1176 preheader = split_edge (loop_preheader_edge (loop));
1177 entry_e = single_pred_edge (preheader);
1178 }
1179
1180 redirect_edge_and_branch_force (entry_e, new_preheader);
1181 flush_pending_stmts (entry_e);
1182 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1183
1184 redirect_edge_and_branch_force (new_exit, preheader);
1185 flush_pending_stmts (new_exit);
1186 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1187
1188 /* And remove the unnecessary forwarder again. Keep the other
1189 one so we have a proper pre-header for the loop at the exit edge. */
1190 redirect_edge_pred (single_succ_edge (new_preheader),
1191 single_pred (new_preheader));
1192 delete_basic_block (new_preheader);
1193 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1194 loop_preheader_edge (new_loop)->src);
1195 }
1196
1197 if (scalar_loop != loop)
1198 {
1199 /* Update new_loop->header PHIs, so that on the preheader
1200 edge they are the ones from loop rather than scalar_loop. */
1201 gphi_iterator gsi_orig, gsi_new;
1202 edge orig_e = loop_preheader_edge (loop);
1203 edge new_e = loop_preheader_edge (new_loop);
1204
1205 for (gsi_orig = gsi_start_phis (loop->header),
1206 gsi_new = gsi_start_phis (new_loop->header);
1207 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1208 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1209 {
1210 gphi *orig_phi = gsi_orig.phi ();
1211 gphi *new_phi = gsi_new.phi ();
1212 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1213 location_t orig_locus
1214 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1215
1216 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1217 }
1218 }
1219
1220 free (new_bbs);
1221 free (bbs);
1222
1223 checking_verify_dominators (CDI_DOMINATORS);
1224
1225 return new_loop;
1226 }
1227
1228
1229 /* Given the condition expression COND, put it as the last statement of
1230 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1231 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1232 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1233 new edge as irreducible if IRREDUCIBLE_P is true. */
1234
1235 static edge
1236 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1237 basic_block guard_to, basic_block dom_bb,
1238 profile_probability probability, bool irreducible_p)
1239 {
1240 gimple_stmt_iterator gsi;
1241 edge new_e, enter_e;
1242 gcond *cond_stmt;
1243 gimple_seq gimplify_stmt_list = NULL;
1244
1245 enter_e = EDGE_SUCC (guard_bb, 0);
1246 enter_e->flags &= ~EDGE_FALLTHRU;
1247 enter_e->flags |= EDGE_FALSE_VALUE;
1248 gsi = gsi_last_bb (guard_bb);
1249
1250 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1251 NULL_TREE);
1252 if (gimplify_stmt_list)
1253 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1254
1255 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1256 gsi = gsi_last_bb (guard_bb);
1257 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1258
1259 /* Add new edge to connect guard block to the merge/loop-exit block. */
1260 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1261
1262 new_e->probability = probability;
1263 if (irreducible_p)
1264 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1265
1266 enter_e->probability = probability.invert ();
1267 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1268
1269 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1270 if (enter_e->dest->loop_father->header == enter_e->dest)
1271 split_edge (enter_e);
1272
1273 return new_e;
1274 }
1275
1276
1277 /* This function verifies that the following restrictions apply to LOOP:
1278 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1279 for an innermost loop, or 5 basic blocks for an outer loop.
1280 (2) it is single entry, single exit
1281 (3) its exit condition is the last stmt in the header
1282 (4) E is the entry/exit edge of LOOP.
1283 */
1284
1285 bool
1286 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1287 {
1288 edge exit_e = single_exit (loop);
1289 edge entry_e = loop_preheader_edge (loop);
1290 gcond *orig_cond = get_loop_exit_condition (loop);
1291 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1292 unsigned int num_bb = loop->inner? 5 : 2;
1293
1294 /* All loops have an outer scope; the only case loop->outer is NULL is for
1295 the function itself. */
1296 if (!loop_outer (loop)
1297 || loop->num_nodes != num_bb
1298 || !empty_block_p (loop->latch)
1299 || !single_exit (loop)
1300 /* Verify that new loop exit condition can be trivially modified. */
1301 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1302 || (e != exit_e && e != entry_e))
1303 return false;
1304
1305 return true;
1306 }
1307
1308 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1309 in the exit bb and rename all the uses after the loop. This simplifies
1310 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1311 (but normally loop closed SSA form doesn't require virtual PHIs to be
1312 in the same form). Doing this early simplifies checking which
1313 uses should be renamed.
1314
1315 If we create a new phi after the loop, return the definition that
1316 applies on entry to the loop, otherwise return null. */
1317
1318 static tree
1319 create_lcssa_for_virtual_phi (class loop *loop)
1320 {
1321 gphi_iterator gsi;
1322 edge exit_e = single_exit (loop);
1323
1324 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1325 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1326 {
1327 gphi *phi = gsi.phi ();
1328 for (gsi = gsi_start_phis (exit_e->dest);
1329 !gsi_end_p (gsi); gsi_next (&gsi))
1330 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1331 break;
1332 if (gsi_end_p (gsi))
1333 {
1334 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1335 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1336 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1337 imm_use_iterator imm_iter;
1338 gimple *stmt;
1339 use_operand_p use_p;
1340
1341 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1342 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1343 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1344 gimple_phi_set_result (new_phi, new_vop);
1345 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1346 if (stmt != new_phi
1347 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1348 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1349 SET_USE (use_p, new_vop);
1350
1351 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1352 }
1353 break;
1354 }
1355 return NULL_TREE;
1356 }
1357
1358 /* Function vect_get_loop_location.
1359
1360 Extract the location of the loop in the source code.
1361 If the loop is not well formed for vectorization, an estimated
1362 location is calculated.
1363 Return the loop location if successful and NULL if not. */
1364
1365 dump_user_location_t
1366 find_loop_location (class loop *loop)
1367 {
1368 gimple *stmt = NULL;
1369 basic_block bb;
1370 gimple_stmt_iterator si;
1371
1372 if (!loop)
1373 return dump_user_location_t ();
1374
1375 stmt = get_loop_exit_condition (loop);
1376
1377 if (stmt
1378 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1379 return stmt;
1380
1381 /* If we got here the loop is probably not "well formed",
1382 try to estimate the loop location. */
1383
1384 if (!loop->header)
1385 return dump_user_location_t ();
1386
1387 bb = loop->header;
1388
1389 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1390 {
1391 stmt = gsi_stmt (si);
1392 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1393 return stmt;
1394 }
1395
1396 return dump_user_location_t ();
1397 }
1398
1399 /* Return true if the phi described by STMT_INFO defines an IV of the
1400 loop to be vectorized. */
1401
1402 static bool
1403 iv_phi_p (stmt_vec_info stmt_info)
1404 {
1405 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1406 if (virtual_operand_p (PHI_RESULT (phi)))
1407 return false;
1408
1409 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1410 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1411 return false;
1412
1413 return true;
1414 }
1415
1416 /* Function vect_can_advance_ivs_p
1417
1418 In case the number of iterations that LOOP iterates is unknown at compile
1419 time, an epilog loop will be generated, and the loop induction variables
1420 (IVs) will be "advanced" to the value they are supposed to take just before
1421 the epilog loop. Here we check that the access function of the loop IVs
1422 and the expression that represents the loop bound are simple enough.
1423 These restrictions will be relaxed in the future. */
1424
1425 bool
1426 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1427 {
1428 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1429 basic_block bb = loop->header;
1430 gphi_iterator gsi;
1431
1432 /* Analyze phi functions of the loop header. */
1433
1434 if (dump_enabled_p ())
1435 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1436 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1437 {
1438 tree evolution_part;
1439
1440 gphi *phi = gsi.phi ();
1441 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1442 if (dump_enabled_p ())
1443 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1444 phi_info->stmt);
1445
1446 /* Skip virtual phis. The data dependences that are associated with
1447 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1448
1449 Skip reduction phis. */
1450 if (!iv_phi_p (phi_info))
1451 {
1452 if (dump_enabled_p ())
1453 dump_printf_loc (MSG_NOTE, vect_location,
1454 "reduc or virtual phi. skip.\n");
1455 continue;
1456 }
1457
1458 /* Analyze the evolution function. */
1459
1460 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1461 if (evolution_part == NULL_TREE)
1462 {
1463 if (dump_enabled_p ())
1464 dump_printf (MSG_MISSED_OPTIMIZATION,
1465 "No access function or evolution.\n");
1466 return false;
1467 }
1468
1469 /* FORNOW: We do not transform initial conditions of IVs
1470 whose evolution functions are not invariant in the loop. */
1471
1472 if (!expr_invariant_in_loop_p (loop, evolution_part))
1473 {
1474 if (dump_enabled_p ())
1475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1476 "evolution not invariant in loop.\n");
1477 return false;
1478 }
1479
1480 /* FORNOW: We do not transform initial conditions of IVs
1481 whose evolution functions are polynomials of degree >= 2. */
1482
1483 if (tree_is_chrec (evolution_part))
1484 {
1485 if (dump_enabled_p ())
1486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1487 "evolution is chrec.\n");
1488 return false;
1489 }
1490 }
1491
1492 return true;
1493 }
1494
1495
1496 /* Function vect_update_ivs_after_vectorizer.
1497
1498 "Advance" the induction variables of LOOP to the value they should take
1499 after the execution of LOOP. This is currently necessary because the
1500 vectorizer does not handle induction variables that are used after the
1501 loop. Such a situation occurs when the last iterations of LOOP are
1502 peeled, because:
1503 1. We introduced new uses after LOOP for IVs that were not originally used
1504 after LOOP: the IVs of LOOP are now used by an epilog loop.
1505 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1506 times, whereas the loop IVs should be bumped N times.
1507
1508 Input:
1509 - LOOP - a loop that is going to be vectorized. The last few iterations
1510 of LOOP were peeled.
1511 - NITERS - the number of iterations that LOOP executes (before it is
1512 vectorized). i.e, the number of times the ivs should be bumped.
1513 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1514 coming out from LOOP on which there are uses of the LOOP ivs
1515 (this is the path from LOOP->exit to epilog_loop->preheader).
1516
1517 The new definitions of the ivs are placed in LOOP->exit.
1518 The phi args associated with the edge UPDATE_E in the bb
1519 UPDATE_E->dest are updated accordingly.
1520
1521 Assumption 1: Like the rest of the vectorizer, this function assumes
1522 a single loop exit that has a single predecessor.
1523
1524 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1525 organized in the same order.
1526
1527 Assumption 3: The access function of the ivs is simple enough (see
1528 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1529
1530 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1531 coming out of LOOP on which the ivs of LOOP are used (this is the path
1532 that leads to the epilog loop; other paths skip the epilog loop). This
1533 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1534 needs to have its phis updated.
1535 */
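/* For example (values purely illustrative): an IV with initial value 0 and
   step 4, in a loop for which NITERS == 20, is given the exit value
   0 + 20 * 4 == 80 by the code below.  */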
1536
1537 static void
1538 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1539 tree niters, edge update_e)
1540 {
1541 gphi_iterator gsi, gsi1;
1542 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1543 basic_block update_bb = update_e->dest;
1544 basic_block exit_bb = single_exit (loop)->dest;
1545
1546 /* Make sure there exists a single-predecessor exit bb: */
1547 gcc_assert (single_pred_p (exit_bb));
1548 gcc_assert (single_succ_edge (exit_bb) == update_e);
1549
1550 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1551 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1552 gsi_next (&gsi), gsi_next (&gsi1))
1553 {
1554 tree init_expr;
1555 tree step_expr, off;
1556 tree type;
1557 tree var, ni, ni_name;
1558 gimple_stmt_iterator last_gsi;
1559
1560 gphi *phi = gsi.phi ();
1561 gphi *phi1 = gsi1.phi ();
1562 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1563 if (dump_enabled_p ())
1564 dump_printf_loc (MSG_NOTE, vect_location,
1565 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1566
1567 /* Skip reduction and virtual phis. */
1568 if (!iv_phi_p (phi_info))
1569 {
1570 if (dump_enabled_p ())
1571 dump_printf_loc (MSG_NOTE, vect_location,
1572 "reduc or virtual phi. skip.\n");
1573 continue;
1574 }
1575
1576 type = TREE_TYPE (gimple_phi_result (phi));
1577 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1578 step_expr = unshare_expr (step_expr);
1579
1580 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1581 of degree >= 2 or exponential. */
1582 gcc_assert (!tree_is_chrec (step_expr));
1583
1584 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1585
1586 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1587 fold_convert (TREE_TYPE (step_expr), niters),
1588 step_expr);
1589 if (POINTER_TYPE_P (type))
1590 ni = fold_build_pointer_plus (init_expr, off);
1591 else
1592 ni = fold_build2 (PLUS_EXPR, type,
1593 init_expr, fold_convert (type, off));
1594
1595 var = create_tmp_var (type, "tmp");
1596
1597 last_gsi = gsi_last_bb (exit_bb);
1598 gimple_seq new_stmts = NULL;
1599 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1600 /* Exit_bb shouldn't be empty. */
1601 if (!gsi_end_p (last_gsi))
1602 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1603 else
1604 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1605
1606 /* Fix phi expressions in the successor bb. */
1607 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1608 }
1609 }
1610
1611 /* Return a gimple value containing the misalignment (measured in vector
1612 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1613 it is away from a perfectly aligned address. Add any new statements
1614 to SEQ. */
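/* For example (illustrative values only): with a target alignment of 16
   bytes, an element size of 4 bytes and a start address ending in ...8,
   misalign_in_bytes is (addr & 15) == 8 and the returned value is
   8 >> log2 (4) == 2 elements.  */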
1615
1616 static tree
1617 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1618 {
1619 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1620 stmt_vec_info stmt_info = dr_info->stmt;
1621 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1622
1623 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1624 unsigned HOST_WIDE_INT target_align_c;
1625 tree target_align_minus_1;
1626
1627 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1628 size_zero_node) < 0;
1629 tree offset = (negative
1630 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1631 : size_zero_node);
1632 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1633 stmt_info, seq,
1634 offset);
1635 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1636 if (target_align.is_constant (&target_align_c))
1637 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1638 else
1639 {
1640 tree vla = build_int_cst (type, target_align);
1641 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1642 fold_build2 (MINUS_EXPR, type,
1643 build_int_cst (type, 0), vla));
1644 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1645 build_int_cst (type, 1));
1646 }
1647
1648 HOST_WIDE_INT elem_size
1649 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1650 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1651
1652 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1653 tree int_start_addr = fold_convert (type, start_addr);
1654 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1655 target_align_minus_1);
1656
1657 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1658 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1659 elem_size_log);
1660
1661 return misalign_in_elems;
1662 }
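/* Worked example (illustrative only, the values are assumptions): for a
   4-byte element type with a target alignment of 16 bytes and a start
   address whose low bits are 0x8, the computation above yields
     misalign_in_bytes = addr & (16 - 1) = 8
     misalign_in_elems = 8 >> log2 (4)   = 2
   i.e. the first access is two elements past an aligned boundary. */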
1663
1664 /* Function vect_gen_prolog_loop_niters
1665
1666 Generate the number of iterations which should be peeled as prolog for the
1667 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1668 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1669 As a result, after the execution of this loop, the data reference DR will
1670 refer to an aligned location. The following computation is generated:
1671
1672 If the misalignment of DR is known at compile time:
1673 addr_mis = DR_MISALIGNMENT (dr);
1674 Else, compute address misalignment in bytes:
1675 addr_mis = addr & (target_align - 1)
1676
1677 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1678
1679 (elem_size = element type size; an element is the scalar element whose type
1680 is the inner type of the vectype)
1681
1682 The computations will be emitted at the end of BB. We also compute and
1683 store the upper bound (inclusive) of the result in BOUND.
1684
1685 When the step of the data-ref in the loop is not 1 (as in interleaved data
1686 and SLP), the number of iterations of the prolog must be divided by the step
1687 (which is equal to the size of the interleaved group).
1688
1689 The above formulas assume that VF == number of elements in the vector. This
1690 may not hold when there are multiple data types in the loop.
1691 In this case, for some data-references in the loop the VF does not represent
1692 the number of elements that fit in the vector. Therefore, instead of VF we
1693 use TYPE_VECTOR_SUBPARTS. */
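/* Worked example (illustrative only, the values are assumptions): with 4-byte
   elements, a 16-byte target alignment (so align_in_elems = 4), a positive
   step of one element and a runtime misalignment of 2 elements, the code
   below computes
     prolog_niters = (4 - 2) & (4 - 1) = 2
   i.e. two scalar iterations are peeled, after which the data reference is
   aligned; *BOUND is set to align_in_elems - 1 = 3. */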
1694
1695 static tree
1696 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1697 basic_block bb, int *bound)
1698 {
1699 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1700 tree var;
1701 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1702 gimple_seq stmts = NULL, new_stmts = NULL;
1703 tree iters, iters_name;
1704 stmt_vec_info stmt_info = dr_info->stmt;
1705 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1706 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1707
1708 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1709 {
1710 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1711
1712 if (dump_enabled_p ())
1713 dump_printf_loc (MSG_NOTE, vect_location,
1714 "known peeling = %d.\n", npeel);
1715
1716 iters = build_int_cst (niters_type, npeel);
1717 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1718 }
1719 else
1720 {
1721 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1722 tree type = TREE_TYPE (misalign_in_elems);
1723 HOST_WIDE_INT elem_size
1724 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1725 /* We only do prolog peeling if the target alignment is known at compile
1726 time. */
1727 poly_uint64 align_in_elems =
1728 exact_div (target_align, elem_size);
1729 tree align_in_elems_minus_1 =
1730 build_int_cst (type, align_in_elems - 1);
1731 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1732
1733 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1734 & (align_in_elems - 1)). */
1735 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1736 size_zero_node) < 0;
1737 if (negative)
1738 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1739 align_in_elems_tree);
1740 else
1741 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1742 misalign_in_elems);
1743 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1744 iters = fold_convert (niters_type, iters);
1745 unsigned HOST_WIDE_INT align_in_elems_c;
1746 if (align_in_elems.is_constant (&align_in_elems_c))
1747 *bound = align_in_elems_c - 1;
1748 else
1749 *bound = -1;
1750 }
1751
1752 if (dump_enabled_p ())
1753 dump_printf_loc (MSG_NOTE, vect_location,
1754 "niters for prolog loop: %T\n", iters);
1755
1756 var = create_tmp_var (niters_type, "prolog_loop_niters");
1757 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1758
1759 if (new_stmts)
1760 gimple_seq_add_seq (&stmts, new_stmts);
1761 if (stmts)
1762 {
1763 gcc_assert (single_succ_p (bb));
1764 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1765 if (gsi_end_p (gsi))
1766 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1767 else
1768 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1769 }
1770 return iters_name;
1771 }
1772
1773
1774 /* Function vect_update_init_of_dr
1775
1776 If CODE is PLUS, the vector loop starts NITERS iterations after the
1777 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1778 iterations before the scalar one (using masking to skip inactive
1779 elements). This function updates the information recorded in DR to
1780 account for the difference. Specifically, it updates the OFFSET
1781 field of DR_INFO. */
1782
1783 static void
1784 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1785 {
1786 struct data_reference *dr = dr_info->dr;
1787 tree offset = dr_info->offset;
1788 if (!offset)
1789 offset = build_zero_cst (sizetype);
1790
1791 niters = fold_build2 (MULT_EXPR, sizetype,
1792 fold_convert (sizetype, niters),
1793 fold_convert (sizetype, DR_STEP (dr)));
1794 offset = fold_build2 (code, sizetype,
1795 fold_convert (sizetype, offset), niters);
1796 dr_info->offset = offset;
1797 }
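/* Illustrative sketch (hypothetical values): for a data reference with a
   DR_STEP of 4 bytes and NITERS = 2 peeled prolog iterations, a PLUS update
   adds 2 * 4 = 8 bytes to DR_INFO's offset, so the vector loop's first access
   starts 8 bytes after the scalar loop's; a MINUS update (used for peeling by
   masking) subtracts the same amount instead. */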
1798
1799
1800 /* Function vect_update_inits_of_drs
1801
1802 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1803 CODE and NITERS are as for vect_update_init_of_dr. */
1804
1805 void
1806 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1807 tree_code code)
1808 {
1809 unsigned int i;
1810 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1811 struct data_reference *dr;
1812
1813 DUMP_VECT_SCOPE ("vect_update_inits_of_drs");
1814
1815 /* Adjust NITERS to sizetype. We used to insert the stmts on the loop
1816 preheader here, but since these niters might also be used to update the
1817 epilogue's niters and data references, we can't insert them here, as the
1818 definition might not always dominate its uses. */
1819 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1820 niters = fold_convert (sizetype, niters);
1821
1822 FOR_EACH_VEC_ELT (datarefs, i, dr)
1823 {
1824 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1825 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1826 vect_update_init_of_dr (dr_info, niters, code);
1827 }
1828 }
1829
1830 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1831 by masking. This involves calculating the number of iterations to
1832 be peeled and then aligning all memory references appropriately. */
1833
1834 void
1835 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1836 {
1837 tree misalign_in_elems;
1838 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1839
1840 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1841
1842 /* From the information recorded in LOOP_VINFO get the number of iterations
1843 that need to be skipped via masking. */
1844 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1845 {
1846 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1847 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1848 misalign_in_elems = build_int_cst (type, misalign);
1849 }
1850 else
1851 {
1852 gimple_seq seq1 = NULL, seq2 = NULL;
1853 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1854 misalign_in_elems = fold_convert (type, misalign_in_elems);
1855 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1856 &seq2, true, NULL_TREE);
1857 gimple_seq_add_seq (&seq1, seq2);
1858 if (seq1)
1859 {
1860 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1861 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1862 gcc_assert (!new_bb);
1863 }
1864 }
1865
1866 if (dump_enabled_p ())
1867 dump_printf_loc (MSG_NOTE, vect_location,
1868 "misalignment for fully-masked loop: %T\n",
1869 misalign_in_elems);
1870
1871 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1872
1873 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1874 }
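/* Illustrative sketch (hypothetical values): with VF = 8 and
   LOOP_VINFO_PEELING_FOR_ALIGNMENT = 3, the code above records 8 - 3 = 5 in
   LOOP_VINFO_MASK_SKIP_NITERS and shifts every data reference back by 5
   iterations' worth of bytes, so the first vector iteration starts 5 elements
   early with its first 5 lanes inactive. */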
1875
1876 /* This function builds ni_name = number of iterations. Statements
1877 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1878 it to TRUE if a new ssa_var is generated. */
1879
1880 tree
1881 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1882 {
1883 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1884 if (TREE_CODE (ni) == INTEGER_CST)
1885 return ni;
1886 else
1887 {
1888 tree ni_name, var;
1889 gimple_seq stmts = NULL;
1890 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1891
1892 var = create_tmp_var (TREE_TYPE (ni), "niters");
1893 ni_name = force_gimple_operand (ni, &stmts, false, var);
1894 if (stmts)
1895 {
1896 gsi_insert_seq_on_edge_immediate (pe, stmts);
1897 if (new_var_p != NULL)
1898 *new_var_p = true;
1899 }
1900
1901 return ni_name;
1902 }
1903 }
1904
1905 /* Calculate the number of iterations above which the vectorized loop will be
1906 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1907 of the prolog loop. If it is an integer constant, its value is also passed
1908 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1909 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1910 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1911 threshold below which the scalar (rather than vectorized) loop will be
1912 executed. This function stores the upper bound (inclusive) of the result
1913 in BOUND_SCALAR. */
1914
1915 static tree
1916 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1917 int bound_prolog, poly_int64 bound_epilog, int th,
1918 poly_uint64 *bound_scalar,
1919 bool check_profitability)
1920 {
1921 tree type = TREE_TYPE (niters_prolog);
1922 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1923 build_int_cst (type, bound_epilog));
1924
1925 *bound_scalar = bound_prolog + bound_epilog;
1926 if (check_profitability)
1927 {
1928 /* TH indicates the minimum niters of the vectorized loop, while we
1929 compute the maximum niters of the scalar loop. */
1930 th--;
1931 /* Peeling a constant number of times. */
1932 if (int_niters_prolog >= 0)
1933 {
1934 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1935 return build_int_cst (type, *bound_scalar);
1936 }
1937 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1938 and BOUND_EPILOG are inclusive upper bounds. */
1939 if (known_ge (th, bound_prolog + bound_epilog))
1940 {
1941 *bound_scalar = th;
1942 return build_int_cst (type, th);
1943 }
1944 /* Need to do runtime comparison. */
1945 else if (maybe_gt (th, bound_epilog))
1946 {
1947 *bound_scalar = upper_bound (*bound_scalar, th);
1948 return fold_build2 (MAX_EXPR, type,
1949 build_int_cst (type, th), niters);
1950 }
1951 }
1952 return niters;
1953 }
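/* Illustrative sketch (hypothetical values): with CHECK_PROFITABILITY,
   TH = 8, an unknown NITERS_PROLOG (INT_NITERS_PROLOG == -1),
   BOUND_PROLOG = 3 and BOUND_EPILOG = 4, TH is first decremented to 7;
   since known_ge (7, 3 + 4) holds, *BOUND_SCALAR is set to 7 and the
   constant 7 is returned as the scalar loop's iteration count. */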
1954
1955 /* NITERS is the number of times that the original scalar loop executes
1956 after peeling. Work out the maximum number of iterations N that can
1957 be handled by the vectorized form of the loop and then either:
1958
1959 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1960
1961 niters_vector = N
1962
1963 b) set *STEP_VECTOR_PTR to one and generate:
1964
1965 niters_vector = N / vf
1966
1967 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1968 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1969 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1970
1971 void
1972 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1973 tree *niters_vector_ptr, tree *step_vector_ptr,
1974 bool niters_no_overflow)
1975 {
1976 tree ni_minus_gap, var;
1977 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1978 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1980 tree log_vf = NULL_TREE;
1981
1982 /* If epilogue loop is required because of data accesses with gaps, we
1983 subtract one iteration from the total number of iterations here for
1984 correct calculation of RATIO. */
1985 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1986 {
1987 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1988 build_one_cst (type));
1989 if (!is_gimple_val (ni_minus_gap))
1990 {
1991 var = create_tmp_var (type, "ni_gap");
1992 gimple *stmts = NULL;
1993 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1994 true, var);
1995 gsi_insert_seq_on_edge_immediate (pe, stmts);
1996 }
1997 }
1998 else
1999 ni_minus_gap = niters;
2000
2001 unsigned HOST_WIDE_INT const_vf;
2002 if (vf.is_constant (&const_vf)
2003 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2004 {
2005 /* Create: niters >> log2(vf) */
2006 /* If it's known that niters == number of latch executions + 1 doesn't
2007 overflow, we can generate niters >> log2(vf); otherwise we generate
2008 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2009 will be at least one. */
2010 log_vf = build_int_cst (type, exact_log2 (const_vf));
2011 if (niters_no_overflow)
2012 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2013 else
2014 niters_vector
2015 = fold_build2 (PLUS_EXPR, type,
2016 fold_build2 (RSHIFT_EXPR, type,
2017 fold_build2 (MINUS_EXPR, type,
2018 ni_minus_gap,
2019 build_int_cst (type, vf)),
2020 log_vf),
2021 build_int_cst (type, 1));
2022 step_vector = build_one_cst (type);
2023 }
2024 else
2025 {
2026 niters_vector = ni_minus_gap;
2027 step_vector = build_int_cst (type, vf);
2028 }
2029
2030 if (!is_gimple_val (niters_vector))
2031 {
2032 var = create_tmp_var (type, "bnd");
2033 gimple_seq stmts = NULL;
2034 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2035 gsi_insert_seq_on_edge_immediate (pe, stmts);
2036 /* The peeling algorithm guarantees that the vector loop bound is at least
2037 ONE, so we set range information to make the niters analyzer's life easier.
2038 Note the latch iteration count can be TYPE_MAX_VALUE, so we have to
2039 represent the vector niter as (TYPE_MAX_VALUE + 1) >> log_vf. */
2040 if (stmts != NULL && log_vf)
2041 {
2042 if (niters_no_overflow)
2043 set_range_info (niters_vector, VR_RANGE,
2044 wi::one (TYPE_PRECISION (type)),
2045 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2046 TYPE_SIGN (type)),
2047 exact_log2 (const_vf),
2048 TYPE_SIGN (type)));
2049 /* For VF == 1 the vector IV might also overflow so we cannot
2050 assert a minimum value of 1. */
2051 else if (const_vf > 1)
2052 set_range_info (niters_vector, VR_RANGE,
2053 wi::one (TYPE_PRECISION (type)),
2054 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2055 TYPE_SIGN (type))
2056 - (const_vf - 1),
2057 exact_log2 (const_vf), TYPE_SIGN (type))
2058 + 1);
2059 }
2060 }
2061 *niters_vector_ptr = niters_vector;
2062 *step_vector_ptr = step_vector;
2063
2064 return;
2065 }
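/* Illustrative sketch (hypothetical values): for NITERS = 17, a constant
   VF = 4, no peeling for gaps and no partial vectors, the function sets
   *STEP_VECTOR_PTR to 1 and computes either
     17 >> 2 = 4                (NITERS_NO_OVERFLOW), or
     ((17 - 4) >> 2) + 1 = 4    (otherwise),
   so the vector loop runs 4 iterations covering 16 scalar iterations and
   the remaining iteration is left for the epilogue. */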
2066
2067 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2068 loop specified by LOOP_VINFO after vectorization, compute the number
2069 of iterations before vectorization (niters_vector * vf) and store it
2070 to NITERS_VECTOR_MULT_VF_PTR. */
2071
2072 static void
2073 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2074 tree niters_vector,
2075 tree *niters_vector_mult_vf_ptr)
2076 {
2077 /* We should be using a step_vector of VF if VF is variable. */
2078 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2079 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2080 tree type = TREE_TYPE (niters_vector);
2081 tree log_vf = build_int_cst (type, exact_log2 (vf));
2082 basic_block exit_bb = single_exit (loop)->dest;
2083
2084 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2085 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2086 niters_vector, log_vf);
2087 if (!is_gimple_val (niters_vector_mult_vf))
2088 {
2089 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2090 gimple_seq stmts = NULL;
2091 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2092 &stmts, true, var);
2093 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2094 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2095 }
2096 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2097 }
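/* Illustrative sketch (hypothetical values): continuing the example above,
   NITERS_VECTOR = 4 and VF = 4 give
     niters_vector_mult_vf = 4 << log2 (4) = 16
   scalar iterations handled by the vector loop. */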
2098
2099 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2100 This function searches for the corresponding lcssa phi node in the exit
2101 bb of LOOP. If it is found, return the phi result; otherwise return
2102 NULL. */
2103
2104 static tree
2105 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2106 gphi *lcssa_phi)
2107 {
2108 gphi_iterator gsi;
2109 edge e = single_exit (loop);
2110
2111 gcc_assert (single_pred_p (e->dest));
2112 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2113 {
2114 gphi *phi = gsi.phi ();
2115 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2116 PHI_ARG_DEF (lcssa_phi, 0), 0))
2117 return PHI_RESULT (phi);
2118 }
2119 return NULL_TREE;
2120 }
2121
2122 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2123 from SECOND/FIRST and puts it at the original loop's preheader/exit
2124 edge; the two loops are arranged as below:
2125
2126 preheader_a:
2127 first_loop:
2128 header_a:
2129 i_1 = PHI<i_0, i_2>;
2130 ...
2131 i_2 = i_1 + 1;
2132 if (cond_a)
2133 goto latch_a;
2134 else
2135 goto between_bb;
2136 latch_a:
2137 goto header_a;
2138
2139 between_bb:
2140 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2141
2142 second_loop:
2143 header_b:
2144 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2145 or with i_2 if no LCSSA phi is created
2146 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2147 ...
2148 i_4 = i_3 + 1;
2149 if (cond_b)
2150 goto latch_b;
2151 else
2152 goto exit_bb;
2153 latch_b:
2154 goto header_b;
2155
2156 exit_bb:
2157
2158 This function creates loop closed SSA for the first loop, then updates the
2159 second loop's PHI nodes by replacing the argument on its incoming edge with
2160 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2161 is false, loop closed ssa phis will only be created for non-iv phis of
2162 the first loop.
2163
2164 This function assumes the exit bb of the first loop is the preheader bb of
2165 the second loop, i.e., between_bb in the example code. With PHIs updated,
2166 the second loop will execute the remaining iterations of the first. */
2167
2168 static void
2169 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2170 class loop *first, class loop *second,
2171 bool create_lcssa_for_iv_phis)
2172 {
2173 gphi_iterator gsi_update, gsi_orig;
2174 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2175
2176 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2177 edge second_preheader_e = loop_preheader_edge (second);
2178 basic_block between_bb = single_exit (first)->dest;
2179
2180 gcc_assert (between_bb == second_preheader_e->src);
2181 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2182 /* Either the first loop or the second is the loop to be vectorized. */
2183 gcc_assert (loop == first || loop == second);
2184
2185 for (gsi_orig = gsi_start_phis (first->header),
2186 gsi_update = gsi_start_phis (second->header);
2187 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2188 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2189 {
2190 gphi *orig_phi = gsi_orig.phi ();
2191 gphi *update_phi = gsi_update.phi ();
2192
2193 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2194 /* Generate lcssa PHI node for the first loop. */
2195 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2196 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2197 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2198 {
2199 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2200 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2201 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2202 arg = new_res;
2203 }
2204
2205 /* Update PHI node in the second loop by replacing arg on the loop's
2206 incoming edge. */
2207 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2208 }
2209
2210 /* For epilogue peeling we have to make sure to copy all LC PHIs
2211 for correct vectorization of live stmts. */
2212 if (loop == first)
2213 {
2214 basic_block orig_exit = single_exit (second)->dest;
2215 for (gsi_orig = gsi_start_phis (orig_exit);
2216 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2217 {
2218 gphi *orig_phi = gsi_orig.phi ();
2219 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2220 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2221 continue;
2222
2223 /* Already created in the above loop. */
2224 if (find_guard_arg (first, second, orig_phi))
2225 continue;
2226
2227 tree new_res = copy_ssa_name (orig_arg);
2228 gphi *lcphi = create_phi_node (new_res, between_bb);
2229 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2230 }
2231 }
2232 }
2233
2234 /* Function slpeel_add_loop_guard adds a guard skipping from the beginning
2235 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2236 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2237 appear like below:
2238
2239 guard_bb:
2240 if (cond)
2241 goto merge_bb;
2242 else
2243 goto skip_loop;
2244
2245 skip_loop:
2246 header_a:
2247 i_1 = PHI<i_0, i_2>;
2248 ...
2249 i_2 = i_1 + 1;
2250 if (cond_a)
2251 goto latch_a;
2252 else
2253 goto exit_a;
2254 latch_a:
2255 goto header_a;
2256
2257 exit_a:
2258 i_5 = PHI<i_2>;
2259
2260 merge_bb:
2261 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2262
2263 update_loop:
2264 header_b:
2265 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2266 ...
2267 i_4 = i_3 + 1;
2268 if (cond_b)
2269 goto latch_b;
2270 else
2271 goto exit_bb;
2272 latch_b:
2273 goto header_b;
2274
2275 exit_bb:
2276
2277 This function creates PHI nodes at merge_bb and replaces the use of i_5
2278 in the update_loop's PHI node with the new PHI result. */
2279
2280 static void
2281 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2282 class loop *update_loop,
2283 edge guard_edge, edge merge_edge)
2284 {
2285 location_t merge_loc, guard_loc;
2286 edge orig_e = loop_preheader_edge (skip_loop);
2287 edge update_e = loop_preheader_edge (update_loop);
2288 gphi_iterator gsi_orig, gsi_update;
2289
2290 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2291 gsi_update = gsi_start_phis (update_loop->header));
2292 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2293 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2294 {
2295 gphi *orig_phi = gsi_orig.phi ();
2296 gphi *update_phi = gsi_update.phi ();
2297
2298 /* Generate new phi node at merge bb of the guard. */
2299 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2300 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2301
2302 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2303 args in NEW_PHI for these edges. */
2304 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2305 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2306 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2307 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2308 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2309 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2310
2311 /* Update phi in UPDATE_PHI. */
2312 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2313 }
2314 }
2315
2316 /* LOOP and EPILOG are two consecutive loops in the CFG, and EPILOG is copied
2317 from LOOP. Function slpeel_add_loop_guard adds a guard skipping from a
2318 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2319 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2320 The CFG looks like:
2321
2322 loop:
2323 header_a:
2324 i_1 = PHI<i_0, i_2>;
2325 ...
2326 i_2 = i_1 + 1;
2327 if (cond_a)
2328 goto latch_a;
2329 else
2330 goto exit_a;
2331 latch_a:
2332 goto header_a;
2333
2334 exit_a:
2335
2336 guard_bb:
2337 if (cond)
2338 goto merge_bb;
2339 else
2340 goto epilog_loop;
2341
2342 ;; fall_through_bb
2343
2344 epilog_loop:
2345 header_b:
2346 i_3 = PHI<i_2, i_4>;
2347 ...
2348 i_4 = i_3 + 1;
2349 if (cond_b)
2350 goto latch_b;
2351 else
2352 goto merge_bb;
2353 latch_b:
2354 goto header_b;
2355
2356 merge_bb:
2357 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2358
2359 exit_bb:
2360 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2361
2362 For each name used outside EPILOG (i.e. for each name that has an lcssa
2363 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2364 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2365 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2366 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2367 in exit_bb will also be updated. */
2368
2369 static void
2370 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2371 edge guard_edge, edge merge_edge)
2372 {
2373 gphi_iterator gsi;
2374 basic_block merge_bb = guard_edge->dest;
2375
2376 gcc_assert (single_succ_p (merge_bb));
2377 edge e = single_succ_edge (merge_bb);
2378 basic_block exit_bb = e->dest;
2379 gcc_assert (single_pred_p (exit_bb));
2380 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2381
2382 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2383 {
2384 gphi *update_phi = gsi.phi ();
2385 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2386
2387 tree merge_arg = NULL_TREE;
2388
2389 /* If the old argument is a SSA_NAME use its current_def. */
2390 if (TREE_CODE (old_arg) == SSA_NAME)
2391 merge_arg = get_current_def (old_arg);
2392 /* If it's a constant or doesn't have a current_def, just use the old
2393 argument. */
2394 if (!merge_arg)
2395 merge_arg = old_arg;
2396
2397 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2398 /* If the var is live after loop but not a reduction, we simply
2399 use the old arg. */
2400 if (!guard_arg)
2401 guard_arg = old_arg;
2402
2403 /* Create new phi node in MERGE_BB: */
2404 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2405 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2406
2407 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set
2408 the two PHI args in merge_phi for these edges. */
2409 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2410 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2411
2412 /* Update the original phi in exit_bb. */
2413 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2414 }
2415 }
2416
2417 /* EPILOG loop is duplicated from the original loop for vectorizing,
2418 the arg of its loop closed ssa PHI needs to be updated. */
2419
2420 static void
2421 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2422 {
2423 gphi_iterator gsi;
2424 basic_block exit_bb = single_exit (epilog)->dest;
2425
2426 gcc_assert (single_pred_p (exit_bb));
2427 edge e = EDGE_PRED (exit_bb, 0);
2428 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2429 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2430 }
2431
2432 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2433 iterate exactly CONST_NITERS times. Make a final decision about
2434 whether the epilogue loop should be used, returning true if so. */
2435
2436 static bool
2437 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2438 unsigned HOST_WIDE_INT const_niters)
2439 {
2440 /* Avoid wrap-around when computing const_niters - 1. Also reject
2441 using an epilogue loop for a single scalar iteration, even if
2442 we could in principle implement that using partial vectors. */
2443 unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2444 if (const_niters <= gap_niters + 1)
2445 return false;
2446
2447 /* Install the number of iterations. */
2448 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2449 tree niters_tree = build_int_cst (niters_type, const_niters);
2450 tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2451
2452 LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2453 LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2454
2455 /* Decide what to do if the number of epilogue iterations is not
2456 a multiple of the epilogue loop's vectorization factor. */
2457 return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2458 }
2459
2460 /* Function vect_do_peeling.
2461
2462 Input:
2463 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2464
2465 preheader:
2466 LOOP:
2467 header_bb:
2468 loop_body
2469 if (exit_loop_cond) goto exit_bb
2470 else goto header_bb
2471 exit_bb:
2472
2473 - NITERS: The number of iterations of the loop.
2474 - NITERSM1: The number of iterations of the loop's latch.
2475 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2476 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2477 CHECK_PROFITABILITY is true.
2478 Output:
2479 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2480 iterate after vectorization; see vect_set_loop_condition for details.
2481 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2482 should be set to the number of scalar iterations handled by the
2483 vector loop. The SSA name is only used on exit from the loop.
2484
2485 This function peels prolog and epilog from the loop, adds guards skipping
2486 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2487 would look like:
2488
2489 guard_bb_1:
2490 if (prefer_scalar_loop) goto merge_bb_1
2491 else goto guard_bb_2
2492
2493 guard_bb_2:
2494 if (skip_prolog) goto merge_bb_2
2495 else goto prolog_preheader
2496
2497 prolog_preheader:
2498 PROLOG:
2499 prolog_header_bb:
2500 prolog_body
2501 if (exit_prolog_cond) goto prolog_exit_bb
2502 else goto prolog_header_bb
2503 prolog_exit_bb:
2504
2505 merge_bb_2:
2506
2507 vector_preheader:
2508 VECTOR LOOP:
2509 vector_header_bb:
2510 vector_body
2511 if (exit_vector_cond) goto vector_exit_bb
2512 else goto vector_header_bb
2513 vector_exit_bb:
2514
2515 guard_bb_3:
2516 if (skip_epilog) goto merge_bb_3
2517 else goto epilog_preheader
2518
2519 merge_bb_1:
2520
2521 epilog_preheader:
2522 EPILOG:
2523 epilog_header_bb:
2524 epilog_body
2525 if (exit_epilog_cond) goto merge_bb_3
2526 else goto epilog_header_bb
2527
2528 merge_bb_3:
2529
2530 Note this function peels the prolog and epilog, and adds the guards,
2531 only when necessary.
2532 This function returns the epilogue loop if a decision was made to vectorize
2533 it, otherwise NULL.
2534
2535 The analysis resulting in this epilogue loop's loop_vec_info was performed
2536 in the same vect_analyze_loop call as the main loop's. At that time
2537 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2538 vectorization factors than the main loop. This list is stored in the main
2539 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2540 vectorize the epilogue loop for a lower vectorization factor, the
2541 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2542 updated and linked to the epilogue loop. This is later used to vectorize
2543 the epilogue. The reason the loop_vec_info needs updating is that it was
2544 constructed based on the original main loop, and the epilogue loop is a
2545 copy of this loop, so all links pointing to statements in the original loop
2546 need updating. Furthermore, these loop_vec_infos share the
2547 data_reference's records, which will also need to be updated.
2548
2549 TODO: Guard for prefer_scalar_loop should be emitted along with
2550 versioning conditions if loop versioning is needed. */
2551
2552
2553 class loop *
2554 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2555 tree *niters_vector, tree *step_vector,
2556 tree *niters_vector_mult_vf_var, int th,
2557 bool check_profitability, bool niters_no_overflow,
2558 tree *advance)
2559 {
2560 edge e, guard_e;
2561 tree type = TREE_TYPE (niters), guard_cond;
2562 basic_block guard_bb, guard_to;
2563 profile_probability prob_prolog, prob_vector, prob_epilog;
2564 int estimated_vf;
2565 int prolog_peeling = 0;
2566 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2567 bool vect_epilogues_updated_niters = false;
2568 /* We currently do not support prolog peeling if the target alignment is not
2569 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2570 target alignment being constant. */
2571 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2572 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2573 return NULL;
2574
2575 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2576 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2577
2578 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2579 poly_uint64 bound_epilog = 0;
2580 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2581 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2582 bound_epilog += vf - 1;
2583 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2584 bound_epilog += 1;
2585 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2586 poly_uint64 bound_scalar = bound_epilog;
2587
2588 if (!prolog_peeling && !epilog_peeling)
2589 return NULL;
2590
2591 /* Before doing any peeling make sure to reset debug binds outside of
2592 the loop referring to defs not in LC SSA. */
2593 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2594 for (unsigned i = 0; i < loop->num_nodes; ++i)
2595 {
2596 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2597 imm_use_iterator ui;
2598 gimple *use_stmt;
2599 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2600 gsi_next (&gsi))
2601 {
2602 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2603 if (gimple_debug_bind_p (use_stmt)
2604 && loop != gimple_bb (use_stmt)->loop_father
2605 && !flow_loop_nested_p (loop,
2606 gimple_bb (use_stmt)->loop_father))
2607 {
2608 gimple_debug_bind_reset_value (use_stmt);
2609 update_stmt (use_stmt);
2610 }
2611 }
2612 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2613 gsi_next (&gsi))
2614 {
2615 ssa_op_iter op_iter;
2616 def_operand_p def_p;
2617 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2618 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2619 if (gimple_debug_bind_p (use_stmt)
2620 && loop != gimple_bb (use_stmt)->loop_father
2621 && !flow_loop_nested_p (loop,
2622 gimple_bb (use_stmt)->loop_father))
2623 {
2624 gimple_debug_bind_reset_value (use_stmt);
2625 update_stmt (use_stmt);
2626 }
2627 }
2628 }
2629
2630 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2631 estimated_vf = vect_vf_for_cost (loop_vinfo);
2632 if (estimated_vf == 2)
2633 estimated_vf = 3;
2634 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2635 .apply_scale (estimated_vf - 1, estimated_vf);
2636
2637 class loop *prolog, *epilog = NULL;
2638 class loop *first_loop = loop;
2639 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2640
2641 /* We might have a queued need to update virtual SSA form. As we
2642 delete the update SSA machinery below after doing a regular
2643 incremental SSA update during loop copying make sure we don't
2644 lose that fact.
2645 ??? Needing to update virtual SSA form by renaming is unfortunate
2646 but not all of the vectorizer code inserting new loads / stores
2647 properly assigns virtual operands to those statements. */
2648 update_ssa (TODO_update_ssa_only_virtuals);
2649
2650 create_lcssa_for_virtual_phi (loop);
2651
2652 /* If we're vectorizing an epilogue loop, the update_ssa above will
2653 have ensured that the virtual operand is in SSA form throughout the
2654 vectorized main loop. Normally it is possible to trace the updated
2655 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2656 back to scalar-stmt vuses, meaning that the effect of the SSA update
2657 remains local to the main loop. However, there are rare cases in
2658 which the vectorized loop has vdefs even when the original scalar
2659 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2660 introduces clobbers of the temporary vector array, which in turn
2661 needs new vdefs. If the scalar loop doesn't write to memory, these
2662 new vdefs will be the only ones in the vector loop.
2663
2664 In that case, update_ssa will have added a new virtual phi to the
2665 main loop, which previously didn't need one. Ensure that we (locally)
2666 maintain LCSSA form for the virtual operand, just as we would have
2667 done if the virtual phi had existed from the outset. This makes it
2668 easier to duplicate the scalar epilogue loop below. */
2669 tree vop_to_rename = NULL_TREE;
2670 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2671 {
2672 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2673 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2674 }
2675
2676 if (MAY_HAVE_DEBUG_BIND_STMTS)
2677 {
2678 gcc_assert (!adjust_vec.exists ());
2679 adjust_vec.create (32);
2680 }
2681 initialize_original_copy_tables ();
2682
2683 /* Record the anchor bb at which the guard should be placed if the scalar
2684 loop might be preferred. */
2685 basic_block anchor = loop_preheader_edge (loop)->src;
2686
2687 /* Generate the number of iterations for the prolog loop. We do this here
2688 so that we can also get the upper bound on the number of iterations. */
2689 tree niters_prolog;
2690 int bound_prolog = 0;
2691 if (prolog_peeling)
2692 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2693 &bound_prolog);
2694 else
2695 niters_prolog = build_int_cst (type, 0);
2696
2697 loop_vec_info epilogue_vinfo = NULL;
2698 if (vect_epilogues)
2699 {
2700 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2701 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2702 }
2703
2704 tree niters_vector_mult_vf = NULL_TREE;
2705 /* Save NITERS before peeling, as it may be changed by the prologue. */
2706 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2707 edge update_e = NULL, skip_e = NULL;
2708 unsigned int lowest_vf = constant_lower_bound (vf);
2709 /* If we know the number of scalar iterations for the main loop we should
2710 check whether after the main loop there are enough iterations left over
2711 for the epilogue. */
2712 if (vect_epilogues
2713 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2714 && prolog_peeling >= 0
2715 && known_eq (vf, lowest_vf))
2716 {
2717 unsigned HOST_WIDE_INT eiters
2718 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2719 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2720
2721 eiters -= prolog_peeling;
2722 eiters
2723 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
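      /* Illustrative sketch (hypothetical values): for 103 known scalar
	 iterations, no peeling for gaps, PROLOG_PEELING = 3 and
	 LOWEST_VF = 8, the statements above compute
	 (103 - 3) % 8 = 4 iterations for the epilogue. */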
2724
2725 while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2726 {
2727 delete epilogue_vinfo;
2728 epilogue_vinfo = NULL;
2729 if (loop_vinfo->epilogue_vinfos.length () == 0)
2730 {
2731 vect_epilogues = false;
2732 break;
2733 }
2734 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2735 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2736 }
2737 vect_epilogues_updated_niters = true;
2738 }
2739 /* Prolog loop may be skipped. */
2740 bool skip_prolog = (prolog_peeling != 0);
2741 /* Skip to the epilog when there are not enough iterations to enter the
2742 vectorized loop. If so, we perform runtime checks on NITERS to decide
2743 whether the vectorized loop should be skipped. If the number of scalar
2744 iterations is known at compile time, we only add such a runtime check
2745 when that number may be smaller than the number of iterations required
2746 to enter the vectorized loop; for this we use the upper bounds on the
2747 prolog and epilog peeling.
2748 When the number of iterations is unknown and no versioning is required,
2749 we have already asserted that there are enough scalar iterations to
2750 enter the main loop, so this skip is not necessary. When we are
2751 versioning then we only add such a skip if we have chosen to vectorize
2752 the epilogue. */
2753 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2754 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2755 bound_prolog + bound_epilog)
2756 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2757 || vect_epilogues));
2758 /* Epilog loop must be executed if the number of iterations for epilog
2759 loop is known at compile time, otherwise we need to add a check at
2760 the end of vector loop and skip to the end of epilog loop. */
2761 bool skip_epilog = (prolog_peeling < 0
2762 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2763 || !vf.is_constant ());
2764 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2765 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2766 skip_epilog = false;
2767
2768 if (skip_vector)
2769 {
2770 split_edge (loop_preheader_edge (loop));
2771
2772 /* Due to the order in which we peel prolog and epilog, we first
2773 propagate probability to the whole loop. The purpose is to
2774 avoid adjusting probabilities of both prolog and vector loops
2775 separately. Note in this case, the probability of epilog loop
2776 needs to be scaled back later. */
2777 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2778 if (prob_vector.initialized_p ())
2779 {
2780 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2781 scale_loop_profile (loop, prob_vector, 0);
2782 }
2783 }
2784
2785 dump_user_location_t loop_loc = find_loop_location (loop);
2786 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2787 if (vect_epilogues)
2788 /* Make sure to set the epilogue's scalar loop, so that we can use the
2789 original scalar loop as the remaining epilogue if necessary. */
2790 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2791 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2792
2793 if (prolog_peeling)
2794 {
2795 e = loop_preheader_edge (loop);
2796 if (!slpeel_can_duplicate_loop_p (loop, e))
2797 {
2798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2799 "loop can't be duplicated to preheader edge.\n");
2800 gcc_unreachable ();
2801 }
2802 /* Peel prolog and put it on preheader edge of loop. */
2803 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2804 if (!prolog)
2805 {
2806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2807 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2808 gcc_unreachable ();
2809 }
2810 prolog->force_vectorize = false;
2811 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2812 first_loop = prolog;
2813 reset_original_copy_tables ();
2814
2815 /* Update the number of iterations for prolog loop. */
2816 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2817 vect_set_loop_condition (prolog, NULL, niters_prolog,
2818 step_prolog, NULL_TREE, false);
2819
2820 /* Skip the prolog loop. */
2821 if (skip_prolog)
2822 {
2823 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2824 niters_prolog, build_int_cst (type, 0));
2825 guard_bb = loop_preheader_edge (prolog)->src;
2826 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2827 guard_to = split_edge (loop_preheader_edge (loop));
2828 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2829 guard_to, guard_bb,
2830 prob_prolog.invert (),
2831 irred_flag);
2832 e = EDGE_PRED (guard_to, 0);
2833 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2834 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2835
2836 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2837 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2838 }
2839
2840 /* Update init address of DRs. */
2841 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2842 /* Update niters for vector loop. */
2843 LOOP_VINFO_NITERS (loop_vinfo)
2844 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2845 LOOP_VINFO_NITERSM1 (loop_vinfo)
2846 = fold_build2 (MINUS_EXPR, type,
2847 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2848 bool new_var_p = false;
2849 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2850 /* It's guaranteed that vector loop bound before vectorization is at
2851 least VF, so set range information for the newly generated var. */
2852 if (new_var_p)
2853 set_range_info (niters, VR_RANGE,
2854 wi::to_wide (build_int_cst (type, vf)),
2855 wi::to_wide (TYPE_MAX_VALUE (type)));
2856
2857 /* Prolog iterates at most bound_prolog times, latch iterates at
2858 most bound_prolog - 1 times. */
2859 record_niter_bound (prolog, bound_prolog - 1, false, true);
2860 delete_update_ssa ();
2861 adjust_vec_debug_stmts ();
2862 scev_reset ();
2863 }
2864
2865 if (epilog_peeling)
2866 {
2867 e = single_exit (loop);
2868 if (!slpeel_can_duplicate_loop_p (loop, e))
2869 {
2870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2871 "loop can't be duplicated to exit edge.\n");
2872 gcc_unreachable ();
2873 }
2874 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2875 said epilog then we should use a copy of the main loop as a starting
2876 point. This loop may have already had some preliminary transformations
2877 to allow for more optimal vectorization, for example if-conversion.
2878 If we are not vectorizing the epilog then we should use the scalar loop
2879 as the transformations mentioned above make less or no sense when not
2880 vectorizing. */
2881 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2882 if (vop_to_rename)
2883 {
2884 /* Vectorizing the main loop can sometimes introduce a vdef to
2885 a loop that previously didn't have one; see the comment above
2886 the definition of VOP_TO_RENAME for details. The definition
2887 D that holds on E will then be different from the definition
2888 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2889 rename VOP_TO_RENAME to D when copying the loop.
2890
2891 The virtual operand is in LCSSA form for the main loop,
2892 and no stmt between the main loop and E needs a vdef,
2893 so we know that D is provided by a phi rather than by a
2894 vdef on a normal gimple stmt. */
2895 basic_block vdef_bb = e->src;
2896 gphi *vphi;
2897 while (!(vphi = get_virtual_phi (vdef_bb)))
2898 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2899 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2900 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2901 }
2902 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2903 if (!epilog)
2904 {
2905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2906 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2907 gcc_unreachable ();
2908 }
2909 epilog->force_vectorize = false;
2910 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2911
2912 /* The scalar version of the loop may be preferred. In this case, add a
2913 guard and skip to the epilog. Note this only happens when the number of
2914 iterations of the loop is unknown at compile time, otherwise this
2915 won't be vectorized. */
2916 if (skip_vector)
2917 {
2918 /* An additional epilogue iteration is peeled if a gap exists. */
2919 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2920 bound_prolog, bound_epilog,
2921 th, &bound_scalar,
2922 check_profitability);
2923 /* Build guard against NITERSM1 since NITERS may overflow. */
2924 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2925 guard_bb = anchor;
2926 guard_to = split_edge (loop_preheader_edge (epilog));
2927 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2928 guard_to, guard_bb,
2929 prob_vector.invert (),
2930 irred_flag);
2931 skip_e = guard_e;
2932 e = EDGE_PRED (guard_to, 0);
2933 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2934 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2935
2936 /* Simply propagate profile info from guard_bb to guard_to which is
2937 a merge point of control flow. */
2938 guard_to->count = guard_bb->count;
2939
2940 /* Scale probability of epilog loop back.
2941 FIXME: We should avoid scaling down and back up. Profile may
2942 get lost if we scale down to 0. */
2943 basic_block *bbs = get_loop_body (epilog);
2944 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2945 bbs[i]->count = bbs[i]->count.apply_scale
2946 (bbs[i]->count,
2947 bbs[i]->count.apply_probability
2948 (prob_vector));
2949 free (bbs);
2950 }
2951
2952 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2953 /* If the loop is peeled a non-zero constant number of times, NITERS now
2954 refers to orig_niters - prolog_peeling; it won't overflow even if
2955 orig_niters overflows. */
2956 niters_no_overflow |= (prolog_peeling > 0);
2957 vect_gen_vector_loop_niters (loop_vinfo, niters,
2958 niters_vector, step_vector,
2959 niters_no_overflow);
2960 if (!integer_onep (*step_vector))
2961 {
2962 /* On exit from the loop we will have an easy way of calculating
2963 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2964 until then. */
2965 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2966 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2967 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2968 }
2969 else
2970 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2971 &niters_vector_mult_vf);
2972 /* Update IVs of original loop as if they were advanced by
2973 niters_vector_mult_vf steps. */
2974 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2975 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2976 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2977 update_e);
2978
2979 if (skip_epilog)
2980 {
2981 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2982 niters, niters_vector_mult_vf);
2983 guard_bb = single_exit (loop)->dest;
2984 guard_to = split_edge (single_exit (epilog));
2985 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2986 skip_vector ? anchor : guard_bb,
2987 prob_epilog.invert (),
2988 irred_flag);
2989 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2990 single_exit (epilog));
2991 /* Only need to handle basic block before epilog loop if it's not
2992 the guard_bb, which is the case when skip_vector is true. */
2993 if (guard_bb != bb_before_epilog)
2994 {
2995 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2996
2997 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2998 }
2999 scale_loop_profile (epilog, prob_epilog, 0);
3000 }
3001 else
3002 slpeel_update_phi_nodes_for_lcssa (epilog);
3003
3004 unsigned HOST_WIDE_INT bound;
3005 if (bound_scalar.is_constant (&bound))
3006 {
3007 gcc_assert (bound != 0);
3008 /* -1 to convert loop iterations to latch iterations. */
3009 record_niter_bound (epilog, bound - 1, false, true);
3010 }
3011
3012 delete_update_ssa ();
3013 adjust_vec_debug_stmts ();
3014 scev_reset ();
3015 }
3016
3017 if (vect_epilogues)
3018 {
3019 epilog->aux = epilogue_vinfo;
3020 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3021
3022 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3023
3024 /* We must now calculate the number of iterations performed by the
3025 previous loop(s) and the EPILOGUE_NITERS left for the epilogue. */
3026 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3027 niters_prolog, niters_vector_mult_vf);
3028
3029 /* If SKIP_VECTOR, the previous loop may be skipped, so we insert a phi
3030 node to determine whether we are coming from the previous vectorized loop
3031 using the update_e edge or the skip_vector basic block using the
3032 skip_e edge. */
3033 if (skip_vector)
3034 {
3035 gcc_assert (update_e != NULL
3036 && skip_e != NULL
3037 && !vect_epilogues_updated_niters);
3038 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3039 update_e->dest);
3040 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3041 gimple *stmt = gimple_build_assign (new_ssa, niters);
3042 gimple_stmt_iterator gsi;
3043 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3044 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3045 {
3046 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3047 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3048 }
3049 else
3050 {
3051 gsi = gsi_last_bb (update_e->src);
3052 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3053 }
3054
3055 niters = new_ssa;
3056 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3057 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3058 UNKNOWN_LOCATION);
3059 niters = PHI_RESULT (new_phi);
3060 }
3061
3062 /* Set ADVANCE to the number of iterations performed by the previous
3063 loop and its prologue. */
3064 *advance = niters;
3065
3066 if (!vect_epilogues_updated_niters)
3067 {
3068 /* Subtract the number of iterations performed by the vectorized loop
3069 from the number of total iterations. */
3070 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3071 before_loop_niters,
3072 niters);
3073
3074 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3075 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3076 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3077 epilogue_niters,
3078 build_one_cst (TREE_TYPE (epilogue_niters)));
3079
3080 /* Decide what to do if the number of epilogue iterations is not
3081 a multiple of the epilogue loop's vectorization factor.
3082 We should have rejected the loop during the analysis phase
3083 if this fails. */
3084 if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3085 true))
3086 gcc_unreachable ();
3087 }
3088 }
3089
3090 adjust_vec.release ();
3091 free_original_copy_tables ();
3092
3093 return vect_epilogues ? epilog : NULL;
3094 }
3095
3096 /* Function vect_create_cond_for_niters_checks.
3097
3098 Create a conditional expression that represents the run-time checks for
3099 the loop's niter. The loop is guaranteed to terminate if the run-time
3100 checks hold.
3101
3102 Input:
3103 COND_EXPR - input conditional expression. New conditions will be chained
3104 with a logical AND operation. If it is NULL, the niter
3105 assumptions become the condition on their own.
3106 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
3107 to be checked.
3108
3109 Output:
3110 COND_EXPR - conditional expression.
3111
3112 The returned COND_EXPR is the conditional expression to be used in the
3113 if statement that controls which version of the loop gets executed at
3114 runtime. */
3115
3116 static void
3117 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3118 {
3119 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3120
3121 if (*cond_expr)
3122 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3123 *cond_expr, part_cond_expr);
3124 else
3125 *cond_expr = part_cond_expr;
3126 }
3127
3128 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3129 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3130
3131 static void
3132 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3133 {
3134 if (*cond_expr)
3135 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3136 *cond_expr, part_cond_expr);
3137 else
3138 *cond_expr = part_cond_expr;
3139 }
3140
3141 /* Function vect_create_cond_for_align_checks.
3142
3143 Create a conditional expression that represents the alignment checks for
3144 all of the data references (array element references) whose alignment must be
3145 checked at runtime.
3146
3147 Input:
3148 COND_EXPR - input conditional expression. New conditions will be chained
3149 with logical AND operation.
3150 LOOP_VINFO - two fields of the loop information are used.
3151 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3152 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3153
3154 Output:
3155 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3156 expression.
3157 The returned value is the conditional expression to be used in the if
3158 statement that controls which version of the loop gets executed at runtime.
3159
3160 The algorithm makes two assumptions:
3161 1) The number of bytes "n" in a vector is a power of 2.
3162 2) An address "a" is aligned if a%n is zero, and this
3163 test can be done as a&(n-1) == 0. For example, for 16
3164 byte vectors the test is a&0xf == 0. */
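/* Illustrative sketch (hypothetical values): for two may-misalign references
   with start addresses p and q and 16-byte vectors (mask = 0xf), the
   generated condition is

     (((intptr) p | (intptr) q) & 0xf) == 0

   where "intptr" stands for the signed pointer-sized integer type used
   below; i.e. all checked addresses must be 16-byte aligned. */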
3165
3166 static void
3167 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3168 tree *cond_expr,
3169 gimple_seq *cond_expr_stmt_list)
3170 {
3171 vec<stmt_vec_info> may_misalign_stmts
3172 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3173 stmt_vec_info stmt_info;
3174 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3175 tree mask_cst;
3176 unsigned int i;
3177 tree int_ptrsize_type;
3178 char tmp_name[20];
3179 tree or_tmp_name = NULL_TREE;
3180 tree and_tmp_name;
3181 gimple *and_stmt;
3182 tree ptrsize_zero;
3183 tree part_cond_expr;
3184
3185 /* Check that mask is one less than a power of 2, i.e., mask is
3186 all zeros followed by all ones. */
3187 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3188
3189 int_ptrsize_type = signed_type_for (ptr_type_node);
3190
3191 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3192 of the first vector of the i'th data reference. */
3193
3194 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3195 {
3196 gimple_seq new_stmt_list = NULL;
3197 tree addr_base;
3198 tree addr_tmp_name;
3199 tree new_or_tmp_name;
3200 gimple *addr_stmt, *or_stmt;
3201 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3202 bool negative = tree_int_cst_compare
3203 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3204 tree offset = negative
3205 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3206
3207 /* create: addr_tmp = (int)(address_of_first_vector) */
3208 addr_base =
3209 vect_create_addr_base_for_vector_ref (loop_vinfo,
3210 stmt_info, &new_stmt_list,
3211 offset);
3212 if (new_stmt_list != NULL)
3213 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3214
3215 sprintf (tmp_name, "addr2int%u", i);
3216 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3217 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3218 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3219
3220 /* The addresses are ORed together. */
3221
3222 if (or_tmp_name != NULL_TREE)
3223 {
3224 /* create: or_tmp = or_tmp | addr_tmp */
3225 sprintf (tmp_name, "orptrs%u", i);
3226 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3227 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3228 or_tmp_name, addr_tmp_name);
3229 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3230 or_tmp_name = new_or_tmp_name;
3231 }
3232 else
3233 or_tmp_name = addr_tmp_name;
3234
3235 } /* end for i */
3236
3237 mask_cst = build_int_cst (int_ptrsize_type, mask);
3238
3239 /* create: and_tmp = or_tmp & mask */
3240 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3241
3242 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3243 or_tmp_name, mask_cst);
3244 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3245
3246 /* Make and_tmp the left operand of the conditional test against zero.
3247 If and_tmp has a nonzero bit then some address is unaligned. */
3248 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3249 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3250 and_tmp_name, ptrsize_zero);
3251 chain_cond_expr (cond_expr, part_cond_expr);
3252 }
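/* As an illustration of the runtime test built above (hypothetical data
   references A and B, assuming 16-byte vectors so LOOP_VINFO_PTR_MASK is
   0xf), the versioning condition is equivalent to the scalar code

     intptr_t addr2int0 = (intptr_t) &A[start];
     intptr_t addr2int1 = (intptr_t) &B[start];
     intptr_t orptrs1 = addr2int0 | addr2int1;
     bool all_aligned_p = (orptrs1 & 0xf) == 0;

   i.e. ORing all addresses and masking with n-1 tests every one of them
   for n-byte alignment with a single AND and compare.  */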
3253
3254 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3255 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3256 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3257 and this new condition are true. Treat a null *COND_EXPR as "true". */
3258
3259 static void
3260 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3261 {
3262 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3263 unsigned int i;
3264 vec_object_pair *pair;
3265 FOR_EACH_VEC_ELT (pairs, i, pair)
3266 {
3267 tree addr1 = build_fold_addr_expr (pair->first);
3268 tree addr2 = build_fold_addr_expr (pair->second);
3269 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3270 addr1, addr2);
3271 chain_cond_expr (cond_expr, part_cond_expr);
3272 }
3273 }
3274
3275 /* Create an expression that is true when all lower-bound conditions for
3276 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3277
3278 static void
3279 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3280 {
3281 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3282 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3283 {
3284 tree expr = lower_bounds[i].expr;
3285 tree type = unsigned_type_for (TREE_TYPE (expr));
3286 expr = fold_convert (type, expr);
3287 poly_uint64 bound = lower_bounds[i].min_value;
3288 if (!lower_bounds[i].unsigned_p)
3289 {
3290 expr = fold_build2 (PLUS_EXPR, type, expr,
3291 build_int_cstu (type, bound - 1));
3292 bound += bound - 1;
3293 }
3294 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3295 build_int_cstu (type, bound));
3296 chain_cond_expr (cond_expr, part_cond_expr);
3297 }
3298 }
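/* The signed case above relies on a standard range-check trick.  With
   illustrative numbers (BOUND == 4, 32-bit signed EXPR), the generated
   test is equivalent to

     unsigned int u = (unsigned int) expr + 3U;   (expr + (bound - 1))
     bool ok = u >= 7U;                           (2 * bound - 1)

   Signed values of expr from -3 to 3 map u into [0, 7) and fail, while
   expr >= 4 or expr <= -4 passes, so a single unsigned comparison checks
   |expr| >= bound for both signs.  */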
3299
3300 /* Function vect_create_cond_for_alias_checks.
3301
3302 Create a conditional expression that represents the run-time checks for
3303 overlapping of address ranges represented by a list of data reference
3304 relations passed as input.
3305
3306 Input:
3307 COND_EXPR - input conditional expression. New conditions will be chained
3308 with logical AND operation. If it is NULL, the generated
3309 alias checks become the condition on their own.
3310 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3311 to be checked.
3312
3313 Output:
3314 COND_EXPR - conditional expression.
3315
3316 The returned COND_EXPR is the conditional expression to be used in the if
3317 statement that controls which version of the loop gets executed at runtime.
3318 */
3319
3320 void
3321 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3322 {
3323 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3324 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3325
3326 if (comp_alias_ddrs.is_empty ())
3327 return;
3328
3329 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3330 &comp_alias_ddrs, cond_expr);
3331 if (dump_enabled_p ())
3332 dump_printf_loc (MSG_NOTE, vect_location,
3333 "created %u versioning for alias checks.\n",
3334 comp_alias_ddrs.length ());
3335 }
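/* Conceptually (an illustrative sketch; the actual checks are emitted by
   create_runtime_alias_checks and may be merged or simplified), a single
   DDR pair with segment start addresses a, b and segment lengths len_a,
   len_b yields a non-overlap test equivalent to

     bool independent_p = (a + len_a <= b) || (b + len_b <= a);

   and the tests for all pairs are AND-chained into *COND_EXPR.  */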
3336
3337
3338 /* Function vect_loop_versioning.
3339
3340 If the loop has data references that may or may not be aligned and/or
3341 has data reference relations whose independence was not proven, then
3342 two versions of the loop need to be generated, one which is vectorized
3343 and one which isn't. A test is then generated to control which of the
3344 loops is executed. The test checks for the alignment of all of the
3345 data references that may or may not be aligned. An additional
3346 sequence of runtime tests is generated for each pair of DDRs whose
3347 independence was not proven. The vectorized version of the loop is
3348 executed only if both the alias and alignment tests pass.
3349
3350 The test generated to check which version of the loop is executed
3351 is modified to also check for profitability as indicated by the
3352 cost model threshold TH.
3353
3354 The versioning precondition(s) are placed in *COND_EXPR and
3355 *COND_EXPR_STMT_LIST. */
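/* Schematically (names illustrative), the versioned CFG produced below is

     if (versioning_cond)     <- alignment/alias/niters/cost checks
       loop to be vectorized
     else
       unmodified scalar copy of the loop
*/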
3356
3357 class loop *
3358 vect_loop_versioning (loop_vec_info loop_vinfo,
3359 gimple *loop_vectorized_call)
3360 {
3361 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3362 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3363 basic_block condition_bb;
3364 gphi_iterator gsi;
3365 gimple_stmt_iterator cond_exp_gsi;
3366 basic_block merge_bb;
3367 basic_block new_exit_bb;
3368 edge new_exit_e, e;
3369 gphi *orig_phi, *new_phi;
3370 tree cond_expr = NULL_TREE;
3371 gimple_seq cond_expr_stmt_list = NULL;
3372 tree arg;
3373 profile_probability prob = profile_probability::likely ();
3374 gimple_seq gimplify_stmt_list = NULL;
3375 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3376 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3377 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3378 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3379 poly_uint64 versioning_threshold
3380 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3381 tree version_simd_if_cond
3382 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3383 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3384
3385 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3386 && !ordered_p (th, versioning_threshold))
3387 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3388 build_int_cst (TREE_TYPE (scalar_loop_iters),
3389 th - 1));
3390 if (maybe_ne (versioning_threshold, 0U))
3391 {
3392 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3393 build_int_cst (TREE_TYPE (scalar_loop_iters),
3394 versioning_threshold - 1));
3395 if (cond_expr)
3396 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3397 expr, cond_expr);
3398 else
3399 cond_expr = expr;
3400 }
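  /* Illustrative numbers: with a cost-model threshold TH of 10 and
     SCALAR_LOOP_ITERS holding niters - 1 (LOOP_VINFO_NITERSM1), the check
     built above is

       niters - 1 >= 10 - 1

     so the vectorized path is entered only when the loop runs at least TH
     iterations; VERSIONING_THRESHOLD is folded in the same way.  */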
3401
3402 if (version_niter)
3403 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3404
3405 if (cond_expr)
3406 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3407 &cond_expr_stmt_list,
3408 is_gimple_condexpr, NULL_TREE);
3409
3410 if (version_align)
3411 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3412 &cond_expr_stmt_list);
3413
3414 if (version_alias)
3415 {
3416 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3417 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3418 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3419 }
3420
3421 if (version_simd_if_cond)
3422 {
3423 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3424 if (flag_checking)
3425 if (basic_block bb
3426 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3427 gcc_assert (bb != loop->header
3428 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3429 && (scalar_loop == NULL
3430 || (bb != scalar_loop->header
3431 && dominated_by_p (CDI_DOMINATORS,
3432 scalar_loop->header, bb))));
3433 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3434 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3435 version_simd_if_cond, zero);
3436 if (cond_expr)
3437 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3438 c, cond_expr);
3439 else
3440 cond_expr = c;
3441 if (dump_enabled_p ())
3442 dump_printf_loc (MSG_NOTE, vect_location,
3443 "created versioning for simd if condition check.\n");
3444 }
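  /* E.g. for an OpenMP "#pragma omp simd if (flag)" loop (illustrative),
     the versioning condition gains "flag != 0", so the vectorized path is
     taken only when the if clause is true at runtime.  */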
3445
3446 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3447 &gimplify_stmt_list,
3448 is_gimple_condexpr, NULL_TREE);
3449 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3450
3451 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3452 invariant in. */
3453 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3454 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3455 !gsi_end_p (gsi); gsi_next (&gsi))
3456 {
3457 gimple *stmt = gsi_stmt (gsi);
3458 update_stmt (stmt);
3459 ssa_op_iter iter;
3460 use_operand_p use_p;
3461 basic_block def_bb;
3462 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3463 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3464 && flow_bb_inside_loop_p (outermost, def_bb))
3465 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3466 }
3467
3468 /* Search for the outermost loop we can version. Avoid versioning of
3469 non-perfect nests but allow if-conversion versioned loops inside. */
3470 class loop *loop_to_version = loop;
3471 if (flow_loop_nested_p (outermost, loop))
3472 {
3473 if (dump_enabled_p ())
3474 dump_printf_loc (MSG_NOTE, vect_location,
3475 "trying to apply versioning to outer loop %d\n",
3476 outermost->num);
3477 if (outermost->num == 0)
3478 outermost = superloop_at_depth (loop, 1);
3479 /* And avoid applying versioning on non-perfect nests. */
3480 while (loop_to_version != outermost
3481 && single_exit (loop_outer (loop_to_version))
3482 && (!loop_outer (loop_to_version)->inner->next
3483 || vect_loop_vectorized_call (loop_to_version))
3484 && (!loop_outer (loop_to_version)->inner->next
3485 || !loop_outer (loop_to_version)->inner->next->next))
3486 loop_to_version = loop_outer (loop_to_version);
3487 }
3488
3489 /* Apply versioning. If there is already a scalar version created by
3490 if-conversion, re-use that. Note we cannot re-use the copy of
3491 an if-converted outer-loop when vectorizing the inner loop only. */
3492 gcond *cond;
3493 if ((!loop_to_version->inner || loop == loop_to_version)
3494 && loop_vectorized_call)
3495 {
3496 gcc_assert (scalar_loop);
3497 condition_bb = gimple_bb (loop_vectorized_call);
3498 cond = as_a <gcond *> (last_stmt (condition_bb));
3499 gimple_cond_set_condition_from_tree (cond, cond_expr);
3500 update_stmt (cond);
3501
3502 if (cond_expr_stmt_list)
3503 {
3504 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3505 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3506 GSI_SAME_STMT);
3507 }
3508
3509 /* if-conversion uses profile_probability::always () for both paths,
3510 so reset the probabilities of both paths appropriately. */
3511 edge te, fe;
3512 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3513 te->probability = prob;
3514 fe->probability = prob.invert ();
3515 /* We can scale the loop counts immediately but have to postpone
3516 scaling the scalar loop because we re-use it during peeling. */
3517 scale_loop_frequencies (loop_to_version, te->probability);
3518 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3519
3520 nloop = scalar_loop;
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location,
3523 "reusing %sloop version created by if conversion\n",
3524 loop_to_version != loop ? "outer " : "");
3525 }
3526 else
3527 {
3528 if (loop_to_version != loop
3529 && dump_enabled_p ())
3530 dump_printf_loc (MSG_NOTE, vect_location,
3531 "applying loop versioning to outer loop %d\n",
3532 loop_to_version->num);
3533
3534 initialize_original_copy_tables ();
3535 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3536 prob, prob.invert (), prob, prob.invert (), true);
3537 gcc_assert (nloop);
3538 nloop = get_loop_copy (loop);
3539
3540 /* Kill off IFN_LOOP_VECTORIZED calls in the copy, since nobody
3541 will reap those otherwise; they also refer to the original
3542 loops. */
3543 class loop *l = loop;
3544 while (gimple *call = vect_loop_vectorized_call (l))
3545 {
3546 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3547 fold_loop_internal_call (call, boolean_false_node);
3548 l = loop_outer (l);
3549 }
3550 free_original_copy_tables ();
3551
3552 if (cond_expr_stmt_list)
3553 {
3554 cond_exp_gsi = gsi_last_bb (condition_bb);
3555 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3556 GSI_SAME_STMT);
3557 }
3558
3559 /* Loop versioning violates an assumption we try to maintain during
3560 vectorization - that the loop exit block has a single predecessor.
3561 After versioning, the exit block of both loop versions is the same
3562 basic block (i.e. it has two predecessors). Just in order to simplify
3563 following transformations in the vectorizer, we fix this situation
3564 here by adding a new (empty) block on the exit-edge of the loop,
3565 with the proper loop-exit phis to maintain loop-closed-form.
3566 If loop versioning wasn't done from loop, but from scalar_loop instead,
3567 merge_bb will already have just a single predecessor. */
3568
3569 merge_bb = single_exit (loop_to_version)->dest;
3570 if (EDGE_COUNT (merge_bb->preds) >= 2)
3571 {
3572 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3573 new_exit_bb = split_edge (single_exit (loop_to_version));
3574 new_exit_e = single_exit (loop_to_version);
3575 e = EDGE_SUCC (new_exit_bb, 0);
3576
3577 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3578 gsi_next (&gsi))
3579 {
3580 tree new_res;
3581 orig_phi = gsi.phi ();
3582 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3583 new_phi = create_phi_node (new_res, new_exit_bb);
3584 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3585 add_phi_arg (new_phi, arg, new_exit_e,
3586 gimple_phi_arg_location_from_edge (orig_phi, e));
3587 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3588 }
3589 }
3590
3591 update_ssa (TODO_update_ssa);
3592 }
3593
3594 if (version_niter)
3595 {
3596 /* The versioned loop could be infinite, so we need to clear existing
3597 niter information that was copied from the original loop. */
3598 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3599 vect_free_loop_info_assumptions (nloop);
3600 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3601 loop_constraint_set (loop, LOOP_C_INFINITE);
3602 }
3603
3604 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3605 && dump_enabled_p ())
3606 {
3607 if (version_alias)
3608 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3609 vect_location,
3610 "loop versioned for vectorization because of "
3611 "possible aliasing\n");
3612 if (version_align)
3613 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3614 vect_location,
3615 "loop versioned for vectorization to enhance "
3616 "alignment\n");
3617
3618 }
3619
3620 return nloop;
3621 }