1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53
54 /*************************************************************************
55 Simple Loop Peeling Utilities
56
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
59
60
61 /* Renames the use *OP_P. */
62
63 static void
64 rename_use_op (use_operand_p op_p)
65 {
66 tree new_name;
67
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
70
71 new_name = get_current_def (USE_FROM_PTR (op_p));
72
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
76
77 /* An ordinary ssa name defined in the loop. */
78
79 SET_USE (op_p, new_name);
80 }
81
82
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
85 true. */
86
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
89 {
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
97
98 if (rename_from_outer_loop)
99 {
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
102 }
103
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
106 {
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
110 }
111
112 FOR_EACH_EDGE (e, ei, bb->preds)
113 {
114 if (!flow_bb_inside_loop_p (loop, e->src))
115 {
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
119 {
120 if (outer_loop->inner->next)
121 {
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
128 }
129 }
130 }
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
134 }
135 }
136
137
138 struct adjust_info
139 {
140 tree from, to;
141 basic_block bb;
142 };
143
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
151
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
155
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
158 {
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
165
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
167
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
171 {
172 use_operand_p use_p;
173 basic_block bbuse;
174
175 if (!is_gimple_debug (stmt))
176 continue;
177
178 gcc_assert (gimple_debug_bind_p (stmt));
179
180 bbuse = gimple_bb (stmt);
181
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
186 {
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
191 {
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
194 }
195 }
196 }
197 }
198
199 /* Adjust debug stmts as scheduled before. */
200
201 static void
202 adjust_vec_debug_stmts (void)
203 {
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
206
207 gcc_assert (adjust_vec.exists ());
208
209 while (!adjust_vec.is_empty ())
210 {
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
213 }
214 }
215
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
220
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
223 {
224 adjust_info ai;
225
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
230 {
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
234
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
239 }
240 }
241
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
246
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
249 {
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
251
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
253
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
257 }
258
259 /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
260 that the control should have during the first iteration and NEXT_CTRL is the
261 value that it should have on subsequent iterations. */
262
263 static void
264 vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
265 tree next_ctrl)
266 {
267 gphi *phi = create_phi_node (ctrl, loop->header);
268 add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
270 }
271
272 /* Add SEQ to the end of LOOP's preheader block. */
273
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
276 {
277 if (seq)
278 {
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
282 }
283 }
284
285 /* Add SEQ to the beginning of LOOP's header block. */
286
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
289 {
290 if (seq)
291 {
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294 }
295 }
296
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
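/* As an illustration of the encoding built below (lane counts chosen only
   for this example): for an 8-element vector type, OFFSET 0 selects the
   permutation { 0, 8, 1, 9, 2, 10, 3, 11 }, interleaving the first halves
   of the two inputs, while OFFSET 1 selects { 4, 12, 5, 13, 6, 14, 7, 15 },
   interleaving the second halves.  */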
301
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
305 {
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
310 {
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
313 }
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
316 }
317
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
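/* A small illustration of the interleaving strategy used when DEST and SRC
   share the same mask type (lane values are only an example): given a
   source mask { s0, s1, ..., s7 }, the two destination masks become
   { s0, s0, s1, s1, s2, s2, s3, s3 } and { s4, s4, s5, s5, s6, s6, s7, s7 },
   i.e. each source bit controls two adjacent destination lanes.  */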
321
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
324 rgroup_controls *src_rgm)
325 {
326 tree src_masktype = src_rgm->type;
327 tree dest_masktype = dest_rgm->type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
335 {
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
342 {
343 tree src = src_rgm->controls[i / 2];
344 tree dest = dest_rgm->controls[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
352 {
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
359 }
360 gimple_seq_add_stmt (seq, stmt);
361 }
362 return true;
363 }
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
368 {
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
375 {
376 tree src = src_rgm->controls[i / 2];
377 tree dest = dest_rgm->controls[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
381 }
382 return true;
383 }
384 return false;
385 }
386
387 /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
388 for all the rgroup controls in RGC and return a control that is nonzero
389 when the loop needs to iterate. Add any new preheader statements to
390 PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
391
392 RGC belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
394
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
399
400 It is known that:
401
402 NITERS * RGC->max_nscalars_per_iter * RGC->factor
403
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
406
407 VF * RGC->max_nscalars_per_iter * RGC->factor
408
409 might overflow before hitting a value above:
410
411 (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
412
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks or zero
415 lengths for RGC.
416
417 Note: the cost of the code generated by this function is modeled
418 by vect_estimate_min_profitable_iters, so changes here may need
419 corresponding changes there. */
420
421 static tree
422 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
423 gimple_seq *preheader_seq,
424 gimple_stmt_iterator loop_cond_gsi,
425 rgroup_controls *rgc, tree niters,
426 tree niters_skip, bool might_wrap_p)
427 {
428 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
429 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
430 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
431
432 tree ctrl_type = rgc->type;
433 unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
434 poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
435 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
436 tree length_limit = NULL_TREE;
437 /* For length-based controls, we need length_limit to keep the lengths in range. */
438 if (!use_masks_p)
439 length_limit = build_int_cst (compare_type, nitems_per_ctrl);
440
441 /* Calculate the maximum number of item values that the rgroup
442 handles in total, the number that it handles for each iteration
443 of the vector loop, and the number that it should skip during the
444 first iteration of the vector loop. */
445 tree nitems_total = niters;
446 tree nitems_step = build_int_cst (iv_type, vf);
447 tree nitems_skip = niters_skip;
448 if (nitems_per_iter != 1)
449 {
450 /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
451 these multiplications don't overflow. */
452 tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
453 tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
454 nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
455 nitems_total, compare_factor);
456 nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
457 nitems_step, iv_factor);
458 if (nitems_skip)
459 nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
460 nitems_skip, compare_factor);
461 }
462
463 /* Create an induction variable that counts the number of items
464 processed. */
465 tree index_before_incr, index_after_incr;
466 gimple_stmt_iterator incr_gsi;
467 bool insert_after;
468 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
469 create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
470 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
471
472 tree zero_index = build_int_cst (compare_type, 0);
473 tree test_index, test_limit, first_limit;
474 gimple_stmt_iterator *test_gsi;
475 if (might_wrap_p)
476 {
477 /* In principle the loop should stop iterating once the incremented
478 IV reaches a value greater than or equal to:
479
480 NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
481
482 However, there's no guarantee that this addition doesn't overflow
483 the comparison type, or that the IV hits a value above it before
484 wrapping around. We therefore adjust the limit down by one
485 IV step:
486
487 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
488 -[infinite-prec] NITEMS_STEP
489
490 and compare the IV against this limit _before_ incrementing it.
491 Since the comparison type is unsigned, we actually want the
492 subtraction to saturate at zero:
493
494 (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
495 -[sat] NITEMS_STEP
496
497 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
498
499 NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
500
501 where the rightmost subtraction can be done directly in
502 COMPARE_TYPE. */
503 test_index = index_before_incr;
504 tree adjust = gimple_convert (preheader_seq, compare_type,
505 nitems_step);
506 if (nitems_skip)
507 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
508 adjust, nitems_skip);
509 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
510 nitems_total, adjust);
511 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
512 test_limit, adjust);
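      /* A worked example of the saturating limit (values illustrative only):
	 with NITEMS_TOTAL == 3, NITEMS_STEP == 8 and no NITEMS_SKIP, ADJUST
	 is 8 and TEST_LIMIT becomes MAX (3, 8) - 8 == 0, the saturated value
	 of 3 - 8.  The pre-increment IV value 0 is then not below the limit,
	 so the next control is all-false (or zero length) and the loop runs
	 exactly one partial iteration.  */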
513 test_gsi = &incr_gsi;
514
515 /* Get a safe limit for the first iteration. */
516 if (nitems_skip)
517 {
518 /* The first vector iteration can handle at most NITEMS_STEP
519 items. NITEMS_STEP <= CONST_LIMIT, and adding
520 NITEMS_SKIP to that cannot overflow. */
521 tree const_limit = build_int_cst (compare_type,
522 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
523 * nitems_per_iter);
524 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
525 nitems_total, const_limit);
526 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
527 first_limit, nitems_skip);
528 }
529 else
530 /* For the first iteration it doesn't matter whether the IV hits
531 a value above NITEMS_TOTAL. That only matters for the latch
532 condition. */
533 first_limit = nitems_total;
534 }
535 else
536 {
537 /* Test the incremented IV, which will always hit a value above
538 the bound before wrapping. */
539 test_index = index_after_incr;
540 test_limit = nitems_total;
541 if (nitems_skip)
542 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
543 test_limit, nitems_skip);
544 test_gsi = &loop_cond_gsi;
545
546 first_limit = test_limit;
547 }
548
549 /* Convert the IV value to the comparison type (either a no-op or
550 a demotion). */
551 gimple_seq test_seq = NULL;
552 test_index = gimple_convert (&test_seq, compare_type, test_index);
553 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
554
555 /* Provide a definition of each control in the group. */
556 tree next_ctrl = NULL_TREE;
557 tree ctrl;
558 unsigned int i;
559 FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
560 {
561 /* Previous controls will cover BIAS items. This control covers the
562 next batch. */
563 poly_uint64 bias = nitems_per_ctrl * i;
564 tree bias_tree = build_int_cst (compare_type, bias);
565
566 /* See whether the first iteration of the vector loop is known
567 to have a full control. */
568 poly_uint64 const_limit;
569 bool first_iteration_full
570 = (poly_int_tree_p (first_limit, &const_limit)
571 && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
572
573 /* Rather than have a new IV that starts at BIAS and goes up to
574 TEST_LIMIT, prefer to use the same 0-based IV for each control
575 and adjust the bound down by BIAS. */
576 tree this_test_limit = test_limit;
577 if (i != 0)
578 {
579 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
580 compare_type, this_test_limit,
581 bias_tree);
582 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
583 compare_type, this_test_limit,
584 bias_tree);
585 }
586
587 /* Create the initial control. First include all items that
588 are within the loop limit. */
589 tree init_ctrl = NULL_TREE;
590 if (!first_iteration_full)
591 {
592 tree start, end;
593 if (first_limit == test_limit)
594 {
595 /* Use a natural test between zero (the initial IV value)
596 and the loop limit. The "else" block would be valid too,
597 but this choice can avoid the need to load BIAS_TREE into
598 a register. */
599 start = zero_index;
600 end = this_test_limit;
601 }
602 else
603 {
604 /* FIRST_LIMIT is the maximum number of items handled by the
605 first iteration of the vector loop. Test the portion
606 associated with this control. */
607 start = bias_tree;
608 end = first_limit;
609 }
610
611 if (use_masks_p)
612 {
613 init_ctrl = make_temp_ssa_name (ctrl_type, NULL, "max_mask");
614 gimple *tmp_stmt = vect_gen_while (init_ctrl, start, end);
615 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
616 }
617 else
618 {
619 init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
620 gimple_seq seq = vect_gen_len (init_ctrl, start,
621 end, length_limit);
622 gimple_seq_add_seq (preheader_seq, seq);
623 }
624 }
625
626 /* Now AND out the bits that are within the number of skipped
627 items. */
628 poly_uint64 const_skip;
629 if (nitems_skip
630 && !(poly_int_tree_p (nitems_skip, &const_skip)
631 && known_le (const_skip, bias)))
632 {
633 gcc_assert (use_masks_p);
634 tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
635 bias_tree, nitems_skip);
636 if (init_ctrl)
637 init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
638 init_ctrl, unskipped_mask);
639 else
640 init_ctrl = unskipped_mask;
641 }
642
643 if (!init_ctrl)
644 {
645 /* First iteration is full. */
646 if (use_masks_p)
647 init_ctrl = build_minus_one_cst (ctrl_type);
648 else
649 init_ctrl = length_limit;
650 }
651
652 /* Get the control value for the next iteration of the loop. */
653 if (use_masks_p)
654 {
655 next_ctrl = make_temp_ssa_name (ctrl_type, NULL, "next_mask");
656 gcall *call = vect_gen_while (next_ctrl, test_index, this_test_limit);
657 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
658 }
659 else
660 {
661 next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
662 gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
663 length_limit);
664 gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
665 }
666
667 vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
668 }
669 return next_ctrl;
670 }
671
672 /* Set up the iteration condition and rgroup controls for LOOP, given
673 that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
674 loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
675 the number of iterations of the original scalar loop that should be
676 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
677 for vect_set_loop_condition.
678
679 Insert the branch-back condition before LOOP_COND_GSI and return the
680 final gcond. */
681
682 static gcond *
683 vect_set_loop_condition_partial_vectors (class loop *loop,
684 loop_vec_info loop_vinfo, tree niters,
685 tree final_iv, bool niters_maybe_zero,
686 gimple_stmt_iterator loop_cond_gsi)
687 {
688 gimple_seq preheader_seq = NULL;
689 gimple_seq header_seq = NULL;
690
691 bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
692 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
693 unsigned int compare_precision = TYPE_PRECISION (compare_type);
694 tree orig_niters = niters;
695
696 /* Type of the initial value of NITERS. */
697 tree ni_actual_type = TREE_TYPE (niters);
698 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
699 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
700
701 /* Convert NITERS to the same size as the compare. */
702 if (compare_precision > ni_actual_precision
703 && niters_maybe_zero)
704 {
705 /* We know that there is always at least one iteration, so if the
706 count is zero then it must have wrapped. Cope with this by
707 subtracting 1 before the conversion and adding 1 to the result. */
708 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
709 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
710 niters, build_minus_one_cst (ni_actual_type));
711 niters = gimple_convert (&preheader_seq, compare_type, niters);
712 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
713 niters, build_one_cst (compare_type));
714 }
715 else
716 niters = gimple_convert (&preheader_seq, compare_type, niters);
717
718 /* Iterate over all the rgroups and fill in their controls. We could use
719 the first control from any rgroup for the loop condition; here we
720 arbitrarily pick the last. */
721 tree test_ctrl = NULL_TREE;
722 rgroup_controls *rgc;
723 unsigned int i;
724 auto_vec<rgroup_controls> *controls = use_masks_p
725 ? &LOOP_VINFO_MASKS (loop_vinfo)
726 : &LOOP_VINFO_LENS (loop_vinfo);
727 FOR_EACH_VEC_ELT (*controls, i, rgc)
728 if (!rgc->controls.is_empty ())
729 {
730 /* First try using permutes. This adds a single vector
731 instruction to the loop for each mask, but needs no extra
732 loop invariants or IVs. */
733 unsigned int nmasks = i + 1;
734 if (use_masks_p && (nmasks & 1) == 0)
735 {
736 rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
737 if (!half_rgc->controls.is_empty ()
738 && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
739 continue;
740 }
741
742 /* See whether zero-based IV would ever generate all-false masks
743 or zero length before wrapping around. */
744 bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
745
746 /* Set up all controls for this group. */
747 test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
748 &preheader_seq,
749 loop_cond_gsi, rgc,
750 niters, niters_skip,
751 might_wrap_p);
752 }
753
754 /* Emit all accumulated statements. */
755 add_preheader_seq (loop, preheader_seq);
756 add_header_seq (loop, header_seq);
757
758 /* Get a boolean result that tells us whether to iterate. */
759 edge exit_edge = single_exit (loop);
760 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
761 tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
762 gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
763 NULL_TREE, NULL_TREE);
764 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
765
766 /* The loop iterates (NITERS - 1) / VF + 1 times.
767 Subtract one from this to get the latch count. */
768 tree step = build_int_cst (compare_type,
769 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
770 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
771 build_minus_one_cst (compare_type));
772 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
773 niters_minus_one, step);
774
775 if (final_iv)
776 {
777 gassign *assign = gimple_build_assign (final_iv, orig_niters);
778 gsi_insert_on_edge_immediate (single_exit (loop), assign);
779 }
780
781 return cond_stmt;
782 }
783
784 /* Like vect_set_loop_condition, but handle the case in which the vector
785 loop handles exactly VF scalars per iteration. */
786
787 static gcond *
788 vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
789 tree final_iv, bool niters_maybe_zero,
790 gimple_stmt_iterator loop_cond_gsi)
791 {
792 tree indx_before_incr, indx_after_incr;
793 gcond *cond_stmt;
794 gcond *orig_cond;
795 edge pe = loop_preheader_edge (loop);
796 edge exit_edge = single_exit (loop);
797 gimple_stmt_iterator incr_gsi;
798 bool insert_after;
799 enum tree_code code;
800 tree niters_type = TREE_TYPE (niters);
801
802 orig_cond = get_loop_exit_condition (loop);
803 gcc_assert (orig_cond);
804 loop_cond_gsi = gsi_for_stmt (orig_cond);
805
806 tree init, limit;
807 if (!niters_maybe_zero && integer_onep (step))
808 {
809 /* In this case we can use a simple 0-based IV:
810
811 A:
812 x = 0;
813 do
814 {
815 ...
816 x += 1;
817 }
818 while (x < NITERS); */
819 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
820 init = build_zero_cst (niters_type);
821 limit = niters;
822 }
823 else
824 {
825 /* The following works for all values of NITERS except 0:
826
827 B:
828 x = 0;
829 do
830 {
831 ...
832 x += STEP;
833 }
834 while (x <= NITERS - STEP);
835
836 so that the loop continues to iterate if x + STEP - 1 < NITERS
837 but stops if x + STEP - 1 >= NITERS.
838
839 However, if NITERS is zero, x never hits a value above NITERS - STEP
840 before wrapping around. There are two obvious ways of dealing with
841 this:
842
843 - start at STEP - 1 and compare x before incrementing it
844 - start at -1 and compare x after incrementing it
845
846 The latter is simpler and is what we use. The loop in this case
847 looks like:
848
849 C:
850 x = -1;
851 do
852 {
853 ...
854 x += STEP;
855 }
856 while (x < NITERS - STEP);
857
858 In both cases the loop limit is NITERS - STEP. */
859 gimple_seq seq = NULL;
860 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
861 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
862 if (seq)
863 {
864 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
865 gcc_assert (!new_bb);
866 }
867 if (niters_maybe_zero)
868 {
869 /* Case C. */
870 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
871 init = build_all_ones_cst (niters_type);
872 }
873 else
874 {
875 /* Case B. */
876 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
877 init = build_zero_cst (niters_type);
878 }
879 }
880
881 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
882 create_iv (init, step, NULL_TREE, loop,
883 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
884 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
885 true, NULL_TREE, true,
886 GSI_SAME_STMT);
887 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
888 true, GSI_SAME_STMT);
889
890 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
891 NULL_TREE);
892
893 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
894
895 /* Record the number of latch iterations. */
896 if (limit == niters)
897 /* Case A: the loop iterates NITERS times. Subtract one to get the
898 latch count. */
899 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
900 build_int_cst (niters_type, 1));
901 else
902 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
903 Subtract one from this to get the latch count. */
904 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
905 limit, step);
906
907 if (final_iv)
908 {
909 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
910 indx_after_incr, init);
911 gsi_insert_on_edge_immediate (single_exit (loop), assign);
912 }
913
914 return cond_stmt;
915 }
916
917 /* If we're using fully-masked loops, make LOOP iterate:
918
919 N == (NITERS - 1) / STEP + 1
920
921 times. When NITERS is zero, this is equivalent to making the loop
922 execute (1 << M) / STEP times, where M is the precision of NITERS.
923 NITERS_MAYBE_ZERO is true if this last case might occur.
924
925 If we're not using fully-masked loops, make LOOP iterate:
926
927 N == (NITERS - STEP) / STEP + 1
928
929 times, where NITERS is known to be outside the range [1, STEP - 1].
930 This is equivalent to making the loop execute NITERS / STEP times
931 when NITERS is nonzero and (1 << M) / STEP times otherwise.
932 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
933
934 If FINAL_IV is nonnull, it is an SSA name that should be set to
935 N * STEP on exit from the loop.
936
937 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
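/* To make the two formulas concrete (numbers chosen only as an example):
   with STEP == 4, a fully-masked loop with NITERS == 10 iterates
   (10 - 1) / 4 + 1 == 3 times, the last time under a partial mask, while a
   normal vector loop with NITERS == 8 iterates (8 - 4) / 4 + 1 == 2 times.  */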
938
939 void
940 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
941 tree niters, tree step, tree final_iv,
942 bool niters_maybe_zero)
943 {
944 gcond *cond_stmt;
945 gcond *orig_cond = get_loop_exit_condition (loop);
946 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
947
948 if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
949 cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
950 niters, final_iv,
951 niters_maybe_zero,
952 loop_cond_gsi);
953 else
954 cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
955 niters_maybe_zero,
956 loop_cond_gsi);
957
958 /* Remove old loop exit test. */
959 stmt_vec_info orig_cond_info;
960 if (loop_vinfo
961 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
962 loop_vinfo->remove_stmt (orig_cond_info);
963 else
964 gsi_remove (&loop_cond_gsi, true);
965
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
968 cond_stmt);
969 }
970
971 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
972 For all PHI arguments in FROM->dest and TO->dest coming from those
973 edges, ensure that each TO->dest PHI argument has its current_def
974 set to the corresponding value from FROM. */
975
976 static void
977 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
978 {
979 gimple_stmt_iterator gsi_from, gsi_to;
980
981 for (gsi_from = gsi_start_phis (from->dest),
982 gsi_to = gsi_start_phis (to->dest);
983 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
984 {
985 gimple *from_phi = gsi_stmt (gsi_from);
986 gimple *to_phi = gsi_stmt (gsi_to);
987 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
988 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
989 if (virtual_operand_p (from_arg))
990 {
991 gsi_next (&gsi_from);
992 continue;
993 }
994 if (virtual_operand_p (to_arg))
995 {
996 gsi_next (&gsi_to);
997 continue;
998 }
999 if (TREE_CODE (from_arg) != SSA_NAME)
1000 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
1001 else if (TREE_CODE (to_arg) == SSA_NAME
1002 && from_arg != to_arg)
1003 {
1004 if (get_current_def (to_arg) == NULL_TREE)
1005 {
1006 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
1007 TREE_TYPE (get_current_def
1008 (from_arg))));
1009 set_current_def (to_arg, get_current_def (from_arg));
1010 }
1011 }
1012 gsi_next (&gsi_from);
1013 gsi_next (&gsi_to);
1014 }
1015
1016 gphi *from_phi = get_virtual_phi (from->dest);
1017 gphi *to_phi = get_virtual_phi (to->dest);
1018 if (from_phi)
1019 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
1020 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1021 }
1022
1023
1024 /* Given LOOP this function generates a new copy of it and puts it
1025 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1026 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1027 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1028 entry or exit of LOOP. */
1029
1030 class loop *
1031 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
1032 class loop *scalar_loop, edge e)
1033 {
1034 class loop *new_loop;
1035 basic_block *new_bbs, *bbs, *pbbs;
1036 bool at_exit;
1037 bool was_imm_dom;
1038 basic_block exit_dest;
1039 edge exit, new_exit;
1040 bool duplicate_outer_loop = false;
1041
1042 exit = single_exit (loop);
1043 at_exit = (e == exit);
1044 if (!at_exit && e != loop_preheader_edge (loop))
1045 return NULL;
1046
1047 if (scalar_loop == NULL)
1048 scalar_loop = loop;
1049
1050 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1051 pbbs = bbs + 1;
1052 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1053 /* Allow duplication of outer loops. */
1054 if (scalar_loop->inner)
1055 duplicate_outer_loop = true;
1056 /* Check whether duplication is possible. */
1057 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1058 {
1059 free (bbs);
1060 return NULL;
1061 }
1062
1063 /* Generate new loop structure. */
1064 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1065 duplicate_subloops (scalar_loop, new_loop);
1066
1067 exit_dest = exit->dest;
1068 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1069 exit_dest) == loop->header ?
1070 true : false);
1071
1072 /* Also copy the pre-header, this avoids jumping through hoops to
1073 duplicate the loop entry PHI arguments. Create an empty
1074 pre-header unconditionally for this. */
1075 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1076 edge entry_e = single_pred_edge (preheader);
1077 bbs[0] = preheader;
1078 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1079
1080 exit = single_exit (scalar_loop);
1081 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1082 &exit, 1, &new_exit, NULL,
1083 at_exit ? loop->latch : e->src, true);
1084 exit = single_exit (loop);
1085 basic_block new_preheader = new_bbs[0];
1086
1087 /* Before installing PHI arguments make sure that the edges
1088 into them match that of the scalar loop we analyzed. This
1089 makes sure the SLP tree matches up between the main vectorized
1090 loop and the epilogue vectorized copies. */
1091 if (single_succ_edge (preheader)->dest_idx
1092 != single_succ_edge (new_bbs[0])->dest_idx)
1093 {
1094 basic_block swap_bb = new_bbs[1];
1095 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1096 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1097 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1098 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1099 }
1100 if (duplicate_outer_loop)
1101 {
1102 class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1103 if (loop_preheader_edge (scalar_loop)->dest_idx
1104 != loop_preheader_edge (new_inner_loop)->dest_idx)
1105 {
1106 basic_block swap_bb = new_inner_loop->header;
1107 gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1108 std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1109 EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1110 EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1111 }
1112 }
1113
1114 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1115
1116 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1117 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1118 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1119
1120 if (scalar_loop != loop)
1121 {
1122 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1123 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1124 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1125 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1126 header) to have current_def set, so copy them over. */
1127 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1128 exit);
1129 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1130 0),
1131 EDGE_SUCC (loop->latch, 0));
1132 }
1133
1134 if (at_exit) /* Add the loop copy at exit. */
1135 {
1136 if (scalar_loop != loop)
1137 {
1138 gphi_iterator gsi;
1139 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1140
1141 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1142 gsi_next (&gsi))
1143 {
1144 gphi *phi = gsi.phi ();
1145 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1146 location_t orig_locus
1147 = gimple_phi_arg_location_from_edge (phi, e);
1148
1149 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1150 }
1151 }
1152 redirect_edge_and_branch_force (e, new_preheader);
1153 flush_pending_stmts (e);
1154 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1155 if (was_imm_dom || duplicate_outer_loop)
1156 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1157
1158 /* And remove the unnecessary forwarder again. Keep the other
1159 one so we have a proper pre-header for the loop at the exit edge. */
1160 redirect_edge_pred (single_succ_edge (preheader),
1161 single_pred (preheader));
1162 delete_basic_block (preheader);
1163 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1164 loop_preheader_edge (scalar_loop)->src);
1165 }
1166 else /* Add the copy at entry. */
1167 {
1168 if (scalar_loop != loop)
1169 {
1170 /* Remove the unnecessary forwarder of scalar_loop again. */
1171 redirect_edge_pred (single_succ_edge (preheader),
1172 single_pred (preheader));
1173 delete_basic_block (preheader);
1174 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1175 loop_preheader_edge (scalar_loop)->src);
1176 preheader = split_edge (loop_preheader_edge (loop));
1177 entry_e = single_pred_edge (preheader);
1178 }
1179
1180 redirect_edge_and_branch_force (entry_e, new_preheader);
1181 flush_pending_stmts (entry_e);
1182 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1183
1184 redirect_edge_and_branch_force (new_exit, preheader);
1185 flush_pending_stmts (new_exit);
1186 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1187
1188 /* And remove the unnecessary forwarder again. Keep the other
1189 one so we have a proper pre-header for the loop at the exit edge. */
1190 redirect_edge_pred (single_succ_edge (new_preheader),
1191 single_pred (new_preheader));
1192 delete_basic_block (new_preheader);
1193 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1194 loop_preheader_edge (new_loop)->src);
1195 }
1196
1197 if (scalar_loop != loop)
1198 {
1199 /* Update new_loop->header PHIs, so that on the preheader
1200 edge they are the ones from loop rather than scalar_loop. */
1201 gphi_iterator gsi_orig, gsi_new;
1202 edge orig_e = loop_preheader_edge (loop);
1203 edge new_e = loop_preheader_edge (new_loop);
1204
1205 for (gsi_orig = gsi_start_phis (loop->header),
1206 gsi_new = gsi_start_phis (new_loop->header);
1207 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1208 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1209 {
1210 gphi *orig_phi = gsi_orig.phi ();
1211 gphi *new_phi = gsi_new.phi ();
1212 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1213 location_t orig_locus
1214 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1215
1216 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1217 }
1218 }
1219
1220 free (new_bbs);
1221 free (bbs);
1222
1223 checking_verify_dominators (CDI_DOMINATORS);
1224
1225 return new_loop;
1226 }
1227
1228
1229 /* Given the condition expression COND, put it as the last statement of
1230 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1231 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1232 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1233 new edge as irreducible if IRREDUCIBLE_P is true. */
1234
1235 static edge
1236 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1237 basic_block guard_to, basic_block dom_bb,
1238 profile_probability probability, bool irreducible_p)
1239 {
1240 gimple_stmt_iterator gsi;
1241 edge new_e, enter_e;
1242 gcond *cond_stmt;
1243 gimple_seq gimplify_stmt_list = NULL;
1244
1245 enter_e = EDGE_SUCC (guard_bb, 0);
1246 enter_e->flags &= ~EDGE_FALLTHRU;
1247 enter_e->flags |= EDGE_FALSE_VALUE;
1248 gsi = gsi_last_bb (guard_bb);
1249
1250 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1251 NULL_TREE);
1252 if (gimplify_stmt_list)
1253 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1254
1255 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1256 gsi = gsi_last_bb (guard_bb);
1257 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1258
1259 /* Add new edge to connect guard block to the merge/loop-exit block. */
1260 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1261
1262 new_e->probability = probability;
1263 if (irreducible_p)
1264 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1265
1266 enter_e->probability = probability.invert ();
1267 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1268
1269 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1270 if (enter_e->dest->loop_father->header == enter_e->dest)
1271 split_edge (enter_e);
1272
1273 return new_e;
1274 }
1275
1276
1277 /* This function verifies that the following restrictions apply to LOOP:
1278 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1279 for an innermost loop, or 5 basic blocks for an outer loop.
1280 (2) it is single entry, single exit
1281 (3) its exit condition is the last stmt in the header
1282 (4) E is the entry/exit edge of LOOP.
1283 */
1284
1285 bool
1286 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1287 {
1288 edge exit_e = single_exit (loop);
1289 edge entry_e = loop_preheader_edge (loop);
1290 gcond *orig_cond = get_loop_exit_condition (loop);
1291 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1292 unsigned int num_bb = loop->inner? 5 : 2;
1293
1294 /* All loops have an outer scope; the only case loop->outer is NULL is for
1295 the function itself. */
1296 if (!loop_outer (loop)
1297 || loop->num_nodes != num_bb
1298 || !empty_block_p (loop->latch)
1299 || !single_exit (loop)
1300 /* Verify that new loop exit condition can be trivially modified. */
1301 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1302 || (e != exit_e && e != entry_e))
1303 return false;
1304
1305 return true;
1306 }
1307
1308 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1309 in the exit bb and rename all the uses after the loop. This simplifies
1310 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1311 (but normally loop closed SSA form doesn't require virtual PHIs to be
1312 in the same form). Doing this early simplifies checking which
1313 uses should be renamed.
1314
1315 If we create a new phi after the loop, return the definition that
1316 applies on entry to the loop, otherwise return null. */
1317
1318 static tree
1319 create_lcssa_for_virtual_phi (class loop *loop)
1320 {
1321 gphi_iterator gsi;
1322 edge exit_e = single_exit (loop);
1323
1324 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1325 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1326 {
1327 gphi *phi = gsi.phi ();
1328 for (gsi = gsi_start_phis (exit_e->dest);
1329 !gsi_end_p (gsi); gsi_next (&gsi))
1330 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1331 break;
1332 if (gsi_end_p (gsi))
1333 {
1334 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1335 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1336 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1337 imm_use_iterator imm_iter;
1338 gimple *stmt;
1339 use_operand_p use_p;
1340
1341 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1342 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1343 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1344 gimple_phi_set_result (new_phi, new_vop);
1345 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1346 if (stmt != new_phi
1347 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1348 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1349 SET_USE (use_p, new_vop);
1350
1351 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1352 }
1353 break;
1354 }
1355 return NULL_TREE;
1356 }
1357
1358 /* Function vect_get_loop_location.
1359
1360 Extract the location of the loop in the source code.
1361 If the loop is not well formed for vectorization, an estimated
1362 location is calculated.
1363 Return the loop location if successful and NULL if not. */
1364
1365 dump_user_location_t
1366 find_loop_location (class loop *loop)
1367 {
1368 gimple *stmt = NULL;
1369 basic_block bb;
1370 gimple_stmt_iterator si;
1371
1372 if (!loop)
1373 return dump_user_location_t ();
1374
1375 stmt = get_loop_exit_condition (loop);
1376
1377 if (stmt
1378 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1379 return stmt;
1380
1381 /* If we got here the loop is probably not "well formed",
1382 try to estimate the loop location. */
1383
1384 if (!loop->header)
1385 return dump_user_location_t ();
1386
1387 bb = loop->header;
1388
1389 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1390 {
1391 stmt = gsi_stmt (si);
1392 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1393 return stmt;
1394 }
1395
1396 return dump_user_location_t ();
1397 }
1398
1399 /* Return true if the phi described by STMT_INFO defines an IV of the
1400 loop to be vectorized. */
1401
1402 static bool
1403 iv_phi_p (stmt_vec_info stmt_info)
1404 {
1405 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1406 if (virtual_operand_p (PHI_RESULT (phi)))
1407 return false;
1408
1409 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1410 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1411 return false;
1412
1413 return true;
1414 }
1415
1416 /* Function vect_can_advance_ivs_p
1417
1418 In case the number of iterations that LOOP iterates is unknown at compile
1419 time, an epilog loop will be generated, and the loop induction variables
1420 (IVs) will be "advanced" to the value they are supposed to take just before
1421 the epilog loop. Here we check that the access function of the loop IVs
1422 and the expression that represents the loop bound are simple enough.
1423 These restrictions will be relaxed in the future. */
1424
1425 bool
1426 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1427 {
1428 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1429 basic_block bb = loop->header;
1430 gphi_iterator gsi;
1431
1432 /* Analyze phi functions of the loop header. */
1433
1434 if (dump_enabled_p ())
1435 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1436 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1437 {
1438 tree evolution_part;
1439
1440 gphi *phi = gsi.phi ();
1441 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1442 if (dump_enabled_p ())
1443 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1444 phi_info->stmt);
1445
1446 /* Skip virtual phis. The data dependences that are associated with
1447 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1448
1449 Skip reduction phis. */
1450 if (!iv_phi_p (phi_info))
1451 {
1452 if (dump_enabled_p ())
1453 dump_printf_loc (MSG_NOTE, vect_location,
1454 "reduc or virtual phi. skip.\n");
1455 continue;
1456 }
1457
1458 /* Analyze the evolution function. */
1459
1460 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1461 if (evolution_part == NULL_TREE)
1462 {
1463 if (dump_enabled_p ())
1464 dump_printf (MSG_MISSED_OPTIMIZATION,
1465 "No access function or evolution.\n");
1466 return false;
1467 }
1468
1469 /* FORNOW: We do not transform initial conditions of IVs
1470 whose evolution functions are not invariant in the loop. */
1471
1472 if (!expr_invariant_in_loop_p (loop, evolution_part))
1473 {
1474 if (dump_enabled_p ())
1475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1476 "evolution not invariant in loop.\n");
1477 return false;
1478 }
1479
1480 /* FORNOW: We do not transform initial conditions of IVs
1481 whose evolution functions are polynomials of degree >= 2. */
1482
1483 if (tree_is_chrec (evolution_part))
1484 {
1485 if (dump_enabled_p ())
1486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1487 "evolution is chrec.\n");
1488 return false;
1489 }
1490 }
1491
1492 return true;
1493 }
1494
1495
1496 /* Function vect_update_ivs_after_vectorizer.
1497
1498 "Advance" the induction variables of LOOP to the value they should take
1499 after the execution of LOOP. This is currently necessary because the
1500 vectorizer does not handle induction variables that are used after the
1501 loop. Such a situation occurs when the last iterations of LOOP are
1502 peeled, because:
1503 1. We introduced new uses after LOOP for IVs that were not originally used
1504 after LOOP: the IVs of LOOP are now used by an epilog loop.
1505 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1506 times, whereas the loop IVs should be bumped N times.
1507
1508 Input:
1509 - LOOP - a loop that is going to be vectorized. The last few iterations
1510 of LOOP were peeled.
1511 - NITERS - the number of iterations that LOOP executes (before it is
1512 vectorized). i.e, the number of times the ivs should be bumped.
1513 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1514 coming out from LOOP on which there are uses of the LOOP ivs
1515 (this is the path from LOOP->exit to epilog_loop->preheader).
1516
1517 The new definitions of the ivs are placed in LOOP->exit.
1518 The phi args associated with the edge UPDATE_E in the bb
1519 UPDATE_E->dest are updated accordingly.
1520
1521 Assumption 1: Like the rest of the vectorizer, this function assumes
1522 a single loop exit that has a single predecessor.
1523
1524 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1525 organized in the same order.
1526
1527 Assumption 3: The access function of the ivs is simple enough (see
1528 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1529
1530 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1531 coming out of LOOP on which the ivs of LOOP are used (this is the path
1532 that leads to the epilog loop; other paths skip the epilog loop). This
1533 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1534 needs to have its phis updated.
1535 */
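/* For example (values purely illustrative): an IV with initial value 0 and
   step 4, in a loop for which NITERS == 20, is given the exit value
   0 + 20 * 4 == 80 by the code below.  */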
1536
1537 static void
1538 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1539 tree niters, edge update_e)
1540 {
1541 gphi_iterator gsi, gsi1;
1542 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1543 basic_block update_bb = update_e->dest;
1544 basic_block exit_bb = single_exit (loop)->dest;
1545
1546 /* Make sure there exists a single-predecessor exit bb: */
1547 gcc_assert (single_pred_p (exit_bb));
1548 gcc_assert (single_succ_edge (exit_bb) == update_e);
1549
1550 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1551 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1552 gsi_next (&gsi), gsi_next (&gsi1))
1553 {
1554 tree init_expr;
1555 tree step_expr, off;
1556 tree type;
1557 tree var, ni, ni_name;
1558 gimple_stmt_iterator last_gsi;
1559
1560 gphi *phi = gsi.phi ();
1561 gphi *phi1 = gsi1.phi ();
1562 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1563 if (dump_enabled_p ())
1564 dump_printf_loc (MSG_NOTE, vect_location,
1565 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1566
1567 /* Skip reduction and virtual phis. */
1568 if (!iv_phi_p (phi_info))
1569 {
1570 if (dump_enabled_p ())
1571 dump_printf_loc (MSG_NOTE, vect_location,
1572 "reduc or virtual phi. skip.\n");
1573 continue;
1574 }
1575
1576 type = TREE_TYPE (gimple_phi_result (phi));
1577 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1578 step_expr = unshare_expr (step_expr);
1579
1580 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1581 of degree >= 2 or exponential. */
1582 gcc_assert (!tree_is_chrec (step_expr));
1583
1584 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1585
1586 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1587 fold_convert (TREE_TYPE (step_expr), niters),
1588 step_expr);
1589 if (POINTER_TYPE_P (type))
1590 ni = fold_build_pointer_plus (init_expr, off);
1591 else
1592 ni = fold_build2 (PLUS_EXPR, type,
1593 init_expr, fold_convert (type, off));
1594
1595 var = create_tmp_var (type, "tmp");
1596
1597 last_gsi = gsi_last_bb (exit_bb);
1598 gimple_seq new_stmts = NULL;
1599 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1600 /* Exit_bb shouldn't be empty. */
1601 if (!gsi_end_p (last_gsi))
1602 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1603 else
1604 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1605
1606 /* Fix phi expressions in the successor bb. */
1607 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1608 }
1609 }
1610
1611 /* Return a gimple value containing the misalignment (measured in vector
1612 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1613 it is away from a perfectly aligned address. Add any new statements
1614 to SEQ. */
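/* For example (illustrative values only): with a target alignment of 16
   bytes, an element size of 4 bytes and a start address ending in ...8,
   misalign_in_bytes is (addr & 15) == 8 and the returned value is
   8 >> log2 (4) == 2 elements.  */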
1615
1616 static tree
1617 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1618 {
1619 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1620 stmt_vec_info stmt_info = dr_info->stmt;
1621 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1622
1623 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1624 unsigned HOST_WIDE_INT target_align_c;
1625 tree target_align_minus_1;
1626
1627 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1628 size_zero_node) < 0;
1629 tree offset = (negative
1630 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1631 : size_zero_node);
1632 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1633 stmt_info, seq,
1634 offset);
1635 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1636 if (target_align.is_constant (&target_align_c))
1637 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1638 else
1639 {
1640 tree vla = build_int_cst (type, target_align);
1641 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1642 fold_build2 (MINUS_EXPR, type,
1643 build_int_cst (type, 0), vla));
1644 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1645 build_int_cst (type, 1));
1646 }
1647
1648 HOST_WIDE_INT elem_size
1649 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1650 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1651
1652 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1653 tree int_start_addr = fold_convert (type, start_addr);
1654 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1655 target_align_minus_1);
1656
1657 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1658 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1659 elem_size_log);
1660
1661 return misalign_in_elems;
1662 }
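/* Worked example (illustrative only, the values are assumptions): for a
   4-byte element type with a target alignment of 16 bytes and a start
   address whose low bits are 0x8, the computation above yields
     misalign_in_bytes = addr & (16 - 1) = 8
     misalign_in_elems = 8 >> log2 (4)   = 2
   i.e. the first access is two elements past an aligned boundary. */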
1663
1664 /* Function vect_gen_prolog_loop_niters
1665
1666 Generate the number of iterations which should be peeled as prolog for the
1667 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1668 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1669 As a result, after the execution of this loop, the data reference DR will
1670 refer to an aligned location. The following computation is generated:
1671
1672 If the misalignment of DR is known at compile time:
1673 addr_mis = DR_MISALIGNMENT (dr);
1674 Else, compute address misalignment in bytes:
1675 addr_mis = addr & (target_align - 1)
1676
1677 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1678
1679 (elem_size = element type size; an element is the scalar element whose type
1680 is the inner type of the vectype)
1681
1682 The computations will be emitted at the end of BB. We also compute and
1683 store the upper bound (inclusive) of the result in BOUND.
1684
1685 When the step of the data-ref in the loop is not 1 (as in interleaved data
1686 and SLP), the number of iterations of the prolog must be divided by the step
1687 (which is equal to the size of the interleaved group).
1688
1689 The above formulas assume that VF == number of elements in the vector. This
1690 may not hold when there are multiple data types in the loop.
1691 In this case, for some data-references in the loop the VF does not represent
1692 the number of elements that fit in the vector. Therefore, instead of VF we
1693 use TYPE_VECTOR_SUBPARTS. */
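/* Worked example (illustrative only, the values are assumptions): with 4-byte
   elements, a 16-byte target alignment (so align_in_elems = 4), a positive
   step of one element and a runtime misalignment of 2 elements, the code
   below computes
     prolog_niters = (4 - 2) & (4 - 1) = 2
   i.e. two scalar iterations are peeled, after which the data reference is
   aligned; *BOUND is set to align_in_elems - 1 = 3. */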
1694
1695 static tree
1696 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1697 basic_block bb, int *bound)
1698 {
1699 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1700 tree var;
1701 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1702 gimple_seq stmts = NULL, new_stmts = NULL;
1703 tree iters, iters_name;
1704 stmt_vec_info stmt_info = dr_info->stmt;
1705 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1706 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1707
1708 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1709 {
1710 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1711
1712 if (dump_enabled_p ())
1713 dump_printf_loc (MSG_NOTE, vect_location,
1714 "known peeling = %d.\n", npeel);
1715
1716 iters = build_int_cst (niters_type, npeel);
1717 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1718 }
1719 else
1720 {
1721 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1722 tree type = TREE_TYPE (misalign_in_elems);
1723 HOST_WIDE_INT elem_size
1724 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1725 /* We only do prolog peeling if the target alignment is known at compile
1726 time. */
1727 poly_uint64 align_in_elems =
1728 exact_div (target_align, elem_size);
1729 tree align_in_elems_minus_1 =
1730 build_int_cst (type, align_in_elems - 1);
1731 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1732
1733 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1734 & (align_in_elems - 1)). */
1735 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1736 size_zero_node) < 0;
1737 if (negative)
1738 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1739 align_in_elems_tree);
1740 else
1741 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1742 misalign_in_elems);
1743 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1744 iters = fold_convert (niters_type, iters);
1745 unsigned HOST_WIDE_INT align_in_elems_c;
1746 if (align_in_elems.is_constant (&align_in_elems_c))
1747 *bound = align_in_elems_c - 1;
1748 else
1749 *bound = -1;
1750 }
1751
1752 if (dump_enabled_p ())
1753 dump_printf_loc (MSG_NOTE, vect_location,
1754 "niters for prolog loop: %T\n", iters);
1755
1756 var = create_tmp_var (niters_type, "prolog_loop_niters");
1757 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1758
1759 if (new_stmts)
1760 gimple_seq_add_seq (&stmts, new_stmts);
1761 if (stmts)
1762 {
1763 gcc_assert (single_succ_p (bb));
1764 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1765 if (gsi_end_p (gsi))
1766 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1767 else
1768 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1769 }
1770 return iters_name;
1771 }
1772
1773
1774 /* Function vect_update_init_of_dr
1775
1776 If CODE is PLUS, the vector loop starts NITERS iterations after the
1777 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1778 iterations before the scalar one (using masking to skip inactive
1779 elements). This function updates the information recorded in DR to
1780 account for the difference. Specifically, it updates the OFFSET
1781 field of DR_INFO. */
1782
1783 static void
1784 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1785 {
1786 struct data_reference *dr = dr_info->dr;
1787 tree offset = dr_info->offset;
1788 if (!offset)
1789 offset = build_zero_cst (sizetype);
1790
1791 niters = fold_build2 (MULT_EXPR, sizetype,
1792 fold_convert (sizetype, niters),
1793 fold_convert (sizetype, DR_STEP (dr)));
1794 offset = fold_build2 (code, sizetype,
1795 fold_convert (sizetype, offset), niters);
1796 dr_info->offset = offset;
1797 }
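/* Illustrative sketch (hypothetical values): for a data reference with a
   DR_STEP of 4 bytes and NITERS = 2 peeled prolog iterations, a PLUS update
   adds 2 * 4 = 8 bytes to DR_INFO's offset, so the vector loop's first access
   starts 8 bytes after the scalar loop's; a MINUS update (used for peeling by
   masking) subtracts the same amount instead. */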
1798
1799
1800 /* Function vect_update_inits_of_drs
1801
1802 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1803 CODE and NITERS are as for vect_update_init_of_dr. */
1804
1805 void
1806 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1807 tree_code code)
1808 {
1809 unsigned int i;
1810 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1811 struct data_reference *dr;
1812
1813 DUMP_VECT_SCOPE ("vect_update_inits_of_drs");
1814
1815 /* Adjust NITERS to sizetype. We used to insert the stmts on the loop
1816 preheader here, but since these niters might also be used to update the
1817 epilogue's niters and data references, we can't insert them here, as the
1818 definition might not always dominate its uses. */
1819 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1820 niters = fold_convert (sizetype, niters);
1821
1822 FOR_EACH_VEC_ELT (datarefs, i, dr)
1823 {
1824 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1825 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1826 vect_update_init_of_dr (dr_info, niters, code);
1827 }
1828 }
1829
1830 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1831 by masking. This involves calculating the number of iterations to
1832 be peeled and then aligning all memory references appropriately. */
1833
1834 void
1835 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1836 {
1837 tree misalign_in_elems;
1838 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
1839
1840 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1841
1842 /* From the information recorded in LOOP_VINFO get the number of iterations
1843 that need to be skipped via masking. */
1844 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1845 {
1846 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1847 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1848 misalign_in_elems = build_int_cst (type, misalign);
1849 }
1850 else
1851 {
1852 gimple_seq seq1 = NULL, seq2 = NULL;
1853 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1854 misalign_in_elems = fold_convert (type, misalign_in_elems);
1855 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1856 &seq2, true, NULL_TREE);
1857 gimple_seq_add_seq (&seq1, seq2);
1858 if (seq1)
1859 {
1860 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1861 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1862 gcc_assert (!new_bb);
1863 }
1864 }
1865
1866 if (dump_enabled_p ())
1867 dump_printf_loc (MSG_NOTE, vect_location,
1868 "misalignment for fully-masked loop: %T\n",
1869 misalign_in_elems);
1870
1871 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1872
1873 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1874 }
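/* Illustrative sketch (hypothetical values): with VF = 8 and
   LOOP_VINFO_PEELING_FOR_ALIGNMENT = 3, the code above records 8 - 3 = 5 in
   LOOP_VINFO_MASK_SKIP_NITERS and shifts every data reference back by 5
   iterations' worth of bytes, so the first vector iteration starts 5 elements
   early with its first 5 lanes inactive. */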
1875
1876 /* This function builds ni_name = number of iterations. Statements
1877 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1878 it to TRUE if a new ssa_var is generated. */
1879
1880 tree
1881 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1882 {
1883 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1884 if (TREE_CODE (ni) == INTEGER_CST)
1885 return ni;
1886 else
1887 {
1888 tree ni_name, var;
1889 gimple_seq stmts = NULL;
1890 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1891
1892 var = create_tmp_var (TREE_TYPE (ni), "niters");
1893 ni_name = force_gimple_operand (ni, &stmts, false, var);
1894 if (stmts)
1895 {
1896 gsi_insert_seq_on_edge_immediate (pe, stmts);
1897 if (new_var_p != NULL)
1898 *new_var_p = true;
1899 }
1900
1901 return ni_name;
1902 }
1903 }
1904
1905 /* Calculate the number of iterations above which the vectorized loop will be
1906 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1907 of the prolog loop. If it is an integer constant, its value is also passed
1908 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1909 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1910 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1911 threshold below which the scalar (rather than vectorized) loop will be
1912 executed. This function stores the upper bound (inclusive) of the result
1913 in BOUND_SCALAR. */
1914
1915 static tree
1916 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1917 int bound_prolog, poly_int64 bound_epilog, int th,
1918 poly_uint64 *bound_scalar,
1919 bool check_profitability)
1920 {
1921 tree type = TREE_TYPE (niters_prolog);
1922 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1923 build_int_cst (type, bound_epilog));
1924
1925 *bound_scalar = bound_prolog + bound_epilog;
1926 if (check_profitability)
1927 {
1928 /* TH indicates the minimum niters of the vectorized loop, while we
1929 compute the maximum niters of the scalar loop. */
1930 th--;
1931 /* Peeling a constant number of times. */
1932 if (int_niters_prolog >= 0)
1933 {
1934 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1935 return build_int_cst (type, *bound_scalar);
1936 }
1937 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1938 and BOUND_EPILOG are inclusive upper bounds. */
1939 if (known_ge (th, bound_prolog + bound_epilog))
1940 {
1941 *bound_scalar = th;
1942 return build_int_cst (type, th);
1943 }
1944 /* Need to do runtime comparison. */
1945 else if (maybe_gt (th, bound_epilog))
1946 {
1947 *bound_scalar = upper_bound (*bound_scalar, th);
1948 return fold_build2 (MAX_EXPR, type,
1949 build_int_cst (type, th), niters);
1950 }
1951 }
1952 return niters;
1953 }
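/* Illustrative sketch (hypothetical values): with CHECK_PROFITABILITY,
   TH = 8, an unknown NITERS_PROLOG (INT_NITERS_PROLOG == -1),
   BOUND_PROLOG = 3 and BOUND_EPILOG = 4, TH is first decremented to 7;
   since known_ge (7, 3 + 4) holds, *BOUND_SCALAR is set to 7 and the
   constant 7 is returned as the scalar loop's iteration count. */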
1954
1955 /* NITERS is the number of times that the original scalar loop executes
1956 after peeling. Work out the maximum number of iterations N that can
1957 be handled by the vectorized form of the loop and then either:
1958
1959 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1960
1961 niters_vector = N
1962
1963 b) set *STEP_VECTOR_PTR to one and generate:
1964
1965 niters_vector = N / vf
1966
1967 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1968 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1969 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1970
1971 void
1972 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1973 tree *niters_vector_ptr, tree *step_vector_ptr,
1974 bool niters_no_overflow)
1975 {
1976 tree ni_minus_gap, var;
1977 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1978 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1980 tree log_vf = NULL_TREE;
1981
1982 /* If epilogue loop is required because of data accesses with gaps, we
1983 subtract one iteration from the total number of iterations here for
1984 correct calculation of RATIO. */
1985 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1986 {
1987 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1988 build_one_cst (type));
1989 if (!is_gimple_val (ni_minus_gap))
1990 {
1991 var = create_tmp_var (type, "ni_gap");
1992 gimple *stmts = NULL;
1993 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1994 true, var);
1995 gsi_insert_seq_on_edge_immediate (pe, stmts);
1996 }
1997 }
1998 else
1999 ni_minus_gap = niters;
2000
2001 unsigned HOST_WIDE_INT const_vf;
2002 if (vf.is_constant (&const_vf)
2003 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2004 {
2005 /* Create: niters >> log2(vf) */
2006 /* If it's known that niters == number of latch executions + 1 doesn't
2007 overflow, we can generate niters >> log2(vf); otherwise we generate
2008 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2009 will be at least one. */
2010 log_vf = build_int_cst (type, exact_log2 (const_vf));
2011 if (niters_no_overflow)
2012 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2013 else
2014 niters_vector
2015 = fold_build2 (PLUS_EXPR, type,
2016 fold_build2 (RSHIFT_EXPR, type,
2017 fold_build2 (MINUS_EXPR, type,
2018 ni_minus_gap,
2019 build_int_cst (type, vf)),
2020 log_vf),
2021 build_int_cst (type, 1));
2022 step_vector = build_one_cst (type);
2023 }
2024 else
2025 {
2026 niters_vector = ni_minus_gap;
2027 step_vector = build_int_cst (type, vf);
2028 }
2029
2030 if (!is_gimple_val (niters_vector))
2031 {
2032 var = create_tmp_var (type, "bnd");
2033 gimple_seq stmts = NULL;
2034 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2035 gsi_insert_seq_on_edge_immediate (pe, stmts);
2036 /* The peeling algorithm guarantees that the vector loop bound is at least
2037 ONE, so we set range information to make the niters analyzer's life easier.
2038 Note the latch iteration count can be TYPE_MAX_VALUE, so we have to
2039 represent the vector niter as (TYPE_MAX_VALUE + 1) >> log_vf. */
2040 if (stmts != NULL && log_vf)
2041 {
2042 if (niters_no_overflow)
2043 set_range_info (niters_vector, VR_RANGE,
2044 wi::one (TYPE_PRECISION (type)),
2045 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2046 TYPE_SIGN (type)),
2047 exact_log2 (const_vf),
2048 TYPE_SIGN (type)));
2049 /* For VF == 1 the vector IV might also overflow so we cannot
2050 assert a minimum value of 1. */
2051 else if (const_vf > 1)
2052 set_range_info (niters_vector, VR_RANGE,
2053 wi::one (TYPE_PRECISION (type)),
2054 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2055 TYPE_SIGN (type))
2056 - (const_vf - 1),
2057 exact_log2 (const_vf), TYPE_SIGN (type))
2058 + 1);
2059 }
2060 }
2061 *niters_vector_ptr = niters_vector;
2062 *step_vector_ptr = step_vector;
2063
2064 return;
2065 }
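/* Illustrative sketch (hypothetical values): for NITERS = 17, a constant
   VF = 4, no peeling for gaps and no partial vectors, the function sets
   *STEP_VECTOR_PTR to 1 and computes either
     17 >> 2 = 4                (NITERS_NO_OVERFLOW), or
     ((17 - 4) >> 2) + 1 = 4    (otherwise),
   so the vector loop runs 4 iterations covering 16 scalar iterations and
   the remaining iteration is left for the epilogue. */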
2066
2067 /* Given NITERS_VECTOR which is the number of iterations for vectorized
2068 loop specified by LOOP_VINFO after vectorization, compute the number
2069 of iterations before vectorization (niters_vector * vf) and store it
2070 to NITERS_VECTOR_MULT_VF_PTR. */
2071
2072 static void
2073 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2074 tree niters_vector,
2075 tree *niters_vector_mult_vf_ptr)
2076 {
2077 /* We should be using a step_vector of VF if VF is variable. */
2078 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2079 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2080 tree type = TREE_TYPE (niters_vector);
2081 tree log_vf = build_int_cst (type, exact_log2 (vf));
2082 basic_block exit_bb = single_exit (loop)->dest;
2083
2084 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2085 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2086 niters_vector, log_vf);
2087 if (!is_gimple_val (niters_vector_mult_vf))
2088 {
2089 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2090 gimple_seq stmts = NULL;
2091 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2092 &stmts, true, var);
2093 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2094 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2095 }
2096 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2097 }
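/* Illustrative sketch (hypothetical values): continuing the example above,
   NITERS_VECTOR = 4 and VF = 4 give
     niters_vector_mult_vf = 4 << log2 (4) = 16
   scalar iterations handled by the vector loop. */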
2098
2099 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2100 This function searches for the corresponding lcssa phi node in the exit
2101 bb of LOOP. If it is found, return the phi result; otherwise return
2102 NULL. */
2103
2104 static tree
2105 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2106 gphi *lcssa_phi)
2107 {
2108 gphi_iterator gsi;
2109 edge e = single_exit (loop);
2110
2111 gcc_assert (single_pred_p (e->dest));
2112 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2113 {
2114 gphi *phi = gsi.phi ();
2115 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2116 PHI_ARG_DEF (lcssa_phi, 0), 0))
2117 return PHI_RESULT (phi);
2118 }
2119 return NULL_TREE;
2120 }
2121
2122 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2123 from SECOND/FIRST and puts it at the original loop's preheader/exit
2124 edge; the two loops are arranged as below:
2125
2126 preheader_a:
2127 first_loop:
2128 header_a:
2129 i_1 = PHI<i_0, i_2>;
2130 ...
2131 i_2 = i_1 + 1;
2132 if (cond_a)
2133 goto latch_a;
2134 else
2135 goto between_bb;
2136 latch_a:
2137 goto header_a;
2138
2139 between_bb:
2140 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2141
2142 second_loop:
2143 header_b:
2144 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2145 or with i_2 if no LCSSA phi is created
2146 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2147 ...
2148 i_4 = i_3 + 1;
2149 if (cond_b)
2150 goto latch_b;
2151 else
2152 goto exit_bb;
2153 latch_b:
2154 goto header_b;
2155
2156 exit_bb:
2157
2158 This function creates loop closed SSA for the first loop, then updates the
2159 second loop's PHI nodes by replacing the argument on its incoming edge with
2160 the result of the newly created lcssa PHI nodes. If CREATE_LCSSA_FOR_IV_PHIS
2161 is false, loop closed ssa phis will only be created for non-iv phis of
2162 the first loop.
2163
2164 This function assumes the exit bb of the first loop is the preheader bb of
2165 the second loop, i.e., between_bb in the example code. With PHIs updated,
2166 the second loop will execute the remaining iterations of the first. */
2167
2168 static void
2169 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2170 class loop *first, class loop *second,
2171 bool create_lcssa_for_iv_phis)
2172 {
2173 gphi_iterator gsi_update, gsi_orig;
2174 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2175
2176 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2177 edge second_preheader_e = loop_preheader_edge (second);
2178 basic_block between_bb = single_exit (first)->dest;
2179
2180 gcc_assert (between_bb == second_preheader_e->src);
2181 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2182 /* Either the first loop or the second is the loop to be vectorized. */
2183 gcc_assert (loop == first || loop == second);
2184
2185 for (gsi_orig = gsi_start_phis (first->header),
2186 gsi_update = gsi_start_phis (second->header);
2187 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2188 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2189 {
2190 gphi *orig_phi = gsi_orig.phi ();
2191 gphi *update_phi = gsi_update.phi ();
2192
2193 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2194 /* Generate lcssa PHI node for the first loop. */
2195 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2196 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2197 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2198 {
2199 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2200 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2201 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2202 arg = new_res;
2203 }
2204
2205 /* Update PHI node in the second loop by replacing arg on the loop's
2206 incoming edge. */
2207 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2208 }
2209
2210 /* For epilogue peeling we have to make sure to copy all LC PHIs
2211 for correct vectorization of live stmts. */
2212 if (loop == first)
2213 {
2214 basic_block orig_exit = single_exit (second)->dest;
2215 for (gsi_orig = gsi_start_phis (orig_exit);
2216 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2217 {
2218 gphi *orig_phi = gsi_orig.phi ();
2219 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2220 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2221 continue;
2222
2223 /* Already created in the above loop. */
2224 if (find_guard_arg (first, second, orig_phi))
2225 continue;
2226
2227 tree new_res = copy_ssa_name (orig_arg);
2228 gphi *lcphi = create_phi_node (new_res, between_bb);
2229 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2230 }
2231 }
2232 }
2233
2234 /* Function slpeel_add_loop_guard adds a guard skipping from the beginning
2235 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2236 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2237 appear like below:
2238
2239 guard_bb:
2240 if (cond)
2241 goto merge_bb;
2242 else
2243 goto skip_loop;
2244
2245 skip_loop:
2246 header_a:
2247 i_1 = PHI<i_0, i_2>;
2248 ...
2249 i_2 = i_1 + 1;
2250 if (cond_a)
2251 goto latch_a;
2252 else
2253 goto exit_a;
2254 latch_a:
2255 goto header_a;
2256
2257 exit_a:
2258 i_5 = PHI<i_2>;
2259
2260 merge_bb:
2261 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2262
2263 update_loop:
2264 header_b:
2265 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2266 ...
2267 i_4 = i_3 + 1;
2268 if (cond_b)
2269 goto latch_b;
2270 else
2271 goto exit_bb;
2272 latch_b:
2273 goto header_b;
2274
2275 exit_bb:
2276
2277 This function creates PHI nodes at merge_bb and replaces the use of i_5
2278 in the update_loop's PHI node with the new PHI result. */
2279
2280 static void
2281 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2282 class loop *update_loop,
2283 edge guard_edge, edge merge_edge)
2284 {
2285 location_t merge_loc, guard_loc;
2286 edge orig_e = loop_preheader_edge (skip_loop);
2287 edge update_e = loop_preheader_edge (update_loop);
2288 gphi_iterator gsi_orig, gsi_update;
2289
2290 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2291 gsi_update = gsi_start_phis (update_loop->header));
2292 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2293 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2294 {
2295 gphi *orig_phi = gsi_orig.phi ();
2296 gphi *update_phi = gsi_update.phi ();
2297
2298 /* Generate new phi node at merge bb of the guard. */
2299 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2300 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2301
2302 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2303 args in NEW_PHI for these edges. */
2304 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2305 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2306 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2307 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2308 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2309 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2310
2311 /* Update phi in UPDATE_PHI. */
2312 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2313 }
2314 }
2315
2316 /* LOOP and EPILOG are two consecutive loops in the CFG, and EPILOG is copied
2317 from LOOP. Function slpeel_add_loop_guard adds a guard skipping from a
2318 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2319 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2320 The CFG looks like:
2321
2322 loop:
2323 header_a:
2324 i_1 = PHI<i_0, i_2>;
2325 ...
2326 i_2 = i_1 + 1;
2327 if (cond_a)
2328 goto latch_a;
2329 else
2330 goto exit_a;
2331 latch_a:
2332 goto header_a;
2333
2334 exit_a:
2335
2336 guard_bb:
2337 if (cond)
2338 goto merge_bb;
2339 else
2340 goto epilog_loop;
2341
2342 ;; fall_through_bb
2343
2344 epilog_loop:
2345 header_b:
2346 i_3 = PHI<i_2, i_4>;
2347 ...
2348 i_4 = i_3 + 1;
2349 if (cond_b)
2350 goto latch_b;
2351 else
2352 goto merge_bb;
2353 latch_b:
2354 goto header_b;
2355
2356 merge_bb:
2357 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2358
2359 exit_bb:
2360 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2361
2362 For each name used outside EPILOG (i.e. for each name that has an lcssa
2363 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2364 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2365 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2366 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2367 in exit_bb will also be updated. */
2368
2369 static void
2370 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2371 edge guard_edge, edge merge_edge)
2372 {
2373 gphi_iterator gsi;
2374 basic_block merge_bb = guard_edge->dest;
2375
2376 gcc_assert (single_succ_p (merge_bb));
2377 edge e = single_succ_edge (merge_bb);
2378 basic_block exit_bb = e->dest;
2379 gcc_assert (single_pred_p (exit_bb));
2380 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2381
2382 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2383 {
2384 gphi *update_phi = gsi.phi ();
2385 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2386
2387 tree merge_arg = NULL_TREE;
2388
2389 /* If the old argument is a SSA_NAME use its current_def. */
2390 if (TREE_CODE (old_arg) == SSA_NAME)
2391 merge_arg = get_current_def (old_arg);
2392 /* If it's a constant or doesn't have a current_def, just use the old
2393 argument. */
2394 if (!merge_arg)
2395 merge_arg = old_arg;
2396
2397 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2398 /* If the var is live after loop but not a reduction, we simply
2399 use the old arg. */
2400 if (!guard_arg)
2401 guard_arg = old_arg;
2402
2403 /* Create new phi node in MERGE_BB: */
2404 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2405 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2406
2407 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set
2408 the two PHI args in merge_phi for these edges. */
2409 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2410 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2411
2412 /* Update the original phi in exit_bb. */
2413 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2414 }
2415 }
2416
2417 /* EPILOG loop is duplicated from the original loop for vectorizing,
2418 the arg of its loop closed ssa PHI needs to be updated. */
2419
2420 static void
2421 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2422 {
2423 gphi_iterator gsi;
2424 basic_block exit_bb = single_exit (epilog)->dest;
2425
2426 gcc_assert (single_pred_p (exit_bb));
2427 edge e = EDGE_PRED (exit_bb, 0);
2428 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2429 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2430 }
2431
2432 /* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2433 iterate exactly CONST_NITERS times. Make a final decision about
2434 whether the epilogue loop should be used, returning true if so. */
2435
2436 static bool
2437 vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2438 unsigned HOST_WIDE_INT const_niters)
2439 {
2440 /* Avoid wrap-around when computing const_niters - 1. Also reject
2441 using an epilogue loop for a single scalar iteration, even if
2442 we could in principle implement that using partial vectors. */
2443 unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2444 if (const_niters <= gap_niters + 1)
2445 return false;
2446
2447 /* Install the number of iterations. */
2448 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2449 tree niters_tree = build_int_cst (niters_type, const_niters);
2450 tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2451
2452 LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2453 LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2454
2455 /* Decide what to do if the number of epilogue iterations is not
2456 a multiple of the epilogue loop's vectorization factor. */
2457 return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2458 }
2459
2460 /* Function vect_do_peeling.
2461
2462 Input:
2463 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2464
2465 preheader:
2466 LOOP:
2467 header_bb:
2468 loop_body
2469 if (exit_loop_cond) goto exit_bb
2470 else goto header_bb
2471 exit_bb:
2472
2473 - NITERS: The number of iterations of the loop.
2474 - NITERSM1: The number of iterations of the loop's latch.
2475 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2476 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2477 CHECK_PROFITABILITY is true.
2478 Output:
2479 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2480 iterate after vectorization; see vect_set_loop_condition for details.
2481 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2482 should be set to the number of scalar iterations handled by the
2483 vector loop. The SSA name is only used on exit from the loop.
2484
2485 This function peels prolog and epilog from the loop, adds guards skipping
2486 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2487 would look like:
2488
2489 guard_bb_1:
2490 if (prefer_scalar_loop) goto merge_bb_1
2491 else goto guard_bb_2
2492
2493 guard_bb_2:
2494 if (skip_prolog) goto merge_bb_2
2495 else goto prolog_preheader
2496
2497 prolog_preheader:
2498 PROLOG:
2499 prolog_header_bb:
2500 prolog_body
2501 if (exit_prolog_cond) goto prolog_exit_bb
2502 else goto prolog_header_bb
2503 prolog_exit_bb:
2504
2505 merge_bb_2:
2506
2507 vector_preheader:
2508 VECTOR LOOP:
2509 vector_header_bb:
2510 vector_body
2511 if (exit_vector_cond) goto vector_exit_bb
2512 else goto vector_header_bb
2513 vector_exit_bb:
2514
2515 guard_bb_3:
2516 if (skip_epilog) goto merge_bb_3
2517 else goto epilog_preheader
2518
2519 merge_bb_1:
2520
2521 epilog_preheader:
2522 EPILOG:
2523 epilog_header_bb:
2524 epilog_body
2525 if (exit_epilog_cond) goto merge_bb_3
2526 else goto epilog_header_bb
2527
2528 merge_bb_3:
2529
2530 Note this function peels the prolog and epilog, and adds the guards,
2531 only when necessary.
2532 This function returns the epilogue loop if a decision was made to vectorize
2533 it, otherwise NULL.
2534
2535 The analysis resulting in this epilogue loop's loop_vec_info was performed
2536 in the same vect_analyze_loop call as the main loop's. At that time
2537 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2538 vectorization factors than the main loop. This list is stored in the main
2539 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2540 vectorize the epilogue loop for a lower vectorization factor, the
2541 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2542 updated and linked to the epilogue loop. This is later used to vectorize
2543 the epilogue. The reason the loop_vec_info needs updating is that it was
2544 constructed based on the original main loop, and the epilogue loop is a
2545 copy of this loop, so all links pointing to statements in the original loop
2546 need updating. Furthermore, these loop_vec_infos share the
2547 data_reference's records, which will also need to be updated.
2548
2549 TODO: Guard for prefer_scalar_loop should be emitted along with
2550 versioning conditions if loop versioning is needed. */
2551
2552
2553 class loop *
2554 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2555 tree *niters_vector, tree *step_vector,
2556 tree *niters_vector_mult_vf_var, int th,
2557 bool check_profitability, bool niters_no_overflow,
2558 tree *advance)
2559 {
2560 edge e, guard_e;
2561 tree type = TREE_TYPE (niters), guard_cond;
2562 basic_block guard_bb, guard_to;
2563 profile_probability prob_prolog, prob_vector, prob_epilog;
2564 int estimated_vf;
2565 int prolog_peeling = 0;
2566 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2567 bool vect_epilogues_updated_niters = false;
2568 /* We currently do not support prolog peeling if the target alignment is not
2569 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2570 target alignment being constant. */
2571 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2572 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2573 return NULL;
2574
2575 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2576 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2577
2578 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2579 poly_uint64 bound_epilog = 0;
2580 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2581 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2582 bound_epilog += vf - 1;
2583 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2584 bound_epilog += 1;
2585 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2586 poly_uint64 bound_scalar = bound_epilog;
2587
2588 if (!prolog_peeling && !epilog_peeling)
2589 return NULL;
2590
2591 /* Before doing any peeling make sure to reset debug binds outside of
2592 the loop referring to defs not in LC SSA. */
2593 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2594 for (unsigned i = 0; i < loop->num_nodes; ++i)
2595 {
2596 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2597 imm_use_iterator ui;
2598 gimple *use_stmt;
2599 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2600 gsi_next (&gsi))
2601 {
2602 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2603 if (gimple_debug_bind_p (use_stmt)
2604 && loop != gimple_bb (use_stmt)->loop_father
2605 && !flow_loop_nested_p (loop,
2606 gimple_bb (use_stmt)->loop_father))
2607 {
2608 gimple_debug_bind_reset_value (use_stmt);
2609 update_stmt (use_stmt);
2610 }
2611 }
2612 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2613 gsi_next (&gsi))
2614 {
2615 ssa_op_iter op_iter;
2616 def_operand_p def_p;
2617 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2618 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2619 if (gimple_debug_bind_p (use_stmt)
2620 && loop != gimple_bb (use_stmt)->loop_father
2621 && !flow_loop_nested_p (loop,
2622 gimple_bb (use_stmt)->loop_father))
2623 {
2624 gimple_debug_bind_reset_value (use_stmt);
2625 update_stmt (use_stmt);
2626 }
2627 }
2628 }
2629
2630 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2631 estimated_vf = vect_vf_for_cost (loop_vinfo);
2632 if (estimated_vf == 2)
2633 estimated_vf = 3;
2634 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2635 .apply_scale (estimated_vf - 1, estimated_vf);
2636
2637 class loop *prolog, *epilog = NULL;
2638 class loop *first_loop = loop;
2639 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2640
2641 /* We might have a queued need to update virtual SSA form. As we
2642 delete the update SSA machinery below after doing a regular
2643 incremental SSA update during loop copying make sure we don't
2644 lose that fact.
2645 ??? Needing to update virtual SSA form by renaming is unfortunate
2646 but not all of the vectorizer code inserting new loads / stores
2647 properly assigns virtual operands to those statements. */
2648 update_ssa (TODO_update_ssa_only_virtuals);
2649
2650 create_lcssa_for_virtual_phi (loop);
2651
2652 /* If we're vectorizing an epilogue loop, the update_ssa above will
2653 have ensured that the virtual operand is in SSA form throughout the
2654 vectorized main loop. Normally it is possible to trace the updated
2655 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2656 back to scalar-stmt vuses, meaning that the effect of the SSA update
2657 remains local to the main loop. However, there are rare cases in
2658 which the vectorized loop has vdefs even when the original scalar
2659 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2660 introduces clobbers of the temporary vector array, which in turn
2661 needs new vdefs. If the scalar loop doesn't write to memory, these
2662 new vdefs will be the only ones in the vector loop.
2663
2664 In that case, update_ssa will have added a new virtual phi to the
2665 main loop, which previously didn't need one. Ensure that we (locally)
2666 maintain LCSSA form for the virtual operand, just as we would have
2667 done if the virtual phi had existed from the outset. This makes it
2668 easier to duplicate the scalar epilogue loop below. */
2669 tree vop_to_rename = NULL_TREE;
2670 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2671 {
2672 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2673 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2674 }
2675
2676 if (MAY_HAVE_DEBUG_BIND_STMTS)
2677 {
2678 gcc_assert (!adjust_vec.exists ());
2679 adjust_vec.create (32);
2680 }
2681 initialize_original_copy_tables ();
2682
2683 /* Record the anchor bb at which the guard should be placed if the scalar
2684 loop might be preferred. */
2685 basic_block anchor = loop_preheader_edge (loop)->src;
2686
2687 /* Generate the number of iterations for the prolog loop. We do this here
2688 so that we can also get the upper bound on the number of iterations. */
2689 tree niters_prolog;
2690 int bound_prolog = 0;
2691 if (prolog_peeling)
2692 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2693 &bound_prolog);
2694 else
2695 niters_prolog = build_int_cst (type, 0);
2696
2697 loop_vec_info epilogue_vinfo = NULL;
2698 if (vect_epilogues)
2699 {
2700 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2701 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2702 }
2703
2704 tree niters_vector_mult_vf = NULL_TREE;
2705 /* Save NITERS before peeling, as it may be changed by the prologue. */
2706 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2707 edge update_e = NULL, skip_e = NULL;
2708 unsigned int lowest_vf = constant_lower_bound (vf);
2709 /* If we know the number of scalar iterations for the main loop we should
2710 check whether after the main loop there are enough iterations left over
2711 for the epilogue. */
2712 if (vect_epilogues
2713 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2714 && prolog_peeling >= 0
2715 && known_eq (vf, lowest_vf))
2716 {
2717 unsigned HOST_WIDE_INT eiters
2718 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2719 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2720
2721 eiters -= prolog_peeling;
2722 eiters
2723 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
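      /* Illustrative sketch (hypothetical values): for 103 known scalar
	 iterations, no peeling for gaps, PROLOG_PEELING = 3 and
	 LOWEST_VF = 8, the statements above compute
	 (103 - 3) % 8 = 4 iterations for the epilogue. */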
2724
2725 while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
2726 {
2727 delete epilogue_vinfo;
2728 epilogue_vinfo = NULL;
2729 if (loop_vinfo->epilogue_vinfos.length () == 0)
2730 {
2731 vect_epilogues = false;
2732 break;
2733 }
2734 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2735 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2736 }
2737 vect_epilogues_updated_niters = true;
2738 }
2739 /* Prolog loop may be skipped. */
2740 bool skip_prolog = (prolog_peeling != 0);
2741 /* Skip to the epilog when there are not enough iterations to enter the
2742 vectorized loop. If so, we perform runtime checks on NITERS to decide
2743 whether the vectorized loop should be skipped. If the number of scalar
2744 iterations is known at compile time, we only add such a runtime check
2745 when that number may be smaller than the number of iterations required
2746 to enter the vectorized loop; for this we use the upper bounds on the
2747 prolog and epilog peeling.
2748 When the number of iterations is unknown and no versioning is required,
2749 we have already asserted that there are enough scalar iterations to
2750 enter the main loop, so this skip is not necessary. When we are
2751 versioning then we only add such a skip if we have chosen to vectorize
2752 the epilogue. */
2753 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2754 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2755 bound_prolog + bound_epilog)
2756 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2757 || vect_epilogues));
2758 /* Epilog loop must be executed if the number of iterations for epilog
2759 loop is known at compile time, otherwise we need to add a check at
2760 the end of vector loop and skip to the end of epilog loop. */
2761 bool skip_epilog = (prolog_peeling < 0
2762 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2763 || !vf.is_constant ());
2764 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2765 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2766 skip_epilog = false;
2767
2768 if (skip_vector)
2769 {
2770 split_edge (loop_preheader_edge (loop));
2771
2772 /* Due to the order in which we peel prolog and epilog, we first
2773 propagate probability to the whole loop. The purpose is to
2774 avoid adjusting probabilities of both prolog and vector loops
2775 separately. Note in this case, the probability of epilog loop
2776 needs to be scaled back later. */
2777 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2778 if (prob_vector.initialized_p ())
2779 {
2780 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2781 scale_loop_profile (loop, prob_vector, 0);
2782 }
2783 }
2784
2785 dump_user_location_t loop_loc = find_loop_location (loop);
2786 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2787 if (vect_epilogues)
2788 /* Make sure to set the epilogue's scalar loop, so that we can use the
2789 original scalar loop as the remaining epilogue if necessary. */
2790 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2791 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2792
2793 if (prolog_peeling)
2794 {
2795 e = loop_preheader_edge (loop);
2796 if (!slpeel_can_duplicate_loop_p (loop, e))
2797 {
2798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2799 "loop can't be duplicated to preheader edge.\n");
2800 gcc_unreachable ();
2801 }
2802 /* Peel prolog and put it on preheader edge of loop. */
2803 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2804 if (!prolog)
2805 {
2806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2807 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2808 gcc_unreachable ();
2809 }
2810 prolog->force_vectorize = false;
2811 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2812 first_loop = prolog;
2813 reset_original_copy_tables ();
2814
2815 /* Update the number of iterations for prolog loop. */
2816 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2817 vect_set_loop_condition (prolog, NULL, niters_prolog,
2818 step_prolog, NULL_TREE, false);
2819
2820 /* Skip the prolog loop. */
2821 if (skip_prolog)
2822 {
2823 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2824 niters_prolog, build_int_cst (type, 0));
2825 guard_bb = loop_preheader_edge (prolog)->src;
2826 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2827 guard_to = split_edge (loop_preheader_edge (loop));
2828 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2829 guard_to, guard_bb,
2830 prob_prolog.invert (),
2831 irred_flag);
2832 e = EDGE_PRED (guard_to, 0);
2833 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2834 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2835
2836 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2837 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2838 }
2839
2840 /* Update init address of DRs. */
2841 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2842 /* Update niters for vector loop. */
2843 LOOP_VINFO_NITERS (loop_vinfo)
2844 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2845 LOOP_VINFO_NITERSM1 (loop_vinfo)
2846 = fold_build2 (MINUS_EXPR, type,
2847 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2848 bool new_var_p = false;
2849 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2850 /* It's guaranteed that vector loop bound before vectorization is at
2851 least VF, so set range information for the newly generated var. */
2852 if (new_var_p)
2853 set_range_info (niters, VR_RANGE,
2854 wi::to_wide (build_int_cst (type, vf)),
2855 wi::to_wide (TYPE_MAX_VALUE (type)));
2856
2857 /* Prolog iterates at most bound_prolog times, latch iterates at
2858 most bound_prolog - 1 times. */
2859 record_niter_bound (prolog, bound_prolog - 1, false, true);
2860 delete_update_ssa ();
2861 adjust_vec_debug_stmts ();
2862 scev_reset ();
2863 }
2864
2865 if (epilog_peeling)
2866 {
2867 e = single_exit (loop);
2868 if (!slpeel_can_duplicate_loop_p (loop, e))
2869 {
2870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2871 "loop can't be duplicated to exit edge.\n");
2872 gcc_unreachable ();
2873 }
2874 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2875 said epilog then we should use a copy of the main loop as a starting
2876 point. This loop may have already had some preliminary transformations
2877 to allow for more optimal vectorization, for example if-conversion.
2878 If we are not vectorizing the epilog then we should use the scalar loop
2879 as the transformations mentioned above make less or no sense when not
2880 vectorizing. */
2881 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2882 if (vop_to_rename)
2883 {
2884 /* Vectorizing the main loop can sometimes introduce a vdef to
2885 a loop that previously didn't have one; see the comment above
2886 the definition of VOP_TO_RENAME for details. The definition
2887 D that holds on E will then be different from the definition
2888 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2889 rename VOP_TO_RENAME to D when copying the loop.
2890
2891 The virtual operand is in LCSSA form for the main loop,
2892 and no stmt between the main loop and E needs a vdef,
2893 so we know that D is provided by a phi rather than by a
2894 vdef on a normal gimple stmt. */
2895 basic_block vdef_bb = e->src;
2896 gphi *vphi;
2897 while (!(vphi = get_virtual_phi (vdef_bb)))
2898 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2899 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2900 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2901 }
2902 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2903 if (!epilog)
2904 {
2905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2906 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2907 gcc_unreachable ();
2908 }
2909 epilog->force_vectorize = false;
2910 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2911
2912 /* The scalar version of the loop may be preferred. In this case, add a
2913 guard and skip to the epilog. Note this only happens when the number of
2914 iterations of the loop is unknown at compile time, otherwise this
2915 won't be vectorized. */
2916 if (skip_vector)
2917 {
2918 /* An additional epilogue iteration is peeled if a gap exists. */
2919 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2920 bound_prolog, bound_epilog,
2921 th, &bound_scalar,
2922 check_profitability);
2923 /* Build guard against NITERSM1 since NITERS may overflow. */
2924 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2925 guard_bb = anchor;
2926 guard_to = split_edge (loop_preheader_edge (epilog));
2927 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2928 guard_to, guard_bb,
2929 prob_vector.invert (),
2930 irred_flag);
2931 skip_e = guard_e;
2932 e = EDGE_PRED (guard_to, 0);
2933 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2934 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2935
2936 /* Simply propagate profile info from guard_bb to guard_to which is
2937 a merge point of control flow. */
2938 guard_to->count = guard_bb->count;
2939
2940 /* Scale probability of epilog loop back.
2941 FIXME: We should avoid scaling down and back up. Profile may
2942 get lost if we scale down to 0. */
2943 basic_block *bbs = get_loop_body (epilog);
2944 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2945 bbs[i]->count = bbs[i]->count.apply_scale
2946 (bbs[i]->count,
2947 bbs[i]->count.apply_probability
2948 (prob_vector));
2949 free (bbs);
2950 }
2951
2952 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2953 /* If the loop is peeled a non-zero constant number of times, NITERS now
2954 refers to orig_niters - prolog_peeling; it won't overflow even if
2955 orig_niters overflows. */
2956 niters_no_overflow |= (prolog_peeling > 0);
2957 vect_gen_vector_loop_niters (loop_vinfo, niters,
2958 niters_vector, step_vector,
2959 niters_no_overflow);
2960 if (!integer_onep (*step_vector))
2961 {
2962 /* On exit from the loop we will have an easy way of calculating
2963 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2964 until then. */
2965 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2966 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2967 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2968 }
2969 else
2970 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2971 &niters_vector_mult_vf);
2972 /* Update IVs of original loop as if they were advanced by
2973 niters_vector_mult_vf steps. */
2974 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2975 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2976 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2977 update_e);
2978
2979 if (skip_epilog)
2980 {
2981 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2982 niters, niters_vector_mult_vf);
2983 guard_bb = single_exit (loop)->dest;
2984 guard_to = split_edge (single_exit (epilog));
2985 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2986 skip_vector ? anchor : guard_bb,
2987 prob_epilog.invert (),
2988 irred_flag);
2989 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2990 single_exit (epilog));
2991 /* Only need to handle basic block before epilog loop if it's not
2992 the guard_bb, which is the case when skip_vector is true. */
2993 if (guard_bb != bb_before_epilog)
2994 {
2995 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2996
2997 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2998 }
2999 scale_loop_profile (epilog, prob_epilog, 0);
3000 }
3001 else
3002 slpeel_update_phi_nodes_for_lcssa (epilog);
3003
3004 unsigned HOST_WIDE_INT bound;
3005 if (bound_scalar.is_constant (&bound))
3006 {
3007 gcc_assert (bound != 0);
3008 /* -1 to convert loop iterations to latch iterations. */
3009 record_niter_bound (epilog, bound - 1, false, true);
3010 }
3011
3012 delete_update_ssa ();
3013 adjust_vec_debug_stmts ();
3014 scev_reset ();
3015 }
3016
3017 if (vect_epilogues)
3018 {
3019 epilog->aux = epilogue_vinfo;
3020 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3021
3022 loop_constraint_clear (epilog, LOOP_C_INFINITE);
3023
3024 /* We must now calculate the number of iterations performed by the
3025 previous loop(s) and the EPILOGUE_NITERS left for the epilogue. */
3026 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3027 niters_prolog, niters_vector_mult_vf);
3028
3029 /* If SKIP_VECTOR, the previous loop may be skipped, so we insert a phi
3030 node to determine whether we are coming from the previous vectorized loop
3031 using the update_e edge or the skip_vector basic block using the
3032 skip_e edge. */
3033 if (skip_vector)
3034 {
3035 gcc_assert (update_e != NULL
3036 && skip_e != NULL
3037 && !vect_epilogues_updated_niters);
3038 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3039 update_e->dest);
3040 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3041 gimple *stmt = gimple_build_assign (new_ssa, niters);
3042 gimple_stmt_iterator gsi;
3043 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3044 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3045 {
3046 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3047 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3048 }
3049 else
3050 {
3051 gsi = gsi_last_bb (update_e->src);
3052 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3053 }
3054
3055 niters = new_ssa;
3056 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3057 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3058 UNKNOWN_LOCATION);
3059 niters = PHI_RESULT (new_phi);
3060 }
3061
3062 /* Set ADVANCE to the number of iterations performed by the previous
3063 loop and its prologue. */
3064 *advance = niters;
3065
3066 if (!vect_epilogues_updated_niters)
3067 {
3068 /* Subtract the number of iterations performed by the vectorized loop
3069 from the number of total iterations. */
3070 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3071 before_loop_niters,
3072 niters);
3073
3074 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3075 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3076 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3077 epilogue_niters,
3078 build_one_cst (TREE_TYPE (epilogue_niters)));
3079
3080 /* Decide what to do if the number of epilogue iterations is not
3081 a multiple of the epilogue loop's vectorization factor.
3082 We should have rejected the loop during the analysis phase
3083 if this fails. */
3084 if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
3085 true))
3086 gcc_unreachable ();
3087 }
3088 }
3089
3090 adjust_vec.release ();
3091 free_original_copy_tables ();
3092
3093 return vect_epilogues ? epilog : NULL;
3094 }
3095
3096 /* Function vect_create_cond_for_niters_checks.
3097
3098 Create a conditional expression that represents the run-time checks for
3099 the loop's niter. The loop is guaranteed to terminate if the run-time
3100 checks hold.
3101
3102 Input:
3103 COND_EXPR - input conditional expression. New conditions will be chained
3104 with a logical AND operation. If it is NULL, the niter
3105 assumptions become the condition on their own.
3106 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
3107 to be checked.
3108
3109 Output:
3110 COND_EXPR - conditional expression.
3111
3112 The returned COND_EXPR is the conditional expression to be used in the
3113 if statement that controls which version of the loop gets executed at
3114 runtime. */
3115
3116 static void
3117 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3118 {
3119 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3120
3121 if (*cond_expr)
3122 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3123 *cond_expr, part_cond_expr);
3124 else
3125 *cond_expr = part_cond_expr;
3126 }
3127
3128 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3129 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3130
3131 static void
3132 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3133 {
3134 if (*cond_expr)
3135 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3136 *cond_expr, part_cond_expr);
3137 else
3138 *cond_expr = part_cond_expr;
3139 }
3140
3141 /* Function vect_create_cond_for_align_checks.
3142
3143 Create a conditional expression that represents the alignment checks for
3144 all of the data references (array element references) whose alignment must be
3145 checked at runtime.
3146
3147 Input:
3148 COND_EXPR - input conditional expression. New conditions will be chained
3149 with logical AND operation.
3150 LOOP_VINFO - two fields of the loop information are used.
3151 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3152 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3153
3154 Output:
3155 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3156 expression.
3157 The returned value is the conditional expression to be used in the if
3158 statement that controls which version of the loop gets executed at runtime.
3159
3160 The algorithm makes two assumptions:
3161 1) The number of bytes "n" in a vector is a power of 2.
3162 2) An address "a" is aligned if a%n is zero, and this
3163 test can be done as a&(n-1) == 0. For example, for 16
3164 byte vectors the test is a&0xf == 0. */
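/* Illustrative sketch (hypothetical values): for two may-misalign references
   with start addresses p and q and 16-byte vectors (mask = 0xf), the
   generated condition is

     (((intptr) p | (intptr) q) & 0xf) == 0

   where "intptr" stands for the signed pointer-sized integer type used
   below; i.e. all checked addresses must be 16-byte aligned. */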
3165
3166 static void
3167 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3168 tree *cond_expr,
3169 gimple_seq *cond_expr_stmt_list)
3170 {
3171 vec<stmt_vec_info> may_misalign_stmts
3172 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3173 stmt_vec_info stmt_info;
3174 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3175 tree mask_cst;
3176 unsigned int i;
3177 tree int_ptrsize_type;
3178 char tmp_name[20];
3179 tree or_tmp_name = NULL_TREE;
3180 tree and_tmp_name;
3181 gimple *and_stmt;
3182 tree ptrsize_zero;
3183 tree part_cond_expr;
3184
3185 /* Check that mask is one less than a power of 2, i.e., mask is
3186 all zeros followed by all ones. */
3187 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3188
3189 int_ptrsize_type = signed_type_for (ptr_type_node);
3190
3191 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3192 of the first vector of the i'th data reference. */
3193
3194 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3195 {
3196 gimple_seq new_stmt_list = NULL;
3197 tree addr_base;
3198 tree addr_tmp_name;
3199 tree new_or_tmp_name;
3200 gimple *addr_stmt, *or_stmt;
3201 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3202 bool negative = tree_int_cst_compare
3203 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3204 tree offset = negative
3205 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3206
3207 /* create: addr_tmp = (int)(address_of_first_vector) */
3208 addr_base =
3209 vect_create_addr_base_for_vector_ref (loop_vinfo,
3210 stmt_info, &new_stmt_list,
3211 offset);
3212 if (new_stmt_list != NULL)
3213 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3214
3215 sprintf (tmp_name, "addr2int%u", i);
3216 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3217 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3218 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3219
3220 /* The addresses are ORed together. */
3221
3222 if (or_tmp_name != NULL_TREE)
3223 {
3224 /* create: or_tmp = or_tmp | addr_tmp */
3225 sprintf (tmp_name, "orptrs%u", i);
3226 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3227 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3228 or_tmp_name, addr_tmp_name);
3229 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3230 or_tmp_name = new_or_tmp_name;
3231 }
3232 else
3233 or_tmp_name = addr_tmp_name;
3234
3235 } /* end for i */
3236
3237 mask_cst = build_int_cst (int_ptrsize_type, mask);
3238
3239 /* create: and_tmp = or_tmp & mask */
3240 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3241
3242 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3243 or_tmp_name, mask_cst);
3244 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3245
3246 /* Make and_tmp the left operand of the conditional test against zero.
3247 If and_tmp has a nonzero bit then some address is unaligned. */
3248 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3249 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3250 and_tmp_name, ptrsize_zero);
3251 chain_cond_expr (cond_expr, part_cond_expr);
3252 }
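/* As an illustration of the runtime test built above (hypothetical data
   references A and B, assuming 16-byte vectors so LOOP_VINFO_PTR_MASK is
   0xf), the versioning condition is equivalent to the scalar code

     intptr_t addr2int0 = (intptr_t) &A[start];
     intptr_t addr2int1 = (intptr_t) &B[start];
     intptr_t orptrs1 = addr2int0 | addr2int1;
     bool all_aligned_p = (orptrs1 & 0xf) == 0;

   i.e. ORing all addresses and masking with n-1 tests every one of them
   for n-byte alignment with a single AND and compare.  */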
3253
3254 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3255 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3256 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3257 and this new condition are true. Treat a null *COND_EXPR as "true". */
3258
3259 static void
3260 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3261 {
3262 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3263 unsigned int i;
3264 vec_object_pair *pair;
3265 FOR_EACH_VEC_ELT (pairs, i, pair)
3266 {
3267 tree addr1 = build_fold_addr_expr (pair->first);
3268 tree addr2 = build_fold_addr_expr (pair->second);
3269 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3270 addr1, addr2);
3271 chain_cond_expr (cond_expr, part_cond_expr);
3272 }
3273 }
3274
3275 /* Create an expression that is true when all lower-bound conditions for
3276 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3277
3278 static void
3279 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3280 {
3281 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3282 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3283 {
3284 tree expr = lower_bounds[i].expr;
3285 tree type = unsigned_type_for (TREE_TYPE (expr));
3286 expr = fold_convert (type, expr);
3287 poly_uint64 bound = lower_bounds[i].min_value;
3288 if (!lower_bounds[i].unsigned_p)
3289 {
3290 expr = fold_build2 (PLUS_EXPR, type, expr,
3291 build_int_cstu (type, bound - 1));
3292 bound += bound - 1;
3293 }
3294 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3295 build_int_cstu (type, bound));
3296 chain_cond_expr (cond_expr, part_cond_expr);
3297 }
3298 }
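/* The signed case above relies on a standard range-check trick.  With
   illustrative numbers (BOUND == 4, 32-bit signed EXPR), the generated
   test is equivalent to

     unsigned int u = (unsigned int) expr + 3U;   (expr + (bound - 1))
     bool ok = u >= 7U;                           (2 * bound - 1)

   Signed values of expr from -3 to 3 map u into [0, 7) and fail, while
   expr >= 4 or expr <= -4 passes, so a single unsigned comparison checks
   |expr| >= bound for both signs.  */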
3299
3300 /* Function vect_create_cond_for_alias_checks.
3301
3302 Create a conditional expression that represents the run-time checks for
3303 overlapping of address ranges represented by a list of data reference
3304 relations passed as input.
3305
3306 Input:
3307 COND_EXPR - input conditional expression. New conditions will be chained
3308 with logical AND operation. If it is NULL, the generated
3309 alias checks become the condition on their own.
3310 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3311 to be checked.
3312
3313 Output:
3314 COND_EXPR - conditional expression.
3315
3316 The returned COND_EXPR is the conditional expression to be used in the if
3317 statement that controls which version of the loop gets executed at runtime.
3318 */
3319
3320 void
3321 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3322 {
3323 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3324 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3325
3326 if (comp_alias_ddrs.is_empty ())
3327 return;
3328
3329 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3330 &comp_alias_ddrs, cond_expr);
3331 if (dump_enabled_p ())
3332 dump_printf_loc (MSG_NOTE, vect_location,
3333 "created %u versioning for alias checks.\n",
3334 comp_alias_ddrs.length ());
3335 }
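/* Conceptually (an illustrative sketch; the actual checks are emitted by
   create_runtime_alias_checks and may be merged or simplified), a single
   DDR pair with segment start addresses a, b and segment lengths len_a,
   len_b yields a non-overlap test equivalent to

     bool independent_p = (a + len_a <= b) || (b + len_b <= a);

   and the tests for all pairs are AND-chained into *COND_EXPR.  */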
3336
3337
3338 /* Function vect_loop_versioning.
3339
3340 If the loop has data references that may or may not be aligned and/or
3341 has data reference relations whose independence was not proven, then
3342 two versions of the loop need to be generated, one which is vectorized
3343 and one which isn't. A test is then generated to control which of the
3344 loops is executed. The test checks for the alignment of all of the
3345 data references that may or may not be aligned. An additional
3346 sequence of runtime tests is generated for each pair of DDRs whose
3347 independence was not proven. The vectorized version of the loop is
3348 executed only if both the alias and alignment tests pass.
3349
3350 The test generated to check which version of the loop is executed
3351 is modified to also check for profitability as indicated by the
3352 cost model threshold TH.
3353
3354 The versioning precondition(s) are placed in *COND_EXPR and
3355 *COND_EXPR_STMT_LIST. */
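/* Schematically (names illustrative), the versioned CFG produced below is

     if (versioning_cond)     <- alignment/alias/niters/cost checks
       loop to be vectorized
     else
       unmodified scalar copy of the loop
*/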
3356
3357 class loop *
3358 vect_loop_versioning (loop_vec_info loop_vinfo,
3359 gimple *loop_vectorized_call)
3360 {
3361 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3362 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3363 basic_block condition_bb;
3364 gphi_iterator gsi;
3365 gimple_stmt_iterator cond_exp_gsi;
3366 basic_block merge_bb;
3367 basic_block new_exit_bb;
3368 edge new_exit_e, e;
3369 gphi *orig_phi, *new_phi;
3370 tree cond_expr = NULL_TREE;
3371 gimple_seq cond_expr_stmt_list = NULL;
3372 tree arg;
3373 profile_probability prob = profile_probability::likely ();
3374 gimple_seq gimplify_stmt_list = NULL;
3375 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3376 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3377 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3378 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3379 poly_uint64 versioning_threshold
3380 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3381 tree version_simd_if_cond
3382 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3383 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3384
3385 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3386 && !ordered_p (th, versioning_threshold))
3387 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3388 build_int_cst (TREE_TYPE (scalar_loop_iters),
3389 th - 1));
3390 if (maybe_ne (versioning_threshold, 0U))
3391 {
3392 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3393 build_int_cst (TREE_TYPE (scalar_loop_iters),
3394 versioning_threshold - 1));
3395 if (cond_expr)
3396 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3397 expr, cond_expr);
3398 else
3399 cond_expr = expr;
3400 }
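  /* Illustrative numbers: with a cost-model threshold TH of 10 and
     SCALAR_LOOP_ITERS holding niters - 1 (LOOP_VINFO_NITERSM1), the check
     built above is

       niters - 1 >= 10 - 1

     so the vectorized path is entered only when the loop runs at least TH
     iterations; VERSIONING_THRESHOLD is folded in the same way.  */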
3401
3402 if (version_niter)
3403 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3404
3405 if (cond_expr)
3406 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3407 &cond_expr_stmt_list,
3408 is_gimple_condexpr, NULL_TREE);
3409
3410 if (version_align)
3411 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3412 &cond_expr_stmt_list);
3413
3414 if (version_alias)
3415 {
3416 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3417 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3418 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3419 }
3420
3421 if (version_simd_if_cond)
3422 {
3423 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3424 if (flag_checking)
3425 if (basic_block bb
3426 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3427 gcc_assert (bb != loop->header
3428 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3429 && (scalar_loop == NULL
3430 || (bb != scalar_loop->header
3431 && dominated_by_p (CDI_DOMINATORS,
3432 scalar_loop->header, bb))));
3433 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3434 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3435 version_simd_if_cond, zero);
3436 if (cond_expr)
3437 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3438 c, cond_expr);
3439 else
3440 cond_expr = c;
3441 if (dump_enabled_p ())
3442 dump_printf_loc (MSG_NOTE, vect_location,
3443 "created versioning for simd if condition check.\n");
3444 }
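  /* E.g. for an OpenMP "#pragma omp simd if (flag)" loop (illustrative),
     the versioning condition gains "flag != 0", so the vectorized path is
     taken only when the if clause is true at runtime.  */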
3445
3446 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3447 &gimplify_stmt_list,
3448 is_gimple_condexpr, NULL_TREE);
3449 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3450
3451 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3452 invariant in. */
3453 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3454 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3455 !gsi_end_p (gsi); gsi_next (&gsi))
3456 {
3457 gimple *stmt = gsi_stmt (gsi);
3458 update_stmt (stmt);
3459 ssa_op_iter iter;
3460 use_operand_p use_p;
3461 basic_block def_bb;
3462 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3463 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3464 && flow_bb_inside_loop_p (outermost, def_bb))
3465 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3466 }
3467
3468 /* Search for the outermost loop we can version. Avoid versioning of
3469 non-perfect nests but allow if-conversion versioned loops inside. */
3470 class loop *loop_to_version = loop;
3471 if (flow_loop_nested_p (outermost, loop))
3472 {
3473 if (dump_enabled_p ())
3474 dump_printf_loc (MSG_NOTE, vect_location,
3475 "trying to apply versioning to outer loop %d\n",
3476 outermost->num);
3477 if (outermost->num == 0)
3478 outermost = superloop_at_depth (loop, 1);
3479 /* And avoid applying versioning on non-perfect nests. */
3480 while (loop_to_version != outermost
3481 && single_exit (loop_outer (loop_to_version))
3482 && (!loop_outer (loop_to_version)->inner->next
3483 || vect_loop_vectorized_call (loop_to_version))
3484 && (!loop_outer (loop_to_version)->inner->next
3485 || !loop_outer (loop_to_version)->inner->next->next))
3486 loop_to_version = loop_outer (loop_to_version);
3487 }
3488
3489 /* Apply versioning. If there is already a scalar version created by
3490 if-conversion, re-use that. Note we cannot re-use the copy of
3491 an if-converted outer-loop when vectorizing the inner loop only. */
3492 gcond *cond;
3493 if ((!loop_to_version->inner || loop == loop_to_version)
3494 && loop_vectorized_call)
3495 {
3496 gcc_assert (scalar_loop);
3497 condition_bb = gimple_bb (loop_vectorized_call);
3498 cond = as_a <gcond *> (last_stmt (condition_bb));
3499 gimple_cond_set_condition_from_tree (cond, cond_expr);
3500 update_stmt (cond);
3501
3502 if (cond_expr_stmt_list)
3503 {
3504 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3505 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3506 GSI_SAME_STMT);
3507 }
3508
3509 /* if-conversion uses profile_probability::always () for both paths,
3510 so reset the probabilities of both paths appropriately. */
3511 edge te, fe;
3512 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3513 te->probability = prob;
3514 fe->probability = prob.invert ();
3515 /* We can scale the loop counts immediately but have to postpone
3516 scaling the scalar loop because we re-use it during peeling. */
3517 scale_loop_frequencies (loop_to_version, te->probability);
3518 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3519
3520 nloop = scalar_loop;
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location,
3523 "reusing %sloop version created by if conversion\n",
3524 loop_to_version != loop ? "outer " : "");
3525 }
3526 else
3527 {
3528 if (loop_to_version != loop
3529 && dump_enabled_p ())
3530 dump_printf_loc (MSG_NOTE, vect_location,
3531 "applying loop versioning to outer loop %d\n",
3532 loop_to_version->num);
3533
3534 initialize_original_copy_tables ();
3535 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3536 prob, prob.invert (), prob, prob.invert (), true);
3537 gcc_assert (nloop);
3538 nloop = get_loop_copy (loop);
3539
3540 /* Kill off IFN_LOOP_VECTORIZED calls in the copy, since nobody
3541 will reap those otherwise; they also refer to the original
3542 loops. */
3543 class loop *l = loop;
3544 while (gimple *call = vect_loop_vectorized_call (l))
3545 {
3546 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3547 fold_loop_internal_call (call, boolean_false_node);
3548 l = loop_outer (l);
3549 }
3550 free_original_copy_tables ();
3551
3552 if (cond_expr_stmt_list)
3553 {
3554 cond_exp_gsi = gsi_last_bb (condition_bb);
3555 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3556 GSI_SAME_STMT);
3557 }
3558
3559 /* Loop versioning violates an assumption we try to maintain during
3560 vectorization - that the loop exit block has a single predecessor.
3561 After versioning, the exit block of both loop versions is the same
3562 basic block (i.e. it has two predecessors). Just in order to simplify
3563 following transformations in the vectorizer, we fix this situation
3564 here by adding a new (empty) block on the exit-edge of the loop,
3565 with the proper loop-exit phis to maintain loop-closed-form.
3566 If loop versioning wasn't done from loop, but from scalar_loop instead,
3567 merge_bb will already have just a single predecessor. */
3568
3569 merge_bb = single_exit (loop_to_version)->dest;
3570 if (EDGE_COUNT (merge_bb->preds) >= 2)
3571 {
3572 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3573 new_exit_bb = split_edge (single_exit (loop_to_version));
3574 new_exit_e = single_exit (loop_to_version);
3575 e = EDGE_SUCC (new_exit_bb, 0);
3576
3577 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3578 gsi_next (&gsi))
3579 {
3580 tree new_res;
3581 orig_phi = gsi.phi ();
3582 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3583 new_phi = create_phi_node (new_res, new_exit_bb);
3584 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3585 add_phi_arg (new_phi, arg, new_exit_e,
3586 gimple_phi_arg_location_from_edge (orig_phi, e));
3587 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3588 }
3589 }
3590
3591 update_ssa (TODO_update_ssa);
3592 }
3593
3594 if (version_niter)
3595 {
3596 /* The versioned loop could be infinite, so we need to clear existing
3597 niter information that was copied from the original loop. */
3598 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3599 vect_free_loop_info_assumptions (nloop);
3600 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3601 loop_constraint_set (loop, LOOP_C_INFINITE);
3602 }
3603
3604 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3605 && dump_enabled_p ())
3606 {
3607 if (version_alias)
3608 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3609 vect_location,
3610 "loop versioned for vectorization because of "
3611 "possible aliasing\n");
3612 if (version_align)
3613 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3614 vect_location,
3615 "loop versioned for vectorization to enhance "
3616 "alignment\n");
3617
3618 }
3619
3620 return nloop;
3621 }