1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
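/* For example, a hypothetical query for COUNT = 3 vectors of V4SImode first
   asks the target for a dedicated array mode; failing that, it falls back to
   an integer mode of 3 * 128 bits, and it finally checks that OPTAB has a
   handler for that (array mode, V4SImode) pair, as an ld3/st3-style
   structure load/store instruction would provide. */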
59
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
63 {
64 machine_mode mode, array_mode;
65 bool limit_p;
66
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
69 {
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
73 {
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode), count);
78 return false;
79 }
80 }
81
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
83 {
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
89 }
90
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
95
96 return true;
97 }
98
99
100 /* Return the smallest scalar part of STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
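/* For example, in a hypothetical reduction statement
     int_acc_1 = DOT_PROD_EXPR <char_a, char_b, int_acc_0>;
   the lhs type is 'int' but the smallest scalar type is 'char', taken from
   the first rhs operand, and it is 'char' that determines the VF. */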
116
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
121 {
122 tree scalar_type = gimple_expr_type (stmt_info->stmt);
123 HOST_WIDE_INT lhs, rhs;
124
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
129
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
131
132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133 if (assign
134 && (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 {
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
148 }
149 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
150 {
151 unsigned int i = 0;
152 if (gimple_call_internal_p (call))
153 {
154 internal_fn ifn = gimple_call_internal_fn (call);
155 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
156 /* gimple_expr_type already picked the type of the loaded
157 or stored data. */
158 i = ~0U;
159 else if (internal_fn_mask_index (ifn) == 0)
160 i = 1;
161 }
162 if (i < gimple_call_num_args (call))
163 {
164 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
165 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
166 {
167 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
168 if (rhs < lhs)
169 scalar_type = rhs_type;
170 }
171 }
172 }
173
174 *lhs_size_unit = lhs;
175 *rhs_size_unit = rhs;
176 return scalar_type;
177 }
178
179
180 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
181 tested at run-time. Return TRUE if DDR was successfully inserted.
182 Return false if versioning is not supported. */
183
184 static opt_result
185 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
186 {
187 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
188
189 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
190 return opt_result::failure_at (vect_location,
191 "will not create alias checks, as"
192 " --param vect-max-version-for-alias-checks"
193 " == 0\n");
194
195 opt_result res
196 = runtime_alias_check_p (ddr, loop,
197 optimize_loop_nest_for_speed_p (loop));
198 if (!res)
199 return res;
200
201 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
202 return opt_result::success ();
203 }
204
205 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
206
207 static void
208 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
209 {
210 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
211 for (unsigned int i = 0; i < checks.length (); ++i)
212 if (checks[i] == value)
213 return;
214
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location,
217 "need run-time check that %T is nonzero\n",
218 value);
219 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
220 }
221
222 /* Return true if we know that the order of vectorized DR_INFO_A and
223 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
224 DR_INFO_B. At least one of the accesses is a write. */
225
226 static bool
227 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
228 {
229 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
230 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
231
232 /* Single statements are always kept in their original order. */
233 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
234 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
235 return true;
236
237 /* STMT_A and STMT_B belong to overlapping groups. All loads are
238 emitted at the position of the first scalar load.
239 Stores in a group are emitted at the position of the last scalar store.
240 Compute that position and check whether the resulting order matches
241 the current one. */
242 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
243 if (il_a)
244 {
245 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
246 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
247 s = DR_GROUP_NEXT_ELEMENT (s))
248 il_a = get_later_stmt (il_a, s);
249 else /* DR_IS_READ */
250 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
251 s = DR_GROUP_NEXT_ELEMENT (s))
252 if (get_later_stmt (il_a, s) == il_a)
253 il_a = s;
254 }
255 else
256 il_a = stmtinfo_a;
257 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
258 if (il_b)
259 {
260 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
261 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
262 s = DR_GROUP_NEXT_ELEMENT (s))
263 il_b = get_later_stmt (il_b, s);
264 else /* DR_IS_READ */
265 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
266 s = DR_GROUP_NEXT_ELEMENT (s))
267 if (get_later_stmt (il_b, s) == il_b)
268 il_b = s;
269 }
270 else
271 il_b = stmtinfo_b;
272 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
273 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
274 }
275
276 /* A subroutine of vect_analyze_data_ref_dependence. Handle
277 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
278 distances. These distances are conservatively correct but they don't
279 reflect a guaranteed dependence.
280
281 Return true if this function does all the work necessary to avoid
282 an alias or false if the caller should use the dependence distances
283 to limit the vectorization factor in the usual way. LOOP_DEPTH is
284 the depth of the loop described by LOOP_VINFO and the other arguments
285 are as for vect_analyze_data_ref_dependence. */
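/* For example, a hypothetical distance of 4, computed on the assumption that
   the two bases are equal, is handled as follows: if the user asserted
   safelen >= 4 we assume independence (capping MAX_VF at safelen); otherwise,
   because the bases may in fact be independent, we prefer a runtime alias
   check over limiting MAX_VF to 4. */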
286
287 static bool
288 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
289 loop_vec_info loop_vinfo,
290 int loop_depth, unsigned int *max_vf)
291 {
292 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
293 lambda_vector dist_v;
294 unsigned int i;
295 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
296 {
297 int dist = dist_v[loop_depth];
298 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
299 {
300 /* If the user asserted safelen >= DIST consecutive iterations
301 can be executed concurrently, assume independence.
302
303 ??? An alternative would be to add the alias check even
304 in this case, and vectorize the fallback loop with the
305 maximum VF set to safelen. However, if the user has
306 explicitly given a length, it's less likely that that
307 would be a win. */
308 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
309 {
310 if ((unsigned int) loop->safelen < *max_vf)
311 *max_vf = loop->safelen;
312 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
313 continue;
314 }
315
316 /* For dependence distances of 2 or more, we have the option
317 of limiting VF or checking for an alias at runtime.
318 Prefer to check at runtime if we can, to avoid limiting
319 the VF unnecessarily when the bases are in fact independent.
320
321 Note that the alias checks will be removed if the VF ends up
322 being small enough. */
323 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
324 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
325 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
326 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
327 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
328 }
329 }
330 return true;
331 }
332
333
334 /* Function vect_analyze_data_ref_dependence.
335
336 FIXME: I needed to change the sense of the returned flag.
337
338 Return FALSE if there (might) exist a dependence between a memory-reference
339 DRA and a memory-reference DRB. When versioning for alias may check a
340 dependence at run-time, return TRUE. Adjust *MAX_VF according to
341 the data dependence. */
342
343 static opt_result
344 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
345 loop_vec_info loop_vinfo,
346 unsigned int *max_vf)
347 {
348 unsigned int i;
349 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
350 struct data_reference *dra = DDR_A (ddr);
351 struct data_reference *drb = DDR_B (ddr);
352 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
353 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
354 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
355 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
356 lambda_vector dist_v;
357 unsigned int loop_depth;
358
359 /* In loop analysis all data references should be vectorizable. */
360 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
361 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
362 gcc_unreachable ();
363
364 /* Independent data accesses. */
365 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
366 return opt_result::success ();
367
368 if (dra == drb
369 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
370 return opt_result::success ();
371
372 /* We do not have to consider dependences between accesses that belong
373 to the same group, unless the stride could be smaller than the
374 group size. */
375 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
376 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
377 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
378 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
379 return opt_result::success ();
380
381 /* Even if we have an anti-dependence then, as the vectorized loop covers at
382 least two scalar iterations, there is always also a true dependence.
383 As the vectorizer does not re-order loads and stores we can ignore
384 the anti-dependence if TBAA can disambiguate both DRs similar to the
385 case with known negative distance anti-dependences (positive
386 distance anti-dependences would violate TBAA constraints). */
387 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
388 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
389 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
390 get_alias_set (DR_REF (drb))))
391 return opt_result::success ();
392
393 /* Unknown data dependence. */
394 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
395 {
396 /* If user asserted safelen consecutive iterations can be
397 executed concurrently, assume independence. */
398 if (loop->safelen >= 2)
399 {
400 if ((unsigned int) loop->safelen < *max_vf)
401 *max_vf = loop->safelen;
402 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
403 return opt_result::success ();
404 }
405
406 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
407 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
408 return opt_result::failure_at
409 (stmtinfo_a->stmt,
410 "versioning for alias not supported for: "
411 "can't determine dependence between %T and %T\n",
412 DR_REF (dra), DR_REF (drb));
413
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
416 "versioning for alias required: "
417 "can't determine dependence between %T and %T\n",
418 DR_REF (dra), DR_REF (drb));
419
420 /* Add to list of ddrs that need to be tested at run-time. */
421 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
422 }
423
424 /* Known data dependence. */
425 if (DDR_NUM_DIST_VECTS (ddr) == 0)
426 {
427 /* If user asserted safelen consecutive iterations can be
428 executed concurrently, assume independence. */
429 if (loop->safelen >= 2)
430 {
431 if ((unsigned int) loop->safelen < *max_vf)
432 *max_vf = loop->safelen;
433 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
434 return opt_result::success ();
435 }
436
437 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
438 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
439 return opt_result::failure_at
440 (stmtinfo_a->stmt,
441 "versioning for alias not supported for: "
442 "bad dist vector for %T and %T\n",
443 DR_REF (dra), DR_REF (drb));
444
445 if (dump_enabled_p ())
446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
447 "versioning for alias required: "
448 "bad dist vector for %T and %T\n",
449 DR_REF (dra), DR_REF (drb));
450 /* Add to list of ddrs that need to be tested at run-time. */
451 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
452 }
453
454 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
455
456 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
457 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
458 loop_depth, max_vf))
459 return opt_result::success ();
460
461 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
462 {
463 int dist = dist_v[loop_depth];
464
465 if (dump_enabled_p ())
466 dump_printf_loc (MSG_NOTE, vect_location,
467 "dependence distance = %d.\n", dist);
468
469 if (dist == 0)
470 {
471 if (dump_enabled_p ())
472 dump_printf_loc (MSG_NOTE, vect_location,
473 "dependence distance == 0 between %T and %T\n",
474 DR_REF (dra), DR_REF (drb));
475
476 /* When we perform grouped accesses and perform implicit CSE
477 by detecting equal accesses and doing disambiguation with
478 runtime alias tests like for
479 .. = a[i];
480 .. = a[i+1];
481 a[i] = ..;
482 a[i+1] = ..;
483 *p = ..;
484 .. = a[i];
485 .. = a[i+1];
486 where we will end up loading { a[i], a[i+1] } once, make
487 sure that inserting group loads before the first load and
488 stores after the last store will do the right thing.
489 Similar for groups like
490 a[i] = ...;
491 ... = a[i];
492 a[i+1] = ...;
493 where loads from the group interleave with the store. */
494 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
495 return opt_result::failure_at (stmtinfo_a->stmt,
496 "READ_WRITE dependence"
497 " in interleaving.\n");
498
499 if (loop->safelen < 2)
500 {
501 tree indicator = dr_zero_step_indicator (dra);
502 if (!indicator || integer_zerop (indicator))
503 return opt_result::failure_at (stmtinfo_a->stmt,
504 "access also has a zero step\n");
505 else if (TREE_CODE (indicator) != INTEGER_CST)
506 vect_check_nonzero_value (loop_vinfo, indicator);
507 }
508 continue;
509 }
510
511 if (dist > 0 && DDR_REVERSED_P (ddr))
512 {
513 /* If DDR_REVERSED_P the order of the data-refs in DDR was
514 reversed (to make distance vector positive), and the actual
515 distance is negative. */
516 if (dump_enabled_p ())
517 dump_printf_loc (MSG_NOTE, vect_location,
518 "dependence distance negative.\n");
519 /* When doing outer loop vectorization, we need to check if there is
520 a backward dependence at the inner loop level if the dependence
521 at the outer loop is reversed. See PR81740. */
522 if (nested_in_vect_loop_p (loop, stmtinfo_a)
523 || nested_in_vect_loop_p (loop, stmtinfo_b))
524 {
525 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
526 DDR_LOOP_NEST (ddr));
527 if (dist_v[inner_depth] < 0)
528 return opt_result::failure_at (stmtinfo_a->stmt,
529 "not vectorized, dependence "
530 "between data-refs %T and %T\n",
531 DR_REF (dra), DR_REF (drb));
532 }
533 /* Record a negative dependence distance to later limit the
534 amount of stmt copying / unrolling we can perform.
535 Only need to handle read-after-write dependence. */
536 if (DR_IS_READ (drb)
537 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
538 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
539 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
540 continue;
541 }
542
543 unsigned int abs_dist = abs (dist);
544 if (abs_dist >= 2 && abs_dist < *max_vf)
545 {
546 /* The dependence distance requires reduction of the maximal
547 vectorization factor. */
548 *max_vf = abs_dist;
549 if (dump_enabled_p ())
550 dump_printf_loc (MSG_NOTE, vect_location,
551 "adjusting maximal vectorization factor to %i\n",
552 *max_vf);
553 }
554
555 if (abs_dist >= *max_vf)
556 {
557 /* Dependence distance does not create dependence, as far as
558 vectorization is concerned, in this case. */
559 if (dump_enabled_p ())
560 dump_printf_loc (MSG_NOTE, vect_location,
561 "dependence distance >= VF.\n");
562 continue;
563 }
564
565 return opt_result::failure_at (stmtinfo_a->stmt,
566 "not vectorized, possible dependence "
567 "between data-refs %T and %T\n",
568 DR_REF (dra), DR_REF (drb));
569 }
570
571 return opt_result::success ();
572 }
573
574 /* Function vect_analyze_data_ref_dependences.
575
576 Examine all the data references in the loop, and make sure there do not
577 exist any data dependences between them. Set *MAX_VF according to
578 the maximum vectorization factor the data dependences allow. */
579
580 opt_result
581 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
582 unsigned int *max_vf)
583 {
584 unsigned int i;
585 struct data_dependence_relation *ddr;
586
587 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
588
589 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
590 {
591 LOOP_VINFO_DDRS (loop_vinfo)
592 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
593 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
594 /* We do not need read-read dependences. */
595 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
596 &LOOP_VINFO_DDRS (loop_vinfo),
597 LOOP_VINFO_LOOP_NEST (loop_vinfo),
598 false);
599 gcc_assert (res);
600 }
601
602 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
603
604 /* For epilogues we either have no aliases or alias versioning
605 was applied to the original loop. Therefore we may just get max_vf
606 using the VF of the original loop. */
607 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
608 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
609 else
610 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
611 {
612 opt_result res
613 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
614 if (!res)
615 return res;
616 }
617
618 return opt_result::success ();
619 }
620
621
622 /* Function vect_slp_analyze_data_ref_dependence.
623
624 Return TRUE if there (might) exist a dependence between a memory-reference
625 DRA and a memory-reference DRB for VINFO. When versioning for alias
626 may check a dependence at run-time, return FALSE. Unlike the loop variant
627 above, there is no *MAX_VF to adjust. */
628
629 static bool
630 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
631 struct data_dependence_relation *ddr)
632 {
633 struct data_reference *dra = DDR_A (ddr);
634 struct data_reference *drb = DDR_B (ddr);
635 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
636 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
637
638 /* We need to check dependences of statements marked as unvectorizable
639 as well; they can still prohibit vectorization. */
640
641 /* Independent data accesses. */
642 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
643 return false;
644
645 if (dra == drb)
646 return false;
647
648 /* Read-read is OK. */
649 if (DR_IS_READ (dra) && DR_IS_READ (drb))
650 return false;
651
652 /* If dra and drb are part of the same interleaving chain consider
653 them independent. */
654 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
655 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
656 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
657 return false;
658
659 /* Unknown data dependence. */
660 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
661 {
662 if (dump_enabled_p ())
663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
664 "can't determine dependence between %T and %T\n",
665 DR_REF (dra), DR_REF (drb));
666 }
667 else if (dump_enabled_p ())
668 dump_printf_loc (MSG_NOTE, vect_location,
669 "determined dependence between %T and %T\n",
670 DR_REF (dra), DR_REF (drb));
671
672 return true;
673 }
674
675
676 /* Analyze dependences involved in the transform of SLP NODE. STORES
677 contain the vector of scalar stores of this instance if we are
678 disambiguating the loads. */
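/* For example, sinking the stores of a hypothetical group
     a[i] = x; *p = t; a[i+1] = y;
   down to the last scalar store is only valid if the intervening '*p = t'
   can be shown not to alias the a[] accesses, either through the computed
   data dependences or, when no data reference was recorded for it, through
   the alias oracle. */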
679
680 static bool
681 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
682 vec<stmt_vec_info> stores,
683 stmt_vec_info last_store_info)
684 {
685 /* This walks over all stmts involved in the SLP load/store done
686 in NODE verifying we can sink them up to the last stmt in the
687 group. */
688 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
689 {
690 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
691 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
692 {
693 stmt_vec_info access_info
694 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
695 if (access_info == last_access_info)
696 continue;
697 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
698 ao_ref ref;
699 bool ref_initialized_p = false;
700 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
701 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
702 {
703 gimple *stmt = gsi_stmt (gsi);
704 if (! gimple_vuse (stmt))
705 continue;
706
707 /* If we couldn't record a (single) data reference for this
708 stmt we have to resort to the alias oracle. */
709 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
710 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
711 if (!dr_b)
712 {
713 /* We are moving a store - this means
714 we cannot use TBAA for disambiguation. */
715 if (!ref_initialized_p)
716 ao_ref_init (&ref, DR_REF (dr_a));
717 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
718 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
719 return false;
720 continue;
721 }
722
723 bool dependent = false;
724 /* If we run into a store of this same instance (we've just
725 marked those) then delay dependence checking until we run
726 into the last store because this is where it will have
727 been sunk to (and we verify if we can do that as well). */
728 if (gimple_visited_p (stmt))
729 {
730 if (stmt_info != last_store_info)
731 continue;
732 unsigned i;
733 stmt_vec_info store_info;
734 FOR_EACH_VEC_ELT (stores, i, store_info)
735 {
736 data_reference *store_dr
737 = STMT_VINFO_DATA_REF (store_info);
738 ddr_p ddr = initialize_data_dependence_relation
739 (dr_a, store_dr, vNULL);
740 dependent
741 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
742 free_dependence_relation (ddr);
743 if (dependent)
744 break;
745 }
746 }
747 else
748 {
749 ddr_p ddr = initialize_data_dependence_relation (dr_a,
750 dr_b, vNULL);
751 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
752 free_dependence_relation (ddr);
753 }
754 if (dependent)
755 return false;
756 }
757 }
758 }
759 else /* DR_IS_READ */
760 {
761 stmt_vec_info first_access_info
762 = vect_find_first_scalar_stmt_in_slp (node);
763 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
764 {
765 stmt_vec_info access_info
766 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
767 if (access_info == first_access_info)
768 continue;
769 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
770 ao_ref ref;
771 bool ref_initialized_p = false;
772 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
773 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
774 {
775 gimple *stmt = gsi_stmt (gsi);
776 if (! gimple_vdef (stmt))
777 continue;
778
779 /* If we couldn't record a (single) data reference for this
780 stmt we have to resort to the alias oracle. */
781 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
782 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
783 if (!dr_b)
784 {
785 /* We are hoisting a load - this means we can use
786 TBAA for disambiguation. */
787 if (!ref_initialized_p)
788 ao_ref_init (&ref, DR_REF (dr_a));
789 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
790 return false;
791 continue;
792 }
793
794 bool dependent = false;
795 /* If we run into a store of this same instance (we've just
796 marked those) then delay dependence checking until we run
797 into the last store because this is where it will have
798 been sunk to (and we verify if we can do that as well). */
799 if (gimple_visited_p (stmt))
800 {
801 if (stmt_info != last_store_info)
802 continue;
803 unsigned i;
804 stmt_vec_info store_info;
805 FOR_EACH_VEC_ELT (stores, i, store_info)
806 {
807 data_reference *store_dr
808 = STMT_VINFO_DATA_REF (store_info);
809 ddr_p ddr = initialize_data_dependence_relation
810 (dr_a, store_dr, vNULL);
811 dependent
812 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
813 free_dependence_relation (ddr);
814 if (dependent)
815 break;
816 }
817 }
818 else
819 {
820 ddr_p ddr = initialize_data_dependence_relation (dr_a,
821 dr_b, vNULL);
822 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
823 free_dependence_relation (ddr);
824 }
825 if (dependent)
826 return false;
827 }
828 }
829 }
830 return true;
831 }
832
833
834 /* Function vect_slp_analyze_instance_dependence.
835
836 Examine all the data references in SLP instance INSTANCE and make sure
837 there do not exist any data dependences that would prevent the scalar
838 loads and stores from being moved to their vectorized positions. */
839
840 bool
841 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
842 {
843 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
844
845 /* The stores of this instance are at the root of the SLP tree. */
846 slp_tree store = SLP_INSTANCE_TREE (instance);
847 if (! STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (store)))
848 store = NULL;
849
850 /* Verify we can sink stores to the vectorized stmt insert location. */
851 stmt_vec_info last_store_info = NULL;
852 if (store)
853 {
854 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
855 return false;
856
857 /* Mark stores in this instance and remember the last one. */
858 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
859 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
860 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
861 }
862
863 bool res = true;
864
865 /* Verify we can sink loads to the vectorized stmt insert location,
866 special-casing stores of this instance. */
867 slp_tree load;
868 unsigned int i;
869 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
870 if (! vect_slp_analyze_node_dependences (vinfo, load,
871 store
872 ? SLP_TREE_SCALAR_STMTS (store)
873 : vNULL, last_store_info))
874 {
875 res = false;
876 break;
877 }
878
879 /* Unset the visited flag. */
880 if (store)
881 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
882 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
883
884 return res;
885 }
886
887 /* Record the base alignment guarantee given by DRB, which occurs
888 in STMT_INFO. */
889
890 static void
891 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
892 innermost_loop_behavior *drb)
893 {
894 bool existed;
895 innermost_loop_behavior *&entry
896 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
897 if (!existed || entry->base_alignment < drb->base_alignment)
898 {
899 entry = drb;
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_NOTE, vect_location,
902 "recording new base alignment for %T\n"
903 " alignment: %d\n"
904 " misalignment: %d\n"
905 " based on: %G",
906 drb->base_address,
907 drb->base_alignment,
908 drb->base_misalignment,
909 stmt_info->stmt);
910 }
911 }
912
913 /* If the region we're going to vectorize is reached, all unconditional
914 data references occur at least once. We can therefore pool the base
915 alignment guarantees from each unconditional reference. Do this by
916 going through all the data references in VINFO and checking whether
917 the containing statement makes the reference unconditionally. If so,
918 record the alignment of the base address in VINFO so that it can be
919 used for all other references with the same base. */
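/* For example, if a reference *p is executed unconditionally on every
   iteration and guarantees, say, a 16-byte aligned base, then a second,
   conditional reference with the same base address can reuse that 16-byte
   guarantee even though it is not executed on every iteration. */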
920
921 void
922 vect_record_base_alignments (vec_info *vinfo)
923 {
924 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
925 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
926 data_reference *dr;
927 unsigned int i;
928 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
929 {
930 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
931 stmt_vec_info stmt_info = dr_info->stmt;
932 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
933 && STMT_VINFO_VECTORIZABLE (stmt_info)
934 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
935 {
936 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
937
938 /* If DR is nested in the loop that is being vectorized, we can also
939 record the alignment of the base wrt the outer loop. */
940 if (loop && nested_in_vect_loop_p (loop, stmt_info))
941 vect_record_base_alignment
942 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
943 }
944 }
945 }
946
947 /* Return the target alignment for the vectorized form of DR_INFO. */
948
949 static poly_uint64
950 vect_calculate_target_alignment (dr_vec_info *dr_info)
951 {
952 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
953 return targetm.vectorize.preferred_vector_alignment (vectype);
954 }
955
956 /* Function vect_compute_data_ref_alignment
957
958 Compute the misalignment of the data reference DR_INFO.
959
960 Output:
961 1. DR_MISALIGNMENT (DR_INFO) is defined.
962
963 FOR NOW: No analysis is actually performed. Misalignment is calculated
964 only for trivial cases. TODO. */
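/* For example, with a hypothetical 16-byte target alignment, an access to
   a[3] in an 'int' array whose base is known to be 16-byte aligned gets
   DR_MISALIGNMENT = 12, the byte offset of the first access modulo the
   target alignment; if the base alignment or the offset is not known at
   compile time, the misalignment is left as unknown (-1). */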
965
966 static void
967 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
968 {
969 stmt_vec_info stmt_info = dr_info->stmt;
970 vec_base_alignments *base_alignments = &vinfo->base_alignments;
971 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
972 class loop *loop = NULL;
973 tree ref = DR_REF (dr_info->dr);
974 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
975
976 if (dump_enabled_p ())
977 dump_printf_loc (MSG_NOTE, vect_location,
978 "vect_compute_data_ref_alignment:\n");
979
980 if (loop_vinfo)
981 loop = LOOP_VINFO_LOOP (loop_vinfo);
982
983 /* Initialize misalignment to unknown. */
984 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
985
986 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
987 return;
988
989 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
990 bool step_preserves_misalignment_p;
991
992 poly_uint64 vector_alignment
993 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
994 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
995
996 /* If the main loop has been peeled for alignment we have no way of knowing
997 whether the data accesses in the epilogues are aligned. We can't at
998 compile time answer the question whether we have entered the main loop or
999 not. Fixes PR 92351. */
1000 if (loop_vinfo)
1001 {
1002 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1003 if (orig_loop_vinfo
1004 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1005 return;
1006 }
1007
1008 unsigned HOST_WIDE_INT vect_align_c;
1009 if (!vector_alignment.is_constant (&vect_align_c))
1010 return;
1011
1012 /* No step for BB vectorization. */
1013 if (!loop)
1014 {
1015 gcc_assert (integer_zerop (drb->step));
1016 step_preserves_misalignment_p = true;
1017 }
1018
1019 /* In case the dataref is in an inner-loop of the loop that is being
1020 vectorized (LOOP), we use the base and misalignment information
1021 relative to the outer-loop (LOOP). This is ok only if the misalignment
1022 stays the same throughout the execution of the inner-loop, which is why
1023 we have to check that the stride of the dataref in the inner-loop is a
1024 multiple of the vector alignment. */
1025 else if (nested_in_vect_loop_p (loop, stmt_info))
1026 {
1027 step_preserves_misalignment_p
1028 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1029
1030 if (dump_enabled_p ())
1031 {
1032 if (step_preserves_misalignment_p)
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "inner step divides the vector alignment.\n");
1035 else
1036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1037 "inner step doesn't divide the vector"
1038 " alignment.\n");
1039 }
1040 }
1041
1042 /* Similarly we can only use base and misalignment information relative to
1043 an innermost loop if the misalignment stays the same throughout the
1044 execution of the loop. As above, this is the case if the stride of
1045 the dataref is a multiple of the alignment. */
1046 else
1047 {
1048 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1049 step_preserves_misalignment_p
1050 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1051
1052 if (!step_preserves_misalignment_p && dump_enabled_p ())
1053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1054 "step doesn't divide the vector alignment.\n");
1055 }
1056
1057 unsigned int base_alignment = drb->base_alignment;
1058 unsigned int base_misalignment = drb->base_misalignment;
1059
1060 /* Calculate the maximum of the pooled base address alignment and the
1061 alignment that we can compute for DR itself. */
1062 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1063 if (entry && base_alignment < (*entry)->base_alignment)
1064 {
1065 base_alignment = (*entry)->base_alignment;
1066 base_misalignment = (*entry)->base_misalignment;
1067 }
1068
1069 if (drb->offset_alignment < vect_align_c
1070 || !step_preserves_misalignment_p
1071 /* We need to know whether the step wrt the vectorized loop is
1072 negative when computing the starting misalignment below. */
1073 || TREE_CODE (drb->step) != INTEGER_CST)
1074 {
1075 if (dump_enabled_p ())
1076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1077 "Unknown alignment for access: %T\n", ref);
1078 return;
1079 }
1080
1081 if (base_alignment < vect_align_c)
1082 {
1083 unsigned int max_alignment;
1084 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1085 if (max_alignment < vect_align_c
1086 || !vect_can_force_dr_alignment_p (base,
1087 vect_align_c * BITS_PER_UNIT))
1088 {
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_NOTE, vect_location,
1091 "can't force alignment of ref: %T\n", ref);
1092 return;
1093 }
1094
1095 /* Force the alignment of the decl.
1096 NOTE: This is the only change to the code we make during
1097 the analysis phase, before deciding to vectorize the loop. */
1098 if (dump_enabled_p ())
1099 dump_printf_loc (MSG_NOTE, vect_location,
1100 "force alignment of %T\n", ref);
1101
1102 dr_info->base_decl = base;
1103 dr_info->base_misaligned = true;
1104 base_misalignment = 0;
1105 }
1106 poly_int64 misalignment
1107 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1108
1109 /* If this is a backward running DR then the first access in the larger
1110 vectype actually is N-1 elements before the address in the DR.
1111 Adjust misalign accordingly. */
1112 if (tree_int_cst_sgn (drb->step) < 0)
1113 /* PLUS because STEP is negative. */
1114 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1115 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1116
1117 unsigned int const_misalignment;
1118 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1119 {
1120 if (dump_enabled_p ())
1121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1122 "Non-constant misalignment for access: %T\n", ref);
1123 return;
1124 }
1125
1126 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1127
1128 if (dump_enabled_p ())
1129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1130 "misalign = %d bytes of ref %T\n",
1131 DR_MISALIGNMENT (dr_info), ref);
1132
1133 return;
1134 }
1135
1136 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1137 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1138 is made aligned via peeling. */
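/* For example, with 4-byte elements and a hypothetical 16-byte target
   alignment for both references, accesses a[i] and a[i + 4] differ only in
   DR_INIT and the difference (16 bytes) is a multiple of the target
   alignment, so peeling that aligns one of them also aligns the other. */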
1139
1140 static bool
1141 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1142 dr_vec_info *dr_peel_info)
1143 {
1144 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1145 DR_TARGET_ALIGNMENT (dr_info)))
1146 {
1147 poly_offset_int diff
1148 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1149 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1150 if (known_eq (diff, 0)
1151 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1152 return true;
1153 }
1154 return false;
1155 }
1156
1157 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1158 aligned via peeling. */
1159
1160 static bool
1161 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1162 dr_vec_info *dr_peel_info)
1163 {
1164 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1165 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1166 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1167 DR_OFFSET (dr_peel_info->dr), 0)
1168 || !operand_equal_p (DR_STEP (dr_info->dr),
1169 DR_STEP (dr_peel_info->dr), 0))
1170 return false;
1171
1172 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1173 }
1174
1175 /* Function vect_update_misalignment_for_peel.
1176 Sets DR_INFO's misalignment
1177 - to 0 if it has the same alignment as DR_PEEL_INFO,
1178 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1179 - to -1 (unknown) otherwise.
1180
1181 DR_INFO - the data reference whose misalignment is to be adjusted.
1182 DR_PEEL_INFO - the data reference whose misalignment is being made
1183 zero in the vector loop by the peel.
1184 NPEEL - the number of iterations in the peel loop if the misalignment
1185 of DR_PEEL_INFO is known at compile time. */
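/* For example, with a hypothetical 16-byte target alignment, a 4-byte step
   and NPEEL = 3, a data reference whose misalignment was 8 ends up with
   misalignment (8 + 3 * 4) & 15 = 4 after the peel. */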
1186
1187 static void
1188 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1189 dr_vec_info *dr_peel_info, int npeel)
1190 {
1191 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1192 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1193 {
1194 SET_DR_MISALIGNMENT (dr_info, 0);
1195 return;
1196 }
1197
1198 unsigned HOST_WIDE_INT alignment;
1199 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1200 && known_alignment_for_access_p (dr_info)
1201 && known_alignment_for_access_p (dr_peel_info))
1202 {
1203 int misal = DR_MISALIGNMENT (dr_info);
1204 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1205 misal &= alignment - 1;
1206 SET_DR_MISALIGNMENT (dr_info, misal);
1207 return;
1208 }
1209
1210 if (dump_enabled_p ())
1211 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1212 "to unknown (-1).\n");
1213 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1214 }
1215
1216 /* Return true if alignment is relevant for DR_INFO. */
1217
1218 static bool
1219 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1220 {
1221 stmt_vec_info stmt_info = dr_info->stmt;
1222
1223 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1224 return false;
1225
1226 /* For interleaving, only the alignment of the first access matters. */
1227 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1228 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1229 return false;
1230
1231 /* Scatter-gather and invariant accesses continue to address individual
1232 scalars, so vector-level alignment is irrelevant. */
1233 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1234 || integer_zerop (DR_STEP (dr_info->dr)))
1235 return false;
1236
1237 /* Strided accesses perform only component accesses; alignment is
1238 irrelevant for them. */
1239 if (STMT_VINFO_STRIDED_P (stmt_info)
1240 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1241 return false;
1242
1243 return true;
1244 }
1245
1246 /* Given a memory reference EXP, return whether its alignment is less
1247 than its size. */
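/* For example, a 4-byte integer member of a packed structure has TYPE_SIZE
   32 bits but may only guarantee 8 bits of alignment, so it is not
   size-aligned. */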
1248
1249 static bool
1250 not_size_aligned (tree exp)
1251 {
1252 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1253 return true;
1254
1255 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1256 > get_object_alignment (exp));
1257 }
1258
1259 /* Function vector_alignment_reachable_p
1260
1261 Return true if vector alignment for DR_INFO is reachable by peeling
1262 a few loop iterations. Return false otherwise. */
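/* For example, for a non-grouped access with 4-byte elements and a
   hypothetical 16-byte vector alignment, a known misalignment of 8 bytes is
   reachable by peeling two scalar iterations, whereas a misalignment of
   2 bytes is not, because it is not a multiple of the element size. */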
1263
1264 static bool
1265 vector_alignment_reachable_p (dr_vec_info *dr_info)
1266 {
1267 stmt_vec_info stmt_info = dr_info->stmt;
1268 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1269
1270 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1271 {
1272 /* For interleaved accesses we peel only if the number of iterations in
1273 the prolog loop ({VF - misalignment}) is a multiple of the number of
1274 interleaved accesses. */
1275 int elem_size, mis_in_elements;
1276
1277 /* FORNOW: handle only known alignment. */
1278 if (!known_alignment_for_access_p (dr_info))
1279 return false;
1280
1281 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1282 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1283 elem_size = vector_element_size (vector_size, nelements);
1284 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1285
1286 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1287 return false;
1288 }
1289
1290 /* If misalignment is known at compile time then allow peeling
1291 only if natural alignment is reachable through peeling. */
1292 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1293 {
1294 HOST_WIDE_INT elmsize =
1295 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1296 if (dump_enabled_p ())
1297 {
1298 dump_printf_loc (MSG_NOTE, vect_location,
1299 "data size = %wd. misalignment = %d.\n", elmsize,
1300 DR_MISALIGNMENT (dr_info));
1301 }
1302 if (DR_MISALIGNMENT (dr_info) % elmsize)
1303 {
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1306 "data size does not divide the misalignment.\n");
1307 return false;
1308 }
1309 }
1310
1311 if (!known_alignment_for_access_p (dr_info))
1312 {
1313 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1314 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1317 "Unknown misalignment, %snaturally aligned\n",
1318 is_packed ? "not " : "");
1319 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1320 }
1321
1322 return true;
1323 }
1324
1325
1326 /* Calculate the cost of the memory access represented by DR_INFO. */
1327
1328 static void
1329 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1330 unsigned int *inside_cost,
1331 unsigned int *outside_cost,
1332 stmt_vector_for_cost *body_cost_vec,
1333 stmt_vector_for_cost *prologue_cost_vec)
1334 {
1335 stmt_vec_info stmt_info = dr_info->stmt;
1336 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1337 int ncopies;
1338
1339 if (PURE_SLP_STMT (stmt_info))
1340 ncopies = 1;
1341 else
1342 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1343
1344 if (DR_IS_READ (dr_info->dr))
1345 vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1346 outside_cost, prologue_cost_vec, body_cost_vec, false);
1347 else
1348 vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost, body_cost_vec);
1349
1350 if (dump_enabled_p ())
1351 dump_printf_loc (MSG_NOTE, vect_location,
1352 "vect_get_data_access_cost: inside_cost = %d, "
1353 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1354 }
1355
1356
1357 typedef struct _vect_peel_info
1358 {
1359 dr_vec_info *dr_info;
1360 int npeel;
1361 unsigned int count;
1362 } *vect_peel_info;
1363
1364 typedef struct _vect_peel_extended_info
1365 {
1366 vec_info *vinfo;
1367 struct _vect_peel_info peel_info;
1368 unsigned int inside_cost;
1369 unsigned int outside_cost;
1370 } *vect_peel_extended_info;
1371
1372
1373 /* Peeling hashtable helpers. */
1374
1375 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1376 {
1377 static inline hashval_t hash (const _vect_peel_info *);
1378 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1379 };
1380
1381 inline hashval_t
1382 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1383 {
1384 return (hashval_t) peel_info->npeel;
1385 }
1386
1387 inline bool
1388 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1389 {
1390 return (a->npeel == b->npeel);
1391 }
1392
1393
1394 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1395
1396 static void
1397 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1398 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1399 int npeel)
1400 {
1401 struct _vect_peel_info elem, *slot;
1402 _vect_peel_info **new_slot;
1403 bool supportable_dr_alignment
1404 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1405
1406 elem.npeel = npeel;
1407 slot = peeling_htab->find (&elem);
1408 if (slot)
1409 slot->count++;
1410 else
1411 {
1412 slot = XNEW (struct _vect_peel_info);
1413 slot->npeel = npeel;
1414 slot->dr_info = dr_info;
1415 slot->count = 1;
1416 new_slot = peeling_htab->find_slot (slot, INSERT);
1417 *new_slot = slot;
1418 }
1419
1420 if (!supportable_dr_alignment
1421 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1422 slot->count += VECT_MAX_COST;
1423 }
1424
1425
1426 /* Traverse the peeling hash table to find the peeling option that aligns the
1427 maximum number of data accesses. */
1428
1429 int
1430 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1431 _vect_peel_extended_info *max)
1432 {
1433 vect_peel_info elem = *slot;
1434
1435 if (elem->count > max->peel_info.count
1436 || (elem->count == max->peel_info.count
1437 && max->peel_info.npeel > elem->npeel))
1438 {
1439 max->peel_info.npeel = elem->npeel;
1440 max->peel_info.count = elem->count;
1441 max->peel_info.dr_info = elem->dr_info;
1442 }
1443
1444 return 1;
1445 }
1446
1447 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1448 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1449 we assume DR0_INFO's misalignment will be zero after peeling. */
1450
1451 static void
1452 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1453 dr_vec_info *dr0_info,
1454 unsigned int *inside_cost,
1455 unsigned int *outside_cost,
1456 stmt_vector_for_cost *body_cost_vec,
1457 stmt_vector_for_cost *prologue_cost_vec,
1458 unsigned int npeel,
1459 bool unknown_misalignment)
1460 {
1461 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1462 unsigned i;
1463 data_reference *dr;
1464
1465 FOR_EACH_VEC_ELT (datarefs, i, dr)
1466 {
1467 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1468 if (!vect_relevant_for_alignment_p (dr_info))
1469 continue;
1470
1471 int save_misalignment;
1472 save_misalignment = DR_MISALIGNMENT (dr_info);
1473 if (npeel == 0)
1474 ;
1475 else if (unknown_misalignment && dr_info == dr0_info)
1476 SET_DR_MISALIGNMENT (dr_info, 0);
1477 else
1478 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1479 vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1480 body_cost_vec, prologue_cost_vec);
1481 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1482 }
1483 }
1484
1485 /* Traverse peeling hash table and calculate cost for each peeling option.
1486 Find the one with the lowest cost. */
1487
1488 int
1489 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1490 _vect_peel_extended_info *min)
1491 {
1492 vect_peel_info elem = *slot;
1493 int dummy;
1494 unsigned int inside_cost = 0, outside_cost = 0;
1495 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1496 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1497 epilogue_cost_vec;
1498
1499 prologue_cost_vec.create (2);
1500 body_cost_vec.create (2);
1501 epilogue_cost_vec.create (2);
1502
1503 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1504 &outside_cost, &body_cost_vec,
1505 &prologue_cost_vec, elem->npeel, false);
1506
1507 body_cost_vec.release ();
1508
1509 outside_cost += vect_get_known_peeling_cost
1510 (loop_vinfo, elem->npeel, &dummy,
1511 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1512 &prologue_cost_vec, &epilogue_cost_vec);
1513
1514 /* Prologue and epilogue costs are added to the target model later.
1515 These costs depend only on the scalar iteration cost, the
1516 number of peeling iterations finally chosen, and the number of
1517 misaligned statements. So discard the information found here. */
1518 prologue_cost_vec.release ();
1519 epilogue_cost_vec.release ();
1520
1521 if (inside_cost < min->inside_cost
1522 || (inside_cost == min->inside_cost
1523 && outside_cost < min->outside_cost))
1524 {
1525 min->inside_cost = inside_cost;
1526 min->outside_cost = outside_cost;
1527 min->peel_info.dr_info = elem->dr_info;
1528 min->peel_info.npeel = elem->npeel;
1529 min->peel_info.count = elem->count;
1530 }
1531
1532 return 1;
1533 }
1534
1535
1536 /* Choose best peeling option by traversing peeling hash table and either
1537 choosing an option with the lowest cost (if cost model is enabled) or the
1538 option that aligns as many accesses as possible. */
1539
1540 static struct _vect_peel_extended_info
1541 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1542 loop_vec_info loop_vinfo)
1543 {
1544 struct _vect_peel_extended_info res;
1545
1546 res.peel_info.dr_info = NULL;
1547 res.vinfo = loop_vinfo;
1548
1549 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1550 {
1551 res.inside_cost = INT_MAX;
1552 res.outside_cost = INT_MAX;
1553 peeling_htab->traverse <_vect_peel_extended_info *,
1554 vect_peeling_hash_get_lowest_cost> (&res);
1555 }
1556 else
1557 {
1558 res.peel_info.count = 0;
1559 peeling_htab->traverse <_vect_peel_extended_info *,
1560 vect_peeling_hash_get_most_frequent> (&res);
1561 res.inside_cost = 0;
1562 res.outside_cost = 0;
1563 }
1564
1565 return res;
1566 }
1567
1568 /* Return true if peeling by NPEEL iterations (to align DR0_INFO) is supported. */
1569
1570 static bool
1571 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1572 unsigned npeel)
1573 {
1574 unsigned i;
1575 struct data_reference *dr = NULL;
1576 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1577 enum dr_alignment_support supportable_dr_alignment;
1578
1579 /* Ensure that all data refs can be vectorized after the peel. */
1580 FOR_EACH_VEC_ELT (datarefs, i, dr)
1581 {
1582 int save_misalignment;
1583
1584 if (dr == dr0_info->dr)
1585 continue;
1586
1587 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1588 if (!vect_relevant_for_alignment_p (dr_info))
1589 continue;
1590
1591 save_misalignment = DR_MISALIGNMENT (dr_info);
1592 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1593 supportable_dr_alignment
1594 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
1595 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1596
1597 if (!supportable_dr_alignment)
1598 return false;
1599 }
1600
1601 return true;
1602 }
1603
1604 /* Compare two data-references DRA and DRB to group them into chunks
1605 with related alignment. */
1606
1607 static int
1608 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1609 {
1610 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1611 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1612 int cmp;
1613
1614 /* Stabilize sort. */
1615 if (dra == drb)
1616 return 0;
1617
1618 /* Ordering of DRs according to base. */
1619 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1620 DR_BASE_ADDRESS (drb));
1621 if (cmp != 0)
1622 return cmp;
1623
1624 /* And according to DR_OFFSET. */
1625 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1626 if (cmp != 0)
1627 return cmp;
1628
1629 /* And after step. */
1630 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1631 if (cmp != 0)
1632 return cmp;
1633
1634 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
1635 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1636 if (cmp == 0)
1637 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1638 return cmp;
1639 }
1640
1641 /* Function vect_enhance_data_refs_alignment
1642
1643 This pass will use loop versioning and loop peeling in order to enhance
1644 the alignment of data references in the loop.
1645
1646 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1647 original loop is to be vectorized. Any other loops that are created by
1648 the transformations performed in this pass are not supposed to be
1649 vectorized. This restriction will be relaxed.
1650
1651 This pass will require a cost model to guide it whether to apply peeling
1652 or versioning or a combination of the two. For example, the scheme that
1653 Intel uses when given a loop with several memory accesses, is as follows:
1654 choose one memory access ('p') whose alignment you want to force by doing
1655 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1656 other accesses are not necessarily aligned, or (2) use loop versioning to
1657 generate one loop in which all accesses are aligned, and another loop in
1658 which only 'p' is necessarily aligned.
1659
1660 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1661 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1662 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1663
1664 Devising a cost model is the most critical aspect of this work. It will
1665 guide us on which access to peel for, whether to use loop versioning, how
1666 many versions to create, etc. The cost model will probably consist of
1667 generic considerations as well as target specific considerations (on
1668 powerpc for example, misaligned stores are more painful than misaligned
1669 loads).
1670
1671 Here are the general steps involved in alignment enhancements:
1672
1673 -- original loop, before alignment analysis:
1674 for (i=0; i<N; i++){
1675 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1676 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1677 }
1678
1679 -- After vect_compute_data_refs_alignment:
1680 for (i=0; i<N; i++){
1681 x = q[i]; # DR_MISALIGNMENT(q) = 3
1682 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1683 }
1684
1685 -- Possibility 1: we do loop versioning:
1686 if (p is aligned) {
1687 for (i=0; i<N; i++){ # loop 1A
1688 x = q[i]; # DR_MISALIGNMENT(q) = 3
1689 p[i] = y; # DR_MISALIGNMENT(p) = 0
1690 }
1691 }
1692 else {
1693 for (i=0; i<N; i++){ # loop 1B
1694 x = q[i]; # DR_MISALIGNMENT(q) = 3
1695 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1696 }
1697 }
1698
1699 -- Possibility 2: we do loop peeling:
1700 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1701 x = q[i];
1702 p[i] = y;
1703 }
1704 for (i = 3; i < N; i++){ # loop 2A
1705 x = q[i]; # DR_MISALIGNMENT(q) = 0
1706 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1707 }
1708
1709 -- Possibility 3: combination of loop peeling and versioning:
1710 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1711 x = q[i];
1712 p[i] = y;
1713 }
1714 if (p is aligned) {
1715 for (i = 3; i<N; i++){ # loop 3A
1716 x = q[i]; # DR_MISALIGNMENT(q) = 0
1717 p[i] = y; # DR_MISALIGNMENT(p) = 0
1718 }
1719 }
1720 else {
1721 for (i = 3; i<N; i++){ # loop 3B
1722 x = q[i]; # DR_MISALIGNMENT(q) = 0
1723 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1724 }
1725 }
1726
1727 These loops are later passed to loop_transform to be vectorized. The
1728 vectorizer will use the alignment information to guide the transformation
1729 (whether to generate regular loads/stores, or with special handling for
1730 misalignment). */
1731
1732 opt_result
1733 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1734 {
1735 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1736 enum dr_alignment_support supportable_dr_alignment;
1737 dr_vec_info *first_store = NULL;
1738 dr_vec_info *dr0_info = NULL;
1739 struct data_reference *dr;
1740 unsigned int i;
1741 bool do_peeling = false;
1742 bool do_versioning = false;
1743 unsigned int npeel = 0;
1744 bool one_misalignment_known = false;
1745 bool one_misalignment_unknown = false;
1746 bool one_dr_unsupportable = false;
1747 dr_vec_info *unsupportable_dr_info = NULL;
1748 unsigned int mis, dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1749 hash_table<peel_info_hasher> peeling_htab (1);
1750
1751 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1752
1753 /* Reset data so we can safely be called multiple times. */
1754 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1755 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1756
1757 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1758 return opt_result::success ();
1759
1760 /* Sort the vector of datarefs so DRs that have the same or dependent
1761 alignment are next to each other. */
1762 auto_vec<data_reference_p> datarefs
1763 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1764 datarefs.qsort (dr_align_group_sort_cmp);
1765
1766   /* For each dataref, compute the number of other DRs that become
1767      aligned when we peel to align that dataref.  */
1768 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1769 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1770 unsigned i0;
1771 for (i0 = 0; i0 < datarefs.length (); ++i0)
1772 if (DR_BASE_ADDRESS (datarefs[i0]))
1773 break;
1774 for (i = i0 + 1; i <= datarefs.length (); ++i)
1775 {
1776 if (i == datarefs.length ()
1777 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1778 DR_BASE_ADDRESS (datarefs[i]), 0)
1779 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1780 DR_OFFSET (datarefs[i]), 0)
1781 || !operand_equal_p (DR_STEP (datarefs[i0]),
1782 DR_STEP (datarefs[i]), 0))
1783 {
1784 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1785 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1786 will get known misalignment if we align one of the refs
1787 with the largest DR_TARGET_ALIGNMENT. */
1788 for (unsigned j = i0; j < i; ++j)
1789 {
1790 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1791 for (unsigned k = i0; k < i; ++k)
1792 {
1793 if (k == j)
1794 continue;
1795 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1796 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1797 dr_infoj))
1798 n_same_align_refs[j]++;
1799 }
1800 }
1801 i0 = i;
1802 }
1803 }
1804
1805 /* While cost model enhancements are expected in the future, the high level
1806 view of the code at this time is as follows:
1807
1808 A) If there is a misaligned access then see if peeling to align
1809 this access can make all data references satisfy
1810 vect_supportable_dr_alignment. If so, update data structures
1811 as needed and return true.
1812
1813 B) If peeling wasn't possible and there is a data reference with an
1814 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1815 then see if loop versioning checks can be used to make all data
1816 references satisfy vect_supportable_dr_alignment. If so, update
1817 data structures as needed and return true.
1818
1819 C) If neither peeling nor versioning were successful then return false if
1820 any data reference does not satisfy vect_supportable_dr_alignment.
1821
1822 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1823
1824 Note, Possibility 3 above (which is peeling and versioning together) is not
1825 being done at this time. */
1826
1827 /* (1) Peeling to force alignment. */
1828
1829 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1830 Considerations:
1831 + How many accesses will become aligned due to the peeling
1832 - How many accesses will become unaligned due to the peeling,
1833 and the cost of misaligned accesses.
1834 - The cost of peeling (the extra runtime checks, the increase
1835 in code size). */
1836
1837 FOR_EACH_VEC_ELT (datarefs, i, dr)
1838 {
1839 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1840 if (!vect_relevant_for_alignment_p (dr_info))
1841 continue;
1842
1843 stmt_vec_info stmt_info = dr_info->stmt;
1844 supportable_dr_alignment
1845 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1846 do_peeling = vector_alignment_reachable_p (dr_info);
1847 if (do_peeling)
1848 {
1849 if (known_alignment_for_access_p (dr_info))
1850 {
1851 unsigned int npeel_tmp = 0;
1852 bool negative = tree_int_cst_compare (DR_STEP (dr),
1853 size_zero_node) < 0;
1854
1855               /* If known_alignment_for_access_p then we have set
1856                  DR_MISALIGNMENT, which is only done if we know it at compile
1857                  time, so it is safe to assume the target alignment is
1858                  constant.  */
1859 unsigned int target_align =
1860 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1861 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1862 mis = (negative
1863 ? DR_MISALIGNMENT (dr_info)
1864 : -DR_MISALIGNMENT (dr_info));
1865 if (DR_MISALIGNMENT (dr_info) != 0)
1866 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1867
1868              /* For multiple types, it is possible that the bigger type access
1869                 will have more than one peeling option.  E.g., a loop with two
1870                 types: one of size (vector size / 4), and the other one of
1871                 size (vector size / 8).  The vectorization factor will be 8.
1872                 If both accesses are misaligned by 3, the first one needs one
1873                 scalar iteration to be aligned, and the second one needs 5.
1874                 But the first one will also be aligned by peeling 5 scalar
1875                 iterations, and in that case both accesses will be aligned.
1876                 Hence, besides the immediate peeling amount, we also want to
1877                 try adding the full vector size, as long as we do not exceed
1878                 the vectorization factor.
1879                 We do this automatically for the cost model, since we
1880                 calculate the cost for every peeling option.  */
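              /* A worked example: with a target alignment of 16 bytes,
                 4-byte elements and DR_MISALIGNMENT of 4, a positive-step
                 access needs (-4 mod 16) / 4 = 3 peels and a negative-step
                 access (4 mod 16) / 4 = 1 peel; the loop below then records
                 further candidates in steps of 16 / 4 = 4 elements, up to
                 NSCALARS.  */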
1881 poly_uint64 nscalars = npeel_tmp;
1882 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1883 {
1884 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1885 nscalars = (STMT_SLP_TYPE (stmt_info)
1886 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1887 }
1888
1889 /* Save info about DR in the hash table. Also include peeling
1890 amounts according to the explanation above. */
1891 while (known_le (npeel_tmp, nscalars))
1892 {
1893 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1894 dr_info, npeel_tmp);
1895 npeel_tmp += MAX (1, target_align / dr_size);
1896 }
1897
1898 one_misalignment_known = true;
1899 }
1900 else
1901 {
1902              /* If we don't know any misalignment values, we prefer
1903                 peeling for the data-ref that has the maximum number of
1904                 data-refs with the same alignment, unless the target prefers
1905                 to align stores over loads.  */
1906 unsigned same_align_drs = n_same_align_refs[i];
1907 if (!dr0_info
1908 || dr0_same_align_drs < same_align_drs)
1909 {
1910 dr0_same_align_drs = same_align_drs;
1911 dr0_info = dr_info;
1912 }
1913 /* For data-refs with the same number of related
1914 accesses prefer the one where the misalign
1915 computation will be invariant in the outermost loop. */
1916 else if (dr0_same_align_drs == same_align_drs)
1917 {
1918 class loop *ivloop0, *ivloop;
1919 ivloop0 = outermost_invariant_loop_for_expr
1920 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1921 ivloop = outermost_invariant_loop_for_expr
1922 (loop, DR_BASE_ADDRESS (dr));
1923 if ((ivloop && !ivloop0)
1924 || (ivloop && ivloop0
1925 && flow_loop_nested_p (ivloop, ivloop0)))
1926 dr0_info = dr_info;
1927 }
1928
1929 one_misalignment_unknown = true;
1930
1931 /* Check for data refs with unsupportable alignment that
1932 can be peeled. */
1933 if (!supportable_dr_alignment)
1934 {
1935 one_dr_unsupportable = true;
1936 unsupportable_dr_info = dr_info;
1937 }
1938
1939 if (!first_store && DR_IS_WRITE (dr))
1940 {
1941 first_store = dr_info;
1942 first_store_same_align_drs = same_align_drs;
1943 }
1944 }
1945 }
1946 else
1947 {
1948 if (!aligned_access_p (dr_info))
1949 {
1950 if (dump_enabled_p ())
1951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1952 "vector alignment may not be reachable\n");
1953 break;
1954 }
1955 }
1956 }
1957
1958 /* Check if we can possibly peel the loop. */
1959 if (!vect_can_advance_ivs_p (loop_vinfo)
1960 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1961 || loop->inner)
1962 do_peeling = false;
1963
1964 struct _vect_peel_extended_info peel_for_known_alignment;
1965 struct _vect_peel_extended_info peel_for_unknown_alignment;
1966 struct _vect_peel_extended_info best_peel;
1967
1968 peel_for_unknown_alignment.inside_cost = INT_MAX;
1969 peel_for_unknown_alignment.outside_cost = INT_MAX;
1970 peel_for_unknown_alignment.peel_info.count = 0;
1971
1972 if (do_peeling
1973 && one_misalignment_unknown)
1974 {
1975       /* Check whether the target prefers to align stores over loads, i.e., whether
1976          misaligned stores are more expensive than misaligned loads (taking
1977          DRs with the same alignment into account).  */
1978 unsigned int load_inside_cost = 0;
1979 unsigned int load_outside_cost = 0;
1980 unsigned int store_inside_cost = 0;
1981 unsigned int store_outside_cost = 0;
1982 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1983
1984 stmt_vector_for_cost dummy;
1985 dummy.create (2);
1986 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1987 &load_inside_cost,
1988 &load_outside_cost,
1989 &dummy, &dummy, estimated_npeels, true);
1990 dummy.release ();
1991
1992 if (first_store)
1993 {
1994 dummy.create (2);
1995 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1996 &store_inside_cost,
1997 &store_outside_cost,
1998 &dummy, &dummy,
1999 estimated_npeels, true);
2000 dummy.release ();
2001 }
2002 else
2003 {
2004 store_inside_cost = INT_MAX;
2005 store_outside_cost = INT_MAX;
2006 }
2007
2008 if (load_inside_cost > store_inside_cost
2009 || (load_inside_cost == store_inside_cost
2010 && load_outside_cost > store_outside_cost))
2011 {
2012 dr0_info = first_store;
2013 dr0_same_align_drs = first_store_same_align_drs;
2014 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2015 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2016 }
2017 else
2018 {
2019 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2020 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2021 }
2022
2023 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2024 prologue_cost_vec.create (2);
2025 epilogue_cost_vec.create (2);
2026
2027 int dummy2;
2028 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2029 (loop_vinfo, estimated_npeels, &dummy2,
2030 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2031 &prologue_cost_vec, &epilogue_cost_vec);
2032
2033 prologue_cost_vec.release ();
2034 epilogue_cost_vec.release ();
2035
2036 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2037 }
2038
2039 peel_for_unknown_alignment.peel_info.npeel = 0;
2040 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2041
2042 best_peel = peel_for_unknown_alignment;
2043
2044 peel_for_known_alignment.inside_cost = INT_MAX;
2045 peel_for_known_alignment.outside_cost = INT_MAX;
2046 peel_for_known_alignment.peel_info.count = 0;
2047 peel_for_known_alignment.peel_info.dr_info = NULL;
2048
2049 if (do_peeling && one_misalignment_known)
2050 {
2051       /* Peeling is possible, but no data access requires alignment in order
2052          to be supported.  So we try to choose the best possible peeling from
2053          the hash table.  */
2054 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2055 (&peeling_htab, loop_vinfo);
2056 }
2057
2058 /* Compare costs of peeling for known and unknown alignment. */
2059 if (peel_for_known_alignment.peel_info.dr_info != NULL
2060 && peel_for_unknown_alignment.inside_cost
2061 >= peel_for_known_alignment.inside_cost)
2062 {
2063 best_peel = peel_for_known_alignment;
2064
2065 /* If the best peeling for known alignment has NPEEL == 0, perform no
2066 peeling at all except if there is an unsupportable dr that we can
2067 align. */
2068 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2069 do_peeling = false;
2070 }
2071
2072 /* If there is an unsupportable data ref, prefer this over all choices so far
2073 since we'd have to discard a chosen peeling except when it accidentally
2074 aligned the unsupportable data ref. */
2075 if (one_dr_unsupportable)
2076 dr0_info = unsupportable_dr_info;
2077 else if (do_peeling)
2078 {
2079 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2080 TODO: Use nopeel_outside_cost or get rid of it? */
2081 unsigned nopeel_inside_cost = 0;
2082 unsigned nopeel_outside_cost = 0;
2083
2084 stmt_vector_for_cost dummy;
2085 dummy.create (2);
2086 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2087 &nopeel_outside_cost, &dummy, &dummy,
2088 0, false);
2089 dummy.release ();
2090
2091 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2092 costs will be recorded. */
2093 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2094 prologue_cost_vec.create (2);
2095 epilogue_cost_vec.create (2);
2096
2097 int dummy2;
2098 nopeel_outside_cost += vect_get_known_peeling_cost
2099 (loop_vinfo, 0, &dummy2,
2100 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2101 &prologue_cost_vec, &epilogue_cost_vec);
2102
2103 prologue_cost_vec.release ();
2104 epilogue_cost_vec.release ();
2105
2106 npeel = best_peel.peel_info.npeel;
2107 dr0_info = best_peel.peel_info.dr_info;
2108
2109       /* If not peeling is no more expensive than the best peeling we
2110          have found so far, don't perform any peeling.  */
2111 if (nopeel_inside_cost <= best_peel.inside_cost)
2112 do_peeling = false;
2113 }
2114
2115 if (do_peeling)
2116 {
2117 stmt_vec_info stmt_info = dr0_info->stmt;
2118 if (known_alignment_for_access_p (dr0_info))
2119 {
2120 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2121 size_zero_node) < 0;
2122 if (!npeel)
2123 {
2124 /* Since it's known at compile time, compute the number of
2125 iterations in the peeled loop (the peeling factor) for use in
2126 updating DR_MISALIGNMENT values. The peeling factor is the
2127 vectorization factor minus the misalignment as an element
2128 count. */
2129 mis = (negative
2130 ? DR_MISALIGNMENT (dr0_info)
2131 : -DR_MISALIGNMENT (dr0_info));
2132               /* If known_alignment_for_access_p then we have set
2133                  DR_MISALIGNMENT, which is only done if we know it at compile
2134                  time, so it is safe to assume the target alignment is
2135                  constant.  */
2136 unsigned int target_align =
2137 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2138 npeel = ((mis & (target_align - 1))
2139 / vect_get_scalar_dr_size (dr0_info));
2140 }
2141
2142 /* For interleaved data access every iteration accesses all the
2143 members of the group, therefore we divide the number of iterations
2144 by the group size. */
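          /* E.g., a peel of 8 elements over a group of 4 interleaved
             accesses corresponds to peeling 2 scalar loop iterations.  */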
2145 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2146 npeel /= DR_GROUP_SIZE (stmt_info);
2147
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_NOTE, vect_location,
2150 "Try peeling by %d\n", npeel);
2151 }
2152
2153 /* Ensure that all datarefs can be vectorized after the peel. */
2154 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2155 do_peeling = false;
2156
2157       /* If the known misalignment requires no peeling and all datarefs remain supportable as-is, we are done.  */
2158 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2159 return opt_result::success ();
2160
2161 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2162 if (do_peeling)
2163 {
2164 unsigned max_allowed_peel
2165 = param_vect_max_peeling_for_alignment;
2166 if (flag_vect_cost_model <= VECT_COST_MODEL_CHEAP)
2167 max_allowed_peel = 0;
2168 if (max_allowed_peel != (unsigned)-1)
2169 {
2170 unsigned max_peel = npeel;
2171 if (max_peel == 0)
2172 {
2173 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2174 unsigned HOST_WIDE_INT target_align_c;
2175 if (target_align.is_constant (&target_align_c))
2176 max_peel =
2177 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2178 else
2179 {
2180 do_peeling = false;
2181 if (dump_enabled_p ())
2182 dump_printf_loc (MSG_NOTE, vect_location,
2183 "Disable peeling, max peels set and vector"
2184 " alignment unknown\n");
2185 }
2186 }
2187 if (max_peel > max_allowed_peel)
2188 {
2189 do_peeling = false;
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_NOTE, vect_location,
2192 "Disable peeling, max peels reached: %d\n", max_peel);
2193 }
2194 }
2195 }
2196
2197 /* Cost model #2 - if peeling may result in a remaining loop not
2198 iterating enough to be vectorized then do not peel. Since this
2199 is a cost heuristic rather than a correctness decision, use the
2200 most likely runtime value for variable vectorization factors. */
2201 if (do_peeling
2202 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2203 {
2204 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2205 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2206 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2207 < assumed_vf + max_peel)
2208 do_peeling = false;
2209 }
2210
2211 if (do_peeling)
2212 {
2213 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2214 If the misalignment of DR_i is identical to that of dr0 then set
2215 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2216 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2217 by the peeling factor times the element size of DR_i (MOD the
2218 vectorization factor times the size). Otherwise, the
2219 misalignment of DR_i must be set to unknown. */
2220 FOR_EACH_VEC_ELT (datarefs, i, dr)
2221 if (dr != dr0_info->dr)
2222 {
2223 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2224 if (!vect_relevant_for_alignment_p (dr_info))
2225 continue;
2226
2227 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2228 }
2229
2230 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2231 if (npeel)
2232 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2233 else
2234 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2235 = DR_MISALIGNMENT (dr0_info);
2236 SET_DR_MISALIGNMENT (dr0_info, 0);
2237 if (dump_enabled_p ())
2238 {
2239 dump_printf_loc (MSG_NOTE, vect_location,
2240 "Alignment of access forced using peeling.\n");
2241 dump_printf_loc (MSG_NOTE, vect_location,
2242 "Peeling for alignment will be applied.\n");
2243 }
2244
2245 /* The inside-loop cost will be accounted for in vectorizable_load
2246 and vectorizable_store correctly with adjusted alignments.
2247 Drop the body_cst_vec on the floor here. */
2248 return opt_result::success ();
2249 }
2250 }
2251
2252 /* (2) Versioning to force alignment. */
2253
2254 /* Try versioning if:
2255      1) the loop is optimized for speed and the cost model is not cheap, and
2256 2) there is at least one unsupported misaligned data ref with an unknown
2257 misalignment, and
2258 3) all misaligned data refs with a known misalignment are supported, and
2259 4) the number of runtime alignment checks is within reason. */
2260
2261 do_versioning
2262 = (optimize_loop_nest_for_speed_p (loop)
2263 && !loop->inner /* FORNOW */
2264 && flag_vect_cost_model > VECT_COST_MODEL_CHEAP);
2265
2266 if (do_versioning)
2267 {
2268 FOR_EACH_VEC_ELT (datarefs, i, dr)
2269 {
2270 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2271 if (aligned_access_p (dr_info)
2272 || !vect_relevant_for_alignment_p (dr_info))
2273 continue;
2274
2275 stmt_vec_info stmt_info = dr_info->stmt;
2276 if (STMT_VINFO_STRIDED_P (stmt_info))
2277 {
2278 do_versioning = false;
2279 break;
2280 }
2281
2282 supportable_dr_alignment
2283 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
2284
2285 if (!supportable_dr_alignment)
2286 {
2287 int mask;
2288 tree vectype;
2289
2290 if (known_alignment_for_access_p (dr_info)
2291 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2292 >= (unsigned) param_vect_max_version_for_alignment_checks)
2293 {
2294 do_versioning = false;
2295 break;
2296 }
2297
2298 vectype = STMT_VINFO_VECTYPE (stmt_info);
2299 gcc_assert (vectype);
2300
2301 /* At present we don't support versioning for alignment
2302 with variable VF, since there's no guarantee that the
2303 VF is a power of two. We could relax this if we added
2304 a way of enforcing a power-of-two size. */
2305 unsigned HOST_WIDE_INT size;
2306 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2307 {
2308 do_versioning = false;
2309 break;
2310 }
2311
2312 /* Forcing alignment in the first iteration is no good if
2313 we don't keep it across iterations. For now, just disable
2314 versioning in this case.
2315 ?? We could actually unroll the loop to achieve the required
2316 overall step alignment, and forcing the alignment could be
2317 done by doing some iterations of the non-vectorized loop. */
2318 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2319 * DR_STEP_ALIGNMENT (dr),
2320 DR_TARGET_ALIGNMENT (dr_info)))
2321 {
2322 do_versioning = false;
2323 break;
2324 }
2325
2326 /* The rightmost bits of an aligned address must be zeros.
2327 Construct the mask needed for this test. For example,
2328 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2329 mask must be 15 = 0xf. */
2330 mask = size - 1;
2331
2332 /* FORNOW: use the same mask to test all potentially unaligned
2333 references in the loop. */
2334 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2335 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2336 {
2337 do_versioning = false;
2338 break;
2339 }
2340
2341 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2342 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2343 }
2344 }
2345
2346 /* Versioning requires at least one misaligned data reference. */
2347 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2348 do_versioning = false;
2349 else if (!do_versioning)
2350 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2351 }
2352
2353 if (do_versioning)
2354 {
2355 vec<stmt_vec_info> may_misalign_stmts
2356 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2357 stmt_vec_info stmt_info;
2358
2359 /* It can now be assumed that the data references in the statements
2360 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2361 of the loop being vectorized. */
2362 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2363 {
2364 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2365 SET_DR_MISALIGNMENT (dr_info, 0);
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_NOTE, vect_location,
2368 "Alignment of access forced using versioning.\n");
2369 }
2370
2371 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_NOTE, vect_location,
2373 "Versioning for alignment will be applied.\n");
2374
2375 /* Peeling and versioning can't be done together at this time. */
2376 gcc_assert (! (do_peeling && do_versioning));
2377
2378 return opt_result::success ();
2379 }
2380
2381 /* This point is reached if neither peeling nor versioning is being done. */
2382 gcc_assert (! (do_peeling || do_versioning));
2383
2384 return opt_result::success ();
2385 }
2386
2387
2388 /* Function vect_analyze_data_refs_alignment
2389
2390 Analyze the alignment of the data-references in the loop.
2391 Return FALSE if a data reference is found that cannot be vectorized. */
2392
2393 opt_result
2394 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2395 {
2396 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2397
2398 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2399 struct data_reference *dr;
2400 unsigned int i;
2401
2402 vect_record_base_alignments (loop_vinfo);
2403 FOR_EACH_VEC_ELT (datarefs, i, dr)
2404 {
2405 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2406 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2407 vect_compute_data_ref_alignment (loop_vinfo, dr_info);
2408 }
2409
2410 return opt_result::success ();
2411 }
2412
2413
2414 /* Analyze alignment of DRs of stmts in NODE. */
2415
2416 static bool
2417 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2418 {
2419   /* We vectorize from the first scalar stmt in the node unless
2420      the node is permuted, in which case we start from the first
2421      element in the group.  */
2422 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2423 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2424 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2425 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2426
2427 /* We need to commit to a vector type for the group now. */
2428 if (is_a <bb_vec_info> (vinfo)
2429 && !vect_update_shared_vectype (first_stmt_info, SLP_TREE_VECTYPE (node)))
2430 {
2431 if (dump_enabled_p ())
2432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2433 "desired vector type conflicts with earlier one "
2434 "for %G", first_stmt_info->stmt);
2435 return false;
2436 }
2437
2438 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2439 vect_compute_data_ref_alignment (vinfo, dr_info);
2440 /* In several places we need alignment of the first element anyway. */
2441 if (dr_info != first_dr_info)
2442 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2443
2444 /* For creating the data-ref pointer we need alignment of the
2445 first element as well. */
2446 first_stmt_info
2447 = vect_stmt_to_vectorize (vect_find_first_scalar_stmt_in_slp (node));
2448 if (first_stmt_info != SLP_TREE_SCALAR_STMTS (node)[0])
2449 {
2450 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2451 if (dr_info != first_dr_info)
2452 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2453 }
2454
2455 return true;
2456 }
2457
2458 /* Function vect_slp_analyze_instance_alignment
2459
2460 Analyze the alignment of the data-references in the SLP instance.
2461 Return FALSE if a data reference is found that cannot be vectorized. */
2462
2463 bool
2464 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2465 slp_instance instance)
2466 {
2467 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2468
2469 slp_tree node;
2470 unsigned i;
2471 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2472 if (! vect_slp_analyze_node_alignment (vinfo, node))
2473 return false;
2474
2475 node = SLP_INSTANCE_TREE (instance);
2476 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2477 && ! vect_slp_analyze_node_alignment
2478 (vinfo, SLP_INSTANCE_TREE (instance)))
2479 return false;
2480
2481 return true;
2482 }
2483
2484
2485 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2486 accesses of legal size, step, etc. Detect gaps, single element
2487 interleaving, and other special cases. Set grouped access info.
2488 Collect groups of strided stores for further use in SLP analysis.
2489 Worker for vect_analyze_group_access. */
2490
2491 static bool
2492 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2493 {
2494 data_reference *dr = dr_info->dr;
2495 tree step = DR_STEP (dr);
2496 tree scalar_type = TREE_TYPE (DR_REF (dr));
2497 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2498 stmt_vec_info stmt_info = dr_info->stmt;
2499 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2500 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2501 HOST_WIDE_INT dr_step = -1;
2502 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2503 bool slp_impossible = false;
2504
2505 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2506 size of the interleaving group (including gaps). */
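  /* E.g., a constant step of 32 bytes over 4-byte elements corresponds to
     a group size of 32 / 4 = 8 elements, gaps included.  */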
2507 if (tree_fits_shwi_p (step))
2508 {
2509 dr_step = tree_to_shwi (step);
2510 /* Check that STEP is a multiple of type size. Otherwise there is
2511 a non-element-sized gap at the end of the group which we
2512 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2513 ??? As we can handle non-constant step fine here we should
2514 simply remove uses of DR_GROUP_GAP between the last and first
2515 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2516 simply not include that gap. */
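      /* E.g., a step of 10 bytes over 4-byte elements would leave a
         2-byte tail gap that cannot be expressed in whole elements.  */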
2517 if ((dr_step % type_size) != 0)
2518 {
2519 if (dump_enabled_p ())
2520 dump_printf_loc (MSG_NOTE, vect_location,
2521 "Step %T is not a multiple of the element size"
2522 " for %T\n",
2523 step, DR_REF (dr));
2524 return false;
2525 }
2526 groupsize = absu_hwi (dr_step) / type_size;
2527 }
2528 else
2529 groupsize = 0;
2530
2531   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2532 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2533 {
2534       /* Check if this DR is part of an interleaving group, and is a single
2535          element of the group that is accessed in the loop.  */
2536
2537 /* Gaps are supported only for loads. STEP must be a multiple of the type
2538 size. */
2539 if (DR_IS_READ (dr)
2540 && (dr_step % type_size) == 0
2541 && groupsize > 0
2542 /* This could be UINT_MAX but as we are generating code in a very
2543 inefficient way we have to cap earlier.
2544 See PR91403 for example. */
2545 && groupsize <= 4096)
2546 {
2547 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2548 DR_GROUP_SIZE (stmt_info) = groupsize;
2549 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_NOTE, vect_location,
2552 "Detected single element interleaving %T"
2553 " step %T\n",
2554 DR_REF (dr), step);
2555
2556 return true;
2557 }
2558
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2561 "not consecutive access %G", stmt_info->stmt);
2562
2563 if (bb_vinfo)
2564 {
2565 /* Mark the statement as unvectorizable. */
2566 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2567 return true;
2568 }
2569
2570 if (dump_enabled_p ())
2571 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2572 STMT_VINFO_STRIDED_P (stmt_info) = true;
2573 return true;
2574 }
2575
2576 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2577 {
2578 /* First stmt in the interleaving chain. Check the chain. */
2579 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2580 struct data_reference *data_ref = dr;
2581 unsigned int count = 1;
2582 tree prev_init = DR_INIT (data_ref);
2583 HOST_WIDE_INT diff, gaps = 0;
2584
2585 /* By construction, all group members have INTEGER_CST DR_INITs. */
2586 while (next)
2587 {
2588 /* We never have the same DR multiple times. */
2589 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2590 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2591
2592 data_ref = STMT_VINFO_DATA_REF (next);
2593
2594 /* All group members have the same STEP by construction. */
2595 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2596
2597 /* Check that the distance between two accesses is equal to the type
2598 size. Otherwise, we have gaps. */
2599 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2600 - TREE_INT_CST_LOW (prev_init)) / type_size;
2601 if (diff != 1)
2602 {
2603 /* FORNOW: SLP of accesses with gaps is not supported. */
2604 slp_impossible = true;
2605 if (DR_IS_WRITE (data_ref))
2606 {
2607 if (dump_enabled_p ())
2608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2609 "interleaved store with gaps\n");
2610 return false;
2611 }
2612
2613 gaps += diff - 1;
2614 }
2615
2616 last_accessed_element += diff;
2617
2618 /* Store the gap from the previous member of the group. If there is no
2619 gap in the access, DR_GROUP_GAP is always 1. */
2620 DR_GROUP_GAP (next) = diff;
2621
2622 prev_init = DR_INIT (data_ref);
2623 next = DR_GROUP_NEXT_ELEMENT (next);
2624 /* Count the number of data-refs in the chain. */
2625 count++;
2626 }
2627
2628 if (groupsize == 0)
2629 groupsize = count + gaps;
2630
2631 /* This could be UINT_MAX but as we are generating code in a very
2632 inefficient way we have to cap earlier. See PR78699 for example. */
2633 if (groupsize > 4096)
2634 {
2635 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2637 "group is too large\n");
2638 return false;
2639 }
2640
2641 /* Check that the size of the interleaving is equal to count for stores,
2642 i.e., that there are no gaps. */
2643 if (groupsize != count
2644 && !DR_IS_READ (dr))
2645 {
2646 groupsize = count;
2647 STMT_VINFO_STRIDED_P (stmt_info) = true;
2648 }
2649
2650 /* If there is a gap after the last load in the group it is the
2651 difference between the groupsize and the last accessed
2652 element.
2653 When there is no gap, this difference should be 0. */
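      /* E.g., a group size of 4 with a last accessed element of 3 leaves
         a trailing gap of one element.  */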
2654 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2655
2656 DR_GROUP_SIZE (stmt_info) = groupsize;
2657 if (dump_enabled_p ())
2658 {
2659 dump_printf_loc (MSG_NOTE, vect_location,
2660 "Detected interleaving ");
2661 if (DR_IS_READ (dr))
2662 dump_printf (MSG_NOTE, "load ");
2663 else if (STMT_VINFO_STRIDED_P (stmt_info))
2664 dump_printf (MSG_NOTE, "strided store ");
2665 else
2666 dump_printf (MSG_NOTE, "store ");
2667 dump_printf (MSG_NOTE, "of size %u\n",
2668 (unsigned)groupsize);
2669 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2670 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2671 while (next)
2672 {
2673 if (DR_GROUP_GAP (next) != 1)
2674 dump_printf_loc (MSG_NOTE, vect_location,
2675 "\t<gap of %d elements>\n",
2676 DR_GROUP_GAP (next) - 1);
2677 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2678 next = DR_GROUP_NEXT_ELEMENT (next);
2679 }
2680 if (DR_GROUP_GAP (stmt_info) != 0)
2681 dump_printf_loc (MSG_NOTE, vect_location,
2682 "\t<gap of %d elements>\n",
2683 DR_GROUP_GAP (stmt_info));
2684 }
2685
2686 /* SLP: create an SLP data structure for every interleaving group of
2687         stores for further analysis in vect_analyze_slp.  */
2688 if (DR_IS_WRITE (dr) && !slp_impossible)
2689 {
2690 if (loop_vinfo)
2691 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2692 if (bb_vinfo)
2693 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2694 }
2695 }
2696
2697 return true;
2698 }
2699
2700 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2701 accesses of legal size, step, etc. Detect gaps, single element
2702 interleaving, and other special cases. Set grouped access info.
2703 Collect groups of strided stores for further use in SLP analysis. */
2704
2705 static bool
2706 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2707 {
2708 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2709 {
2710 /* Dissolve the group if present. */
2711 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2712 while (stmt_info)
2713 {
2714 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2715 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2716 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2717 stmt_info = next;
2718 }
2719 return false;
2720 }
2721 return true;
2722 }
2723
2724 /* Analyze the access pattern of the data-reference DR_INFO.
2725 In case of non-consecutive accesses call vect_analyze_group_access() to
2726 analyze groups of accesses. */
2727
2728 static bool
2729 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2730 {
2731 data_reference *dr = dr_info->dr;
2732 tree step = DR_STEP (dr);
2733 tree scalar_type = TREE_TYPE (DR_REF (dr));
2734 stmt_vec_info stmt_info = dr_info->stmt;
2735 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2736 class loop *loop = NULL;
2737
2738 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2739 return true;
2740
2741 if (loop_vinfo)
2742 loop = LOOP_VINFO_LOOP (loop_vinfo);
2743
2744 if (loop_vinfo && !step)
2745 {
2746 if (dump_enabled_p ())
2747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2748 "bad data-ref access in loop\n");
2749 return false;
2750 }
2751
2752 /* Allow loads with zero step in inner-loop vectorization. */
2753 if (loop_vinfo && integer_zerop (step))
2754 {
2755 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2756 if (!nested_in_vect_loop_p (loop, stmt_info))
2757 return DR_IS_READ (dr);
2758 /* Allow references with zero step for outer loops marked
2759 with pragma omp simd only - it guarantees absence of
2760 loop-carried dependencies between inner loop iterations. */
2761 if (loop->safelen < 2)
2762 {
2763 if (dump_enabled_p ())
2764 dump_printf_loc (MSG_NOTE, vect_location,
2765 "zero step in inner loop of nest\n");
2766 return false;
2767 }
2768 }
2769
2770 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2771 {
2772 /* Interleaved accesses are not yet supported within outer-loop
2773 vectorization for references in the inner-loop. */
2774 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2775
2776 /* For the rest of the analysis we use the outer-loop step. */
2777 step = STMT_VINFO_DR_STEP (stmt_info);
2778 if (integer_zerop (step))
2779 {
2780 if (dump_enabled_p ())
2781 dump_printf_loc (MSG_NOTE, vect_location,
2782 "zero step in outer loop.\n");
2783 return DR_IS_READ (dr);
2784 }
2785 }
2786
2787 /* Consecutive? */
2788 if (TREE_CODE (step) == INTEGER_CST)
2789 {
2790 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2791 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2792 || (dr_step < 0
2793 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2794 {
2795 /* Mark that it is not interleaving. */
2796 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2797 return true;
2798 }
2799 }
2800
2801 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2802 {
2803 if (dump_enabled_p ())
2804 dump_printf_loc (MSG_NOTE, vect_location,
2805 "grouped access in outer loop.\n");
2806 return false;
2807 }
2808
2809
2810 /* Assume this is a DR handled by non-constant strided load case. */
2811 if (TREE_CODE (step) != INTEGER_CST)
2812 return (STMT_VINFO_STRIDED_P (stmt_info)
2813 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2814 || vect_analyze_group_access (vinfo, dr_info)));
2815
2816   /* Not a consecutive access - check if it's part of an interleaving group.  */
2817 return vect_analyze_group_access (vinfo, dr_info);
2818 }
2819
2820 typedef std::pair<data_reference_p, int> data_ref_pair;
2821
2822 /* Compare two data-references DRA and DRB to group them into chunks
2823 suitable for grouping. */
2824
2825 static int
2826 dr_group_sort_cmp (const void *dra_, const void *drb_)
2827 {
2828 data_ref_pair dra_pair = *(data_ref_pair *)const_cast<void *>(dra_);
2829 data_ref_pair drb_pair = *(data_ref_pair *)const_cast<void *>(drb_);
2830 data_reference_p dra = dra_pair.first;
2831 data_reference_p drb = drb_pair.first;
2832 int cmp;
2833
2834 /* Stabilize sort. */
2835 if (dra == drb)
2836 return 0;
2837
2838 /* DRs in different basic-blocks never belong to the same group. */
2839 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
2840 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
2841 if (bb_index1 != bb_index2)
2842 return bb_index1 < bb_index2 ? -1 : 1;
2843
2844   /* DRs with different group IDs never belong to the same group.  */
2845 if (dra_pair.second != drb_pair.second)
2846 return dra_pair.second < drb_pair.second ? -1 : 1;
2847
2848 /* Ordering of DRs according to base. */
2849 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2850 DR_BASE_ADDRESS (drb));
2851 if (cmp != 0)
2852 return cmp;
2853
2854 /* And according to DR_OFFSET. */
2855 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2856 if (cmp != 0)
2857 return cmp;
2858
2859 /* Put reads before writes. */
2860 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2861 return DR_IS_READ (dra) ? -1 : 1;
2862
2863 /* Then sort after access size. */
2864 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2865 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2866 if (cmp != 0)
2867 return cmp;
2868
2869 /* And after step. */
2870 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2871 if (cmp != 0)
2872 return cmp;
2873
2874 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2875 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2876 if (cmp == 0)
2877 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2878 return cmp;
2879 }
2880
2881 /* If OP is the result of a conversion, return the unconverted value,
2882 otherwise return null. */
2883
2884 static tree
2885 strip_conversion (tree op)
2886 {
2887 if (TREE_CODE (op) != SSA_NAME)
2888 return NULL_TREE;
2889 gimple *stmt = SSA_NAME_DEF_STMT (op);
2890 if (!is_gimple_assign (stmt)
2891 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2892 return NULL_TREE;
2893 return gimple_assign_rhs1 (stmt);
2894 }
2895
2896 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2897    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads
2898    with different masks can still be grouped in SLP mode.  */
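/* For example, two IFN_MASK_LOAD calls that use the same mask (possibly
   via a conversion such as those created by build_mask_conversion) can be
   grouped, whereas a masked load and a masked store cannot.  */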
2899
2900 static bool
2901 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2902 bool allow_slp_p)
2903 {
2904 if (gimple_assign_single_p (stmt1_info->stmt))
2905 return gimple_assign_single_p (stmt2_info->stmt);
2906
2907 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2908 if (call1 && gimple_call_internal_p (call1))
2909 {
2910 /* Check for two masked loads or two masked stores. */
2911 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2912 if (!call2 || !gimple_call_internal_p (call2))
2913 return false;
2914 internal_fn ifn = gimple_call_internal_fn (call1);
2915 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2916 return false;
2917 if (ifn != gimple_call_internal_fn (call2))
2918 return false;
2919
2920 /* Check that the masks are the same. Cope with casts of masks,
2921 like those created by build_mask_conversion. */
2922 tree mask1 = gimple_call_arg (call1, 2);
2923 tree mask2 = gimple_call_arg (call2, 2);
2924 if (!operand_equal_p (mask1, mask2, 0)
2925 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2926 {
2927 mask1 = strip_conversion (mask1);
2928 if (!mask1)
2929 return false;
2930 mask2 = strip_conversion (mask2);
2931 if (!mask2)
2932 return false;
2933 if (!operand_equal_p (mask1, mask2, 0))
2934 return false;
2935 }
2936 return true;
2937 }
2938
2939 return false;
2940 }
2941
2942 /* Function vect_analyze_data_ref_accesses.
2943
2944 Analyze the access pattern of all the data references in the loop.
2945
2946 FORNOW: the only access pattern that is considered vectorizable is a
2947 simple step 1 (consecutive) access.
2948
2949 FORNOW: handle only arrays and pointer accesses. */
2950
2951 opt_result
2952 vect_analyze_data_ref_accesses (vec_info *vinfo,
2953 vec<int> *dataref_groups)
2954 {
2955 unsigned int i;
2956 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2957
2958 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2959
2960 if (datarefs.is_empty ())
2961 return opt_result::success ();
2962
2963 /* Sort the array of datarefs to make building the interleaving chains
2964      linear.  Don't modify the original vector's order; it is needed for
2965      determining which dependencies are reversed.  */
2966 vec<data_ref_pair> datarefs_copy;
2967 datarefs_copy.create (datarefs.length ());
2968 for (unsigned i = 0; i < datarefs.length (); i++)
2969 {
2970 int group_id = dataref_groups ? (*dataref_groups)[i] : 0;
2971 datarefs_copy.quick_push (data_ref_pair (datarefs[i], group_id));
2972 }
2973 datarefs_copy.qsort (dr_group_sort_cmp);
2974 hash_set<stmt_vec_info> to_fixup;
2975
2976 /* Build the interleaving chains. */
2977 for (i = 0; i < datarefs_copy.length () - 1;)
2978 {
2979 data_reference_p dra = datarefs_copy[i].first;
2980 int dra_group_id = datarefs_copy[i].second;
2981 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2982 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2983 stmt_vec_info lastinfo = NULL;
2984 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2985 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2986 {
2987 ++i;
2988 continue;
2989 }
2990 for (i = i + 1; i < datarefs_copy.length (); ++i)
2991 {
2992 data_reference_p drb = datarefs_copy[i].first;
2993 int drb_group_id = datarefs_copy[i].second;
2994 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2995 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2996 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2997 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2998 break;
2999
3000           /* ??? Imperfect sorting (non-compatible types, non-modulo
3001              accesses, same accesses) can lead to a group being artificially
3002              split here as we don't just skip over those.  If it really
3003              matters we can push those to a worklist and re-iterate
3004              over them.  Then we can just skip ahead to the next DR here.  */
3005
3006           /* DRs in different BBs should not be put into the same
3007              interleaving group.  */
3008 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
3009 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
3010 if (bb_index1 != bb_index2)
3011 break;
3012
3013 if (dra_group_id != drb_group_id)
3014 break;
3015
3016           /* Check that the data-refs have the same first location (except init)
3017              and that they are both either stores or loads (not a load and a store,
3018              not incompatible masked loads or stores).  */
3019 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3020 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3021 DR_BASE_ADDRESS (drb)) != 0
3022 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3023 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3024 break;
3025
3026 /* Check that the data-refs have the same constant size. */
3027 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3028 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3029 if (!tree_fits_uhwi_p (sza)
3030 || !tree_fits_uhwi_p (szb)
3031 || !tree_int_cst_equal (sza, szb))
3032 break;
3033
3034 /* Check that the data-refs have the same step. */
3035 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3036 break;
3037
3038 /* Check the types are compatible.
3039 ??? We don't distinguish this during sorting. */
3040 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3041 TREE_TYPE (DR_REF (drb))))
3042 break;
3043
3044 /* Check that the DR_INITs are compile-time constants. */
3045 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3046 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3047 break;
3048
3049 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3050 just hold extra information. */
3051 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3052 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3053 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3054 break;
3055
3056 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3057 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3058 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3059 HOST_WIDE_INT init_prev
3060 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1].first));
3061 gcc_assert (init_a <= init_b
3062 && init_a <= init_prev
3063 && init_prev <= init_b);
3064
3065 /* Do not place the same access in the interleaving chain twice. */
3066 if (init_b == init_prev)
3067 {
3068 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1].first))
3069 < gimple_uid (DR_STMT (drb)));
3070 /* Simply link in duplicates and fix up the chain below. */
3071 }
3072 else
3073 {
3074 /* If init_b == init_a + the size of the type * k, we have an
3075 interleaving, and DRA is accessed before DRB. */
3076 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3077 if (type_size_a == 0
3078 || (init_b - init_a) % type_size_a != 0)
3079 break;
3080
3081               /* If we have a store, require the accesses to be adjacent.  This splits
3082                  groups into chunks we support (we don't support vectorization
3083                  of stores with gaps).  */
3084 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3085 break;
3086
3087               /* If the step (when constant and nonzero) is not larger than the
3088                  difference between the data-refs' inits, this splits groups into
3089                  suitable sizes.  */
3090 if (tree_fits_shwi_p (DR_STEP (dra)))
3091 {
3092 unsigned HOST_WIDE_INT step
3093 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3094 if (step != 0
3095 && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3096 break;
3097 }
3098 }
3099
3100 if (dump_enabled_p ())
3101 dump_printf_loc (MSG_NOTE, vect_location,
3102 DR_IS_READ (dra)
3103 ? "Detected interleaving load %T and %T\n"
3104 : "Detected interleaving store %T and %T\n",
3105 DR_REF (dra), DR_REF (drb));
3106
3107 /* Link the found element into the group list. */
3108 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3109 {
3110 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3111 lastinfo = stmtinfo_a;
3112 }
3113 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3114 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3115 lastinfo = stmtinfo_b;
3116
3117 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3118 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3119
3120 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3121 dump_printf_loc (MSG_NOTE, vect_location,
3122 "Load suitable for SLP vectorization only.\n");
3123
3124 if (init_b == init_prev
3125 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3126 && dump_enabled_p ())
3127 dump_printf_loc (MSG_NOTE, vect_location,
3128 "Queuing group with duplicate access for fixup\n");
3129 }
3130 }
3131
3132   /* Fix up groups with duplicate entries by splitting them.  */
3133 while (1)
3134 {
3135 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3136 if (!(it != to_fixup.end ()))
3137 break;
3138 stmt_vec_info grp = *it;
3139 to_fixup.remove (grp);
3140
3141 /* Find the earliest duplicate group member. */
3142 unsigned first_duplicate = -1u;
3143 stmt_vec_info next, g = grp;
3144 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3145 {
3146 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3147 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3148 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3149 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3150 g = next;
3151 }
3152 if (first_duplicate == -1U)
3153 continue;
3154
3155 /* Then move all stmts after the first duplicate to a new group.
3156 Note this is a heuristic but one with the property that *it
3157 is fixed up completely. */
3158 g = grp;
3159 stmt_vec_info newgroup = NULL, ng = grp;
3160 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3161 {
3162 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3163 {
3164 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3165 if (!newgroup)
3166 newgroup = next;
3167 else
3168 DR_GROUP_NEXT_ELEMENT (ng) = next;
3169 ng = next;
3170 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3171 }
3172 else
3173 g = DR_GROUP_NEXT_ELEMENT (g);
3174 }
3175 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3176
3177       /* Fix up the new group, which may still contain duplicates.  */
3178 to_fixup.add (newgroup);
3179 }
3180
3181 data_ref_pair *dr_pair;
3182 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_pair)
3183 {
3184 dr_vec_info *dr_info = vinfo->lookup_dr (dr_pair->first);
3185 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3186 && !vect_analyze_data_ref_access (vinfo, dr_info))
3187 {
3188 if (dump_enabled_p ())
3189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3190 "not vectorized: complicated access pattern.\n");
3191
3192 if (is_a <bb_vec_info> (vinfo))
3193 {
3194 /* Mark the statement as not vectorizable. */
3195 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3196 continue;
3197 }
3198 else
3199 {
3200 datarefs_copy.release ();
3201 return opt_result::failure_at (dr_info->stmt->stmt,
3202 "not vectorized:"
3203 " complicated access pattern.\n");
3204 }
3205 }
3206 }
3207
3208 datarefs_copy.release ();
3209 return opt_result::success ();
3210 }
3211
3212 /* Function vect_vfa_segment_size.
3213
3214 Input:
3215 DR_INFO: The data reference.
3216 LENGTH_FACTOR: segment length to consider.
3217
3218 Return a value suitable for the dr_with_seg_len::seg_len field.
3219 This is the "distance travelled" by the pointer from the first
3220 iteration in the segment to the last. Note that it does not include
3221 the size of the access; in effect it only describes the first byte. */
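/* For example, a DR with DR_STEP of 4 bytes and a LENGTH_FACTOR of 8 gets
   a segment length of (8 - 1) * 4 = 28 bytes.  */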
3222
3223 static tree
3224 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3225 {
3226 length_factor = size_binop (MINUS_EXPR,
3227 fold_convert (sizetype, length_factor),
3228 size_one_node);
3229 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3230 length_factor);
3231 }
3232
3233 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3234 gives the worst-case number of bytes covered by the segment. */
3235
3236 static unsigned HOST_WIDE_INT
3237 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3238 {
3239 stmt_vec_info stmt_vinfo = dr_info->stmt;
3240 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3241 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3242 unsigned HOST_WIDE_INT access_size = ref_size;
3243 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3244 {
3245 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3246 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3247 }
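  /* E.g., a 4-byte scalar reference leading a group of size 8 with a gap
     of 2 gives an access size of 4 * (8 - 2) = 24 bytes.  */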
3248 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3249 && (vect_supportable_dr_alignment (vinfo, dr_info, false)
3250 == dr_explicit_realign_optimized))
3251 {
3252 /* We might access a full vector's worth. */
3253 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3254 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3255 }
3256 return access_size;
3257 }
3258
3259 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3260 describes. */
3261
3262 static unsigned int
3263 vect_vfa_align (dr_vec_info *dr_info)
3264 {
3265 return dr_alignment (dr_info->dr);
3266 }
3267
3268 /* Function vect_no_alias_p.
3269
3270 Given data references A and B with equal base and offset, see whether
3271 the alias relation can be decided at compilation time. Return 1 if
3272 it can and the references alias, 0 if it can and the references do
3273 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3274 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3275 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3276
3277 static int
3278 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3279 tree segment_length_a, tree segment_length_b,
3280 unsigned HOST_WIDE_INT access_size_a,
3281 unsigned HOST_WIDE_INT access_size_b)
3282 {
3283 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3284 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3285 poly_uint64 const_length_a;
3286 poly_uint64 const_length_b;
3287
3288 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3289 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3290 [a, a+12) */
3291 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3292 {
3293 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3294 offset_a -= const_length_a;
3295 }
3296 else
3297 const_length_a = tree_to_poly_uint64 (segment_length_a);
3298 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3299 {
3300 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3301 offset_b -= const_length_b;
3302 }
3303 else
3304 const_length_b = tree_to_poly_uint64 (segment_length_b);
3305
3306 const_length_a += access_size_a;
3307 const_length_b += access_size_b;
3308
3309 if (ranges_known_overlap_p (offset_a, const_length_a,
3310 offset_b, const_length_b))
3311 return 1;
3312
3313 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3314 offset_b, const_length_b))
3315 return 0;
3316
3317 return -1;
3318 }
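/* As a worked example (values chosen for illustration only): two references
   with equal base and offset, DR_INITs of 0 and 64, positive steps, segment
   lengths of 28 and access sizes of 4 cover [0, 32) and [64, 96); these
   cannot overlap, so the function returns 0.  With a second DR_INIT of 16
   the ranges [0, 32) and [16, 48) are known to overlap and it returns 1.  */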
3319
3320 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3321 in DDR is >= VF. */
3322
3323 static bool
3324 dependence_distance_ge_vf (data_dependence_relation *ddr,
3325 unsigned int loop_depth, poly_uint64 vf)
3326 {
3327 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3328 || DDR_NUM_DIST_VECTS (ddr) == 0)
3329 return false;
3330
3331 /* If the dependence is exact, we should have limited the VF instead. */
3332 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3333
3334 unsigned int i;
3335 lambda_vector dist_v;
3336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3337 {
3338 HOST_WIDE_INT dist = dist_v[loop_depth];
3339 if (dist != 0
3340 && !(dist > 0 && DDR_REVERSED_P (ddr))
3341 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3342 return false;
3343 }
3344
3345 if (dump_enabled_p ())
3346 dump_printf_loc (MSG_NOTE, vect_location,
3347 "dependence distance between %T and %T is >= VF\n",
3348 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3349
3350 return true;
3351 }
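/* As a worked example (values chosen for illustration only, assuming the
   dependence is not reversed): a single distance vector whose entry at
   LOOP_DEPTH is 16 makes this return true for a VF of 8 or 16, but false
   for a VF of 32, since the distance would then be smaller than the VF.  */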
3352
3353 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3354
3355 static void
3356 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3357 {
3358 dump_printf (dump_kind, "%s (%T) >= ",
3359 lower_bound.unsigned_p ? "unsigned" : "abs",
3360 lower_bound.expr);
3361 dump_dec (dump_kind, lower_bound.min_value);
3362 }
3363
3364 /* Record that the vectorized loop requires the vec_lower_bound described
3365 by EXPR, UNSIGNED_P and MIN_VALUE. */
3366
3367 static void
3368 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3369 poly_uint64 min_value)
3370 {
3371 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3372 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3373 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3374 {
3375 unsigned_p &= lower_bounds[i].unsigned_p;
3376 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3377 if (lower_bounds[i].unsigned_p != unsigned_p
3378 || maybe_lt (lower_bounds[i].min_value, min_value))
3379 {
3380 lower_bounds[i].unsigned_p = unsigned_p;
3381 lower_bounds[i].min_value = min_value;
3382 if (dump_enabled_p ())
3383 {
3384 dump_printf_loc (MSG_NOTE, vect_location,
3385 "updating run-time check to ");
3386 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3387 dump_printf (MSG_NOTE, "\n");
3388 }
3389 }
3390 return;
3391 }
3392
3393 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3394 if (dump_enabled_p ())
3395 {
3396 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3397 dump_lower_bound (MSG_NOTE, lower_bound);
3398 dump_printf (MSG_NOTE, "\n");
3399 }
3400 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3401 }
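/* As a worked example (values chosen for illustration only): if a check
   "abs (x) >= 4" is already recorded and a caller later asks for
   "unsigned x >= 8", the entry is merged into "abs (x) >= 8", which is
   strong enough to satisfy both requests.  */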
3402
3403 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3404 will span fewer than GAP bytes. */
3405
3406 static bool
3407 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3408 poly_int64 gap)
3409 {
3410 stmt_vec_info stmt_info = dr_info->stmt;
3411 HOST_WIDE_INT count
3412 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3413 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3414 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3415 return (estimated_poly_value (gap)
3416 <= count * vect_get_scalar_dr_size (dr_info));
3417 }
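/* As a worked example (values chosen for illustration only): with an
   estimated VF of 8, a 4-byte scalar access and no group, any GAP of up
   to 32 bytes counts as "small", i.e. unlikely to exceed the bytes covered
   by one vector iteration.  */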
3418
3419 /* Return true if we know that there is no alias between DR_INFO_A and
3420 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3421 When returning true, set *LOWER_BOUND_OUT to this N. */
3422
3423 static bool
3424 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3425 poly_uint64 *lower_bound_out)
3426 {
3427 /* Check that there is a constant gap of known sign between DR_A
3428 and DR_B. */
3429 data_reference *dr_a = dr_info_a->dr;
3430 data_reference *dr_b = dr_info_b->dr;
3431 poly_int64 init_a, init_b;
3432 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3433 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3434 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3435 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3436 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3437 || !ordered_p (init_a, init_b))
3438 return false;
3439
3440 /* Sort DR_A and DR_B by the address they access. */
3441 if (maybe_lt (init_b, init_a))
3442 {
3443 std::swap (init_a, init_b);
3444 std::swap (dr_info_a, dr_info_b);
3445 std::swap (dr_a, dr_b);
3446 }
3447
3448 /* If the two accesses could be dependent within a scalar iteration,
3449 make sure that we'd retain their order. */
3450 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3451 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3452 return false;
3453
3454 /* There is no alias if abs (DR_STEP) is greater than or equal to
3455 the bytes spanned by the combination of the two accesses. */
3456 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3457 return true;
3458 }
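/* As a worked example (values chosen for illustration only): for accesses
   a[i] and a[i + 3] with 4-byte elements the bases, offsets and steps
   match and the DR_INITs are 0 and 12, so *LOWER_BOUND_OUT becomes
   12 + 4 = 16: the references cannot alias whenever abs (DR_STEP) is at
   least 16 bytes.  */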
3459
3460 /* Function vect_prune_runtime_alias_test_list.
3461
3462 Prune a list of ddrs to be tested at run-time by versioning for alias.
3463 Merge several alias checks into one if possible.
3464 Return FALSE if the resulting list of ddrs is longer than allowed by
3465 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3466
3467 opt_result
3468 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3469 {
3470 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3471 hash_set <tree_pair_hash> compared_objects;
3472
3473 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3474 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3475 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3476 vec<vec_object_pair> &check_unequal_addrs
3477 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3478 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3479 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3480
3481 ddr_p ddr;
3482 unsigned int i;
3483 tree length_factor;
3484
3485 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3486
3487 /* Step values are irrelevant for aliasing if the number of vector
3488 iterations is equal to the number of scalar iterations (which can
3489 happen for fully-SLP loops). */
3490 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3491
3492 if (!ignore_step_p)
3493 {
3494 /* Convert the checks for nonzero steps into bound tests. */
3495 tree value;
3496 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3497 vect_check_lower_bound (loop_vinfo, value, true, 1);
3498 }
3499
3500 if (may_alias_ddrs.is_empty ())
3501 return opt_result::success ();
3502
3503 comp_alias_ddrs.create (may_alias_ddrs.length ());
3504
3505 unsigned int loop_depth
3506 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3507 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3508
3509 /* First, we collect all data ref pairs for aliasing checks. */
3510 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3511 {
3512 poly_uint64 lower_bound;
3513 tree segment_length_a, segment_length_b;
3514 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3515 unsigned int align_a, align_b;
3516
3517 /* Ignore the alias if the VF we chose ended up being no greater
3518 than the dependence distance. */
3519 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3520 continue;
3521
3522 if (DDR_OBJECT_A (ddr))
3523 {
3524 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3525 if (!compared_objects.add (new_pair))
3526 {
3527 if (dump_enabled_p ())
3528 dump_printf_loc (MSG_NOTE, vect_location,
3529 "checking that %T and %T"
3530 " have different addresses\n",
3531 new_pair.first, new_pair.second);
3532 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3533 }
3534 continue;
3535 }
3536
3537 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3538 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3539
3540 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3541 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3542
3543 bool preserves_scalar_order_p
3544 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3545
3546 /* Skip the pair if inter-iteration dependencies are irrelevant
3547 and intra-iteration dependencies are guaranteed to be honored. */
3548 if (ignore_step_p
3549 && (preserves_scalar_order_p
3550 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3551 &lower_bound)))
3552 {
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE, vect_location,
3555 "no need for alias check between "
3556 "%T and %T when VF is 1\n",
3557 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3558 continue;
3559 }
3560
3561 /* See whether we can handle the alias using a bounds check on
3562 the step, and whether that's likely to be the best approach.
3563 (It might not be, for example, if the minimum step is much larger
3564 than the number of bytes handled by one vector iteration.) */
3565 if (!ignore_step_p
3566 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3567 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3568 &lower_bound)
3569 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3570 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3571 {
3572 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3573 if (dump_enabled_p ())
3574 {
3575 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3576 "%T and %T when the step %T is outside ",
3577 DR_REF (dr_info_a->dr),
3578 DR_REF (dr_info_b->dr),
3579 DR_STEP (dr_info_a->dr));
3580 if (unsigned_p)
3581 dump_printf (MSG_NOTE, "[0");
3582 else
3583 {
3584 dump_printf (MSG_NOTE, "(");
3585 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3586 }
3587 dump_printf (MSG_NOTE, ", ");
3588 dump_dec (MSG_NOTE, lower_bound);
3589 dump_printf (MSG_NOTE, ")\n");
3590 }
3591 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3592 unsigned_p, lower_bound);
3593 continue;
3594 }
3595
3596 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3597 if (dr_group_first_a)
3598 {
3599 stmt_info_a = dr_group_first_a;
3600 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3601 }
3602
3603 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3604 if (dr_group_first_b)
3605 {
3606 stmt_info_b = dr_group_first_b;
3607 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3608 }
3609
3610 if (ignore_step_p)
3611 {
3612 segment_length_a = size_zero_node;
3613 segment_length_b = size_zero_node;
3614 }
3615 else
3616 {
3617 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3618 DR_STEP (dr_info_b->dr), 0))
3619 length_factor = scalar_loop_iters;
3620 else
3621 length_factor = size_int (vect_factor);
3622 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3623 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3624 }
3625 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3626 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3627 align_a = vect_vfa_align (dr_info_a);
3628 align_b = vect_vfa_align (dr_info_b);
3629
3630 /* See whether the alias is known at compilation time. */
3631 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3632 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3633 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3634 DR_OFFSET (dr_info_b->dr), 0)
3635 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3636 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3637 && poly_int_tree_p (segment_length_a)
3638 && poly_int_tree_p (segment_length_b))
3639 {
3640 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3641 segment_length_a,
3642 segment_length_b,
3643 access_size_a,
3644 access_size_b);
3645 if (res >= 0 && dump_enabled_p ())
3646 {
3647 dump_printf_loc (MSG_NOTE, vect_location,
3648 "can tell at compile time that %T and %T",
3649 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3650 if (res == 0)
3651 dump_printf (MSG_NOTE, " do not alias\n");
3652 else
3653 dump_printf (MSG_NOTE, " alias\n");
3654 }
3655
3656 if (res == 0)
3657 continue;
3658
3659 if (res == 1)
3660 return opt_result::failure_at (stmt_info_b->stmt,
3661 "not vectorized:"
3662 " compilation time alias: %G%G",
3663 stmt_info_a->stmt,
3664 stmt_info_b->stmt);
3665 }
3666
3667 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3668 access_size_a, align_a);
3669 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3670 access_size_b, align_b);
3671 /* Canonicalize the order to be the one that's needed for accurate
3672 RAW, WAR and WAW flags, in cases where the data references are
3673 well-ordered. The order doesn't really matter otherwise,
3674 but we might as well be consistent. */
3675 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3676 std::swap (dr_a, dr_b);
3677
3678 dr_with_seg_len_pair_t dr_with_seg_len_pair
3679 (dr_a, dr_b, (preserves_scalar_order_p
3680 ? dr_with_seg_len_pair_t::WELL_ORDERED
3681 : dr_with_seg_len_pair_t::REORDERED));
3682
3683 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3684 }
3685
3686 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3687
3688 unsigned int count = (comp_alias_ddrs.length ()
3689 + check_unequal_addrs.length ());
3690
3691 if (count && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP)
3692 return opt_result::failure_at
3693 (vect_location, "would need a runtime alias check\n");
3694
3695 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_NOTE, vect_location,
3697 "improved number of alias checks from %d to %d\n",
3698 may_alias_ddrs.length (), count);
3699 unsigned limit = param_vect_max_version_for_alias_checks;
3700 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3701 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3702 if (count > limit)
3703 return opt_result::failure_at
3704 (vect_location,
3705 "number of versioning for alias run-time tests exceeds %d "
3706 "(--param vect-max-version-for-alias-checks)\n", limit);
3707
3708 return opt_result::success ();
3709 }
3710
3711 /* Check whether we can use an internal function for a gather load
3712 or scatter store. READ_P is true for loads and false for stores.
3713 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3714 the type of the memory elements being loaded or stored. OFFSET_TYPE
3715 is the type of the offset that is being applied to the invariant
3716 base address. SCALE is the amount by which the offset should
3717 be multiplied *after* it has been converted to address width.
3718
3719 Return true if the function is supported, storing the function id in
3720 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3721
3722 bool
3723 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3724 tree vectype, tree memory_type, tree offset_type,
3725 int scale, internal_fn *ifn_out,
3726 tree *offset_vectype_out)
3727 {
3728 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3729 unsigned int element_bits = vector_element_bits (vectype);
3730 if (element_bits != memory_bits)
3731 /* For now the vector elements must be the same width as the
3732 memory elements. */
3733 return false;
3734
3735 /* Work out which function we need. */
3736 internal_fn ifn;
3737 if (read_p)
3738 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3739 else
3740 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3741
3742 for (;;)
3743 {
3744 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3745 if (!offset_vectype)
3746 return false;
3747
3748 /* Test whether the target supports this combination. */
3749 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3750 offset_vectype, scale))
3751 {
3752 *ifn_out = ifn;
3753 *offset_vectype_out = offset_vectype;
3754 return true;
3755 }
3756
3757 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3758 && TYPE_PRECISION (offset_type) >= element_bits)
3759 return false;
3760
3761 offset_type = build_nonstandard_integer_type
3762 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3763 }
3764 }
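/* As an example (an assumed target, for illustration only): if the offset
   is an unsigned 16-bit value but the target only supports gather/scatter
   with 32-bit offsets, the widening loop above retries with a 32-bit
   unsigned offset type before giving up.  */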
3765
3766 /* STMT_INFO is a call to an internal gather load or scatter store function.
3767 Describe the operation in INFO. */
3768
3769 static void
3770 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3771 gather_scatter_info *info)
3772 {
3773 gcall *call = as_a <gcall *> (stmt_info->stmt);
3774 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3775 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3776
3777 info->ifn = gimple_call_internal_fn (call);
3778 info->decl = NULL_TREE;
3779 info->base = gimple_call_arg (call, 0);
3780 info->offset = gimple_call_arg (call, 1);
3781 info->offset_dt = vect_unknown_def_type;
3782 info->offset_vectype = NULL_TREE;
3783 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3784 info->element_type = TREE_TYPE (vectype);
3785 info->memory_type = TREE_TYPE (DR_REF (dr));
3786 }
3787
3788 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3789 gather load or scatter store. Describe the operation in *INFO if so. */
3790
3791 bool
3792 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3793 gather_scatter_info *info)
3794 {
3795 HOST_WIDE_INT scale = 1;
3796 poly_int64 pbitpos, pbitsize;
3797 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3798 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3799 tree offtype = NULL_TREE;
3800 tree decl = NULL_TREE, base, off;
3801 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3802 tree memory_type = TREE_TYPE (DR_REF (dr));
3803 machine_mode pmode;
3804 int punsignedp, reversep, pvolatilep = 0;
3805 internal_fn ifn;
3806 tree offset_vectype;
3807 bool masked_p = false;
3808
3809 /* See whether this is already a call to a gather/scatter internal function.
3810 If not, see whether it's a masked load or store. */
3811 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3812 if (call && gimple_call_internal_p (call))
3813 {
3814 ifn = gimple_call_internal_fn (call);
3815 if (internal_gather_scatter_fn_p (ifn))
3816 {
3817 vect_describe_gather_scatter_call (stmt_info, info);
3818 return true;
3819 }
3820 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3821 }
3822
3823 /* True if we should aim to use internal functions rather than
3824 built-in functions. */
3825 bool use_ifn_p = (DR_IS_READ (dr)
3826 ? supports_vec_gather_load_p ()
3827 : supports_vec_scatter_store_p ());
3828
3829 base = DR_REF (dr);
3830 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3831 see if we can use the def stmt of the address. */
3832 if (masked_p
3833 && TREE_CODE (base) == MEM_REF
3834 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3835 && integer_zerop (TREE_OPERAND (base, 1))
3836 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3837 {
3838 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3839 if (is_gimple_assign (def_stmt)
3840 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3841 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3842 }
3843
3844 /* The gather and scatter builtins need an address of the form
3845 loop_invariant + vector * {1, 2, 4, 8}
3846 or
3847 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3848 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3849 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3850 multiplications and additions in it. To get a vector, we need
3851 a single SSA_NAME that will be defined in the loop and will
3852 contain everything that is not loop invariant and that can be
3853 vectorized. The following code attempts to find such a preexisting
3854 SSA_NAME OFF and put the loop invariants into a tree BASE
3855 that can be gimplified before the loop. */
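/* As an example (an assumed scenario, for illustration only): for a
   gather-like read a[b[i]] with 4-byte elements the code below typically
   ends up with BASE = &a[0] (loop invariant), OFF = the SSA_NAME holding
   the possibly sign-extended b[i], and SCALE = 4.  */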
3856 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3857 &punsignedp, &reversep, &pvolatilep);
3858 if (reversep)
3859 return false;
3860
3861 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3862
3863 if (TREE_CODE (base) == MEM_REF)
3864 {
3865 if (!integer_zerop (TREE_OPERAND (base, 1)))
3866 {
3867 if (off == NULL_TREE)
3868 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3869 else
3870 off = size_binop (PLUS_EXPR, off,
3871 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3872 }
3873 base = TREE_OPERAND (base, 0);
3874 }
3875 else
3876 base = build_fold_addr_expr (base);
3877
3878 if (off == NULL_TREE)
3879 off = size_zero_node;
3880
3881 /* If BASE is not loop invariant, then: if OFF is 0, start with just the
3882 constant offset in the loop-invariant BASE and continue with BASE as
3883 OFF; otherwise give up.
3884 We could handle that case by gimplifying the addition of base + off
3885 into some SSA_NAME and using that as OFF, but for now punt. */
3886 if (!expr_invariant_in_loop_p (loop, base))
3887 {
3888 if (!integer_zerop (off))
3889 return false;
3890 off = base;
3891 base = size_int (pbytepos);
3892 }
3893 /* Otherwise put base + constant offset into the loop invariant BASE
3894 and continue with OFF. */
3895 else
3896 {
3897 base = fold_convert (sizetype, base);
3898 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3899 }
3900
3901 /* OFF at this point may be either a SSA_NAME or some tree expression
3902 from get_inner_reference. Try to peel off loop invariants from it
3903 into BASE as long as possible. */
3904 STRIP_NOPS (off);
3905 while (offtype == NULL_TREE)
3906 {
3907 enum tree_code code;
3908 tree op0, op1, add = NULL_TREE;
3909
3910 if (TREE_CODE (off) == SSA_NAME)
3911 {
3912 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3913
3914 if (expr_invariant_in_loop_p (loop, off))
3915 return false;
3916
3917 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3918 break;
3919
3920 op0 = gimple_assign_rhs1 (def_stmt);
3921 code = gimple_assign_rhs_code (def_stmt);
3922 op1 = gimple_assign_rhs2 (def_stmt);
3923 }
3924 else
3925 {
3926 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3927 return false;
3928 code = TREE_CODE (off);
3929 extract_ops_from_tree (off, &code, &op0, &op1);
3930 }
3931 switch (code)
3932 {
3933 case POINTER_PLUS_EXPR:
3934 case PLUS_EXPR:
3935 if (expr_invariant_in_loop_p (loop, op0))
3936 {
3937 add = op0;
3938 off = op1;
3939 do_add:
3940 add = fold_convert (sizetype, add);
3941 if (scale != 1)
3942 add = size_binop (MULT_EXPR, add, size_int (scale));
3943 base = size_binop (PLUS_EXPR, base, add);
3944 continue;
3945 }
3946 if (expr_invariant_in_loop_p (loop, op1))
3947 {
3948 add = op1;
3949 off = op0;
3950 goto do_add;
3951 }
3952 break;
3953 case MINUS_EXPR:
3954 if (expr_invariant_in_loop_p (loop, op1))
3955 {
3956 add = fold_convert (sizetype, op1);
3957 add = size_binop (MINUS_EXPR, size_zero_node, add);
3958 off = op0;
3959 goto do_add;
3960 }
3961 break;
3962 case MULT_EXPR:
3963 if (scale == 1 && tree_fits_shwi_p (op1))
3964 {
3965 int new_scale = tree_to_shwi (op1);
3966 /* Only treat this as a scaling operation if the target
3967 supports it for at least some offset type. */
3968 if (use_ifn_p
3969 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3970 masked_p, vectype, memory_type,
3971 signed_char_type_node,
3972 new_scale, &ifn,
3973 &offset_vectype)
3974 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3975 masked_p, vectype, memory_type,
3976 unsigned_char_type_node,
3977 new_scale, &ifn,
3978 &offset_vectype))
3979 break;
3980 scale = new_scale;
3981 off = op0;
3982 continue;
3983 }
3984 break;
3985 case SSA_NAME:
3986 off = op0;
3987 continue;
3988 CASE_CONVERT:
3989 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3990 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3991 break;
3992
3993 /* Don't include the conversion if the target is happy with
3994 the current offset type. */
3995 if (use_ifn_p
3996 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3997 masked_p, vectype, memory_type,
3998 TREE_TYPE (off), scale, &ifn,
3999 &offset_vectype))
4000 break;
4001
4002 if (TYPE_PRECISION (TREE_TYPE (op0))
4003 == TYPE_PRECISION (TREE_TYPE (off)))
4004 {
4005 off = op0;
4006 continue;
4007 }
4008
4009 if (TYPE_PRECISION (TREE_TYPE (op0))
4010 < TYPE_PRECISION (TREE_TYPE (off)))
4011 {
4012 off = op0;
4013 offtype = TREE_TYPE (off);
4014 STRIP_NOPS (off);
4015 continue;
4016 }
4017 break;
4018 default:
4019 break;
4020 }
4021 break;
4022 }
4023
4024 /* If at the end OFF still isn't a SSA_NAME or isn't
4025 defined in the loop, punt. */
4026 if (TREE_CODE (off) != SSA_NAME
4027 || expr_invariant_in_loop_p (loop, off))
4028 return false;
4029
4030 if (offtype == NULL_TREE)
4031 offtype = TREE_TYPE (off);
4032
4033 if (use_ifn_p)
4034 {
4035 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4036 vectype, memory_type, offtype, scale,
4037 &ifn, &offset_vectype))
4038 return false;
4039 }
4040 else
4041 {
4042 if (DR_IS_READ (dr))
4043 {
4044 if (targetm.vectorize.builtin_gather)
4045 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4046 }
4047 else
4048 {
4049 if (targetm.vectorize.builtin_scatter)
4050 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4051 }
4052
4053 if (!decl)
4054 return false;
4055
4056 ifn = IFN_LAST;
4057 /* The offset vector type will be read from DECL when needed. */
4058 offset_vectype = NULL_TREE;
4059 }
4060
4061 info->ifn = ifn;
4062 info->decl = decl;
4063 info->base = base;
4064 info->offset = off;
4065 info->offset_dt = vect_unknown_def_type;
4066 info->offset_vectype = offset_vectype;
4067 info->scale = scale;
4068 info->element_type = TREE_TYPE (vectype);
4069 info->memory_type = memory_type;
4070 return true;
4071 }
4072
4073 /* Find the data references in STMT, analyze them with respect to LOOP and
4074 append them to DATAREFS. Return false if datarefs in this stmt cannot
4075 be handled. */
4076
4077 opt_result
4078 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4079 vec<data_reference_p> *datarefs,
4080 vec<int> *dataref_groups, int group_id)
4081 {
4082 /* We can ignore clobbers for dataref analysis - they are removed during
4083 loop vectorization and BB vectorization checks dependences with a
4084 stmt walk. */
4085 if (gimple_clobber_p (stmt))
4086 return opt_result::success ();
4087
4088 if (gimple_has_volatile_ops (stmt))
4089 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4090 stmt);
4091
4092 if (stmt_can_throw_internal (cfun, stmt))
4093 return opt_result::failure_at (stmt,
4094 "not vectorized:"
4095 " statement can throw an exception: %G",
4096 stmt);
4097
4098 auto_vec<data_reference_p, 2> refs;
4099 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4100 if (!res)
4101 return res;
4102
4103 if (refs.is_empty ())
4104 return opt_result::success ();
4105
4106 if (refs.length () > 1)
4107 {
4108 while (!refs.is_empty ())
4109 free_data_ref (refs.pop ());
4110 return opt_result::failure_at (stmt,
4111 "not vectorized: more than one "
4112 "data ref in stmt: %G", stmt);
4113 }
4114
4115 data_reference_p dr = refs.pop ();
4116 if (gcall *call = dyn_cast <gcall *> (stmt))
4117 if (!gimple_call_internal_p (call)
4118 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4119 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4120 {
4121 free_data_ref (dr);
4122 return opt_result::failure_at (stmt,
4123 "not vectorized: dr in a call %G", stmt);
4124 }
4125
4126 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4127 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4128 {
4129 free_data_ref (dr);
4130 return opt_result::failure_at (stmt,
4131 "not vectorized:"
4132 " statement is bitfield access %G", stmt);
4133 }
4134
4135 if (DR_BASE_ADDRESS (dr)
4136 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4137 {
4138 free_data_ref (dr);
4139 return opt_result::failure_at (stmt,
4140 "not vectorized:"
4141 " base addr of dr is a constant\n");
4142 }
4143
4144 /* Check whether this may be a SIMD lane access and adjust the
4145 DR to make it easier for us to handle it. */
4146 if (loop
4147 && loop->simduid
4148 && (!DR_BASE_ADDRESS (dr)
4149 || !DR_OFFSET (dr)
4150 || !DR_INIT (dr)
4151 || !DR_STEP (dr)))
4152 {
4153 struct data_reference *newdr
4154 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4155 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4156 if (DR_BASE_ADDRESS (newdr)
4157 && DR_OFFSET (newdr)
4158 && DR_INIT (newdr)
4159 && DR_STEP (newdr)
4160 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4161 && integer_zerop (DR_STEP (newdr)))
4162 {
4163 tree base_address = DR_BASE_ADDRESS (newdr);
4164 tree off = DR_OFFSET (newdr);
4165 tree step = ssize_int (1);
4166 if (integer_zerop (off)
4167 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4168 {
4169 off = TREE_OPERAND (base_address, 1);
4170 base_address = TREE_OPERAND (base_address, 0);
4171 }
4172 STRIP_NOPS (off);
4173 if (TREE_CODE (off) == MULT_EXPR
4174 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4175 {
4176 step = TREE_OPERAND (off, 1);
4177 off = TREE_OPERAND (off, 0);
4178 STRIP_NOPS (off);
4179 }
4180 if (CONVERT_EXPR_P (off)
4181 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4182 < TYPE_PRECISION (TREE_TYPE (off))))
4183 off = TREE_OPERAND (off, 0);
4184 if (TREE_CODE (off) == SSA_NAME)
4185 {
4186 gimple *def = SSA_NAME_DEF_STMT (off);
4187 /* Look through widening conversion. */
4188 if (is_gimple_assign (def)
4189 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4190 {
4191 tree rhs1 = gimple_assign_rhs1 (def);
4192 if (TREE_CODE (rhs1) == SSA_NAME
4193 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4194 && (TYPE_PRECISION (TREE_TYPE (off))
4195 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4196 def = SSA_NAME_DEF_STMT (rhs1);
4197 }
4198 if (is_gimple_call (def)
4199 && gimple_call_internal_p (def)
4200 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4201 {
4202 tree arg = gimple_call_arg (def, 0);
4203 tree reft = TREE_TYPE (DR_REF (newdr));
4204 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4205 arg = SSA_NAME_VAR (arg);
4206 if (arg == loop->simduid
4207 /* For now. */
4208 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4209 {
4210 DR_BASE_ADDRESS (newdr) = base_address;
4211 DR_OFFSET (newdr) = ssize_int (0);
4212 DR_STEP (newdr) = step;
4213 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4214 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4215 /* Mark as simd-lane access. */
4216 tree arg2 = gimple_call_arg (def, 1);
4217 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4218 free_data_ref (dr);
4219 datarefs->safe_push (newdr);
4220 if (dataref_groups)
4221 dataref_groups->safe_push (group_id);
4222 return opt_result::success ();
4223 }
4224 }
4225 }
4226 }
4227 free_data_ref (newdr);
4228 }
4229
4230 datarefs->safe_push (dr);
4231 if (dataref_groups)
4232 dataref_groups->safe_push (group_id);
4233 return opt_result::success ();
4234 }
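/* As an example (an assumed scenario, for illustration only): the SIMD-lane
   handling above matches accesses roughly of the form
   tmp[.GOMP_SIMD_LANE (simduid) * sizeof (elt) + cst] emitted for
   privatized variables in an OpenMP simd loop, and rewrites them into
   unit-stride accesses with DR_STEP equal to the element size.  */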
4235
4236 /* Function vect_analyze_data_refs.
4237
4238 Find all the data references in the loop or basic block.
4239
4240 The general structure of the analysis of data refs in the vectorizer is as
4241 follows:
4242 1- vect_analyze_data_refs(loop/bb): call
4243 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4244 in the loop/bb and their dependences.
4245 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4246 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4247 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4248
4249 */
4250
4251 opt_result
4252 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4253 {
4254 class loop *loop = NULL;
4255 unsigned int i;
4256 struct data_reference *dr;
4257 tree scalar_type;
4258
4259 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4260
4261 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4262 loop = LOOP_VINFO_LOOP (loop_vinfo);
4263
4264 /* Go through the data-refs, check that the analysis succeeded. Update
4265 pointer from stmt_vec_info struct to DR and vectype. */
4266
4267 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4268 FOR_EACH_VEC_ELT (datarefs, i, dr)
4269 {
4270 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4271 poly_uint64 vf;
4272
4273 gcc_assert (DR_REF (dr));
4274 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4275 gcc_assert (!stmt_info->dr_aux.dr);
4276 stmt_info->dr_aux.dr = dr;
4277 stmt_info->dr_aux.stmt = stmt_info;
4278
4279 /* Check that analysis of the data-ref succeeded. */
4280 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4281 || !DR_STEP (dr))
4282 {
4283 bool maybe_gather
4284 = DR_IS_READ (dr)
4285 && !TREE_THIS_VOLATILE (DR_REF (dr))
4286 && (targetm.vectorize.builtin_gather != NULL
4287 || supports_vec_gather_load_p ());
4288 bool maybe_scatter
4289 = DR_IS_WRITE (dr)
4290 && !TREE_THIS_VOLATILE (DR_REF (dr))
4291 && (targetm.vectorize.builtin_scatter != NULL
4292 || supports_vec_scatter_store_p ());
4293
4294 /* If the target supports vector gather loads or scatter stores,
4295 see whether they can be used. */
4296 if (is_a <loop_vec_info> (vinfo)
4297 && !nested_in_vect_loop_p (loop, stmt_info))
4298 {
4299 if (maybe_gather || maybe_scatter)
4300 {
4301 if (maybe_gather)
4302 gatherscatter = GATHER;
4303 else
4304 gatherscatter = SCATTER;
4305 }
4306 }
4307
4308 if (gatherscatter == SG_NONE)
4309 {
4310 if (dump_enabled_p ())
4311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4312 "not vectorized: data ref analysis "
4313 "failed %G", stmt_info->stmt);
4314 if (is_a <bb_vec_info> (vinfo))
4315 {
4316 /* In BB vectorization the ref can still participate
4317 in dependence analysis, we just can't vectorize it. */
4318 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4319 continue;
4320 }
4321 return opt_result::failure_at (stmt_info->stmt,
4322 "not vectorized:"
4323 " data ref analysis failed: %G",
4324 stmt_info->stmt);
4325 }
4326 }
4327
4328 /* See if this was detected as SIMD lane access. */
4329 if (dr->aux == (void *)-1
4330 || dr->aux == (void *)-2
4331 || dr->aux == (void *)-3
4332 || dr->aux == (void *)-4)
4333 {
4334 if (nested_in_vect_loop_p (loop, stmt_info))
4335 return opt_result::failure_at (stmt_info->stmt,
4336 "not vectorized:"
4337 " data ref analysis failed: %G",
4338 stmt_info->stmt);
4339 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4340 = -(uintptr_t) dr->aux;
4341 }
4342
4343 tree base = get_base_address (DR_REF (dr));
4344 if (base && VAR_P (base) && DECL_NONALIASED (base))
4345 {
4346 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4348 "not vectorized: base object not addressable "
4349 "for stmt: %G", stmt_info->stmt);
4350 if (is_a <bb_vec_info> (vinfo))
4351 {
4352 /* In BB vectorization the ref can still participate
4353 in dependence analysis, we just can't vectorize it. */
4354 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4355 continue;
4356 }
4357 return opt_result::failure_at (stmt_info->stmt,
4358 "not vectorized: base object not"
4359 " addressable for stmt: %G",
4360 stmt_info->stmt);
4361 }
4362
4363 if (is_a <loop_vec_info> (vinfo)
4364 && DR_STEP (dr)
4365 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4366 {
4367 if (nested_in_vect_loop_p (loop, stmt_info))
4368 return opt_result::failure_at (stmt_info->stmt,
4369 "not vectorized: "
4370 "not suitable for strided load %G",
4371 stmt_info->stmt);
4372 STMT_VINFO_STRIDED_P (stmt_info) = true;
4373 }
4374
4375 /* Update DR field in stmt_vec_info struct. */
4376
4377 /* If the dataref is in an inner-loop of the loop that is considered
4378 for vectorization, we also want to analyze the access relative to
4379 the outer-loop (DR contains information only relative to the
4380 inner-most enclosing loop). We do that by building a reference to the
4381 first location accessed by the inner-loop, and analyze it relative to
4382 the outer-loop. */
4383 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4384 {
4385 /* Build a reference to the first location accessed by the
4386 inner loop: *(BASE + INIT + OFFSET). By construction,
4387 this address must be invariant in the inner loop, so we
4388 can consider it as being used in the outer loop. */
4389 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4390 tree offset = unshare_expr (DR_OFFSET (dr));
4391 tree init = unshare_expr (DR_INIT (dr));
4392 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4393 init, offset);
4394 tree init_addr = fold_build_pointer_plus (base, init_offset);
4395 tree init_ref = build_fold_indirect_ref (init_addr);
4396
4397 if (dump_enabled_p ())
4398 dump_printf_loc (MSG_NOTE, vect_location,
4399 "analyze in outer loop: %T\n", init_ref);
4400
4401 opt_result res
4402 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4403 init_ref, loop, stmt_info->stmt);
4404 if (!res)
4405 /* dr_analyze_innermost already explained the failure. */
4406 return res;
4407
4408 if (dump_enabled_p ())
4409 dump_printf_loc (MSG_NOTE, vect_location,
4410 "\touter base_address: %T\n"
4411 "\touter offset from base address: %T\n"
4412 "\touter constant offset from base address: %T\n"
4413 "\touter step: %T\n"
4414 "\touter base alignment: %d\n\n"
4415 "\touter base misalignment: %d\n"
4416 "\touter offset alignment: %d\n"
4417 "\touter step alignment: %d\n",
4418 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4419 STMT_VINFO_DR_OFFSET (stmt_info),
4420 STMT_VINFO_DR_INIT (stmt_info),
4421 STMT_VINFO_DR_STEP (stmt_info),
4422 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4423 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4424 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4425 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4426 }
4427
4428 /* Set vectype for STMT. */
4429 scalar_type = TREE_TYPE (DR_REF (dr));
4430 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4431 if (!vectype)
4432 {
4433 if (dump_enabled_p ())
4434 {
4435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4436 "not vectorized: no vectype for stmt: %G",
4437 stmt_info->stmt);
4438 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4439 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4440 scalar_type);
4441 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4442 }
4443
4444 if (is_a <bb_vec_info> (vinfo))
4445 {
4446 /* No vector type is fine, the ref can still participate
4447 in dependence analysis, we just can't vectorize it. */
4448 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4449 continue;
4450 }
4451 if (fatal)
4452 *fatal = false;
4453 return opt_result::failure_at (stmt_info->stmt,
4454 "not vectorized:"
4455 " no vectype for stmt: %G"
4456 " scalar_type: %T\n",
4457 stmt_info->stmt, scalar_type);
4458 }
4459 else
4460 {
4461 if (dump_enabled_p ())
4462 dump_printf_loc (MSG_NOTE, vect_location,
4463 "got vectype for stmt: %G%T\n",
4464 stmt_info->stmt, vectype);
4465 }
4466
4467 /* Adjust the minimal vectorization factor according to the
4468 vector type. */
4469 vf = TYPE_VECTOR_SUBPARTS (vectype);
4470 *min_vf = upper_bound (*min_vf, vf);
4471
4472 /* Leave the BB vectorizer to pick the vector type later, based on
4473 the final dataref group size and SLP node size. */
4474 if (is_a <loop_vec_info> (vinfo))
4475 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4476
4477 if (gatherscatter != SG_NONE)
4478 {
4479 gather_scatter_info gs_info;
4480 if (!vect_check_gather_scatter (stmt_info,
4481 as_a <loop_vec_info> (vinfo),
4482 &gs_info)
4483 || !get_vectype_for_scalar_type (vinfo,
4484 TREE_TYPE (gs_info.offset)))
4485 {
4486 if (fatal)
4487 *fatal = false;
4488 return opt_result::failure_at
4489 (stmt_info->stmt,
4490 (gatherscatter == GATHER)
4491 ? "not vectorized: not suitable for gather load %G"
4492 : "not vectorized: not suitable for scatter store %G",
4493 stmt_info->stmt);
4494 }
4495 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4496 }
4497 }
4498
4499 /* We used to stop processing and prune the list here. Verify we no
4500 longer need to. */
4501 gcc_assert (i == datarefs.length ());
4502
4503 return opt_result::success ();
4504 }
4505
4506
4507 /* Function vect_get_new_vect_var.
4508
4509 Returns a name for a new variable. The current naming scheme appends the
4510 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4511 the name of vectorizer generated variables, and appends that to NAME if
4512 provided. */
4513
4514 tree
4515 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4516 {
4517 const char *prefix;
4518 tree new_vect_var;
4519
4520 switch (var_kind)
4521 {
4522 case vect_simple_var:
4523 prefix = "vect";
4524 break;
4525 case vect_scalar_var:
4526 prefix = "stmp";
4527 break;
4528 case vect_mask_var:
4529 prefix = "mask";
4530 break;
4531 case vect_pointer_var:
4532 prefix = "vectp";
4533 break;
4534 default:
4535 gcc_unreachable ();
4536 }
4537
4538 if (name)
4539 {
4540 char* tmp = concat (prefix, "_", name, NULL);
4541 new_vect_var = create_tmp_reg (type, tmp);
4542 free (tmp);
4543 }
4544 else
4545 new_vect_var = create_tmp_reg (type, prefix);
4546
4547 return new_vect_var;
4548 }
4549
4550 /* Like vect_get_new_vect_var but return an SSA name. */
4551
4552 tree
4553 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4554 {
4555 const char *prefix;
4556 tree new_vect_var;
4557
4558 switch (var_kind)
4559 {
4560 case vect_simple_var:
4561 prefix = "vect";
4562 break;
4563 case vect_scalar_var:
4564 prefix = "stmp";
4565 break;
4566 case vect_pointer_var:
4567 prefix = "vectp";
4568 break;
4569 default:
4570 gcc_unreachable ();
4571 }
4572
4573 if (name)
4574 {
4575 char* tmp = concat (prefix, "_", name, NULL);
4576 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4577 free (tmp);
4578 }
4579 else
4580 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4581
4582 return new_vect_var;
4583 }
4584
4585 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4586
4587 static void
4588 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4589 {
4590 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4591 int misalign = DR_MISALIGNMENT (dr_info);
4592 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4593 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4594 else
4595 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4596 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4597 misalign);
4598 }
4599
4600 /* Function vect_create_addr_base_for_vector_ref.
4601
4602 Create an expression that computes the address of the first memory location
4603 that will be accessed for a data reference.
4604
4605 Input:
4606 STMT_INFO: The statement containing the data reference.
4607 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4608 OFFSET: Optional. If supplied, it is added to the initial address.
4609 LOOP: Specify relative to which loop-nest should the address be computed.
4610 For example, when the dataref is in an inner-loop nested in an
4611 outer-loop that is now being vectorized, LOOP can be either the
4612 outer-loop, or the inner-loop. The first memory location accessed
4613 by the following dataref ('in' points to short):
4614
4615 for (i=0; i<N; i++)
4616 for (j=0; j<M; j++)
4617 s += in[i+j]
4618
4619 is as follows:
4620 if LOOP=i_loop: &in (relative to i_loop)
4621 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4622 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4623 initial address. Unlike OFFSET, which is a number of elements to
4624 be added, BYTE_OFFSET is measured in bytes.
4625
4626 Output:
4627 1. Return an SSA_NAME whose value is the address of the memory location of
4628 the first vector of the data reference.
4629 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4630 these statement(s) which define the returned SSA_NAME.
4631
4632 FORNOW: We are only handling array accesses with step 1. */
4633
4634 tree
4635 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4636 gimple_seq *new_stmt_list,
4637 tree offset,
4638 tree byte_offset)
4639 {
4640 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4641 struct data_reference *dr = dr_info->dr;
4642 const char *base_name;
4643 tree addr_base;
4644 tree dest;
4645 gimple_seq seq = NULL;
4646 tree vect_ptr_type;
4647 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4648 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4649 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4650
4651 tree data_ref_base = unshare_expr (drb->base_address);
4652 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4653 tree init = unshare_expr (drb->init);
4654
4655 if (loop_vinfo)
4656 base_name = get_name (data_ref_base);
4657 else
4658 {
4659 base_offset = ssize_int (0);
4660 init = ssize_int (0);
4661 base_name = get_name (DR_REF (dr));
4662 }
4663
4664 /* Create base_offset */
4665 base_offset = size_binop (PLUS_EXPR,
4666 fold_convert (sizetype, base_offset),
4667 fold_convert (sizetype, init));
4668
4669 if (offset)
4670 {
4671 offset = fold_build2 (MULT_EXPR, sizetype,
4672 fold_convert (sizetype, offset), step);
4673 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4674 base_offset, offset);
4675 }
4676 if (byte_offset)
4677 {
4678 byte_offset = fold_convert (sizetype, byte_offset);
4679 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4680 base_offset, byte_offset);
4681 }
4682
4683 /* base + base_offset */
4684 if (loop_vinfo)
4685 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4686 else
4687 {
4688 addr_base = build1 (ADDR_EXPR,
4689 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4690 unshare_expr (DR_REF (dr)));
4691 }
4692
4693 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4694 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4695 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4696 gimple_seq_add_seq (new_stmt_list, seq);
4697
4698 if (DR_PTR_INFO (dr)
4699 && TREE_CODE (addr_base) == SSA_NAME
4700 && !SSA_NAME_PTR_INFO (addr_base))
4701 {
4702 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4703 if (offset || byte_offset)
4704 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4705 }
4706
4707 if (dump_enabled_p ())
4708 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4709
4710 return addr_base;
4711 }
4712
4713
4714 /* Function vect_create_data_ref_ptr.
4715
4716 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4717 location accessed in the loop by STMT_INFO, along with the def-use update
4718 chain to appropriately advance the pointer through the loop iterations.
4719 Also set aliasing information for the pointer. This pointer is used by
4720 the callers to this function to create a memory reference expression for
4721 vector load/store access.
4722
4723 Input:
4724 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4725 GIMPLE_ASSIGN <name, data-ref> or
4726 GIMPLE_ASSIGN <data-ref, name>.
4727 2. AGGR_TYPE: the type of the reference, which should be either a vector
4728 or an array.
4729 3. AT_LOOP: the loop where the vector memref is to be created.
4730 4. OFFSET (optional): an offset to be added to the initial address accessed
4731 by the data-ref in STMT_INFO.
4732 5. BSI: location where the new stmts are to be placed if there is no loop
4733 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4734 pointing to the initial address.
4735 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4736 to the initial address accessed by the data-ref in STMT_INFO. This is
4737 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4738 in bytes.
4739 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4740 to the IV during each iteration of the loop. NULL says to move
4741 by one copy of AGGR_TYPE up or down, depending on the step of the
4742 data reference.
4743
4744 Output:
4745 1. Declare a new ptr to vector_type, and have it point to the base of the
4746 data reference (initial address accessed by the data reference).
4747 For example, for vector of type V8HI, the following code is generated:
4748
4749 v8hi *ap;
4750 ap = (v8hi *)initial_address;
4751
4752 if OFFSET is not supplied:
4753 initial_address = &a[init];
4754 if OFFSET is supplied:
4755 initial_address = &a[init + OFFSET];
4756 if BYTE_OFFSET is supplied:
4757 initial_address = &a[init] + BYTE_OFFSET;
4758
4759 Return the initial_address in INITIAL_ADDRESS.
4760
4761 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4762 update the pointer in each iteration of the loop.
4763
4764 Return the increment stmt that updates the pointer in PTR_INCR.
4765
4766 3. Return the pointer. */
4767
4768 tree
4769 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4770 tree aggr_type, class loop *at_loop, tree offset,
4771 tree *initial_address, gimple_stmt_iterator *gsi,
4772 gimple **ptr_incr, bool only_init,
4773 tree byte_offset, tree iv_step)
4774 {
4775 const char *base_name;
4776 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4777 class loop *loop = NULL;
4778 bool nested_in_vect_loop = false;
4779 class loop *containing_loop = NULL;
4780 tree aggr_ptr_type;
4781 tree aggr_ptr;
4782 tree new_temp;
4783 gimple_seq new_stmt_list = NULL;
4784 edge pe = NULL;
4785 basic_block new_bb;
4786 tree aggr_ptr_init;
4787 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4788 struct data_reference *dr = dr_info->dr;
4789 tree aptr;
4790 gimple_stmt_iterator incr_gsi;
4791 bool insert_after;
4792 tree indx_before_incr, indx_after_incr;
4793 gimple *incr;
4794 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4795
4796 gcc_assert (iv_step != NULL_TREE
4797 || TREE_CODE (aggr_type) == ARRAY_TYPE
4798 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4799
4800 if (loop_vinfo)
4801 {
4802 loop = LOOP_VINFO_LOOP (loop_vinfo);
4803 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4804 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4805 pe = loop_preheader_edge (loop);
4806 }
4807 else
4808 {
4809 gcc_assert (bb_vinfo);
4810 only_init = true;
4811 *ptr_incr = NULL;
4812 }
4813
4814 /* Create an expression for the first address accessed by this load
4815 in LOOP. */
4816 base_name = get_name (DR_BASE_ADDRESS (dr));
4817
4818 if (dump_enabled_p ())
4819 {
4820 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4821 dump_printf_loc (MSG_NOTE, vect_location,
4822 "create %s-pointer variable to type: %T",
4823 get_tree_code_name (TREE_CODE (aggr_type)),
4824 aggr_type);
4825 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4826 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4827 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4828 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4829 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4830 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4831 else
4832 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4833 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4834 }
4835
4836 /* (1) Create the new aggregate-pointer variable.
4837 Vector and array types inherit the alias set of their component
4838 type by default so we need to use a ref-all pointer if the data
4839 reference does not conflict with the created aggregated data
4840 reference because it is not addressable. */
4841 bool need_ref_all = false;
4842 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4843 get_alias_set (DR_REF (dr))))
4844 need_ref_all = true;
4845 /* Likewise for any of the data references in the stmt group. */
4846 else if (DR_GROUP_SIZE (stmt_info) > 1)
4847 {
4848 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4849 do
4850 {
4851 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4852 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4853 get_alias_set (DR_REF (sdr))))
4854 {
4855 need_ref_all = true;
4856 break;
4857 }
4858 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4859 }
4860 while (sinfo);
4861 }
4862 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4863 need_ref_all);
4864 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4865
4866
4867 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4868 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4869 def-use update cycles for the pointer: one relative to the outer-loop
4870 (LOOP), which is what steps (3) and (4) below do. The other is relative
4871 to the inner-loop (which is the inner-most loop containing the dataref),
4872 and this is done by step (5) below.
4873
4874 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4875 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4876 redundant. Steps (3),(4) create the following:
4877
4878 vp0 = &base_addr;
4879 LOOP: vp1 = phi(vp0,vp2)
4880 ...
4881 ...
4882 vp2 = vp1 + step
4883 goto LOOP
4884
4885 If there is an inner-loop nested in loop, then step (5) will also be
4886 applied, and an additional update in the inner-loop will be created:
4887
4888 vp0 = &base_addr;
4889 LOOP: vp1 = phi(vp0,vp2)
4890 ...
4891 inner: vp3 = phi(vp1,vp4)
4892 vp4 = vp3 + inner_step
4893 if () goto inner
4894 ...
4895 vp2 = vp1 + step
4896 if () goto LOOP */
4897
4898 /* (2) Calculate the initial address of the aggregate-pointer, and set
4899 the aggregate-pointer to point to it before the loop. */
4900
4901 /* Create: &(base[init_val + offset]) + byte_offset in the loop preheader. */
4902
4903 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4904 stmt_info, &new_stmt_list,
4905 offset, byte_offset);
4906 if (new_stmt_list)
4907 {
4908 if (pe)
4909 {
4910 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4911 gcc_assert (!new_bb);
4912 }
4913 else
4914 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4915 }
4916
4917 *initial_address = new_temp;
4918 aggr_ptr_init = new_temp;
4919
4920 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4921 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4922 inner-loop nested in LOOP (during outer-loop vectorization). */
4923
4924 /* No update in loop is required. */
4925 if (only_init && (!loop_vinfo || at_loop == loop))
4926 aptr = aggr_ptr_init;
4927 else
4928 {
4929 /* Accesses to invariant addresses should be handled specially
4930 by the caller. */
4931 tree step = vect_dr_behavior (vinfo, dr_info)->step;
4932 gcc_assert (!integer_zerop (step));
4933
4934 if (iv_step == NULL_TREE)
4935 {
4936 /* The step of the aggregate pointer is the type size,
4937 negated for downward accesses. */
4938 iv_step = TYPE_SIZE_UNIT (aggr_type);
4939 if (tree_int_cst_sgn (step) == -1)
4940 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4941 }
4942
4943 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4944
4945 create_iv (aggr_ptr_init,
4946 fold_convert (aggr_ptr_type, iv_step),
4947 aggr_ptr, loop, &incr_gsi, insert_after,
4948 &indx_before_incr, &indx_after_incr);
4949 incr = gsi_stmt (incr_gsi);
4950
4951 /* Copy the points-to information if it exists. */
4952 if (DR_PTR_INFO (dr))
4953 {
4954 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4955 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4956 }
4957 if (ptr_incr)
4958 *ptr_incr = incr;
4959
4960 aptr = indx_before_incr;
4961 }
4962
4963 if (!nested_in_vect_loop || only_init)
4964 return aptr;
4965
4966
4967 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4968 nested in LOOP, if it exists. */
4969
4970 gcc_assert (nested_in_vect_loop);
4971 if (!only_init)
4972 {
4973 standard_iv_increment_position (containing_loop, &incr_gsi,
4974 &insert_after);
4975 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4976 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4977 &indx_after_incr);
4978 incr = gsi_stmt (incr_gsi);
4979
4980 /* Copy the points-to information if it exists. */
4981 if (DR_PTR_INFO (dr))
4982 {
4983 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4984 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4985 }
4986 if (ptr_incr)
4987 *ptr_incr = incr;
4988
4989 return indx_before_incr;
4990 }
4991 else
4992 gcc_unreachable ();
4993 }
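
/* A minimal standalone sketch of the pointer induction variables described in
   the comment above, written with plain C pointers instead of GIMPLE.  The
   example_* name and the *_step / *_iters parameters are illustrative only,
   not part of the vectorizer.  */

static void
example_nested_pointer_ivs (char *base_addr, long outer_step, long inner_step,
			    int outer_iters, int inner_iters)
{
  char *vp1 = base_addr;			/* LOOP: vp1 = phi (vp0, vp2) */
  for (int i = 0; i < outer_iters; i++)
    {
      char *vp3 = vp1;				/* inner: vp3 = phi (vp1, vp4) */
      for (int j = 0; j < inner_iters; j++)
	vp3 = vp3 + inner_step;			/* vp4 = vp3 + inner_step */
      vp1 = vp1 + outer_step;			/* vp2 = vp1 + step */
    }
}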
4994
4995
4996 /* Function bump_vector_ptr
4997
4998 Increment a pointer (to a vector type) by vector-size. If requested,
4999 i.e. if PTR_INCR is given, then also connect the new increment stmt
5000 to the existing def-use update-chain of the pointer, by modifying
5001 the PTR_INCR as illustrated below:
5002
5003 The pointer def-use update-chain before this function:
5004 DATAREF_PTR = phi (p_0, p_2)
5005 ....
5006 PTR_INCR: p_2 = DATAREF_PTR + step
5007
5008 The pointer def-use update-chain after this function:
5009 DATAREF_PTR = phi (p_0, p_2)
5010 ....
5011 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5012 ....
5013 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5014
5015 Input:
5016 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5017 in the loop.
5018 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5019 the loop. The increment amount across iterations is expected
5020 to be vector_size.
5021 GSI - location where the new update stmt is to be placed.
5022 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5023 BUMP - optional. The offset by which to bump the pointer. If not given,
5024 the offset is assumed to be vector_size.
5025
5026 Output: Return NEW_DATAREF_PTR as illustrated above.
5027
5028 */
5029
5030 tree
5031 bump_vector_ptr (vec_info *vinfo,
5032 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5033 stmt_vec_info stmt_info, tree bump)
5034 {
5035 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5036 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5037 tree update = TYPE_SIZE_UNIT (vectype);
5038 gassign *incr_stmt;
5039 ssa_op_iter iter;
5040 use_operand_p use_p;
5041 tree new_dataref_ptr;
5042
5043 if (bump)
5044 update = bump;
5045
5046 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5047 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5048 else
5049 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5050 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5051 dataref_ptr, update);
5052 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5053
5054 /* Copy the points-to information if it exists. */
5055 if (DR_PTR_INFO (dr))
5056 {
5057 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5058 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5059 }
5060
5061 if (!ptr_incr)
5062 return new_dataref_ptr;
5063
5064 /* Update the vector-pointer's cross-iteration increment. */
5065 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5066 {
5067 tree use = USE_FROM_PTR (use_p);
5068
5069 if (use == dataref_ptr)
5070 SET_USE (use_p, new_dataref_ptr);
5071 else
5072 gcc_assert (operand_equal_p (use, update, 0));
5073 }
5074
5075 return new_dataref_ptr;
5076 }
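
/* A minimal sketch, in plain C terms, of the def-use rewiring illustrated in
   the comment above: after bump_vector_ptr the cross-iteration increment is
   applied to the bumped pointer, so BUMP and STEP compose within one
   iteration.  The example_* name is illustrative only.  */

static const char *
example_bump_then_increment (const char *dataref_ptr, long bump, long step)
{
  const char *new_dataref_ptr = dataref_ptr + bump;	/* NEW_DATAREF_PTR */
  const char *p_2 = new_dataref_ptr + step;		/* rewired PTR_INCR */
  return p_2;
}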
5077
5078
5079 /* Copy memory reference info such as base/clique from the SRC reference
5080 to the DEST MEM_REF. */
5081
5082 void
5083 vect_copy_ref_info (tree dest, tree src)
5084 {
5085 if (TREE_CODE (dest) != MEM_REF)
5086 return;
5087
5088 tree src_base = src;
5089 while (handled_component_p (src_base))
5090 src_base = TREE_OPERAND (src_base, 0);
5091 if (TREE_CODE (src_base) != MEM_REF
5092 && TREE_CODE (src_base) != TARGET_MEM_REF)
5093 return;
5094
5095 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5096 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5097 }
5098
5099
5100 /* Function vect_create_destination_var.
5101
5102 Create a new temporary of type VECTYPE. */
5103
5104 tree
5105 vect_create_destination_var (tree scalar_dest, tree vectype)
5106 {
5107 tree vec_dest;
5108 const char *name;
5109 char *new_name;
5110 tree type;
5111 enum vect_var_kind kind;
5112
5113 kind = vectype
5114 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5115 ? vect_mask_var
5116 : vect_simple_var
5117 : vect_scalar_var;
5118 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5119
5120 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5121
5122 name = get_name (scalar_dest);
5123 if (name)
5124 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5125 else
5126 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5127 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5128 free (new_name);
5129
5130 return vec_dest;
5131 }
5132
5133 /* Function vect_grouped_store_supported.
5134
5135 Returns TRUE if interleave high and interleave low permutations
5136 are supported, and FALSE otherwise. */
5137
5138 bool
5139 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5140 {
5141 machine_mode mode = TYPE_MODE (vectype);
5142
5143 /* vect_permute_store_chain requires the group size to be equal to 3 or
5144 be a power of two. */
5145 if (count != 3 && exact_log2 (count) == -1)
5146 {
5147 if (dump_enabled_p ())
5148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5149 "the size of the group of accesses"
5150 " is not a power of 2 or not eqaul to 3\n");
5151 return false;
5152 }
5153
5154 /* Check that the permutation is supported. */
5155 if (VECTOR_MODE_P (mode))
5156 {
5157 unsigned int i;
5158 if (count == 3)
5159 {
5160 unsigned int j0 = 0, j1 = 0, j2 = 0;
5161 unsigned int i, j;
5162
5163 unsigned int nelt;
5164 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5165 {
5166 if (dump_enabled_p ())
5167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5168 "cannot handle groups of 3 stores for"
5169 " variable-length vectors\n");
5170 return false;
5171 }
5172
5173 vec_perm_builder sel (nelt, nelt, 1);
5174 sel.quick_grow (nelt);
5175 vec_perm_indices indices;
5176 for (j = 0; j < 3; j++)
5177 {
5178 int nelt0 = ((3 - j) * nelt) % 3;
5179 int nelt1 = ((3 - j) * nelt + 1) % 3;
5180 int nelt2 = ((3 - j) * nelt + 2) % 3;
5181 for (i = 0; i < nelt; i++)
5182 {
5183 if (3 * i + nelt0 < nelt)
5184 sel[3 * i + nelt0] = j0++;
5185 if (3 * i + nelt1 < nelt)
5186 sel[3 * i + nelt1] = nelt + j1++;
5187 if (3 * i + nelt2 < nelt)
5188 sel[3 * i + nelt2] = 0;
5189 }
5190 indices.new_vector (sel, 2, nelt);
5191 if (!can_vec_perm_const_p (mode, indices))
5192 {
5193 if (dump_enabled_p ())
5194 dump_printf (MSG_MISSED_OPTIMIZATION,
5195 "permutation op not supported by target.\n");
5196 return false;
5197 }
5198
5199 for (i = 0; i < nelt; i++)
5200 {
5201 if (3 * i + nelt0 < nelt)
5202 sel[3 * i + nelt0] = 3 * i + nelt0;
5203 if (3 * i + nelt1 < nelt)
5204 sel[3 * i + nelt1] = 3 * i + nelt1;
5205 if (3 * i + nelt2 < nelt)
5206 sel[3 * i + nelt2] = nelt + j2++;
5207 }
5208 indices.new_vector (sel, 2, nelt);
5209 if (!can_vec_perm_const_p (mode, indices))
5210 {
5211 if (dump_enabled_p ())
5212 dump_printf (MSG_MISSED_OPTIMIZATION,
5213 "permutation op not supported by target.\n");
5214 return false;
5215 }
5216 }
5217 return true;
5218 }
5219 else
5220 {
5221 /* If length is not equal to 3 then only power of 2 is supported. */
5222 gcc_assert (pow2p_hwi (count));
5223 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5224
5225 /* The encoding has 2 interleaved stepped patterns. */
5226 vec_perm_builder sel (nelt, 2, 3);
5227 sel.quick_grow (6);
5228 for (i = 0; i < 3; i++)
5229 {
5230 sel[i * 2] = i;
5231 sel[i * 2 + 1] = i + nelt;
5232 }
5233 vec_perm_indices indices (sel, 2, nelt);
5234 if (can_vec_perm_const_p (mode, indices))
5235 {
5236 for (i = 0; i < 6; i++)
5237 sel[i] += exact_div (nelt, 2);
5238 indices.new_vector (sel, 2, nelt);
5239 if (can_vec_perm_const_p (mode, indices))
5240 return true;
5241 }
5242 }
5243 }
5244
5245 if (dump_enabled_p ())
5246 dump_printf (MSG_MISSED_OPTIMIZATION,
5247 "permutation op not supported by target.\n");
5248 return false;
5249 }
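
/* A minimal standalone sketch of the two selectors queried above for the
   power-of-two case, expanded to full length for a concrete NELT; the
   example_* name is illustrative only.  For NELT == 8 this produces
   high = {0,8,1,9,2,10,3,11} and low = {4,12,5,13,6,14,7,15}.  */

static void
example_interleave_selectors (unsigned int nelt,
			      unsigned int *high_sel, unsigned int *low_sel)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      high_sel[2 * i] = i;			/* from the first vector */
      high_sel[2 * i + 1] = i + nelt;		/* from the second vector */
      low_sel[2 * i] = i + nelt / 2;
      low_sel[2 * i + 1] = i + nelt / 2 + nelt;
    }
}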
5250
5251
5252 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5253 type VECTYPE. MASKED_P says whether the masked form is needed. */
5254
5255 bool
5256 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5257 bool masked_p)
5258 {
5259 if (masked_p)
5260 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5261 vec_mask_store_lanes_optab,
5262 vectype, count);
5263 else
5264 return vect_lanes_optab_supported_p ("vec_store_lanes",
5265 vec_store_lanes_optab,
5266 vectype, count);
5267 }
5268
5269
5270 /* Function vect_permute_store_chain.
5271
5272 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5273 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5274 the data correctly for the stores. Return the final references for stores
5275 in RESULT_CHAIN.
5276
5277 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5278 The input is 4 vectors each containing 8 elements. We assign a number to
5279 each element, the input sequence is:
5280
5281 1st vec: 0 1 2 3 4 5 6 7
5282 2nd vec: 8 9 10 11 12 13 14 15
5283 3rd vec: 16 17 18 19 20 21 22 23
5284 4th vec: 24 25 26 27 28 29 30 31
5285
5286 The output sequence should be:
5287
5288 1st vec: 0 8 16 24 1 9 17 25
5289 2nd vec: 2 10 18 26 3 11 19 27
5290 3rd vec: 4 12 20 28 5 13 21 29
5291 4th vec: 6 14 22 30 7 15 23 31
5292
5293 i.e., we interleave the contents of the four vectors in their order.
5294
5295 We use interleave_high/low instructions to create such output. The input of
5296 each interleave_high/low operation is two vectors:
5297 1st vec 2nd vec
5298 0 1 2 3 4 5 6 7
5299 the even elements of the result vector are obtained left-to-right from the
5300 high/low elements of the first vector. The odd elements of the result are
5301 obtained left-to-right from the high/low elements of the second vector.
5302 The output of interleave_high will be: 0 4 1 5
5303 and of interleave_low: 2 6 3 7
5304
5305
5306 The permutation is done in log LENGTH stages. In each stage interleave_high
5307 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5308 where the first argument is taken from the first half of DR_CHAIN and the
5309 second argument from its second half.
5310 In our example,
5311
5312 I1: interleave_high (1st vec, 3rd vec)
5313 I2: interleave_low (1st vec, 3rd vec)
5314 I3: interleave_high (2nd vec, 4th vec)
5315 I4: interleave_low (2nd vec, 4th vec)
5316
5317 The output for the first stage is:
5318
5319 I1: 0 16 1 17 2 18 3 19
5320 I2: 4 20 5 21 6 22 7 23
5321 I3: 8 24 9 25 10 26 11 27
5322 I4: 12 28 13 29 14 30 15 31
5323
5324 The output of the second stage, i.e. the final result is:
5325
5326 I1: 0 8 16 24 1 9 17 25
5327 I2: 2 10 18 26 3 11 19 27
5328 I3: 4 12 20 28 5 13 21 29
5329 I4: 6 14 22 30 7 15 23 31. */
5330
5331 void
5332 vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
5333 unsigned int length,
5334 stmt_vec_info stmt_info,
5335 gimple_stmt_iterator *gsi,
5336 vec<tree> *result_chain)
5337 {
5338 tree vect1, vect2, high, low;
5339 gimple *perm_stmt;
5340 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5341 tree perm_mask_low, perm_mask_high;
5342 tree data_ref;
5343 tree perm3_mask_low, perm3_mask_high;
5344 unsigned int i, j, n, log_length = exact_log2 (length);
5345
5346 result_chain->quick_grow (length);
5347 memcpy (result_chain->address (), dr_chain.address (),
5348 length * sizeof (tree));
5349
5350 if (length == 3)
5351 {
5352 /* vect_grouped_store_supported ensures that this is constant. */
5353 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5354 unsigned int j0 = 0, j1 = 0, j2 = 0;
5355
5356 vec_perm_builder sel (nelt, nelt, 1);
5357 sel.quick_grow (nelt);
5358 vec_perm_indices indices;
5359 for (j = 0; j < 3; j++)
5360 {
5361 int nelt0 = ((3 - j) * nelt) % 3;
5362 int nelt1 = ((3 - j) * nelt + 1) % 3;
5363 int nelt2 = ((3 - j) * nelt + 2) % 3;
5364
5365 for (i = 0; i < nelt; i++)
5366 {
5367 if (3 * i + nelt0 < nelt)
5368 sel[3 * i + nelt0] = j0++;
5369 if (3 * i + nelt1 < nelt)
5370 sel[3 * i + nelt1] = nelt + j1++;
5371 if (3 * i + nelt2 < nelt)
5372 sel[3 * i + nelt2] = 0;
5373 }
5374 indices.new_vector (sel, 2, nelt);
5375 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5376
5377 for (i = 0; i < nelt; i++)
5378 {
5379 if (3 * i + nelt0 < nelt)
5380 sel[3 * i + nelt0] = 3 * i + nelt0;
5381 if (3 * i + nelt1 < nelt)
5382 sel[3 * i + nelt1] = 3 * i + nelt1;
5383 if (3 * i + nelt2 < nelt)
5384 sel[3 * i + nelt2] = nelt + j2++;
5385 }
5386 indices.new_vector (sel, 2, nelt);
5387 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5388
5389 vect1 = dr_chain[0];
5390 vect2 = dr_chain[1];
5391
5392 /* Create interleaving stmt:
5393 low = VEC_PERM_EXPR <vect1, vect2,
5394 {j, nelt, *, j + 1, nelt + j + 1, *,
5395 j + 2, nelt + j + 2, *, ...}> */
5396 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5397 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5398 vect2, perm3_mask_low);
5399 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5400
5401 vect1 = data_ref;
5402 vect2 = dr_chain[2];
5403 /* Create interleaving stmt:
5404 low = VEC_PERM_EXPR <vect1, vect2,
5405 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5406 6, 7, nelt + j + 2, ...}> */
5407 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5408 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5409 vect2, perm3_mask_high);
5410 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5411 (*result_chain)[j] = data_ref;
5412 }
5413 }
5414 else
5415 {
5416 /* If length is not equal to 3 then only power of 2 is supported. */
5417 gcc_assert (pow2p_hwi (length));
5418
5419 /* The encoding has 2 interleaved stepped patterns. */
5420 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5421 vec_perm_builder sel (nelt, 2, 3);
5422 sel.quick_grow (6);
5423 for (i = 0; i < 3; i++)
5424 {
5425 sel[i * 2] = i;
5426 sel[i * 2 + 1] = i + nelt;
5427 }
5428 vec_perm_indices indices (sel, 2, nelt);
5429 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5430
5431 for (i = 0; i < 6; i++)
5432 sel[i] += exact_div (nelt, 2);
5433 indices.new_vector (sel, 2, nelt);
5434 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5435
5436 for (i = 0, n = log_length; i < n; i++)
5437 {
5438 for (j = 0; j < length/2; j++)
5439 {
5440 vect1 = dr_chain[j];
5441 vect2 = dr_chain[j+length/2];
5442
5443 /* Create interleaving stmt:
5444 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5445 ...}> */
5446 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5447 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5448 vect2, perm_mask_high);
5449 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5450 (*result_chain)[2*j] = high;
5451
5452 /* Create interleaving stmt:
5453 low = VEC_PERM_EXPR <vect1, vect2,
5454 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5455 ...}> */
5456 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5457 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5458 vect2, perm_mask_low);
5459 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5460 (*result_chain)[2*j+1] = low;
5461 }
5462 memcpy (dr_chain.address (), result_chain->address (),
5463 length * sizeof (tree));
5464 }
5465 }
5466 }
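
/* A minimal standalone sketch of the interleaving stages above, operating on
   plain int arrays of 8 elements; the example_* name is illustrative only.
   With LENGTH == 4 and rows initialised to 0..31 it reproduces the store
   order shown in the function comment (0 8 16 24 1 9 17 25, ...).  */

static void
example_interleave_stages (unsigned int length, int chain[][8], int result[][8])
{
  const unsigned int nelt = 8;

  /* One iteration per stage, log2 (LENGTH) stages in total.  */
  for (unsigned int len = length; len > 1; len /= 2)
    {
      for (unsigned int j = 0; j < length / 2; j++)
	for (unsigned int i = 0; i < nelt / 2; i++)
	  {
	    /* interleave_high of chain[j] and chain[j + length/2].  */
	    result[2 * j][2 * i] = chain[j][i];
	    result[2 * j][2 * i + 1] = chain[j + length / 2][i];
	    /* interleave_low of the same pair.  */
	    result[2 * j + 1][2 * i] = chain[j][nelt / 2 + i];
	    result[2 * j + 1][2 * i + 1] = chain[j + length / 2][nelt / 2 + i];
	  }
      /* The stage output becomes the next stage's input.  */
      for (unsigned int j = 0; j < length; j++)
	for (unsigned int i = 0; i < nelt; i++)
	  chain[j][i] = result[j][i];
    }
}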
5467
5468 /* Function vect_setup_realignment
5469
5470 This function is called when vectorizing an unaligned load using
5471 the dr_explicit_realign[_optimized] scheme.
5472 This function generates the following code at the loop prolog:
5473
5474 p = initial_addr;
5475 x msq_init = *(floor(p)); # prolog load
5476 realignment_token = call target_builtin;
5477 loop:
5478 x msq = phi (msq_init, ---)
5479
5480 The stmts marked with x are generated only for the case of
5481 dr_explicit_realign_optimized.
5482
5483 The code above sets up a new (vector) pointer, pointing to the first
5484 location accessed by STMT_INFO, and a "floor-aligned" load using that
5485 pointer. It also generates code to compute the "realignment-token"
5486 (if the relevant target hook was defined), and creates a phi-node at the
5487 loop-header bb whose arguments are the result of the prolog-load (created
5488 by this function) and the result of a load that takes place in the loop
5489 (to be created by the caller to this function).
5490
5491 For the case of dr_explicit_realign_optimized:
5492 The caller to this function uses the phi-result (msq) to create the
5493 realignment code inside the loop, and sets up the missing phi argument,
5494 as follows:
5495 loop:
5496 msq = phi (msq_init, lsq)
5497 lsq = *(floor(p')); # load in loop
5498 result = realign_load (msq, lsq, realignment_token);
5499
5500 For the case of dr_explicit_realign:
5501 loop:
5502 msq = *(floor(p)); # load in loop
5503 p' = p + (VS-1);
5504 lsq = *(floor(p')); # load in loop
5505 result = realign_load (msq, lsq, realignment_token);
5506
5507 Input:
5508 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5509 a memory location that may be unaligned.
5510 GSI - place where new code is to be inserted.
5511 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5512 is used.
5513
5514 Output:
5515 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5516 target hook, if defined.
5517 Return value - the result of the loop-header phi node. */
5518
5519 tree
5520 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5521 gimple_stmt_iterator *gsi, tree *realignment_token,
5522 enum dr_alignment_support alignment_support_scheme,
5523 tree init_addr,
5524 class loop **at_loop)
5525 {
5526 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5527 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5528 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5529 struct data_reference *dr = dr_info->dr;
5530 class loop *loop = NULL;
5531 edge pe = NULL;
5532 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5533 tree vec_dest;
5534 gimple *inc;
5535 tree ptr;
5536 tree data_ref;
5537 basic_block new_bb;
5538 tree msq_init = NULL_TREE;
5539 tree new_temp;
5540 gphi *phi_stmt;
5541 tree msq = NULL_TREE;
5542 gimple_seq stmts = NULL;
5543 bool compute_in_loop = false;
5544 bool nested_in_vect_loop = false;
5545 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5546 class loop *loop_for_initial_load = NULL;
5547
5548 if (loop_vinfo)
5549 {
5550 loop = LOOP_VINFO_LOOP (loop_vinfo);
5551 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5552 }
5553
5554 gcc_assert (alignment_support_scheme == dr_explicit_realign
5555 || alignment_support_scheme == dr_explicit_realign_optimized);
5556
5557 /* We need to generate three things:
5558 1. the misalignment computation
5559 2. the extra vector load (for the optimized realignment scheme).
5560 3. the phi node for the two vectors from which the realignment is
5561 done (for the optimized realignment scheme). */
5562
5563 /* 1. Determine where to generate the misalignment computation.
5564
5565 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5566 calculation will be generated by this function, outside the loop (in the
5567 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5568 caller, inside the loop.
5569
5570 Background: If the misalignment remains fixed throughout the iterations of
5571 the loop, then both realignment schemes are applicable, and also the
5572 misalignment computation can be done outside LOOP. This is because we are
5573 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5574 are a multiple of VS (the Vector Size), and therefore the misalignment in
5575 different vectorized LOOP iterations is always the same.
5576 The problem arises only if the memory access is in an inner-loop nested
5577 inside LOOP, which is now being vectorized using outer-loop vectorization.
5578 This is the only case when the misalignment of the memory access may not
5579 remain fixed throughout the iterations of the inner-loop (as explained in
5580 detail in vect_supportable_dr_alignment). In this case, not only is the
5581 optimized realignment scheme not applicable, but also the misalignment
5582 computation (and generation of the realignment token that is passed to
5583 REALIGN_LOAD) has to be done inside the loop.
5584
5585 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5586 or not, which in turn determines if the misalignment is computed inside
5587 the inner-loop, or outside LOOP. */
5588
5589 if (init_addr != NULL_TREE || !loop_vinfo)
5590 {
5591 compute_in_loop = true;
5592 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5593 }
5594
5595
5596 /* 2. Determine where to generate the extra vector load.
5597
5598 For the optimized realignment scheme, instead of generating two vector
5599 loads in each iteration, we generate a single extra vector load in the
5600 preheader of the loop, and in each iteration reuse the result of the
5601 vector load from the previous iteration. In case the memory access is in
5602 an inner-loop nested inside LOOP, which is now being vectorized using
5603 outer-loop vectorization, we need to determine whether this initial vector
5604 load should be generated at the preheader of the inner-loop, or can be
5605 generated at the preheader of LOOP. If the memory access has no evolution
5606 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5607 to be generated inside LOOP (in the preheader of the inner-loop). */
5608
5609 if (nested_in_vect_loop)
5610 {
5611 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5612 bool invariant_in_outerloop =
5613 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5614 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5615 }
5616 else
5617 loop_for_initial_load = loop;
5618 if (at_loop)
5619 *at_loop = loop_for_initial_load;
5620
5621 if (loop_for_initial_load)
5622 pe = loop_preheader_edge (loop_for_initial_load);
5623
5624 /* 3. For the case of the optimized realignment, create the first vector
5625 load at the loop preheader. */
5626
5627 if (alignment_support_scheme == dr_explicit_realign_optimized)
5628 {
5629 /* Create msq_init = *(floor(p1)) in the loop preheader */
5630 gassign *new_stmt;
5631
5632 gcc_assert (!compute_in_loop);
5633 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5634 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5635 loop_for_initial_load, NULL_TREE,
5636 &init_addr, NULL, &inc, true);
5637 if (TREE_CODE (ptr) == SSA_NAME)
5638 new_temp = copy_ssa_name (ptr);
5639 else
5640 new_temp = make_ssa_name (TREE_TYPE (ptr));
5641 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5642 tree type = TREE_TYPE (ptr);
5643 new_stmt = gimple_build_assign
5644 (new_temp, BIT_AND_EXPR, ptr,
5645 fold_build2 (MINUS_EXPR, type,
5646 build_int_cst (type, 0),
5647 build_int_cst (type, align)));
5648 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5649 gcc_assert (!new_bb);
5650 data_ref
5651 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5652 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5653 vect_copy_ref_info (data_ref, DR_REF (dr));
5654 new_stmt = gimple_build_assign (vec_dest, data_ref);
5655 new_temp = make_ssa_name (vec_dest, new_stmt);
5656 gimple_assign_set_lhs (new_stmt, new_temp);
5657 if (pe)
5658 {
5659 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5660 gcc_assert (!new_bb);
5661 }
5662 else
5663 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5664
5665 msq_init = gimple_assign_lhs (new_stmt);
5666 }
5667
5668 /* 4. Create realignment token using a target builtin, if available.
5669 It is done either inside the containing loop, or before LOOP (as
5670 determined above). */
5671
5672 if (targetm.vectorize.builtin_mask_for_load)
5673 {
5674 gcall *new_stmt;
5675 tree builtin_decl;
5676
5677 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5678 if (!init_addr)
5679 {
5680 /* Generate the INIT_ADDR computation outside LOOP. */
5681 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5682 stmt_info, &stmts,
5683 NULL_TREE);
5684 if (loop)
5685 {
5686 pe = loop_preheader_edge (loop);
5687 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5688 gcc_assert (!new_bb);
5689 }
5690 else
5691 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5692 }
5693
5694 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5695 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5696 vec_dest =
5697 vect_create_destination_var (scalar_dest,
5698 gimple_call_return_type (new_stmt));
5699 new_temp = make_ssa_name (vec_dest, new_stmt);
5700 gimple_call_set_lhs (new_stmt, new_temp);
5701
5702 if (compute_in_loop)
5703 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5704 else
5705 {
5706 /* Generate the misalignment computation outside LOOP. */
5707 pe = loop_preheader_edge (loop);
5708 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5709 gcc_assert (!new_bb);
5710 }
5711
5712 *realignment_token = gimple_call_lhs (new_stmt);
5713
5714 /* The result of the CALL_EXPR to this builtin is determined from
5715 the value of the parameter and no global variables are touched
5716 which makes the builtin a "const" function. Requiring the
5717 builtin to have the "const" attribute makes it unnecessary
5718 to call mark_call_clobbered. */
5719 gcc_assert (TREE_READONLY (builtin_decl));
5720 }
5721
5722 if (alignment_support_scheme == dr_explicit_realign)
5723 return msq;
5724
5725 gcc_assert (!compute_in_loop);
5726 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5727
5728
5729 /* 5. Create msq = phi <msq_init, lsq> in loop */
5730
5731 pe = loop_preheader_edge (containing_loop);
5732 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5733 msq = make_ssa_name (vec_dest);
5734 phi_stmt = create_phi_node (msq, containing_loop->header);
5735 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5736
5737 return msq;
5738 }
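
/* A minimal scalar model of the REALIGN_LOAD combination that the code above
   sets up, not the target's actual instruction; four-element vectors and the
   example_* name are assumptions made for the illustration.  MSQ and LSQ are
   the two aligned loads, OFF is the element misalignment encoded by the
   realignment token.  */

static void
example_realign_load (const int msq[4], const int lsq[4], unsigned int off,
		      int out[4])
{
  for (unsigned int i = 0; i < 4; i++)
    out[i] = (off + i < 4) ? msq[off + i] : lsq[off + i - 4];
}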
5739
5740
5741 /* Function vect_grouped_load_supported.
5742
5743 COUNT is the size of the load group (the number of statements plus the
5744 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5745 only one statement, with a gap of COUNT - 1.
5746
5747 Returns true if a suitable permute exists. */
5748
5749 bool
5750 vect_grouped_load_supported (tree vectype, bool single_element_p,
5751 unsigned HOST_WIDE_INT count)
5752 {
5753 machine_mode mode = TYPE_MODE (vectype);
5754
5755 /* If this is single-element interleaving with an element distance
5756 that leaves unused vector loads around, punt - at best we create
5757 very sub-optimal code in that case (and blow up memory,
5758 see PR65518). */
5759 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5760 {
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5763 "single-element interleaving not supported "
5764 "for not adjacent vector loads\n");
5765 return false;
5766 }
5767
5768 /* vect_permute_load_chain requires the group size to be equal to 3 or
5769 be a power of two. */
5770 if (count != 3 && exact_log2 (count) == -1)
5771 {
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5774 "the size of the group of accesses"
5775 " is not a power of 2 or not equal to 3\n");
5776 return false;
5777 }
5778
5779 /* Check that the permutation is supported. */
5780 if (VECTOR_MODE_P (mode))
5781 {
5782 unsigned int i, j;
5783 if (count == 3)
5784 {
5785 unsigned int nelt;
5786 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5787 {
5788 if (dump_enabled_p ())
5789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5790 "cannot handle groups of 3 loads for"
5791 " variable-length vectors\n");
5792 return false;
5793 }
5794
5795 vec_perm_builder sel (nelt, nelt, 1);
5796 sel.quick_grow (nelt);
5797 vec_perm_indices indices;
5798 unsigned int k;
5799 for (k = 0; k < 3; k++)
5800 {
5801 for (i = 0; i < nelt; i++)
5802 if (3 * i + k < 2 * nelt)
5803 sel[i] = 3 * i + k;
5804 else
5805 sel[i] = 0;
5806 indices.new_vector (sel, 2, nelt);
5807 if (!can_vec_perm_const_p (mode, indices))
5808 {
5809 if (dump_enabled_p ())
5810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5811 "shuffle of 3 loads is not supported by"
5812 " target\n");
5813 return false;
5814 }
5815 for (i = 0, j = 0; i < nelt; i++)
5816 if (3 * i + k < 2 * nelt)
5817 sel[i] = i;
5818 else
5819 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5820 indices.new_vector (sel, 2, nelt);
5821 if (!can_vec_perm_const_p (mode, indices))
5822 {
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5825 "shuffle of 3 loads is not supported by"
5826 " target\n");
5827 return false;
5828 }
5829 }
5830 return true;
5831 }
5832 else
5833 {
5834 /* If length is not equal to 3 then only power of 2 is supported. */
5835 gcc_assert (pow2p_hwi (count));
5836 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5837
5838 /* The encoding has a single stepped pattern. */
5839 vec_perm_builder sel (nelt, 1, 3);
5840 sel.quick_grow (3);
5841 for (i = 0; i < 3; i++)
5842 sel[i] = i * 2;
5843 vec_perm_indices indices (sel, 2, nelt);
5844 if (can_vec_perm_const_p (mode, indices))
5845 {
5846 for (i = 0; i < 3; i++)
5847 sel[i] = i * 2 + 1;
5848 indices.new_vector (sel, 2, nelt);
5849 if (can_vec_perm_const_p (mode, indices))
5850 return true;
5851 }
5852 }
5853 }
5854
5855 if (dump_enabled_p ())
5856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5857 "extract even/odd not supported by target\n");
5858 return false;
5859 }
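
/* A minimal standalone sketch of the two selectors tested above for the
   COUNT == 3 case, expanded for a concrete NELT and K; the example_* name is
   illustrative only.  The first selector gathers every third element from the
   first two vectors, the second merges the remaining elements of the third
   vector into the unused slots.  */

static void
example_load3_selectors (unsigned int nelt, unsigned int k,
			 unsigned int *sel1, unsigned int *sel2)
{
  unsigned int i, j;

  for (i = 0; i < nelt; i++)
    sel1[i] = (3 * i + k < 2 * nelt) ? 3 * i + k : 0;
  for (i = 0, j = 0; i < nelt; i++)
    sel2[i] = (3 * i + k < 2 * nelt) ? i : nelt + ((nelt + k) % 3) + 3 * (j++);
}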
5860
5861 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5862 type VECTYPE. MASKED_P says whether the masked form is needed. */
5863
5864 bool
5865 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5866 bool masked_p)
5867 {
5868 if (masked_p)
5869 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5870 vec_mask_load_lanes_optab,
5871 vectype, count);
5872 else
5873 return vect_lanes_optab_supported_p ("vec_load_lanes",
5874 vec_load_lanes_optab,
5875 vectype, count);
5876 }
5877
5878 /* Function vect_permute_load_chain.
5879
5880 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5881 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5882 the input data correctly. Return the final references for loads in
5883 RESULT_CHAIN.
5884
5885 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5886 The input is 4 vectors each containing 8 elements. We assign a number to each
5887 element, the input sequence is:
5888
5889 1st vec: 0 1 2 3 4 5 6 7
5890 2nd vec: 8 9 10 11 12 13 14 15
5891 3rd vec: 16 17 18 19 20 21 22 23
5892 4th vec: 24 25 26 27 28 29 30 31
5893
5894 The output sequence should be:
5895
5896 1st vec: 0 4 8 12 16 20 24 28
5897 2nd vec: 1 5 9 13 17 21 25 29
5898 3rd vec: 2 6 10 14 18 22 26 30
5899 4th vec: 3 7 11 15 19 23 27 31
5900
5901 i.e., the first output vector should contain the first elements of each
5902 interleaving group, etc.
5903
5904 We use extract_even/odd instructions to create such output. The input of
5905 each extract_even/odd operation is two vectors
5906 1st vec 2nd vec
5907 0 1 2 3 4 5 6 7
5908
5909 and the output is the vector of extracted even/odd elements. The output of
5910 extract_even will be: 0 2 4 6
5911 and of extract_odd: 1 3 5 7
5912
5913
5914 The permutation is done in log LENGTH stages. In each stage extract_even
5915 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5916 their order. In our example,
5917
5918 E1: extract_even (1st vec, 2nd vec)
5919 E2: extract_odd (1st vec, 2nd vec)
5920 E3: extract_even (3rd vec, 4th vec)
5921 E4: extract_odd (3rd vec, 4th vec)
5922
5923 The output for the first stage will be:
5924
5925 E1: 0 2 4 6 8 10 12 14
5926 E2: 1 3 5 7 9 11 13 15
5927 E3: 16 18 20 22 24 26 28 30
5928 E4: 17 19 21 23 25 27 29 31
5929
5930 In order to proceed and create the correct sequence for the next stage (or
5931 for the correct output, if the second stage is the last one, as in our
5932 example), we first put the output of extract_even operation and then the
5933 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5934 The input for the second stage is:
5935
5936 1st vec (E1): 0 2 4 6 8 10 12 14
5937 2nd vec (E3): 16 18 20 22 24 26 28 30
5938 3rd vec (E2): 1 3 5 7 9 11 13 15
5939 4th vec (E4): 17 19 21 23 25 27 29 31
5940
5941 The output of the second stage:
5942
5943 E1: 0 4 8 12 16 20 24 28
5944 E2: 2 6 10 14 18 22 26 30
5945 E3: 1 5 9 13 17 21 25 29
5946 E4: 3 7 11 15 19 23 27 31
5947
5948 And RESULT_CHAIN after reordering:
5949
5950 1st vec (E1): 0 4 8 12 16 20 24 28
5951 2nd vec (E3): 1 5 9 13 17 21 25 29
5952 3rd vec (E2): 2 6 10 14 18 22 26 30
5953 4th vec (E4): 3 7 11 15 19 23 27 31. */
5954
5955 static void
5956 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
5957 unsigned int length,
5958 stmt_vec_info stmt_info,
5959 gimple_stmt_iterator *gsi,
5960 vec<tree> *result_chain)
5961 {
5962 tree data_ref, first_vect, second_vect;
5963 tree perm_mask_even, perm_mask_odd;
5964 tree perm3_mask_low, perm3_mask_high;
5965 gimple *perm_stmt;
5966 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5967 unsigned int i, j, log_length = exact_log2 (length);
5968
5969 result_chain->quick_grow (length);
5970 memcpy (result_chain->address (), dr_chain.address (),
5971 length * sizeof (tree));
5972
5973 if (length == 3)
5974 {
5975 /* vect_grouped_load_supported ensures that this is constant. */
5976 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5977 unsigned int k;
5978
5979 vec_perm_builder sel (nelt, nelt, 1);
5980 sel.quick_grow (nelt);
5981 vec_perm_indices indices;
5982 for (k = 0; k < 3; k++)
5983 {
5984 for (i = 0; i < nelt; i++)
5985 if (3 * i + k < 2 * nelt)
5986 sel[i] = 3 * i + k;
5987 else
5988 sel[i] = 0;
5989 indices.new_vector (sel, 2, nelt);
5990 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5991
5992 for (i = 0, j = 0; i < nelt; i++)
5993 if (3 * i + k < 2 * nelt)
5994 sel[i] = i;
5995 else
5996 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5997 indices.new_vector (sel, 2, nelt);
5998 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5999
6000 first_vect = dr_chain[0];
6001 second_vect = dr_chain[1];
6002
6003 /* Create interleaving stmt (low part of):
6004 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6005 ...}> */
6006 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6007 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6008 second_vect, perm3_mask_low);
6009 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6010
6011 /* Create interleaving stmt (high part of):
6012 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6013 ...}> */
6014 first_vect = data_ref;
6015 second_vect = dr_chain[2];
6016 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6017 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6018 second_vect, perm3_mask_high);
6019 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6020 (*result_chain)[k] = data_ref;
6021 }
6022 }
6023 else
6024 {
6025 /* If length is not equal to 3 then only power of 2 is supported. */
6026 gcc_assert (pow2p_hwi (length));
6027
6028 /* The encoding has a single stepped pattern. */
6029 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6030 vec_perm_builder sel (nelt, 1, 3);
6031 sel.quick_grow (3);
6032 for (i = 0; i < 3; ++i)
6033 sel[i] = i * 2;
6034 vec_perm_indices indices (sel, 2, nelt);
6035 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6036
6037 for (i = 0; i < 3; ++i)
6038 sel[i] = i * 2 + 1;
6039 indices.new_vector (sel, 2, nelt);
6040 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6041
6042 for (i = 0; i < log_length; i++)
6043 {
6044 for (j = 0; j < length; j += 2)
6045 {
6046 first_vect = dr_chain[j];
6047 second_vect = dr_chain[j+1];
6048
6049 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6050 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6051 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6052 first_vect, second_vect,
6053 perm_mask_even);
6054 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6055 (*result_chain)[j/2] = data_ref;
6056
6057 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6058 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6059 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6060 first_vect, second_vect,
6061 perm_mask_odd);
6062 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6063 (*result_chain)[j/2+length/2] = data_ref;
6064 }
6065 memcpy (dr_chain.address (), result_chain->address (),
6066 length * sizeof (tree));
6067 }
6068 }
6069 }
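
/* A minimal standalone sketch of one extract_even/extract_odd stage above,
   for plain arrays of 8 ints; the example_* name is illustrative only.
   Applying it log2 (LENGTH) times, with the chain reordering described in the
   function comment, yields the strided output shown there.  */

static void
example_extract_even_odd (const int v1[8], const int v2[8],
			  int even[8], int odd[8])
{
  for (unsigned int i = 0; i < 4; i++)
    {
      even[i] = v1[2 * i];		/* even elements of v1 ++ v2 */
      even[4 + i] = v2[2 * i];
      odd[i] = v1[2 * i + 1];		/* odd elements of v1 ++ v2 */
      odd[4 + i] = v2[2 * i + 1];
    }
}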
6070
6071 /* Function vect_shift_permute_load_chain.
6072
6073 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6074 sequence of stmts to reorder the input data accordingly.
6075 Return the final references for loads in RESULT_CHAIN.
6076 Return true if successful, false otherwise.
6077
6078 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6079 The input is 3 vectors each containing 8 elements. We assign a
6080 number to each element, the input sequence is:
6081
6082 1st vec: 0 1 2 3 4 5 6 7
6083 2nd vec: 8 9 10 11 12 13 14 15
6084 3rd vec: 16 17 18 19 20 21 22 23
6085
6086 The output sequence should be:
6087
6088 1st vec: 0 3 6 9 12 15 18 21
6089 2nd vec: 1 4 7 10 13 16 19 22
6090 3rd vec: 2 5 8 11 14 17 20 23
6091
6092 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6093
6094 First we shuffle all 3 vectors to get the correct element order:
6095
6096 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6097 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6098 3rd vec: (16 19 22) (17 20 23) (18 21)
6099
6100 Next we concatenate and shift the vectors 3 times:
6101
6102 1st step:
6103 shift right by 6 the concatenation of:
6104 "1st vec" and "2nd vec"
6105 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6106 "2nd vec" and "3rd vec"
6107 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6108 "3rd vec" and "1st vec"
6109 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6110 | New vectors |
6111
6112 So that now new vectors are:
6113
6114 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6115 2nd vec: (10 13) (16 19 22) (17 20 23)
6116 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6117
6118 2nd step:
6119 shift right by 5 the concatenation of:
6120 "1st vec" and "3rd vec"
6121 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6122 "2nd vec" and "1st vec"
6123 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6124 "3rd vec" and "2nd vec"
6125 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6126 | New vectors |
6127
6128 So that now new vectors are:
6129
6130 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6131 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6132 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6133
6134 3rd step:
6135 shift right by 5 the concatenation of:
6136 "1st vec" and "1st vec"
6137 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6138 shift right by 3 the concatenation of:
6139 "2nd vec" and "2nd vec"
6140 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6141 | New vectors |
6142
6143 So that now all vectors are READY:
6144 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6145 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6146 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6147
6148 This algorithm is faster than the one in vect_permute_load_chain if:
6149 1. "shift of a concatenation" is faster than general permutation.
6150 This is usually so.
6151 2. The TARGET machine can't execute vector instructions in parallel.
6152 This is because each step of the algorithm depends on the previous one.
6153 The algorithm in vect_permute_load_chain is much more parallel.
6154
6155 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6156 */
6157
6158 static bool
6159 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6160 unsigned int length,
6161 stmt_vec_info stmt_info,
6162 gimple_stmt_iterator *gsi,
6163 vec<tree> *result_chain)
6164 {
6165 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6166 tree perm2_mask1, perm2_mask2, perm3_mask;
6167 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6168 gimple *perm_stmt;
6169
6170 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6171 unsigned int i;
6172 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6173
6174 unsigned HOST_WIDE_INT nelt, vf;
6175 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6176 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6177 /* Not supported for variable-length vectors. */
6178 return false;
6179
6180 vec_perm_builder sel (nelt, nelt, 1);
6181 sel.quick_grow (nelt);
6182
6183 result_chain->quick_grow (length);
6184 memcpy (result_chain->address (), dr_chain.address (),
6185 length * sizeof (tree));
6186
6187 if (pow2p_hwi (length) && vf > 4)
6188 {
6189 unsigned int j, log_length = exact_log2 (length);
6190 for (i = 0; i < nelt / 2; ++i)
6191 sel[i] = i * 2;
6192 for (i = 0; i < nelt / 2; ++i)
6193 sel[nelt / 2 + i] = i * 2 + 1;
6194 vec_perm_indices indices (sel, 2, nelt);
6195 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6196 {
6197 if (dump_enabled_p ())
6198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6199 "shuffle of 2 fields structure is not \
6200 supported by target\n");
6201 return false;
6202 }
6203 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6204
6205 for (i = 0; i < nelt / 2; ++i)
6206 sel[i] = i * 2 + 1;
6207 for (i = 0; i < nelt / 2; ++i)
6208 sel[nelt / 2 + i] = i * 2;
6209 indices.new_vector (sel, 2, nelt);
6210 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6211 {
6212 if (dump_enabled_p ())
6213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6214 "shuffle of 2 fields structure is not \
6215 supported by target\n");
6216 return false;
6217 }
6218 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6219
6220 /* Generating permutation constant to shift all elements.
6221 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6222 for (i = 0; i < nelt; i++)
6223 sel[i] = nelt / 2 + i;
6224 indices.new_vector (sel, 2, nelt);
6225 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6226 {
6227 if (dump_enabled_p ())
6228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6229 "shift permutation is not supported by target\n");
6230 return false;
6231 }
6232 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6233
6234 /* Generating permutation constant to select vector from 2.
6235 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6236 for (i = 0; i < nelt / 2; i++)
6237 sel[i] = i;
6238 for (i = nelt / 2; i < nelt; i++)
6239 sel[i] = nelt + i;
6240 indices.new_vector (sel, 2, nelt);
6241 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6242 {
6243 if (dump_enabled_p ())
6244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6245 "select is not supported by target\n");
6246 return false;
6247 }
6248 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6249
6250 for (i = 0; i < log_length; i++)
6251 {
6252 for (j = 0; j < length; j += 2)
6253 {
6254 first_vect = dr_chain[j];
6255 second_vect = dr_chain[j + 1];
6256
6257 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6258 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6259 first_vect, first_vect,
6260 perm2_mask1);
6261 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6262 vect[0] = data_ref;
6263
6264 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6265 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6266 second_vect, second_vect,
6267 perm2_mask2);
6268 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6269 vect[1] = data_ref;
6270
6271 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6272 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6273 vect[0], vect[1], shift1_mask);
6274 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6275 (*result_chain)[j/2 + length/2] = data_ref;
6276
6277 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6278 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6279 vect[0], vect[1], select_mask);
6280 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6281 (*result_chain)[j/2] = data_ref;
6282 }
6283 memcpy (dr_chain.address (), result_chain->address (),
6284 length * sizeof (tree));
6285 }
6286 return true;
6287 }
6288 if (length == 3 && vf > 2)
6289 {
6290 unsigned int k = 0, l = 0;
6291
6292 /* Generating permutation constant to get all elements in the right order.
6293 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6294 for (i = 0; i < nelt; i++)
6295 {
6296 if (3 * k + (l % 3) >= nelt)
6297 {
6298 k = 0;
6299 l += (3 - (nelt % 3));
6300 }
6301 sel[i] = 3 * k + (l % 3);
6302 k++;
6303 }
6304 vec_perm_indices indices (sel, 2, nelt);
6305 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6306 {
6307 if (dump_enabled_p ())
6308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6309 "shuffle of 3 fields structure is not \
6310 supported by target\n");
6311 return false;
6312 }
6313 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6314
6315 /* Generating permutation constant to shift all elements.
6316 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6317 for (i = 0; i < nelt; i++)
6318 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6319 indices.new_vector (sel, 2, nelt);
6320 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6321 {
6322 if (dump_enabled_p ())
6323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6324 "shift permutation is not supported by target\n");
6325 return false;
6326 }
6327 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6328
6329 /* Generating permutation constant to shift all elements.
6330 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6331 for (i = 0; i < nelt; i++)
6332 sel[i] = 2 * (nelt / 3) + 1 + i;
6333 indices.new_vector (sel, 2, nelt);
6334 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6335 {
6336 if (dump_enabled_p ())
6337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6338 "shift permutation is not supported by target\n");
6339 return false;
6340 }
6341 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6342
6343 /* Generating permutation constant to shift all elements.
6344 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6345 for (i = 0; i < nelt; i++)
6346 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6347 indices.new_vector (sel, 2, nelt);
6348 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6349 {
6350 if (dump_enabled_p ())
6351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6352 "shift permutation is not supported by target\n");
6353 return false;
6354 }
6355 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6356
6357 /* Generating permutation constant to shift all elements.
6358 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6359 for (i = 0; i < nelt; i++)
6360 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6361 indices.new_vector (sel, 2, nelt);
6362 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6363 {
6364 if (dump_enabled_p ())
6365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6366 "shift permutation is not supported by target\n");
6367 return false;
6368 }
6369 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6370
6371 for (k = 0; k < 3; k++)
6372 {
6373 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6374 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6375 dr_chain[k], dr_chain[k],
6376 perm3_mask);
6377 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6378 vect[k] = data_ref;
6379 }
6380
6381 for (k = 0; k < 3; k++)
6382 {
6383 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6384 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6385 vect[k % 3], vect[(k + 1) % 3],
6386 shift1_mask);
6387 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6388 vect_shift[k] = data_ref;
6389 }
6390
6391 for (k = 0; k < 3; k++)
6392 {
6393 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6394 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6395 vect_shift[(4 - k) % 3],
6396 vect_shift[(3 - k) % 3],
6397 shift2_mask);
6398 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6399 vect[k] = data_ref;
6400 }
6401
6402 (*result_chain)[3 - (nelt % 3)] = vect[2];
6403
6404 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6405 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6406 vect[0], shift3_mask);
6407 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6408 (*result_chain)[nelt % 3] = data_ref;
6409
6410 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6411 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6412 vect[1], shift4_mask);
6413 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6414 (*result_chain)[0] = data_ref;
6415 return true;
6416 }
6417 return false;
6418 }
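
/* A minimal standalone sketch of the "shift of a concatenation" primitive the
   algorithm above is built from, on plain int arrays; the example_* name is
   illustrative only.  Selecting elements SHIFT .. SHIFT + NELT - 1 of V1 ++ V2
   corresponds to the {shift, shift + 1, ...} permutation masks generated
   above.  */

static void
example_shift_concat (const int *v1, const int *v2, unsigned int nelt,
		      unsigned int shift, int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int idx = shift + i;
      out[i] = (idx < nelt) ? v1[idx] : v2[idx - nelt];
    }
}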
6419
6420 /* Function vect_transform_grouped_load.
6421
6422 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6423 to perform their permutation and assign the resulting vectorized statements to
6424 the scalar statements.
6425 */
6426
6427 void
6428 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6429 vec<tree> dr_chain,
6430 int size, gimple_stmt_iterator *gsi)
6431 {
6432 machine_mode mode;
6433 vec<tree> result_chain = vNULL;
6434
6435 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6436 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6437 vectors that are ready for vector computation. */
6438 result_chain.create (size);
6439
6440 /* If the reassociation width for the vector type is 2 or greater, the target
6441 machine can execute 2 or more vector instructions in parallel. Otherwise
6442 try to get the chain for the load group using vect_shift_permute_load_chain. */
6443 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6444 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6445 || pow2p_hwi (size)
6446 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6447 gsi, &result_chain))
6448 vect_permute_load_chain (vinfo, dr_chain,
6449 size, stmt_info, gsi, &result_chain);
6450 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6451 result_chain.release ();
6452 }
6453
6454 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6455 generated as part of the vectorization of STMT_INFO. Assign the statement
6456 for each vector to the associated scalar statement. */
6457
6458 void
6459 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6460 vec<tree> result_chain)
6461 {
6462 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6463 unsigned int i, gap_count;
6464 tree tmp_data_ref;
6465
6466 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6467 Since we scan the chain starting from its first node, their order
6468 corresponds to the order of data-refs in RESULT_CHAIN. */
6469 stmt_vec_info next_stmt_info = first_stmt_info;
6470 gap_count = 1;
6471 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6472 {
6473 if (!next_stmt_info)
6474 break;
6475
6476 /* Skip the gaps. Loads created for the gaps will be removed by dead
6477 code elimination pass later. No need to check for the first stmt in
6478 the group, since it always exists.
6479 DR_GROUP_GAP is the number of steps in elements from the previous
6480 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6481 correspond to the gaps. */
6482 if (next_stmt_info != first_stmt_info
6483 && gap_count < DR_GROUP_GAP (next_stmt_info))
6484 {
6485 gap_count++;
6486 continue;
6487 }
6488
6489 /* ??? The following needs cleanup after the removal of
6490 DR_GROUP_SAME_DR_STMT. */
6491 if (next_stmt_info)
6492 {
6493 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6494 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6495 copies, and we put the new vector statement last. */
6496 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6497
6498 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6499 gap_count = 1;
6500 }
6501 }
6502 }
6503
6504 /* Function vect_can_force_dr_alignment_p.
6505
6506 Returns whether the alignment of a DECL can be forced to be aligned
6507 on ALIGNMENT bit boundary. */
6508
6509 bool
6510 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6511 {
6512 if (!VAR_P (decl))
6513 return false;
6514
6515 if (decl_in_symtab_p (decl)
6516 && !symtab_node::get (decl)->can_increase_alignment_p ())
6517 return false;
6518
6519 if (TREE_STATIC (decl))
6520 return (known_le (alignment,
6521 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6522 else
6523 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6524 }
6525
6526
6527 /* Return whether the data reference DR_INFO is supported with respect to its
6528 alignment.
6529 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6530 if it is aligned, i.e., check if it is possible to vectorize it with different
6531 alignment. */
6532
6533 enum dr_alignment_support
6534 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6535 bool check_aligned_accesses)
6536 {
6537 data_reference *dr = dr_info->dr;
6538 stmt_vec_info stmt_info = dr_info->stmt;
6539 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6540 machine_mode mode = TYPE_MODE (vectype);
6541 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6542 class loop *vect_loop = NULL;
6543 bool nested_in_vect_loop = false;
6544
6545 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6546 return dr_aligned;
6547
6548 /* For now assume all conditional loads/stores support unaligned
6549 access without any special code. */
6550 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6551 if (gimple_call_internal_p (stmt)
6552 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6553 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6554 return dr_unaligned_supported;
6555
6556 if (loop_vinfo)
6557 {
6558 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6559 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6560 }
6561
6562 /* Possibly unaligned access. */
6563
6564 /* We can choose between using the implicit realignment scheme (generating
6565 a misaligned_move stmt) and the explicit realignment scheme (generating
6566 aligned loads with a REALIGN_LOAD). There are two variants to the
6567 explicit realignment scheme: optimized, and unoptimized.
6568 We can optimize the realignment only if the step between consecutive
6569 vector loads is equal to the vector size. Since the vector memory
6570 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6571 is guaranteed that the misalignment amount remains the same throughout the
6572 execution of the vectorized loop. Therefore, we can create the
6573 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6574 at the loop preheader.
6575
6576 However, in the case of outer-loop vectorization, when vectorizing a
6577 memory access in the inner-loop nested within the LOOP that is now being
6578 vectorized, while it is guaranteed that the misalignment of the
6579 vectorized memory access will remain the same in different outer-loop
6580 iterations, it is *not* guaranteed that it will remain the same throughout
6581 the execution of the inner-loop. This is because the inner-loop advances
6582 with the original scalar step (and not in steps of VS). If the inner-loop
6583 step happens to be a multiple of VS, then the misalignment remains fixed
6584 and we can use the optimized realignment scheme. For example:
6585
6586 for (i=0; i<N; i++)
6587 for (j=0; j<M; j++)
6588 s += a[i+j];
6589
6590 When vectorizing the i-loop in the above example, the step between
6591 consecutive vector loads is 1, and so the misalignment does not remain
6592 fixed across the execution of the inner-loop, and the realignment cannot
6593 be optimized (as illustrated in the following pseudo vectorized loop):
6594
6595 for (i=0; i<N; i+=4)
6596 for (j=0; j<M; j++){
6597 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6598 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6599 // (assuming that we start from an aligned address).
6600 }
6601
6602 We therefore have to use the unoptimized realignment scheme:
6603
6604 for (i=0; i<N; i+=4)
6605 for (j=k; j<M; j+=4)
6606 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6607 // that the misalignment of the initial address is
6608 // 0).
6609
6610 The loop can then be vectorized as follows:
6611
6612 for (k=0; k<4; k++){
6613 rt = get_realignment_token (&vp[k]);
6614 for (i=0; i<N; i+=4){
6615 v1 = vp[i+k];
6616 for (j=k; j<M; j+=4){
6617 v2 = vp[i+j+VS-1];
6618 va = REALIGN_LOAD <v1,v2,rt>;
6619 vs += va;
6620 v1 = v2;
6621 }
6622 }
6623 } */
6624
6625 if (DR_IS_READ (dr))
6626 {
6627 bool is_packed = false;
6628 tree type = (TREE_TYPE (DR_REF (dr)));
6629
6630 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6631 && (!targetm.vectorize.builtin_mask_for_load
6632 || targetm.vectorize.builtin_mask_for_load ()))
6633 {
6634 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6635
6636 /* If we are doing SLP then the accesses need not have the
6637 same alignment, instead it depends on the SLP group size. */
6638 if (loop_vinfo
6639 && STMT_SLP_TYPE (stmt_info)
6640 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6641 * (DR_GROUP_SIZE
6642 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6643 TYPE_VECTOR_SUBPARTS (vectype)))
6644 ;
6645 else if (!loop_vinfo
6646 || (nested_in_vect_loop
6647 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6648 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6649 return dr_explicit_realign;
6650 else
6651 return dr_explicit_realign_optimized;
6652 }
6653 if (!known_alignment_for_access_p (dr_info))
6654 is_packed = not_size_aligned (DR_REF (dr));
6655
6656 if (targetm.vectorize.support_vector_misalignment
6657 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6658 /* Can't software pipeline the loads, but can at least do them. */
6659 return dr_unaligned_supported;
6660 }
6661 else
6662 {
6663 bool is_packed = false;
6664 tree type = (TREE_TYPE (DR_REF (dr)));
6665
6666 if (!known_alignment_for_access_p (dr_info))
6667 is_packed = not_size_aligned (DR_REF (dr));
6668
6669 if (targetm.vectorize.support_vector_misalignment
6670 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6671 return dr_unaligned_supported;
6672 }
6673
6674 /* Unsupported. */
6675 return dr_unaligned_unsupported;
6676 }