1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
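/* For example, a hypothetical query for COUNT = 3 vectors of V4SImode first
   asks the target for a dedicated array mode; failing that, it falls back to
   an integer mode of 3 * 128 bits, and it finally checks that OPTAB has a
   handler for that (array mode, V4SImode) pair, as an ld3/st3-style
   structure load/store instruction would provide. */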
59
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
63 {
64 machine_mode mode, array_mode;
65 bool limit_p;
66
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
69 {
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
73 {
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode), count);
78 return false;
79 }
80 }
81
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
83 {
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
89 }
90
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
95
96 return true;
97 }
98
99
100 /* Return the smallest scalar part of STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
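/* For example, in a hypothetical reduction statement
     int_acc_1 = DOT_PROD_EXPR <char_a, char_b, int_acc_0>;
   the lhs type is 'int' but the smallest scalar type is 'char', taken from
   the first rhs operand, and it is 'char' that determines the VF. */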
116
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
121 {
122 tree scalar_type = gimple_expr_type (stmt_info->stmt);
123 HOST_WIDE_INT lhs, rhs;
124
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
129
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
131
132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133 if (assign
134 && (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
142 {
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
148 }
149 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
150 {
151 unsigned int i = 0;
152 if (gimple_call_internal_p (call))
153 {
154 internal_fn ifn = gimple_call_internal_fn (call);
155 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
156 /* gimple_expr_type already picked the type of the loaded
157 or stored data. */
158 i = ~0U;
159 else if (internal_fn_mask_index (ifn) == 0)
160 i = 1;
161 }
162 if (i < gimple_call_num_args (call))
163 {
164 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
165 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
166 {
167 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
168 if (rhs < lhs)
169 scalar_type = rhs_type;
170 }
171 }
172 }
173
174 *lhs_size_unit = lhs;
175 *rhs_size_unit = rhs;
176 return scalar_type;
177 }
178
179
180 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
181 tested at run-time. Return TRUE if DDR was successfully inserted.
182 Return false if versioning is not supported. */
183
184 static opt_result
185 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
186 {
187 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
188
189 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
190 return opt_result::failure_at (vect_location,
191 "will not create alias checks, as"
192 " --param vect-max-version-for-alias-checks"
193 " == 0\n");
194
195 opt_result res
196 = runtime_alias_check_p (ddr, loop,
197 optimize_loop_nest_for_speed_p (loop));
198 if (!res)
199 return res;
200
201 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
202 return opt_result::success ();
203 }
204
205 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
206
207 static void
208 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
209 {
210 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
211 for (unsigned int i = 0; i < checks.length (); ++i)
212 if (checks[i] == value)
213 return;
214
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location,
217 "need run-time check that %T is nonzero\n",
218 value);
219 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
220 }
221
222 /* Return true if we know that the order of vectorized DR_INFO_A and
223 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
224 DR_INFO_B. At least one of the accesses is a write. */
225
226 static bool
227 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
228 {
229 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
230 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
231
232 /* Single statements are always kept in their original order. */
233 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
234 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
235 return true;
236
237 /* STMT_A and STMT_B belong to overlapping groups. All loads are
238 emitted at the position of the first scalar load.
239 Stores in a group are emitted at the position of the last scalar store.
240 Compute that position and check whether the resulting order matches
241 the current one. */
242 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
243 if (il_a)
244 {
245 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
246 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
247 s = DR_GROUP_NEXT_ELEMENT (s))
248 il_a = get_later_stmt (il_a, s);
249 else /* DR_IS_READ */
250 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
251 s = DR_GROUP_NEXT_ELEMENT (s))
252 if (get_later_stmt (il_a, s) == il_a)
253 il_a = s;
254 }
255 else
256 il_a = stmtinfo_a;
257 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
258 if (il_b)
259 {
260 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
261 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
262 s = DR_GROUP_NEXT_ELEMENT (s))
263 il_b = get_later_stmt (il_b, s);
264 else /* DR_IS_READ */
265 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
266 s = DR_GROUP_NEXT_ELEMENT (s))
267 if (get_later_stmt (il_b, s) == il_b)
268 il_b = s;
269 }
270 else
271 il_b = stmtinfo_b;
272 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
273 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
274 }
275
276 /* A subroutine of vect_analyze_data_ref_dependence. Handle
277 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
278 distances. These distances are conservatively correct but they don't
279 reflect a guaranteed dependence.
280
281 Return true if this function does all the work necessary to avoid
282 an alias or false if the caller should use the dependence distances
283 to limit the vectorization factor in the usual way. LOOP_DEPTH is
284 the depth of the loop described by LOOP_VINFO and the other arguments
285 are as for vect_analyze_data_ref_dependence. */
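/* For example, a hypothetical distance of 4, computed on the assumption that
   the two bases are equal, is handled as follows: if the user asserted
   safelen >= 4 we assume independence (capping MAX_VF at safelen); otherwise,
   because the bases may in fact be independent, we prefer a runtime alias
   check over limiting MAX_VF to 4. */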
286
287 static bool
288 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
289 loop_vec_info loop_vinfo,
290 int loop_depth, unsigned int *max_vf)
291 {
292 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
293 lambda_vector dist_v;
294 unsigned int i;
295 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
296 {
297 int dist = dist_v[loop_depth];
298 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
299 {
300 /* If the user asserted safelen >= DIST consecutive iterations
301 can be executed concurrently, assume independence.
302
303 ??? An alternative would be to add the alias check even
304 in this case, and vectorize the fallback loop with the
305 maximum VF set to safelen. However, if the user has
306 explicitly given a length, it's less likely that that
307 would be a win. */
308 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
309 {
310 if ((unsigned int) loop->safelen < *max_vf)
311 *max_vf = loop->safelen;
312 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
313 continue;
314 }
315
316 /* For dependence distances of 2 or more, we have the option
317 of limiting VF or checking for an alias at runtime.
318 Prefer to check at runtime if we can, to avoid limiting
319 the VF unnecessarily when the bases are in fact independent.
320
321 Note that the alias checks will be removed if the VF ends up
322 being small enough. */
323 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
324 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
325 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
326 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
327 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
328 }
329 }
330 return true;
331 }
332
333
334 /* Function vect_analyze_data_ref_dependence.
335
336 FIXME: I needed to change the sense of the returned flag.
337
338 Return FALSE if there (might) exist a dependence between a memory-reference
339 DRA and a memory-reference DRB. When versioning for alias may check a
340 dependence at run-time, return TRUE. Adjust *MAX_VF according to
341 the data dependence. */
342
343 static opt_result
344 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
345 loop_vec_info loop_vinfo,
346 unsigned int *max_vf)
347 {
348 unsigned int i;
349 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
350 struct data_reference *dra = DDR_A (ddr);
351 struct data_reference *drb = DDR_B (ddr);
352 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
353 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
354 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
355 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
356 lambda_vector dist_v;
357 unsigned int loop_depth;
358
359 /* In loop analysis all data references should be vectorizable. */
360 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
361 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
362 gcc_unreachable ();
363
364 /* Independent data accesses. */
365 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
366 return opt_result::success ();
367
368 if (dra == drb
369 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
370 return opt_result::success ();
371
372 /* We do not have to consider dependences between accesses that belong
373 to the same group, unless the stride could be smaller than the
374 group size. */
375 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
376 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
377 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
378 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
379 return opt_result::success ();
380
381 /* Even if we have an anti-dependence then, as the vectorized loop covers at
382 least two scalar iterations, there is always also a true dependence.
383 As the vectorizer does not re-order loads and stores we can ignore
384 the anti-dependence if TBAA can disambiguate both DRs similar to the
385 case with known negative distance anti-dependences (positive
386 distance anti-dependences would violate TBAA constraints). */
387 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
388 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
389 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
390 get_alias_set (DR_REF (drb))))
391 return opt_result::success ();
392
393 /* Unknown data dependence. */
394 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
395 {
396 /* If user asserted safelen consecutive iterations can be
397 executed concurrently, assume independence. */
398 if (loop->safelen >= 2)
399 {
400 if ((unsigned int) loop->safelen < *max_vf)
401 *max_vf = loop->safelen;
402 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
403 return opt_result::success ();
404 }
405
406 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
407 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
408 return opt_result::failure_at
409 (stmtinfo_a->stmt,
410 "versioning for alias not supported for: "
411 "can't determine dependence between %T and %T\n",
412 DR_REF (dra), DR_REF (drb));
413
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
416 "versioning for alias required: "
417 "can't determine dependence between %T and %T\n",
418 DR_REF (dra), DR_REF (drb));
419
420 /* Add to list of ddrs that need to be tested at run-time. */
421 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
422 }
423
424 /* Known data dependence. */
425 if (DDR_NUM_DIST_VECTS (ddr) == 0)
426 {
427 /* If user asserted safelen consecutive iterations can be
428 executed concurrently, assume independence. */
429 if (loop->safelen >= 2)
430 {
431 if ((unsigned int) loop->safelen < *max_vf)
432 *max_vf = loop->safelen;
433 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
434 return opt_result::success ();
435 }
436
437 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
438 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
439 return opt_result::failure_at
440 (stmtinfo_a->stmt,
441 "versioning for alias not supported for: "
442 "bad dist vector for %T and %T\n",
443 DR_REF (dra), DR_REF (drb));
444
445 if (dump_enabled_p ())
446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
447 "versioning for alias required: "
448 "bad dist vector for %T and %T\n",
449 DR_REF (dra), DR_REF (drb));
450 /* Add to list of ddrs that need to be tested at run-time. */
451 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
452 }
453
454 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
455
456 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
457 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
458 loop_depth, max_vf))
459 return opt_result::success ();
460
461 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
462 {
463 int dist = dist_v[loop_depth];
464
465 if (dump_enabled_p ())
466 dump_printf_loc (MSG_NOTE, vect_location,
467 "dependence distance = %d.\n", dist);
468
469 if (dist == 0)
470 {
471 if (dump_enabled_p ())
472 dump_printf_loc (MSG_NOTE, vect_location,
473 "dependence distance == 0 between %T and %T\n",
474 DR_REF (dra), DR_REF (drb));
475
476 /* When we perform grouped accesses and perform implicit CSE
477 by detecting equal accesses and doing disambiguation with
478 runtime alias tests like for
479 .. = a[i];
480 .. = a[i+1];
481 a[i] = ..;
482 a[i+1] = ..;
483 *p = ..;
484 .. = a[i];
485 .. = a[i+1];
486 where we will end up loading { a[i], a[i+1] } once, make
487 sure that inserting group loads before the first load and
488 stores after the last store will do the right thing.
489 Similar for groups like
490 a[i] = ...;
491 ... = a[i];
492 a[i+1] = ...;
493 where loads from the group interleave with the store. */
494 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
495 return opt_result::failure_at (stmtinfo_a->stmt,
496 "READ_WRITE dependence"
497 " in interleaving.\n");
498
499 if (loop->safelen < 2)
500 {
501 tree indicator = dr_zero_step_indicator (dra);
502 if (!indicator || integer_zerop (indicator))
503 return opt_result::failure_at (stmtinfo_a->stmt,
504 "access also has a zero step\n");
505 else if (TREE_CODE (indicator) != INTEGER_CST)
506 vect_check_nonzero_value (loop_vinfo, indicator);
507 }
508 continue;
509 }
510
511 if (dist > 0 && DDR_REVERSED_P (ddr))
512 {
513 /* If DDR_REVERSED_P the order of the data-refs in DDR was
514 reversed (to make distance vector positive), and the actual
515 distance is negative. */
516 if (dump_enabled_p ())
517 dump_printf_loc (MSG_NOTE, vect_location,
518 "dependence distance negative.\n");
519 /* When doing outer loop vectorization, we need to check if there is
520 a backward dependence at the inner loop level if the dependence
521 at the outer loop is reversed. See PR81740. */
522 if (nested_in_vect_loop_p (loop, stmtinfo_a)
523 || nested_in_vect_loop_p (loop, stmtinfo_b))
524 {
525 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
526 DDR_LOOP_NEST (ddr));
527 if (dist_v[inner_depth] < 0)
528 return opt_result::failure_at (stmtinfo_a->stmt,
529 "not vectorized, dependence "
530 "between data-refs %T and %T\n",
531 DR_REF (dra), DR_REF (drb));
532 }
533 /* Record a negative dependence distance to later limit the
534 amount of stmt copying / unrolling we can perform.
535 Only need to handle read-after-write dependence. */
536 if (DR_IS_READ (drb)
537 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
538 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
539 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
540 continue;
541 }
542
543 unsigned int abs_dist = abs (dist);
544 if (abs_dist >= 2 && abs_dist < *max_vf)
545 {
546 /* The dependence distance requires reduction of the maximal
547 vectorization factor. */
548 *max_vf = abs_dist;
549 if (dump_enabled_p ())
550 dump_printf_loc (MSG_NOTE, vect_location,
551 "adjusting maximal vectorization factor to %i\n",
552 *max_vf);
553 }
554
555 if (abs_dist >= *max_vf)
556 {
557 /* Dependence distance does not create dependence, as far as
558 vectorization is concerned, in this case. */
559 if (dump_enabled_p ())
560 dump_printf_loc (MSG_NOTE, vect_location,
561 "dependence distance >= VF.\n");
562 continue;
563 }
564
565 return opt_result::failure_at (stmtinfo_a->stmt,
566 "not vectorized, possible dependence "
567 "between data-refs %T and %T\n",
568 DR_REF (dra), DR_REF (drb));
569 }
570
571 return opt_result::success ();
572 }
573
574 /* Function vect_analyze_data_ref_dependences.
575
576 Examine all the data references in the loop, and make sure there do not
577 exist any data dependences between them. Set *MAX_VF according to
578 the maximum vectorization factor the data dependences allow. */
579
580 opt_result
581 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
582 unsigned int *max_vf)
583 {
584 unsigned int i;
585 struct data_dependence_relation *ddr;
586
587 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
588
589 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
590 {
591 LOOP_VINFO_DDRS (loop_vinfo)
592 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
593 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
594 /* We do not need read-read dependences. */
595 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
596 &LOOP_VINFO_DDRS (loop_vinfo),
597 LOOP_VINFO_LOOP_NEST (loop_vinfo),
598 false);
599 gcc_assert (res);
600 }
601
602 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
603
604 /* For epilogues we either have no aliases or alias versioning
605 was applied to the original loop. Therefore we may just get max_vf
606 using the VF of the original loop. */
607 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
608 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
609 else
610 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
611 {
612 opt_result res
613 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
614 if (!res)
615 return res;
616 }
617
618 return opt_result::success ();
619 }
620
621
622 /* Function vect_slp_analyze_data_ref_dependence.
623
624 Return TRUE if there (might) exist a dependence between a memory-reference
625 DRA and a memory-reference DRB for VINFO. When versioning for alias
626 may check a dependence at run-time, return FALSE. Unlike the loop variant
627 above, there is no *MAX_VF to adjust. */
628
629 static bool
630 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
631 struct data_dependence_relation *ddr)
632 {
633 struct data_reference *dra = DDR_A (ddr);
634 struct data_reference *drb = DDR_B (ddr);
635 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
636 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
637
638 /* We need to check dependences of statements marked as unvectorizable
639 as well; they can still prohibit vectorization. */
640
641 /* Independent data accesses. */
642 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
643 return false;
644
645 if (dra == drb)
646 return false;
647
648 /* Read-read is OK. */
649 if (DR_IS_READ (dra) && DR_IS_READ (drb))
650 return false;
651
652 /* If dra and drb are part of the same interleaving chain consider
653 them independent. */
654 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
655 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
656 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
657 return false;
658
659 /* Unknown data dependence. */
660 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
661 {
662 if (dump_enabled_p ())
663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
664 "can't determine dependence between %T and %T\n",
665 DR_REF (dra), DR_REF (drb));
666 }
667 else if (dump_enabled_p ())
668 dump_printf_loc (MSG_NOTE, vect_location,
669 "determined dependence between %T and %T\n",
670 DR_REF (dra), DR_REF (drb));
671
672 return true;
673 }
674
675
676 /* Analyze dependences involved in the transform of SLP NODE. STORES
677 contain the vector of scalar stores of this instance if we are
678 disambiguating the loads. */
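/* For example, sinking the stores of a hypothetical group
     a[i] = x; *p = t; a[i+1] = y;
   down to the last scalar store is only valid if the intervening '*p = t'
   can be shown not to alias the a[] accesses, either through the computed
   data dependences or, when no data reference was recorded for it, through
   the alias oracle. */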
679
680 static bool
681 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
682 vec<stmt_vec_info> stores,
683 stmt_vec_info last_store_info)
684 {
685 /* This walks over all stmts involved in the SLP load/store done
686 in NODE verifying we can sink them up to the last stmt in the
687 group. */
688 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
689 {
690 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
691 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
692 {
693 stmt_vec_info access_info
694 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
695 if (access_info == last_access_info)
696 continue;
697 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
698 ao_ref ref;
699 bool ref_initialized_p = false;
700 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
701 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
702 {
703 gimple *stmt = gsi_stmt (gsi);
704 if (! gimple_vuse (stmt))
705 continue;
706
707 /* If we couldn't record a (single) data reference for this
708 stmt we have to resort to the alias oracle. */
709 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
710 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
711 if (!dr_b)
712 {
713 /* We are moving a store - this means
714 we cannot use TBAA for disambiguation. */
715 if (!ref_initialized_p)
716 ao_ref_init (&ref, DR_REF (dr_a));
717 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
718 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
719 return false;
720 continue;
721 }
722
723 bool dependent = false;
724 /* If we run into a store of this same instance (we've just
725 marked those) then delay dependence checking until we run
726 into the last store because this is where it will have
727 been sunk to (and we verify if we can do that as well). */
728 if (gimple_visited_p (stmt))
729 {
730 if (stmt_info != last_store_info)
731 continue;
732 unsigned i;
733 stmt_vec_info store_info;
734 FOR_EACH_VEC_ELT (stores, i, store_info)
735 {
736 data_reference *store_dr
737 = STMT_VINFO_DATA_REF (store_info);
738 ddr_p ddr = initialize_data_dependence_relation
739 (dr_a, store_dr, vNULL);
740 dependent
741 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
742 free_dependence_relation (ddr);
743 if (dependent)
744 break;
745 }
746 }
747 else
748 {
749 ddr_p ddr = initialize_data_dependence_relation (dr_a,
750 dr_b, vNULL);
751 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
752 free_dependence_relation (ddr);
753 }
754 if (dependent)
755 return false;
756 }
757 }
758 }
759 else /* DR_IS_READ */
760 {
761 stmt_vec_info first_access_info
762 = vect_find_first_scalar_stmt_in_slp (node);
763 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
764 {
765 stmt_vec_info access_info
766 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
767 if (access_info == first_access_info)
768 continue;
769 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
770 ao_ref ref;
771 bool ref_initialized_p = false;
772 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
773 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
774 {
775 gimple *stmt = gsi_stmt (gsi);
776 if (! gimple_vdef (stmt))
777 continue;
778
779 /* If we couldn't record a (single) data reference for this
780 stmt we have to resort to the alias oracle. */
781 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
782 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
783 if (!dr_b)
784 {
785 /* We are hoisting a load - this means we can use
786 TBAA for disambiguation. */
787 if (!ref_initialized_p)
788 ao_ref_init (&ref, DR_REF (dr_a));
789 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
790 return false;
791 continue;
792 }
793
794 bool dependent = false;
795 /* If we run into a store of this same instance (we've just
796 marked those) then delay dependence checking until we run
797 into the last store because this is where it will have
798 been sunk to (and we verify if we can do that as well). */
799 if (gimple_visited_p (stmt))
800 {
801 if (stmt_info != last_store_info)
802 continue;
803 unsigned i;
804 stmt_vec_info store_info;
805 FOR_EACH_VEC_ELT (stores, i, store_info)
806 {
807 data_reference *store_dr
808 = STMT_VINFO_DATA_REF (store_info);
809 ddr_p ddr = initialize_data_dependence_relation
810 (dr_a, store_dr, vNULL);
811 dependent
812 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
813 free_dependence_relation (ddr);
814 if (dependent)
815 break;
816 }
817 }
818 else
819 {
820 ddr_p ddr = initialize_data_dependence_relation (dr_a,
821 dr_b, vNULL);
822 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
823 free_dependence_relation (ddr);
824 }
825 if (dependent)
826 return false;
827 }
828 }
829 }
830 return true;
831 }
832
833
834 /* Function vect_slp_analyze_instance_dependence.
835
836 Examine all the data references in SLP instance INSTANCE and make sure
837 there do not exist any data dependences that would prevent the scalar
838 loads and stores from being moved to their vectorized positions. */
839
840 bool
841 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
842 {
843 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
844
845 /* The stores of this instance are at the root of the SLP tree. */
846 slp_tree store = SLP_INSTANCE_TREE (instance);
847 if (! STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (store)))
848 store = NULL;
849
850 /* Verify we can sink stores to the vectorized stmt insert location. */
851 stmt_vec_info last_store_info = NULL;
852 if (store)
853 {
854 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
855 return false;
856
857 /* Mark stores in this instance and remember the last one. */
858 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
859 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
860 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
861 }
862
863 bool res = true;
864
865 /* Verify we can sink loads to the vectorized stmt insert location,
866 special-casing stores of this instance. */
867 slp_tree load;
868 unsigned int i;
869 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
870 if (! vect_slp_analyze_node_dependences (vinfo, load,
871 store
872 ? SLP_TREE_SCALAR_STMTS (store)
873 : vNULL, last_store_info))
874 {
875 res = false;
876 break;
877 }
878
879 /* Unset the visited flag. */
880 if (store)
881 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
882 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
883
884 return res;
885 }
886
887 /* Record the base alignment guarantee given by DRB, which occurs
888 in STMT_INFO. */
889
890 static void
891 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
892 innermost_loop_behavior *drb)
893 {
894 bool existed;
895 innermost_loop_behavior *&entry
896 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
897 if (!existed || entry->base_alignment < drb->base_alignment)
898 {
899 entry = drb;
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_NOTE, vect_location,
902 "recording new base alignment for %T\n"
903 " alignment: %d\n"
904 " misalignment: %d\n"
905 " based on: %G",
906 drb->base_address,
907 drb->base_alignment,
908 drb->base_misalignment,
909 stmt_info->stmt);
910 }
911 }
912
913 /* If the region we're going to vectorize is reached, all unconditional
914 data references occur at least once. We can therefore pool the base
915 alignment guarantees from each unconditional reference. Do this by
916 going through all the data references in VINFO and checking whether
917 the containing statement makes the reference unconditionally. If so,
918 record the alignment of the base address in VINFO so that it can be
919 used for all other references with the same base. */
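/* For example, if a reference *p is executed unconditionally on every
   iteration and guarantees, say, a 16-byte aligned base, then a second,
   conditional reference with the same base address can reuse that 16-byte
   guarantee even though it is not executed on every iteration. */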
920
921 void
922 vect_record_base_alignments (vec_info *vinfo)
923 {
924 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
925 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
926 data_reference *dr;
927 unsigned int i;
928 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
929 {
930 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
931 stmt_vec_info stmt_info = dr_info->stmt;
932 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
933 && STMT_VINFO_VECTORIZABLE (stmt_info)
934 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
935 {
936 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
937
938 /* If DR is nested in the loop that is being vectorized, we can also
939 record the alignment of the base wrt the outer loop. */
940 if (loop && nested_in_vect_loop_p (loop, stmt_info))
941 vect_record_base_alignment
942 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
943 }
944 }
945 }
946
947 /* Return the target alignment for the vectorized form of DR_INFO. */
948
949 static poly_uint64
950 vect_calculate_target_alignment (dr_vec_info *dr_info)
951 {
952 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
953 return targetm.vectorize.preferred_vector_alignment (vectype);
954 }
955
956 /* Function vect_compute_data_ref_alignment
957
958 Compute the misalignment of the data reference DR_INFO.
959
960 Output:
961 1. DR_MISALIGNMENT (DR_INFO) is defined.
962
963 FOR NOW: No analysis is actually performed. Misalignment is calculated
964 only for trivial cases. TODO. */
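/* For example, with a hypothetical 16-byte target alignment, an access to
   a[3] in an 'int' array whose base is known to be 16-byte aligned gets
   DR_MISALIGNMENT = 12, the byte offset of the first access modulo the
   target alignment; if the base alignment or the offset is not known at
   compile time, the misalignment is left as unknown (-1). */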
965
966 static void
967 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
968 {
969 stmt_vec_info stmt_info = dr_info->stmt;
970 vec_base_alignments *base_alignments = &vinfo->base_alignments;
971 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
972 class loop *loop = NULL;
973 tree ref = DR_REF (dr_info->dr);
974 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
975
976 if (dump_enabled_p ())
977 dump_printf_loc (MSG_NOTE, vect_location,
978 "vect_compute_data_ref_alignment:\n");
979
980 if (loop_vinfo)
981 loop = LOOP_VINFO_LOOP (loop_vinfo);
982
983 /* Initialize misalignment to unknown. */
984 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
985
986 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
987 return;
988
989 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
990 bool step_preserves_misalignment_p;
991
992 poly_uint64 vector_alignment
993 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
994 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
995
996 /* If the main loop has been peeled for alignment we have no way of knowing
997 whether the data accesses in the epilogues are aligned. We can't at
998 compile time answer the question whether we have entered the main loop or
999 not. Fixes PR 92351. */
1000 if (loop_vinfo)
1001 {
1002 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1003 if (orig_loop_vinfo
1004 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1005 return;
1006 }
1007
1008 unsigned HOST_WIDE_INT vect_align_c;
1009 if (!vector_alignment.is_constant (&vect_align_c))
1010 return;
1011
1012 /* No step for BB vectorization. */
1013 if (!loop)
1014 {
1015 gcc_assert (integer_zerop (drb->step));
1016 step_preserves_misalignment_p = true;
1017 }
1018
1019 /* In case the dataref is in an inner-loop of the loop that is being
1020 vectorized (LOOP), we use the base and misalignment information
1021 relative to the outer-loop (LOOP). This is ok only if the misalignment
1022 stays the same throughout the execution of the inner-loop, which is why
1023 we have to check that the stride of the dataref in the inner-loop is a
1024 multiple of the vector alignment. */
1025 else if (nested_in_vect_loop_p (loop, stmt_info))
1026 {
1027 step_preserves_misalignment_p
1028 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1029
1030 if (dump_enabled_p ())
1031 {
1032 if (step_preserves_misalignment_p)
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "inner step divides the vector alignment.\n");
1035 else
1036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1037 "inner step doesn't divide the vector"
1038 " alignment.\n");
1039 }
1040 }
1041
1042 /* Similarly we can only use base and misalignment information relative to
1043 an innermost loop if the misalignment stays the same throughout the
1044 execution of the loop. As above, this is the case if the stride of
1045 the dataref is a multiple of the alignment. */
1046 else
1047 {
1048 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1049 step_preserves_misalignment_p
1050 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1051
1052 if (!step_preserves_misalignment_p && dump_enabled_p ())
1053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1054 "step doesn't divide the vector alignment.\n");
1055 }
1056
1057 unsigned int base_alignment = drb->base_alignment;
1058 unsigned int base_misalignment = drb->base_misalignment;
1059
1060 /* Calculate the maximum of the pooled base address alignment and the
1061 alignment that we can compute for DR itself. */
1062 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1063 if (entry && base_alignment < (*entry)->base_alignment)
1064 {
1065 base_alignment = (*entry)->base_alignment;
1066 base_misalignment = (*entry)->base_misalignment;
1067 }
1068
1069 if (drb->offset_alignment < vect_align_c
1070 || !step_preserves_misalignment_p
1071 /* We need to know whether the step wrt the vectorized loop is
1072 negative when computing the starting misalignment below. */
1073 || TREE_CODE (drb->step) != INTEGER_CST)
1074 {
1075 if (dump_enabled_p ())
1076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1077 "Unknown alignment for access: %T\n", ref);
1078 return;
1079 }
1080
1081 if (base_alignment < vect_align_c)
1082 {
1083 unsigned int max_alignment;
1084 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1085 if (max_alignment < vect_align_c
1086 || !vect_can_force_dr_alignment_p (base,
1087 vect_align_c * BITS_PER_UNIT))
1088 {
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_NOTE, vect_location,
1091 "can't force alignment of ref: %T\n", ref);
1092 return;
1093 }
1094
1095 /* Force the alignment of the decl.
1096 NOTE: This is the only change to the code we make during
1097 the analysis phase, before deciding to vectorize the loop. */
1098 if (dump_enabled_p ())
1099 dump_printf_loc (MSG_NOTE, vect_location,
1100 "force alignment of %T\n", ref);
1101
1102 dr_info->base_decl = base;
1103 dr_info->base_misaligned = true;
1104 base_misalignment = 0;
1105 }
1106 poly_int64 misalignment
1107 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1108
1109 /* If this is a backward running DR then the first access in the larger
1110 vectype actually is N-1 elements before the address in the DR.
1111 Adjust misalign accordingly. */
1112 if (tree_int_cst_sgn (drb->step) < 0)
1113 /* PLUS because STEP is negative. */
1114 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1115 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1116
1117 unsigned int const_misalignment;
1118 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1119 {
1120 if (dump_enabled_p ())
1121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1122 "Non-constant misalignment for access: %T\n", ref);
1123 return;
1124 }
1125
1126 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1127
1128 if (dump_enabled_p ())
1129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1130 "misalign = %d bytes of ref %T\n",
1131 DR_MISALIGNMENT (dr_info), ref);
1132
1133 return;
1134 }
1135
1136 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1137 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1138 is made aligned via peeling. */
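/* For example, with 4-byte elements and a hypothetical 16-byte target
   alignment for both references, accesses a[i] and a[i + 4] differ only in
   DR_INIT and the difference (16 bytes) is a multiple of the target
   alignment, so peeling that aligns one of them also aligns the other. */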
1139
1140 static bool
1141 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1142 dr_vec_info *dr_peel_info)
1143 {
1144 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1145 DR_TARGET_ALIGNMENT (dr_info)))
1146 {
1147 poly_offset_int diff
1148 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1149 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1150 if (known_eq (diff, 0)
1151 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1152 return true;
1153 }
1154 return false;
1155 }
1156
1157 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1158 aligned via peeling. */
1159
1160 static bool
1161 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1162 dr_vec_info *dr_peel_info)
1163 {
1164 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1165 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1166 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1167 DR_OFFSET (dr_peel_info->dr), 0)
1168 || !operand_equal_p (DR_STEP (dr_info->dr),
1169 DR_STEP (dr_peel_info->dr), 0))
1170 return false;
1171
1172 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1173 }
1174
1175 /* Function vect_update_misalignment_for_peel.
1176 Sets DR_INFO's misalignment
1177 - to 0 if it has the same alignment as DR_PEEL_INFO,
1178 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1179 - to -1 (unknown) otherwise.
1180
1181 DR_INFO - the data reference whose misalignment is to be adjusted.
1182 DR_PEEL_INFO - the data reference whose misalignment is being made
1183 zero in the vector loop by the peel.
1184 NPEEL - the number of iterations in the peel loop if the misalignment
1185 of DR_PEEL_INFO is known at compile time. */
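/* For example, with a hypothetical 16-byte target alignment, a 4-byte step
   and NPEEL = 3, a data reference whose misalignment was 8 ends up with
   misalignment (8 + 3 * 4) & 15 = 4 after the peel. */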
1186
1187 static void
1188 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1189 dr_vec_info *dr_peel_info, int npeel)
1190 {
1191 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1192 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1193 {
1194 SET_DR_MISALIGNMENT (dr_info, 0);
1195 return;
1196 }
1197
1198 unsigned HOST_WIDE_INT alignment;
1199 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1200 && known_alignment_for_access_p (dr_info)
1201 && known_alignment_for_access_p (dr_peel_info))
1202 {
1203 int misal = DR_MISALIGNMENT (dr_info);
1204 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1205 misal &= alignment - 1;
1206 SET_DR_MISALIGNMENT (dr_info, misal);
1207 return;
1208 }
1209
1210 if (dump_enabled_p ())
1211 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1212 "to unknown (-1).\n");
1213 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1214 }
1215
1216 /* Return true if alignment is relevant for DR_INFO. */
1217
1218 static bool
1219 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1220 {
1221 stmt_vec_info stmt_info = dr_info->stmt;
1222
1223 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1224 return false;
1225
1226 /* For interleaving, only the alignment of the first access matters. */
1227 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1228 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1229 return false;
1230
1231 /* Scatter-gather and invariant accesses continue to address individual
1232 scalars, so vector-level alignment is irrelevant. */
1233 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1234 || integer_zerop (DR_STEP (dr_info->dr)))
1235 return false;
1236
1237 /* Strided accesses perform only component accesses; alignment is
1238 irrelevant for them. */
1239 if (STMT_VINFO_STRIDED_P (stmt_info)
1240 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1241 return false;
1242
1243 return true;
1244 }
1245
1246 /* Given a memory reference EXP, return whether its alignment is less
1247 than its size. */
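/* For example, a 4-byte integer member of a packed structure has TYPE_SIZE
   32 bits but may only guarantee 8 bits of alignment, so it is not
   size-aligned. */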
1248
1249 static bool
1250 not_size_aligned (tree exp)
1251 {
1252 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1253 return true;
1254
1255 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1256 > get_object_alignment (exp));
1257 }
1258
1259 /* Function vector_alignment_reachable_p
1260
1261 Return true if vector alignment for DR_INFO is reachable by peeling
1262 a few loop iterations. Return false otherwise. */
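/* For example, for a non-grouped access with 4-byte elements and a
   hypothetical 16-byte vector alignment, a known misalignment of 8 bytes is
   reachable by peeling two scalar iterations, whereas a misalignment of
   2 bytes is not, because it is not a multiple of the element size. */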
1263
1264 static bool
1265 vector_alignment_reachable_p (dr_vec_info *dr_info)
1266 {
1267 stmt_vec_info stmt_info = dr_info->stmt;
1268 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1269
1270 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1271 {
1272 /* For interleaved accesses we peel only if the number of iterations in
1273 the prolog loop ({VF - misalignment}) is a multiple of the number of
1274 interleaved accesses. */
1275 int elem_size, mis_in_elements;
1276
1277 /* FORNOW: handle only known alignment. */
1278 if (!known_alignment_for_access_p (dr_info))
1279 return false;
1280
1281 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1282 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1283 elem_size = vector_element_size (vector_size, nelements);
1284 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1285
1286 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1287 return false;
1288 }
1289
1290 /* If misalignment is known at compile time then allow peeling
1291 only if natural alignment is reachable through peeling. */
1292 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1293 {
1294 HOST_WIDE_INT elmsize =
1295 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1296 if (dump_enabled_p ())
1297 {
1298 dump_printf_loc (MSG_NOTE, vect_location,
1299 "data size = %wd. misalignment = %d.\n", elmsize,
1300 DR_MISALIGNMENT (dr_info));
1301 }
1302 if (DR_MISALIGNMENT (dr_info) % elmsize)
1303 {
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1306 "data size does not divide the misalignment.\n");
1307 return false;
1308 }
1309 }
1310
1311 if (!known_alignment_for_access_p (dr_info))
1312 {
1313 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1314 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1317 "Unknown misalignment, %snaturally aligned\n",
1318 is_packed ? "not " : "");
1319 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1320 }
1321
1322 return true;
1323 }
1324
1325
1326 /* Calculate the cost of the memory access represented by DR_INFO. */
1327
1328 static void
1329 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1330 unsigned int *inside_cost,
1331 unsigned int *outside_cost,
1332 stmt_vector_for_cost *body_cost_vec,
1333 stmt_vector_for_cost *prologue_cost_vec)
1334 {
1335 stmt_vec_info stmt_info = dr_info->stmt;
1336 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1337 int ncopies;
1338
1339 if (PURE_SLP_STMT (stmt_info))
1340 ncopies = 1;
1341 else
1342 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1343
1344 if (DR_IS_READ (dr_info->dr))
1345 vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1346 outside_cost, prologue_cost_vec, body_cost_vec, false);
1347 else
1348 vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost, body_cost_vec);
1349
1350 if (dump_enabled_p ())
1351 dump_printf_loc (MSG_NOTE, vect_location,
1352 "vect_get_data_access_cost: inside_cost = %d, "
1353 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1354 }
1355
1356
1357 typedef struct _vect_peel_info
1358 {
1359 dr_vec_info *dr_info;
1360 int npeel;
1361 unsigned int count;
1362 } *vect_peel_info;
1363
1364 typedef struct _vect_peel_extended_info
1365 {
1366 vec_info *vinfo;
1367 struct _vect_peel_info peel_info;
1368 unsigned int inside_cost;
1369 unsigned int outside_cost;
1370 } *vect_peel_extended_info;
1371
1372
1373 /* Peeling hashtable helpers. */
1374
1375 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1376 {
1377 static inline hashval_t hash (const _vect_peel_info *);
1378 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1379 };
1380
1381 inline hashval_t
1382 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1383 {
1384 return (hashval_t) peel_info->npeel;
1385 }
1386
1387 inline bool
1388 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1389 {
1390 return (a->npeel == b->npeel);
1391 }
1392
1393
1394 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1395
1396 static void
1397 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1398 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1399 int npeel)
1400 {
1401 struct _vect_peel_info elem, *slot;
1402 _vect_peel_info **new_slot;
1403 bool supportable_dr_alignment
1404 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1405
1406 elem.npeel = npeel;
1407 slot = peeling_htab->find (&elem);
1408 if (slot)
1409 slot->count++;
1410 else
1411 {
1412 slot = XNEW (struct _vect_peel_info);
1413 slot->npeel = npeel;
1414 slot->dr_info = dr_info;
1415 slot->count = 1;
1416 new_slot = peeling_htab->find_slot (slot, INSERT);
1417 *new_slot = slot;
1418 }
1419
1420 if (!supportable_dr_alignment
1421 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1422 slot->count += VECT_MAX_COST;
1423 }
1424
1425
1426 /* Traverse the peeling hash table to find the peeling option that aligns the
1427 maximum number of data accesses. */
1428
1429 int
1430 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1431 _vect_peel_extended_info *max)
1432 {
1433 vect_peel_info elem = *slot;
1434
1435 if (elem->count > max->peel_info.count
1436 || (elem->count == max->peel_info.count
1437 && max->peel_info.npeel > elem->npeel))
1438 {
1439 max->peel_info.npeel = elem->npeel;
1440 max->peel_info.count = elem->count;
1441 max->peel_info.dr_info = elem->dr_info;
1442 }
1443
1444 return 1;
1445 }
1446
1447 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1448 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1449 we assume DR0_INFO's misalignment will be zero after peeling. */
1450
1451 static void
1452 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1453 dr_vec_info *dr0_info,
1454 unsigned int *inside_cost,
1455 unsigned int *outside_cost,
1456 stmt_vector_for_cost *body_cost_vec,
1457 stmt_vector_for_cost *prologue_cost_vec,
1458 unsigned int npeel,
1459 bool unknown_misalignment)
1460 {
1461 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1462 unsigned i;
1463 data_reference *dr;
1464
1465 FOR_EACH_VEC_ELT (datarefs, i, dr)
1466 {
1467 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1468 if (!vect_relevant_for_alignment_p (dr_info))
1469 continue;
1470
1471 int save_misalignment;
1472 save_misalignment = DR_MISALIGNMENT (dr_info);
1473 if (npeel == 0)
1474 ;
1475 else if (unknown_misalignment && dr_info == dr0_info)
1476 SET_DR_MISALIGNMENT (dr_info, 0);
1477 else
1478 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1479 vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1480 body_cost_vec, prologue_cost_vec);
1481 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1482 }
1483 }
1484
1485 /* Traverse peeling hash table and calculate cost for each peeling option.
1486 Find the one with the lowest cost. */
1487
1488 int
1489 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1490 _vect_peel_extended_info *min)
1491 {
1492 vect_peel_info elem = *slot;
1493 int dummy;
1494 unsigned int inside_cost = 0, outside_cost = 0;
1495 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1496 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1497 epilogue_cost_vec;
1498
1499 prologue_cost_vec.create (2);
1500 body_cost_vec.create (2);
1501 epilogue_cost_vec.create (2);
1502
1503 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1504 &outside_cost, &body_cost_vec,
1505 &prologue_cost_vec, elem->npeel, false);
1506
1507 body_cost_vec.release ();
1508
1509 outside_cost += vect_get_known_peeling_cost
1510 (loop_vinfo, elem->npeel, &dummy,
1511 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1512 &prologue_cost_vec, &epilogue_cost_vec);
1513
1514 /* Prologue and epilogue costs are added to the target model later.
1515 These costs depend only on the scalar iteration cost, the
1516 number of peeling iterations finally chosen, and the number of
1517 misaligned statements. So discard the information found here. */
1518 prologue_cost_vec.release ();
1519 epilogue_cost_vec.release ();
1520
1521 if (inside_cost < min->inside_cost
1522 || (inside_cost == min->inside_cost
1523 && outside_cost < min->outside_cost))
1524 {
1525 min->inside_cost = inside_cost;
1526 min->outside_cost = outside_cost;
1527 min->peel_info.dr_info = elem->dr_info;
1528 min->peel_info.npeel = elem->npeel;
1529 min->peel_info.count = elem->count;
1530 }
1531
1532 return 1;
1533 }
1534
1535
1536 /* Choose best peeling option by traversing peeling hash table and either
1537 choosing an option with the lowest cost (if cost model is enabled) or the
1538 option that aligns as many accesses as possible. */
1539
1540 static struct _vect_peel_extended_info
1541 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1542 loop_vec_info loop_vinfo)
1543 {
1544 struct _vect_peel_extended_info res;
1545
1546 res.peel_info.dr_info = NULL;
1547 res.vinfo = loop_vinfo;
1548
1549 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1550 {
1551 res.inside_cost = INT_MAX;
1552 res.outside_cost = INT_MAX;
1553 peeling_htab->traverse <_vect_peel_extended_info *,
1554 vect_peeling_hash_get_lowest_cost> (&res);
1555 }
1556 else
1557 {
1558 res.peel_info.count = 0;
1559 peeling_htab->traverse <_vect_peel_extended_info *,
1560 vect_peeling_hash_get_most_frequent> (&res);
1561 res.inside_cost = 0;
1562 res.outside_cost = 0;
1563 }
1564
1565 return res;
1566 }
1567
1568 /* Return true if peeling by NPEEL iterations (to align DR0_INFO) is supported. */
1569
1570 static bool
1571 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1572 unsigned npeel)
1573 {
1574 unsigned i;
1575 struct data_reference *dr = NULL;
1576 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1577 enum dr_alignment_support supportable_dr_alignment;
1578
1579 /* Ensure that all data refs can be vectorized after the peel. */
1580 FOR_EACH_VEC_ELT (datarefs, i, dr)
1581 {
1582 int save_misalignment;
1583
1584 if (dr == dr0_info->dr)
1585 continue;
1586
1587 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1588 if (!vect_relevant_for_alignment_p (dr_info))
1589 continue;
1590
1591 save_misalignment = DR_MISALIGNMENT (dr_info);
1592 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1593 supportable_dr_alignment
1594 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
1595 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1596
1597 if (!supportable_dr_alignment)
1598 return false;
1599 }
1600
1601 return true;
1602 }
1603
1604 /* Compare two data-references DRA and DRB to group them into chunks
1605 with related alignment. */
1606
1607 static int
1608 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1609 {
1610 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1611 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1612 int cmp;
1613
1614 /* Stabilize sort. */
1615 if (dra == drb)
1616 return 0;
1617
1618 /* Ordering of DRs according to base. */
1619 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1620 DR_BASE_ADDRESS (drb));
1621 if (cmp != 0)
1622 return cmp;
1623
1624 /* And according to DR_OFFSET. */
1625 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1626 if (cmp != 0)
1627 return cmp;
1628
1629 /* And after step. */
1630 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1631 if (cmp != 0)
1632 return cmp;
1633
1634 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
1635 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1636 if (cmp == 0)
1637 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1638 return cmp;
1639 }
1640
1641 /* Function vect_enhance_data_refs_alignment
1642
1643 This pass will use loop versioning and loop peeling in order to enhance
1644 the alignment of data references in the loop.
1645
1646 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1647 original loop is to be vectorized. Any other loops that are created by
1648 the transformations performed in this pass are not supposed to be
1649 vectorized. This restriction will be relaxed.
1650
1651 This pass will require a cost model to guide it whether to apply peeling
1652 or versioning or a combination of the two. For example, the scheme that
1653 Intel uses when given a loop with several memory accesses, is as follows:
1654 choose one memory access ('p') whose alignment you want to force by doing
1655 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1656 other accesses are not necessarily aligned, or (2) use loop versioning to
1657 generate one loop in which all accesses are aligned, and another loop in
1658 which only 'p' is necessarily aligned.
1659
1660 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1661 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1662 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1663
1664 Devising a cost model is the most critical aspect of this work. It will
1665 guide us on which access to peel for, whether to use loop versioning, how
1666 many versions to create, etc. The cost model will probably consist of
1667 generic considerations as well as target specific considerations (on
1668 powerpc for example, misaligned stores are more painful than misaligned
1669 loads).
1670
1671 Here are the general steps involved in alignment enhancements:
1672
1673 -- original loop, before alignment analysis:
1674 for (i=0; i<N; i++){
1675 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1676 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1677 }
1678
1679 -- After vect_compute_data_refs_alignment:
1680 for (i=0; i<N; i++){
1681 x = q[i]; # DR_MISALIGNMENT(q) = 3
1682 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1683 }
1684
1685 -- Possibility 1: we do loop versioning:
1686 if (p is aligned) {
1687 for (i=0; i<N; i++){ # loop 1A
1688 x = q[i]; # DR_MISALIGNMENT(q) = 3
1689 p[i] = y; # DR_MISALIGNMENT(p) = 0
1690 }
1691 }
1692 else {
1693 for (i=0; i<N; i++){ # loop 1B
1694 x = q[i]; # DR_MISALIGNMENT(q) = 3
1695 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1696 }
1697 }
1698
1699 -- Possibility 2: we do loop peeling:
1700 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1701 x = q[i];
1702 p[i] = y;
1703 }
1704 for (i = 3; i < N; i++){ # loop 2A
1705 x = q[i]; # DR_MISALIGNMENT(q) = 0
1706 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1707 }
1708
1709 -- Possibility 3: combination of loop peeling and versioning:
1710 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1711 x = q[i];
1712 p[i] = y;
1713 }
1714 if (p is aligned) {
1715 for (i = 3; i<N; i++){ # loop 3A
1716 x = q[i]; # DR_MISALIGNMENT(q) = 0
1717 p[i] = y; # DR_MISALIGNMENT(p) = 0
1718 }
1719 }
1720 else {
1721 for (i = 3; i<N; i++){ # loop 3B
1722 x = q[i]; # DR_MISALIGNMENT(q) = 0
1723 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1724 }
1725 }
1726
1727 These loops are later passed to loop_transform to be vectorized. The
1728 vectorizer will use the alignment information to guide the transformation
1729 (whether to generate regular loads/stores, or with special handling for
1730 misalignment). */
1731
1732 opt_result
1733 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1734 {
1735 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1736 enum dr_alignment_support supportable_dr_alignment;
1737 dr_vec_info *first_store = NULL;
1738 dr_vec_info *dr0_info = NULL;
1739 struct data_reference *dr;
1740 unsigned int i;
1741 bool do_peeling = false;
1742 bool do_versioning = false;
1743 unsigned int npeel = 0;
1744 bool one_misalignment_known = false;
1745 bool one_misalignment_unknown = false;
1746 bool one_dr_unsupportable = false;
1747 dr_vec_info *unsupportable_dr_info = NULL;
1748 unsigned int mis, dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1749 hash_table<peel_info_hasher> peeling_htab (1);
1750
1751 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1752
1753 /* Reset data so we can safely be called multiple times. */
1754 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1755 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1756
1757 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1758 return opt_result::success ();
1759
1760 /* Sort the vector of datarefs so DRs that have the same or dependent
1761 alignment are next to each other. */
1762 auto_vec<data_reference_p> datarefs
1763 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1764 datarefs.qsort (dr_align_group_sort_cmp);
1765
1766   /* For each dataref, compute the number of other DRs that become
1767      aligned when we peel to align that dataref.  */
1768 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1769 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1770 unsigned i0;
1771 for (i0 = 0; i0 < datarefs.length (); ++i0)
1772 if (DR_BASE_ADDRESS (datarefs[i0]))
1773 break;
1774 for (i = i0 + 1; i <= datarefs.length (); ++i)
1775 {
1776 if (i == datarefs.length ()
1777 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1778 DR_BASE_ADDRESS (datarefs[i]), 0)
1779 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1780 DR_OFFSET (datarefs[i]), 0)
1781 || !operand_equal_p (DR_STEP (datarefs[i0]),
1782 DR_STEP (datarefs[i]), 0))
1783 {
1784 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1785 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1786 will get known misalignment if we align one of the refs
1787 with the largest DR_TARGET_ALIGNMENT. */
1788 for (unsigned j = i0; j < i; ++j)
1789 {
1790 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1791 for (unsigned k = i0; k < i; ++k)
1792 {
1793 if (k == j)
1794 continue;
1795 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1796 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1797 dr_infoj))
1798 n_same_align_refs[j]++;
1799 }
1800 }
1801 i0 = i;
1802 }
1803 }
1804
1805 /* While cost model enhancements are expected in the future, the high level
1806 view of the code at this time is as follows:
1807
1808 A) If there is a misaligned access then see if peeling to align
1809 this access can make all data references satisfy
1810 vect_supportable_dr_alignment. If so, update data structures
1811 as needed and return true.
1812
1813 B) If peeling wasn't possible and there is a data reference with an
1814 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1815 then see if loop versioning checks can be used to make all data
1816 references satisfy vect_supportable_dr_alignment. If so, update
1817 data structures as needed and return true.
1818
1819 C) If neither peeling nor versioning were successful then return false if
1820 any data reference does not satisfy vect_supportable_dr_alignment.
1821
1822 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1823
1824 Note, Possibility 3 above (which is peeling and versioning together) is not
1825 being done at this time. */
1826
1827 /* (1) Peeling to force alignment. */
1828
1829 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1830 Considerations:
1831 + How many accesses will become aligned due to the peeling
1832 - How many accesses will become unaligned due to the peeling,
1833 and the cost of misaligned accesses.
1834 - The cost of peeling (the extra runtime checks, the increase
1835 in code size). */
1836
1837 FOR_EACH_VEC_ELT (datarefs, i, dr)
1838 {
1839 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1840 if (!vect_relevant_for_alignment_p (dr_info))
1841 continue;
1842
1843 stmt_vec_info stmt_info = dr_info->stmt;
1844 supportable_dr_alignment
1845 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1846 do_peeling = vector_alignment_reachable_p (dr_info);
1847 if (do_peeling)
1848 {
1849 if (known_alignment_for_access_p (dr_info))
1850 {
1851 unsigned int npeel_tmp = 0;
1852 bool negative = tree_int_cst_compare (DR_STEP (dr),
1853 size_zero_node) < 0;
1854
1855               /* If known_alignment_for_access_p then we have set
1856                  DR_MISALIGNMENT, which is only done if we know it at compile
1857                  time, so it is safe to assume the target alignment is
1858                  constant.  */
1859 unsigned int target_align =
1860 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1861 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1862 mis = (negative
1863 ? DR_MISALIGNMENT (dr_info)
1864 : -DR_MISALIGNMENT (dr_info));
1865 if (DR_MISALIGNMENT (dr_info) != 0)
1866 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1867
1868              /* For multiple types, it is possible that the bigger type access
1869                 will have more than one peeling option.  E.g., a loop with two
1870                 types: one of size (vector size / 4), and the other one of
1871                 size (vector size / 8).  The vectorization factor will be 8.
1872                 If both accesses are misaligned by 3, the first one needs one
1873                 scalar iteration to be aligned, and the second one needs 5.
1874                 But the first one will also be aligned by peeling 5 scalar
1875                 iterations, and in that case both accesses will be aligned.
1876                 Hence, besides the immediate peeling amount, we also want to
1877                 try adding the full vector size, as long as we do not exceed
1878                 the vectorization factor.
1879                 We do this automatically for the cost model, since we
1880                 calculate the cost for every peeling option.  */
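              /* A worked example: with a target alignment of 16 bytes,
                 4-byte elements and DR_MISALIGNMENT of 4, a positive-step
                 access needs (-4 mod 16) / 4 = 3 peels and a negative-step
                 access (4 mod 16) / 4 = 1 peel; the loop below then records
                 further candidates in steps of 16 / 4 = 4 elements, up to
                 NSCALARS.  */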
1881 poly_uint64 nscalars = npeel_tmp;
1882 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1883 {
1884 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1885 nscalars = (STMT_SLP_TYPE (stmt_info)
1886 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1887 }
1888
1889 /* Save info about DR in the hash table. Also include peeling
1890 amounts according to the explanation above. */
1891 while (known_le (npeel_tmp, nscalars))
1892 {
1893 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1894 dr_info, npeel_tmp);
1895 npeel_tmp += MAX (1, target_align / dr_size);
1896 }
1897
1898 one_misalignment_known = true;
1899 }
1900 else
1901 {
1902              /* If we don't know any misalignment values, we prefer
1903                 peeling for the data-ref that has the maximum number of
1904                 data-refs with the same alignment, unless the target prefers
1905                 to align stores over loads.  */
1906 unsigned same_align_drs = n_same_align_refs[i];
1907 if (!dr0_info
1908 || dr0_same_align_drs < same_align_drs)
1909 {
1910 dr0_same_align_drs = same_align_drs;
1911 dr0_info = dr_info;
1912 }
1913 /* For data-refs with the same number of related
1914 accesses prefer the one where the misalign
1915 computation will be invariant in the outermost loop. */
1916 else if (dr0_same_align_drs == same_align_drs)
1917 {
1918 class loop *ivloop0, *ivloop;
1919 ivloop0 = outermost_invariant_loop_for_expr
1920 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1921 ivloop = outermost_invariant_loop_for_expr
1922 (loop, DR_BASE_ADDRESS (dr));
1923 if ((ivloop && !ivloop0)
1924 || (ivloop && ivloop0
1925 && flow_loop_nested_p (ivloop, ivloop0)))
1926 dr0_info = dr_info;
1927 }
1928
1929 one_misalignment_unknown = true;
1930
1931 /* Check for data refs with unsupportable alignment that
1932 can be peeled. */
1933 if (!supportable_dr_alignment)
1934 {
1935 one_dr_unsupportable = true;
1936 unsupportable_dr_info = dr_info;
1937 }
1938
1939 if (!first_store && DR_IS_WRITE (dr))
1940 {
1941 first_store = dr_info;
1942 first_store_same_align_drs = same_align_drs;
1943 }
1944 }
1945 }
1946 else
1947 {
1948 if (!aligned_access_p (dr_info))
1949 {
1950 if (dump_enabled_p ())
1951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1952 "vector alignment may not be reachable\n");
1953 break;
1954 }
1955 }
1956 }
1957
1958 /* Check if we can possibly peel the loop. */
1959 if (!vect_can_advance_ivs_p (loop_vinfo)
1960 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1961 || loop->inner)
1962 do_peeling = false;
1963
1964 struct _vect_peel_extended_info peel_for_known_alignment;
1965 struct _vect_peel_extended_info peel_for_unknown_alignment;
1966 struct _vect_peel_extended_info best_peel;
1967
1968 peel_for_unknown_alignment.inside_cost = INT_MAX;
1969 peel_for_unknown_alignment.outside_cost = INT_MAX;
1970 peel_for_unknown_alignment.peel_info.count = 0;
1971
1972 if (do_peeling
1973 && one_misalignment_unknown)
1974 {
1975       /* Check whether the target prefers to align stores over loads, i.e., whether
1976          misaligned stores are more expensive than misaligned loads (taking
1977          DRs with the same alignment into account).  */
1978 unsigned int load_inside_cost = 0;
1979 unsigned int load_outside_cost = 0;
1980 unsigned int store_inside_cost = 0;
1981 unsigned int store_outside_cost = 0;
1982 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1983
1984 stmt_vector_for_cost dummy;
1985 dummy.create (2);
1986 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1987 &load_inside_cost,
1988 &load_outside_cost,
1989 &dummy, &dummy, estimated_npeels, true);
1990 dummy.release ();
1991
1992 if (first_store)
1993 {
1994 dummy.create (2);
1995 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1996 &store_inside_cost,
1997 &store_outside_cost,
1998 &dummy, &dummy,
1999 estimated_npeels, true);
2000 dummy.release ();
2001 }
2002 else
2003 {
2004 store_inside_cost = INT_MAX;
2005 store_outside_cost = INT_MAX;
2006 }
2007
2008 if (load_inside_cost > store_inside_cost
2009 || (load_inside_cost == store_inside_cost
2010 && load_outside_cost > store_outside_cost))
2011 {
2012 dr0_info = first_store;
2013 dr0_same_align_drs = first_store_same_align_drs;
2014 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2015 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2016 }
2017 else
2018 {
2019 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2020 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2021 }
2022
2023 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2024 prologue_cost_vec.create (2);
2025 epilogue_cost_vec.create (2);
2026
2027 int dummy2;
2028 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2029 (loop_vinfo, estimated_npeels, &dummy2,
2030 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2031 &prologue_cost_vec, &epilogue_cost_vec);
2032
2033 prologue_cost_vec.release ();
2034 epilogue_cost_vec.release ();
2035
2036 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2037 }
2038
2039 peel_for_unknown_alignment.peel_info.npeel = 0;
2040 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2041
2042 best_peel = peel_for_unknown_alignment;
2043
2044 peel_for_known_alignment.inside_cost = INT_MAX;
2045 peel_for_known_alignment.outside_cost = INT_MAX;
2046 peel_for_known_alignment.peel_info.count = 0;
2047 peel_for_known_alignment.peel_info.dr_info = NULL;
2048
2049 if (do_peeling && one_misalignment_known)
2050 {
2051       /* Peeling is possible, but no data access requires alignment in order
2052          to be supported.  So we try to choose the best possible peeling from
2053          the hash table.  */
2054 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2055 (&peeling_htab, loop_vinfo);
2056 }
2057
2058 /* Compare costs of peeling for known and unknown alignment. */
2059 if (peel_for_known_alignment.peel_info.dr_info != NULL
2060 && peel_for_unknown_alignment.inside_cost
2061 >= peel_for_known_alignment.inside_cost)
2062 {
2063 best_peel = peel_for_known_alignment;
2064
2065 /* If the best peeling for known alignment has NPEEL == 0, perform no
2066 peeling at all except if there is an unsupportable dr that we can
2067 align. */
2068 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2069 do_peeling = false;
2070 }
2071
2072 /* If there is an unsupportable data ref, prefer this over all choices so far
2073 since we'd have to discard a chosen peeling except when it accidentally
2074 aligned the unsupportable data ref. */
2075 if (one_dr_unsupportable)
2076 dr0_info = unsupportable_dr_info;
2077 else if (do_peeling)
2078 {
2079 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2080 TODO: Use nopeel_outside_cost or get rid of it? */
2081 unsigned nopeel_inside_cost = 0;
2082 unsigned nopeel_outside_cost = 0;
2083
2084 stmt_vector_for_cost dummy;
2085 dummy.create (2);
2086 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2087 &nopeel_outside_cost, &dummy, &dummy,
2088 0, false);
2089 dummy.release ();
2090
2091 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2092 costs will be recorded. */
2093 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2094 prologue_cost_vec.create (2);
2095 epilogue_cost_vec.create (2);
2096
2097 int dummy2;
2098 nopeel_outside_cost += vect_get_known_peeling_cost
2099 (loop_vinfo, 0, &dummy2,
2100 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2101 &prologue_cost_vec, &epilogue_cost_vec);
2102
2103 prologue_cost_vec.release ();
2104 epilogue_cost_vec.release ();
2105
2106 npeel = best_peel.peel_info.npeel;
2107 dr0_info = best_peel.peel_info.dr_info;
2108
2109       /* If not peeling is no more expensive than the best peeling we
2110          have found so far, don't perform any peeling.  */
2111 if (nopeel_inside_cost <= best_peel.inside_cost)
2112 do_peeling = false;
2113 }
2114
2115 if (do_peeling)
2116 {
2117 stmt_vec_info stmt_info = dr0_info->stmt;
2118 if (known_alignment_for_access_p (dr0_info))
2119 {
2120 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2121 size_zero_node) < 0;
2122 if (!npeel)
2123 {
2124 /* Since it's known at compile time, compute the number of
2125 iterations in the peeled loop (the peeling factor) for use in
2126 updating DR_MISALIGNMENT values. The peeling factor is the
2127 vectorization factor minus the misalignment as an element
2128 count. */
2129 mis = (negative
2130 ? DR_MISALIGNMENT (dr0_info)
2131 : -DR_MISALIGNMENT (dr0_info));
2132               /* If known_alignment_for_access_p then we have set
2133                  DR_MISALIGNMENT, which is only done if we know it at compile
2134                  time, so it is safe to assume the target alignment is
2135                  constant.  */
2136 unsigned int target_align =
2137 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2138 npeel = ((mis & (target_align - 1))
2139 / vect_get_scalar_dr_size (dr0_info));
2140 }
2141
2142 /* For interleaved data access every iteration accesses all the
2143 members of the group, therefore we divide the number of iterations
2144 by the group size. */
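          /* E.g., a peel of 8 elements over a group of 4 interleaved
             accesses corresponds to peeling 2 scalar loop iterations.  */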
2145 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2146 npeel /= DR_GROUP_SIZE (stmt_info);
2147
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_NOTE, vect_location,
2150 "Try peeling by %d\n", npeel);
2151 }
2152
2153 /* Ensure that all datarefs can be vectorized after the peel. */
2154 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2155 do_peeling = false;
2156
2157       /* If the known misalignment requires no peeling and all datarefs remain supportable as-is, we are done.  */
2158 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2159 return opt_result::success ();
2160
2161 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2162 if (do_peeling)
2163 {
2164 unsigned max_allowed_peel
2165 = param_vect_max_peeling_for_alignment;
2166 if (flag_vect_cost_model <= VECT_COST_MODEL_CHEAP)
2167 max_allowed_peel = 0;
2168 if (max_allowed_peel != (unsigned)-1)
2169 {
2170 unsigned max_peel = npeel;
2171 if (max_peel == 0)
2172 {
2173 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2174 unsigned HOST_WIDE_INT target_align_c;
2175 if (target_align.is_constant (&target_align_c))
2176 max_peel =
2177 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2178 else
2179 {
2180 do_peeling = false;
2181 if (dump_enabled_p ())
2182 dump_printf_loc (MSG_NOTE, vect_location,
2183 "Disable peeling, max peels set and vector"
2184 " alignment unknown\n");
2185 }
2186 }
2187 if (max_peel > max_allowed_peel)
2188 {
2189 do_peeling = false;
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_NOTE, vect_location,
2192 "Disable peeling, max peels reached: %d\n", max_peel);
2193 }
2194 }
2195 }
2196
2197 /* Cost model #2 - if peeling may result in a remaining loop not
2198 iterating enough to be vectorized then do not peel. Since this
2199 is a cost heuristic rather than a correctness decision, use the
2200 most likely runtime value for variable vectorization factors. */
2201 if (do_peeling
2202 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2203 {
2204 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2205 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2206 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2207 < assumed_vf + max_peel)
2208 do_peeling = false;
2209 }
2210
2211 if (do_peeling)
2212 {
2213 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2214 If the misalignment of DR_i is identical to that of dr0 then set
2215 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2216 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2217 by the peeling factor times the element size of DR_i (MOD the
2218 vectorization factor times the size). Otherwise, the
2219 misalignment of DR_i must be set to unknown. */
2220 FOR_EACH_VEC_ELT (datarefs, i, dr)
2221 if (dr != dr0_info->dr)
2222 {
2223 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2224 if (!vect_relevant_for_alignment_p (dr_info))
2225 continue;
2226
2227 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2228 }
2229
2230 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2231 if (npeel)
2232 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2233 else
2234 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2235 = DR_MISALIGNMENT (dr0_info);
2236 SET_DR_MISALIGNMENT (dr0_info, 0);
2237 if (dump_enabled_p ())
2238 {
2239 dump_printf_loc (MSG_NOTE, vect_location,
2240 "Alignment of access forced using peeling.\n");
2241 dump_printf_loc (MSG_NOTE, vect_location,
2242 "Peeling for alignment will be applied.\n");
2243 }
2244
2245 /* The inside-loop cost will be accounted for in vectorizable_load
2246 and vectorizable_store correctly with adjusted alignments.
2247 Drop the body_cst_vec on the floor here. */
2248 return opt_result::success ();
2249 }
2250 }
2251
2252 /* (2) Versioning to force alignment. */
2253
2254 /* Try versioning if:
2255      1) the loop is optimized for speed and the cost model is not cheap, and
2256 2) there is at least one unsupported misaligned data ref with an unknown
2257 misalignment, and
2258 3) all misaligned data refs with a known misalignment are supported, and
2259 4) the number of runtime alignment checks is within reason. */
2260
2261 do_versioning
2262 = (optimize_loop_nest_for_speed_p (loop)
2263 && !loop->inner /* FORNOW */
2264 && flag_vect_cost_model > VECT_COST_MODEL_CHEAP);
2265
2266 if (do_versioning)
2267 {
2268 FOR_EACH_VEC_ELT (datarefs, i, dr)
2269 {
2270 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2271 if (aligned_access_p (dr_info)
2272 || !vect_relevant_for_alignment_p (dr_info))
2273 continue;
2274
2275 stmt_vec_info stmt_info = dr_info->stmt;
2276 if (STMT_VINFO_STRIDED_P (stmt_info))
2277 {
2278 do_versioning = false;
2279 break;
2280 }
2281
2282 supportable_dr_alignment
2283 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
2284
2285 if (!supportable_dr_alignment)
2286 {
2287 int mask;
2288 tree vectype;
2289
2290 if (known_alignment_for_access_p (dr_info)
2291 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2292 >= (unsigned) param_vect_max_version_for_alignment_checks)
2293 {
2294 do_versioning = false;
2295 break;
2296 }
2297
2298 vectype = STMT_VINFO_VECTYPE (stmt_info);
2299 gcc_assert (vectype);
2300
2301 /* At present we don't support versioning for alignment
2302 with variable VF, since there's no guarantee that the
2303 VF is a power of two. We could relax this if we added
2304 a way of enforcing a power-of-two size. */
2305 unsigned HOST_WIDE_INT size;
2306 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2307 {
2308 do_versioning = false;
2309 break;
2310 }
2311
2312 /* Forcing alignment in the first iteration is no good if
2313 we don't keep it across iterations. For now, just disable
2314 versioning in this case.
2315 ?? We could actually unroll the loop to achieve the required
2316 overall step alignment, and forcing the alignment could be
2317 done by doing some iterations of the non-vectorized loop. */
2318 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2319 * DR_STEP_ALIGNMENT (dr),
2320 DR_TARGET_ALIGNMENT (dr_info)))
2321 {
2322 do_versioning = false;
2323 break;
2324 }
2325
2326 /* The rightmost bits of an aligned address must be zeros.
2327 Construct the mask needed for this test. For example,
2328 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2329 mask must be 15 = 0xf. */
2330 mask = size - 1;
2331
2332 /* FORNOW: use the same mask to test all potentially unaligned
2333 references in the loop. */
2334 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2335 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2336 {
2337 do_versioning = false;
2338 break;
2339 }
2340
2341 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2342 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2343 }
2344 }
2345
2346 /* Versioning requires at least one misaligned data reference. */
2347 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2348 do_versioning = false;
2349 else if (!do_versioning)
2350 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2351 }
2352
2353 if (do_versioning)
2354 {
2355 vec<stmt_vec_info> may_misalign_stmts
2356 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2357 stmt_vec_info stmt_info;
2358
2359 /* It can now be assumed that the data references in the statements
2360 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2361 of the loop being vectorized. */
2362 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2363 {
2364 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2365 SET_DR_MISALIGNMENT (dr_info, 0);
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_NOTE, vect_location,
2368 "Alignment of access forced using versioning.\n");
2369 }
2370
2371 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_NOTE, vect_location,
2373 "Versioning for alignment will be applied.\n");
2374
2375 /* Peeling and versioning can't be done together at this time. */
2376 gcc_assert (! (do_peeling && do_versioning));
2377
2378 return opt_result::success ();
2379 }
2380
2381 /* This point is reached if neither peeling nor versioning is being done. */
2382 gcc_assert (! (do_peeling || do_versioning));
2383
2384 return opt_result::success ();
2385 }
2386
2387
2388 /* Function vect_analyze_data_refs_alignment
2389
2390 Analyze the alignment of the data-references in the loop.
2391 Return FALSE if a data reference is found that cannot be vectorized. */
2392
2393 opt_result
2394 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2395 {
2396 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2397
2398 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2399 struct data_reference *dr;
2400 unsigned int i;
2401
2402 vect_record_base_alignments (loop_vinfo);
2403 FOR_EACH_VEC_ELT (datarefs, i, dr)
2404 {
2405 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2406 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2407 vect_compute_data_ref_alignment (loop_vinfo, dr_info);
2408 }
2409
2410 return opt_result::success ();
2411 }
2412
2413
2414 /* Analyze alignment of DRs of stmts in NODE. */
2415
2416 static bool
2417 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2418 {
2419   /* We vectorize from the first scalar stmt in the node unless
2420      the node is permuted, in which case we start from the first
2421      element in the group.  */
2422 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2423 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2424 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2425 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2426
2427 /* We need to commit to a vector type for the group now. */
2428 if (is_a <bb_vec_info> (vinfo)
2429 && !vect_update_shared_vectype (first_stmt_info, SLP_TREE_VECTYPE (node)))
2430 {
2431 if (dump_enabled_p ())
2432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2433 "desired vector type conflicts with earlier one "
2434 "for %G", first_stmt_info->stmt);
2435 return false;
2436 }
2437
2438 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2439 vect_compute_data_ref_alignment (vinfo, dr_info);
2440 /* In several places we need alignment of the first element anyway. */
2441 if (dr_info != first_dr_info)
2442 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2443
2444 /* For creating the data-ref pointer we need alignment of the
2445 first element as well. */
2446 first_stmt_info
2447 = vect_stmt_to_vectorize (vect_find_first_scalar_stmt_in_slp (node));
2448 if (first_stmt_info != SLP_TREE_SCALAR_STMTS (node)[0])
2449 {
2450 first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2451 if (dr_info != first_dr_info)
2452 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2453 }
2454
2455 return true;
2456 }
2457
2458 /* Function vect_slp_analyze_instance_alignment
2459
2460 Analyze the alignment of the data-references in the SLP instance.
2461 Return FALSE if a data reference is found that cannot be vectorized. */
2462
2463 bool
2464 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2465 slp_instance instance)
2466 {
2467 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2468
2469 slp_tree node;
2470 unsigned i;
2471 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2472 if (! vect_slp_analyze_node_alignment (vinfo, node))
2473 return false;
2474
2475 node = SLP_INSTANCE_TREE (instance);
2476 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
2477 && ! vect_slp_analyze_node_alignment
2478 (vinfo, SLP_INSTANCE_TREE (instance)))
2479 return false;
2480
2481 return true;
2482 }
2483
2484
2485 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2486 accesses of legal size, step, etc. Detect gaps, single element
2487 interleaving, and other special cases. Set grouped access info.
2488 Collect groups of strided stores for further use in SLP analysis.
2489 Worker for vect_analyze_group_access. */
2490
2491 static bool
2492 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2493 {
2494 data_reference *dr = dr_info->dr;
2495 tree step = DR_STEP (dr);
2496 tree scalar_type = TREE_TYPE (DR_REF (dr));
2497 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2498 stmt_vec_info stmt_info = dr_info->stmt;
2499 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2500 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2501 HOST_WIDE_INT dr_step = -1;
2502 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2503 bool slp_impossible = false;
2504
2505 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2506 size of the interleaving group (including gaps). */
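  /* E.g., a constant step of 32 bytes over 4-byte elements corresponds to
     a group size of 32 / 4 = 8 elements, gaps included.  */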
2507 if (tree_fits_shwi_p (step))
2508 {
2509 dr_step = tree_to_shwi (step);
2510 /* Check that STEP is a multiple of type size. Otherwise there is
2511 a non-element-sized gap at the end of the group which we
2512 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2513 ??? As we can handle non-constant step fine here we should
2514 simply remove uses of DR_GROUP_GAP between the last and first
2515 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2516 simply not include that gap. */
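      /* E.g., a step of 10 bytes over 4-byte elements would leave a
         2-byte tail gap that cannot be expressed in whole elements.  */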
2517 if ((dr_step % type_size) != 0)
2518 {
2519 if (dump_enabled_p ())
2520 dump_printf_loc (MSG_NOTE, vect_location,
2521 "Step %T is not a multiple of the element size"
2522 " for %T\n",
2523 step, DR_REF (dr));
2524 return false;
2525 }
2526 groupsize = absu_hwi (dr_step) / type_size;
2527 }
2528 else
2529 groupsize = 0;
2530
2531   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2532 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2533 {
2534       /* Check if this DR is part of an interleaving group, and is a single
2535          element of the group that is accessed in the loop.  */
2536
2537 /* Gaps are supported only for loads. STEP must be a multiple of the type
2538 size. */
2539 if (DR_IS_READ (dr)
2540 && (dr_step % type_size) == 0
2541 && groupsize > 0
2542 /* This could be UINT_MAX but as we are generating code in a very
2543 inefficient way we have to cap earlier.
2544 See PR91403 for example. */
2545 && groupsize <= 4096)
2546 {
2547 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2548 DR_GROUP_SIZE (stmt_info) = groupsize;
2549 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_NOTE, vect_location,
2552 "Detected single element interleaving %T"
2553 " step %T\n",
2554 DR_REF (dr), step);
2555
2556 return true;
2557 }
2558
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2561 "not consecutive access %G", stmt_info->stmt);
2562
2563 if (bb_vinfo)
2564 {
2565 /* Mark the statement as unvectorizable. */
2566 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2567 return true;
2568 }
2569
2570 if (dump_enabled_p ())
2571 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2572 STMT_VINFO_STRIDED_P (stmt_info) = true;
2573 return true;
2574 }
2575
2576 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2577 {
2578 /* First stmt in the interleaving chain. Check the chain. */
2579 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2580 struct data_reference *data_ref = dr;
2581 unsigned int count = 1;
2582 tree prev_init = DR_INIT (data_ref);
2583 HOST_WIDE_INT diff, gaps = 0;
2584
2585 /* By construction, all group members have INTEGER_CST DR_INITs. */
2586 while (next)
2587 {
2588 /* We never have the same DR multiple times. */
2589 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2590 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2591
2592 data_ref = STMT_VINFO_DATA_REF (next);
2593
2594 /* All group members have the same STEP by construction. */
2595 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2596
2597 /* Check that the distance between two accesses is equal to the type
2598 size. Otherwise, we have gaps. */
2599 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2600 - TREE_INT_CST_LOW (prev_init)) / type_size;
2601 if (diff != 1)
2602 {
2603 /* FORNOW: SLP of accesses with gaps is not supported. */
2604 slp_impossible = true;
2605 if (DR_IS_WRITE (data_ref))
2606 {
2607 if (dump_enabled_p ())
2608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2609 "interleaved store with gaps\n");
2610 return false;
2611 }
2612
2613 gaps += diff - 1;
2614 }
2615
2616 last_accessed_element += diff;
2617
2618 /* Store the gap from the previous member of the group. If there is no
2619 gap in the access, DR_GROUP_GAP is always 1. */
2620 DR_GROUP_GAP (next) = diff;
2621
2622 prev_init = DR_INIT (data_ref);
2623 next = DR_GROUP_NEXT_ELEMENT (next);
2624 /* Count the number of data-refs in the chain. */
2625 count++;
2626 }
2627
2628 if (groupsize == 0)
2629 groupsize = count + gaps;
2630
2631 /* This could be UINT_MAX but as we are generating code in a very
2632 inefficient way we have to cap earlier. See PR78699 for example. */
2633 if (groupsize > 4096)
2634 {
2635 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2637 "group is too large\n");
2638 return false;
2639 }
2640
2641 /* Check that the size of the interleaving is equal to count for stores,
2642 i.e., that there are no gaps. */
2643 if (groupsize != count
2644 && !DR_IS_READ (dr))
2645 {
2646 groupsize = count;
2647 STMT_VINFO_STRIDED_P (stmt_info) = true;
2648 }
2649
2650 /* If there is a gap after the last load in the group it is the
2651 difference between the groupsize and the last accessed
2652 element.
2653 When there is no gap, this difference should be 0. */
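      /* E.g., a group size of 4 with a last accessed element of 3 leaves
         a trailing gap of one element.  */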
2654 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2655
2656 DR_GROUP_SIZE (stmt_info) = groupsize;
2657 if (dump_enabled_p ())
2658 {
2659 dump_printf_loc (MSG_NOTE, vect_location,
2660 "Detected interleaving ");
2661 if (DR_IS_READ (dr))
2662 dump_printf (MSG_NOTE, "load ");
2663 else if (STMT_VINFO_STRIDED_P (stmt_info))
2664 dump_printf (MSG_NOTE, "strided store ");
2665 else
2666 dump_printf (MSG_NOTE, "store ");
2667 dump_printf (MSG_NOTE, "of size %u\n",
2668 (unsigned)groupsize);
2669 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2670 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2671 while (next)
2672 {
2673 if (DR_GROUP_GAP (next) != 1)
2674 dump_printf_loc (MSG_NOTE, vect_location,
2675 "\t<gap of %d elements>\n",
2676 DR_GROUP_GAP (next) - 1);
2677 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2678 next = DR_GROUP_NEXT_ELEMENT (next);
2679 }
2680 if (DR_GROUP_GAP (stmt_info) != 0)
2681 dump_printf_loc (MSG_NOTE, vect_location,
2682 "\t<gap of %d elements>\n",
2683 DR_GROUP_GAP (stmt_info));
2684 }
2685
2686 /* SLP: create an SLP data structure for every interleaving group of
2687         stores for further analysis in vect_analyze_slp.  */
2688 if (DR_IS_WRITE (dr) && !slp_impossible)
2689 {
2690 if (loop_vinfo)
2691 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2692 if (bb_vinfo)
2693 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2694 }
2695 }
2696
2697 return true;
2698 }
2699
2700 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2701 accesses of legal size, step, etc. Detect gaps, single element
2702 interleaving, and other special cases. Set grouped access info.
2703 Collect groups of strided stores for further use in SLP analysis. */
2704
2705 static bool
2706 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2707 {
2708 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2709 {
2710 /* Dissolve the group if present. */
2711 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2712 while (stmt_info)
2713 {
2714 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2715 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2716 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2717 stmt_info = next;
2718 }
2719 return false;
2720 }
2721 return true;
2722 }
2723
2724 /* Analyze the access pattern of the data-reference DR_INFO.
2725 In case of non-consecutive accesses call vect_analyze_group_access() to
2726 analyze groups of accesses. */
2727
2728 static bool
2729 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2730 {
2731 data_reference *dr = dr_info->dr;
2732 tree step = DR_STEP (dr);
2733 tree scalar_type = TREE_TYPE (DR_REF (dr));
2734 stmt_vec_info stmt_info = dr_info->stmt;
2735 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2736 class loop *loop = NULL;
2737
2738 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2739 return true;
2740
2741 if (loop_vinfo)
2742 loop = LOOP_VINFO_LOOP (loop_vinfo);
2743
2744 if (loop_vinfo && !step)
2745 {
2746 if (dump_enabled_p ())
2747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2748 "bad data-ref access in loop\n");
2749 return false;
2750 }
2751
2752 /* Allow loads with zero step in inner-loop vectorization. */
2753 if (loop_vinfo && integer_zerop (step))
2754 {
2755 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2756 if (!nested_in_vect_loop_p (loop, stmt_info))
2757 return DR_IS_READ (dr);
2758 /* Allow references with zero step for outer loops marked
2759 with pragma omp simd only - it guarantees absence of
2760 loop-carried dependencies between inner loop iterations. */
2761 if (loop->safelen < 2)
2762 {
2763 if (dump_enabled_p ())
2764 dump_printf_loc (MSG_NOTE, vect_location,
2765 "zero step in inner loop of nest\n");
2766 return false;
2767 }
2768 }
2769
2770 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2771 {
2772 /* Interleaved accesses are not yet supported within outer-loop
2773 vectorization for references in the inner-loop. */
2774 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2775
2776 /* For the rest of the analysis we use the outer-loop step. */
2777 step = STMT_VINFO_DR_STEP (stmt_info);
2778 if (integer_zerop (step))
2779 {
2780 if (dump_enabled_p ())
2781 dump_printf_loc (MSG_NOTE, vect_location,
2782 "zero step in outer loop.\n");
2783 return DR_IS_READ (dr);
2784 }
2785 }
2786
2787 /* Consecutive? */
2788 if (TREE_CODE (step) == INTEGER_CST)
2789 {
2790 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2791 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2792 || (dr_step < 0
2793 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2794 {
2795 /* Mark that it is not interleaving. */
2796 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2797 return true;
2798 }
2799 }
2800
2801 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2802 {
2803 if (dump_enabled_p ())
2804 dump_printf_loc (MSG_NOTE, vect_location,
2805 "grouped access in outer loop.\n");
2806 return false;
2807 }
2808
2809
2810 /* Assume this is a DR handled by non-constant strided load case. */
2811 if (TREE_CODE (step) != INTEGER_CST)
2812 return (STMT_VINFO_STRIDED_P (stmt_info)
2813 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2814 || vect_analyze_group_access (vinfo, dr_info)));
2815
2816   /* Not a consecutive access - check if it's part of an interleaving group.  */
2817 return vect_analyze_group_access (vinfo, dr_info);
2818 }
2819
2820 typedef std::pair<data_reference_p, int> data_ref_pair;
2821
2822 /* Compare two data-references DRA and DRB to group them into chunks
2823 suitable for grouping. */
2824
2825 static int
2826 dr_group_sort_cmp (const void *dra_, const void *drb_)
2827 {
2828 data_ref_pair dra_pair = *(data_ref_pair *)const_cast<void *>(dra_);
2829 data_ref_pair drb_pair = *(data_ref_pair *)const_cast<void *>(drb_);
2830 data_reference_p dra = dra_pair.first;
2831 data_reference_p drb = drb_pair.first;
2832 int cmp;
2833
2834 /* Stabilize sort. */
2835 if (dra == drb)
2836 return 0;
2837
2838 /* DRs in different basic-blocks never belong to the same group. */
2839 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
2840 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
2841 if (bb_index1 != bb_index2)
2842 return bb_index1 < bb_index2 ? -1 : 1;
2843
2844   /* DRs with different group IDs never belong to the same group.  */
2845 if (dra_pair.second != drb_pair.second)
2846 return dra_pair.second < drb_pair.second ? -1 : 1;
2847
2848 /* Ordering of DRs according to base. */
2849 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2850 DR_BASE_ADDRESS (drb));
2851 if (cmp != 0)
2852 return cmp;
2853
2854 /* And according to DR_OFFSET. */
2855 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2856 if (cmp != 0)
2857 return cmp;
2858
2859 /* Put reads before writes. */
2860 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2861 return DR_IS_READ (dra) ? -1 : 1;
2862
2863 /* Then sort after access size. */
2864 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2865 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2866 if (cmp != 0)
2867 return cmp;
2868
2869 /* And after step. */
2870 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2871 if (cmp != 0)
2872 return cmp;
2873
2874 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2875 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2876 if (cmp == 0)
2877 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2878 return cmp;
2879 }
2880
2881 /* If OP is the result of a conversion, return the unconverted value,
2882 otherwise return null. */
2883
2884 static tree
2885 strip_conversion (tree op)
2886 {
2887 if (TREE_CODE (op) != SSA_NAME)
2888 return NULL_TREE;
2889 gimple *stmt = SSA_NAME_DEF_STMT (op);
2890 if (!is_gimple_assign (stmt)
2891 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2892 return NULL_TREE;
2893 return gimple_assign_rhs1 (stmt);
2894 }
2895
2896 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2897    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads
2898    with different masks can still be grouped in SLP mode.  */
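/* For example, two IFN_MASK_LOAD calls that use the same mask (possibly
   via a conversion such as those created by build_mask_conversion) can be
   grouped, whereas a masked load and a masked store cannot.  */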
2899
2900 static bool
2901 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2902 bool allow_slp_p)
2903 {
2904 if (gimple_assign_single_p (stmt1_info->stmt))
2905 return gimple_assign_single_p (stmt2_info->stmt);
2906
2907 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2908 if (call1 && gimple_call_internal_p (call1))
2909 {
2910 /* Check for two masked loads or two masked stores. */
2911 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2912 if (!call2 || !gimple_call_internal_p (call2))
2913 return false;
2914 internal_fn ifn = gimple_call_internal_fn (call1);
2915 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2916 return false;
2917 if (ifn != gimple_call_internal_fn (call2))
2918 return false;
2919
2920 /* Check that the masks are the same. Cope with casts of masks,
2921 like those created by build_mask_conversion. */
2922 tree mask1 = gimple_call_arg (call1, 2);
2923 tree mask2 = gimple_call_arg (call2, 2);
2924 if (!operand_equal_p (mask1, mask2, 0)
2925 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2926 {
2927 mask1 = strip_conversion (mask1);
2928 if (!mask1)
2929 return false;
2930 mask2 = strip_conversion (mask2);
2931 if (!mask2)
2932 return false;
2933 if (!operand_equal_p (mask1, mask2, 0))
2934 return false;
2935 }
2936 return true;
2937 }
2938
2939 return false;
2940 }
2941
2942 /* Function vect_analyze_data_ref_accesses.
2943
2944 Analyze the access pattern of all the data references in the loop.
2945
2946 FORNOW: the only access pattern that is considered vectorizable is a
2947 simple step 1 (consecutive) access.
2948
2949 FORNOW: handle only arrays and pointer accesses. */
2950
2951 opt_result
2952 vect_analyze_data_ref_accesses (vec_info *vinfo,
2953 vec<int> *dataref_groups)
2954 {
2955 unsigned int i;
2956 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2957
2958 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2959
2960 if (datarefs.is_empty ())
2961 return opt_result::success ();
2962
2963 /* Sort the array of datarefs to make building the interleaving chains
2964      linear.  Don't modify the original vector's order; it is needed for
2965      determining which dependencies are reversed.  */
2966 vec<data_ref_pair> datarefs_copy;
2967 datarefs_copy.create (datarefs.length ());
2968 for (unsigned i = 0; i < datarefs.length (); i++)
2969 {
2970 int group_id = dataref_groups ? (*dataref_groups)[i] : 0;
2971 datarefs_copy.quick_push (data_ref_pair (datarefs[i], group_id));
2972 }
2973 datarefs_copy.qsort (dr_group_sort_cmp);
2974 hash_set<stmt_vec_info> to_fixup;
2975
2976 /* Build the interleaving chains. */
2977 for (i = 0; i < datarefs_copy.length () - 1;)
2978 {
2979 data_reference_p dra = datarefs_copy[i].first;
2980 int dra_group_id = datarefs_copy[i].second;
2981 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2982 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2983 stmt_vec_info lastinfo = NULL;
2984 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2985 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2986 {
2987 ++i;
2988 continue;
2989 }
2990 for (i = i + 1; i < datarefs_copy.length (); ++i)
2991 {
2992 data_reference_p drb = datarefs_copy[i].first;
2993 int drb_group_id = datarefs_copy[i].second;
2994 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2995 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2996 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2997 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2998 break;
2999
3000           /* ??? Imperfect sorting (non-compatible types, non-modulo
3001              accesses, same accesses) can lead to a group being artificially
3002              split here as we don't just skip over those.  If it really
3003              matters we can push those to a worklist and re-iterate
3004              over them.  Then we can just skip ahead to the next DR here.  */
3005
3006           /* DRs in different BBs should not be put into the same
3007              interleaving group.  */
3008 int bb_index1 = gimple_bb (DR_STMT (dra))->index;
3009 int bb_index2 = gimple_bb (DR_STMT (drb))->index;
3010 if (bb_index1 != bb_index2)
3011 break;
3012
3013 if (dra_group_id != drb_group_id)
3014 break;
3015
3016           /* Check that the data-refs have the same first location (except init)
3017              and that they are both either stores or loads (not a load and a store,
3018              not incompatible masked loads or stores).  */
3019 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3020 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3021 DR_BASE_ADDRESS (drb)) != 0
3022 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3023 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3024 break;
3025
3026 /* Check that the data-refs have the same constant size. */
3027 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3028 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3029 if (!tree_fits_uhwi_p (sza)
3030 || !tree_fits_uhwi_p (szb)
3031 || !tree_int_cst_equal (sza, szb))
3032 break;
3033
3034 /* Check that the data-refs have the same step. */
3035 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3036 break;
3037
3038 /* Check the types are compatible.
3039 ??? We don't distinguish this during sorting. */
3040 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3041 TREE_TYPE (DR_REF (drb))))
3042 break;
3043
3044 /* Check that the DR_INITs are compile-time constants. */
3045 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3046 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3047 break;
3048
3049 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3050 just hold extra information. */
3051 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3052 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3053 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3054 break;
3055
3056 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3057 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3058 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3059 HOST_WIDE_INT init_prev
3060 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1].first));
3061 gcc_assert (init_a <= init_b
3062 && init_a <= init_prev
3063 && init_prev <= init_b);
3064
3065 /* Do not place the same access in the interleaving chain twice. */
3066 if (init_b == init_prev)
3067 {
3068 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1].first))
3069 < gimple_uid (DR_STMT (drb)));
3070 /* Simply link in duplicates and fix up the chain below. */
3071 }
3072 else
3073 {
3074 /* If init_b == init_a + the size of the type * k, we have an
3075 interleaving, and DRA is accessed before DRB. */
3076 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3077 if (type_size_a == 0
3078 || (init_b - init_a) % type_size_a != 0)
3079 break;
3080
3081               /* If we have a store, require the accesses to be adjacent.  This splits
3082                  groups into chunks we support (we don't support vectorization
3083                  of stores with gaps).  */
3084 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3085 break;
3086
3087               /* If the step (when constant and nonzero) is not larger than the
3088                  difference between the data-refs' inits, this splits groups into
3089                  suitable sizes.  */
3090 if (tree_fits_shwi_p (DR_STEP (dra)))
3091 {
3092 unsigned HOST_WIDE_INT step
3093 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3094 if (step != 0
3095 && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3096 break;
3097 }
3098 }
3099
3100 if (dump_enabled_p ())
3101 dump_printf_loc (MSG_NOTE, vect_location,
3102 DR_IS_READ (dra)
3103 ? "Detected interleaving load %T and %T\n"
3104 : "Detected interleaving store %T and %T\n",
3105 DR_REF (dra), DR_REF (drb));
3106
3107 /* Link the found element into the group list. */
3108 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3109 {
3110 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3111 lastinfo = stmtinfo_a;
3112 }
3113 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3114 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3115 lastinfo = stmtinfo_b;
3116
3117 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3118 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3119
3120 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3121 dump_printf_loc (MSG_NOTE, vect_location,
3122 "Load suitable for SLP vectorization only.\n");
3123
3124 if (init_b == init_prev
3125 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3126 && dump_enabled_p ())
3127 dump_printf_loc (MSG_NOTE, vect_location,
3128 "Queuing group with duplicate access for fixup\n");
3129 }
3130 }
3131
3132   /* Fix up groups with duplicate entries by splitting them.  */
3133 while (1)
3134 {
3135 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3136 if (!(it != to_fixup.end ()))
3137 break;
3138 stmt_vec_info grp = *it;
3139 to_fixup.remove (grp);
3140
3141 /* Find the earliest duplicate group member. */
3142 unsigned first_duplicate = -1u;
3143 stmt_vec_info next, g = grp;
3144 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3145 {
3146 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3147 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3148 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3149 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3150 g = next;
3151 }
3152 if (first_duplicate == -1U)
3153 continue;
3154
3155 /* Then move all stmts after the first duplicate to a new group.
3156 Note this is a heuristic but one with the property that *it
3157 is fixed up completely. */
3158 g = grp;
3159 stmt_vec_info newgroup = NULL, ng = grp;
3160 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3161 {
3162 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3163 {
3164 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3165 if (!newgroup)
3166 newgroup = next;
3167 else
3168 DR_GROUP_NEXT_ELEMENT (ng) = next;
3169 ng = next;
3170 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3171 }
3172 else
3173 g = DR_GROUP_NEXT_ELEMENT (g);
3174 }
3175 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3176
3177       /* Fix up the new group, which may still contain duplicates.  */
3178 to_fixup.add (newgroup);
3179 }
3180
3181 data_ref_pair *dr_pair;
3182 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_pair)
3183 {
3184 dr_vec_info *dr_info = vinfo->lookup_dr (dr_pair->first);
3185 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3186 && !vect_analyze_data_ref_access (vinfo, dr_info))
3187 {
3188 if (dump_enabled_p ())
3189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3190 "not vectorized: complicated access pattern.\n");
3191
3192 if (is_a <bb_vec_info> (vinfo))
3193 {
3194 /* Mark the statement as not vectorizable. */
3195 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3196 continue;
3197 }
3198 else
3199 {
3200 datarefs_copy.release ();
3201 return opt_result::failure_at (dr_info->stmt->stmt,
3202 "not vectorized:"
3203 " complicated access pattern.\n");
3204 }
3205 }
3206 }
3207
3208 datarefs_copy.release ();
3209 return opt_result::success ();
3210 }
3211
3212 /* Function vect_vfa_segment_size.
3213
3214 Input:
3215 DR_INFO: The data reference.
3216 LENGTH_FACTOR: segment length to consider.
3217
3218 Return a value suitable for the dr_with_seg_len::seg_len field.
3219 This is the "distance travelled" by the pointer from the first
3220 iteration in the segment to the last. Note that it does not include
3221 the size of the access; in effect it only describes the first byte. */
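/* For example, a DR with DR_STEP of 4 bytes and a LENGTH_FACTOR of 8 gets
   a segment length of (8 - 1) * 4 = 28 bytes.  */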
3222
3223 static tree
3224 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3225 {
3226 length_factor = size_binop (MINUS_EXPR,
3227 fold_convert (sizetype, length_factor),
3228 size_one_node);
3229 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3230 length_factor);
3231 }
3232
3233 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3234 gives the worst-case number of bytes covered by the segment. */
3235
3236 static unsigned HOST_WIDE_INT
3237 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3238 {
3239 stmt_vec_info stmt_vinfo = dr_info->stmt;
3240 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3241 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3242 unsigned HOST_WIDE_INT access_size = ref_size;
3243 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3244 {
3245 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3246 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3247 }
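  /* E.g., a 4-byte scalar reference leading a group of size 8 with a gap
     of 2 gives an access size of 4 * (8 - 2) = 24 bytes.  */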
3248 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3249 && (vect_supportable_dr_alignment (vinfo, dr_info, false)
3250 == dr_explicit_realign_optimized))
3251 {
3252 /* We might access a full vector's worth. */
3253 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3254 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3255 }
3256 return access_size;
3257 }
3258
3259 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3260 describes. */
3261
3262 static unsigned int
3263 vect_vfa_align (dr_vec_info *dr_info)
3264 {
3265 return dr_alignment (dr_info->dr);
3266 }
3267
3268 /* Function vect_no_alias_p.
3269
3270 Given data references A and B with equal base and offset, see whether
3271 the alias relation can be decided at compilation time. Return 1 if
3272 it can and the references alias, 0 if it can and the references do
3273 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3274 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3275 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3276
3277 static int
3278 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3279 tree segment_length_a, tree segment_length_b,
3280 unsigned HOST_WIDE_INT access_size_a,
3281 unsigned HOST_WIDE_INT access_size_b)
3282 {
3283 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3284 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3285 poly_uint64 const_length_a;
3286 poly_uint64 const_length_b;
3287
3288 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3289 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3290 [a, a+12) */
3291 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3292 {
3293 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3294 offset_a -= const_length_a;
3295 }
3296 else
3297 const_length_a = tree_to_poly_uint64 (segment_length_a);
3298 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3299 {
3300 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3301 offset_b -= const_length_b;
3302 }
3303 else
3304 const_length_b = tree_to_poly_uint64 (segment_length_b);
3305
3306 const_length_a += access_size_a;
3307 const_length_b += access_size_b;
3308
3309 if (ranges_known_overlap_p (offset_a, const_length_a,
3310 offset_b, const_length_b))
3311 return 1;
3312
3313 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3314 offset_b, const_length_b))
3315 return 0;
3316
3317 return -1;
3318 }
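/* As a worked example (values chosen for illustration only): two references
   with equal base and offset, DR_INITs of 0 and 64, positive steps, segment
   lengths of 28 and access sizes of 4 cover [0, 32) and [64, 96); these
   cannot overlap, so the function returns 0.  With a second DR_INIT of 16
   the ranges [0, 32) and [16, 48) are known to overlap and it returns 1.  */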
3319
3320 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3321 in DDR is >= VF. */
3322
3323 static bool
3324 dependence_distance_ge_vf (data_dependence_relation *ddr,
3325 unsigned int loop_depth, poly_uint64 vf)
3326 {
3327 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3328 || DDR_NUM_DIST_VECTS (ddr) == 0)
3329 return false;
3330
3331 /* If the dependence is exact, we should have limited the VF instead. */
3332 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3333
3334 unsigned int i;
3335 lambda_vector dist_v;
3336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3337 {
3338 HOST_WIDE_INT dist = dist_v[loop_depth];
3339 if (dist != 0
3340 && !(dist > 0 && DDR_REVERSED_P (ddr))
3341 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3342 return false;
3343 }
3344
3345 if (dump_enabled_p ())
3346 dump_printf_loc (MSG_NOTE, vect_location,
3347 "dependence distance between %T and %T is >= VF\n",
3348 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3349
3350 return true;
3351 }
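/* As a worked example (values chosen for illustration only, assuming the
   dependence is not reversed): a single distance vector whose entry at
   LOOP_DEPTH is 16 makes this return true for a VF of 8 or 16, but false
   for a VF of 32, since the distance would then be smaller than the VF.  */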
3352
3353 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3354
3355 static void
3356 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3357 {
3358 dump_printf (dump_kind, "%s (%T) >= ",
3359 lower_bound.unsigned_p ? "unsigned" : "abs",
3360 lower_bound.expr);
3361 dump_dec (dump_kind, lower_bound.min_value);
3362 }
3363
3364 /* Record that the vectorized loop requires the vec_lower_bound described
3365 by EXPR, UNSIGNED_P and MIN_VALUE. */
3366
3367 static void
3368 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3369 poly_uint64 min_value)
3370 {
3371 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3372 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3373 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3374 {
3375 unsigned_p &= lower_bounds[i].unsigned_p;
3376 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3377 if (lower_bounds[i].unsigned_p != unsigned_p
3378 || maybe_lt (lower_bounds[i].min_value, min_value))
3379 {
3380 lower_bounds[i].unsigned_p = unsigned_p;
3381 lower_bounds[i].min_value = min_value;
3382 if (dump_enabled_p ())
3383 {
3384 dump_printf_loc (MSG_NOTE, vect_location,
3385 "updating run-time check to ");
3386 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3387 dump_printf (MSG_NOTE, "\n");
3388 }
3389 }
3390 return;
3391 }
3392
3393 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3394 if (dump_enabled_p ())
3395 {
3396 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3397 dump_lower_bound (MSG_NOTE, lower_bound);
3398 dump_printf (MSG_NOTE, "\n");
3399 }
3400 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3401 }
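/* As a worked example (values chosen for illustration only): if a check
   "abs (x) >= 4" is already recorded and a caller later asks for
   "unsigned x >= 8", the entry is merged into "abs (x) >= 8", which is
   strong enough to satisfy both requests.  */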
3402
3403 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3404 will span fewer than GAP bytes. */
3405
3406 static bool
3407 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3408 poly_int64 gap)
3409 {
3410 stmt_vec_info stmt_info = dr_info->stmt;
3411 HOST_WIDE_INT count
3412 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3413 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3414 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3415 return (estimated_poly_value (gap)
3416 <= count * vect_get_scalar_dr_size (dr_info));
3417 }
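/* As a worked example (values chosen for illustration only): with an
   estimated VF of 8, a 4-byte scalar access and no group, any GAP of up
   to 32 bytes counts as "small", i.e. unlikely to exceed the bytes covered
   by one vector iteration.  */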
3418
3419 /* Return true if we know that there is no alias between DR_INFO_A and
3420 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3421 When returning true, set *LOWER_BOUND_OUT to this N. */
3422
3423 static bool
3424 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3425 poly_uint64 *lower_bound_out)
3426 {
3427 /* Check that there is a constant gap of known sign between DR_A
3428 and DR_B. */
3429 data_reference *dr_a = dr_info_a->dr;
3430 data_reference *dr_b = dr_info_b->dr;
3431 poly_int64 init_a, init_b;
3432 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3433 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3434 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3435 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3436 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3437 || !ordered_p (init_a, init_b))
3438 return false;
3439
3440 /* Sort DR_A and DR_B by the address they access. */
3441 if (maybe_lt (init_b, init_a))
3442 {
3443 std::swap (init_a, init_b);
3444 std::swap (dr_info_a, dr_info_b);
3445 std::swap (dr_a, dr_b);
3446 }
3447
3448 /* If the two accesses could be dependent within a scalar iteration,
3449 make sure that we'd retain their order. */
3450 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3451 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3452 return false;
3453
3454 /* There is no alias if abs (DR_STEP) is greater than or equal to
3455 the bytes spanned by the combination of the two accesses. */
3456 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3457 return true;
3458 }
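/* As a worked example (values chosen for illustration only): for accesses
   a[i] and a[i + 3] with 4-byte elements the bases, offsets and steps
   match and the DR_INITs are 0 and 12, so *LOWER_BOUND_OUT becomes
   12 + 4 = 16: the references cannot alias whenever abs (DR_STEP) is at
   least 16 bytes.  */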
3459
3460 /* Function vect_prune_runtime_alias_test_list.
3461
3462 Prune a list of ddrs to be tested at run-time by versioning for alias.
3463 Merge several alias checks into one if possible.
3464 Return FALSE if the resulting list of ddrs is longer than allowed by
3465 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3466
3467 opt_result
3468 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3469 {
3470 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3471 hash_set <tree_pair_hash> compared_objects;
3472
3473 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3474 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3475 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3476 vec<vec_object_pair> &check_unequal_addrs
3477 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3478 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3479 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3480
3481 ddr_p ddr;
3482 unsigned int i;
3483 tree length_factor;
3484
3485 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3486
3487 /* Step values are irrelevant for aliasing if the number of vector
3488 iterations is equal to the number of scalar iterations (which can
3489 happen for fully-SLP loops). */
3490 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3491
3492 if (!ignore_step_p)
3493 {
3494 /* Convert the checks for nonzero steps into bound tests. */
3495 tree value;
3496 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3497 vect_check_lower_bound (loop_vinfo, value, true, 1);
3498 }
3499
3500 if (may_alias_ddrs.is_empty ())
3501 return opt_result::success ();
3502
3503 comp_alias_ddrs.create (may_alias_ddrs.length ());
3504
3505 unsigned int loop_depth
3506 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3507 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3508
3509 /* First, we collect all data ref pairs for aliasing checks. */
3510 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3511 {
3512 poly_uint64 lower_bound;
3513 tree segment_length_a, segment_length_b;
3514 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3515 unsigned int align_a, align_b;
3516
3517 /* Ignore the alias if the VF we chose ended up being no greater
3518 than the dependence distance. */
3519 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3520 continue;
3521
3522 if (DDR_OBJECT_A (ddr))
3523 {
3524 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3525 if (!compared_objects.add (new_pair))
3526 {
3527 if (dump_enabled_p ())
3528 dump_printf_loc (MSG_NOTE, vect_location,
3529 "checking that %T and %T"
3530 " have different addresses\n",
3531 new_pair.first, new_pair.second);
3532 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3533 }
3534 continue;
3535 }
3536
3537 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3538 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3539
3540 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3541 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3542
3543 bool preserves_scalar_order_p
3544 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3545
3546 /* Skip the pair if inter-iteration dependencies are irrelevant
3547 and intra-iteration dependencies are guaranteed to be honored. */
3548 if (ignore_step_p
3549 && (preserves_scalar_order_p
3550 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3551 &lower_bound)))
3552 {
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE, vect_location,
3555 "no need for alias check between "
3556 "%T and %T when VF is 1\n",
3557 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3558 continue;
3559 }
3560
3561 /* See whether we can handle the alias using a bounds check on
3562 the step, and whether that's likely to be the best approach.
3563 (It might not be, for example, if the minimum step is much larger
3564 than the number of bytes handled by one vector iteration.) */
3565 if (!ignore_step_p
3566 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3567 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3568 &lower_bound)
3569 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3570 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3571 {
3572 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3573 if (dump_enabled_p ())
3574 {
3575 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3576 "%T and %T when the step %T is outside ",
3577 DR_REF (dr_info_a->dr),
3578 DR_REF (dr_info_b->dr),
3579 DR_STEP (dr_info_a->dr));
3580 if (unsigned_p)
3581 dump_printf (MSG_NOTE, "[0");
3582 else
3583 {
3584 dump_printf (MSG_NOTE, "(");
3585 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3586 }
3587 dump_printf (MSG_NOTE, ", ");
3588 dump_dec (MSG_NOTE, lower_bound);
3589 dump_printf (MSG_NOTE, ")\n");
3590 }
3591 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3592 unsigned_p, lower_bound);
3593 continue;
3594 }
3595
3596 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3597 if (dr_group_first_a)
3598 {
3599 stmt_info_a = dr_group_first_a;
3600 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3601 }
3602
3603 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3604 if (dr_group_first_b)
3605 {
3606 stmt_info_b = dr_group_first_b;
3607 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3608 }
3609
3610 if (ignore_step_p)
3611 {
3612 segment_length_a = size_zero_node;
3613 segment_length_b = size_zero_node;
3614 }
3615 else
3616 {
3617 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3618 DR_STEP (dr_info_b->dr), 0))
3619 length_factor = scalar_loop_iters;
3620 else
3621 length_factor = size_int (vect_factor);
3622 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3623 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3624 }
3625 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3626 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3627 align_a = vect_vfa_align (dr_info_a);
3628 align_b = vect_vfa_align (dr_info_b);
3629
3630 /* See whether the alias is known at compilation time. */
3631 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3632 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3633 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3634 DR_OFFSET (dr_info_b->dr), 0)
3635 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3636 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3637 && poly_int_tree_p (segment_length_a)
3638 && poly_int_tree_p (segment_length_b))
3639 {
3640 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3641 segment_length_a,
3642 segment_length_b,
3643 access_size_a,
3644 access_size_b);
3645 if (res >= 0 && dump_enabled_p ())
3646 {
3647 dump_printf_loc (MSG_NOTE, vect_location,
3648 "can tell at compile time that %T and %T",
3649 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3650 if (res == 0)
3651 dump_printf (MSG_NOTE, " do not alias\n");
3652 else
3653 dump_printf (MSG_NOTE, " alias\n");
3654 }
3655
3656 if (res == 0)
3657 continue;
3658
3659 if (res == 1)
3660 return opt_result::failure_at (stmt_info_b->stmt,
3661 "not vectorized:"
3662 " compilation time alias: %G%G",
3663 stmt_info_a->stmt,
3664 stmt_info_b->stmt);
3665 }
3666
3667 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3668 access_size_a, align_a);
3669 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3670 access_size_b, align_b);
3671 /* Canonicalize the order to be the one that's needed for accurate
3672 RAW, WAR and WAW flags, in cases where the data references are
3673 well-ordered. The order doesn't really matter otherwise,
3674 but we might as well be consistent. */
3675 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3676 std::swap (dr_a, dr_b);
3677
3678 dr_with_seg_len_pair_t dr_with_seg_len_pair
3679 (dr_a, dr_b, (preserves_scalar_order_p
3680 ? dr_with_seg_len_pair_t::WELL_ORDERED
3681 : dr_with_seg_len_pair_t::REORDERED));
3682
3683 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3684 }
3685
3686 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3687
3688 unsigned int count = (comp_alias_ddrs.length ()
3689 + check_unequal_addrs.length ());
3690
3691 if (count && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP)
3692 return opt_result::failure_at
3693 (vect_location, "would need a runtime alias check\n");
3694
3695 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_NOTE, vect_location,
3697 "improved number of alias checks from %d to %d\n",
3698 may_alias_ddrs.length (), count);
3699 unsigned limit = param_vect_max_version_for_alias_checks;
3700 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3701 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3702 if (count > limit)
3703 return opt_result::failure_at
3704 (vect_location,
3705 "number of versioning for alias run-time tests exceeds %d "
3706 "(--param vect-max-version-for-alias-checks)\n", limit);
3707
3708 return opt_result::success ();
3709 }
3710
3711 /* Check whether we can use an internal function for a gather load
3712 or scatter store. READ_P is true for loads and false for stores.
3713 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3714 the type of the memory elements being loaded or stored. OFFSET_TYPE
3715 is the type of the offset that is being applied to the invariant
3716 base address. SCALE is the amount by which the offset should
3717 be multiplied *after* it has been converted to address width.
3718
3719 Return true if the function is supported, storing the function id in
3720 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3721
3722 bool
3723 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3724 tree vectype, tree memory_type, tree offset_type,
3725 int scale, internal_fn *ifn_out,
3726 tree *offset_vectype_out)
3727 {
3728 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3729 unsigned int element_bits = vector_element_bits (vectype);
3730 if (element_bits != memory_bits)
3731 /* For now the vector elements must be the same width as the
3732 memory elements. */
3733 return false;
3734
3735 /* Work out which function we need. */
3736 internal_fn ifn;
3737 if (read_p)
3738 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3739 else
3740 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3741
3742 for (;;)
3743 {
3744 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3745 if (!offset_vectype)
3746 return false;
3747
3748 /* Test whether the target supports this combination. */
3749 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3750 offset_vectype, scale))
3751 {
3752 *ifn_out = ifn;
3753 *offset_vectype_out = offset_vectype;
3754 return true;
3755 }
3756
3757 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3758 && TYPE_PRECISION (offset_type) >= element_bits)
3759 return false;
3760
3761 offset_type = build_nonstandard_integer_type
3762 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3763 }
3764 }
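/* As an example (an assumed target, for illustration only): if the offset
   is an unsigned 16-bit value but the target only supports gather/scatter
   with 32-bit offsets, the widening loop above retries with a 32-bit
   unsigned offset type before giving up.  */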
3765
3766 /* STMT_INFO is a call to an internal gather load or scatter store function.
3767 Describe the operation in INFO. */
3768
3769 static void
3770 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3771 gather_scatter_info *info)
3772 {
3773 gcall *call = as_a <gcall *> (stmt_info->stmt);
3774 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3775 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3776
3777 info->ifn = gimple_call_internal_fn (call);
3778 info->decl = NULL_TREE;
3779 info->base = gimple_call_arg (call, 0);
3780 info->offset = gimple_call_arg (call, 1);
3781 info->offset_dt = vect_unknown_def_type;
3782 info->offset_vectype = NULL_TREE;
3783 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3784 info->element_type = TREE_TYPE (vectype);
3785 info->memory_type = TREE_TYPE (DR_REF (dr));
3786 }
3787
3788 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3789 gather load or scatter store. Describe the operation in *INFO if so. */
3790
3791 bool
3792 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3793 gather_scatter_info *info)
3794 {
3795 HOST_WIDE_INT scale = 1;
3796 poly_int64 pbitpos, pbitsize;
3797 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3798 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3799 tree offtype = NULL_TREE;
3800 tree decl = NULL_TREE, base, off;
3801 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3802 tree memory_type = TREE_TYPE (DR_REF (dr));
3803 machine_mode pmode;
3804 int punsignedp, reversep, pvolatilep = 0;
3805 internal_fn ifn;
3806 tree offset_vectype;
3807 bool masked_p = false;
3808
3809 /* See whether this is already a call to a gather/scatter internal function.
3810 If not, see whether it's a masked load or store. */
3811 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3812 if (call && gimple_call_internal_p (call))
3813 {
3814 ifn = gimple_call_internal_fn (call);
3815 if (internal_gather_scatter_fn_p (ifn))
3816 {
3817 vect_describe_gather_scatter_call (stmt_info, info);
3818 return true;
3819 }
3820 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3821 }
3822
3823 /* True if we should aim to use internal functions rather than
3824 built-in functions. */
3825 bool use_ifn_p = (DR_IS_READ (dr)
3826 ? supports_vec_gather_load_p ()
3827 : supports_vec_scatter_store_p ());
3828
3829 base = DR_REF (dr);
3830 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3831 see if we can use the def stmt of the address. */
3832 if (masked_p
3833 && TREE_CODE (base) == MEM_REF
3834 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3835 && integer_zerop (TREE_OPERAND (base, 1))
3836 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3837 {
3838 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3839 if (is_gimple_assign (def_stmt)
3840 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3841 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3842 }
3843
3844 /* The gather and scatter builtins need an address of the form
3845 loop_invariant + vector * {1, 2, 4, 8}
3846 or
3847 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3848 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3849 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3850 multiplications and additions in it. To get a vector, we need
3851 a single SSA_NAME that will be defined in the loop and will
3852 contain everything that is not loop invariant and that can be
3853 vectorized. The following code attempts to find such a preexisting
3854 SSA_NAME OFF and put the loop invariants into a tree BASE
3855 that can be gimplified before the loop. */
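/* As an example (an assumed scenario, for illustration only): for a
   gather-like read a[b[i]] with 4-byte elements the code below typically
   ends up with BASE = &a[0] (loop invariant), OFF = the SSA_NAME holding
   the possibly sign-extended b[i], and SCALE = 4.  */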
3856 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3857 &punsignedp, &reversep, &pvolatilep);
3858 if (reversep)
3859 return false;
3860
3861 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3862
3863 if (TREE_CODE (base) == MEM_REF)
3864 {
3865 if (!integer_zerop (TREE_OPERAND (base, 1)))
3866 {
3867 if (off == NULL_TREE)
3868 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3869 else
3870 off = size_binop (PLUS_EXPR, off,
3871 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3872 }
3873 base = TREE_OPERAND (base, 0);
3874 }
3875 else
3876 base = build_fold_addr_expr (base);
3877
3878 if (off == NULL_TREE)
3879 off = size_zero_node;
3880
3881 /* If BASE is not loop invariant, then: if OFF is 0, start with just the
3882 constant offset in the loop-invariant BASE and continue with BASE as
3883 OFF; otherwise give up.
3884 We could handle that case by gimplifying the addition of base + off
3885 into some SSA_NAME and using that as OFF, but for now punt. */
3886 if (!expr_invariant_in_loop_p (loop, base))
3887 {
3888 if (!integer_zerop (off))
3889 return false;
3890 off = base;
3891 base = size_int (pbytepos);
3892 }
3893 /* Otherwise put base + constant offset into the loop invariant BASE
3894 and continue with OFF. */
3895 else
3896 {
3897 base = fold_convert (sizetype, base);
3898 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3899 }
3900
3901 /* OFF at this point may be either a SSA_NAME or some tree expression
3902 from get_inner_reference. Try to peel off loop invariants from it
3903 into BASE as long as possible. */
3904 STRIP_NOPS (off);
3905 while (offtype == NULL_TREE)
3906 {
3907 enum tree_code code;
3908 tree op0, op1, add = NULL_TREE;
3909
3910 if (TREE_CODE (off) == SSA_NAME)
3911 {
3912 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3913
3914 if (expr_invariant_in_loop_p (loop, off))
3915 return false;
3916
3917 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3918 break;
3919
3920 op0 = gimple_assign_rhs1 (def_stmt);
3921 code = gimple_assign_rhs_code (def_stmt);
3922 op1 = gimple_assign_rhs2 (def_stmt);
3923 }
3924 else
3925 {
3926 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3927 return false;
3928 code = TREE_CODE (off);
3929 extract_ops_from_tree (off, &code, &op0, &op1);
3930 }
3931 switch (code)
3932 {
3933 case POINTER_PLUS_EXPR:
3934 case PLUS_EXPR:
3935 if (expr_invariant_in_loop_p (loop, op0))
3936 {
3937 add = op0;
3938 off = op1;
3939 do_add:
3940 add = fold_convert (sizetype, add);
3941 if (scale != 1)
3942 add = size_binop (MULT_EXPR, add, size_int (scale));
3943 base = size_binop (PLUS_EXPR, base, add);
3944 continue;
3945 }
3946 if (expr_invariant_in_loop_p (loop, op1))
3947 {
3948 add = op1;
3949 off = op0;
3950 goto do_add;
3951 }
3952 break;
3953 case MINUS_EXPR:
3954 if (expr_invariant_in_loop_p (loop, op1))
3955 {
3956 add = fold_convert (sizetype, op1);
3957 add = size_binop (MINUS_EXPR, size_zero_node, add);
3958 off = op0;
3959 goto do_add;
3960 }
3961 break;
3962 case MULT_EXPR:
3963 if (scale == 1 && tree_fits_shwi_p (op1))
3964 {
3965 int new_scale = tree_to_shwi (op1);
3966 /* Only treat this as a scaling operation if the target
3967 supports it for at least some offset type. */
3968 if (use_ifn_p
3969 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3970 masked_p, vectype, memory_type,
3971 signed_char_type_node,
3972 new_scale, &ifn,
3973 &offset_vectype)
3974 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3975 masked_p, vectype, memory_type,
3976 unsigned_char_type_node,
3977 new_scale, &ifn,
3978 &offset_vectype))
3979 break;
3980 scale = new_scale;
3981 off = op0;
3982 continue;
3983 }
3984 break;
3985 case SSA_NAME:
3986 off = op0;
3987 continue;
3988 CASE_CONVERT:
3989 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3990 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3991 break;
3992
3993 /* Don't include the conversion if the target is happy with
3994 the current offset type. */
3995 if (use_ifn_p
3996 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3997 masked_p, vectype, memory_type,
3998 TREE_TYPE (off), scale, &ifn,
3999 &offset_vectype))
4000 break;
4001
4002 if (TYPE_PRECISION (TREE_TYPE (op0))
4003 == TYPE_PRECISION (TREE_TYPE (off)))
4004 {
4005 off = op0;
4006 continue;
4007 }
4008
4009 if (TYPE_PRECISION (TREE_TYPE (op0))
4010 < TYPE_PRECISION (TREE_TYPE (off)))
4011 {
4012 off = op0;
4013 offtype = TREE_TYPE (off);
4014 STRIP_NOPS (off);
4015 continue;
4016 }
4017 break;
4018 default:
4019 break;
4020 }
4021 break;
4022 }
4023
4024 /* If at the end OFF still isn't a SSA_NAME or isn't
4025 defined in the loop, punt. */
4026 if (TREE_CODE (off) != SSA_NAME
4027 || expr_invariant_in_loop_p (loop, off))
4028 return false;
4029
4030 if (offtype == NULL_TREE)
4031 offtype = TREE_TYPE (off);
4032
4033 if (use_ifn_p)
4034 {
4035 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4036 vectype, memory_type, offtype, scale,
4037 &ifn, &offset_vectype))
4038 return false;
4039 }
4040 else
4041 {
4042 if (DR_IS_READ (dr))
4043 {
4044 if (targetm.vectorize.builtin_gather)
4045 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4046 }
4047 else
4048 {
4049 if (targetm.vectorize.builtin_scatter)
4050 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4051 }
4052
4053 if (!decl)
4054 return false;
4055
4056 ifn = IFN_LAST;
4057 /* The offset vector type will be read from DECL when needed. */
4058 offset_vectype = NULL_TREE;
4059 }
4060
4061 info->ifn = ifn;
4062 info->decl = decl;
4063 info->base = base;
4064 info->offset = off;
4065 info->offset_dt = vect_unknown_def_type;
4066 info->offset_vectype = offset_vectype;
4067 info->scale = scale;
4068 info->element_type = TREE_TYPE (vectype);
4069 info->memory_type = memory_type;
4070 return true;
4071 }
4072
4073 /* Find the data references in STMT, analyze them with respect to LOOP and
4074 append them to DATAREFS. Return false if datarefs in this stmt cannot
4075 be handled. */
4076
4077 opt_result
4078 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4079 vec<data_reference_p> *datarefs,
4080 vec<int> *dataref_groups, int group_id)
4081 {
4082 /* We can ignore clobbers for dataref analysis - they are removed during
4083 loop vectorization and BB vectorization checks dependences with a
4084 stmt walk. */
4085 if (gimple_clobber_p (stmt))
4086 return opt_result::success ();
4087
4088 if (gimple_has_volatile_ops (stmt))
4089 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4090 stmt);
4091
4092 if (stmt_can_throw_internal (cfun, stmt))
4093 return opt_result::failure_at (stmt,
4094 "not vectorized:"
4095 " statement can throw an exception: %G",
4096 stmt);
4097
4098 auto_vec<data_reference_p, 2> refs;
4099 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4100 if (!res)
4101 return res;
4102
4103 if (refs.is_empty ())
4104 return opt_result::success ();
4105
4106 if (refs.length () > 1)
4107 {
4108 while (!refs.is_empty ())
4109 free_data_ref (refs.pop ());
4110 return opt_result::failure_at (stmt,
4111 "not vectorized: more than one "
4112 "data ref in stmt: %G", stmt);
4113 }
4114
4115 data_reference_p dr = refs.pop ();
4116 if (gcall *call = dyn_cast <gcall *> (stmt))
4117 if (!gimple_call_internal_p (call)
4118 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4119 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4120 {
4121 free_data_ref (dr);
4122 return opt_result::failure_at (stmt,
4123 "not vectorized: dr in a call %G", stmt);
4124 }
4125
4126 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4127 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4128 {
4129 free_data_ref (dr);
4130 return opt_result::failure_at (stmt,
4131 "not vectorized:"
4132 " statement is bitfield access %G", stmt);
4133 }
4134
4135 if (DR_BASE_ADDRESS (dr)
4136 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4137 {
4138 free_data_ref (dr);
4139 return opt_result::failure_at (stmt,
4140 "not vectorized:"
4141 " base addr of dr is a constant\n");
4142 }
4143
4144 /* Check whether this may be a SIMD lane access and adjust the
4145 DR to make it easier for us to handle it. */
4146 if (loop
4147 && loop->simduid
4148 && (!DR_BASE_ADDRESS (dr)
4149 || !DR_OFFSET (dr)
4150 || !DR_INIT (dr)
4151 || !DR_STEP (dr)))
4152 {
4153 struct data_reference *newdr
4154 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4155 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4156 if (DR_BASE_ADDRESS (newdr)
4157 && DR_OFFSET (newdr)
4158 && DR_INIT (newdr)
4159 && DR_STEP (newdr)
4160 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4161 && integer_zerop (DR_STEP (newdr)))
4162 {
4163 tree base_address = DR_BASE_ADDRESS (newdr);
4164 tree off = DR_OFFSET (newdr);
4165 tree step = ssize_int (1);
4166 if (integer_zerop (off)
4167 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4168 {
4169 off = TREE_OPERAND (base_address, 1);
4170 base_address = TREE_OPERAND (base_address, 0);
4171 }
4172 STRIP_NOPS (off);
4173 if (TREE_CODE (off) == MULT_EXPR
4174 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4175 {
4176 step = TREE_OPERAND (off, 1);
4177 off = TREE_OPERAND (off, 0);
4178 STRIP_NOPS (off);
4179 }
4180 if (CONVERT_EXPR_P (off)
4181 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4182 < TYPE_PRECISION (TREE_TYPE (off))))
4183 off = TREE_OPERAND (off, 0);
4184 if (TREE_CODE (off) == SSA_NAME)
4185 {
4186 gimple *def = SSA_NAME_DEF_STMT (off);
4187 /* Look through widening conversion. */
4188 if (is_gimple_assign (def)
4189 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4190 {
4191 tree rhs1 = gimple_assign_rhs1 (def);
4192 if (TREE_CODE (rhs1) == SSA_NAME
4193 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4194 && (TYPE_PRECISION (TREE_TYPE (off))
4195 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4196 def = SSA_NAME_DEF_STMT (rhs1);
4197 }
4198 if (is_gimple_call (def)
4199 && gimple_call_internal_p (def)
4200 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4201 {
4202 tree arg = gimple_call_arg (def, 0);
4203 tree reft = TREE_TYPE (DR_REF (newdr));
4204 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4205 arg = SSA_NAME_VAR (arg);
4206 if (arg == loop->simduid
4207 /* For now. */
4208 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4209 {
4210 DR_BASE_ADDRESS (newdr) = base_address;
4211 DR_OFFSET (newdr) = ssize_int (0);
4212 DR_STEP (newdr) = step;
4213 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4214 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4215 /* Mark as simd-lane access. */
4216 tree arg2 = gimple_call_arg (def, 1);
4217 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4218 free_data_ref (dr);
4219 datarefs->safe_push (newdr);
4220 if (dataref_groups)
4221 dataref_groups->safe_push (group_id);
4222 return opt_result::success ();
4223 }
4224 }
4225 }
4226 }
4227 free_data_ref (newdr);
4228 }
4229
4230 datarefs->safe_push (dr);
4231 if (dataref_groups)
4232 dataref_groups->safe_push (group_id);
4233 return opt_result::success ();
4234 }
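/* As an example (an assumed scenario, for illustration only): the SIMD-lane
   handling above matches accesses roughly of the form
   tmp[.GOMP_SIMD_LANE (simduid) * sizeof (elt) + cst] emitted for
   privatized variables in an OpenMP simd loop, and rewrites them into
   unit-stride accesses with DR_STEP equal to the element size.  */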
4235
4236 /* Function vect_analyze_data_refs.
4237
4238 Find all the data references in the loop or basic block.
4239
4240 The general structure of the analysis of data refs in the vectorizer is as
4241 follows:
4242 1- vect_analyze_data_refs(loop/bb): call
4243 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4244 in the loop/bb and their dependences.
4245 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4246 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4247 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4248
4249 */
4250
4251 opt_result
4252 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4253 {
4254 class loop *loop = NULL;
4255 unsigned int i;
4256 struct data_reference *dr;
4257 tree scalar_type;
4258
4259 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4260
4261 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4262 loop = LOOP_VINFO_LOOP (loop_vinfo);
4263
4264 /* Go through the data-refs, check that the analysis succeeded. Update
4265 pointer from stmt_vec_info struct to DR and vectype. */
4266
4267 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4268 FOR_EACH_VEC_ELT (datarefs, i, dr)
4269 {
4270 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4271 poly_uint64 vf;
4272
4273 gcc_assert (DR_REF (dr));
4274 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4275 gcc_assert (!stmt_info->dr_aux.dr);
4276 stmt_info->dr_aux.dr = dr;
4277 stmt_info->dr_aux.stmt = stmt_info;
4278
4279 /* Check that analysis of the data-ref succeeded. */
4280 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4281 || !DR_STEP (dr))
4282 {
4283 bool maybe_gather
4284 = DR_IS_READ (dr)
4285 && !TREE_THIS_VOLATILE (DR_REF (dr))
4286 && (targetm.vectorize.builtin_gather != NULL
4287 || supports_vec_gather_load_p ());
4288 bool maybe_scatter
4289 = DR_IS_WRITE (dr)
4290 && !TREE_THIS_VOLATILE (DR_REF (dr))
4291 && (targetm.vectorize.builtin_scatter != NULL
4292 || supports_vec_scatter_store_p ());
4293
4294 /* If the target supports vector gather loads or scatter stores,
4295 see whether they can be used. */
4296 if (is_a <loop_vec_info> (vinfo)
4297 && !nested_in_vect_loop_p (loop, stmt_info))
4298 {
4299 if (maybe_gather || maybe_scatter)
4300 {
4301 if (maybe_gather)
4302 gatherscatter = GATHER;
4303 else
4304 gatherscatter = SCATTER;
4305 }
4306 }
4307
4308 if (gatherscatter == SG_NONE)
4309 {
4310 if (dump_enabled_p ())
4311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4312 "not vectorized: data ref analysis "
4313 "failed %G", stmt_info->stmt);
4314 if (is_a <bb_vec_info> (vinfo))
4315 {
4316 /* In BB vectorization the ref can still participate
4317 in dependence analysis, we just can't vectorize it. */
4318 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4319 continue;
4320 }
4321 return opt_result::failure_at (stmt_info->stmt,
4322 "not vectorized:"
4323 " data ref analysis failed: %G",
4324 stmt_info->stmt);
4325 }
4326 }
4327
4328 /* See if this was detected as SIMD lane access. */
4329 if (dr->aux == (void *)-1
4330 || dr->aux == (void *)-2
4331 || dr->aux == (void *)-3
4332 || dr->aux == (void *)-4)
4333 {
4334 if (nested_in_vect_loop_p (loop, stmt_info))
4335 return opt_result::failure_at (stmt_info->stmt,
4336 "not vectorized:"
4337 " data ref analysis failed: %G",
4338 stmt_info->stmt);
4339 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4340 = -(uintptr_t) dr->aux;
4341 }
4342
4343 tree base = get_base_address (DR_REF (dr));
4344 if (base && VAR_P (base) && DECL_NONALIASED (base))
4345 {
4346 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4348 "not vectorized: base object not addressable "
4349 "for stmt: %G", stmt_info->stmt);
4350 if (is_a <bb_vec_info> (vinfo))
4351 {
4352 /* In BB vectorization the ref can still participate
4353 in dependence analysis, we just can't vectorize it. */
4354 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4355 continue;
4356 }
4357 return opt_result::failure_at (stmt_info->stmt,
4358 "not vectorized: base object not"
4359 " addressable for stmt: %G",
4360 stmt_info->stmt);
4361 }
4362
4363 if (is_a <loop_vec_info> (vinfo)
4364 && DR_STEP (dr)
4365 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4366 {
4367 if (nested_in_vect_loop_p (loop, stmt_info))
4368 return opt_result::failure_at (stmt_info->stmt,
4369 "not vectorized: "
4370 "not suitable for strided load %G",
4371 stmt_info->stmt);
4372 STMT_VINFO_STRIDED_P (stmt_info) = true;
4373 }
4374
4375 /* Update DR field in stmt_vec_info struct. */
4376
4377 /* If the dataref is in an inner-loop of the loop that is considered
4378 for vectorization, we also want to analyze the access relative to
4379 the outer-loop (DR contains information only relative to the
4380 inner-most enclosing loop). We do that by building a reference to the
4381 first location accessed by the inner-loop, and analyze it relative to
4382 the outer-loop. */
4383 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4384 {
4385 /* Build a reference to the first location accessed by the
4386 inner loop: *(BASE + INIT + OFFSET). By construction,
4387 this address must be invariant in the inner loop, so we
4388 can consider it as being used in the outer loop. */
4389 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4390 tree offset = unshare_expr (DR_OFFSET (dr));
4391 tree init = unshare_expr (DR_INIT (dr));
4392 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4393 init, offset);
4394 tree init_addr = fold_build_pointer_plus (base, init_offset);
4395 tree init_ref = build_fold_indirect_ref (init_addr);
4396
4397 if (dump_enabled_p ())
4398 dump_printf_loc (MSG_NOTE, vect_location,
4399 "analyze in outer loop: %T\n", init_ref);
4400
4401 opt_result res
4402 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4403 init_ref, loop, stmt_info->stmt);
4404 if (!res)
4405 /* dr_analyze_innermost already explained the failure. */
4406 return res;
4407
4408 if (dump_enabled_p ())
4409 dump_printf_loc (MSG_NOTE, vect_location,
4410 "\touter base_address: %T\n"
4411 "\touter offset from base address: %T\n"
4412 "\touter constant offset from base address: %T\n"
4413 "\touter step: %T\n"
4414 "\touter base alignment: %d\n\n"
4415 "\touter base misalignment: %d\n"
4416 "\touter offset alignment: %d\n"
4417 "\touter step alignment: %d\n",
4418 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4419 STMT_VINFO_DR_OFFSET (stmt_info),
4420 STMT_VINFO_DR_INIT (stmt_info),
4421 STMT_VINFO_DR_STEP (stmt_info),
4422 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4423 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4424 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4425 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4426 }
4427
4428 /* Set vectype for STMT. */
4429 scalar_type = TREE_TYPE (DR_REF (dr));
4430 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4431 if (!vectype)
4432 {
4433 if (dump_enabled_p ())
4434 {
4435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4436 "not vectorized: no vectype for stmt: %G",
4437 stmt_info->stmt);
4438 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4439 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4440 scalar_type);
4441 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4442 }
4443
4444 if (is_a <bb_vec_info> (vinfo))
4445 {
4446 /* No vector type is fine, the ref can still participate
4447 in dependence analysis, we just can't vectorize it. */
4448 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4449 continue;
4450 }
4451 if (fatal)
4452 *fatal = false;
4453 return opt_result::failure_at (stmt_info->stmt,
4454 "not vectorized:"
4455 " no vectype for stmt: %G"
4456 " scalar_type: %T\n",
4457 stmt_info->stmt, scalar_type);
4458 }
4459 else
4460 {
4461 if (dump_enabled_p ())
4462 dump_printf_loc (MSG_NOTE, vect_location,
4463 "got vectype for stmt: %G%T\n",
4464 stmt_info->stmt, vectype);
4465 }
4466
4467 /* Adjust the minimal vectorization factor according to the
4468 vector type. */
4469 vf = TYPE_VECTOR_SUBPARTS (vectype);
4470 *min_vf = upper_bound (*min_vf, vf);
4471
4472 /* Leave the BB vectorizer to pick the vector type later, based on
4473 the final dataref group size and SLP node size. */
4474 if (is_a <loop_vec_info> (vinfo))
4475 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4476
4477 if (gatherscatter != SG_NONE)
4478 {
4479 gather_scatter_info gs_info;
4480 if (!vect_check_gather_scatter (stmt_info,
4481 as_a <loop_vec_info> (vinfo),
4482 &gs_info)
4483 || !get_vectype_for_scalar_type (vinfo,
4484 TREE_TYPE (gs_info.offset)))
4485 {
4486 if (fatal)
4487 *fatal = false;
4488 return opt_result::failure_at
4489 (stmt_info->stmt,
4490 (gatherscatter == GATHER)
4491 ? "not vectorized: not suitable for gather load %G"
4492 : "not vectorized: not suitable for scatter store %G",
4493 stmt_info->stmt);
4494 }
4495 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4496 }
4497 }
4498
4499 /* We used to stop processing and prune the list here. Verify we no
4500 longer need to. */
4501 gcc_assert (i == datarefs.length ());
4502
4503 return opt_result::success ();
4504 }
4505
4506
4507 /* Function vect_get_new_vect_var.
4508
4509 Returns a name for a new variable. The current naming scheme appends the
4510 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4511 the name of vectorizer generated variables, and appends that to NAME if
4512 provided. */
4513
4514 tree
4515 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4516 {
4517 const char *prefix;
4518 tree new_vect_var;
4519
4520 switch (var_kind)
4521 {
4522 case vect_simple_var:
4523 prefix = "vect";
4524 break;
4525 case vect_scalar_var:
4526 prefix = "stmp";
4527 break;
4528 case vect_mask_var:
4529 prefix = "mask";
4530 break;
4531 case vect_pointer_var:
4532 prefix = "vectp";
4533 break;
4534 default:
4535 gcc_unreachable ();
4536 }
4537
4538 if (name)
4539 {
4540 char* tmp = concat (prefix, "_", name, NULL);
4541 new_vect_var = create_tmp_reg (type, tmp);
4542 free (tmp);
4543 }
4544 else
4545 new_vect_var = create_tmp_reg (type, prefix);
4546
4547 return new_vect_var;
4548 }
4549
4550 /* Like vect_get_new_vect_var but return an SSA name. */
4551
4552 tree
4553 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4554 {
4555 const char *prefix;
4556 tree new_vect_var;
4557
4558 switch (var_kind)
4559 {
4560 case vect_simple_var:
4561 prefix = "vect";
4562 break;
4563 case vect_scalar_var:
4564 prefix = "stmp";
4565 break;
4566 case vect_pointer_var:
4567 prefix = "vectp";
4568 break;
4569 default:
4570 gcc_unreachable ();
4571 }
4572
4573 if (name)
4574 {
4575 char* tmp = concat (prefix, "_", name, NULL);
4576 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4577 free (tmp);
4578 }
4579 else
4580 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4581
4582 return new_vect_var;
4583 }
4584
4585 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4586
4587 static void
4588 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4589 {
4590 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4591 int misalign = DR_MISALIGNMENT (dr_info);
4592 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4593 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4594 else
4595 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4596 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4597 misalign);
4598 }
4599
4600 /* Function vect_create_addr_base_for_vector_ref.
4601
4602 Create an expression that computes the address of the first memory location
4603 that will be accessed for a data reference.
4604
4605 Input:
4606 STMT_INFO: The statement containing the data reference.
4607 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4608 OFFSET: Optional. If supplied, it is added to the initial address.
4609 LOOP: Specify relative to which loop-nest should the address be computed.
4610 For example, when the dataref is in an inner-loop nested in an
4611 outer-loop that is now being vectorized, LOOP can be either the
4612 outer-loop, or the inner-loop. The first memory location accessed
4613 by the following dataref ('in' points to short):
4614
4615 for (i=0; i<N; i++)
4616 for (j=0; j<M; j++)
4617 s += in[i+j]
4618
4619 is as follows:
4620 if LOOP=i_loop: &in (relative to i_loop)
4621 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4622 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4623 initial address. Unlike OFFSET, which is a number of elements to
4624 be added, BYTE_OFFSET is measured in bytes.
4625
4626 Output:
4627 1. Return an SSA_NAME whose value is the address of the memory location of
4628 the first vector of the data reference.
4629 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4630 these statement(s) which define the returned SSA_NAME.
4631
4632 FORNOW: We are only handling array accesses with step 1. */
4633
4634 tree
4635 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4636 gimple_seq *new_stmt_list,
4637 tree offset,
4638 tree byte_offset)
4639 {
4640 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4641 struct data_reference *dr = dr_info->dr;
4642 const char *base_name;
4643 tree addr_base;
4644 tree dest;
4645 gimple_seq seq = NULL;
4646 tree vect_ptr_type;
4647 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4648 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4649 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4650
4651 tree data_ref_base = unshare_expr (drb->base_address);
4652 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4653 tree init = unshare_expr (drb->init);
4654
4655 if (loop_vinfo)
4656 base_name = get_name (data_ref_base);
4657 else
4658 {
4659 base_offset = ssize_int (0);
4660 init = ssize_int (0);
4661 base_name = get_name (DR_REF (dr));
4662 }
4663
4664 /* Create base_offset */
4665 base_offset = size_binop (PLUS_EXPR,
4666 fold_convert (sizetype, base_offset),
4667 fold_convert (sizetype, init));
4668
4669 if (offset)
4670 {
4671 offset = fold_build2 (MULT_EXPR, sizetype,
4672 fold_convert (sizetype, offset), step);
4673 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4674 base_offset, offset);
4675 }
4676 if (byte_offset)
4677 {
4678 byte_offset = fold_convert (sizetype, byte_offset);
4679 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4680 base_offset, byte_offset);
4681 }
4682
4683 /* base + base_offset */
4684 if (loop_vinfo)
4685 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4686 else
4687 {
4688 addr_base = build1 (ADDR_EXPR,
4689 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4690 unshare_expr (DR_REF (dr)));
4691 }
4692
4693 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4694 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4695 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4696 gimple_seq_add_seq (new_stmt_list, seq);
4697
4698 if (DR_PTR_INFO (dr)
4699 && TREE_CODE (addr_base) == SSA_NAME
4700 && !SSA_NAME_PTR_INFO (addr_base))
4701 {
4702 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4703 if (offset || byte_offset)
4704 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4705 }
4706
4707 if (dump_enabled_p ())
4708 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4709
4710 return addr_base;
4711 }
4712
4713
4714 /* Function vect_create_data_ref_ptr.
4715
4716 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4717 location accessed in the loop by STMT_INFO, along with the def-use update
4718 chain to appropriately advance the pointer through the loop iterations.
4719 Also set aliasing information for the pointer. This pointer is used by
4720 the callers to this function to create a memory reference expression for
4721 vector load/store access.
4722
4723 Input:
4724 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4725 GIMPLE_ASSIGN <name, data-ref> or
4726 GIMPLE_ASSIGN <data-ref, name>.
4727 2. AGGR_TYPE: the type of the reference, which should be either a vector
4728 or an array.
4729 3. AT_LOOP: the loop where the vector memref is to be created.
4730 4. OFFSET (optional): an offset to be added to the initial address accessed
4731 by the data-ref in STMT_INFO.
4732 5. BSI: location where the new stmts are to be placed if there is no loop
4733 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4734 pointing to the initial address.
4735 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4736 to the initial address accessed by the data-ref in STMT_INFO. This is
4737 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4738 in bytes.
4739 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4740 to the IV during each iteration of the loop. NULL says to move
4741 by one copy of AGGR_TYPE up or down, depending on the step of the
4742 data reference.
4743
4744 Output:
4745 1. Declare a new ptr to vector_type, and have it point to the base of the
4746 data reference (initial address accessed by the data reference).
4747 For example, for vector of type V8HI, the following code is generated:
4748
4749 v8hi *ap;
4750 ap = (v8hi *)initial_address;
4751
4752 if OFFSET is not supplied:
4753 initial_address = &a[init];
4754 if OFFSET is supplied:
4755 initial_address = &a[init + OFFSET];
4756 if BYTE_OFFSET is supplied:
4757 initial_address = &a[init] + BYTE_OFFSET;
4758
4759 Return the initial_address in INITIAL_ADDRESS.
4760
4761 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4762 update the pointer in each iteration of the loop.
4763
4764 Return the increment stmt that updates the pointer in PTR_INCR.
4765
4766 3. Return the pointer. */
4767
4768 tree
4769 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4770 tree aggr_type, class loop *at_loop, tree offset,
4771 tree *initial_address, gimple_stmt_iterator *gsi,
4772 gimple **ptr_incr, bool only_init,
4773 tree byte_offset, tree iv_step)
4774 {
4775 const char *base_name;
4776 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4777 class loop *loop = NULL;
4778 bool nested_in_vect_loop = false;
4779 class loop *containing_loop = NULL;
4780 tree aggr_ptr_type;
4781 tree aggr_ptr;
4782 tree new_temp;
4783 gimple_seq new_stmt_list = NULL;
4784 edge pe = NULL;
4785 basic_block new_bb;
4786 tree aggr_ptr_init;
4787 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4788 struct data_reference *dr = dr_info->dr;
4789 tree aptr;
4790 gimple_stmt_iterator incr_gsi;
4791 bool insert_after;
4792 tree indx_before_incr, indx_after_incr;
4793 gimple *incr;
4794 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4795
4796 gcc_assert (iv_step != NULL_TREE
4797 || TREE_CODE (aggr_type) == ARRAY_TYPE
4798 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4799
4800 if (loop_vinfo)
4801 {
4802 loop = LOOP_VINFO_LOOP (loop_vinfo);
4803 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4804 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4805 pe = loop_preheader_edge (loop);
4806 }
4807 else
4808 {
4809 gcc_assert (bb_vinfo);
4810 only_init = true;
4811 *ptr_incr = NULL;
4812 }
4813
4814 /* Create an expression for the first address accessed by this load
4815 in LOOP. */
4816 base_name = get_name (DR_BASE_ADDRESS (dr));
4817
4818 if (dump_enabled_p ())
4819 {
4820 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4821 dump_printf_loc (MSG_NOTE, vect_location,
4822 "create %s-pointer variable to type: %T",
4823 get_tree_code_name (TREE_CODE (aggr_type)),
4824 aggr_type);
4825 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4826 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4827 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4828 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4829 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4830 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4831 else
4832 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4833 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4834 }
4835
4836 /* (1) Create the new aggregate-pointer variable.
4837 Vector and array types inherit the alias set of their component
4838 type by default so we need to use a ref-all pointer if the data
4839 reference does not conflict with the created aggregated data
4840 reference because it is not addressable. */
4841 bool need_ref_all = false;
4842 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4843 get_alias_set (DR_REF (dr))))
4844 need_ref_all = true;
4845 /* Likewise for any of the data references in the stmt group. */
4846 else if (DR_GROUP_SIZE (stmt_info) > 1)
4847 {
4848 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4849 do
4850 {
4851 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4852 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4853 get_alias_set (DR_REF (sdr))))
4854 {
4855 need_ref_all = true;
4856 break;
4857 }
4858 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4859 }
4860 while (sinfo);
4861 }
4862 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4863 need_ref_all);
4864 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4865
4866
4867 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4868 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4869 def-use update cycles for the pointer: one relative to the outer-loop
4870 (LOOP), which is what steps (3) and (4) below do. The other is relative
4871 to the inner-loop (which is the inner-most loop containing the dataref),
4872 and this is done by step (5) below.
4873
4874 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4875 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4876 redundant. Steps (3),(4) create the following:
4877
4878 vp0 = &base_addr;
4879 LOOP: vp1 = phi(vp0,vp2)
4880 ...
4881 ...
4882 vp2 = vp1 + step
4883 goto LOOP
4884
4885 If there is an inner-loop nested in loop, then step (5) will also be
4886 applied, and an additional update in the inner-loop will be created:
4887
4888 vp0 = &base_addr;
4889 LOOP: vp1 = phi(vp0,vp2)
4890 ...
4891 inner: vp3 = phi(vp1,vp4)
4892 vp4 = vp3 + inner_step
4893 if () goto inner
4894 ...
4895 vp2 = vp1 + step
4896 if () goto LOOP */
4897
4898 /* (2) Calculate the initial address of the aggregate-pointer, and set
4899 the aggregate-pointer to point to it before the loop. */
4900
4901 /* Create: &(base[init_val + offset]) + byte_offset in the loop preheader. */
4902
4903 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4904 stmt_info, &new_stmt_list,
4905 offset, byte_offset);
4906 if (new_stmt_list)
4907 {
4908 if (pe)
4909 {
4910 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4911 gcc_assert (!new_bb);
4912 }
4913 else
4914 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4915 }
4916
4917 *initial_address = new_temp;
4918 aggr_ptr_init = new_temp;
4919
4920 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4921 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4922 inner-loop nested in LOOP (during outer-loop vectorization). */
4923
4924 /* No update in loop is required. */
4925 if (only_init && (!loop_vinfo || at_loop == loop))
4926 aptr = aggr_ptr_init;
4927 else
4928 {
4929 /* Accesses to invariant addresses should be handled specially
4930 by the caller. */
4931 tree step = vect_dr_behavior (vinfo, dr_info)->step;
4932 gcc_assert (!integer_zerop (step));
4933
4934 if (iv_step == NULL_TREE)
4935 {
4936 /* The step of the aggregate pointer is the type size,
4937 negated for downward accesses. */
4938 iv_step = TYPE_SIZE_UNIT (aggr_type);
4939 if (tree_int_cst_sgn (step) == -1)
4940 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4941 }
4942
4943 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4944
4945 create_iv (aggr_ptr_init,
4946 fold_convert (aggr_ptr_type, iv_step),
4947 aggr_ptr, loop, &incr_gsi, insert_after,
4948 &indx_before_incr, &indx_after_incr);
4949 incr = gsi_stmt (incr_gsi);
4950
4951 /* Copy the points-to information if it exists. */
4952 if (DR_PTR_INFO (dr))
4953 {
4954 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4955 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4956 }
4957 if (ptr_incr)
4958 *ptr_incr = incr;
4959
4960 aptr = indx_before_incr;
4961 }
4962
4963 if (!nested_in_vect_loop || only_init)
4964 return aptr;
4965
4966
4967 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4968 nested in LOOP, if it exists. */
4969
4970 gcc_assert (nested_in_vect_loop);
4971 if (!only_init)
4972 {
4973 standard_iv_increment_position (containing_loop, &incr_gsi,
4974 &insert_after);
4975 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4976 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4977 &indx_after_incr);
4978 incr = gsi_stmt (incr_gsi);
4979
4980 /* Copy the points-to information if it exists. */
4981 if (DR_PTR_INFO (dr))
4982 {
4983 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4984 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4985 }
4986 if (ptr_incr)
4987 *ptr_incr = incr;
4988
4989 return indx_before_incr;
4990 }
4991 else
4992 gcc_unreachable ();
4993 }
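
/* A minimal standalone sketch of the pointer induction variables described in
   the comment above, written with plain C pointers instead of GIMPLE.  The
   example_* name and the *_step / *_iters parameters are illustrative only,
   not part of the vectorizer.  */

static void
example_nested_pointer_ivs (char *base_addr, long outer_step, long inner_step,
			    int outer_iters, int inner_iters)
{
  char *vp1 = base_addr;			/* LOOP: vp1 = phi (vp0, vp2) */
  for (int i = 0; i < outer_iters; i++)
    {
      char *vp3 = vp1;				/* inner: vp3 = phi (vp1, vp4) */
      for (int j = 0; j < inner_iters; j++)
	vp3 = vp3 + inner_step;			/* vp4 = vp3 + inner_step */
      vp1 = vp1 + outer_step;			/* vp2 = vp1 + step */
    }
}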
4994
4995
4996 /* Function bump_vector_ptr
4997
4998 Increment a pointer (to a vector type) by vector-size. If requested,
4999 i.e. if PTR_INCR is given, then also connect the new increment stmt
5000 to the existing def-use update-chain of the pointer, by modifying
5001 the PTR_INCR as illustrated below:
5002
5003 The pointer def-use update-chain before this function:
5004 DATAREF_PTR = phi (p_0, p_2)
5005 ....
5006 PTR_INCR: p_2 = DATAREF_PTR + step
5007
5008 The pointer def-use update-chain after this function:
5009 DATAREF_PTR = phi (p_0, p_2)
5010 ....
5011 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5012 ....
5013 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5014
5015 Input:
5016 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5017 in the loop.
5018 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5019 the loop. The increment amount across iterations is expected
5020 to be vector_size.
5021 GSI - location where the new update stmt is to be placed.
5022 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5023 BUMP - optional. The offset by which to bump the pointer. If not given,
5024 the offset is assumed to be vector_size.
5025
5026 Output: Return NEW_DATAREF_PTR as illustrated above.
5027
5028 */
5029
5030 tree
5031 bump_vector_ptr (vec_info *vinfo,
5032 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5033 stmt_vec_info stmt_info, tree bump)
5034 {
5035 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5036 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5037 tree update = TYPE_SIZE_UNIT (vectype);
5038 gassign *incr_stmt;
5039 ssa_op_iter iter;
5040 use_operand_p use_p;
5041 tree new_dataref_ptr;
5042
5043 if (bump)
5044 update = bump;
5045
5046 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5047 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5048 else
5049 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5050 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5051 dataref_ptr, update);
5052 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5053
5054 /* Copy the points-to information if it exists. */
5055 if (DR_PTR_INFO (dr))
5056 {
5057 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5058 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5059 }
5060
5061 if (!ptr_incr)
5062 return new_dataref_ptr;
5063
5064 /* Update the vector-pointer's cross-iteration increment. */
5065 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5066 {
5067 tree use = USE_FROM_PTR (use_p);
5068
5069 if (use == dataref_ptr)
5070 SET_USE (use_p, new_dataref_ptr);
5071 else
5072 gcc_assert (operand_equal_p (use, update, 0));
5073 }
5074
5075 return new_dataref_ptr;
5076 }
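
/* A minimal sketch, in plain C terms, of the def-use rewiring illustrated in
   the comment above: after bump_vector_ptr the cross-iteration increment is
   applied to the bumped pointer, so BUMP and STEP compose within one
   iteration.  The example_* name is illustrative only.  */

static const char *
example_bump_then_increment (const char *dataref_ptr, long bump, long step)
{
  const char *new_dataref_ptr = dataref_ptr + bump;	/* NEW_DATAREF_PTR */
  const char *p_2 = new_dataref_ptr + step;		/* rewired PTR_INCR */
  return p_2;
}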
5077
5078
5079 /* Copy memory reference info such as base/clique from the SRC reference
5080 to the DEST MEM_REF. */
5081
5082 void
5083 vect_copy_ref_info (tree dest, tree src)
5084 {
5085 if (TREE_CODE (dest) != MEM_REF)
5086 return;
5087
5088 tree src_base = src;
5089 while (handled_component_p (src_base))
5090 src_base = TREE_OPERAND (src_base, 0);
5091 if (TREE_CODE (src_base) != MEM_REF
5092 && TREE_CODE (src_base) != TARGET_MEM_REF)
5093 return;
5094
5095 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5096 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5097 }
5098
5099
5100 /* Function vect_create_destination_var.
5101
5102 Create a new temporary of type VECTYPE. */
5103
5104 tree
5105 vect_create_destination_var (tree scalar_dest, tree vectype)
5106 {
5107 tree vec_dest;
5108 const char *name;
5109 char *new_name;
5110 tree type;
5111 enum vect_var_kind kind;
5112
5113 kind = vectype
5114 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5115 ? vect_mask_var
5116 : vect_simple_var
5117 : vect_scalar_var;
5118 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5119
5120 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5121
5122 name = get_name (scalar_dest);
5123 if (name)
5124 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5125 else
5126 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5127 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5128 free (new_name);
5129
5130 return vec_dest;
5131 }
5132
5133 /* Function vect_grouped_store_supported.
5134
5135 Returns TRUE if interleave high and interleave low permutations
5136 are supported, and FALSE otherwise. */
5137
5138 bool
5139 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5140 {
5141 machine_mode mode = TYPE_MODE (vectype);
5142
5143 /* vect_permute_store_chain requires the group size to be equal to 3 or
5144 be a power of two. */
5145 if (count != 3 && exact_log2 (count) == -1)
5146 {
5147 if (dump_enabled_p ())
5148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5149 "the size of the group of accesses"
5150 " is not a power of 2 or not eqaul to 3\n");
5151 return false;
5152 }
5153
5154 /* Check that the permutation is supported. */
5155 if (VECTOR_MODE_P (mode))
5156 {
5157 unsigned int i;
5158 if (count == 3)
5159 {
5160 unsigned int j0 = 0, j1 = 0, j2 = 0;
5161 unsigned int i, j;
5162
5163 unsigned int nelt;
5164 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5165 {
5166 if (dump_enabled_p ())
5167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5168 "cannot handle groups of 3 stores for"
5169 " variable-length vectors\n");
5170 return false;
5171 }
5172
5173 vec_perm_builder sel (nelt, nelt, 1);
5174 sel.quick_grow (nelt);
5175 vec_perm_indices indices;
5176 for (j = 0; j < 3; j++)
5177 {
5178 int nelt0 = ((3 - j) * nelt) % 3;
5179 int nelt1 = ((3 - j) * nelt + 1) % 3;
5180 int nelt2 = ((3 - j) * nelt + 2) % 3;
5181 for (i = 0; i < nelt; i++)
5182 {
5183 if (3 * i + nelt0 < nelt)
5184 sel[3 * i + nelt0] = j0++;
5185 if (3 * i + nelt1 < nelt)
5186 sel[3 * i + nelt1] = nelt + j1++;
5187 if (3 * i + nelt2 < nelt)
5188 sel[3 * i + nelt2] = 0;
5189 }
5190 indices.new_vector (sel, 2, nelt);
5191 if (!can_vec_perm_const_p (mode, indices))
5192 {
5193 if (dump_enabled_p ())
5194 dump_printf (MSG_MISSED_OPTIMIZATION,
5195 "permutation op not supported by target.\n");
5196 return false;
5197 }
5198
5199 for (i = 0; i < nelt; i++)
5200 {
5201 if (3 * i + nelt0 < nelt)
5202 sel[3 * i + nelt0] = 3 * i + nelt0;
5203 if (3 * i + nelt1 < nelt)
5204 sel[3 * i + nelt1] = 3 * i + nelt1;
5205 if (3 * i + nelt2 < nelt)
5206 sel[3 * i + nelt2] = nelt + j2++;
5207 }
5208 indices.new_vector (sel, 2, nelt);
5209 if (!can_vec_perm_const_p (mode, indices))
5210 {
5211 if (dump_enabled_p ())
5212 dump_printf (MSG_MISSED_OPTIMIZATION,
5213 "permutation op not supported by target.\n");
5214 return false;
5215 }
5216 }
5217 return true;
5218 }
5219 else
5220 {
5221 /* If length is not equal to 3 then only power of 2 is supported. */
5222 gcc_assert (pow2p_hwi (count));
5223 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5224
5225 /* The encoding has 2 interleaved stepped patterns. */
5226 vec_perm_builder sel (nelt, 2, 3);
5227 sel.quick_grow (6);
5228 for (i = 0; i < 3; i++)
5229 {
5230 sel[i * 2] = i;
5231 sel[i * 2 + 1] = i + nelt;
5232 }
5233 vec_perm_indices indices (sel, 2, nelt);
5234 if (can_vec_perm_const_p (mode, indices))
5235 {
5236 for (i = 0; i < 6; i++)
5237 sel[i] += exact_div (nelt, 2);
5238 indices.new_vector (sel, 2, nelt);
5239 if (can_vec_perm_const_p (mode, indices))
5240 return true;
5241 }
5242 }
5243 }
5244
5245 if (dump_enabled_p ())
5246 dump_printf (MSG_MISSED_OPTIMIZATION,
5247 "permutation op not supported by target.\n");
5248 return false;
5249 }
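
/* A minimal standalone sketch of the two selectors queried above for the
   power-of-two case, expanded to full length for a concrete NELT; the
   example_* name is illustrative only.  For NELT == 8 this produces
   high = {0,8,1,9,2,10,3,11} and low = {4,12,5,13,6,14,7,15}.  */

static void
example_interleave_selectors (unsigned int nelt,
			      unsigned int *high_sel, unsigned int *low_sel)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      high_sel[2 * i] = i;			/* from the first vector */
      high_sel[2 * i + 1] = i + nelt;		/* from the second vector */
      low_sel[2 * i] = i + nelt / 2;
      low_sel[2 * i + 1] = i + nelt / 2 + nelt;
    }
}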
5250
5251
5252 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5253 type VECTYPE. MASKED_P says whether the masked form is needed. */
5254
5255 bool
5256 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5257 bool masked_p)
5258 {
5259 if (masked_p)
5260 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5261 vec_mask_store_lanes_optab,
5262 vectype, count);
5263 else
5264 return vect_lanes_optab_supported_p ("vec_store_lanes",
5265 vec_store_lanes_optab,
5266 vectype, count);
5267 }
5268
5269
5270 /* Function vect_permute_store_chain.
5271
5272 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5273 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5274 the data correctly for the stores. Return the final references for stores
5275 in RESULT_CHAIN.
5276
5277 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5278 The input is 4 vectors each containing 8 elements. We assign a number to
5279 each element, the input sequence is:
5280
5281 1st vec: 0 1 2 3 4 5 6 7
5282 2nd vec: 8 9 10 11 12 13 14 15
5283 3rd vec: 16 17 18 19 20 21 22 23
5284 4th vec: 24 25 26 27 28 29 30 31
5285
5286 The output sequence should be:
5287
5288 1st vec: 0 8 16 24 1 9 17 25
5289 2nd vec: 2 10 18 26 3 11 19 27
5290 3rd vec: 4 12 20 28 5 13 21 29
5291 4th vec: 6 14 22 30 7 15 23 31
5292
5293 i.e., we interleave the contents of the four vectors in their order.
5294
5295 We use interleave_high/low instructions to create such output. The input of
5296 each interleave_high/low operation is two vectors:
5297 1st vec 2nd vec
5298 0 1 2 3 4 5 6 7
5299 the even elements of the result vector are obtained left-to-right from the
5300 high/low elements of the first vector. The odd elements of the result are
5301 obtained left-to-right from the high/low elements of the second vector.
5302 The output of interleave_high will be: 0 4 1 5
5303 and of interleave_low: 2 6 3 7
5304
5305
5306 The permutation is done in log LENGTH stages. In each stage interleave_high
5307 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5308 where the first argument is taken from the first half of DR_CHAIN and the
5309 second argument from its second half.
5310 In our example,
5311
5312 I1: interleave_high (1st vec, 3rd vec)
5313 I2: interleave_low (1st vec, 3rd vec)
5314 I3: interleave_high (2nd vec, 4th vec)
5315 I4: interleave_low (2nd vec, 4th vec)
5316
5317 The output for the first stage is:
5318
5319 I1: 0 16 1 17 2 18 3 19
5320 I2: 4 20 5 21 6 22 7 23
5321 I3: 8 24 9 25 10 26 11 27
5322 I4: 12 28 13 29 14 30 15 31
5323
5324 The output of the second stage, i.e. the final result is:
5325
5326 I1: 0 8 16 24 1 9 17 25
5327 I2: 2 10 18 26 3 11 19 27
5328 I3: 4 12 20 28 5 13 21 29
5329 I4: 6 14 22 30 7 15 23 31. */
5330
5331 void
5332 vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
5333 unsigned int length,
5334 stmt_vec_info stmt_info,
5335 gimple_stmt_iterator *gsi,
5336 vec<tree> *result_chain)
5337 {
5338 tree vect1, vect2, high, low;
5339 gimple *perm_stmt;
5340 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5341 tree perm_mask_low, perm_mask_high;
5342 tree data_ref;
5343 tree perm3_mask_low, perm3_mask_high;
5344 unsigned int i, j, n, log_length = exact_log2 (length);
5345
5346 result_chain->quick_grow (length);
5347 memcpy (result_chain->address (), dr_chain.address (),
5348 length * sizeof (tree));
5349
5350 if (length == 3)
5351 {
5352 /* vect_grouped_store_supported ensures that this is constant. */
5353 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5354 unsigned int j0 = 0, j1 = 0, j2 = 0;
5355
5356 vec_perm_builder sel (nelt, nelt, 1);
5357 sel.quick_grow (nelt);
5358 vec_perm_indices indices;
5359 for (j = 0; j < 3; j++)
5360 {
5361 int nelt0 = ((3 - j) * nelt) % 3;
5362 int nelt1 = ((3 - j) * nelt + 1) % 3;
5363 int nelt2 = ((3 - j) * nelt + 2) % 3;
5364
5365 for (i = 0; i < nelt; i++)
5366 {
5367 if (3 * i + nelt0 < nelt)
5368 sel[3 * i + nelt0] = j0++;
5369 if (3 * i + nelt1 < nelt)
5370 sel[3 * i + nelt1] = nelt + j1++;
5371 if (3 * i + nelt2 < nelt)
5372 sel[3 * i + nelt2] = 0;
5373 }
5374 indices.new_vector (sel, 2, nelt);
5375 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5376
5377 for (i = 0; i < nelt; i++)
5378 {
5379 if (3 * i + nelt0 < nelt)
5380 sel[3 * i + nelt0] = 3 * i + nelt0;
5381 if (3 * i + nelt1 < nelt)
5382 sel[3 * i + nelt1] = 3 * i + nelt1;
5383 if (3 * i + nelt2 < nelt)
5384 sel[3 * i + nelt2] = nelt + j2++;
5385 }
5386 indices.new_vector (sel, 2, nelt);
5387 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5388
5389 vect1 = dr_chain[0];
5390 vect2 = dr_chain[1];
5391
5392 /* Create interleaving stmt:
5393 low = VEC_PERM_EXPR <vect1, vect2,
5394 {j, nelt, *, j + 1, nelt + j + 1, *,
5395 j + 2, nelt + j + 2, *, ...}> */
5396 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5397 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5398 vect2, perm3_mask_low);
5399 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5400
5401 vect1 = data_ref;
5402 vect2 = dr_chain[2];
5403 /* Create interleaving stmt:
5404 low = VEC_PERM_EXPR <vect1, vect2,
5405 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5406 6, 7, nelt + j + 2, ...}> */
5407 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5408 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5409 vect2, perm3_mask_high);
5410 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5411 (*result_chain)[j] = data_ref;
5412 }
5413 }
5414 else
5415 {
5416 /* If length is not equal to 3 then only power of 2 is supported. */
5417 gcc_assert (pow2p_hwi (length));
5418
5419 /* The encoding has 2 interleaved stepped patterns. */
5420 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5421 vec_perm_builder sel (nelt, 2, 3);
5422 sel.quick_grow (6);
5423 for (i = 0; i < 3; i++)
5424 {
5425 sel[i * 2] = i;
5426 sel[i * 2 + 1] = i + nelt;
5427 }
5428 vec_perm_indices indices (sel, 2, nelt);
5429 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5430
5431 for (i = 0; i < 6; i++)
5432 sel[i] += exact_div (nelt, 2);
5433 indices.new_vector (sel, 2, nelt);
5434 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5435
5436 for (i = 0, n = log_length; i < n; i++)
5437 {
5438 for (j = 0; j < length/2; j++)
5439 {
5440 vect1 = dr_chain[j];
5441 vect2 = dr_chain[j+length/2];
5442
5443 /* Create interleaving stmt:
5444 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5445 ...}> */
5446 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5447 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5448 vect2, perm_mask_high);
5449 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5450 (*result_chain)[2*j] = high;
5451
5452 /* Create interleaving stmt:
5453 low = VEC_PERM_EXPR <vect1, vect2,
5454 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5455 ...}> */
5456 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5457 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5458 vect2, perm_mask_low);
5459 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5460 (*result_chain)[2*j+1] = low;
5461 }
5462 memcpy (dr_chain.address (), result_chain->address (),
5463 length * sizeof (tree));
5464 }
5465 }
5466 }
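
/* A minimal standalone sketch of the interleaving stages above, operating on
   plain int arrays of 8 elements; the example_* name is illustrative only.
   With LENGTH == 4 and rows initialised to 0..31 it reproduces the store
   order shown in the function comment (0 8 16 24 1 9 17 25, ...).  */

static void
example_interleave_stages (unsigned int length, int chain[][8], int result[][8])
{
  const unsigned int nelt = 8;

  /* One iteration per stage, log2 (LENGTH) stages in total.  */
  for (unsigned int len = length; len > 1; len /= 2)
    {
      for (unsigned int j = 0; j < length / 2; j++)
	for (unsigned int i = 0; i < nelt / 2; i++)
	  {
	    /* interleave_high of chain[j] and chain[j + length/2].  */
	    result[2 * j][2 * i] = chain[j][i];
	    result[2 * j][2 * i + 1] = chain[j + length / 2][i];
	    /* interleave_low of the same pair.  */
	    result[2 * j + 1][2 * i] = chain[j][nelt / 2 + i];
	    result[2 * j + 1][2 * i + 1] = chain[j + length / 2][nelt / 2 + i];
	  }
      /* The stage output becomes the next stage's input.  */
      for (unsigned int j = 0; j < length; j++)
	for (unsigned int i = 0; i < nelt; i++)
	  chain[j][i] = result[j][i];
    }
}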
5467
5468 /* Function vect_setup_realignment
5469
5470 This function is called when vectorizing an unaligned load using
5471 the dr_explicit_realign[_optimized] scheme.
5472 This function generates the following code at the loop prolog:
5473
5474 p = initial_addr;
5475 x msq_init = *(floor(p)); # prolog load
5476 realignment_token = call target_builtin;
5477 loop:
5478 x msq = phi (msq_init, ---)
5479
5480 The stmts marked with x are generated only for the case of
5481 dr_explicit_realign_optimized.
5482
5483 The code above sets up a new (vector) pointer, pointing to the first
5484 location accessed by STMT_INFO, and a "floor-aligned" load using that
5485 pointer. It also generates code to compute the "realignment-token"
5486 (if the relevant target hook was defined), and creates a phi-node at the
5487 loop-header bb whose arguments are the result of the prolog-load (created
5488 by this function) and the result of a load that takes place in the loop
5489 (to be created by the caller to this function).
5490
5491 For the case of dr_explicit_realign_optimized:
5492 The caller to this function uses the phi-result (msq) to create the
5493 realignment code inside the loop, and sets up the missing phi argument,
5494 as follows:
5495 loop:
5496 msq = phi (msq_init, lsq)
5497 lsq = *(floor(p')); # load in loop
5498 result = realign_load (msq, lsq, realignment_token);
5499
5500 For the case of dr_explicit_realign:
5501 loop:
5502 msq = *(floor(p)); # load in loop
5503 p' = p + (VS-1);
5504 lsq = *(floor(p')); # load in loop
5505 result = realign_load (msq, lsq, realignment_token);
5506
5507 Input:
5508 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5509 a memory location that may be unaligned.
5510 GSI - place where new code is to be inserted.
5511 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5512 is used.
5513
5514 Output:
5515 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5516 target hook, if defined.
5517 Return value - the result of the loop-header phi node. */
5518
5519 tree
5520 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5521 gimple_stmt_iterator *gsi, tree *realignment_token,
5522 enum dr_alignment_support alignment_support_scheme,
5523 tree init_addr,
5524 class loop **at_loop)
5525 {
5526 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5527 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5528 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5529 struct data_reference *dr = dr_info->dr;
5530 class loop *loop = NULL;
5531 edge pe = NULL;
5532 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5533 tree vec_dest;
5534 gimple *inc;
5535 tree ptr;
5536 tree data_ref;
5537 basic_block new_bb;
5538 tree msq_init = NULL_TREE;
5539 tree new_temp;
5540 gphi *phi_stmt;
5541 tree msq = NULL_TREE;
5542 gimple_seq stmts = NULL;
5543 bool compute_in_loop = false;
5544 bool nested_in_vect_loop = false;
5545 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5546 class loop *loop_for_initial_load = NULL;
5547
5548 if (loop_vinfo)
5549 {
5550 loop = LOOP_VINFO_LOOP (loop_vinfo);
5551 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5552 }
5553
5554 gcc_assert (alignment_support_scheme == dr_explicit_realign
5555 || alignment_support_scheme == dr_explicit_realign_optimized);
5556
5557 /* We need to generate three things:
5558 1. the misalignment computation
5559 2. the extra vector load (for the optimized realignment scheme).
5560 3. the phi node for the two vectors from which the realignment is
5561 done (for the optimized realignment scheme). */
5562
5563 /* 1. Determine where to generate the misalignment computation.
5564
5565 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5566 calculation will be generated by this function, outside the loop (in the
5567 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5568 caller, inside the loop.
5569
5570 Background: If the misalignment remains fixed throughout the iterations of
5571 the loop, then both realignment schemes are applicable, and also the
5572 misalignment computation can be done outside LOOP. This is because we are
5573 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5574 are a multiple of VS (the Vector Size), and therefore the misalignment in
5575 different vectorized LOOP iterations is always the same.
5576 The problem arises only if the memory access is in an inner-loop nested
5577 inside LOOP, which is now being vectorized using outer-loop vectorization.
5578 This is the only case when the misalignment of the memory access may not
5579 remain fixed throughout the iterations of the inner-loop (as explained in
5580 detail in vect_supportable_dr_alignment). In this case, not only is the
5581 optimized realignment scheme not applicable, but also the misalignment
5582 computation (and generation of the realignment token that is passed to
5583 REALIGN_LOAD) has to be done inside the loop.
5584
5585 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5586 or not, which in turn determines if the misalignment is computed inside
5587 the inner-loop, or outside LOOP. */
5588
5589 if (init_addr != NULL_TREE || !loop_vinfo)
5590 {
5591 compute_in_loop = true;
5592 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5593 }
5594
5595
5596 /* 2. Determine where to generate the extra vector load.
5597
5598 For the optimized realignment scheme, instead of generating two vector
5599 loads in each iteration, we generate a single extra vector load in the
5600 preheader of the loop, and in each iteration reuse the result of the
5601 vector load from the previous iteration. In case the memory access is in
5602 an inner-loop nested inside LOOP, which is now being vectorized using
5603 outer-loop vectorization, we need to determine whether this initial vector
5604 load should be generated at the preheader of the inner-loop, or can be
5605 generated at the preheader of LOOP. If the memory access has no evolution
5606 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5607 to be generated inside LOOP (in the preheader of the inner-loop). */
5608
5609 if (nested_in_vect_loop)
5610 {
5611 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5612 bool invariant_in_outerloop =
5613 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5614 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5615 }
5616 else
5617 loop_for_initial_load = loop;
5618 if (at_loop)
5619 *at_loop = loop_for_initial_load;
5620
5621 if (loop_for_initial_load)
5622 pe = loop_preheader_edge (loop_for_initial_load);
5623
5624 /* 3. For the case of the optimized realignment, create the first vector
5625 load at the loop preheader. */
5626
5627 if (alignment_support_scheme == dr_explicit_realign_optimized)
5628 {
5629 /* Create msq_init = *(floor(p1)) in the loop preheader */
5630 gassign *new_stmt;
5631
5632 gcc_assert (!compute_in_loop);
5633 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5634 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5635 loop_for_initial_load, NULL_TREE,
5636 &init_addr, NULL, &inc, true);
5637 if (TREE_CODE (ptr) == SSA_NAME)
5638 new_temp = copy_ssa_name (ptr);
5639 else
5640 new_temp = make_ssa_name (TREE_TYPE (ptr));
5641 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5642 tree type = TREE_TYPE (ptr);
5643 new_stmt = gimple_build_assign
5644 (new_temp, BIT_AND_EXPR, ptr,
5645 fold_build2 (MINUS_EXPR, type,
5646 build_int_cst (type, 0),
5647 build_int_cst (type, align)));
5648 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5649 gcc_assert (!new_bb);
5650 data_ref
5651 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5652 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5653 vect_copy_ref_info (data_ref, DR_REF (dr));
5654 new_stmt = gimple_build_assign (vec_dest, data_ref);
5655 new_temp = make_ssa_name (vec_dest, new_stmt);
5656 gimple_assign_set_lhs (new_stmt, new_temp);
5657 if (pe)
5658 {
5659 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5660 gcc_assert (!new_bb);
5661 }
5662 else
5663 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5664
5665 msq_init = gimple_assign_lhs (new_stmt);
5666 }
5667
5668 /* 4. Create realignment token using a target builtin, if available.
5669 It is done either inside the containing loop, or before LOOP (as
5670 determined above). */
5671
5672 if (targetm.vectorize.builtin_mask_for_load)
5673 {
5674 gcall *new_stmt;
5675 tree builtin_decl;
5676
5677 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5678 if (!init_addr)
5679 {
5680 /* Generate the INIT_ADDR computation outside LOOP. */
5681 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5682 stmt_info, &stmts,
5683 NULL_TREE);
5684 if (loop)
5685 {
5686 pe = loop_preheader_edge (loop);
5687 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5688 gcc_assert (!new_bb);
5689 }
5690 else
5691 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5692 }
5693
5694 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5695 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5696 vec_dest =
5697 vect_create_destination_var (scalar_dest,
5698 gimple_call_return_type (new_stmt));
5699 new_temp = make_ssa_name (vec_dest, new_stmt);
5700 gimple_call_set_lhs (new_stmt, new_temp);
5701
5702 if (compute_in_loop)
5703 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5704 else
5705 {
5706 /* Generate the misalignment computation outside LOOP. */
5707 pe = loop_preheader_edge (loop);
5708 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5709 gcc_assert (!new_bb);
5710 }
5711
5712 *realignment_token = gimple_call_lhs (new_stmt);
5713
5714 /* The result of the CALL_EXPR to this builtin is determined from
5715 the value of the parameter and no global variables are touched
5716 which makes the builtin a "const" function. Requiring the
5717 builtin to have the "const" attribute makes it unnecessary
5718 to call mark_call_clobbered. */
5719 gcc_assert (TREE_READONLY (builtin_decl));
5720 }
5721
5722 if (alignment_support_scheme == dr_explicit_realign)
5723 return msq;
5724
5725 gcc_assert (!compute_in_loop);
5726 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5727
5728
5729 /* 5. Create msq = phi <msq_init, lsq> in loop */
5730
5731 pe = loop_preheader_edge (containing_loop);
5732 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5733 msq = make_ssa_name (vec_dest);
5734 phi_stmt = create_phi_node (msq, containing_loop->header);
5735 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5736
5737 return msq;
5738 }
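
/* A minimal scalar model of the REALIGN_LOAD combination that the code above
   sets up, not the target's actual instruction; four-element vectors and the
   example_* name are assumptions made for the illustration.  MSQ and LSQ are
   the two aligned loads, OFF is the element misalignment encoded by the
   realignment token.  */

static void
example_realign_load (const int msq[4], const int lsq[4], unsigned int off,
		      int out[4])
{
  for (unsigned int i = 0; i < 4; i++)
    out[i] = (off + i < 4) ? msq[off + i] : lsq[off + i - 4];
}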
5739
5740
5741 /* Function vect_grouped_load_supported.
5742
5743 COUNT is the size of the load group (the number of statements plus the
5744 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5745 only one statement, with a gap of COUNT - 1.
5746
5747 Returns true if a suitable permute exists. */
5748
5749 bool
5750 vect_grouped_load_supported (tree vectype, bool single_element_p,
5751 unsigned HOST_WIDE_INT count)
5752 {
5753 machine_mode mode = TYPE_MODE (vectype);
5754
5755 /* If this is single-element interleaving with an element distance
5756 that leaves unused vector loads around, punt - at best we create
5757 very sub-optimal code in that case (and blow up memory,
5758 see PR65518). */
5759 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5760 {
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5763 "single-element interleaving not supported "
5764 "for not adjacent vector loads\n");
5765 return false;
5766 }
5767
5768 /* vect_permute_load_chain requires the group size to be equal to 3 or
5769 be a power of two. */
5770 if (count != 3 && exact_log2 (count) == -1)
5771 {
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5774 "the size of the group of accesses"
5775 " is not a power of 2 or not equal to 3\n");
5776 return false;
5777 }
5778
5779 /* Check that the permutation is supported. */
5780 if (VECTOR_MODE_P (mode))
5781 {
5782 unsigned int i, j;
5783 if (count == 3)
5784 {
5785 unsigned int nelt;
5786 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5787 {
5788 if (dump_enabled_p ())
5789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5790 "cannot handle groups of 3 loads for"
5791 " variable-length vectors\n");
5792 return false;
5793 }
5794
5795 vec_perm_builder sel (nelt, nelt, 1);
5796 sel.quick_grow (nelt);
5797 vec_perm_indices indices;
5798 unsigned int k;
5799 for (k = 0; k < 3; k++)
5800 {
5801 for (i = 0; i < nelt; i++)
5802 if (3 * i + k < 2 * nelt)
5803 sel[i] = 3 * i + k;
5804 else
5805 sel[i] = 0;
5806 indices.new_vector (sel, 2, nelt);
5807 if (!can_vec_perm_const_p (mode, indices))
5808 {
5809 if (dump_enabled_p ())
5810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5811 "shuffle of 3 loads is not supported by"
5812 " target\n");
5813 return false;
5814 }
5815 for (i = 0, j = 0; i < nelt; i++)
5816 if (3 * i + k < 2 * nelt)
5817 sel[i] = i;
5818 else
5819 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5820 indices.new_vector (sel, 2, nelt);
5821 if (!can_vec_perm_const_p (mode, indices))
5822 {
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5825 "shuffle of 3 loads is not supported by"
5826 " target\n");
5827 return false;
5828 }
5829 }
5830 return true;
5831 }
5832 else
5833 {
5834 /* If length is not equal to 3 then only power of 2 is supported. */
5835 gcc_assert (pow2p_hwi (count));
5836 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5837
5838 /* The encoding has a single stepped pattern. */
5839 vec_perm_builder sel (nelt, 1, 3);
5840 sel.quick_grow (3);
5841 for (i = 0; i < 3; i++)
5842 sel[i] = i * 2;
5843 vec_perm_indices indices (sel, 2, nelt);
5844 if (can_vec_perm_const_p (mode, indices))
5845 {
5846 for (i = 0; i < 3; i++)
5847 sel[i] = i * 2 + 1;
5848 indices.new_vector (sel, 2, nelt);
5849 if (can_vec_perm_const_p (mode, indices))
5850 return true;
5851 }
5852 }
5853 }
5854
5855 if (dump_enabled_p ())
5856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5857 "extract even/odd not supported by target\n");
5858 return false;
5859 }
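
/* A minimal standalone sketch of the two selectors tested above for the
   COUNT == 3 case, expanded for a concrete NELT and K; the example_* name is
   illustrative only.  The first selector gathers every third element from the
   first two vectors, the second merges the remaining elements of the third
   vector into the unused slots.  */

static void
example_load3_selectors (unsigned int nelt, unsigned int k,
			 unsigned int *sel1, unsigned int *sel2)
{
  unsigned int i, j;

  for (i = 0; i < nelt; i++)
    sel1[i] = (3 * i + k < 2 * nelt) ? 3 * i + k : 0;
  for (i = 0, j = 0; i < nelt; i++)
    sel2[i] = (3 * i + k < 2 * nelt) ? i : nelt + ((nelt + k) % 3) + 3 * (j++);
}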
5860
5861 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5862 type VECTYPE. MASKED_P says whether the masked form is needed. */
5863
5864 bool
5865 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5866 bool masked_p)
5867 {
5868 if (masked_p)
5869 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5870 vec_mask_load_lanes_optab,
5871 vectype, count);
5872 else
5873 return vect_lanes_optab_supported_p ("vec_load_lanes",
5874 vec_load_lanes_optab,
5875 vectype, count);
5876 }
5877
5878 /* Function vect_permute_load_chain.
5879
5880 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5881 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5882 the input data correctly. Return the final references for loads in
5883 RESULT_CHAIN.
5884
5885 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5886 The input is 4 vectors each containing 8 elements. We assign a number to each
5887 element, the input sequence is:
5888
5889 1st vec: 0 1 2 3 4 5 6 7
5890 2nd vec: 8 9 10 11 12 13 14 15
5891 3rd vec: 16 17 18 19 20 21 22 23
5892 4th vec: 24 25 26 27 28 29 30 31
5893
5894 The output sequence should be:
5895
5896 1st vec: 0 4 8 12 16 20 24 28
5897 2nd vec: 1 5 9 13 17 21 25 29
5898 3rd vec: 2 6 10 14 18 22 26 30
5899 4th vec: 3 7 11 15 19 23 27 31
5900
5901 i.e., the first output vector should contain the first elements of each
5902 interleaving group, etc.
5903
5904 We use extract_even/odd instructions to create such output. The input of
5905 each extract_even/odd operation is two vectors
5906 1st vec 2nd vec
5907 0 1 2 3 4 5 6 7
5908
5909 and the output is the vector of extracted even/odd elements. The output of
5910 extract_even will be: 0 2 4 6
5911 and of extract_odd: 1 3 5 7
5912
5913
5914 The permutation is done in log LENGTH stages. In each stage extract_even
5915 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5916 their order. In our example,
5917
5918 E1: extract_even (1st vec, 2nd vec)
5919 E2: extract_odd (1st vec, 2nd vec)
5920 E3: extract_even (3rd vec, 4th vec)
5921 E4: extract_odd (3rd vec, 4th vec)
5922
5923 The output for the first stage will be:
5924
5925 E1: 0 2 4 6 8 10 12 14
5926 E2: 1 3 5 7 9 11 13 15
5927 E3: 16 18 20 22 24 26 28 30
5928 E4: 17 19 21 23 25 27 29 31
5929
5930 In order to proceed and create the correct sequence for the next stage (or
5931 for the correct output, if the second stage is the last one, as in our
5932 example), we first put the output of extract_even operation and then the
5933 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5934 The input for the second stage is:
5935
5936 1st vec (E1): 0 2 4 6 8 10 12 14
5937 2nd vec (E3): 16 18 20 22 24 26 28 30
5938 3rd vec (E2): 1 3 5 7 9 11 13 15
5939 4th vec (E4): 17 19 21 23 25 27 29 31
5940
5941 The output of the second stage:
5942
5943 E1: 0 4 8 12 16 20 24 28
5944 E2: 2 6 10 14 18 22 26 30
5945 E3: 1 5 9 13 17 21 25 29
5946 E4: 3 7 11 15 19 23 27 31
5947
5948 And RESULT_CHAIN after reordering:
5949
5950 1st vec (E1): 0 4 8 12 16 20 24 28
5951 2nd vec (E3): 1 5 9 13 17 21 25 29
5952 3rd vec (E2): 2 6 10 14 18 22 26 30
5953 4th vec (E4): 3 7 11 15 19 23 27 31. */
5954
5955 static void
5956 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
5957 unsigned int length,
5958 stmt_vec_info stmt_info,
5959 gimple_stmt_iterator *gsi,
5960 vec<tree> *result_chain)
5961 {
5962 tree data_ref, first_vect, second_vect;
5963 tree perm_mask_even, perm_mask_odd;
5964 tree perm3_mask_low, perm3_mask_high;
5965 gimple *perm_stmt;
5966 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5967 unsigned int i, j, log_length = exact_log2 (length);
5968
5969 result_chain->quick_grow (length);
5970 memcpy (result_chain->address (), dr_chain.address (),
5971 length * sizeof (tree));
5972
5973 if (length == 3)
5974 {
5975 /* vect_grouped_load_supported ensures that this is constant. */
5976 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5977 unsigned int k;
5978
5979 vec_perm_builder sel (nelt, nelt, 1);
5980 sel.quick_grow (nelt);
5981 vec_perm_indices indices;
5982 for (k = 0; k < 3; k++)
5983 {
5984 for (i = 0; i < nelt; i++)
5985 if (3 * i + k < 2 * nelt)
5986 sel[i] = 3 * i + k;
5987 else
5988 sel[i] = 0;
5989 indices.new_vector (sel, 2, nelt);
5990 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5991
5992 for (i = 0, j = 0; i < nelt; i++)
5993 if (3 * i + k < 2 * nelt)
5994 sel[i] = i;
5995 else
5996 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5997 indices.new_vector (sel, 2, nelt);
5998 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5999
6000 first_vect = dr_chain[0];
6001 second_vect = dr_chain[1];
6002
6003 /* Create interleaving stmt (low part of):
6004 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6005 ...}> */
6006 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6007 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6008 second_vect, perm3_mask_low);
6009 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6010
6011 /* Create interleaving stmt (high part of):
6012 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6013 ...}> */
6014 first_vect = data_ref;
6015 second_vect = dr_chain[2];
6016 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6017 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6018 second_vect, perm3_mask_high);
6019 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6020 (*result_chain)[k] = data_ref;
6021 }
6022 }
6023 else
6024 {
6025 /* If length is not equal to 3 then only power of 2 is supported. */
6026 gcc_assert (pow2p_hwi (length));
6027
6028 /* The encoding has a single stepped pattern. */
6029 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6030 vec_perm_builder sel (nelt, 1, 3);
6031 sel.quick_grow (3);
6032 for (i = 0; i < 3; ++i)
6033 sel[i] = i * 2;
6034 vec_perm_indices indices (sel, 2, nelt);
6035 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6036
6037 for (i = 0; i < 3; ++i)
6038 sel[i] = i * 2 + 1;
6039 indices.new_vector (sel, 2, nelt);
6040 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6041
6042 for (i = 0; i < log_length; i++)
6043 {
6044 for (j = 0; j < length; j += 2)
6045 {
6046 first_vect = dr_chain[j];
6047 second_vect = dr_chain[j+1];
6048
6049 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6050 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6051 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6052 first_vect, second_vect,
6053 perm_mask_even);
6054 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6055 (*result_chain)[j/2] = data_ref;
6056
6057 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6058 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6059 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6060 first_vect, second_vect,
6061 perm_mask_odd);
6062 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6063 (*result_chain)[j/2+length/2] = data_ref;
6064 }
6065 memcpy (dr_chain.address (), result_chain->address (),
6066 length * sizeof (tree));
6067 }
6068 }
6069 }
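
/* A minimal standalone sketch of one extract_even/extract_odd stage above,
   for plain arrays of 8 ints; the example_* name is illustrative only.
   Applying it log2 (LENGTH) times, with the chain reordering described in the
   function comment, yields the strided output shown there.  */

static void
example_extract_even_odd (const int v1[8], const int v2[8],
			  int even[8], int odd[8])
{
  for (unsigned int i = 0; i < 4; i++)
    {
      even[i] = v1[2 * i];		/* even elements of v1 ++ v2 */
      even[4 + i] = v2[2 * i];
      odd[i] = v1[2 * i + 1];		/* odd elements of v1 ++ v2 */
      odd[4 + i] = v2[2 * i + 1];
    }
}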
6070
6071 /* Function vect_shift_permute_load_chain.
6072
6073 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6074 sequence of stmts to reorder the input data accordingly.
6075 Return the final references for loads in RESULT_CHAIN.
6076 Return true if successful, false otherwise.
6077
6078 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6079 The input is 3 vectors each containing 8 elements. We assign a
6080 number to each element, the input sequence is:
6081
6082 1st vec: 0 1 2 3 4 5 6 7
6083 2nd vec: 8 9 10 11 12 13 14 15
6084 3rd vec: 16 17 18 19 20 21 22 23
6085
6086 The output sequence should be:
6087
6088 1st vec: 0 3 6 9 12 15 18 21
6089 2nd vec: 1 4 7 10 13 16 19 22
6090 3rd vec: 2 5 8 11 14 17 20 23
6091
6092 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6093
6094 First we shuffle all 3 vectors to get the correct element order:
6095
6096 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6097 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6098 3rd vec: (16 19 22) (17 20 23) (18 21)
6099
6100 Next we concatenate and shift the vectors 3 times:
6101
6102 1st step:
6103 shift right by 6 the concatenation of:
6104 "1st vec" and "2nd vec"
6105 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6106 "2nd vec" and "3rd vec"
6107 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6108 "3rd vec" and "1st vec"
6109 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6110 | New vectors |
6111
6112 So that now new vectors are:
6113
6114 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6115 2nd vec: (10 13) (16 19 22) (17 20 23)
6116 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6117
6118 2nd step:
6119 shift right by 5 the concatenation of:
6120 "1st vec" and "3rd vec"
6121 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6122 "2nd vec" and "1st vec"
6123 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6124 "3rd vec" and "2nd vec"
6125 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6126 | New vectors |
6127
6128 So that now new vectors are:
6129
6130 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6131 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6132 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6133
6134 3rd step:
6135 shift right by 5 the concatenation of:
6136 "1st vec" and "1st vec"
6137 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6138 shift right by 3 the concatenation of:
6139 "2nd vec" and "2nd vec"
6140 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6141 | New vectors |
6142
6143 So that now all vectors are READY:
6144 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6145 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6146 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6147
6148 This algorithm is faster than the one in vect_permute_load_chain if:
6149 1. "shift of a concatenation" is faster than general permutation.
6150 This is usually so.
6151 2. The TARGET machine can't execute vector instructions in parallel.
6152 This is because each step of the algorithm depends on the previous one.
6153 The algorithm in vect_permute_load_chain is much more parallel.
6154
6155 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6156 */
6157
6158 static bool
6159 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6160 unsigned int length,
6161 stmt_vec_info stmt_info,
6162 gimple_stmt_iterator *gsi,
6163 vec<tree> *result_chain)
6164 {
6165 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6166 tree perm2_mask1, perm2_mask2, perm3_mask;
6167 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6168 gimple *perm_stmt;
6169
6170 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6171 unsigned int i;
6172 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6173
6174 unsigned HOST_WIDE_INT nelt, vf;
6175 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6176 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6177 /* Not supported for variable-length vectors. */
6178 return false;
6179
6180 vec_perm_builder sel (nelt, nelt, 1);
6181 sel.quick_grow (nelt);
6182
6183 result_chain->quick_grow (length);
6184 memcpy (result_chain->address (), dr_chain.address (),
6185 length * sizeof (tree));
6186
6187 if (pow2p_hwi (length) && vf > 4)
6188 {
6189 unsigned int j, log_length = exact_log2 (length);
6190 for (i = 0; i < nelt / 2; ++i)
6191 sel[i] = i * 2;
6192 for (i = 0; i < nelt / 2; ++i)
6193 sel[nelt / 2 + i] = i * 2 + 1;
6194 vec_perm_indices indices (sel, 2, nelt);
6195 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6196 {
6197 if (dump_enabled_p ())
6198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6199 "shuffle of 2 fields structure is not \
6200 supported by target\n");
6201 return false;
6202 }
6203 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6204
6205 for (i = 0; i < nelt / 2; ++i)
6206 sel[i] = i * 2 + 1;
6207 for (i = 0; i < nelt / 2; ++i)
6208 sel[nelt / 2 + i] = i * 2;
6209 indices.new_vector (sel, 2, nelt);
6210 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6211 {
6212 if (dump_enabled_p ())
6213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6214 "shuffle of 2 fields structure is not \
6215 supported by target\n");
6216 return false;
6217 }
6218 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6219
6220 /* Generating permutation constant to shift all elements.
6221 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6222 for (i = 0; i < nelt; i++)
6223 sel[i] = nelt / 2 + i;
6224 indices.new_vector (sel, 2, nelt);
6225 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6226 {
6227 if (dump_enabled_p ())
6228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6229 "shift permutation is not supported by target\n");
6230 return false;
6231 }
6232 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6233
6234 /* Generating permutation constant to select vector from 2.
6235 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6236 for (i = 0; i < nelt / 2; i++)
6237 sel[i] = i;
6238 for (i = nelt / 2; i < nelt; i++)
6239 sel[i] = nelt + i;
6240 indices.new_vector (sel, 2, nelt);
6241 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6242 {
6243 if (dump_enabled_p ())
6244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6245 "select is not supported by target\n");
6246 return false;
6247 }
6248 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6249
6250 for (i = 0; i < log_length; i++)
6251 {
6252 for (j = 0; j < length; j += 2)
6253 {
6254 first_vect = dr_chain[j];
6255 second_vect = dr_chain[j + 1];
6256
6257 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6258 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6259 first_vect, first_vect,
6260 perm2_mask1);
6261 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6262 vect[0] = data_ref;
6263
6264 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6265 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6266 second_vect, second_vect,
6267 perm2_mask2);
6268 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6269 vect[1] = data_ref;
6270
6271 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6272 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6273 vect[0], vect[1], shift1_mask);
6274 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6275 (*result_chain)[j/2 + length/2] = data_ref;
6276
6277 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6278 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6279 vect[0], vect[1], select_mask);
6280 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6281 (*result_chain)[j/2] = data_ref;
6282 }
6283 memcpy (dr_chain.address (), result_chain->address (),
6284 length * sizeof (tree));
6285 }
6286 return true;
6287 }
6288 if (length == 3 && vf > 2)
6289 {
6290 unsigned int k = 0, l = 0;
6291
6292 /* Generating permutation constant to get all elements in the right order.
6293 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6294 for (i = 0; i < nelt; i++)
6295 {
6296 if (3 * k + (l % 3) >= nelt)
6297 {
6298 k = 0;
6299 l += (3 - (nelt % 3));
6300 }
6301 sel[i] = 3 * k + (l % 3);
6302 k++;
6303 }
6304 vec_perm_indices indices (sel, 2, nelt);
6305 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6306 {
6307 if (dump_enabled_p ())
6308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6309 "shuffle of 3 fields structure is not \
6310 supported by target\n");
6311 return false;
6312 }
6313 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6314
6315 /* Generating permutation constant to shift all elements.
6316 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6317 for (i = 0; i < nelt; i++)
6318 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6319 indices.new_vector (sel, 2, nelt);
6320 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6321 {
6322 if (dump_enabled_p ())
6323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6324 "shift permutation is not supported by target\n");
6325 return false;
6326 }
6327 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6328
6329 /* Generating permutation constant to shift all elements.
6330 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6331 for (i = 0; i < nelt; i++)
6332 sel[i] = 2 * (nelt / 3) + 1 + i;
6333 indices.new_vector (sel, 2, nelt);
6334 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6335 {
6336 if (dump_enabled_p ())
6337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6338 "shift permutation is not supported by target\n");
6339 return false;
6340 }
6341 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6342
6343 /* Generating permutation constant to shift all elements.
6344 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6345 for (i = 0; i < nelt; i++)
6346 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6347 indices.new_vector (sel, 2, nelt);
6348 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6349 {
6350 if (dump_enabled_p ())
6351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6352 "shift permutation is not supported by target\n");
6353 return false;
6354 }
6355 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6356
6357 /* Generating permutation constant to shift all elements.
6358 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6359 for (i = 0; i < nelt; i++)
6360 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6361 indices.new_vector (sel, 2, nelt);
6362 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6363 {
6364 if (dump_enabled_p ())
6365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6366 "shift permutation is not supported by target\n");
6367 return false;
6368 }
6369 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6370
6371 for (k = 0; k < 3; k++)
6372 {
6373 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6374 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6375 dr_chain[k], dr_chain[k],
6376 perm3_mask);
6377 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6378 vect[k] = data_ref;
6379 }
6380
6381 for (k = 0; k < 3; k++)
6382 {
6383 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6384 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6385 vect[k % 3], vect[(k + 1) % 3],
6386 shift1_mask);
6387 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6388 vect_shift[k] = data_ref;
6389 }
6390
6391 for (k = 0; k < 3; k++)
6392 {
6393 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6394 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6395 vect_shift[(4 - k) % 3],
6396 vect_shift[(3 - k) % 3],
6397 shift2_mask);
6398 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6399 vect[k] = data_ref;
6400 }
6401
6402 (*result_chain)[3 - (nelt % 3)] = vect[2];
6403
6404 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6405 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6406 vect[0], shift3_mask);
6407 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6408 (*result_chain)[nelt % 3] = data_ref;
6409
6410 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6411 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6412 vect[1], shift4_mask);
6413 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6414 (*result_chain)[0] = data_ref;
6415 return true;
6416 }
6417 return false;
6418 }
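
/* A minimal standalone sketch of the "shift of a concatenation" primitive the
   algorithm above is built from, on plain int arrays; the example_* name is
   illustrative only.  Selecting elements SHIFT .. SHIFT + NELT - 1 of V1 ++ V2
   corresponds to the {shift, shift + 1, ...} permutation masks generated
   above.  */

static void
example_shift_concat (const int *v1, const int *v2, unsigned int nelt,
		      unsigned int shift, int *out)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int idx = shift + i;
      out[i] = (idx < nelt) ? v1[idx] : v2[idx - nelt];
    }
}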
6419
6420 /* Function vect_transform_grouped_load.
6421
6422 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6423 to perform their permutation and assign the resulting vectorized statements to
6424 the scalar statements.
6425 */
6426
6427 void
6428 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6429 vec<tree> dr_chain,
6430 int size, gimple_stmt_iterator *gsi)
6431 {
6432 machine_mode mode;
6433 vec<tree> result_chain = vNULL;
6434
6435 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6436 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6437 vectors that are ready for vector computation. */
6438 result_chain.create (size);
6439
6440 /* If the reassociation width for the vector type is 2 or greater, the target
6441 machine can execute 2 or more vector instructions in parallel. Otherwise
6442 try to get the chain for the load group using vect_shift_permute_load_chain. */
6443 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6444 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6445 || pow2p_hwi (size)
6446 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6447 gsi, &result_chain))
6448 vect_permute_load_chain (vinfo, dr_chain,
6449 size, stmt_info, gsi, &result_chain);
6450 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6451 result_chain.release ();
6452 }
6453
6454 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6455 generated as part of the vectorization of STMT_INFO. Assign the statement
6456 for each vector to the associated scalar statement. */
6457
6458 void
6459 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6460 vec<tree> result_chain)
6461 {
6462 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6463 unsigned int i, gap_count;
6464 tree tmp_data_ref;
6465
6466 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6467 Since we scan the chain starting from its first node, their order
6468 corresponds to the order of data-refs in RESULT_CHAIN. */
6469 stmt_vec_info next_stmt_info = first_stmt_info;
6470 gap_count = 1;
6471 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6472 {
6473 if (!next_stmt_info)
6474 break;
6475
6476 /* Skip the gaps. Loads created for the gaps will be removed by dead
6477 code elimination pass later. No need to check for the first stmt in
6478 the group, since it always exists.
6479 DR_GROUP_GAP is the number of steps in elements from the previous
6480 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6481 correspond to the gaps. */
6482 if (next_stmt_info != first_stmt_info
6483 && gap_count < DR_GROUP_GAP (next_stmt_info))
6484 {
6485 gap_count++;
6486 continue;
6487 }
6488
6489 /* ??? The following needs cleanup after the removal of
6490 DR_GROUP_SAME_DR_STMT. */
6491 if (next_stmt_info)
6492 {
6493 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6494 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6495 copies, and we put the new vector statement last. */
6496 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6497
6498 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6499 gap_count = 1;
6500 }
6501 }
6502 }
6503
6504 /* Function vect_can_force_dr_alignment_p.
6505
6506 Returns whether the alignment of a DECL can be forced to be aligned
6507 on ALIGNMENT bit boundary. */
6508
6509 bool
6510 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6511 {
6512 if (!VAR_P (decl))
6513 return false;
6514
6515 if (decl_in_symtab_p (decl)
6516 && !symtab_node::get (decl)->can_increase_alignment_p ())
6517 return false;
6518
6519 if (TREE_STATIC (decl))
6520 return (known_le (alignment,
6521 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6522 else
6523 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6524 }
6525
6526
6527 /* Return whether the data reference DR_INFO is supported with respect to its
6528 alignment.
6529 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6530 if it is aligned, i.e., check if it is possible to vectorize it with different
6531 alignment. */
6532
6533 enum dr_alignment_support
6534 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6535 bool check_aligned_accesses)
6536 {
6537 data_reference *dr = dr_info->dr;
6538 stmt_vec_info stmt_info = dr_info->stmt;
6539 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6540 machine_mode mode = TYPE_MODE (vectype);
6541 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6542 class loop *vect_loop = NULL;
6543 bool nested_in_vect_loop = false;
6544
6545 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6546 return dr_aligned;
6547
6548 /* For now assume all conditional loads/stores support unaligned
6549 access without any special code. */
6550 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6551 if (gimple_call_internal_p (stmt)
6552 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6553 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6554 return dr_unaligned_supported;
6555
6556 if (loop_vinfo)
6557 {
6558 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6559 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6560 }
6561
6562 /* Possibly unaligned access. */
6563
6564 /* We can choose between using the implicit realignment scheme (generating
6565 a misaligned_move stmt) and the explicit realignment scheme (generating
6566 aligned loads with a REALIGN_LOAD). There are two variants to the
6567 explicit realignment scheme: optimized, and unoptimized.
6568 We can optimize the realignment only if the step between consecutive
6569 vector loads is equal to the vector size. Since the vector memory
6570 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6571 is guaranteed that the misalignment amount remains the same throughout the
6572 execution of the vectorized loop. Therefore, we can create the
6573 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6574 at the loop preheader.
6575
6576 However, in the case of outer-loop vectorization, when vectorizing a
6577 memory access in the inner-loop nested within the LOOP that is now being
6578 vectorized, while it is guaranteed that the misalignment of the
6579 vectorized memory access will remain the same in different outer-loop
6580 iterations, it is *not* guaranteed that it will remain the same throughout
6581 the execution of the inner-loop. This is because the inner-loop advances
6582 with the original scalar step (and not in steps of VS). If the inner-loop
6583 step happens to be a multiple of VS, then the misalignment remains fixed
6584 and we can use the optimized realignment scheme. For example:
6585
6586 for (i=0; i<N; i++)
6587 for (j=0; j<M; j++)
6588 s += a[i+j];
6589
6590 When vectorizing the i-loop in the above example, the step between
6591 consecutive vector loads is 1, and so the misalignment does not remain
6592 fixed across the execution of the inner-loop, and the realignment cannot
6593 be optimized (as illustrated in the following pseudo vectorized loop):
6594
6595 for (i=0; i<N; i+=4)
6596 for (j=0; j<M; j++){
6597 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6598 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6599 // (assuming that we start from an aligned address).
6600 }
6601
6602 We therefore have to use the unoptimized realignment scheme:
6603
6604 for (i=0; i<N; i+=4)
6605 for (j=k; j<M; j+=4)
6606 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6607 // that the misalignment of the initial address is
6608 // 0).
6609
6610 The loop can then be vectorized as follows:
6611
6612 for (k=0; k<4; k++){
6613 rt = get_realignment_token (&vp[k]);
6614 for (i=0; i<N; i+=4){
6615 v1 = vp[i+k];
6616 for (j=k; j<M; j+=4){
6617 v2 = vp[i+j+VS-1];
6618 va = REALIGN_LOAD <v1,v2,rt>;
6619 vs += va;
6620 v1 = v2;
6621 }
6622 }
6623 } */
6624
6625 if (DR_IS_READ (dr))
6626 {
6627 bool is_packed = false;
6628 tree type = (TREE_TYPE (DR_REF (dr)));
6629
6630 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6631 && (!targetm.vectorize.builtin_mask_for_load
6632 || targetm.vectorize.builtin_mask_for_load ()))
6633 {
6634 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6635
6636 /* If we are doing SLP then the accesses need not have the
6637 same alignment, instead it depends on the SLP group size. */
6638 if (loop_vinfo
6639 && STMT_SLP_TYPE (stmt_info)
6640 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6641 * (DR_GROUP_SIZE
6642 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6643 TYPE_VECTOR_SUBPARTS (vectype)))
6644 ;
6645 else if (!loop_vinfo
6646 || (nested_in_vect_loop
6647 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6648 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6649 return dr_explicit_realign;
6650 else
6651 return dr_explicit_realign_optimized;
6652 }
6653 if (!known_alignment_for_access_p (dr_info))
6654 is_packed = not_size_aligned (DR_REF (dr));
6655
6656 if (targetm.vectorize.support_vector_misalignment
6657 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6658 /* Can't software pipeline the loads, but can at least do them. */
6659 return dr_unaligned_supported;
6660 }
6661 else
6662 {
6663 bool is_packed = false;
6664 tree type = (TREE_TYPE (DR_REF (dr)));
6665
6666 if (!known_alignment_for_access_p (dr_info))
6667 is_packed = not_size_aligned (DR_REF (dr));
6668
6669 if (targetm.vectorize.support_vector_misalignment
6670 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6671 return dr_unaligned_supported;
6672 }
6673
6674 /* Unsupported. */
6675 return dr_unaligned_unsupported;
6676 }