1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
34 #include "optabs-tree.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 vect_lanes_optab_supported_p (const char *name
, convert_optab optab
,
62 tree vectype
, unsigned HOST_WIDE_INT count
)
64 machine_mode mode
, array_mode
;
67 mode
= TYPE_MODE (vectype
);
68 if (!targetm
.array_mode (mode
, count
).exists (&array_mode
))
70 poly_uint64 bits
= count
* GET_MODE_BITSIZE (mode
);
71 limit_p
= !targetm
.array_mode_supported_p (mode
, count
);
72 if (!int_mode_for_size (bits
, limit_p
).exists (&array_mode
))
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode
), count
);
82 if (convert_optab_handler (optab
, array_mode
, mode
) == CODE_FOR_nothing
)
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
86 "cannot use %s<%s><%s>\n", name
,
87 GET_MODE_NAME (array_mode
), GET_MODE_NAME (mode
));
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE
, vect_location
,
93 "can use %s<%s><%s>\n", name
, GET_MODE_NAME (array_mode
),
94 GET_MODE_NAME (mode
));
100 /* Return the smallest scalar part of STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info
,
119 HOST_WIDE_INT
*lhs_size_unit
,
120 HOST_WIDE_INT
*rhs_size_unit
)
122 tree scalar_type
= gimple_expr_type (stmt_info
->stmt
);
123 HOST_WIDE_INT lhs
, rhs
;
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type
)))
130 lhs
= rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
132 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
134 && (gimple_assign_cast_p (assign
)
135 || gimple_assign_rhs_code (assign
) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign
) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign
) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign
) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign
) == WIDEN_PLUS_EXPR
140 || gimple_assign_rhs_code (assign
) == WIDEN_MINUS_EXPR
141 || gimple_assign_rhs_code (assign
) == FLOAT_EXPR
))
143 tree rhs_type
= TREE_TYPE (gimple_assign_rhs1 (assign
));
145 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
147 scalar_type
= rhs_type
;
149 else if (gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
152 if (gimple_call_internal_p (call
))
154 internal_fn ifn
= gimple_call_internal_fn (call
);
155 if (internal_load_fn_p (ifn
) || internal_store_fn_p (ifn
))
156 /* gimple_expr_type already picked the type of the loaded
159 else if (internal_fn_mask_index (ifn
) == 0)
162 if (i
< gimple_call_num_args (call
))
164 tree rhs_type
= TREE_TYPE (gimple_call_arg (call
, i
));
165 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type
)))
167 rhs
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type
));
169 scalar_type
= rhs_type
;
174 *lhs_size_unit
= lhs
;
175 *rhs_size_unit
= rhs
;
180 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
181 tested at run-time. Return TRUE if DDR was successfully inserted.
182 Return false if versioning is not supported. */
185 vect_mark_for_runtime_alias_test (ddr_p ddr
, loop_vec_info loop_vinfo
)
187 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
189 if ((unsigned) param_vect_max_version_for_alias_checks
== 0)
190 return opt_result::failure_at (vect_location
,
191 "will not create alias checks, as"
192 " --param vect-max-version-for-alias-checks"
196 = runtime_alias_check_p (ddr
, loop
,
197 optimize_loop_nest_for_speed_p (loop
));
201 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
).safe_push (ddr
);
202 return opt_result::success ();
205 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
208 vect_check_nonzero_value (loop_vec_info loop_vinfo
, tree value
)
210 vec
<tree
> checks
= LOOP_VINFO_CHECK_NONZERO (loop_vinfo
);
211 for (unsigned int i
= 0; i
< checks
.length(); ++i
)
212 if (checks
[i
] == value
)
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE
, vect_location
,
217 "need run-time check that %T is nonzero\n",
219 LOOP_VINFO_CHECK_NONZERO (loop_vinfo
).safe_push (value
);
222 /* Return true if we know that the order of vectorized DR_INFO_A and
223 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
224 DR_INFO_B. At least one of the accesses is a write. */
227 vect_preserves_scalar_order_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
)
229 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
230 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
232 /* Single statements are always kept in their original order. */
233 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a
)
234 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b
))
237 /* STMT_A and STMT_B belong to overlapping groups. All loads are
238 emitted at the position of the first scalar load.
239 Stores in a group are emitted at the position of the last scalar store.
240 Compute that position and check whether the resulting order matches
242 stmt_vec_info il_a
= DR_GROUP_FIRST_ELEMENT (stmtinfo_a
);
245 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a
)))
246 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_a
); s
;
247 s
= DR_GROUP_NEXT_ELEMENT (s
))
248 il_a
= get_later_stmt (il_a
, s
);
249 else /* DR_IS_READ */
250 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_a
); s
;
251 s
= DR_GROUP_NEXT_ELEMENT (s
))
252 if (get_later_stmt (il_a
, s
) == il_a
)
257 stmt_vec_info il_b
= DR_GROUP_FIRST_ELEMENT (stmtinfo_b
);
260 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b
)))
261 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_b
); s
;
262 s
= DR_GROUP_NEXT_ELEMENT (s
))
263 il_b
= get_later_stmt (il_b
, s
);
264 else /* DR_IS_READ */
265 for (stmt_vec_info s
= DR_GROUP_NEXT_ELEMENT (il_b
); s
;
266 s
= DR_GROUP_NEXT_ELEMENT (s
))
267 if (get_later_stmt (il_b
, s
) == il_b
)
272 bool a_after_b
= (get_later_stmt (stmtinfo_a
, stmtinfo_b
) == stmtinfo_a
);
273 return (get_later_stmt (il_a
, il_b
) == il_a
) == a_after_b
;
276 /* A subroutine of vect_analyze_data_ref_dependence. Handle
277 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
278 distances. These distances are conservatively correct but they don't
279 reflect a guaranteed dependence.
281 Return true if this function does all the work necessary to avoid
282 an alias or false if the caller should use the dependence distances
283 to limit the vectorization factor in the usual way. LOOP_DEPTH is
284 the depth of the loop described by LOOP_VINFO and the other arguments
285 are as for vect_analyze_data_ref_dependence. */
288 vect_analyze_possibly_independent_ddr (data_dependence_relation
*ddr
,
289 loop_vec_info loop_vinfo
,
290 int loop_depth
, unsigned int *max_vf
)
292 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
293 lambda_vector dist_v
;
295 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
297 int dist
= dist_v
[loop_depth
];
298 if (dist
!= 0 && !(dist
> 0 && DDR_REVERSED_P (ddr
)))
300 /* If the user asserted safelen >= DIST consecutive iterations
301 can be executed concurrently, assume independence.
303 ??? An alternative would be to add the alias check even
304 in this case, and vectorize the fallback loop with the
305 maximum VF set to safelen. However, if the user has
306 explicitly given a length, it's less likely that that
308 if (loop
->safelen
>= 2 && abs_hwi (dist
) <= loop
->safelen
)
310 if ((unsigned int) loop
->safelen
< *max_vf
)
311 *max_vf
= loop
->safelen
;
312 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
316 /* For dependence distances of 2 or more, we have the option
317 of limiting VF or checking for an alias at runtime.
318 Prefer to check at runtime if we can, to avoid limiting
319 the VF unnecessarily when the bases are in fact independent.
321 Note that the alias checks will be removed if the VF ends up
322 being small enough. */
323 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
324 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
325 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a
->stmt
)
326 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b
->stmt
)
327 && vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
));
334 /* Function vect_analyze_data_ref_dependence.
336 FIXME: I needed to change the sense of the returned flag.
338 Return FALSE if there (might) exist a dependence between a memory-reference
339 DRA and a memory-reference DRB. When versioning for alias may check a
340 dependence at run-time, return TRUE. Adjust *MAX_VF according to
341 the data dependence. */
344 vect_analyze_data_ref_dependence (struct data_dependence_relation
*ddr
,
345 loop_vec_info loop_vinfo
,
346 unsigned int *max_vf
)
349 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
350 struct data_reference
*dra
= DDR_A (ddr
);
351 struct data_reference
*drb
= DDR_B (ddr
);
352 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (dra
);
353 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (drb
);
354 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
355 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
356 lambda_vector dist_v
;
357 unsigned int loop_depth
;
359 /* In loop analysis all data references should be vectorizable. */
360 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
361 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b
))
364 /* Independent data accesses. */
365 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
366 return opt_result::success ();
369 || (DR_IS_READ (dra
) && DR_IS_READ (drb
)))
370 return opt_result::success ();
372 /* We do not have to consider dependences between accesses that belong
373 to the same group, unless the stride could be smaller than the
375 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
376 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
)
377 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b
))
378 && !STMT_VINFO_STRIDED_P (stmtinfo_a
))
379 return opt_result::success ();
381 /* Even if we have an anti-dependence then, as the vectorized loop covers at
382 least two scalar iterations, there is always also a true dependence.
383 As the vectorizer does not re-order loads and stores we can ignore
384 the anti-dependence if TBAA can disambiguate both DRs similar to the
385 case with known negative distance anti-dependences (positive
386 distance anti-dependences would violate TBAA constraints). */
387 if (((DR_IS_READ (dra
) && DR_IS_WRITE (drb
))
388 || (DR_IS_WRITE (dra
) && DR_IS_READ (drb
)))
389 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra
)),
390 get_alias_set (DR_REF (drb
))))
391 return opt_result::success ();
393 /* Unknown data dependence. */
394 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
396 /* If user asserted safelen consecutive iterations can be
397 executed concurrently, assume independence. */
398 if (loop
->safelen
>= 2)
400 if ((unsigned int) loop
->safelen
< *max_vf
)
401 *max_vf
= loop
->safelen
;
402 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
403 return opt_result::success ();
406 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
407 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
408 return opt_result::failure_at
410 "versioning for alias not supported for: "
411 "can't determine dependence between %T and %T\n",
412 DR_REF (dra
), DR_REF (drb
));
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
416 "versioning for alias required: "
417 "can't determine dependence between %T and %T\n",
418 DR_REF (dra
), DR_REF (drb
));
420 /* Add to list of ddrs that need to be tested at run-time. */
421 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
424 /* Known data dependence. */
425 if (DDR_NUM_DIST_VECTS (ddr
) == 0)
427 /* If user asserted safelen consecutive iterations can be
428 executed concurrently, assume independence. */
429 if (loop
->safelen
>= 2)
431 if ((unsigned int) loop
->safelen
< *max_vf
)
432 *max_vf
= loop
->safelen
;
433 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = false;
434 return opt_result::success ();
437 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
)
438 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
439 return opt_result::failure_at
441 "versioning for alias not supported for: "
442 "bad dist vector for %T and %T\n",
443 DR_REF (dra
), DR_REF (drb
));
445 if (dump_enabled_p ())
446 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, stmtinfo_a
->stmt
,
447 "versioning for alias required: "
448 "bad dist vector for %T and %T\n",
449 DR_REF (dra
), DR_REF (drb
));
450 /* Add to list of ddrs that need to be tested at run-time. */
451 return vect_mark_for_runtime_alias_test (ddr
, loop_vinfo
);
454 loop_depth
= index_in_loop_nest (loop
->num
, DDR_LOOP_NEST (ddr
));
456 if (DDR_COULD_BE_INDEPENDENT_P (ddr
)
457 && vect_analyze_possibly_independent_ddr (ddr
, loop_vinfo
,
459 return opt_result::success ();
461 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
463 int dist
= dist_v
[loop_depth
];
465 if (dump_enabled_p ())
466 dump_printf_loc (MSG_NOTE
, vect_location
,
467 "dependence distance = %d.\n", dist
);
471 if (dump_enabled_p ())
472 dump_printf_loc (MSG_NOTE
, vect_location
,
473 "dependence distance == 0 between %T and %T\n",
474 DR_REF (dra
), DR_REF (drb
));
476 /* When we perform grouped accesses and perform implicit CSE
477 by detecting equal accesses and doing disambiguation with
478 runtime alias tests like for
486 where we will end up loading { a[i], a[i+1] } once, make
487 sure that inserting group loads before the first load and
488 stores after the last store will do the right thing.
489 Similar for groups like
493 where loads from the group interleave with the store. */
494 if (!vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
495 return opt_result::failure_at (stmtinfo_a
->stmt
,
496 "READ_WRITE dependence"
497 " in interleaving.\n");
499 if (loop
->safelen
< 2)
501 tree indicator
= dr_zero_step_indicator (dra
);
502 if (!indicator
|| integer_zerop (indicator
))
503 return opt_result::failure_at (stmtinfo_a
->stmt
,
504 "access also has a zero step\n");
505 else if (TREE_CODE (indicator
) != INTEGER_CST
)
506 vect_check_nonzero_value (loop_vinfo
, indicator
);
511 if (dist
> 0 && DDR_REVERSED_P (ddr
))
513 /* If DDR_REVERSED_P the order of the data-refs in DDR was
514 reversed (to make distance vector positive), and the actual
515 distance is negative. */
516 if (dump_enabled_p ())
517 dump_printf_loc (MSG_NOTE
, vect_location
,
518 "dependence distance negative.\n");
519 /* When doing outer loop vectorization, we need to check if there is
520 a backward dependence at the inner loop level if the dependence
521 at the outer loop is reversed. See PR81740. */
522 if (nested_in_vect_loop_p (loop
, stmtinfo_a
)
523 || nested_in_vect_loop_p (loop
, stmtinfo_b
))
525 unsigned inner_depth
= index_in_loop_nest (loop
->inner
->num
,
526 DDR_LOOP_NEST (ddr
));
527 if (dist_v
[inner_depth
] < 0)
528 return opt_result::failure_at (stmtinfo_a
->stmt
,
529 "not vectorized, dependence "
530 "between data-refs %T and %T\n",
531 DR_REF (dra
), DR_REF (drb
));
533 /* Record a negative dependence distance to later limit the
534 amount of stmt copying / unrolling we can perform.
535 Only need to handle read-after-write dependence. */
537 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) == 0
538 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) > (unsigned)dist
))
539 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b
) = dist
;
543 unsigned int abs_dist
= abs (dist
);
544 if (abs_dist
>= 2 && abs_dist
< *max_vf
)
546 /* The dependence distance requires reduction of the maximal
547 vectorization factor. */
549 if (dump_enabled_p ())
550 dump_printf_loc (MSG_NOTE
, vect_location
,
551 "adjusting maximal vectorization factor to %i\n",
555 if (abs_dist
>= *max_vf
)
557 /* Dependence distance does not create dependence, as far as
558 vectorization is concerned, in this case. */
559 if (dump_enabled_p ())
560 dump_printf_loc (MSG_NOTE
, vect_location
,
561 "dependence distance >= VF.\n");
565 return opt_result::failure_at (stmtinfo_a
->stmt
,
566 "not vectorized, possible dependence "
567 "between data-refs %T and %T\n",
568 DR_REF (dra
), DR_REF (drb
));
571 return opt_result::success ();
574 /* Function vect_analyze_data_ref_dependences.
576 Examine all the data references in the loop, and make sure there do not
577 exist any data dependences between them. Set *MAX_VF according to
578 the maximum vectorization factor the data dependences allow. */
581 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo
,
582 unsigned int *max_vf
)
585 struct data_dependence_relation
*ddr
;
587 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
589 if (!LOOP_VINFO_DDRS (loop_vinfo
).exists ())
591 LOOP_VINFO_DDRS (loop_vinfo
)
592 .create (LOOP_VINFO_DATAREFS (loop_vinfo
).length ()
593 * LOOP_VINFO_DATAREFS (loop_vinfo
).length ());
594 /* We do not need read-read dependences. */
595 bool res
= compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo
),
596 &LOOP_VINFO_DDRS (loop_vinfo
),
597 LOOP_VINFO_LOOP_NEST (loop_vinfo
),
602 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo
) = true;
604 /* For epilogues we either have no aliases or alias versioning
605 was applied to original loop. Therefore we may just get max_vf
606 using VF of original loop. */
607 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo
))
608 *max_vf
= LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo
);
610 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo
), i
, ddr
)
613 = vect_analyze_data_ref_dependence (ddr
, loop_vinfo
, max_vf
);
618 return opt_result::success ();
622 /* Function vect_slp_analyze_data_ref_dependence.
624 Return TRUE if there (might) exist a dependence between a memory-reference
625 DRA and a memory-reference DRB for VINFO. When versioning for alias
626 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
627 according to the data dependence. */
630 vect_slp_analyze_data_ref_dependence (vec_info
*vinfo
,
631 struct data_dependence_relation
*ddr
)
633 struct data_reference
*dra
= DDR_A (ddr
);
634 struct data_reference
*drb
= DDR_B (ddr
);
635 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
636 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
638 /* We need to check dependences of statements marked as unvectorizable
639 as well, they still can prohibit vectorization. */
641 /* Independent data accesses. */
642 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
648 /* Read-read is OK. */
649 if (DR_IS_READ (dra
) && DR_IS_READ (drb
))
652 /* If dra and drb are part of the same interleaving chain consider
654 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a
->stmt
)
655 && (DR_GROUP_FIRST_ELEMENT (dr_info_a
->stmt
)
656 == DR_GROUP_FIRST_ELEMENT (dr_info_b
->stmt
)))
659 /* Unknown data dependence. */
660 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
662 if (dump_enabled_p ())
663 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
664 "can't determine dependence between %T and %T\n",
665 DR_REF (dra
), DR_REF (drb
));
667 else if (dump_enabled_p ())
668 dump_printf_loc (MSG_NOTE
, vect_location
,
669 "determined dependence between %T and %T\n",
670 DR_REF (dra
), DR_REF (drb
));
676 /* Analyze dependences involved in the transform of SLP NODE. STORES
677 contain the vector of scalar stores of this instance if we are
678 disambiguating the loads. */
681 vect_slp_analyze_node_dependences (vec_info
*vinfo
, slp_tree node
,
682 vec
<stmt_vec_info
> stores
,
683 stmt_vec_info last_store_info
)
685 /* This walks over all stmts involved in the SLP load/store done
686 in NODE verifying we can sink them up to the last stmt in the
688 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
690 stmt_vec_info last_access_info
= vect_find_last_scalar_stmt_in_slp (node
);
691 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
693 stmt_vec_info access_info
694 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
695 if (access_info
== last_access_info
)
697 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
699 bool ref_initialized_p
= false;
700 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
701 gsi_stmt (gsi
) != last_access_info
->stmt
; gsi_next (&gsi
))
703 gimple
*stmt
= gsi_stmt (gsi
);
704 if (! gimple_vuse (stmt
))
707 /* If we couldn't record a (single) data reference for this
708 stmt we have to resort to the alias oracle. */
709 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
710 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
713 /* We are moving a store - this means
714 we cannot use TBAA for disambiguation. */
715 if (!ref_initialized_p
)
716 ao_ref_init (&ref
, DR_REF (dr_a
));
717 if (stmt_may_clobber_ref_p_1 (stmt
, &ref
, false)
718 || ref_maybe_used_by_stmt_p (stmt
, &ref
, false))
723 bool dependent
= false;
724 /* If we run into a store of this same instance (we've just
725 marked those) then delay dependence checking until we run
726 into the last store because this is where it will have
727 been sunk to (and we verify if we can do that as well). */
728 if (gimple_visited_p (stmt
))
730 if (stmt_info
!= last_store_info
)
733 stmt_vec_info store_info
;
734 FOR_EACH_VEC_ELT (stores
, i
, store_info
)
736 data_reference
*store_dr
737 = STMT_VINFO_DATA_REF (store_info
);
738 ddr_p ddr
= initialize_data_dependence_relation
739 (dr_a
, store_dr
, vNULL
);
741 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
742 free_dependence_relation (ddr
);
749 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
751 dependent
= vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
752 free_dependence_relation (ddr
);
759 else /* DR_IS_READ */
761 stmt_vec_info first_access_info
762 = vect_find_first_scalar_stmt_in_slp (node
);
763 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (node
).length (); ++k
)
765 stmt_vec_info access_info
766 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node
)[k
]);
767 if (access_info
== first_access_info
)
769 data_reference
*dr_a
= STMT_VINFO_DATA_REF (access_info
);
771 bool ref_initialized_p
= false;
772 for (gimple_stmt_iterator gsi
= gsi_for_stmt (access_info
->stmt
);
773 gsi_stmt (gsi
) != first_access_info
->stmt
; gsi_prev (&gsi
))
775 gimple
*stmt
= gsi_stmt (gsi
);
776 if (! gimple_vdef (stmt
))
779 /* If we couldn't record a (single) data reference for this
780 stmt we have to resort to the alias oracle. */
781 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (stmt
);
782 data_reference
*dr_b
= STMT_VINFO_DATA_REF (stmt_info
);
785 /* We are hoisting a load - this means we can use
786 TBAA for disambiguation. */
787 if (!ref_initialized_p
)
788 ao_ref_init (&ref
, DR_REF (dr_a
));
789 if (stmt_may_clobber_ref_p_1 (stmt
, &ref
, true))
794 bool dependent
= false;
795 /* If we run into a store of this same instance (we've just
796 marked those) then delay dependence checking until we run
797 into the last store because this is where it will have
798 been sunk to (and we verify if we can do that as well). */
799 if (gimple_visited_p (stmt
))
801 if (stmt_info
!= last_store_info
)
804 stmt_vec_info store_info
;
805 FOR_EACH_VEC_ELT (stores
, i
, store_info
)
807 data_reference
*store_dr
808 = STMT_VINFO_DATA_REF (store_info
);
809 ddr_p ddr
= initialize_data_dependence_relation
810 (dr_a
, store_dr
, vNULL
);
812 = vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
813 free_dependence_relation (ddr
);
820 ddr_p ddr
= initialize_data_dependence_relation (dr_a
,
822 dependent
= vect_slp_analyze_data_ref_dependence (vinfo
, ddr
);
823 free_dependence_relation (ddr
);
834 /* Function vect_analyze_data_ref_dependences.
836 Examine all the data references in the basic-block, and make sure there
837 do not exist any data dependences between them. Set *MAX_VF according to
838 the maximum vectorization factor the data dependences allow. */
841 vect_slp_analyze_instance_dependence (vec_info
*vinfo
, slp_instance instance
)
843 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
845 /* The stores of this instance are at the root of the SLP tree. */
846 slp_tree store
= SLP_INSTANCE_TREE (instance
);
847 if (! STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (store
)))
850 /* Verify we can sink stores to the vectorized stmt insert location. */
851 stmt_vec_info last_store_info
= NULL
;
854 if (! vect_slp_analyze_node_dependences (vinfo
, store
, vNULL
, NULL
))
857 /* Mark stores in this instance and remember the last one. */
858 last_store_info
= vect_find_last_scalar_stmt_in_slp (store
);
859 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
860 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, true);
865 /* Verify we can sink loads to the vectorized stmt insert location,
866 special-casing stores of this instance. */
869 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, load
)
870 if (! vect_slp_analyze_node_dependences (vinfo
, load
,
872 ? SLP_TREE_SCALAR_STMTS (store
)
873 : vNULL
, last_store_info
))
879 /* Unset the visited flag. */
881 for (unsigned k
= 0; k
< SLP_TREE_SCALAR_STMTS (store
).length (); ++k
)
882 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store
)[k
]->stmt
, false);
887 /* Record the base alignment guarantee given by DRB, which occurs
891 vect_record_base_alignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
892 innermost_loop_behavior
*drb
)
895 innermost_loop_behavior
*&entry
896 = vinfo
->base_alignments
.get_or_insert (drb
->base_address
, &existed
);
897 if (!existed
|| entry
->base_alignment
< drb
->base_alignment
)
900 if (dump_enabled_p ())
901 dump_printf_loc (MSG_NOTE
, vect_location
,
902 "recording new base alignment for %T\n"
904 " misalignment: %d\n"
908 drb
->base_misalignment
,
913 /* If the region we're going to vectorize is reached, all unconditional
914 data references occur at least once. We can therefore pool the base
915 alignment guarantees from each unconditional reference. Do this by
916 going through all the data references in VINFO and checking whether
917 the containing statement makes the reference unconditionally. If so,
918 record the alignment of the base address in VINFO so that it can be
919 used for all other references with the same base. */
922 vect_record_base_alignments (vec_info
*vinfo
)
924 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
925 class loop
*loop
= loop_vinfo
? LOOP_VINFO_LOOP (loop_vinfo
) : NULL
;
928 FOR_EACH_VEC_ELT (vinfo
->shared
->datarefs
, i
, dr
)
930 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr
);
931 stmt_vec_info stmt_info
= dr_info
->stmt
;
932 if (!DR_IS_CONDITIONAL_IN_STMT (dr
)
933 && STMT_VINFO_VECTORIZABLE (stmt_info
)
934 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
936 vect_record_base_alignment (vinfo
, stmt_info
, &DR_INNERMOST (dr
));
938 /* If DR is nested in the loop that is being vectorized, we can also
939 record the alignment of the base wrt the outer loop. */
940 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
941 vect_record_base_alignment
942 (vinfo
, stmt_info
, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
));
947 /* Return the target alignment for the vectorized form of DR_INFO. */
950 vect_calculate_target_alignment (dr_vec_info
*dr_info
)
952 tree vectype
= STMT_VINFO_VECTYPE (dr_info
->stmt
);
953 return targetm
.vectorize
.preferred_vector_alignment (vectype
);
956 /* Function vect_compute_data_ref_alignment
958 Compute the misalignment of the data reference DR_INFO.
961 1. DR_MISALIGNMENT (DR_INFO) is defined.
963 FOR NOW: No analysis is actually performed. Misalignment is calculated
964 only for trivial cases. TODO. */
967 vect_compute_data_ref_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
)
969 stmt_vec_info stmt_info
= dr_info
->stmt
;
970 vec_base_alignments
*base_alignments
= &vinfo
->base_alignments
;
971 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
972 class loop
*loop
= NULL
;
973 tree ref
= DR_REF (dr_info
->dr
);
974 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
976 if (dump_enabled_p ())
977 dump_printf_loc (MSG_NOTE
, vect_location
,
978 "vect_compute_data_ref_alignment:\n");
981 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
983 /* Initialize misalignment to unknown. */
984 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
986 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
989 innermost_loop_behavior
*drb
= vect_dr_behavior (vinfo
, dr_info
);
990 bool step_preserves_misalignment_p
;
992 poly_uint64 vector_alignment
993 = exact_div (vect_calculate_target_alignment (dr_info
), BITS_PER_UNIT
);
994 DR_TARGET_ALIGNMENT (dr_info
) = vector_alignment
;
996 /* If the main loop has peeled for alignment we have no way of knowing
997 whether the data accesses in the epilogues are aligned. We can't at
998 compile time answer the question whether we have entered the main loop or
999 not. Fixes PR 92351. */
1002 loop_vec_info orig_loop_vinfo
= LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
);
1004 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo
) != 0)
1008 unsigned HOST_WIDE_INT vect_align_c
;
1009 if (!vector_alignment
.is_constant (&vect_align_c
))
1012 /* No step for BB vectorization. */
1015 gcc_assert (integer_zerop (drb
->step
));
1016 step_preserves_misalignment_p
= true;
1019 /* In case the dataref is in an inner-loop of the loop that is being
1020 vectorized (LOOP), we use the base and misalignment information
1021 relative to the outer-loop (LOOP). This is ok only if the misalignment
1022 stays the same throughout the execution of the inner-loop, which is why
1023 we have to check that the stride of the dataref in the inner-loop evenly
1024 divides by the vector alignment. */
1025 else if (nested_in_vect_loop_p (loop
, stmt_info
))
1027 step_preserves_misalignment_p
1028 = (DR_STEP_ALIGNMENT (dr_info
->dr
) % vect_align_c
) == 0;
1030 if (dump_enabled_p ())
1032 if (step_preserves_misalignment_p
)
1033 dump_printf_loc (MSG_NOTE
, vect_location
,
1034 "inner step divides the vector alignment.\n");
1036 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1037 "inner step doesn't divide the vector"
1042 /* Similarly we can only use base and misalignment information relative to
1043 an innermost loop if the misalignment stays the same throughout the
1044 execution of the loop. As above, this is the case if the stride of
1045 the dataref evenly divides by the alignment. */
1048 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1049 step_preserves_misalignment_p
1050 = multiple_p (DR_STEP_ALIGNMENT (dr_info
->dr
) * vf
, vect_align_c
);
1052 if (!step_preserves_misalignment_p
&& dump_enabled_p ())
1053 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1054 "step doesn't divide the vector alignment.\n");
1057 unsigned int base_alignment
= drb
->base_alignment
;
1058 unsigned int base_misalignment
= drb
->base_misalignment
;
1060 /* Calculate the maximum of the pooled base address alignment and the
1061 alignment that we can compute for DR itself. */
1062 innermost_loop_behavior
**entry
= base_alignments
->get (drb
->base_address
);
1063 if (entry
&& base_alignment
< (*entry
)->base_alignment
)
1065 base_alignment
= (*entry
)->base_alignment
;
1066 base_misalignment
= (*entry
)->base_misalignment
;
1069 if (drb
->offset_alignment
< vect_align_c
1070 || !step_preserves_misalignment_p
1071 /* We need to know whether the step wrt the vectorized loop is
1072 negative when computing the starting misalignment below. */
1073 || TREE_CODE (drb
->step
) != INTEGER_CST
)
1075 if (dump_enabled_p ())
1076 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1077 "Unknown alignment for access: %T\n", ref
);
1081 if (base_alignment
< vect_align_c
)
1083 unsigned int max_alignment
;
1084 tree base
= get_base_for_alignment (drb
->base_address
, &max_alignment
);
1085 if (max_alignment
< vect_align_c
1086 || !vect_can_force_dr_alignment_p (base
,
1087 vect_align_c
* BITS_PER_UNIT
))
1089 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_NOTE
, vect_location
,
1091 "can't force alignment of ref: %T\n", ref
);
1095 /* Force the alignment of the decl.
1096 NOTE: This is the only change to the code we make during
1097 the analysis phase, before deciding to vectorize the loop. */
1098 if (dump_enabled_p ())
1099 dump_printf_loc (MSG_NOTE
, vect_location
,
1100 "force alignment of %T\n", ref
);
1102 dr_info
->base_decl
= base
;
1103 dr_info
->base_misaligned
= true;
1104 base_misalignment
= 0;
1106 poly_int64 misalignment
1107 = base_misalignment
+ wi::to_poly_offset (drb
->init
).force_shwi ();
1109 /* If this is a backward running DR then first access in the larger
1110 vectype actually is N-1 elements before the address in the DR.
1111 Adjust misalign accordingly. */
1112 if (tree_int_cst_sgn (drb
->step
) < 0)
1113 /* PLUS because STEP is negative. */
1114 misalignment
+= ((TYPE_VECTOR_SUBPARTS (vectype
) - 1)
1115 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype
))));
1117 unsigned int const_misalignment
;
1118 if (!known_misalignment (misalignment
, vect_align_c
, &const_misalignment
))
1120 if (dump_enabled_p ())
1121 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1122 "Non-constant misalignment for access: %T\n", ref
);
1126 SET_DR_MISALIGNMENT (dr_info
, const_misalignment
);
1128 if (dump_enabled_p ())
1129 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1130 "misalign = %d bytes of ref %T\n",
1131 DR_MISALIGNMENT (dr_info
), ref
);
1136 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1137 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1138 is made aligned via peeling. */
1141 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info
*dr_info
,
1142 dr_vec_info
*dr_peel_info
)
1144 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info
),
1145 DR_TARGET_ALIGNMENT (dr_info
)))
1147 poly_offset_int diff
1148 = (wi::to_poly_offset (DR_INIT (dr_peel_info
->dr
))
1149 - wi::to_poly_offset (DR_INIT (dr_info
->dr
)));
1150 if (known_eq (diff
, 0)
1151 || multiple_p (diff
, DR_TARGET_ALIGNMENT (dr_info
)))
1157 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1158 aligned via peeling. */
1161 vect_dr_aligned_if_peeled_dr_is (dr_vec_info
*dr_info
,
1162 dr_vec_info
*dr_peel_info
)
1164 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info
->dr
),
1165 DR_BASE_ADDRESS (dr_peel_info
->dr
), 0)
1166 || !operand_equal_p (DR_OFFSET (dr_info
->dr
),
1167 DR_OFFSET (dr_peel_info
->dr
), 0)
1168 || !operand_equal_p (DR_STEP (dr_info
->dr
),
1169 DR_STEP (dr_peel_info
->dr
), 0))
1172 return vect_dr_aligned_if_related_peeled_dr_is (dr_info
, dr_peel_info
);
1175 /* Function vect_update_misalignment_for_peel.
1176 Sets DR_INFO's misalignment
1177 - to 0 if it has the same alignment as DR_PEEL_INFO,
1178 - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1179 - to -1 (unknown) otherwise.
1181 DR_INFO - the data reference whose misalignment is to be adjusted.
1182 DR_PEEL_INFO - the data reference whose misalignment is being made
1183 zero in the vector loop by the peel.
1184 NPEEL - the number of iterations in the peel loop if the misalignment
1185 of DR_PEEL_INFO is known at compile time. */
1188 vect_update_misalignment_for_peel (dr_vec_info
*dr_info
,
1189 dr_vec_info
*dr_peel_info
, int npeel
)
1191 /* If dr_info is aligned of dr_peel_info is, then mark it so. */
1192 if (vect_dr_aligned_if_peeled_dr_is (dr_info
, dr_peel_info
))
1194 SET_DR_MISALIGNMENT (dr_info
, 0);
1198 unsigned HOST_WIDE_INT alignment
;
1199 if (DR_TARGET_ALIGNMENT (dr_info
).is_constant (&alignment
)
1200 && known_alignment_for_access_p (dr_info
)
1201 && known_alignment_for_access_p (dr_peel_info
))
1203 int misal
= DR_MISALIGNMENT (dr_info
);
1204 misal
+= npeel
* TREE_INT_CST_LOW (DR_STEP (dr_info
->dr
));
1205 misal
&= alignment
- 1;
1206 SET_DR_MISALIGNMENT (dr_info
, misal
);
1210 if (dump_enabled_p ())
1211 dump_printf_loc (MSG_NOTE
, vect_location
, "Setting misalignment " \
1212 "to unknown (-1).\n");
1213 SET_DR_MISALIGNMENT (dr_info
, DR_MISALIGNMENT_UNKNOWN
);
1216 /* Return true if alignment is relevant for DR_INFO. */
1219 vect_relevant_for_alignment_p (dr_vec_info
*dr_info
)
1221 stmt_vec_info stmt_info
= dr_info
->stmt
;
1223 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
1226 /* For interleaving, only the alignment of the first access matters. */
1227 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1228 && DR_GROUP_FIRST_ELEMENT (stmt_info
) != stmt_info
)
1231 /* Scatter-gather and invariant accesses continue to address individual
1232 scalars, so vector-level alignment is irrelevant. */
1233 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
)
1234 || integer_zerop (DR_STEP (dr_info
->dr
)))
1237 /* Strided accesses perform only component accesses, alignment is
1238 irrelevant for them. */
1239 if (STMT_VINFO_STRIDED_P (stmt_info
)
1240 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1246 /* Given an memory reference EXP return whether its alignment is less
1250 not_size_aligned (tree exp
)
1252 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp
))))
1255 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp
)))
1256 > get_object_alignment (exp
));
1259 /* Function vector_alignment_reachable_p
1261 Return true if vector alignment for DR_INFO is reachable by peeling
1262 a few loop iterations. Return false otherwise. */
1265 vector_alignment_reachable_p (dr_vec_info
*dr_info
)
1267 stmt_vec_info stmt_info
= dr_info
->stmt
;
1268 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
1270 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1272 /* For interleaved access we peel only if number of iterations in
1273 the prolog loop ({VF - misalignment}), is a multiple of the
1274 number of the interleaved accesses. */
1275 int elem_size
, mis_in_elements
;
1277 /* FORNOW: handle only known alignment. */
1278 if (!known_alignment_for_access_p (dr_info
))
1281 poly_uint64 nelements
= TYPE_VECTOR_SUBPARTS (vectype
);
1282 poly_uint64 vector_size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
1283 elem_size
= vector_element_size (vector_size
, nelements
);
1284 mis_in_elements
= DR_MISALIGNMENT (dr_info
) / elem_size
;
1286 if (!multiple_p (nelements
- mis_in_elements
, DR_GROUP_SIZE (stmt_info
)))
1290 /* If misalignment is known at the compile time then allow peeling
1291 only if natural alignment is reachable through peeling. */
1292 if (known_alignment_for_access_p (dr_info
) && !aligned_access_p (dr_info
))
1294 HOST_WIDE_INT elmsize
=
1295 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype
)));
1296 if (dump_enabled_p ())
1298 dump_printf_loc (MSG_NOTE
, vect_location
,
1299 "data size = %wd. misalignment = %d.\n", elmsize
,
1300 DR_MISALIGNMENT (dr_info
));
1302 if (DR_MISALIGNMENT (dr_info
) % elmsize
)
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1306 "data size does not divide the misalignment.\n");
1311 if (!known_alignment_for_access_p (dr_info
))
1313 tree type
= TREE_TYPE (DR_REF (dr_info
->dr
));
1314 bool is_packed
= not_size_aligned (DR_REF (dr_info
->dr
));
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1317 "Unknown misalignment, %snaturally aligned\n",
1318 is_packed
? "not " : "");
1319 return targetm
.vectorize
.vector_alignment_reachable (type
, is_packed
);
1326 /* Calculate the cost of the memory access represented by DR_INFO. */
1329 vect_get_data_access_cost (vec_info
*vinfo
, dr_vec_info
*dr_info
,
1330 unsigned int *inside_cost
,
1331 unsigned int *outside_cost
,
1332 stmt_vector_for_cost
*body_cost_vec
,
1333 stmt_vector_for_cost
*prologue_cost_vec
)
1335 stmt_vec_info stmt_info
= dr_info
->stmt
;
1336 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
1339 if (PURE_SLP_STMT (stmt_info
))
1342 ncopies
= vect_get_num_copies (loop_vinfo
, STMT_VINFO_VECTYPE (stmt_info
));
1344 if (DR_IS_READ (dr_info
->dr
))
1345 vect_get_load_cost (vinfo
, stmt_info
, ncopies
, true, inside_cost
,
1346 outside_cost
, prologue_cost_vec
, body_cost_vec
, false);
1348 vect_get_store_cost (vinfo
,stmt_info
, ncopies
, inside_cost
, body_cost_vec
);
1350 if (dump_enabled_p ())
1351 dump_printf_loc (MSG_NOTE
, vect_location
,
1352 "vect_get_data_access_cost: inside_cost = %d, "
1353 "outside_cost = %d.\n", *inside_cost
, *outside_cost
);
1357 typedef struct _vect_peel_info
1359 dr_vec_info
*dr_info
;
1364 typedef struct _vect_peel_extended_info
1367 struct _vect_peel_info peel_info
;
1368 unsigned int inside_cost
;
1369 unsigned int outside_cost
;
1370 } *vect_peel_extended_info
;
1373 /* Peeling hashtable helpers. */
1375 struct peel_info_hasher
: free_ptr_hash
<_vect_peel_info
>
1377 static inline hashval_t
hash (const _vect_peel_info
*);
1378 static inline bool equal (const _vect_peel_info
*, const _vect_peel_info
*);
1382 peel_info_hasher::hash (const _vect_peel_info
*peel_info
)
1384 return (hashval_t
) peel_info
->npeel
;
1388 peel_info_hasher::equal (const _vect_peel_info
*a
, const _vect_peel_info
*b
)
1390 return (a
->npeel
== b
->npeel
);
1394 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1397 vect_peeling_hash_insert (hash_table
<peel_info_hasher
> *peeling_htab
,
1398 loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
1401 struct _vect_peel_info elem
, *slot
;
1402 _vect_peel_info
**new_slot
;
1403 bool supportable_dr_alignment
1404 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, true);
1407 slot
= peeling_htab
->find (&elem
);
1412 slot
= XNEW (struct _vect_peel_info
);
1413 slot
->npeel
= npeel
;
1414 slot
->dr_info
= dr_info
;
1416 new_slot
= peeling_htab
->find_slot (slot
, INSERT
);
1420 if (!supportable_dr_alignment
1421 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1422 slot
->count
+= VECT_MAX_COST
;
1426 /* Traverse peeling hash table to find peeling option that aligns maximum
1427 number of data accesses. */
1430 vect_peeling_hash_get_most_frequent (_vect_peel_info
**slot
,
1431 _vect_peel_extended_info
*max
)
1433 vect_peel_info elem
= *slot
;
1435 if (elem
->count
> max
->peel_info
.count
1436 || (elem
->count
== max
->peel_info
.count
1437 && max
->peel_info
.npeel
> elem
->npeel
))
1439 max
->peel_info
.npeel
= elem
->npeel
;
1440 max
->peel_info
.count
= elem
->count
;
1441 max
->peel_info
.dr_info
= elem
->dr_info
;
1447 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1448 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1449 we assume DR0_INFO's misalignment will be zero after peeling. */
1452 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo
,
1453 dr_vec_info
*dr0_info
,
1454 unsigned int *inside_cost
,
1455 unsigned int *outside_cost
,
1456 stmt_vector_for_cost
*body_cost_vec
,
1457 stmt_vector_for_cost
*prologue_cost_vec
,
1459 bool unknown_misalignment
)
1461 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1465 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1467 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1468 if (!vect_relevant_for_alignment_p (dr_info
))
1471 int save_misalignment
;
1472 save_misalignment
= DR_MISALIGNMENT (dr_info
);
1475 else if (unknown_misalignment
&& dr_info
== dr0_info
)
1476 SET_DR_MISALIGNMENT (dr_info
, 0);
1478 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
1479 vect_get_data_access_cost (loop_vinfo
, dr_info
, inside_cost
, outside_cost
,
1480 body_cost_vec
, prologue_cost_vec
);
1481 SET_DR_MISALIGNMENT (dr_info
, save_misalignment
);
1485 /* Traverse peeling hash table and calculate cost for each peeling option.
1486 Find the one with the lowest cost. */
1489 vect_peeling_hash_get_lowest_cost (_vect_peel_info
**slot
,
1490 _vect_peel_extended_info
*min
)
1492 vect_peel_info elem
= *slot
;
1494 unsigned int inside_cost
= 0, outside_cost
= 0;
1495 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (min
->vinfo
);
1496 stmt_vector_for_cost prologue_cost_vec
, body_cost_vec
,
1499 prologue_cost_vec
.create (2);
1500 body_cost_vec
.create (2);
1501 epilogue_cost_vec
.create (2);
1503 vect_get_peeling_costs_all_drs (loop_vinfo
, elem
->dr_info
, &inside_cost
,
1504 &outside_cost
, &body_cost_vec
,
1505 &prologue_cost_vec
, elem
->npeel
, false);
1507 body_cost_vec
.release ();
1509 outside_cost
+= vect_get_known_peeling_cost
1510 (loop_vinfo
, elem
->npeel
, &dummy
,
1511 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1512 &prologue_cost_vec
, &epilogue_cost_vec
);
1514 /* Prologue and epilogue costs are added to the target model later.
1515 These costs depend only on the scalar iteration cost, the
1516 number of peeling iterations finally chosen, and the number of
1517 misaligned statements. So discard the information found here. */
1518 prologue_cost_vec
.release ();
1519 epilogue_cost_vec
.release ();
1521 if (inside_cost
< min
->inside_cost
1522 || (inside_cost
== min
->inside_cost
1523 && outside_cost
< min
->outside_cost
))
1525 min
->inside_cost
= inside_cost
;
1526 min
->outside_cost
= outside_cost
;
1527 min
->peel_info
.dr_info
= elem
->dr_info
;
1528 min
->peel_info
.npeel
= elem
->npeel
;
1529 min
->peel_info
.count
= elem
->count
;
1536 /* Choose best peeling option by traversing peeling hash table and either
1537 choosing an option with the lowest cost (if cost model is enabled) or the
1538 option that aligns as many accesses as possible. */
1540 static struct _vect_peel_extended_info
1541 vect_peeling_hash_choose_best_peeling (hash_table
<peel_info_hasher
> *peeling_htab
,
1542 loop_vec_info loop_vinfo
)
1544 struct _vect_peel_extended_info res
;
1546 res
.peel_info
.dr_info
= NULL
;
1547 res
.vinfo
= loop_vinfo
;
1549 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1551 res
.inside_cost
= INT_MAX
;
1552 res
.outside_cost
= INT_MAX
;
1553 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1554 vect_peeling_hash_get_lowest_cost
> (&res
);
1558 res
.peel_info
.count
= 0;
1559 peeling_htab
->traverse
<_vect_peel_extended_info
*,
1560 vect_peeling_hash_get_most_frequent
> (&res
);
1561 res
.inside_cost
= 0;
1562 res
.outside_cost
= 0;
1568 /* Return true if the new peeling NPEEL is supported. */
1571 vect_peeling_supportable (loop_vec_info loop_vinfo
, dr_vec_info
*dr0_info
,
1575 struct data_reference
*dr
= NULL
;
1576 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
1577 enum dr_alignment_support supportable_dr_alignment
;
1579 /* Ensure that all data refs can be vectorized after the peel. */
1580 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1582 int save_misalignment
;
1584 if (dr
== dr0_info
->dr
)
1587 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1588 if (!vect_relevant_for_alignment_p (dr_info
))
1591 save_misalignment
= DR_MISALIGNMENT (dr_info
);
1592 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
1593 supportable_dr_alignment
1594 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, false);
1595 SET_DR_MISALIGNMENT (dr_info
, save_misalignment
);
1597 if (!supportable_dr_alignment
)
1604 /* Compare two data-references DRA and DRB to group them into chunks
1605 with related alignment. */
1608 dr_align_group_sort_cmp (const void *dra_
, const void *drb_
)
1610 data_reference_p dra
= *(data_reference_p
*)const_cast<void *>(dra_
);
1611 data_reference_p drb
= *(data_reference_p
*)const_cast<void *>(drb_
);
1614 /* Stabilize sort. */
1618 /* Ordering of DRs according to base. */
1619 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
1620 DR_BASE_ADDRESS (drb
));
1624 /* And according to DR_OFFSET. */
1625 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
1629 /* And after step. */
1630 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
1634 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1635 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
1637 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
1641 /* Function vect_enhance_data_refs_alignment
1643 This pass will use loop versioning and loop peeling in order to enhance
1644 the alignment of data references in the loop.
1646 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1647 original loop is to be vectorized. Any other loops that are created by
1648 the transformations performed in this pass - are not supposed to be
1649 vectorized. This restriction will be relaxed.
1651 This pass will require a cost model to guide it whether to apply peeling
1652 or versioning or a combination of the two. For example, the scheme that
1653 intel uses when given a loop with several memory accesses, is as follows:
1654 choose one memory access ('p') which alignment you want to force by doing
1655 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1656 other accesses are not necessarily aligned, or (2) use loop versioning to
1657 generate one loop in which all accesses are aligned, and another loop in
1658 which only 'p' is necessarily aligned.
1660 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1661 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1662 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1664 Devising a cost model is the most critical aspect of this work. It will
1665 guide us on which access to peel for, whether to use loop versioning, how
1666 many versions to create, etc. The cost model will probably consist of
1667 generic considerations as well as target specific considerations (on
1668 powerpc for example, misaligned stores are more painful than misaligned
1671 Here are the general steps involved in alignment enhancements:
1673 -- original loop, before alignment analysis:
1674 for (i=0; i<N; i++){
1675 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1676 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1679 -- After vect_compute_data_refs_alignment:
1680 for (i=0; i<N; i++){
1681 x = q[i]; # DR_MISALIGNMENT(q) = 3
1682 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1685 -- Possibility 1: we do loop versioning:
1687 for (i=0; i<N; i++){ # loop 1A
1688 x = q[i]; # DR_MISALIGNMENT(q) = 3
1689 p[i] = y; # DR_MISALIGNMENT(p) = 0
1693 for (i=0; i<N; i++){ # loop 1B
1694 x = q[i]; # DR_MISALIGNMENT(q) = 3
1695 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1699 -- Possibility 2: we do loop peeling:
1700 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1704 for (i = 3; i < N; i++){ # loop 2A
1705 x = q[i]; # DR_MISALIGNMENT(q) = 0
1706 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1709 -- Possibility 3: combination of loop peeling and versioning:
1710 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1715 for (i = 3; i<N; i++){ # loop 3A
1716 x = q[i]; # DR_MISALIGNMENT(q) = 0
1717 p[i] = y; # DR_MISALIGNMENT(p) = 0
1721 for (i = 3; i<N; i++){ # loop 3B
1722 x = q[i]; # DR_MISALIGNMENT(q) = 0
1723 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1727 These loops are later passed to loop_transform to be vectorized. The
1728 vectorizer will use the alignment information to guide the transformation
1729 (whether to generate regular loads/stores, or with special handling for
1733 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo
)
1735 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1736 enum dr_alignment_support supportable_dr_alignment
;
1737 dr_vec_info
*first_store
= NULL
;
1738 dr_vec_info
*dr0_info
= NULL
;
1739 struct data_reference
*dr
;
1741 bool do_peeling
= false;
1742 bool do_versioning
= false;
1743 unsigned int npeel
= 0;
1744 bool one_misalignment_known
= false;
1745 bool one_misalignment_unknown
= false;
1746 bool one_dr_unsupportable
= false;
1747 dr_vec_info
*unsupportable_dr_info
= NULL
;
1748 unsigned int mis
, dr0_same_align_drs
= 0, first_store_same_align_drs
= 0;
1749 hash_table
<peel_info_hasher
> peeling_htab (1);
1751 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1753 /* Reset data so we can safely be called multiple times. */
1754 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
1755 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = 0;
1757 if (LOOP_VINFO_DATAREFS (loop_vinfo
).is_empty ())
1758 return opt_result::success ();
1760 /* Sort the vector of datarefs so DRs that have the same or dependent
1761 alignment are next to each other. */
1762 auto_vec
<data_reference_p
> datarefs
1763 = LOOP_VINFO_DATAREFS (loop_vinfo
).copy ();
1764 datarefs
.qsort (dr_align_group_sort_cmp
);
1766 /* Compute the number of DRs that become aligned when we peel
1767 a dataref so it becomes aligned. */
1768 auto_vec
<unsigned> n_same_align_refs (datarefs
.length ());
1769 n_same_align_refs
.quick_grow_cleared (datarefs
.length ());
1771 for (i0
= 0; i0
< datarefs
.length (); ++i0
)
1772 if (DR_BASE_ADDRESS (datarefs
[i0
]))
1774 for (i
= i0
+ 1; i
<= datarefs
.length (); ++i
)
1776 if (i
== datarefs
.length ()
1777 || !operand_equal_p (DR_BASE_ADDRESS (datarefs
[i0
]),
1778 DR_BASE_ADDRESS (datarefs
[i
]), 0)
1779 || !operand_equal_p (DR_OFFSET (datarefs
[i0
]),
1780 DR_OFFSET (datarefs
[i
]), 0)
1781 || !operand_equal_p (DR_STEP (datarefs
[i0
]),
1782 DR_STEP (datarefs
[i
]), 0))
1784 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1785 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1786 will get known misalignment if we align one of the refs
1787 with the largest DR_TARGET_ALIGNMENT. */
1788 for (unsigned j
= i0
; j
< i
; ++j
)
1790 dr_vec_info
*dr_infoj
= loop_vinfo
->lookup_dr (datarefs
[j
]);
1791 for (unsigned k
= i0
; k
< i
; ++k
)
1795 dr_vec_info
*dr_infok
= loop_vinfo
->lookup_dr (datarefs
[k
]);
1796 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok
,
1798 n_same_align_refs
[j
]++;
1805 /* While cost model enhancements are expected in the future, the high level
1806 view of the code at this time is as follows:
1808 A) If there is a misaligned access then see if peeling to align
1809 this access can make all data references satisfy
1810 vect_supportable_dr_alignment. If so, update data structures
1811 as needed and return true.
1813 B) If peeling wasn't possible and there is a data reference with an
1814 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1815 then see if loop versioning checks can be used to make all data
1816 references satisfy vect_supportable_dr_alignment. If so, update
1817 data structures as needed and return true.
1819 C) If neither peeling nor versioning were successful then return false if
1820 any data reference does not satisfy vect_supportable_dr_alignment.
1822 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1824 Note, Possibility 3 above (which is peeling and versioning together) is not
1825 being done at this time. */
1827 /* (1) Peeling to force alignment. */
1829 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1831 + How many accesses will become aligned due to the peeling
1832 - How many accesses will become unaligned due to the peeling,
1833 and the cost of misaligned accesses.
1834 - The cost of peeling (the extra runtime checks, the increase
1837 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
1839 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
1840 if (!vect_relevant_for_alignment_p (dr_info
))
1843 stmt_vec_info stmt_info
= dr_info
->stmt
;
1844 supportable_dr_alignment
1845 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, true);
1846 do_peeling
= vector_alignment_reachable_p (dr_info
);
1849 if (known_alignment_for_access_p (dr_info
))
1851 unsigned int npeel_tmp
= 0;
1852 bool negative
= tree_int_cst_compare (DR_STEP (dr
),
1853 size_zero_node
) < 0;
1855 /* If known_alignment_for_access_p then we have set
1856 DR_MISALIGNMENT which is only done if we know it at compiler
1857 time, so it is safe to assume target alignment is constant.
1859 unsigned int target_align
=
1860 DR_TARGET_ALIGNMENT (dr_info
).to_constant ();
1861 unsigned int dr_size
= vect_get_scalar_dr_size (dr_info
);
1863 ? DR_MISALIGNMENT (dr_info
)
1864 : -DR_MISALIGNMENT (dr_info
));
1865 if (DR_MISALIGNMENT (dr_info
) != 0)
1866 npeel_tmp
= (mis
& (target_align
- 1)) / dr_size
;
1868 /* For multiple types, it is possible that the bigger type access
1869 will have more than one peeling option. E.g., a loop with two
1870 types: one of size (vector size / 4), and the other one of
1871 size (vector size / 8). Vectorization factor will 8. If both
1872 accesses are misaligned by 3, the first one needs one scalar
1873 iteration to be aligned, and the second one needs 5. But the
1874 first one will be aligned also by peeling 5 scalar
1875 iterations, and in that case both accesses will be aligned.
1876 Hence, except for the immediate peeling amount, we also want
1877 to try to add full vector size, while we don't exceed
1878 vectorization factor.
1879 We do this automatically for cost model, since we calculate
1880 cost for every peeling option. */
1881 poly_uint64 nscalars
= npeel_tmp
;
1882 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
1884 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
1885 nscalars
= (STMT_SLP_TYPE (stmt_info
)
1886 ? vf
* DR_GROUP_SIZE (stmt_info
) : vf
);
1889 /* Save info about DR in the hash table. Also include peeling
1890 amounts according to the explanation above. */
1891 while (known_le (npeel_tmp
, nscalars
))
1893 vect_peeling_hash_insert (&peeling_htab
, loop_vinfo
,
1894 dr_info
, npeel_tmp
);
1895 npeel_tmp
+= MAX (1, target_align
/ dr_size
);
1898 one_misalignment_known
= true;
1902 /* If we don't know any misalignment values, we prefer
1903 peeling for data-ref that has the maximum number of data-refs
1904 with the same alignment, unless the target prefers to align
1905 stores over load. */
1906 unsigned same_align_drs
= n_same_align_refs
[i
];
1908 || dr0_same_align_drs
< same_align_drs
)
1910 dr0_same_align_drs
= same_align_drs
;
1913 /* For data-refs with the same number of related
1914 accesses prefer the one where the misalign
1915 computation will be invariant in the outermost loop. */
1916 else if (dr0_same_align_drs
== same_align_drs
)
1918 class loop
*ivloop0
, *ivloop
;
1919 ivloop0
= outermost_invariant_loop_for_expr
1920 (loop
, DR_BASE_ADDRESS (dr0_info
->dr
));
1921 ivloop
= outermost_invariant_loop_for_expr
1922 (loop
, DR_BASE_ADDRESS (dr
));
1923 if ((ivloop
&& !ivloop0
)
1924 || (ivloop
&& ivloop0
1925 && flow_loop_nested_p (ivloop
, ivloop0
)))
1929 one_misalignment_unknown
= true;
1931 /* Check for data refs with unsupportable alignment that
1933 if (!supportable_dr_alignment
)
1935 one_dr_unsupportable
= true;
1936 unsupportable_dr_info
= dr_info
;
1939 if (!first_store
&& DR_IS_WRITE (dr
))
1941 first_store
= dr_info
;
1942 first_store_same_align_drs
= same_align_drs
;
1948 if (!aligned_access_p (dr_info
))
1950 if (dump_enabled_p ())
1951 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1952 "vector alignment may not be reachable\n");
1958 /* Check if we can possibly peel the loop. */
1959 if (!vect_can_advance_ivs_p (loop_vinfo
)
1960 || !slpeel_can_duplicate_loop_p (loop
, single_exit (loop
))
1964 struct _vect_peel_extended_info peel_for_known_alignment
;
1965 struct _vect_peel_extended_info peel_for_unknown_alignment
;
1966 struct _vect_peel_extended_info best_peel
;
1968 peel_for_unknown_alignment
.inside_cost
= INT_MAX
;
1969 peel_for_unknown_alignment
.outside_cost
= INT_MAX
;
1970 peel_for_unknown_alignment
.peel_info
.count
= 0;
1973 && one_misalignment_unknown
)
1975 /* Check if the target requires to prefer stores over loads, i.e., if
1976 misaligned stores are more expensive than misaligned loads (taking
1977 drs with same alignment into account). */
1978 unsigned int load_inside_cost
= 0;
1979 unsigned int load_outside_cost
= 0;
1980 unsigned int store_inside_cost
= 0;
1981 unsigned int store_outside_cost
= 0;
1982 unsigned int estimated_npeels
= vect_vf_for_cost (loop_vinfo
) / 2;
1984 stmt_vector_for_cost dummy
;
1986 vect_get_peeling_costs_all_drs (loop_vinfo
, dr0_info
,
1989 &dummy
, &dummy
, estimated_npeels
, true);
1995 vect_get_peeling_costs_all_drs (loop_vinfo
, first_store
,
1997 &store_outside_cost
,
1999 estimated_npeels
, true);
2004 store_inside_cost
= INT_MAX
;
2005 store_outside_cost
= INT_MAX
;
2008 if (load_inside_cost
> store_inside_cost
2009 || (load_inside_cost
== store_inside_cost
2010 && load_outside_cost
> store_outside_cost
))
2012 dr0_info
= first_store
;
2013 dr0_same_align_drs
= first_store_same_align_drs
;
2014 peel_for_unknown_alignment
.inside_cost
= store_inside_cost
;
2015 peel_for_unknown_alignment
.outside_cost
= store_outside_cost
;
2019 peel_for_unknown_alignment
.inside_cost
= load_inside_cost
;
2020 peel_for_unknown_alignment
.outside_cost
= load_outside_cost
;
2023 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2024 prologue_cost_vec
.create (2);
2025 epilogue_cost_vec
.create (2);
2028 peel_for_unknown_alignment
.outside_cost
+= vect_get_known_peeling_cost
2029 (loop_vinfo
, estimated_npeels
, &dummy2
,
2030 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2031 &prologue_cost_vec
, &epilogue_cost_vec
);
2033 prologue_cost_vec
.release ();
2034 epilogue_cost_vec
.release ();
2036 peel_for_unknown_alignment
.peel_info
.count
= dr0_same_align_drs
+ 1;
2039 peel_for_unknown_alignment
.peel_info
.npeel
= 0;
2040 peel_for_unknown_alignment
.peel_info
.dr_info
= dr0_info
;
2042 best_peel
= peel_for_unknown_alignment
;
2044 peel_for_known_alignment
.inside_cost
= INT_MAX
;
2045 peel_for_known_alignment
.outside_cost
= INT_MAX
;
2046 peel_for_known_alignment
.peel_info
.count
= 0;
2047 peel_for_known_alignment
.peel_info
.dr_info
= NULL
;
2049 if (do_peeling
&& one_misalignment_known
)
2051 /* Peeling is possible, but there is no data access that is not supported
2052 unless aligned. So we try to choose the best possible peeling from
2054 peel_for_known_alignment
= vect_peeling_hash_choose_best_peeling
2055 (&peeling_htab
, loop_vinfo
);
2058 /* Compare costs of peeling for known and unknown alignment. */
2059 if (peel_for_known_alignment
.peel_info
.dr_info
!= NULL
2060 && peel_for_unknown_alignment
.inside_cost
2061 >= peel_for_known_alignment
.inside_cost
)
2063 best_peel
= peel_for_known_alignment
;
2065 /* If the best peeling for known alignment has NPEEL == 0, perform no
2066 peeling at all except if there is an unsupportable dr that we can
2068 if (best_peel
.peel_info
.npeel
== 0 && !one_dr_unsupportable
)
2072 /* If there is an unsupportable data ref, prefer this over all choices so far
2073 since we'd have to discard a chosen peeling except when it accidentally
2074 aligned the unsupportable data ref. */
2075 if (one_dr_unsupportable
)
2076 dr0_info
= unsupportable_dr_info
;
2077 else if (do_peeling
)
2079 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2080 TODO: Use nopeel_outside_cost or get rid of it? */
2081 unsigned nopeel_inside_cost
= 0;
2082 unsigned nopeel_outside_cost
= 0;
2084 stmt_vector_for_cost dummy
;
2086 vect_get_peeling_costs_all_drs (loop_vinfo
, NULL
, &nopeel_inside_cost
,
2087 &nopeel_outside_cost
, &dummy
, &dummy
,
2091 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2092 costs will be recorded. */
2093 stmt_vector_for_cost prologue_cost_vec
, epilogue_cost_vec
;
2094 prologue_cost_vec
.create (2);
2095 epilogue_cost_vec
.create (2);
2098 nopeel_outside_cost
+= vect_get_known_peeling_cost
2099 (loop_vinfo
, 0, &dummy2
,
2100 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
2101 &prologue_cost_vec
, &epilogue_cost_vec
);
2103 prologue_cost_vec
.release ();
2104 epilogue_cost_vec
.release ();
2106 npeel
= best_peel
.peel_info
.npeel
;
2107 dr0_info
= best_peel
.peel_info
.dr_info
;
2109 /* If no peeling is not more expensive than the best peeling we
2110 have so far, don't perform any peeling. */
2111 if (nopeel_inside_cost
<= best_peel
.inside_cost
)
2117 stmt_vec_info stmt_info
= dr0_info
->stmt
;
2118 if (known_alignment_for_access_p (dr0_info
))
2120 bool negative
= tree_int_cst_compare (DR_STEP (dr0_info
->dr
),
2121 size_zero_node
) < 0;
2124 /* Since it's known at compile time, compute the number of
2125 iterations in the peeled loop (the peeling factor) for use in
2126 updating DR_MISALIGNMENT values. The peeling factor is the
2127 vectorization factor minus the misalignment as an element
2130 ? DR_MISALIGNMENT (dr0_info
)
2131 : -DR_MISALIGNMENT (dr0_info
));
2132 /* If known_alignment_for_access_p then we have set
2133 DR_MISALIGNMENT which is only done if we know it at compiler
2134 time, so it is safe to assume target alignment is constant.
2136 unsigned int target_align
=
2137 DR_TARGET_ALIGNMENT (dr0_info
).to_constant ();
2138 npeel
= ((mis
& (target_align
- 1))
2139 / vect_get_scalar_dr_size (dr0_info
));
2142 /* For interleaved data access every iteration accesses all the
2143 members of the group, therefore we divide the number of iterations
2144 by the group size. */
2145 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2146 npeel
/= DR_GROUP_SIZE (stmt_info
);
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_NOTE
, vect_location
,
2150 "Try peeling by %d\n", npeel
);
2153 /* Ensure that all datarefs can be vectorized after the peel. */
2154 if (!vect_peeling_supportable (loop_vinfo
, dr0_info
, npeel
))
2157 /* Check if all datarefs are supportable and log. */
2158 if (do_peeling
&& known_alignment_for_access_p (dr0_info
) && npeel
== 0)
2159 return opt_result::success ();
2161 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2164 unsigned max_allowed_peel
2165 = param_vect_max_peeling_for_alignment
;
2166 if (flag_vect_cost_model
<= VECT_COST_MODEL_CHEAP
)
2167 max_allowed_peel
= 0;
2168 if (max_allowed_peel
!= (unsigned)-1)
2170 unsigned max_peel
= npeel
;
2173 poly_uint64 target_align
= DR_TARGET_ALIGNMENT (dr0_info
);
2174 unsigned HOST_WIDE_INT target_align_c
;
2175 if (target_align
.is_constant (&target_align_c
))
2177 target_align_c
/ vect_get_scalar_dr_size (dr0_info
) - 1;
2181 if (dump_enabled_p ())
2182 dump_printf_loc (MSG_NOTE
, vect_location
,
2183 "Disable peeling, max peels set and vector"
2184 " alignment unknown\n");
2187 if (max_peel
> max_allowed_peel
)
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_NOTE
, vect_location
,
2192 "Disable peeling, max peels reached: %d\n", max_peel
);
2197 /* Cost model #2 - if peeling may result in a remaining loop not
2198 iterating enough to be vectorized then do not peel. Since this
2199 is a cost heuristic rather than a correctness decision, use the
2200 most likely runtime value for variable vectorization factors. */
2202 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
2204 unsigned int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
2205 unsigned int max_peel
= npeel
== 0 ? assumed_vf
- 1 : npeel
;
2206 if ((unsigned HOST_WIDE_INT
) LOOP_VINFO_INT_NITERS (loop_vinfo
)
2207 < assumed_vf
+ max_peel
)
2213 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2214 If the misalignment of DR_i is identical to that of dr0 then set
2215 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2216 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2217 by the peeling factor times the element size of DR_i (MOD the
2218 vectorization factor times the size). Otherwise, the
2219 misalignment of DR_i must be set to unknown. */
2220 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2221 if (dr
!= dr0_info
->dr
)
2223 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2224 if (!vect_relevant_for_alignment_p (dr_info
))
2227 vect_update_misalignment_for_peel (dr_info
, dr0_info
, npeel
);
2230 LOOP_VINFO_UNALIGNED_DR (loop_vinfo
) = dr0_info
;
2232 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) = npeel
;
2234 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
2235 = DR_MISALIGNMENT (dr0_info
);
2236 SET_DR_MISALIGNMENT (dr0_info
, 0);
2237 if (dump_enabled_p ())
2239 dump_printf_loc (MSG_NOTE
, vect_location
,
2240 "Alignment of access forced using peeling.\n");
2241 dump_printf_loc (MSG_NOTE
, vect_location
,
2242 "Peeling for alignment will be applied.\n");
2245 /* The inside-loop cost will be accounted for in vectorizable_load
2246 and vectorizable_store correctly with adjusted alignments.
2247 Drop the body_cst_vec on the floor here. */
2248 return opt_result::success ();
2252 /* (2) Versioning to force alignment. */
2254 /* Try versioning if:
2255 1) optimize loop for speed and the cost-model is not cheap
2256 2) there is at least one unsupported misaligned data ref with an unknown
2258 3) all misaligned data refs with a known misalignment are supported, and
2259 4) the number of runtime alignment checks is within reason. */
2262 = (optimize_loop_nest_for_speed_p (loop
)
2263 && !loop
->inner
/* FORNOW */
2264 && flag_vect_cost_model
> VECT_COST_MODEL_CHEAP
);
2268 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2270 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2271 if (aligned_access_p (dr_info
)
2272 || !vect_relevant_for_alignment_p (dr_info
))
2275 stmt_vec_info stmt_info
= dr_info
->stmt
;
2276 if (STMT_VINFO_STRIDED_P (stmt_info
))
2278 do_versioning
= false;
2282 supportable_dr_alignment
2283 = vect_supportable_dr_alignment (loop_vinfo
, dr_info
, false);
2285 if (!supportable_dr_alignment
)
2290 if (known_alignment_for_access_p (dr_info
)
2291 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ()
2292 >= (unsigned) param_vect_max_version_for_alignment_checks
)
2294 do_versioning
= false;
2298 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
2299 gcc_assert (vectype
);
2301 /* At present we don't support versioning for alignment
2302 with variable VF, since there's no guarantee that the
2303 VF is a power of two. We could relax this if we added
2304 a way of enforcing a power-of-two size. */
2305 unsigned HOST_WIDE_INT size
;
2306 if (!GET_MODE_SIZE (TYPE_MODE (vectype
)).is_constant (&size
))
2308 do_versioning
= false;
2312 /* Forcing alignment in the first iteration is no good if
2313 we don't keep it across iterations. For now, just disable
2314 versioning in this case.
2315 ?? We could actually unroll the loop to achieve the required
2316 overall step alignment, and forcing the alignment could be
2317 done by doing some iterations of the non-vectorized loop. */
2318 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
2319 * DR_STEP_ALIGNMENT (dr
),
2320 DR_TARGET_ALIGNMENT (dr_info
)))
2322 do_versioning
= false;
2326 /* The rightmost bits of an aligned address must be zeros.
2327 Construct the mask needed for this test. For example,
2328 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2329 mask must be 15 = 0xf. */
2332 /* FORNOW: use the same mask to test all potentially unaligned
2333 references in the loop. */
2334 if (LOOP_VINFO_PTR_MASK (loop_vinfo
)
2335 && LOOP_VINFO_PTR_MASK (loop_vinfo
) != mask
)
2337 do_versioning
= false;
2341 LOOP_VINFO_PTR_MASK (loop_vinfo
) = mask
;
2342 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).safe_push (stmt_info
);
2346 /* Versioning requires at least one misaligned data reference. */
2347 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
2348 do_versioning
= false;
2349 else if (!do_versioning
)
2350 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).truncate (0);
2355 vec
<stmt_vec_info
> may_misalign_stmts
2356 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
);
2357 stmt_vec_info stmt_info
;
2359 /* It can now be assumed that the data references in the statements
2360 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2361 of the loop being vectorized. */
2362 FOR_EACH_VEC_ELT (may_misalign_stmts
, i
, stmt_info
)
2364 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
2365 SET_DR_MISALIGNMENT (dr_info
, 0);
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_NOTE
, vect_location
,
2368 "Alignment of access forced using versioning.\n");
2371 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_NOTE
, vect_location
,
2373 "Versioning for alignment will be applied.\n");
2375 /* Peeling and versioning can't be done together at this time. */
2376 gcc_assert (! (do_peeling
&& do_versioning
));
2378 return opt_result::success ();
2381 /* This point is reached if neither peeling nor versioning is being done. */
2382 gcc_assert (! (do_peeling
|| do_versioning
));
2384 return opt_result::success ();
2388 /* Function vect_analyze_data_refs_alignment
2390 Analyze the alignment of the data-references in the loop.
2391 Return FALSE if a data reference is found that cannot be vectorized. */
2394 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo
)
2396 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2398 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (loop_vinfo
);
2399 struct data_reference
*dr
;
2402 vect_record_base_alignments (loop_vinfo
);
2403 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
2405 dr_vec_info
*dr_info
= loop_vinfo
->lookup_dr (dr
);
2406 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
))
2407 vect_compute_data_ref_alignment (loop_vinfo
, dr_info
);
2410 return opt_result::success ();
2414 /* Analyze alignment of DRs of stmts in NODE. */
2417 vect_slp_analyze_node_alignment (vec_info
*vinfo
, slp_tree node
)
2419 /* We vectorize from the first scalar stmt in the node unless
2420 the node is permuted in which case we start from the first
2421 element in the group. */
2422 stmt_vec_info first_stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2423 dr_vec_info
*first_dr_info
= STMT_VINFO_DR_INFO (first_stmt_info
);
2424 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2425 first_stmt_info
= DR_GROUP_FIRST_ELEMENT (first_stmt_info
);
2427 /* We need to commit to a vector type for the group now. */
2428 if (is_a
<bb_vec_info
> (vinfo
)
2429 && !vect_update_shared_vectype (first_stmt_info
, SLP_TREE_VECTYPE (node
)))
2431 if (dump_enabled_p ())
2432 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2433 "desired vector type conflicts with earlier one "
2434 "for %G", first_stmt_info
->stmt
);
2438 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (first_stmt_info
);
2439 vect_compute_data_ref_alignment (vinfo
, dr_info
);
2440 /* In several places we need alignment of the first element anyway. */
2441 if (dr_info
!= first_dr_info
)
2442 vect_compute_data_ref_alignment (vinfo
, first_dr_info
);
2444 /* For creating the data-ref pointer we need alignment of the
2445 first element as well. */
2447 = vect_stmt_to_vectorize (vect_find_first_scalar_stmt_in_slp (node
));
2448 if (first_stmt_info
!= SLP_TREE_SCALAR_STMTS (node
)[0])
2450 first_dr_info
= STMT_VINFO_DR_INFO (first_stmt_info
);
2451 if (dr_info
!= first_dr_info
)
2452 vect_compute_data_ref_alignment (vinfo
, first_dr_info
);
2458 /* Function vect_slp_analyze_instance_alignment
2460 Analyze the alignment of the data-references in the SLP instance.
2461 Return FALSE if a data reference is found that cannot be vectorized. */
2464 vect_slp_analyze_instance_alignment (vec_info
*vinfo
,
2465 slp_instance instance
)
2467 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2471 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance
), i
, node
)
2472 if (! vect_slp_analyze_node_alignment (vinfo
, node
))
2475 node
= SLP_INSTANCE_TREE (instance
);
2476 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))
2477 && ! vect_slp_analyze_node_alignment
2478 (vinfo
, SLP_INSTANCE_TREE (instance
)))
2485 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2486 accesses of legal size, step, etc. Detect gaps, single element
2487 interleaving, and other special cases. Set grouped access info.
2488 Collect groups of strided stores for further use in SLP analysis.
2489 Worker for vect_analyze_group_access. */
2492 vect_analyze_group_access_1 (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2494 data_reference
*dr
= dr_info
->dr
;
2495 tree step
= DR_STEP (dr
);
2496 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2497 HOST_WIDE_INT type_size
= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type
));
2498 stmt_vec_info stmt_info
= dr_info
->stmt
;
2499 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
2500 bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
);
2501 HOST_WIDE_INT dr_step
= -1;
2502 HOST_WIDE_INT groupsize
, last_accessed_element
= 1;
2503 bool slp_impossible
= false;
2505 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2506 size of the interleaving group (including gaps). */
2507 if (tree_fits_shwi_p (step
))
2509 dr_step
= tree_to_shwi (step
);
2510 /* Check that STEP is a multiple of type size. Otherwise there is
2511 a non-element-sized gap at the end of the group which we
2512 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2513 ??? As we can handle non-constant step fine here we should
2514 simply remove uses of DR_GROUP_GAP between the last and first
2515 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2516 simply not include that gap. */
2517 if ((dr_step
% type_size
) != 0)
2519 if (dump_enabled_p ())
2520 dump_printf_loc (MSG_NOTE
, vect_location
,
2521 "Step %T is not a multiple of the element size"
2526 groupsize
= absu_hwi (dr_step
) / type_size
;
2531 /* Not consecutive access is possible only if it is a part of interleaving. */
2532 if (!DR_GROUP_FIRST_ELEMENT (stmt_info
))
2534 /* Check if it this DR is a part of interleaving, and is a single
2535 element of the group that is accessed in the loop. */
2537 /* Gaps are supported only for loads. STEP must be a multiple of the type
2540 && (dr_step
% type_size
) == 0
2542 /* This could be UINT_MAX but as we are generating code in a very
2543 inefficient way we have to cap earlier.
2544 See PR91403 for example. */
2545 && groupsize
<= 4096)
2547 DR_GROUP_FIRST_ELEMENT (stmt_info
) = stmt_info
;
2548 DR_GROUP_SIZE (stmt_info
) = groupsize
;
2549 DR_GROUP_GAP (stmt_info
) = groupsize
- 1;
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_NOTE
, vect_location
,
2552 "Detected single element interleaving %T"
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2561 "not consecutive access %G", stmt_info
->stmt
);
2565 /* Mark the statement as unvectorizable. */
2566 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
2570 if (dump_enabled_p ())
2571 dump_printf_loc (MSG_NOTE
, vect_location
, "using strided accesses\n");
2572 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2576 if (DR_GROUP_FIRST_ELEMENT (stmt_info
) == stmt_info
)
2578 /* First stmt in the interleaving chain. Check the chain. */
2579 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2580 struct data_reference
*data_ref
= dr
;
2581 unsigned int count
= 1;
2582 tree prev_init
= DR_INIT (data_ref
);
2583 HOST_WIDE_INT diff
, gaps
= 0;
2585 /* By construction, all group members have INTEGER_CST DR_INITs. */
2588 /* We never have the same DR multiple times. */
2589 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref
),
2590 DR_INIT (STMT_VINFO_DATA_REF (next
))) != 0);
2592 data_ref
= STMT_VINFO_DATA_REF (next
);
2594 /* All group members have the same STEP by construction. */
2595 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref
), step
, 0));
2597 /* Check that the distance between two accesses is equal to the type
2598 size. Otherwise, we have gaps. */
2599 diff
= (TREE_INT_CST_LOW (DR_INIT (data_ref
))
2600 - TREE_INT_CST_LOW (prev_init
)) / type_size
;
2603 /* FORNOW: SLP of accesses with gaps is not supported. */
2604 slp_impossible
= true;
2605 if (DR_IS_WRITE (data_ref
))
2607 if (dump_enabled_p ())
2608 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2609 "interleaved store with gaps\n");
2616 last_accessed_element
+= diff
;
2618 /* Store the gap from the previous member of the group. If there is no
2619 gap in the access, DR_GROUP_GAP is always 1. */
2620 DR_GROUP_GAP (next
) = diff
;
2622 prev_init
= DR_INIT (data_ref
);
2623 next
= DR_GROUP_NEXT_ELEMENT (next
);
2624 /* Count the number of data-refs in the chain. */
2629 groupsize
= count
+ gaps
;
2631 /* This could be UINT_MAX but as we are generating code in a very
2632 inefficient way we have to cap earlier. See PR78699 for example. */
2633 if (groupsize
> 4096)
2635 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2637 "group is too large\n");
2641 /* Check that the size of the interleaving is equal to count for stores,
2642 i.e., that there are no gaps. */
2643 if (groupsize
!= count
2644 && !DR_IS_READ (dr
))
2647 STMT_VINFO_STRIDED_P (stmt_info
) = true;
2650 /* If there is a gap after the last load in the group it is the
2651 difference between the groupsize and the last accessed
2653 When there is no gap, this difference should be 0. */
2654 DR_GROUP_GAP (stmt_info
) = groupsize
- last_accessed_element
;
2656 DR_GROUP_SIZE (stmt_info
) = groupsize
;
2657 if (dump_enabled_p ())
2659 dump_printf_loc (MSG_NOTE
, vect_location
,
2660 "Detected interleaving ");
2661 if (DR_IS_READ (dr
))
2662 dump_printf (MSG_NOTE
, "load ");
2663 else if (STMT_VINFO_STRIDED_P (stmt_info
))
2664 dump_printf (MSG_NOTE
, "strided store ");
2666 dump_printf (MSG_NOTE
, "store ");
2667 dump_printf (MSG_NOTE
, "of size %u\n",
2668 (unsigned)groupsize
);
2669 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", stmt_info
->stmt
);
2670 next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2673 if (DR_GROUP_GAP (next
) != 1)
2674 dump_printf_loc (MSG_NOTE
, vect_location
,
2675 "\t<gap of %d elements>\n",
2676 DR_GROUP_GAP (next
) - 1);
2677 dump_printf_loc (MSG_NOTE
, vect_location
, "\t%G", next
->stmt
);
2678 next
= DR_GROUP_NEXT_ELEMENT (next
);
2680 if (DR_GROUP_GAP (stmt_info
) != 0)
2681 dump_printf_loc (MSG_NOTE
, vect_location
,
2682 "\t<gap of %d elements>\n",
2683 DR_GROUP_GAP (stmt_info
));
2686 /* SLP: create an SLP data structure for every interleaving group of
2687 stores for further analysis in vect_analyse_slp. */
2688 if (DR_IS_WRITE (dr
) && !slp_impossible
)
2691 LOOP_VINFO_GROUPED_STORES (loop_vinfo
).safe_push (stmt_info
);
2693 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
2700 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2701 accesses of legal size, step, etc. Detect gaps, single element
2702 interleaving, and other special cases. Set grouped access info.
2703 Collect groups of strided stores for further use in SLP analysis. */
2706 vect_analyze_group_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2708 if (!vect_analyze_group_access_1 (vinfo
, dr_info
))
2710 /* Dissolve the group if present. */
2711 stmt_vec_info stmt_info
= DR_GROUP_FIRST_ELEMENT (dr_info
->stmt
);
2714 stmt_vec_info next
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2715 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2716 DR_GROUP_NEXT_ELEMENT (stmt_info
) = NULL
;
2724 /* Analyze the access pattern of the data-reference DR_INFO.
2725 In case of non-consecutive accesses call vect_analyze_group_access() to
2726 analyze groups of accesses. */
2729 vect_analyze_data_ref_access (vec_info
*vinfo
, dr_vec_info
*dr_info
)
2731 data_reference
*dr
= dr_info
->dr
;
2732 tree step
= DR_STEP (dr
);
2733 tree scalar_type
= TREE_TYPE (DR_REF (dr
));
2734 stmt_vec_info stmt_info
= dr_info
->stmt
;
2735 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
2736 class loop
*loop
= NULL
;
2738 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
2742 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
2744 if (loop_vinfo
&& !step
)
2746 if (dump_enabled_p ())
2747 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2748 "bad data-ref access in loop\n");
2752 /* Allow loads with zero step in inner-loop vectorization. */
2753 if (loop_vinfo
&& integer_zerop (step
))
2755 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2756 if (!nested_in_vect_loop_p (loop
, stmt_info
))
2757 return DR_IS_READ (dr
);
2758 /* Allow references with zero step for outer loops marked
2759 with pragma omp simd only - it guarantees absence of
2760 loop-carried dependencies between inner loop iterations. */
2761 if (loop
->safelen
< 2)
2763 if (dump_enabled_p ())
2764 dump_printf_loc (MSG_NOTE
, vect_location
,
2765 "zero step in inner loop of nest\n");
2770 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2772 /* Interleaved accesses are not yet supported within outer-loop
2773 vectorization for references in the inner-loop. */
2774 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2776 /* For the rest of the analysis we use the outer-loop step. */
2777 step
= STMT_VINFO_DR_STEP (stmt_info
);
2778 if (integer_zerop (step
))
2780 if (dump_enabled_p ())
2781 dump_printf_loc (MSG_NOTE
, vect_location
,
2782 "zero step in outer loop.\n");
2783 return DR_IS_READ (dr
);
2788 if (TREE_CODE (step
) == INTEGER_CST
)
2790 HOST_WIDE_INT dr_step
= TREE_INT_CST_LOW (step
);
2791 if (!tree_int_cst_compare (step
, TYPE_SIZE_UNIT (scalar_type
))
2793 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type
), -dr_step
)))
2795 /* Mark that it is not interleaving. */
2796 DR_GROUP_FIRST_ELEMENT (stmt_info
) = NULL
;
2801 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
2803 if (dump_enabled_p ())
2804 dump_printf_loc (MSG_NOTE
, vect_location
,
2805 "grouped access in outer loop.\n");
2810 /* Assume this is a DR handled by non-constant strided load case. */
2811 if (TREE_CODE (step
) != INTEGER_CST
)
2812 return (STMT_VINFO_STRIDED_P (stmt_info
)
2813 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2814 || vect_analyze_group_access (vinfo
, dr_info
)));
2816 /* Not consecutive access - check if it's a part of interleaving group. */
2817 return vect_analyze_group_access (vinfo
, dr_info
);
2820 typedef std::pair
<data_reference_p
, int> data_ref_pair
;
2822 /* Compare two data-references DRA and DRB to group them into chunks
2823 suitable for grouping. */
2826 dr_group_sort_cmp (const void *dra_
, const void *drb_
)
2828 data_ref_pair dra_pair
= *(data_ref_pair
*)const_cast<void *>(dra_
);
2829 data_ref_pair drb_pair
= *(data_ref_pair
*)const_cast<void *>(drb_
);
2830 data_reference_p dra
= dra_pair
.first
;
2831 data_reference_p drb
= drb_pair
.first
;
2834 /* Stabilize sort. */
2838 /* DRs in different basic-blocks never belong to the same group. */
2839 int bb_index1
= gimple_bb (DR_STMT (dra
))->index
;
2840 int bb_index2
= gimple_bb (DR_STMT (drb
))->index
;
2841 if (bb_index1
!= bb_index2
)
2842 return bb_index1
< bb_index2
? -1 : 1;
2844 /* Different group IDs lead never belong to the same group. */
2845 if (dra_pair
.second
!= drb_pair
.second
)
2846 return dra_pair
.second
< drb_pair
.second
? -1 : 1;
2848 /* Ordering of DRs according to base. */
2849 cmp
= data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
2850 DR_BASE_ADDRESS (drb
));
2854 /* And according to DR_OFFSET. */
2855 cmp
= data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
));
2859 /* Put reads before writes. */
2860 if (DR_IS_READ (dra
) != DR_IS_READ (drb
))
2861 return DR_IS_READ (dra
) ? -1 : 1;
2863 /* Then sort after access size. */
2864 cmp
= data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
))),
2865 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
))));
2869 /* And after step. */
2870 cmp
= data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
));
2874 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2875 cmp
= data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
));
2877 return gimple_uid (DR_STMT (dra
)) < gimple_uid (DR_STMT (drb
)) ? -1 : 1;
2881 /* If OP is the result of a conversion, return the unconverted value,
2882 otherwise return null. */
2885 strip_conversion (tree op
)
2887 if (TREE_CODE (op
) != SSA_NAME
)
2889 gimple
*stmt
= SSA_NAME_DEF_STMT (op
);
2890 if (!is_gimple_assign (stmt
)
2891 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt
)))
2893 return gimple_assign_rhs1 (stmt
);
2896 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2897 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2898 be grouped in SLP mode. */
2901 can_group_stmts_p (stmt_vec_info stmt1_info
, stmt_vec_info stmt2_info
,
2904 if (gimple_assign_single_p (stmt1_info
->stmt
))
2905 return gimple_assign_single_p (stmt2_info
->stmt
);
2907 gcall
*call1
= dyn_cast
<gcall
*> (stmt1_info
->stmt
);
2908 if (call1
&& gimple_call_internal_p (call1
))
2910 /* Check for two masked loads or two masked stores. */
2911 gcall
*call2
= dyn_cast
<gcall
*> (stmt2_info
->stmt
);
2912 if (!call2
|| !gimple_call_internal_p (call2
))
2914 internal_fn ifn
= gimple_call_internal_fn (call1
);
2915 if (ifn
!= IFN_MASK_LOAD
&& ifn
!= IFN_MASK_STORE
)
2917 if (ifn
!= gimple_call_internal_fn (call2
))
2920 /* Check that the masks are the same. Cope with casts of masks,
2921 like those created by build_mask_conversion. */
2922 tree mask1
= gimple_call_arg (call1
, 2);
2923 tree mask2
= gimple_call_arg (call2
, 2);
2924 if (!operand_equal_p (mask1
, mask2
, 0)
2925 && (ifn
== IFN_MASK_STORE
|| !allow_slp_p
))
2927 mask1
= strip_conversion (mask1
);
2930 mask2
= strip_conversion (mask2
);
2933 if (!operand_equal_p (mask1
, mask2
, 0))
2942 /* Function vect_analyze_data_ref_accesses.
2944 Analyze the access pattern of all the data references in the loop.
2946 FORNOW: the only access pattern that is considered vectorizable is a
2947 simple step 1 (consecutive) access.
2949 FORNOW: handle only arrays and pointer accesses. */
2952 vect_analyze_data_ref_accesses (vec_info
*vinfo
,
2953 vec
<int> *dataref_groups
)
2956 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
2958 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2960 if (datarefs
.is_empty ())
2961 return opt_result::success ();
2963 /* Sort the array of datarefs to make building the interleaving chains
2964 linear. Don't modify the original vector's order, it is needed for
2965 determining what dependencies are reversed. */
2966 vec
<data_ref_pair
> datarefs_copy
;
2967 datarefs_copy
.create (datarefs
.length ());
2968 for (unsigned i
= 0; i
< datarefs
.length (); i
++)
2970 int group_id
= dataref_groups
? (*dataref_groups
)[i
] : 0;
2971 datarefs_copy
.quick_push (data_ref_pair (datarefs
[i
], group_id
));
2973 datarefs_copy
.qsort (dr_group_sort_cmp
);
2974 hash_set
<stmt_vec_info
> to_fixup
;
2976 /* Build the interleaving chains. */
2977 for (i
= 0; i
< datarefs_copy
.length () - 1;)
2979 data_reference_p dra
= datarefs_copy
[i
].first
;
2980 int dra_group_id
= datarefs_copy
[i
].second
;
2981 dr_vec_info
*dr_info_a
= vinfo
->lookup_dr (dra
);
2982 stmt_vec_info stmtinfo_a
= dr_info_a
->stmt
;
2983 stmt_vec_info lastinfo
= NULL
;
2984 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a
)
2985 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a
))
2990 for (i
= i
+ 1; i
< datarefs_copy
.length (); ++i
)
2992 data_reference_p drb
= datarefs_copy
[i
].first
;
2993 int drb_group_id
= datarefs_copy
[i
].second
;
2994 dr_vec_info
*dr_info_b
= vinfo
->lookup_dr (drb
);
2995 stmt_vec_info stmtinfo_b
= dr_info_b
->stmt
;
2996 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b
)
2997 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b
))
3000 /* ??? Imperfect sorting (non-compatible types, non-modulo
3001 accesses, same accesses) can lead to a group to be artificially
3002 split here as we don't just skip over those. If it really
3003 matters we can push those to a worklist and re-iterate
3004 over them. The we can just skip ahead to the next DR here. */
3006 /* DRs in a different BBs should not be put into the same
3007 interleaving group. */
3008 int bb_index1
= gimple_bb (DR_STMT (dra
))->index
;
3009 int bb_index2
= gimple_bb (DR_STMT (drb
))->index
;
3010 if (bb_index1
!= bb_index2
)
3013 if (dra_group_id
!= drb_group_id
)
3016 /* Check that the data-refs have same first location (except init)
3017 and they are both either store or load (not load and store,
3018 not masked loads or stores). */
3019 if (DR_IS_READ (dra
) != DR_IS_READ (drb
)
3020 || data_ref_compare_tree (DR_BASE_ADDRESS (dra
),
3021 DR_BASE_ADDRESS (drb
)) != 0
3022 || data_ref_compare_tree (DR_OFFSET (dra
), DR_OFFSET (drb
)) != 0
3023 || !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, true))
3026 /* Check that the data-refs have the same constant size. */
3027 tree sza
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra
)));
3028 tree szb
= TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb
)));
3029 if (!tree_fits_uhwi_p (sza
)
3030 || !tree_fits_uhwi_p (szb
)
3031 || !tree_int_cst_equal (sza
, szb
))
3034 /* Check that the data-refs have the same step. */
3035 if (data_ref_compare_tree (DR_STEP (dra
), DR_STEP (drb
)) != 0)
3038 /* Check the types are compatible.
3039 ??? We don't distinguish this during sorting. */
3040 if (!types_compatible_p (TREE_TYPE (DR_REF (dra
)),
3041 TREE_TYPE (DR_REF (drb
))))
3044 /* Check that the DR_INITs are compile-time constants. */
3045 if (TREE_CODE (DR_INIT (dra
)) != INTEGER_CST
3046 || TREE_CODE (DR_INIT (drb
)) != INTEGER_CST
)
3049 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3050 just hold extra information. */
3051 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a
)
3052 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b
)
3053 && data_ref_compare_tree (DR_INIT (dra
), DR_INIT (drb
)) == 0)
3056 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3057 HOST_WIDE_INT init_a
= TREE_INT_CST_LOW (DR_INIT (dra
));
3058 HOST_WIDE_INT init_b
= TREE_INT_CST_LOW (DR_INIT (drb
));
3059 HOST_WIDE_INT init_prev
3060 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy
[i
-1].first
));
3061 gcc_assert (init_a
<= init_b
3062 && init_a
<= init_prev
3063 && init_prev
<= init_b
);
3065 /* Do not place the same access in the interleaving chain twice. */
3066 if (init_b
== init_prev
)
3068 gcc_assert (gimple_uid (DR_STMT (datarefs_copy
[i
-1].first
))
3069 < gimple_uid (DR_STMT (drb
)));
3070 /* Simply link in duplicates and fix up the chain below. */
3074 /* If init_b == init_a + the size of the type * k, we have an
3075 interleaving, and DRA is accessed before DRB. */
3076 HOST_WIDE_INT type_size_a
= tree_to_uhwi (sza
);
3077 if (type_size_a
== 0
3078 || (init_b
- init_a
) % type_size_a
!= 0)
3081 /* If we have a store, the accesses are adjacent. This splits
3082 groups into chunks we support (we don't support vectorization
3083 of stores with gaps). */
3084 if (!DR_IS_READ (dra
) && init_b
- init_prev
!= type_size_a
)
3087 /* If the step (if not zero or non-constant) is smaller than the
3088 difference between data-refs' inits this splits groups into
3090 if (tree_fits_shwi_p (DR_STEP (dra
)))
3092 unsigned HOST_WIDE_INT step
3093 = absu_hwi (tree_to_shwi (DR_STEP (dra
)));
3095 && step
<= (unsigned HOST_WIDE_INT
)(init_b
- init_a
))
3100 if (dump_enabled_p ())
3101 dump_printf_loc (MSG_NOTE
, vect_location
,
3103 ? "Detected interleaving load %T and %T\n"
3104 : "Detected interleaving store %T and %T\n",
3105 DR_REF (dra
), DR_REF (drb
));
3107 /* Link the found element into the group list. */
3108 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3110 DR_GROUP_FIRST_ELEMENT (stmtinfo_a
) = stmtinfo_a
;
3111 lastinfo
= stmtinfo_a
;
3113 DR_GROUP_FIRST_ELEMENT (stmtinfo_b
) = stmtinfo_a
;
3114 DR_GROUP_NEXT_ELEMENT (lastinfo
) = stmtinfo_b
;
3115 lastinfo
= stmtinfo_b
;
3117 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
)
3118 = !can_group_stmts_p (stmtinfo_a
, stmtinfo_b
, false);
3120 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a
))
3121 dump_printf_loc (MSG_NOTE
, vect_location
,
3122 "Load suitable for SLP vectorization only.\n");
3124 if (init_b
== init_prev
3125 && !to_fixup
.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a
))
3126 && dump_enabled_p ())
3127 dump_printf_loc (MSG_NOTE
, vect_location
,
3128 "Queuing group with duplicate access for fixup\n");
3132 /* Fixup groups with duplicate entries by splitting it. */
3135 hash_set
<stmt_vec_info
>::iterator it
= to_fixup
.begin ();
3136 if (!(it
!= to_fixup
.end ()))
3138 stmt_vec_info grp
= *it
;
3139 to_fixup
.remove (grp
);
3141 /* Find the earliest duplicate group member. */
3142 unsigned first_duplicate
= -1u;
3143 stmt_vec_info next
, g
= grp
;
3144 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3146 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next
)->dr
),
3147 DR_INIT (STMT_VINFO_DR_INFO (g
)->dr
))
3148 && gimple_uid (STMT_VINFO_STMT (next
)) < first_duplicate
)
3149 first_duplicate
= gimple_uid (STMT_VINFO_STMT (next
));
3152 if (first_duplicate
== -1U)
3155 /* Then move all stmts after the first duplicate to a new group.
3156 Note this is a heuristic but one with the property that *it
3157 is fixed up completely. */
3159 stmt_vec_info newgroup
= NULL
, ng
= grp
;
3160 while ((next
= DR_GROUP_NEXT_ELEMENT (g
)))
3162 if (gimple_uid (STMT_VINFO_STMT (next
)) >= first_duplicate
)
3164 DR_GROUP_NEXT_ELEMENT (g
) = DR_GROUP_NEXT_ELEMENT (next
);
3168 DR_GROUP_NEXT_ELEMENT (ng
) = next
;
3170 DR_GROUP_FIRST_ELEMENT (ng
) = newgroup
;
3173 g
= DR_GROUP_NEXT_ELEMENT (g
);
3175 DR_GROUP_NEXT_ELEMENT (ng
) = NULL
;
3177 /* Fixup the new group which still may contain duplicates. */
3178 to_fixup
.add (newgroup
);
3181 data_ref_pair
*dr_pair
;
3182 FOR_EACH_VEC_ELT (datarefs_copy
, i
, dr_pair
)
3184 dr_vec_info
*dr_info
= vinfo
->lookup_dr (dr_pair
->first
);
3185 if (STMT_VINFO_VECTORIZABLE (dr_info
->stmt
)
3186 && !vect_analyze_data_ref_access (vinfo
, dr_info
))
3188 if (dump_enabled_p ())
3189 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3190 "not vectorized: complicated access pattern.\n");
3192 if (is_a
<bb_vec_info
> (vinfo
))
3194 /* Mark the statement as not vectorizable. */
3195 STMT_VINFO_VECTORIZABLE (dr_info
->stmt
) = false;
3200 datarefs_copy
.release ();
3201 return opt_result::failure_at (dr_info
->stmt
->stmt
,
3203 " complicated access pattern.\n");
3208 datarefs_copy
.release ();
3209 return opt_result::success ();
3212 /* Function vect_vfa_segment_size.
3215 DR_INFO: The data reference.
3216 LENGTH_FACTOR: segment length to consider.
3218 Return a value suitable for the dr_with_seg_len::seg_len field.
3219 This is the "distance travelled" by the pointer from the first
3220 iteration in the segment to the last. Note that it does not include
3221 the size of the access; in effect it only describes the first byte. */
3224 vect_vfa_segment_size (dr_vec_info
*dr_info
, tree length_factor
)
3226 length_factor
= size_binop (MINUS_EXPR
,
3227 fold_convert (sizetype
, length_factor
),
3229 return size_binop (MULT_EXPR
, fold_convert (sizetype
, DR_STEP (dr_info
->dr
)),
3233 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3234 gives the worst-case number of bytes covered by the segment. */
3236 static unsigned HOST_WIDE_INT
3237 vect_vfa_access_size (vec_info
*vinfo
, dr_vec_info
*dr_info
)
3239 stmt_vec_info stmt_vinfo
= dr_info
->stmt
;
3240 tree ref_type
= TREE_TYPE (DR_REF (dr_info
->dr
));
3241 unsigned HOST_WIDE_INT ref_size
= tree_to_uhwi (TYPE_SIZE_UNIT (ref_type
));
3242 unsigned HOST_WIDE_INT access_size
= ref_size
;
3243 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
))
3245 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo
) == stmt_vinfo
);
3246 access_size
*= DR_GROUP_SIZE (stmt_vinfo
) - DR_GROUP_GAP (stmt_vinfo
);
3248 if (STMT_VINFO_VEC_STMTS (stmt_vinfo
).exists ()
3249 && (vect_supportable_dr_alignment (vinfo
, dr_info
, false)
3250 == dr_explicit_realign_optimized
))
3252 /* We might access a full vector's worth. */
3253 tree vectype
= STMT_VINFO_VECTYPE (stmt_vinfo
);
3254 access_size
+= tree_to_uhwi (TYPE_SIZE_UNIT (vectype
)) - ref_size
;
3259 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3263 vect_vfa_align (dr_vec_info
*dr_info
)
3265 return dr_alignment (dr_info
->dr
);
3268 /* Function vect_no_alias_p.
3270 Given data references A and B with equal base and offset, see whether
3271 the alias relation can be decided at compilation time. Return 1 if
3272 it can and the references alias, 0 if it can and the references do
3273 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3274 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3275 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3278 vect_compile_time_alias (dr_vec_info
*a
, dr_vec_info
*b
,
3279 tree segment_length_a
, tree segment_length_b
,
3280 unsigned HOST_WIDE_INT access_size_a
,
3281 unsigned HOST_WIDE_INT access_size_b
)
3283 poly_offset_int offset_a
= wi::to_poly_offset (DR_INIT (a
->dr
));
3284 poly_offset_int offset_b
= wi::to_poly_offset (DR_INIT (b
->dr
));
3285 poly_uint64 const_length_a
;
3286 poly_uint64 const_length_b
;
3288 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3289 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3291 if (tree_int_cst_compare (DR_STEP (a
->dr
), size_zero_node
) < 0)
3293 const_length_a
= (-wi::to_poly_wide (segment_length_a
)).force_uhwi ();
3294 offset_a
-= const_length_a
;
3297 const_length_a
= tree_to_poly_uint64 (segment_length_a
);
3298 if (tree_int_cst_compare (DR_STEP (b
->dr
), size_zero_node
) < 0)
3300 const_length_b
= (-wi::to_poly_wide (segment_length_b
)).force_uhwi ();
3301 offset_b
-= const_length_b
;
3304 const_length_b
= tree_to_poly_uint64 (segment_length_b
);
3306 const_length_a
+= access_size_a
;
3307 const_length_b
+= access_size_b
;
3309 if (ranges_known_overlap_p (offset_a
, const_length_a
,
3310 offset_b
, const_length_b
))
3313 if (!ranges_maybe_overlap_p (offset_a
, const_length_a
,
3314 offset_b
, const_length_b
))
3320 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3324 dependence_distance_ge_vf (data_dependence_relation
*ddr
,
3325 unsigned int loop_depth
, poly_uint64 vf
)
3327 if (DDR_ARE_DEPENDENT (ddr
) != NULL_TREE
3328 || DDR_NUM_DIST_VECTS (ddr
) == 0)
3331 /* If the dependence is exact, we should have limited the VF instead. */
3332 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr
));
3335 lambda_vector dist_v
;
3336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
3338 HOST_WIDE_INT dist
= dist_v
[loop_depth
];
3340 && !(dist
> 0 && DDR_REVERSED_P (ddr
))
3341 && maybe_lt ((unsigned HOST_WIDE_INT
) abs_hwi (dist
), vf
))
3345 if (dump_enabled_p ())
3346 dump_printf_loc (MSG_NOTE
, vect_location
,
3347 "dependence distance between %T and %T is >= VF\n",
3348 DR_REF (DDR_A (ddr
)), DR_REF (DDR_B (ddr
)));
3353 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3356 dump_lower_bound (dump_flags_t dump_kind
, const vec_lower_bound
&lower_bound
)
3358 dump_printf (dump_kind
, "%s (%T) >= ",
3359 lower_bound
.unsigned_p
? "unsigned" : "abs",
3361 dump_dec (dump_kind
, lower_bound
.min_value
);
3364 /* Record that the vectorized loop requires the vec_lower_bound described
3365 by EXPR, UNSIGNED_P and MIN_VALUE. */
3368 vect_check_lower_bound (loop_vec_info loop_vinfo
, tree expr
, bool unsigned_p
,
3369 poly_uint64 min_value
)
3371 vec
<vec_lower_bound
> lower_bounds
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
);
3372 for (unsigned int i
= 0; i
< lower_bounds
.length (); ++i
)
3373 if (operand_equal_p (lower_bounds
[i
].expr
, expr
, 0))
3375 unsigned_p
&= lower_bounds
[i
].unsigned_p
;
3376 min_value
= upper_bound (lower_bounds
[i
].min_value
, min_value
);
3377 if (lower_bounds
[i
].unsigned_p
!= unsigned_p
3378 || maybe_lt (lower_bounds
[i
].min_value
, min_value
))
3380 lower_bounds
[i
].unsigned_p
= unsigned_p
;
3381 lower_bounds
[i
].min_value
= min_value
;
3382 if (dump_enabled_p ())
3384 dump_printf_loc (MSG_NOTE
, vect_location
,
3385 "updating run-time check to ");
3386 dump_lower_bound (MSG_NOTE
, lower_bounds
[i
]);
3387 dump_printf (MSG_NOTE
, "\n");
3393 vec_lower_bound
lower_bound (expr
, unsigned_p
, min_value
);
3394 if (dump_enabled_p ())
3396 dump_printf_loc (MSG_NOTE
, vect_location
, "need a run-time check that ");
3397 dump_lower_bound (MSG_NOTE
, lower_bound
);
3398 dump_printf (MSG_NOTE
, "\n");
3400 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).safe_push (lower_bound
);
3403 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3404 will span fewer than GAP bytes. */
3407 vect_small_gap_p (loop_vec_info loop_vinfo
, dr_vec_info
*dr_info
,
3410 stmt_vec_info stmt_info
= dr_info
->stmt
;
3412 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo
));
3413 if (DR_GROUP_FIRST_ELEMENT (stmt_info
))
3414 count
*= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info
));
3415 return (estimated_poly_value (gap
)
3416 <= count
* vect_get_scalar_dr_size (dr_info
));
3419 /* Return true if we know that there is no alias between DR_INFO_A and
3420 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3421 When returning true, set *LOWER_BOUND_OUT to this N. */
3424 vectorizable_with_step_bound_p (dr_vec_info
*dr_info_a
, dr_vec_info
*dr_info_b
,
3425 poly_uint64
*lower_bound_out
)
3427 /* Check that there is a constant gap of known sign between DR_A
3429 data_reference
*dr_a
= dr_info_a
->dr
;
3430 data_reference
*dr_b
= dr_info_b
->dr
;
3431 poly_int64 init_a
, init_b
;
3432 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a
), DR_BASE_ADDRESS (dr_b
), 0)
3433 || !operand_equal_p (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
), 0)
3434 || !operand_equal_p (DR_STEP (dr_a
), DR_STEP (dr_b
), 0)
3435 || !poly_int_tree_p (DR_INIT (dr_a
), &init_a
)
3436 || !poly_int_tree_p (DR_INIT (dr_b
), &init_b
)
3437 || !ordered_p (init_a
, init_b
))
3440 /* Sort DR_A and DR_B by the address they access. */
3441 if (maybe_lt (init_b
, init_a
))
3443 std::swap (init_a
, init_b
);
3444 std::swap (dr_info_a
, dr_info_b
);
3445 std::swap (dr_a
, dr_b
);
3448 /* If the two accesses could be dependent within a scalar iteration,
3449 make sure that we'd retain their order. */
3450 if (maybe_gt (init_a
+ vect_get_scalar_dr_size (dr_info_a
), init_b
)
3451 && !vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
))
3454 /* There is no alias if abs (DR_STEP) is greater than or equal to
3455 the bytes spanned by the combination of the two accesses. */
3456 *lower_bound_out
= init_b
+ vect_get_scalar_dr_size (dr_info_b
) - init_a
;
3460 /* Function vect_prune_runtime_alias_test_list.
3462 Prune a list of ddrs to be tested at run-time by versioning for alias.
3463 Merge several alias checks into one if possible.
3464 Return FALSE if resulting list of ddrs is longer then allowed by
3465 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3468 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo
)
3470 typedef pair_hash
<tree_operand_hash
, tree_operand_hash
> tree_pair_hash
;
3471 hash_set
<tree_pair_hash
> compared_objects
;
3473 vec
<ddr_p
> may_alias_ddrs
= LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo
);
3474 vec
<dr_with_seg_len_pair_t
> &comp_alias_ddrs
3475 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
);
3476 vec
<vec_object_pair
> &check_unequal_addrs
3477 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
);
3478 poly_uint64 vect_factor
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
3479 tree scalar_loop_iters
= LOOP_VINFO_NITERS (loop_vinfo
);
3485 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3487 /* Step values are irrelevant for aliasing if the number of vector
3488 iterations is equal to the number of scalar iterations (which can
3489 happen for fully-SLP loops). */
3490 bool ignore_step_p
= known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), 1U);
3494 /* Convert the checks for nonzero steps into bound tests. */
3496 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo
), i
, value
)
3497 vect_check_lower_bound (loop_vinfo
, value
, true, 1);
3500 if (may_alias_ddrs
.is_empty ())
3501 return opt_result::success ();
3503 comp_alias_ddrs
.create (may_alias_ddrs
.length ());
3505 unsigned int loop_depth
3506 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo
)->num
,
3507 LOOP_VINFO_LOOP_NEST (loop_vinfo
));
3509 /* First, we collect all data ref pairs for aliasing checks. */
3510 FOR_EACH_VEC_ELT (may_alias_ddrs
, i
, ddr
)
3512 poly_uint64 lower_bound
;
3513 tree segment_length_a
, segment_length_b
;
3514 unsigned HOST_WIDE_INT access_size_a
, access_size_b
;
3515 unsigned int align_a
, align_b
;
3517 /* Ignore the alias if the VF we chose ended up being no greater
3518 than the dependence distance. */
3519 if (dependence_distance_ge_vf (ddr
, loop_depth
, vect_factor
))
3522 if (DDR_OBJECT_A (ddr
))
3524 vec_object_pair
new_pair (DDR_OBJECT_A (ddr
), DDR_OBJECT_B (ddr
));
3525 if (!compared_objects
.add (new_pair
))
3527 if (dump_enabled_p ())
3528 dump_printf_loc (MSG_NOTE
, vect_location
,
3529 "checking that %T and %T"
3530 " have different addresses\n",
3531 new_pair
.first
, new_pair
.second
);
3532 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).safe_push (new_pair
);
3537 dr_vec_info
*dr_info_a
= loop_vinfo
->lookup_dr (DDR_A (ddr
));
3538 stmt_vec_info stmt_info_a
= dr_info_a
->stmt
;
3540 dr_vec_info
*dr_info_b
= loop_vinfo
->lookup_dr (DDR_B (ddr
));
3541 stmt_vec_info stmt_info_b
= dr_info_b
->stmt
;
3543 bool preserves_scalar_order_p
3544 = vect_preserves_scalar_order_p (dr_info_a
, dr_info_b
);
3546 /* Skip the pair if inter-iteration dependencies are irrelevant
3547 and intra-iteration dependencies are guaranteed to be honored. */
3549 && (preserves_scalar_order_p
3550 || vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE
, vect_location
,
3555 "no need for alias check between "
3556 "%T and %T when VF is 1\n",
3557 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3561 /* See whether we can handle the alias using a bounds check on
3562 the step, and whether that's likely to be the best approach.
3563 (It might not be, for example, if the minimum step is much larger
3564 than the number of bytes handled by one vector iteration.) */
3566 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) != INTEGER_CST
3567 && vectorizable_with_step_bound_p (dr_info_a
, dr_info_b
,
3569 && (vect_small_gap_p (loop_vinfo
, dr_info_a
, lower_bound
)
3570 || vect_small_gap_p (loop_vinfo
, dr_info_b
, lower_bound
)))
3572 bool unsigned_p
= dr_known_forward_stride_p (dr_info_a
->dr
);
3573 if (dump_enabled_p ())
3575 dump_printf_loc (MSG_NOTE
, vect_location
, "no alias between "
3576 "%T and %T when the step %T is outside ",
3577 DR_REF (dr_info_a
->dr
),
3578 DR_REF (dr_info_b
->dr
),
3579 DR_STEP (dr_info_a
->dr
));
3581 dump_printf (MSG_NOTE
, "[0");
3584 dump_printf (MSG_NOTE
, "(");
3585 dump_dec (MSG_NOTE
, poly_int64 (-lower_bound
));
3587 dump_printf (MSG_NOTE
, ", ");
3588 dump_dec (MSG_NOTE
, lower_bound
);
3589 dump_printf (MSG_NOTE
, ")\n");
3591 vect_check_lower_bound (loop_vinfo
, DR_STEP (dr_info_a
->dr
),
3592 unsigned_p
, lower_bound
);
3596 stmt_vec_info dr_group_first_a
= DR_GROUP_FIRST_ELEMENT (stmt_info_a
);
3597 if (dr_group_first_a
)
3599 stmt_info_a
= dr_group_first_a
;
3600 dr_info_a
= STMT_VINFO_DR_INFO (stmt_info_a
);
3603 stmt_vec_info dr_group_first_b
= DR_GROUP_FIRST_ELEMENT (stmt_info_b
);
3604 if (dr_group_first_b
)
3606 stmt_info_b
= dr_group_first_b
;
3607 dr_info_b
= STMT_VINFO_DR_INFO (stmt_info_b
);
3612 segment_length_a
= size_zero_node
;
3613 segment_length_b
= size_zero_node
;
3617 if (!operand_equal_p (DR_STEP (dr_info_a
->dr
),
3618 DR_STEP (dr_info_b
->dr
), 0))
3619 length_factor
= scalar_loop_iters
;
3621 length_factor
= size_int (vect_factor
);
3622 segment_length_a
= vect_vfa_segment_size (dr_info_a
, length_factor
);
3623 segment_length_b
= vect_vfa_segment_size (dr_info_b
, length_factor
);
3625 access_size_a
= vect_vfa_access_size (loop_vinfo
, dr_info_a
);
3626 access_size_b
= vect_vfa_access_size (loop_vinfo
, dr_info_b
);
3627 align_a
= vect_vfa_align (dr_info_a
);
3628 align_b
= vect_vfa_align (dr_info_b
);
3630 /* See whether the alias is known at compilation time. */
3631 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a
->dr
),
3632 DR_BASE_ADDRESS (dr_info_b
->dr
), 0)
3633 && operand_equal_p (DR_OFFSET (dr_info_a
->dr
),
3634 DR_OFFSET (dr_info_b
->dr
), 0)
3635 && TREE_CODE (DR_STEP (dr_info_a
->dr
)) == INTEGER_CST
3636 && TREE_CODE (DR_STEP (dr_info_b
->dr
)) == INTEGER_CST
3637 && poly_int_tree_p (segment_length_a
)
3638 && poly_int_tree_p (segment_length_b
))
3640 int res
= vect_compile_time_alias (dr_info_a
, dr_info_b
,
3645 if (res
>= 0 && dump_enabled_p ())
3647 dump_printf_loc (MSG_NOTE
, vect_location
,
3648 "can tell at compile time that %T and %T",
3649 DR_REF (dr_info_a
->dr
), DR_REF (dr_info_b
->dr
));
3651 dump_printf (MSG_NOTE
, " do not alias\n");
3653 dump_printf (MSG_NOTE
, " alias\n");
3660 return opt_result::failure_at (stmt_info_b
->stmt
,
3662 " compilation time alias: %G%G",
3667 dr_with_seg_len
dr_a (dr_info_a
->dr
, segment_length_a
,
3668 access_size_a
, align_a
);
3669 dr_with_seg_len
dr_b (dr_info_b
->dr
, segment_length_b
,
3670 access_size_b
, align_b
);
3671 /* Canonicalize the order to be the one that's needed for accurate
3672 RAW, WAR and WAW flags, in cases where the data references are
3673 well-ordered. The order doesn't really matter otherwise,
3674 but we might as well be consistent. */
3675 if (get_later_stmt (stmt_info_a
, stmt_info_b
) == stmt_info_a
)
3676 std::swap (dr_a
, dr_b
);
3678 dr_with_seg_len_pair_t dr_with_seg_len_pair
3679 (dr_a
, dr_b
, (preserves_scalar_order_p
3680 ? dr_with_seg_len_pair_t::WELL_ORDERED
3681 : dr_with_seg_len_pair_t::REORDERED
));
3683 comp_alias_ddrs
.safe_push (dr_with_seg_len_pair
);
3686 prune_runtime_alias_test_list (&comp_alias_ddrs
, vect_factor
);
3688 unsigned int count
= (comp_alias_ddrs
.length ()
3689 + check_unequal_addrs
.length ());
3691 if (count
&& flag_vect_cost_model
== VECT_COST_MODEL_VERY_CHEAP
)
3692 return opt_result::failure_at
3693 (vect_location
, "would need a runtime alias check\n");
3695 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_NOTE
, vect_location
,
3697 "improved number of alias checks from %d to %d\n",
3698 may_alias_ddrs
.length (), count
);
3699 unsigned limit
= param_vect_max_version_for_alias_checks
;
3700 if (flag_simd_cost_model
== VECT_COST_MODEL_CHEAP
)
3701 limit
= param_vect_max_version_for_alias_checks
* 6 / 10;
3703 return opt_result::failure_at
3705 "number of versioning for alias run-time tests exceeds %d "
3706 "(--param vect-max-version-for-alias-checks)\n", limit
);
3708 return opt_result::success ();
3711 /* Check whether we can use an internal function for a gather load
3712 or scatter store. READ_P is true for loads and false for stores.
3713 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3714 the type of the memory elements being loaded or stored. OFFSET_TYPE
3715 is the type of the offset that is being applied to the invariant
3716 base address. SCALE is the amount by which the offset should
3717 be multiplied *after* it has been converted to address width.
3719 Return true if the function is supported, storing the function id in
3720 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3723 vect_gather_scatter_fn_p (vec_info
*vinfo
, bool read_p
, bool masked_p
,
3724 tree vectype
, tree memory_type
, tree offset_type
,
3725 int scale
, internal_fn
*ifn_out
,
3726 tree
*offset_vectype_out
)
3728 unsigned int memory_bits
= tree_to_uhwi (TYPE_SIZE (memory_type
));
3729 unsigned int element_bits
= vector_element_bits (vectype
);
3730 if (element_bits
!= memory_bits
)
3731 /* For now the vector elements must be the same width as the
3735 /* Work out which function we need. */
3738 ifn
= masked_p
? IFN_MASK_GATHER_LOAD
: IFN_GATHER_LOAD
;
3740 ifn
= masked_p
? IFN_MASK_SCATTER_STORE
: IFN_SCATTER_STORE
;
3744 tree offset_vectype
= get_vectype_for_scalar_type (vinfo
, offset_type
);
3745 if (!offset_vectype
)
3748 /* Test whether the target supports this combination. */
3749 if (internal_gather_scatter_fn_supported_p (ifn
, vectype
, memory_type
,
3750 offset_vectype
, scale
))
3753 *offset_vectype_out
= offset_vectype
;
3757 if (TYPE_PRECISION (offset_type
) >= POINTER_SIZE
3758 && TYPE_PRECISION (offset_type
) >= element_bits
)
3761 offset_type
= build_nonstandard_integer_type
3762 (TYPE_PRECISION (offset_type
) * 2, TYPE_UNSIGNED (offset_type
));
3766 /* STMT_INFO is a call to an internal gather load or scatter store function.
3767 Describe the operation in INFO. */
3770 vect_describe_gather_scatter_call (stmt_vec_info stmt_info
,
3771 gather_scatter_info
*info
)
3773 gcall
*call
= as_a
<gcall
*> (stmt_info
->stmt
);
3774 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3775 data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3777 info
->ifn
= gimple_call_internal_fn (call
);
3778 info
->decl
= NULL_TREE
;
3779 info
->base
= gimple_call_arg (call
, 0);
3780 info
->offset
= gimple_call_arg (call
, 1);
3781 info
->offset_dt
= vect_unknown_def_type
;
3782 info
->offset_vectype
= NULL_TREE
;
3783 info
->scale
= TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
3784 info
->element_type
= TREE_TYPE (vectype
);
3785 info
->memory_type
= TREE_TYPE (DR_REF (dr
));
3788 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3789 gather load or scatter store. Describe the operation in *INFO if so. */
3792 vect_check_gather_scatter (stmt_vec_info stmt_info
, loop_vec_info loop_vinfo
,
3793 gather_scatter_info
*info
)
3795 HOST_WIDE_INT scale
= 1;
3796 poly_int64 pbitpos
, pbitsize
;
3797 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
3798 struct data_reference
*dr
= STMT_VINFO_DATA_REF (stmt_info
);
3799 tree offtype
= NULL_TREE
;
3800 tree decl
= NULL_TREE
, base
, off
;
3801 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
3802 tree memory_type
= TREE_TYPE (DR_REF (dr
));
3804 int punsignedp
, reversep
, pvolatilep
= 0;
3806 tree offset_vectype
;
3807 bool masked_p
= false;
3809 /* See whether this is already a call to a gather/scatter internal function.
3810 If not, see whether it's a masked load or store. */
3811 gcall
*call
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
3812 if (call
&& gimple_call_internal_p (call
))
3814 ifn
= gimple_call_internal_fn (call
);
3815 if (internal_gather_scatter_fn_p (ifn
))
3817 vect_describe_gather_scatter_call (stmt_info
, info
);
3820 masked_p
= (ifn
== IFN_MASK_LOAD
|| ifn
== IFN_MASK_STORE
);
3823 /* True if we should aim to use internal functions rather than
3824 built-in functions. */
3825 bool use_ifn_p
= (DR_IS_READ (dr
)
3826 ? supports_vec_gather_load_p ()
3827 : supports_vec_scatter_store_p ());
3830 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3831 see if we can use the def stmt of the address. */
3833 && TREE_CODE (base
) == MEM_REF
3834 && TREE_CODE (TREE_OPERAND (base
, 0)) == SSA_NAME
3835 && integer_zerop (TREE_OPERAND (base
, 1))
3836 && !expr_invariant_in_loop_p (loop
, TREE_OPERAND (base
, 0)))
3838 gimple
*def_stmt
= SSA_NAME_DEF_STMT (TREE_OPERAND (base
, 0));
3839 if (is_gimple_assign (def_stmt
)
3840 && gimple_assign_rhs_code (def_stmt
) == ADDR_EXPR
)
3841 base
= TREE_OPERAND (gimple_assign_rhs1 (def_stmt
), 0);
3844 /* The gather and scatter builtins need address of the form
3845 loop_invariant + vector * {1, 2, 4, 8}
3847 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3848 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3849 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3850 multiplications and additions in it. To get a vector, we need
3851 a single SSA_NAME that will be defined in the loop and will
3852 contain everything that is not loop invariant and that can be
3853 vectorized. The following code attempts to find such a preexistng
3854 SSA_NAME OFF and put the loop invariants into a tree BASE
3855 that can be gimplified before the loop. */
3856 base
= get_inner_reference (base
, &pbitsize
, &pbitpos
, &off
, &pmode
,
3857 &punsignedp
, &reversep
, &pvolatilep
);
3861 poly_int64 pbytepos
= exact_div (pbitpos
, BITS_PER_UNIT
);
3863 if (TREE_CODE (base
) == MEM_REF
)
3865 if (!integer_zerop (TREE_OPERAND (base
, 1)))
3867 if (off
== NULL_TREE
)
3868 off
= wide_int_to_tree (sizetype
, mem_ref_offset (base
));
3870 off
= size_binop (PLUS_EXPR
, off
,
3871 fold_convert (sizetype
, TREE_OPERAND (base
, 1)));
3873 base
= TREE_OPERAND (base
, 0);
3876 base
= build_fold_addr_expr (base
);
3878 if (off
== NULL_TREE
)
3879 off
= size_zero_node
;
3881 /* If base is not loop invariant, either off is 0, then we start with just
3882 the constant offset in the loop invariant BASE and continue with base
3883 as OFF, otherwise give up.
3884 We could handle that case by gimplifying the addition of base + off
3885 into some SSA_NAME and use that as off, but for now punt. */
3886 if (!expr_invariant_in_loop_p (loop
, base
))
3888 if (!integer_zerop (off
))
3891 base
= size_int (pbytepos
);
3893 /* Otherwise put base + constant offset into the loop invariant BASE
3894 and continue with OFF. */
3897 base
= fold_convert (sizetype
, base
);
3898 base
= size_binop (PLUS_EXPR
, base
, size_int (pbytepos
));
3901 /* OFF at this point may be either a SSA_NAME or some tree expression
3902 from get_inner_reference. Try to peel off loop invariants from it
3903 into BASE as long as possible. */
3905 while (offtype
== NULL_TREE
)
3907 enum tree_code code
;
3908 tree op0
, op1
, add
= NULL_TREE
;
3910 if (TREE_CODE (off
) == SSA_NAME
)
3912 gimple
*def_stmt
= SSA_NAME_DEF_STMT (off
);
3914 if (expr_invariant_in_loop_p (loop
, off
))
3917 if (gimple_code (def_stmt
) != GIMPLE_ASSIGN
)
3920 op0
= gimple_assign_rhs1 (def_stmt
);
3921 code
= gimple_assign_rhs_code (def_stmt
);
3922 op1
= gimple_assign_rhs2 (def_stmt
);
3926 if (get_gimple_rhs_class (TREE_CODE (off
)) == GIMPLE_TERNARY_RHS
)
3928 code
= TREE_CODE (off
);
3929 extract_ops_from_tree (off
, &code
, &op0
, &op1
);
3933 case POINTER_PLUS_EXPR
:
3935 if (expr_invariant_in_loop_p (loop
, op0
))
3940 add
= fold_convert (sizetype
, add
);
3942 add
= size_binop (MULT_EXPR
, add
, size_int (scale
));
3943 base
= size_binop (PLUS_EXPR
, base
, add
);
3946 if (expr_invariant_in_loop_p (loop
, op1
))
3954 if (expr_invariant_in_loop_p (loop
, op1
))
3956 add
= fold_convert (sizetype
, op1
);
3957 add
= size_binop (MINUS_EXPR
, size_zero_node
, add
);
3963 if (scale
== 1 && tree_fits_shwi_p (op1
))
3965 int new_scale
= tree_to_shwi (op1
);
3966 /* Only treat this as a scaling operation if the target
3967 supports it for at least some offset type. */
3969 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3970 masked_p
, vectype
, memory_type
,
3971 signed_char_type_node
,
3974 && !vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3975 masked_p
, vectype
, memory_type
,
3976 unsigned_char_type_node
,
3989 if (!POINTER_TYPE_P (TREE_TYPE (op0
))
3990 && !INTEGRAL_TYPE_P (TREE_TYPE (op0
)))
3993 /* Don't include the conversion if the target is happy with
3994 the current offset type. */
3996 && vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
),
3997 masked_p
, vectype
, memory_type
,
3998 TREE_TYPE (off
), scale
, &ifn
,
4002 if (TYPE_PRECISION (TREE_TYPE (op0
))
4003 == TYPE_PRECISION (TREE_TYPE (off
)))
4009 if (TYPE_PRECISION (TREE_TYPE (op0
))
4010 < TYPE_PRECISION (TREE_TYPE (off
)))
4013 offtype
= TREE_TYPE (off
);
4024 /* If at the end OFF still isn't a SSA_NAME or isn't
4025 defined in the loop, punt. */
4026 if (TREE_CODE (off
) != SSA_NAME
4027 || expr_invariant_in_loop_p (loop
, off
))
4030 if (offtype
== NULL_TREE
)
4031 offtype
= TREE_TYPE (off
);
4035 if (!vect_gather_scatter_fn_p (loop_vinfo
, DR_IS_READ (dr
), masked_p
,
4036 vectype
, memory_type
, offtype
, scale
,
4037 &ifn
, &offset_vectype
))
4042 if (DR_IS_READ (dr
))
4044 if (targetm
.vectorize
.builtin_gather
)
4045 decl
= targetm
.vectorize
.builtin_gather (vectype
, offtype
, scale
);
4049 if (targetm
.vectorize
.builtin_scatter
)
4050 decl
= targetm
.vectorize
.builtin_scatter (vectype
, offtype
, scale
);
4057 /* The offset vector type will be read from DECL when needed. */
4058 offset_vectype
= NULL_TREE
;
4065 info
->offset_dt
= vect_unknown_def_type
;
4066 info
->offset_vectype
= offset_vectype
;
4067 info
->scale
= scale
;
4068 info
->element_type
= TREE_TYPE (vectype
);
4069 info
->memory_type
= memory_type
;
4073 /* Find the data references in STMT, analyze them with respect to LOOP and
4074 append them to DATAREFS. Return false if datarefs in this stmt cannot
4078 vect_find_stmt_data_reference (loop_p loop
, gimple
*stmt
,
4079 vec
<data_reference_p
> *datarefs
,
4080 vec
<int> *dataref_groups
, int group_id
)
4082 /* We can ignore clobbers for dataref analysis - they are removed during
4083 loop vectorization and BB vectorization checks dependences with a
4085 if (gimple_clobber_p (stmt
))
4086 return opt_result::success ();
4088 if (gimple_has_volatile_ops (stmt
))
4089 return opt_result::failure_at (stmt
, "not vectorized: volatile type: %G",
4092 if (stmt_can_throw_internal (cfun
, stmt
))
4093 return opt_result::failure_at (stmt
,
4095 " statement can throw an exception: %G",
4098 auto_vec
<data_reference_p
, 2> refs
;
4099 opt_result res
= find_data_references_in_stmt (loop
, stmt
, &refs
);
4103 if (refs
.is_empty ())
4104 return opt_result::success ();
4106 if (refs
.length () > 1)
4108 while (!refs
.is_empty ())
4109 free_data_ref (refs
.pop ());
4110 return opt_result::failure_at (stmt
,
4111 "not vectorized: more than one "
4112 "data ref in stmt: %G", stmt
);
4115 data_reference_p dr
= refs
.pop ();
4116 if (gcall
*call
= dyn_cast
<gcall
*> (stmt
))
4117 if (!gimple_call_internal_p (call
)
4118 || (gimple_call_internal_fn (call
) != IFN_MASK_LOAD
4119 && gimple_call_internal_fn (call
) != IFN_MASK_STORE
))
4122 return opt_result::failure_at (stmt
,
4123 "not vectorized: dr in a call %G", stmt
);
4126 if (TREE_CODE (DR_REF (dr
)) == COMPONENT_REF
4127 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr
), 1)))
4130 return opt_result::failure_at (stmt
,
4132 " statement is bitfield access %G", stmt
);
4135 if (DR_BASE_ADDRESS (dr
)
4136 && TREE_CODE (DR_BASE_ADDRESS (dr
)) == INTEGER_CST
)
4139 return opt_result::failure_at (stmt
,
4141 " base addr of dr is a constant\n");
4144 /* Check whether this may be a SIMD lane access and adjust the
4145 DR to make it easier for us to handle it. */
4148 && (!DR_BASE_ADDRESS (dr
)
4153 struct data_reference
*newdr
4154 = create_data_ref (NULL
, loop_containing_stmt (stmt
), DR_REF (dr
), stmt
,
4155 DR_IS_READ (dr
), DR_IS_CONDITIONAL_IN_STMT (dr
));
4156 if (DR_BASE_ADDRESS (newdr
)
4157 && DR_OFFSET (newdr
)
4160 && TREE_CODE (DR_INIT (newdr
)) == INTEGER_CST
4161 && integer_zerop (DR_STEP (newdr
)))
4163 tree base_address
= DR_BASE_ADDRESS (newdr
);
4164 tree off
= DR_OFFSET (newdr
);
4165 tree step
= ssize_int (1);
4166 if (integer_zerop (off
)
4167 && TREE_CODE (base_address
) == POINTER_PLUS_EXPR
)
4169 off
= TREE_OPERAND (base_address
, 1);
4170 base_address
= TREE_OPERAND (base_address
, 0);
4173 if (TREE_CODE (off
) == MULT_EXPR
4174 && tree_fits_uhwi_p (TREE_OPERAND (off
, 1)))
4176 step
= TREE_OPERAND (off
, 1);
4177 off
= TREE_OPERAND (off
, 0);
4180 if (CONVERT_EXPR_P (off
)
4181 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off
, 0)))
4182 < TYPE_PRECISION (TREE_TYPE (off
))))
4183 off
= TREE_OPERAND (off
, 0);
4184 if (TREE_CODE (off
) == SSA_NAME
)
4186 gimple
*def
= SSA_NAME_DEF_STMT (off
);
4187 /* Look through widening conversion. */
4188 if (is_gimple_assign (def
)
4189 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def
)))
4191 tree rhs1
= gimple_assign_rhs1 (def
);
4192 if (TREE_CODE (rhs1
) == SSA_NAME
4193 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1
))
4194 && (TYPE_PRECISION (TREE_TYPE (off
))
4195 > TYPE_PRECISION (TREE_TYPE (rhs1
))))
4196 def
= SSA_NAME_DEF_STMT (rhs1
);
4198 if (is_gimple_call (def
)
4199 && gimple_call_internal_p (def
)
4200 && (gimple_call_internal_fn (def
) == IFN_GOMP_SIMD_LANE
))
4202 tree arg
= gimple_call_arg (def
, 0);
4203 tree reft
= TREE_TYPE (DR_REF (newdr
));
4204 gcc_assert (TREE_CODE (arg
) == SSA_NAME
);
4205 arg
= SSA_NAME_VAR (arg
);
4206 if (arg
== loop
->simduid
4208 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft
), step
))
4210 DR_BASE_ADDRESS (newdr
) = base_address
;
4211 DR_OFFSET (newdr
) = ssize_int (0);
4212 DR_STEP (newdr
) = step
;
4213 DR_OFFSET_ALIGNMENT (newdr
) = BIGGEST_ALIGNMENT
;
4214 DR_STEP_ALIGNMENT (newdr
) = highest_pow2_factor (step
);
4215 /* Mark as simd-lane access. */
4216 tree arg2
= gimple_call_arg (def
, 1);
4217 newdr
->aux
= (void *) (-1 - tree_to_uhwi (arg2
));
4219 datarefs
->safe_push (newdr
);
4221 dataref_groups
->safe_push (group_id
);
4222 return opt_result::success ();
4227 free_data_ref (newdr
);
4230 datarefs
->safe_push (dr
);
4232 dataref_groups
->safe_push (group_id
);
4233 return opt_result::success ();
4236 /* Function vect_analyze_data_refs.
4238 Find all the data references in the loop or basic block.
4240 The general structure of the analysis of data refs in the vectorizer is as
4242 1- vect_analyze_data_refs(loop/bb): call
4243 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4244 in the loop/bb and their dependences.
4245 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4246 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4247 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4252 vect_analyze_data_refs (vec_info
*vinfo
, poly_uint64
*min_vf
, bool *fatal
)
4254 class loop
*loop
= NULL
;
4256 struct data_reference
*dr
;
4259 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4261 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
4262 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
4264 /* Go through the data-refs, check that the analysis succeeded. Update
4265 pointer from stmt_vec_info struct to DR and vectype. */
4267 vec
<data_reference_p
> datarefs
= vinfo
->shared
->datarefs
;
4268 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
4270 enum { SG_NONE
, GATHER
, SCATTER
} gatherscatter
= SG_NONE
;
4273 gcc_assert (DR_REF (dr
));
4274 stmt_vec_info stmt_info
= vinfo
->lookup_stmt (DR_STMT (dr
));
4275 gcc_assert (!stmt_info
->dr_aux
.dr
);
4276 stmt_info
->dr_aux
.dr
= dr
;
4277 stmt_info
->dr_aux
.stmt
= stmt_info
;
4279 /* Check that analysis of the data-ref succeeded. */
4280 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
) || !DR_INIT (dr
)
4285 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4286 && (targetm
.vectorize
.builtin_gather
!= NULL
4287 || supports_vec_gather_load_p ());
4290 && !TREE_THIS_VOLATILE (DR_REF (dr
))
4291 && (targetm
.vectorize
.builtin_scatter
!= NULL
4292 || supports_vec_scatter_store_p ());
4294 /* If target supports vector gather loads or scatter stores,
4295 see if they can't be used. */
4296 if (is_a
<loop_vec_info
> (vinfo
)
4297 && !nested_in_vect_loop_p (loop
, stmt_info
))
4299 if (maybe_gather
|| maybe_scatter
)
4302 gatherscatter
= GATHER
;
4304 gatherscatter
= SCATTER
;
4308 if (gatherscatter
== SG_NONE
)
4310 if (dump_enabled_p ())
4311 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4312 "not vectorized: data ref analysis "
4313 "failed %G", stmt_info
->stmt
);
4314 if (is_a
<bb_vec_info
> (vinfo
))
4316 /* In BB vectorization the ref can still participate
4317 in dependence analysis, we just can't vectorize it. */
4318 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4321 return opt_result::failure_at (stmt_info
->stmt
,
4323 " data ref analysis failed: %G",
4328 /* See if this was detected as SIMD lane access. */
4329 if (dr
->aux
== (void *)-1
4330 || dr
->aux
== (void *)-2
4331 || dr
->aux
== (void *)-3
4332 || dr
->aux
== (void *)-4)
4334 if (nested_in_vect_loop_p (loop
, stmt_info
))
4335 return opt_result::failure_at (stmt_info
->stmt
,
4337 " data ref analysis failed: %G",
4339 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info
)
4340 = -(uintptr_t) dr
->aux
;
4343 tree base
= get_base_address (DR_REF (dr
));
4344 if (base
&& VAR_P (base
) && DECL_NONALIASED (base
))
4346 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4348 "not vectorized: base object not addressable "
4349 "for stmt: %G", stmt_info
->stmt
);
4350 if (is_a
<bb_vec_info
> (vinfo
))
4352 /* In BB vectorization the ref can still participate
4353 in dependence analysis, we just can't vectorize it. */
4354 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4357 return opt_result::failure_at (stmt_info
->stmt
,
4358 "not vectorized: base object not"
4359 " addressable for stmt: %G",
4363 if (is_a
<loop_vec_info
> (vinfo
)
4365 && TREE_CODE (DR_STEP (dr
)) != INTEGER_CST
)
4367 if (nested_in_vect_loop_p (loop
, stmt_info
))
4368 return opt_result::failure_at (stmt_info
->stmt
,
4370 "not suitable for strided load %G",
4372 STMT_VINFO_STRIDED_P (stmt_info
) = true;
4375 /* Update DR field in stmt_vec_info struct. */
4377 /* If the dataref is in an inner-loop of the loop that is considered for
4378 for vectorization, we also want to analyze the access relative to
4379 the outer-loop (DR contains information only relative to the
4380 inner-most enclosing loop). We do that by building a reference to the
4381 first location accessed by the inner-loop, and analyze it relative to
4383 if (loop
&& nested_in_vect_loop_p (loop
, stmt_info
))
4385 /* Build a reference to the first location accessed by the
4386 inner loop: *(BASE + INIT + OFFSET). By construction,
4387 this address must be invariant in the inner loop, so we
4388 can consider it as being used in the outer loop. */
4389 tree base
= unshare_expr (DR_BASE_ADDRESS (dr
));
4390 tree offset
= unshare_expr (DR_OFFSET (dr
));
4391 tree init
= unshare_expr (DR_INIT (dr
));
4392 tree init_offset
= fold_build2 (PLUS_EXPR
, TREE_TYPE (offset
),
4394 tree init_addr
= fold_build_pointer_plus (base
, init_offset
);
4395 tree init_ref
= build_fold_indirect_ref (init_addr
);
4397 if (dump_enabled_p ())
4398 dump_printf_loc (MSG_NOTE
, vect_location
,
4399 "analyze in outer loop: %T\n", init_ref
);
4402 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info
),
4403 init_ref
, loop
, stmt_info
->stmt
);
4405 /* dr_analyze_innermost already explained the failure. */
4408 if (dump_enabled_p ())
4409 dump_printf_loc (MSG_NOTE
, vect_location
,
4410 "\touter base_address: %T\n"
4411 "\touter offset from base address: %T\n"
4412 "\touter constant offset from base address: %T\n"
4413 "\touter step: %T\n"
4414 "\touter base alignment: %d\n\n"
4415 "\touter base misalignment: %d\n"
4416 "\touter offset alignment: %d\n"
4417 "\touter step alignment: %d\n",
4418 STMT_VINFO_DR_BASE_ADDRESS (stmt_info
),
4419 STMT_VINFO_DR_OFFSET (stmt_info
),
4420 STMT_VINFO_DR_INIT (stmt_info
),
4421 STMT_VINFO_DR_STEP (stmt_info
),
4422 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info
),
4423 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info
),
4424 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info
),
4425 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info
));
4428 /* Set vectype for STMT. */
4429 scalar_type
= TREE_TYPE (DR_REF (dr
));
4430 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
);
4433 if (dump_enabled_p ())
4435 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4436 "not vectorized: no vectype for stmt: %G",
4438 dump_printf (MSG_MISSED_OPTIMIZATION
, " scalar_type: ");
4439 dump_generic_expr (MSG_MISSED_OPTIMIZATION
, TDF_DETAILS
,
4441 dump_printf (MSG_MISSED_OPTIMIZATION
, "\n");
4444 if (is_a
<bb_vec_info
> (vinfo
))
4446 /* No vector type is fine, the ref can still participate
4447 in dependence analysis, we just can't vectorize it. */
4448 STMT_VINFO_VECTORIZABLE (stmt_info
) = false;
4453 return opt_result::failure_at (stmt_info
->stmt
,
4455 " no vectype for stmt: %G"
4456 " scalar_type: %T\n",
4457 stmt_info
->stmt
, scalar_type
);
4461 if (dump_enabled_p ())
4462 dump_printf_loc (MSG_NOTE
, vect_location
,
4463 "got vectype for stmt: %G%T\n",
4464 stmt_info
->stmt
, vectype
);
4467 /* Adjust the minimal vectorization factor according to the
4469 vf
= TYPE_VECTOR_SUBPARTS (vectype
);
4470 *min_vf
= upper_bound (*min_vf
, vf
);
4472 /* Leave the BB vectorizer to pick the vector type later, based on
4473 the final dataref group size and SLP node size. */
4474 if (is_a
<loop_vec_info
> (vinfo
))
4475 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
4477 if (gatherscatter
!= SG_NONE
)
4479 gather_scatter_info gs_info
;
4480 if (!vect_check_gather_scatter (stmt_info
,
4481 as_a
<loop_vec_info
> (vinfo
),
4483 || !get_vectype_for_scalar_type (vinfo
,
4484 TREE_TYPE (gs_info
.offset
)))
4488 return opt_result::failure_at
4490 (gatherscatter
== GATHER
)
4491 ? "not vectorized: not suitable for gather load %G"
4492 : "not vectorized: not suitable for scatter store %G",
4495 STMT_VINFO_GATHER_SCATTER_P (stmt_info
) = gatherscatter
;
4499 /* We used to stop processing and prune the list here. Verify we no
4501 gcc_assert (i
== datarefs
.length ());
4503 return opt_result::success ();
4507 /* Function vect_get_new_vect_var.
4509 Returns a name for a new variable. The current naming scheme appends the
4510 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4511 the name of vectorizer generated variables, and appends that to NAME if
/* Return a new temporary register variable of TYPE for use by the
   vectorizer.  VAR_KIND selects the name prefix; if NAME is non-NULL it
   is appended to the prefix with an underscore.

   NOTE(review): this excerpt elides the prefix assignments inside each
   switch case and the surrounding switch/if structure.  */
vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
    case vect_simple_var:
      /* [gap in excerpt: prefix assignment elided]  */
    case vect_scalar_var:
      /* [gap in excerpt]  */
    case vect_pointer_var:
      /* [gap in excerpt]  */

      /* Caller supplied a NAME: build "<prefix>_<name>".  concat
	 heap-allocates; presumably freed in an elided line — TODO
	 confirm.  */
      char* tmp = concat (prefix, "_", name, NULL);
      new_vect_var = create_tmp_reg (type, tmp);
      /* [gap in excerpt]  */
      /* No NAME: the bare prefix is used.  */
      new_vect_var = create_tmp_reg (type, prefix);

  return new_vect_var;
4550 /* Like vect_get_new_vect_var but return an SSA name. */
/* Like vect_get_new_vect_var but return an anonymous SSA name of TYPE
   instead of a declared temporary.  VAR_KIND selects the name prefix;
   NAME, if non-NULL, is appended after an underscore.

   NOTE(review): prefix assignments per switch case are elided in this
   excerpt.  */
vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
    case vect_simple_var:
      /* [gap in excerpt: prefix assignment elided]  */
    case vect_scalar_var:
      /* [gap in excerpt]  */
    case vect_pointer_var:
      /* [gap in excerpt]  */

      /* Caller supplied a NAME: build "<prefix>_<name>".  */
      char* tmp = concat (prefix, "_", name, NULL);
      new_vect_var = make_temp_ssa_name (type, NULL, tmp);
      /* [gap in excerpt]  */
      /* No NAME: use the bare prefix.  */
      new_vect_var = make_temp_ssa_name (type, NULL, prefix);

  return new_vect_var;
4585 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
/* Copy the points-to information from DR_INFO's data reference onto SSA
   name NAME, then overwrite the alignment part: unknown if the DR's
   misalignment is unknown, otherwise the DR's target alignment and
   misalignment.  */
vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
  int misalign = DR_MISALIGNMENT (dr_info);
  if (misalign == DR_MISALIGNMENT_UNKNOWN)
    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
  else
    set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
			    known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
			    /* [gap in excerpt: final argument elided —
			       presumably MISALIGN; TODO confirm.]  */
4600 /* Function vect_create_addr_base_for_vector_ref.
4602 Create an expression that computes the address of the first memory location
4603 that will be accessed for a data reference.
4606 STMT_INFO: The statement containing the data reference.
4607 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4608 OFFSET: Optional. If supplied, it is be added to the initial address.
4609 LOOP: Specify relative to which loop-nest should the address be computed.
4610 For example, when the dataref is in an inner-loop nested in an
4611 outer-loop that is now being vectorized, LOOP can be either the
4612 outer-loop, or the inner-loop. The first memory location accessed
4613 by the following dataref ('in' points to short):
4620 if LOOP=i_loop: &in (relative to i_loop)
4621 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4622 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4623 initial address. Unlike OFFSET, which is number of elements to
4624 be added, BYTE_OFFSET is measured in bytes.
4627 1. Return an SSA_NAME whose value is the address of the memory location of
4628 the first vector of the data reference.
4629 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4630 these statement(s) which define the returned SSA_NAME.
4632 FORNOW: We are only handling array accesses with step 1. */
/* Build and return a (forced-gimple) expression computing the address of
   the first memory location accessed by STMT_INFO's data reference,
   appending any generated statements to NEW_STMT_LIST.  OFFSET (in
   elements) and BYTE_OFFSET (in bytes) are optional additions to the
   initial address — see the function comment above.

   NOTE(review): several source lines are elided in this excerpt; the
   surrounding conditions for some branches are therefore not visible.  */
vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
				      gimple_seq *new_stmt_list,
				      /* [gap in excerpt: remaining
					 parameters elided]  */
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
  struct data_reference *dr = dr_info->dr;
  const char *base_name;
  gimple_seq seq = NULL;
  /* Element size of the reference: the step used to scale OFFSET.  */
  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);

  tree data_ref_base = unshare_expr (drb->base_address);
  tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
  tree init = unshare_expr (drb->init);

  /* [gap in excerpt: condition selecting between the two naming/offset
     setups is elided.]  */
      base_name = get_name (data_ref_base);
      /* Alternative branch: zero the offsets and name after the ref.  */
      base_offset = ssize_int (0);
      init = ssize_int (0);
      base_name = get_name (DR_REF (dr));

  /* Create base_offset */
  base_offset = size_binop (PLUS_EXPR,
			    fold_convert (sizetype, base_offset),
			    fold_convert (sizetype, init));

  /* [gap in excerpt: "if (offset)" guard presumably elided — TODO
     confirm.]  Scale the element offset by the element size.  */
      offset = fold_build2 (MULT_EXPR, sizetype,
			    fold_convert (sizetype, offset), step);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, offset);
  /* [gap in excerpt: "if (byte_offset)" guard presumably elided.]  */
      byte_offset = fold_convert (sizetype, byte_offset);
      base_offset = fold_build2 (PLUS_EXPR, sizetype,
				 base_offset, byte_offset);

  /* base + base_offset */
  /* [gap in excerpt: branch condition elided.]  */
    addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
    /* Otherwise take the address of the reference directly.  */
    addr_base = build1 (ADDR_EXPR,
			build_pointer_type (TREE_TYPE (DR_REF (dr))),
			unshare_expr (DR_REF (dr)));

  /* Gimplify the address computation into SEQ, naming the result after
     the base object.  */
  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
  dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
  addr_base = force_gimple_operand (addr_base, &seq, true, dest);
  gimple_seq_add_seq (new_stmt_list, seq);

  if (DR_PTR_INFO (dr)
      && TREE_CODE (addr_base) == SSA_NAME
      && !SSA_NAME_PTR_INFO (addr_base))
      vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
      /* Any extra offset invalidates the DR's alignment info.  */
      if (offset || byte_offset)
	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
  /* [gap in excerpt: return statement elided — presumably returns
     ADDR_BASE.]  */
4714 /* Function vect_create_data_ref_ptr.
4716 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4717 location accessed in the loop by STMT_INFO, along with the def-use update
4718 chain to appropriately advance the pointer through the loop iterations.
4719 Also set aliasing information for the pointer. This pointer is used by
4720 the callers to this function to create a memory reference expression for
4721 vector load/store access.
4724 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4725 GIMPLE_ASSIGN <name, data-ref> or
4726 GIMPLE_ASSIGN <data-ref, name>.
4727 2. AGGR_TYPE: the type of the reference, which should be either a vector
4729 3. AT_LOOP: the loop where the vector memref is to be created.
4730 4. OFFSET (optional): an offset to be added to the initial address accessed
4731 by the data-ref in STMT_INFO.
4732 5. BSI: location where the new stmts are to be placed if there is no loop
4733 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4734 pointing to the initial address.
4735 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4736 to the initial address accessed by the data-ref in STMT_INFO. This is
4737 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4739 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4740 to the IV during each iteration of the loop. NULL says to move
4741 by one copy of AGGR_TYPE up or down, depending on the step of the
4745 1. Declare a new ptr to vector_type, and have it point to the base of the
4746 data reference (initial address accessed by the data reference).
4747 For example, for vector of type V8HI, the following code is generated:
4750 ap = (v8hi *)initial_address;
4752 if OFFSET is not supplied:
4753 initial_address = &a[init];
4754 if OFFSET is supplied:
4755 initial_address = &a[init + OFFSET];
4756 if BYTE_OFFSET is supplied:
4757 initial_address = &a[init] + BYTE_OFFSET;
4759 Return the initial_address in INITIAL_ADDRESS.
4761 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4762 update the pointer in each iteration of the loop.
4764 Return the increment stmt that updates the pointer in PTR_INCR.
4766 3. Return the pointer. */
/* Create the aggregate pointer for STMT_INFO's data reference, set it to
   the first accessed location, and (unless ONLY_INIT) build the IV
   update(s) that advance it each iteration — one IV relative to the
   vectorized loop and, for nested accesses, one relative to the
   containing inner loop.  See the function comment above for the full
   contract.

   NOTE(review): scattered source lines are elided in this excerpt;
   omissions are marked.  All tokens shown are unchanged.  */
vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
			  tree aggr_type, class loop *at_loop, tree offset,
			  tree *initial_address, gimple_stmt_iterator *gsi,
			  gimple **ptr_incr, bool only_init,
			  tree byte_offset, tree iv_step)
  const char *base_name;
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *loop = NULL;
  bool nested_in_vect_loop = false;
  class loop *containing_loop = NULL;
  gimple_seq new_stmt_list = NULL;
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
  struct data_reference *dr = dr_info->dr;
  gimple_stmt_iterator incr_gsi;
  tree indx_before_incr, indx_after_incr;
  bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);

  /* A caller-supplied IV_STEP is mandatory unless AGGR_TYPE is an array
     or vector type (for which the step can be derived).  */
  gcc_assert (iv_step != NULL_TREE
	      || TREE_CODE (aggr_type) == ARRAY_TYPE
	      || TREE_CODE (aggr_type) == VECTOR_TYPE);

  /* [gap in excerpt: "if (loop_vinfo)" guard presumably elided.]  */
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
      containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
      pe = loop_preheader_edge (loop);
      /* BB vectorization path.  */
      gcc_assert (bb_vinfo);

  /* Create an expression for the first address accessed by this load
     or store.  */
  base_name = get_name (DR_BASE_ADDRESS (dr));

  if (dump_enabled_p ())
      tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
      dump_printf_loc (MSG_NOTE, vect_location,
		       "create %s-pointer variable to type: %T",
		       get_tree_code_name (TREE_CODE (aggr_type)),
		       /* [gap in excerpt]  */
      if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
	dump_printf (MSG_NOTE, " vectorizing an array ref: ");
      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
	dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
	dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
      else
	dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
      dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));

  /* (1) Create the new aggregate-pointer variable.
     Vector and array types inherit the alias set of their component
     type by default so we need to use a ref-all pointer if the data
     reference does not conflict with the created aggregated data
     reference because it is not addressable.  */
  bool need_ref_all = false;
  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
			      get_alias_set (DR_REF (dr))))
    need_ref_all = true;
  /* Likewise for any of the data references in the stmt group.  */
  else if (DR_GROUP_SIZE (stmt_info) > 1)
      stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
      /* [gap in excerpt: loop header over group members elided.]  */
	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
				      get_alias_set (DR_REF (sdr))))
	      need_ref_all = true;
	  sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);

  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
					       /* [gap: need_ref_all arg
						  presumably elided]  */
  aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);

  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
     vectorizing LOOP (i.e., outer-loop vectorization), we need to create
     two def-use update cycles for the pointer: one relative to the
     outer-loop (LOOP), which is what steps (3) and (4) below do.  The
     other is relative to the inner-loop (which is the inner-most loop
     containing the dataref), and this is done be step (5) below.

     When vectorizing inner-most loops, the vectorized loop (LOOP) is
     also the inner-most loop, and so steps (3),(4) work the same, and
     step (5) is redundant.  Steps (3),(4) create the following:

	LOOP:	vp1 = phi(vp0,vp2)
	...

     If there is an inner-loop nested in loop, then step (5) will also
     be applied, and an additional update in the inner-loop will be
     created:

	LOOP:	vp1 = phi(vp0,vp2)
	inner:	vp3 = phi(vp1,vp4)
		vp4 = vp3 + inner_step  */

  /* (2) Calculate the initial address of the aggregate-pointer, and set
     the aggregate-pointer to point to it before the loop.  */

  /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */

  new_temp = vect_create_addr_base_for_vector_ref (vinfo,
						   stmt_info, &new_stmt_list,
						   offset, byte_offset);
  /* [gap in excerpt: guard selecting edge vs. in-place insertion
     elided.]  */
      new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
      gcc_assert (!new_bb);
      gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);

  *initial_address = new_temp;
  aggr_ptr_init = new_temp;

  /* (3) Handle the updating of the aggregate-pointer inside the loop.
     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
     inner-loop nested in LOOP (during outer-loop vectorization).  */

  /* No update in loop is required.  */
  if (only_init && (!loop_vinfo || at_loop == loop))
    aptr = aggr_ptr_init;
  else
      /* Accesses to invariant addresses should be handled specially
	 by the caller.  */
      tree step = vect_dr_behavior (vinfo, dr_info)->step;
      gcc_assert (!integer_zerop (step));

      if (iv_step == NULL_TREE)
	  /* The step of the aggregate pointer is the type size,
	     negated for downward accesses.  */
	  iv_step = TYPE_SIZE_UNIT (aggr_type);
	  if (tree_int_cst_sgn (step) == -1)
	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);

      standard_iv_increment_position (loop, &incr_gsi, &insert_after);

      create_iv (aggr_ptr_init,
		 fold_convert (aggr_ptr_type, iv_step),
		 aggr_ptr, loop, &incr_gsi, insert_after,
		 &indx_before_incr, &indx_after_incr);
      incr = gsi_stmt (incr_gsi);

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
      /* [gap in excerpt: "*ptr_incr = incr" presumably elided — TODO
	 confirm.]  */

      aptr = indx_before_incr;

  if (!nested_in_vect_loop || only_init)
    /* [gap in excerpt: early return presumably elided.]  */

  /* (4) Handle the updating of the aggregate-pointer inside the
     inner-loop nested in LOOP, if exists.  */

  gcc_assert (nested_in_vect_loop);
  /* [gap in excerpt: "if (!only_init)" presumably elided.]  */
      standard_iv_increment_position (containing_loop, &incr_gsi,
				      /* [gap: &insert_after elided]  */
      create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
		 /* [gap: &indx_after_incr elided]  */
      incr = gsi_stmt (incr_gsi);

      /* Copy the points-to information if it exists.  */
      if (DR_PTR_INFO (dr))
	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);

      return indx_before_incr;
4996 /* Function bump_vector_ptr
4998 Increment a pointer (to a vector type) by vector-size. If requested,
4999 i.e. if PTR-INCR is given, then also connect the new increment stmt
5000 to the existing def-use update-chain of the pointer, by modifying
5001 the PTR_INCR as illustrated below:
5003 The pointer def-use update-chain before this function:
5004 DATAREF_PTR = phi (p_0, p_2)
5006 PTR_INCR: p_2 = DATAREF_PTR + step
5008 The pointer def-use update-chain after this function:
5009 DATAREF_PTR = phi (p_0, p_2)
5011 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5013 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5016 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5018 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5019 the loop. The increment amount across iterations is expected
5021 BSI - location where the new update stmt is to be placed.
5022 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5023 BUMP - optional. The offset by which to bump the pointer. If not given,
5024 the offset is assumed to be vector_size.
5026 Output: Return NEW_DATAREF_PTR as illustrated above.
/* Emit NEW_DATAREF_PTR = DATAREF_PTR + bump (vector size by default) at
   GSI and return the new SSA name.  If PTR_INCR is given, redirect its
   use of DATAREF_PTR to the new name so the cross-iteration update chain
   stays intact — see the function comment above.

   NOTE(review): a few lines are elided in this excerpt (e.g. the "if
   (bump)" override of UPDATE); omissions are marked.  */
bump_vector_ptr (vec_info *vinfo,
		 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
		 stmt_vec_info stmt_info, tree bump)
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  /* Default bump: one vector's worth of bytes.  */
  tree update = TYPE_SIZE_UNIT (vectype);
  use_operand_p use_p;
  tree new_dataref_ptr;
  /* [gap in excerpt: "if (bump) update = bump;" presumably elided —
     TODO confirm.]  */

  if (TREE_CODE (dataref_ptr) == SSA_NAME)
    new_dataref_ptr = copy_ssa_name (dataref_ptr);
  else
    new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
  incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
				   dataref_ptr, update);
  vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);

  /* Copy the points-to information if it exists.  */
  if (DR_PTR_INFO (dr))
      duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
      /* The bump may change the misalignment, so drop alignment info.  */
      mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));

  /* [gap in excerpt: "if (!ptr_incr)" guard presumably elided.]  */
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
      tree use = USE_FROM_PTR (use_p);
      if (use == dataref_ptr)
	SET_USE (use_p, new_dataref_ptr);
      else
	/* The only other operand of PTR_INCR must be the step.  */
	gcc_assert (operand_equal_p (use, update, 0));

  return new_dataref_ptr;
5079 /* Copy memory reference info such as base/clique from the SRC reference
5080 to the DEST MEM_REF. */
/* Copy the MR_DEPENDENCE clique/base disambiguation info from the SRC
   reference onto the DEST MEM_REF.  Strips handled components off SRC to
   reach its base; does nothing unless DEST is a MEM_REF and the base is
   a MEM_REF or TARGET_MEM_REF.  */
vect_copy_ref_info (tree dest, tree src)
  if (TREE_CODE (dest) != MEM_REF)
    /* [gap in excerpt: early "return;" elided.]  */

  tree src_base = src;
  /* Walk down component refs (array/field accesses) to the base.  */
  while (handled_component_p (src_base))
    src_base = TREE_OPERAND (src_base, 0);
  if (TREE_CODE (src_base) != MEM_REF
      && TREE_CODE (src_base) != TARGET_MEM_REF)
    /* [gap in excerpt: early "return;" elided.]  */

  MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
  MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5100 /* Function vect_create_destination_var.
5102 Create a new temporary of type VECTYPE. */
/* Create a new temporary to hold the vectorized (or scalar, if VECTYPE
   is NULL) value of SCALAR_DEST, named "<orig-name>_<ssa-version>" when
   SCALAR_DEST has a name.

   NOTE(review): the KIND selection and the final return are partly
   elided in this excerpt.  */
vect_create_destination_var (tree scalar_dest, tree vectype)
  enum vect_var_kind kind;
  /* [gap in excerpt: KIND initializer partly elided; it tests whether
     VECTYPE is a boolean vector type.]  */
	 ? VECTOR_BOOLEAN_TYPE_P (vectype)
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  name = get_name (scalar_dest);
  /* [gap in excerpt: "if (name)" guard presumably elided.]  */
    new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
    new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  /* [gap in excerpt: free (new_name) and return presumably elided —
     TODO confirm.]  */
5133 /* Function vect_grouped_store_supported.
5135 Returns TRUE if interleave high and interleave low permutations
5136 are supported, and FALSE otherwise. */
/* Return whether the target can perform the permutations needed to store
   a group of COUNT interleaved vectors of VECTYPE: COUNT == 3 uses two
   rounds of three-way blend masks, any power of two uses interleave
   high/low masks.  Other COUNT values are rejected up front.

   NOTE(review): several structural lines (braces, returns, some loop
   bodies) are elided in this excerpt; omissions are marked.  */
vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
  machine_mode mode = TYPE_MODE (vectype);

  /* vect_permute_store_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2 or not eqaul to 3\n");
	/* NOTE(review): "eqaul" is a typo in this user-visible
	   diagnostic; fix as a separate (behavior-changing) change.  */
      /* [gap in excerpt: "return false;" presumably elided.]  */

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
      /* [gap in excerpt: "if (count == 3)" branch header elided.]  */
	  unsigned int j0 = 0, j1 = 0, j2 = 0;

	  /* Three-way interleave needs a constant element count.  */
	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "cannot handle groups of 3 stores for"
				 " variable-length vectors\n");
	      /* [gap in excerpt: "return false;" presumably elided.]  */

	  vec_perm_builder sel (nelt, nelt, 1);
	  sel.quick_grow (nelt);
	  vec_perm_indices indices;
	  /* Two permutations per round, three rounds.  */
	  for (j = 0; j < 3; j++)
	      int nelt0 = ((3 - j) * nelt) % 3;
	      int nelt1 = ((3 - j) * nelt + 1) % 3;
	      int nelt2 = ((3 - j) * nelt + 2) % 3;
	      for (i = 0; i < nelt; i++)
		  if (3 * i + nelt0 < nelt)
		    sel[3 * i + nelt0] = j0++;
		  if (3 * i + nelt1 < nelt)
		    sel[3 * i + nelt1] = nelt + j1++;
		  if (3 * i + nelt2 < nelt)
		    sel[3 * i + nelt2] = 0;
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		  if (dump_enabled_p ())
		    dump_printf (MSG_MISSED_OPTIMIZATION,
				 "permutation op not supported by target.\n");
		  /* [gap in excerpt: "return false;" presumably
		     elided.]  */

	      for (i = 0; i < nelt; i++)
		  if (3 * i + nelt0 < nelt)
		    sel[3 * i + nelt0] = 3 * i + nelt0;
		  if (3 * i + nelt1 < nelt)
		    sel[3 * i + nelt1] = 3 * i + nelt1;
		  if (3 * i + nelt2 < nelt)
		    sel[3 * i + nelt2] = nelt + j2++;
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, indices))
		  if (dump_enabled_p ())
		    dump_printf (MSG_MISSED_OPTIMIZATION,
				 "permutation op not supported by target.\n");
		  /* [gap in excerpt]  */
	  /* [gap in excerpt: "return true;" presumably elided.]  */

	  /* If length is not equal to 3 then only power of 2 is
	     supported.  */
	  gcc_assert (pow2p_hwi (count));
	  poly_uint64 nelt = GET_MODE_NUNITS (mode);

	  /* The encoding has 2 interleaved stepped patterns.  */
	  vec_perm_builder sel (nelt, 2, 3);
	  /* [gap in excerpt: sel.quick_grow presumably elided.]  */
	  for (i = 0; i < 3; i++)
	      /* [gap in excerpt: even-index assignment elided.]  */
	      sel[i * 2 + 1] = i + nelt;
	  vec_perm_indices indices (sel, 2, nelt);
	  if (can_vec_perm_const_p (mode, indices))
	      /* Also check the "high half" interleave.  */
	      for (i = 0; i < 6; i++)
		sel[i] += exact_div (nelt, 2);
	      indices.new_vector (sel, 2, nelt);
	      if (can_vec_perm_const_p (mode, indices))
		/* [gap in excerpt: "return true;" presumably elided.]  */

  if (dump_enabled_p ())
    dump_printf (MSG_MISSED_OPTIMIZATION,
		 "permutation op not supported by target.\n");
  /* [gap in excerpt: final "return false;" presumably elided.]  */
5252 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5253 type VECTYPE. MASKED_P says whether the masked form is needed. */
/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors
   of type VECTYPE, dispatching on the (elided) MASKED_P flag between the
   masked and unmasked optabs via vect_lanes_optab_supported_p.  */
vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
			    /* [gap in excerpt: "bool masked_p" parameter
			       and the dispatch condition elided.]  */
    return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
					 vec_mask_store_lanes_optab,
					 /* [gap: vectype/count args
					    elided]  */
    return vect_lanes_optab_supported_p ("vec_store_lanes",
					 vec_store_lanes_optab,
					 /* [gap: vectype/count args
					    elided]  */
5270 /* Function vect_permute_store_chain.
5272 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5273 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5274 the data correctly for the stores. Return the final references for stores
5277 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5278 The input is 4 vectors each containing 8 elements. We assign a number to
5279 each element, the input sequence is:
5281 1st vec: 0 1 2 3 4 5 6 7
5282 2nd vec: 8 9 10 11 12 13 14 15
5283 3rd vec: 16 17 18 19 20 21 22 23
5284 4th vec: 24 25 26 27 28 29 30 31
5286 The output sequence should be:
5288 1st vec: 0 8 16 24 1 9 17 25
5289 2nd vec: 2 10 18 26 3 11 19 27
5290 3rd vec: 4 12 20 28 5 13 21 30
5291 4th vec: 6 14 22 30 7 15 23 31
5293 i.e., we interleave the contents of the four vectors in their order.
5295 We use interleave_high/low instructions to create such output. The input of
5296 each interleave_high/low operation is two vectors:
5299 the even elements of the result vector are obtained left-to-right from the
5300 high/low elements of the first vector. The odd elements of the result are
5301 obtained left-to-right from the high/low elements of the second vector.
5302 The output of interleave_high will be: 0 4 1 5
5303 and of interleave_low: 2 6 3 7
5306 The permutation is done in log LENGTH stages. In each stage interleave_high
5307 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5308 where the first argument is taken from the first half of DR_CHAIN and the
5309 second argument from it's second half.
5312 I1: interleave_high (1st vec, 3rd vec)
5313 I2: interleave_low (1st vec, 3rd vec)
5314 I3: interleave_high (2nd vec, 4th vec)
5315 I4: interleave_low (2nd vec, 4th vec)
5317 The output for the first stage is:
5319 I1: 0 16 1 17 2 18 3 19
5320 I2: 4 20 5 21 6 22 7 23
5321 I3: 8 24 9 25 10 26 11 27
5322 I4: 12 28 13 29 14 30 15 31
5324 The output of the second stage, i.e. the final result is:
5326 I1: 0 8 16 24 1 9 17 25
5327 I2: 2 10 18 26 3 11 19 27
5328 I3: 4 12 20 28 5 13 21 30
5329 I4: 6 14 22 30 7 15 23 31. */
5332 vect_permute_store_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
5333 unsigned int length
,
5334 stmt_vec_info stmt_info
,
5335 gimple_stmt_iterator
*gsi
,
5336 vec
<tree
> *result_chain
)
5338 tree vect1
, vect2
, high
, low
;
5340 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5341 tree perm_mask_low
, perm_mask_high
;
5343 tree perm3_mask_low
, perm3_mask_high
;
5344 unsigned int i
, j
, n
, log_length
= exact_log2 (length
);
5346 result_chain
->quick_grow (length
);
5347 memcpy (result_chain
->address (), dr_chain
.address (),
5348 length
* sizeof (tree
));
5352 /* vect_grouped_store_supported ensures that this is constant. */
5353 unsigned int nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5354 unsigned int j0
= 0, j1
= 0, j2
= 0;
5356 vec_perm_builder
sel (nelt
, nelt
, 1);
5357 sel
.quick_grow (nelt
);
5358 vec_perm_indices indices
;
5359 for (j
= 0; j
< 3; j
++)
5361 int nelt0
= ((3 - j
) * nelt
) % 3;
5362 int nelt1
= ((3 - j
) * nelt
+ 1) % 3;
5363 int nelt2
= ((3 - j
) * nelt
+ 2) % 3;
5365 for (i
= 0; i
< nelt
; i
++)
5367 if (3 * i
+ nelt0
< nelt
)
5368 sel
[3 * i
+ nelt0
] = j0
++;
5369 if (3 * i
+ nelt1
< nelt
)
5370 sel
[3 * i
+ nelt1
] = nelt
+ j1
++;
5371 if (3 * i
+ nelt2
< nelt
)
5372 sel
[3 * i
+ nelt2
] = 0;
5374 indices
.new_vector (sel
, 2, nelt
);
5375 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5377 for (i
= 0; i
< nelt
; i
++)
5379 if (3 * i
+ nelt0
< nelt
)
5380 sel
[3 * i
+ nelt0
] = 3 * i
+ nelt0
;
5381 if (3 * i
+ nelt1
< nelt
)
5382 sel
[3 * i
+ nelt1
] = 3 * i
+ nelt1
;
5383 if (3 * i
+ nelt2
< nelt
)
5384 sel
[3 * i
+ nelt2
] = nelt
+ j2
++;
5386 indices
.new_vector (sel
, 2, nelt
);
5387 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5389 vect1
= dr_chain
[0];
5390 vect2
= dr_chain
[1];
5392 /* Create interleaving stmt:
5393 low = VEC_PERM_EXPR <vect1, vect2,
5394 {j, nelt, *, j + 1, nelt + j + 1, *,
5395 j + 2, nelt + j + 2, *, ...}> */
5396 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
5397 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5398 vect2
, perm3_mask_low
);
5399 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5402 vect2
= dr_chain
[2];
5403 /* Create interleaving stmt:
5404 low = VEC_PERM_EXPR <vect1, vect2,
5405 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5406 6, 7, nelt + j + 2, ...}> */
5407 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
5408 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect1
,
5409 vect2
, perm3_mask_high
);
5410 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5411 (*result_chain
)[j
] = data_ref
;
5416 /* If length is not equal to 3 then only power of 2 is supported. */
5417 gcc_assert (pow2p_hwi (length
));
5419 /* The encoding has 2 interleaved stepped patterns. */
5420 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
5421 vec_perm_builder
sel (nelt
, 2, 3);
5423 for (i
= 0; i
< 3; i
++)
5426 sel
[i
* 2 + 1] = i
+ nelt
;
5428 vec_perm_indices
indices (sel
, 2, nelt
);
5429 perm_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
5431 for (i
= 0; i
< 6; i
++)
5432 sel
[i
] += exact_div (nelt
, 2);
5433 indices
.new_vector (sel
, 2, nelt
);
5434 perm_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5436 for (i
= 0, n
= log_length
; i
< n
; i
++)
5438 for (j
= 0; j
< length
/2; j
++)
5440 vect1
= dr_chain
[j
];
5441 vect2
= dr_chain
[j
+length
/2];
5443 /* Create interleaving stmt:
5444 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5446 high
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_high");
5447 perm_stmt
= gimple_build_assign (high
, VEC_PERM_EXPR
, vect1
,
5448 vect2
, perm_mask_high
);
5449 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5450 (*result_chain
)[2*j
] = high
;
5452 /* Create interleaving stmt:
5453 low = VEC_PERM_EXPR <vect1, vect2,
5454 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5456 low
= make_temp_ssa_name (vectype
, NULL
, "vect_inter_low");
5457 perm_stmt
= gimple_build_assign (low
, VEC_PERM_EXPR
, vect1
,
5458 vect2
, perm_mask_low
);
5459 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
5460 (*result_chain
)[2*j
+1] = low
;
5462 memcpy (dr_chain
.address (), result_chain
->address (),
5463 length
* sizeof (tree
));
5468 /* Function vect_setup_realignment
5470 This function is called when vectorizing an unaligned load using
5471 the dr_explicit_realign[_optimized] scheme.
5472 This function generates the following code at the loop prolog:
5475 x msq_init = *(floor(p)); # prolog load
5476 realignment_token = call target_builtin;
5478 x msq = phi (msq_init, ---)
5480 The stmts marked with x are generated only for the case of
5481 dr_explicit_realign_optimized.
5483 The code above sets up a new (vector) pointer, pointing to the first
5484 location accessed by STMT_INFO, and a "floor-aligned" load using that
5485 pointer. It also generates code to compute the "realignment-token"
5486 (if the relevant target hook was defined), and creates a phi-node at the
5487 loop-header bb whose arguments are the result of the prolog-load (created
5488 by this function) and the result of a load that takes place in the loop
5489 (to be created by the caller to this function).
5491 For the case of dr_explicit_realign_optimized:
5492 The caller to this function uses the phi-result (msq) to create the
5493 realignment code inside the loop, and sets up the missing phi argument,
5496 msq = phi (msq_init, lsq)
5497 lsq = *(floor(p')); # load in loop
5498 result = realign_load (msq, lsq, realignment_token);
5500 For the case of dr_explicit_realign:
5502 msq = *(floor(p)); # load in loop
5504 lsq = *(floor(p')); # load in loop
5505 result = realign_load (msq, lsq, realignment_token);
5508 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5509 a memory location that may be unaligned.
5510 BSI - place where new code is to be inserted.
5511 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5515 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5516 target hook, if defined.
5517 Return value - the result of the loop-header phi node. */
5520 vect_setup_realignment (vec_info
*vinfo
, stmt_vec_info stmt_info
,
5521 gimple_stmt_iterator
*gsi
, tree
*realignment_token
,
5522 enum dr_alignment_support alignment_support_scheme
,
5524 class loop
**at_loop
)
5526 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5527 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
5528 dr_vec_info
*dr_info
= STMT_VINFO_DR_INFO (stmt_info
);
5529 struct data_reference
*dr
= dr_info
->dr
;
5530 class loop
*loop
= NULL
;
5532 tree scalar_dest
= gimple_assign_lhs (stmt_info
->stmt
);
5538 tree msq_init
= NULL_TREE
;
5541 tree msq
= NULL_TREE
;
5542 gimple_seq stmts
= NULL
;
5543 bool compute_in_loop
= false;
5544 bool nested_in_vect_loop
= false;
5545 class loop
*containing_loop
= (gimple_bb (stmt_info
->stmt
))->loop_father
;
5546 class loop
*loop_for_initial_load
= NULL
;
5550 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5551 nested_in_vect_loop
= nested_in_vect_loop_p (loop
, stmt_info
);
5554 gcc_assert (alignment_support_scheme
== dr_explicit_realign
5555 || alignment_support_scheme
== dr_explicit_realign_optimized
);
5557 /* We need to generate three things:
5558 1. the misalignment computation
5559 2. the extra vector load (for the optimized realignment scheme).
5560 3. the phi node for the two vectors from which the realignment is
5561 done (for the optimized realignment scheme). */
5563 /* 1. Determine where to generate the misalignment computation.
5565 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5566 calculation will be generated by this function, outside the loop (in the
5567 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5568 caller, inside the loop.
5570 Background: If the misalignment remains fixed throughout the iterations of
5571 the loop, then both realignment schemes are applicable, and also the
5572 misalignment computation can be done outside LOOP. This is because we are
5573 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5574 are a multiple of VS (the Vector Size), and therefore the misalignment in
5575 different vectorized LOOP iterations is always the same.
5576 The problem arises only if the memory access is in an inner-loop nested
5577 inside LOOP, which is now being vectorized using outer-loop vectorization.
5578 This is the only case when the misalignment of the memory access may not
5579 remain fixed throughout the iterations of the inner-loop (as explained in
5580 detail in vect_supportable_dr_alignment). In this case, not only is the
5581 optimized realignment scheme not applicable, but also the misalignment
5582 computation (and generation of the realignment token that is passed to
5583 REALIGN_LOAD) have to be done inside the loop.
5585 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5586 or not, which in turn determines if the misalignment is computed inside
5587 the inner-loop, or outside LOOP. */
5589 if (init_addr
!= NULL_TREE
|| !loop_vinfo
)
5591 compute_in_loop
= true;
5592 gcc_assert (alignment_support_scheme
== dr_explicit_realign
);
5596 /* 2. Determine where to generate the extra vector load.
5598 For the optimized realignment scheme, instead of generating two vector
5599 loads in each iteration, we generate a single extra vector load in the
5600 preheader of the loop, and in each iteration reuse the result of the
5601 vector load from the previous iteration. In case the memory access is in
5602 an inner-loop nested inside LOOP, which is now being vectorized using
5603 outer-loop vectorization, we need to determine whether this initial vector
5604 load should be generated at the preheader of the inner-loop, or can be
5605 generated at the preheader of LOOP. If the memory access has no evolution
5606 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5607 to be generated inside LOOP (in the preheader of the inner-loop). */
5609 if (nested_in_vect_loop
)
5611 tree outerloop_step
= STMT_VINFO_DR_STEP (stmt_info
);
5612 bool invariant_in_outerloop
=
5613 (tree_int_cst_compare (outerloop_step
, size_zero_node
) == 0);
5614 loop_for_initial_load
= (invariant_in_outerloop
? loop
: loop
->inner
);
5617 loop_for_initial_load
= loop
;
5619 *at_loop
= loop_for_initial_load
;
5621 if (loop_for_initial_load
)
5622 pe
= loop_preheader_edge (loop_for_initial_load
);
5624 /* 3. For the case of the optimized realignment, create the first vector
5625 load at the loop preheader. */
5627 if (alignment_support_scheme
== dr_explicit_realign_optimized
)
5629 /* Create msq_init = *(floor(p1)) in the loop preheader */
5632 gcc_assert (!compute_in_loop
);
5633 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5634 ptr
= vect_create_data_ref_ptr (vinfo
, stmt_info
, vectype
,
5635 loop_for_initial_load
, NULL_TREE
,
5636 &init_addr
, NULL
, &inc
, true);
5637 if (TREE_CODE (ptr
) == SSA_NAME
)
5638 new_temp
= copy_ssa_name (ptr
);
5640 new_temp
= make_ssa_name (TREE_TYPE (ptr
));
5641 poly_uint64 align
= DR_TARGET_ALIGNMENT (dr_info
);
5642 tree type
= TREE_TYPE (ptr
);
5643 new_stmt
= gimple_build_assign
5644 (new_temp
, BIT_AND_EXPR
, ptr
,
5645 fold_build2 (MINUS_EXPR
, type
,
5646 build_int_cst (type
, 0),
5647 build_int_cst (type
, align
)));
5648 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5649 gcc_assert (!new_bb
);
5651 = build2 (MEM_REF
, TREE_TYPE (vec_dest
), new_temp
,
5652 build_int_cst (reference_alias_ptr_type (DR_REF (dr
)), 0));
5653 vect_copy_ref_info (data_ref
, DR_REF (dr
));
5654 new_stmt
= gimple_build_assign (vec_dest
, data_ref
);
5655 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5656 gimple_assign_set_lhs (new_stmt
, new_temp
);
5659 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5660 gcc_assert (!new_bb
);
5663 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5665 msq_init
= gimple_assign_lhs (new_stmt
);
5668 /* 4. Create realignment token using a target builtin, if available.
5669 It is done either inside the containing loop, or before LOOP (as
5670 determined above). */
5672 if (targetm
.vectorize
.builtin_mask_for_load
)
5677 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5680 /* Generate the INIT_ADDR computation outside LOOP. */
5681 init_addr
= vect_create_addr_base_for_vector_ref (vinfo
,
5686 pe
= loop_preheader_edge (loop
);
5687 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
5688 gcc_assert (!new_bb
);
5691 gsi_insert_seq_before (gsi
, stmts
, GSI_SAME_STMT
);
5694 builtin_decl
= targetm
.vectorize
.builtin_mask_for_load ();
5695 new_stmt
= gimple_build_call (builtin_decl
, 1, init_addr
);
5697 vect_create_destination_var (scalar_dest
,
5698 gimple_call_return_type (new_stmt
));
5699 new_temp
= make_ssa_name (vec_dest
, new_stmt
);
5700 gimple_call_set_lhs (new_stmt
, new_temp
);
5702 if (compute_in_loop
)
5703 gsi_insert_before (gsi
, new_stmt
, GSI_SAME_STMT
);
5706 /* Generate the misalignment computation outside LOOP. */
5707 pe
= loop_preheader_edge (loop
);
5708 new_bb
= gsi_insert_on_edge_immediate (pe
, new_stmt
);
5709 gcc_assert (!new_bb
);
5712 *realignment_token
= gimple_call_lhs (new_stmt
);
5714 /* The result of the CALL_EXPR to this builtin is determined from
5715 the value of the parameter and no global variables are touched
5716 which makes the builtin a "const" function. Requiring the
5717 builtin to have the "const" attribute makes it unnecessary
5718 to call mark_call_clobbered. */
5719 gcc_assert (TREE_READONLY (builtin_decl
));
5722 if (alignment_support_scheme
== dr_explicit_realign
)
5725 gcc_assert (!compute_in_loop
);
5726 gcc_assert (alignment_support_scheme
== dr_explicit_realign_optimized
);
5729 /* 5. Create msq = phi <msq_init, lsq> in loop */
5731 pe
= loop_preheader_edge (containing_loop
);
5732 vec_dest
= vect_create_destination_var (scalar_dest
, vectype
);
5733 msq
= make_ssa_name (vec_dest
);
5734 phi_stmt
= create_phi_node (msq
, containing_loop
->header
);
5735 add_phi_arg (phi_stmt
, msq_init
, pe
, UNKNOWN_LOCATION
);
5741 /* Function vect_grouped_load_supported.
5743 COUNT is the size of the load group (the number of statements plus the
5744 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5745 only one statement, with a gap of COUNT - 1.
5747 Returns true if a suitable permute exists. */
5750 vect_grouped_load_supported (tree vectype
, bool single_element_p
,
5751 unsigned HOST_WIDE_INT count
)
5753 machine_mode mode
= TYPE_MODE (vectype
);
5755 /* If this is single-element interleaving with an element distance
5756 that leaves unused vector loads around punt - we at least create
5757 very sub-optimal code in that case (and blow up memory,
5759 if (single_element_p
&& maybe_gt (count
, TYPE_VECTOR_SUBPARTS (vectype
)))
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5763 "single-element interleaving not supported "
5764 "for not adjacent vector loads\n");
5768 /* vect_permute_load_chain requires the group size to be equal to 3 or
5769 be a power of two. */
5770 if (count
!= 3 && exact_log2 (count
) == -1)
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5774 "the size of the group of accesses"
5775 " is not a power of 2 or not equal to 3\n");
5779 /* Check that the permutation is supported. */
5780 if (VECTOR_MODE_P (mode
))
5786 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5788 if (dump_enabled_p ())
5789 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5790 "cannot handle groups of 3 loads for"
5791 " variable-length vectors\n");
5795 vec_perm_builder
sel (nelt
, nelt
, 1);
5796 sel
.quick_grow (nelt
);
5797 vec_perm_indices indices
;
5799 for (k
= 0; k
< 3; k
++)
5801 for (i
= 0; i
< nelt
; i
++)
5802 if (3 * i
+ k
< 2 * nelt
)
5806 indices
.new_vector (sel
, 2, nelt
);
5807 if (!can_vec_perm_const_p (mode
, indices
))
5809 if (dump_enabled_p ())
5810 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5811 "shuffle of 3 loads is not supported by"
5815 for (i
= 0, j
= 0; i
< nelt
; i
++)
5816 if (3 * i
+ k
< 2 * nelt
)
5819 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5820 indices
.new_vector (sel
, 2, nelt
);
5821 if (!can_vec_perm_const_p (mode
, indices
))
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5825 "shuffle of 3 loads is not supported by"
5834 /* If length is not equal to 3 then only power of 2 is supported. */
5835 gcc_assert (pow2p_hwi (count
));
5836 poly_uint64 nelt
= GET_MODE_NUNITS (mode
);
5838 /* The encoding has a single stepped pattern. */
5839 vec_perm_builder
sel (nelt
, 1, 3);
5841 for (i
= 0; i
< 3; i
++)
5843 vec_perm_indices
indices (sel
, 2, nelt
);
5844 if (can_vec_perm_const_p (mode
, indices
))
5846 for (i
= 0; i
< 3; i
++)
5848 indices
.new_vector (sel
, 2, nelt
);
5849 if (can_vec_perm_const_p (mode
, indices
))
5855 if (dump_enabled_p ())
5856 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5857 "extract even/odd not supported by target\n");
5861 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5862 type VECTYPE. MASKED_P says whether the masked form is needed. */
5865 vect_load_lanes_supported (tree vectype
, unsigned HOST_WIDE_INT count
,
5869 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5870 vec_mask_load_lanes_optab
,
5873 return vect_lanes_optab_supported_p ("vec_load_lanes",
5874 vec_load_lanes_optab
,
5878 /* Function vect_permute_load_chain.
5880 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5881 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5882 the input data correctly. Return the final references for loads in
5885 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5886 The input is 4 vectors each containing 8 elements. We assign a number to each
5887 element, the input sequence is:
5889 1st vec: 0 1 2 3 4 5 6 7
5890 2nd vec: 8 9 10 11 12 13 14 15
5891 3rd vec: 16 17 18 19 20 21 22 23
5892 4th vec: 24 25 26 27 28 29 30 31
5894 The output sequence should be:
5896 1st vec: 0 4 8 12 16 20 24 28
5897 2nd vec: 1 5 9 13 17 21 25 29
5898 3rd vec: 2 6 10 14 18 22 26 30
5899 4th vec: 3 7 11 15 19 23 27 31
5901 i.e., the first output vector should contain the first elements of each
5902 interleaving group, etc.
5904 We use extract_even/odd instructions to create such output. The input of
5905 each extract_even/odd operation is two vectors
5909 and the output is the vector of extracted even/odd elements. The output of
5910 extract_even will be: 0 2 4 6
5911 and of extract_odd: 1 3 5 7
5914 The permutation is done in log LENGTH stages. In each stage extract_even
5915 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5916 their order. In our example,
5918 E1: extract_even (1st vec, 2nd vec)
5919 E2: extract_odd (1st vec, 2nd vec)
5920 E3: extract_even (3rd vec, 4th vec)
5921 E4: extract_odd (3rd vec, 4th vec)
5923 The output for the first stage will be:
5925 E1: 0 2 4 6 8 10 12 14
5926 E2: 1 3 5 7 9 11 13 15
5927 E3: 16 18 20 22 24 26 28 30
5928 E4: 17 19 21 23 25 27 29 31
5930 In order to proceed and create the correct sequence for the next stage (or
5931 for the correct output, if the second stage is the last one, as in our
5932 example), we first put the output of extract_even operation and then the
5933 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5934 The input for the second stage is:
5936 1st vec (E1): 0 2 4 6 8 10 12 14
5937 2nd vec (E3): 16 18 20 22 24 26 28 30
5938 3rd vec (E2): 1 3 5 7 9 11 13 15
5939 4th vec (E4): 17 19 21 23 25 27 29 31
5941 The output of the second stage:
5943 E1: 0 4 8 12 16 20 24 28
5944 E2: 2 6 10 14 18 22 26 30
5945 E3: 1 5 9 13 17 21 25 29
5946 E4: 3 7 11 15 19 23 27 31
5948 And RESULT_CHAIN after reordering:
5950 1st vec (E1): 0 4 8 12 16 20 24 28
5951 2nd vec (E3): 1 5 9 13 17 21 25 29
5952 3rd vec (E2): 2 6 10 14 18 22 26 30
5953 4th vec (E4): 3 7 11 15 19 23 27 31. */
5956 vect_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
5957 unsigned int length
,
5958 stmt_vec_info stmt_info
,
5959 gimple_stmt_iterator
*gsi
,
5960 vec
<tree
> *result_chain
)
5962 tree data_ref
, first_vect
, second_vect
;
5963 tree perm_mask_even
, perm_mask_odd
;
5964 tree perm3_mask_low
, perm3_mask_high
;
5966 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5967 unsigned int i
, j
, log_length
= exact_log2 (length
);
5969 result_chain
->quick_grow (length
);
5970 memcpy (result_chain
->address (), dr_chain
.address (),
5971 length
* sizeof (tree
));
5975 /* vect_grouped_load_supported ensures that this is constant. */
5976 unsigned nelt
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
5979 vec_perm_builder
sel (nelt
, nelt
, 1);
5980 sel
.quick_grow (nelt
);
5981 vec_perm_indices indices
;
5982 for (k
= 0; k
< 3; k
++)
5984 for (i
= 0; i
< nelt
; i
++)
5985 if (3 * i
+ k
< 2 * nelt
)
5989 indices
.new_vector (sel
, 2, nelt
);
5990 perm3_mask_low
= vect_gen_perm_mask_checked (vectype
, indices
);
5992 for (i
= 0, j
= 0; i
< nelt
; i
++)
5993 if (3 * i
+ k
< 2 * nelt
)
5996 sel
[i
] = nelt
+ ((nelt
+ k
) % 3) + 3 * (j
++);
5997 indices
.new_vector (sel
, 2, nelt
);
5998 perm3_mask_high
= vect_gen_perm_mask_checked (vectype
, indices
);
6000 first_vect
= dr_chain
[0];
6001 second_vect
= dr_chain
[1];
6003 /* Create interleaving stmt (low part of):
6004 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6006 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_low");
6007 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6008 second_vect
, perm3_mask_low
);
6009 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6011 /* Create interleaving stmt (high part of):
6012 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6014 first_vect
= data_ref
;
6015 second_vect
= dr_chain
[2];
6016 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3_high");
6017 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, first_vect
,
6018 second_vect
, perm3_mask_high
);
6019 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6020 (*result_chain
)[k
] = data_ref
;
6025 /* If length is not equal to 3 then only power of 2 is supported. */
6026 gcc_assert (pow2p_hwi (length
));
6028 /* The encoding has a single stepped pattern. */
6029 poly_uint64 nelt
= TYPE_VECTOR_SUBPARTS (vectype
);
6030 vec_perm_builder
sel (nelt
, 1, 3);
6032 for (i
= 0; i
< 3; ++i
)
6034 vec_perm_indices
indices (sel
, 2, nelt
);
6035 perm_mask_even
= vect_gen_perm_mask_checked (vectype
, indices
);
6037 for (i
= 0; i
< 3; ++i
)
6039 indices
.new_vector (sel
, 2, nelt
);
6040 perm_mask_odd
= vect_gen_perm_mask_checked (vectype
, indices
);
6042 for (i
= 0; i
< log_length
; i
++)
6044 for (j
= 0; j
< length
; j
+= 2)
6046 first_vect
= dr_chain
[j
];
6047 second_vect
= dr_chain
[j
+1];
6049 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6050 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_even");
6051 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6052 first_vect
, second_vect
,
6054 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6055 (*result_chain
)[j
/2] = data_ref
;
6057 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6058 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_perm_odd");
6059 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6060 first_vect
, second_vect
,
6062 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6063 (*result_chain
)[j
/2+length
/2] = data_ref
;
6065 memcpy (dr_chain
.address (), result_chain
->address (),
6066 length
* sizeof (tree
));
6071 /* Function vect_shift_permute_load_chain.
6073 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6074 sequence of stmts to reorder the input data accordingly.
6075 Return the final references for loads in RESULT_CHAIN.
6076 Return true if successed, false otherwise.
6078 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6079 The input is 3 vectors each containing 8 elements. We assign a
6080 number to each element, the input sequence is:
6082 1st vec: 0 1 2 3 4 5 6 7
6083 2nd vec: 8 9 10 11 12 13 14 15
6084 3rd vec: 16 17 18 19 20 21 22 23
6086 The output sequence should be:
6088 1st vec: 0 3 6 9 12 15 18 21
6089 2nd vec: 1 4 7 10 13 16 19 22
6090 3rd vec: 2 5 8 11 14 17 20 23
6092 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6094 First we shuffle all 3 vectors to get correct elements order:
6096 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6097 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6098 3rd vec: (16 19 22) (17 20 23) (18 21)
6100 Next we unite and shift vector 3 times:
6103 shift right by 6 the concatenation of:
6104 "1st vec" and "2nd vec"
6105 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6106 "2nd vec" and "3rd vec"
6107 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6108 "3rd vec" and "1st vec"
6109 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6112 So that now new vectors are:
6114 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6115 2nd vec: (10 13) (16 19 22) (17 20 23)
6116 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6119 shift right by 5 the concatenation of:
6120 "1st vec" and "3rd vec"
6121 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6122 "2nd vec" and "1st vec"
6123 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6124 "3rd vec" and "2nd vec"
6125 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6128 So that now new vectors are:
6130 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6131 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6132 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6135 shift right by 5 the concatenation of:
6136 "1st vec" and "1st vec"
6137 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6138 shift right by 3 the concatenation of:
6139 "2nd vec" and "2nd vec"
6140 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6143 So that now all vectors are READY:
6144 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6145 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6146 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6148 This algorithm is faster than one in vect_permute_load_chain if:
6149 1. "shift of a concatination" is faster than general permutation.
6151 2. The TARGET machine can't execute vector instructions in parallel.
6152 This is because each step of the algorithm depends on previous.
6153 The algorithm in vect_permute_load_chain is much more parallel.
6155 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6159 vect_shift_permute_load_chain (vec_info
*vinfo
, vec
<tree
> dr_chain
,
6160 unsigned int length
,
6161 stmt_vec_info stmt_info
,
6162 gimple_stmt_iterator
*gsi
,
6163 vec
<tree
> *result_chain
)
6165 tree vect
[3], vect_shift
[3], data_ref
, first_vect
, second_vect
;
6166 tree perm2_mask1
, perm2_mask2
, perm3_mask
;
6167 tree select_mask
, shift1_mask
, shift2_mask
, shift3_mask
, shift4_mask
;
6170 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6172 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6174 unsigned HOST_WIDE_INT nelt
, vf
;
6175 if (!TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&nelt
)
6176 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&vf
))
6177 /* Not supported for variable-length vectors. */
6180 vec_perm_builder
sel (nelt
, nelt
, 1);
6181 sel
.quick_grow (nelt
);
6183 result_chain
->quick_grow (length
);
6184 memcpy (result_chain
->address (), dr_chain
.address (),
6185 length
* sizeof (tree
));
6187 if (pow2p_hwi (length
) && vf
> 4)
6189 unsigned int j
, log_length
= exact_log2 (length
);
6190 for (i
= 0; i
< nelt
/ 2; ++i
)
6192 for (i
= 0; i
< nelt
/ 2; ++i
)
6193 sel
[nelt
/ 2 + i
] = i
* 2 + 1;
6194 vec_perm_indices
indices (sel
, 2, nelt
);
6195 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6197 if (dump_enabled_p ())
6198 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6199 "shuffle of 2 fields structure is not \
6200 supported by target\n");
6203 perm2_mask1
= vect_gen_perm_mask_checked (vectype
, indices
);
6205 for (i
= 0; i
< nelt
/ 2; ++i
)
6207 for (i
= 0; i
< nelt
/ 2; ++i
)
6208 sel
[nelt
/ 2 + i
] = i
* 2;
6209 indices
.new_vector (sel
, 2, nelt
);
6210 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6212 if (dump_enabled_p ())
6213 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6214 "shuffle of 2 fields structure is not \
6215 supported by target\n");
6218 perm2_mask2
= vect_gen_perm_mask_checked (vectype
, indices
);
6220 /* Generating permutation constant to shift all elements.
6221 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6222 for (i
= 0; i
< nelt
; i
++)
6223 sel
[i
] = nelt
/ 2 + i
;
6224 indices
.new_vector (sel
, 2, nelt
);
6225 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6227 if (dump_enabled_p ())
6228 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6229 "shift permutation is not supported by target\n");
6232 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6234 /* Generating permutation constant to select vector from 2.
6235 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6236 for (i
= 0; i
< nelt
/ 2; i
++)
6238 for (i
= nelt
/ 2; i
< nelt
; i
++)
6240 indices
.new_vector (sel
, 2, nelt
);
6241 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6243 if (dump_enabled_p ())
6244 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6245 "select is not supported by target\n");
6248 select_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6250 for (i
= 0; i
< log_length
; i
++)
6252 for (j
= 0; j
< length
; j
+= 2)
6254 first_vect
= dr_chain
[j
];
6255 second_vect
= dr_chain
[j
+ 1];
6257 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6258 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6259 first_vect
, first_vect
,
6261 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6264 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle2");
6265 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6266 second_vect
, second_vect
,
6268 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6271 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift");
6272 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6273 vect
[0], vect
[1], shift1_mask
);
6274 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6275 (*result_chain
)[j
/2 + length
/2] = data_ref
;
6277 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_select");
6278 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6279 vect
[0], vect
[1], select_mask
);
6280 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6281 (*result_chain
)[j
/2] = data_ref
;
6283 memcpy (dr_chain
.address (), result_chain
->address (),
6284 length
* sizeof (tree
));
6288 if (length
== 3 && vf
> 2)
6290 unsigned int k
= 0, l
= 0;
6292 /* Generating permutation constant to get all elements in rigth order.
6293 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6294 for (i
= 0; i
< nelt
; i
++)
6296 if (3 * k
+ (l
% 3) >= nelt
)
6299 l
+= (3 - (nelt
% 3));
6301 sel
[i
] = 3 * k
+ (l
% 3);
6304 vec_perm_indices
indices (sel
, 2, nelt
);
6305 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6307 if (dump_enabled_p ())
6308 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6309 "shuffle of 3 fields structure is not \
6310 supported by target\n");
6313 perm3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6315 /* Generating permutation constant to shift all elements.
6316 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6317 for (i
= 0; i
< nelt
; i
++)
6318 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) + i
;
6319 indices
.new_vector (sel
, 2, nelt
);
6320 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6322 if (dump_enabled_p ())
6323 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6324 "shift permutation is not supported by target\n");
6327 shift1_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6329 /* Generating permutation constant to shift all elements.
6330 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6331 for (i
= 0; i
< nelt
; i
++)
6332 sel
[i
] = 2 * (nelt
/ 3) + 1 + i
;
6333 indices
.new_vector (sel
, 2, nelt
);
6334 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6336 if (dump_enabled_p ())
6337 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6338 "shift permutation is not supported by target\n");
6341 shift2_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6343 /* Generating permutation constant to shift all elements.
6344 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6345 for (i
= 0; i
< nelt
; i
++)
6346 sel
[i
] = (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6347 indices
.new_vector (sel
, 2, nelt
);
6348 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6350 if (dump_enabled_p ())
6351 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6352 "shift permutation is not supported by target\n");
6355 shift3_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6357 /* Generating permutation constant to shift all elements.
6358 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6359 for (i
= 0; i
< nelt
; i
++)
6360 sel
[i
] = 2 * (nelt
/ 3) + (nelt
% 3) / 2 + i
;
6361 indices
.new_vector (sel
, 2, nelt
);
6362 if (!can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
6364 if (dump_enabled_p ())
6365 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
6366 "shift permutation is not supported by target\n");
6369 shift4_mask
= vect_gen_perm_mask_checked (vectype
, indices
);
6371 for (k
= 0; k
< 3; k
++)
6373 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shuffle3");
6374 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6375 dr_chain
[k
], dr_chain
[k
],
6377 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6381 for (k
= 0; k
< 3; k
++)
6383 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift1");
6384 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6385 vect
[k
% 3], vect
[(k
+ 1) % 3],
6387 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6388 vect_shift
[k
] = data_ref
;
6391 for (k
= 0; k
< 3; k
++)
6393 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift2");
6394 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
,
6395 vect_shift
[(4 - k
) % 3],
6396 vect_shift
[(3 - k
) % 3],
6398 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6402 (*result_chain
)[3 - (nelt
% 3)] = vect
[2];
6404 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift3");
6405 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[0],
6406 vect
[0], shift3_mask
);
6407 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6408 (*result_chain
)[nelt
% 3] = data_ref
;
6410 data_ref
= make_temp_ssa_name (vectype
, NULL
, "vect_shift4");
6411 perm_stmt
= gimple_build_assign (data_ref
, VEC_PERM_EXPR
, vect
[1],
6412 vect
[1], shift4_mask
);
6413 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
, gsi
);
6414 (*result_chain
)[0] = data_ref
;
6420 /* Function vect_transform_grouped_load.
6422 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6423 to perform their permutation and ascribe the result vectorized statements to
6424 the scalar statements.
6428 vect_transform_grouped_load (vec_info
*vinfo
, stmt_vec_info stmt_info
,
6430 int size
, gimple_stmt_iterator
*gsi
)
6433 vec
<tree
> result_chain
= vNULL
;
6435 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6436 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6437 vectors, that are ready for vector computation. */
6438 result_chain
.create (size
);
6440 /* If reassociation width for vector type is 2 or greater target machine can
6441 execute 2 or more vector instructions in parallel. Otherwise try to
6442 get chain for loads group using vect_shift_permute_load_chain. */
6443 mode
= TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info
));
6444 if (targetm
.sched
.reassociation_width (VEC_PERM_EXPR
, mode
) > 1
6446 || !vect_shift_permute_load_chain (vinfo
, dr_chain
, size
, stmt_info
,
6447 gsi
, &result_chain
))
6448 vect_permute_load_chain (vinfo
, dr_chain
,
6449 size
, stmt_info
, gsi
, &result_chain
);
6450 vect_record_grouped_load_vectors (vinfo
, stmt_info
, result_chain
);
6451 result_chain
.release ();
6454 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6455 generated as part of the vectorization of STMT_INFO. Assign the statement
6456 for each vector to the associated scalar statement. */
6459 vect_record_grouped_load_vectors (vec_info
*, stmt_vec_info stmt_info
,
6460 vec
<tree
> result_chain
)
6462 stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
6463 unsigned int i
, gap_count
;
6466 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6467 Since we scan the chain starting from it's first node, their order
6468 corresponds the order of data-refs in RESULT_CHAIN. */
6469 stmt_vec_info next_stmt_info
= first_stmt_info
;
6471 FOR_EACH_VEC_ELT (result_chain
, i
, tmp_data_ref
)
6473 if (!next_stmt_info
)
6476 /* Skip the gaps. Loads created for the gaps will be removed by dead
6477 code elimination pass later. No need to check for the first stmt in
6478 the group, since it always exists.
6479 DR_GROUP_GAP is the number of steps in elements from the previous
6480 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6481 correspond to the gaps. */
6482 if (next_stmt_info
!= first_stmt_info
6483 && gap_count
< DR_GROUP_GAP (next_stmt_info
))
6489 /* ??? The following needs cleanup after the removal of
6490 DR_GROUP_SAME_DR_STMT. */
6493 gimple
*new_stmt
= SSA_NAME_DEF_STMT (tmp_data_ref
);
6494 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6495 copies, and we put the new vector statement last. */
6496 STMT_VINFO_VEC_STMTS (next_stmt_info
).safe_push (new_stmt
);
6498 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
6504 /* Function vect_force_dr_alignment_p.
6506 Returns whether the alignment of a DECL can be forced to be aligned
6507 on ALIGNMENT bit boundary. */
6510 vect_can_force_dr_alignment_p (const_tree decl
, poly_uint64 alignment
)
6515 if (decl_in_symtab_p (decl
)
6516 && !symtab_node::get (decl
)->can_increase_alignment_p ())
6519 if (TREE_STATIC (decl
))
6520 return (known_le (alignment
,
6521 (unsigned HOST_WIDE_INT
) MAX_OFILE_ALIGNMENT
));
6523 return (known_le (alignment
, (unsigned HOST_WIDE_INT
) MAX_STACK_ALIGNMENT
));
6527 /* Return whether the data reference DR_INFO is supported with respect to its
6529 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6530 it is aligned, i.e., check if it is possible to vectorize it with different
6533 enum dr_alignment_support
6534 vect_supportable_dr_alignment (vec_info
*vinfo
, dr_vec_info
*dr_info
,
6535 bool check_aligned_accesses
)
6537 data_reference
*dr
= dr_info
->dr
;
6538 stmt_vec_info stmt_info
= dr_info
->stmt
;
6539 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6540 machine_mode mode
= TYPE_MODE (vectype
);
6541 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
6542 class loop
*vect_loop
= NULL
;
6543 bool nested_in_vect_loop
= false;
6545 if (aligned_access_p (dr_info
) && !check_aligned_accesses
)
6548 /* For now assume all conditional loads/stores support unaligned
6549 access without any special code. */
6550 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
6551 if (gimple_call_internal_p (stmt
)
6552 && (gimple_call_internal_fn (stmt
) == IFN_MASK_LOAD
6553 || gimple_call_internal_fn (stmt
) == IFN_MASK_STORE
))
6554 return dr_unaligned_supported
;
6558 vect_loop
= LOOP_VINFO_LOOP (loop_vinfo
);
6559 nested_in_vect_loop
= nested_in_vect_loop_p (vect_loop
, stmt_info
);
6562 /* Possibly unaligned access. */
6564 /* We can choose between using the implicit realignment scheme (generating
6565 a misaligned_move stmt) and the explicit realignment scheme (generating
6566 aligned loads with a REALIGN_LOAD). There are two variants to the
6567 explicit realignment scheme: optimized, and unoptimized.
6568 We can optimize the realignment only if the step between consecutive
6569 vector loads is equal to the vector size. Since the vector memory
6570 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6571 is guaranteed that the misalignment amount remains the same throughout the
6572 execution of the vectorized loop. Therefore, we can create the
6573 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6574 at the loop preheader.
6576 However, in the case of outer-loop vectorization, when vectorizing a
6577 memory access in the inner-loop nested within the LOOP that is now being
6578 vectorized, while it is guaranteed that the misalignment of the
6579 vectorized memory access will remain the same in different outer-loop
6580 iterations, it is *not* guaranteed that is will remain the same throughout
6581 the execution of the inner-loop. This is because the inner-loop advances
6582 with the original scalar step (and not in steps of VS). If the inner-loop
6583 step happens to be a multiple of VS, then the misalignment remains fixed
6584 and we can use the optimized realignment scheme. For example:
6590 When vectorizing the i-loop in the above example, the step between
6591 consecutive vector loads is 1, and so the misalignment does not remain
6592 fixed across the execution of the inner-loop, and the realignment cannot
6593 be optimized (as illustrated in the following pseudo vectorized loop):
6595 for (i=0; i<N; i+=4)
6596 for (j=0; j<M; j++){
6597 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6598 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6599 // (assuming that we start from an aligned address).
6602 We therefore have to use the unoptimized realignment scheme:
6604 for (i=0; i<N; i+=4)
6605 for (j=k; j<M; j+=4)
6606 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6607 // that the misalignment of the initial address is
6610 The loop can then be vectorized as follows:
6612 for (k=0; k<4; k++){
6613 rt = get_realignment_token (&vp[k]);
6614 for (i=0; i<N; i+=4){
6616 for (j=k; j<M; j+=4){
6618 va = REALIGN_LOAD <v1,v2,rt>;
6625 if (DR_IS_READ (dr
))
6627 bool is_packed
= false;
6628 tree type
= (TREE_TYPE (DR_REF (dr
)));
6630 if (optab_handler (vec_realign_load_optab
, mode
) != CODE_FOR_nothing
6631 && (!targetm
.vectorize
.builtin_mask_for_load
6632 || targetm
.vectorize
.builtin_mask_for_load ()))
6634 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
6636 /* If we are doing SLP then the accesses need not have the
6637 same alignment, instead it depends on the SLP group size. */
6639 && STMT_SLP_TYPE (stmt_info
)
6640 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo
)
6642 (DR_GROUP_FIRST_ELEMENT (stmt_info
))),
6643 TYPE_VECTOR_SUBPARTS (vectype
)))
6645 else if (!loop_vinfo
6646 || (nested_in_vect_loop
6647 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr
)),
6648 GET_MODE_SIZE (TYPE_MODE (vectype
)))))
6649 return dr_explicit_realign
;
6651 return dr_explicit_realign_optimized
;
6653 if (!known_alignment_for_access_p (dr_info
))
6654 is_packed
= not_size_aligned (DR_REF (dr
));
6656 if (targetm
.vectorize
.support_vector_misalignment
6657 (mode
, type
, DR_MISALIGNMENT (dr_info
), is_packed
))
6658 /* Can't software pipeline the loads, but can at least do them. */
6659 return dr_unaligned_supported
;
6663 bool is_packed
= false;
6664 tree type
= (TREE_TYPE (DR_REF (dr
)));
6666 if (!known_alignment_for_access_p (dr_info
))
6667 is_packed
= not_size_aligned (DR_REF (dr
));
6669 if (targetm
.vectorize
.support_vector_misalignment
6670 (mode
, type
, DR_MISALIGNMENT (dr_info
), is_packed
))
6671 return dr_unaligned_supported
;
6675 return dr_unaligned_unsupported
;