[gcc.git] / gcc / tree-vect-slp.c
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 slp_tree, stmt_vector_for_cost *);
55
56 static object_allocator<_slp_tree> *slp_tree_pool;
57 static slp_tree slp_first_node;
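/* SLP nodes are allocated from SLP_TREE_POOL and additionally chained in a
list headed by SLP_FIRST_NODE (see the _slp_tree constructor and
destructor) so that vect_slp_fini can release nodes that are still live
when the pool is torn down. */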
58
59 void
60 vect_slp_init (void)
61 {
62 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
63 }
64
65 void
66 vect_slp_fini (void)
67 {
68 while (slp_first_node)
69 delete slp_first_node;
70 delete slp_tree_pool;
71 slp_tree_pool = NULL;
72 }
73
74 void *
75 _slp_tree::operator new (size_t n)
76 {
77 gcc_assert (n == sizeof (_slp_tree));
78 return slp_tree_pool->allocate_raw ();
79 }
80
81 void
82 _slp_tree::operator delete (void *node, size_t n)
83 {
84 gcc_assert (n == sizeof (_slp_tree));
85 slp_tree_pool->remove_raw (node);
86 }
87
88
89 /* Initialize a SLP node. */
90
91 _slp_tree::_slp_tree ()
92 {
93 this->prev_node = NULL;
94 if (slp_first_node)
95 slp_first_node->prev_node = this;
96 this->next_node = slp_first_node;
97 slp_first_node = this;
98 SLP_TREE_SCALAR_STMTS (this) = vNULL;
99 SLP_TREE_SCALAR_OPS (this) = vNULL;
100 SLP_TREE_VEC_STMTS (this) = vNULL;
101 SLP_TREE_VEC_DEFS (this) = vNULL;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
107 SLP_TREE_CODE (this) = ERROR_MARK;
108 SLP_TREE_VECTYPE (this) = NULL_TREE;
109 SLP_TREE_REPRESENTATIVE (this) = NULL;
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits = 1;
112 this->lanes = 0;
113 }
114
115 /* Tear down a SLP node. */
116
117 _slp_tree::~_slp_tree ()
118 {
119 if (this->prev_node)
120 this->prev_node->next_node = this->next_node;
121 else
122 slp_first_node = this->next_node;
123 if (this->next_node)
124 this->next_node->prev_node = this->prev_node;
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
132 }
133
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
135
136 void
137 vect_free_slp_tree (slp_tree node)
138 {
139 int i;
140 slp_tree child;
141
142 if (--SLP_TREE_REF_COUNT (node) != 0)
143 return;
144
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
146 if (child)
147 vect_free_slp_tree (child);
148
149 delete node;
150 }
151
152 /* Return a location suitable for dumps related to the SLP instance. */
153
154 dump_user_location_t
155 _slp_instance::location () const
156 {
157 if (root_stmt)
158 return root_stmt->stmt;
159 else
160 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
161 }
162
163
164 /* Free the memory allocated for the SLP instance. */
165
166 void
167 vect_free_slp_instance (slp_instance instance)
168 {
169 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
170 SLP_INSTANCE_LOADS (instance).release ();
171 instance->subgraph_entries.release ();
172 instance->cost_vec.release ();
173 free (instance);
174 }
175
176
177 /* Create an SLP node with CODE and room for NOPS children. */
178
179 slp_tree
180 vect_create_new_slp_node (unsigned nops, tree_code code)
181 {
182 slp_tree node = new _slp_tree;
183 SLP_TREE_SCALAR_STMTS (node) = vNULL;
184 SLP_TREE_CHILDREN (node).create (nops);
185 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
186 SLP_TREE_CODE (node) = code;
187 return node;
188 }
189 /* Create an SLP node for SCALAR_STMTS. */
190
191 static slp_tree
192 vect_create_new_slp_node (slp_tree node,
193 vec<stmt_vec_info> scalar_stmts, unsigned nops)
194 {
195 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
196 SLP_TREE_CHILDREN (node).create (nops);
197 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
198 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
199 SLP_TREE_LANES (node) = scalar_stmts.length ();
200 return node;
201 }
202
203 /* Create an SLP node for SCALAR_STMTS. */
204
205 static slp_tree
206 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
207 {
208 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
209 }
210
211 /* Create an SLP node for OPS. */
212
213 static slp_tree
214 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
215 {
216 SLP_TREE_SCALAR_OPS (node) = ops;
217 SLP_TREE_DEF_TYPE (node) = vect_external_def;
218 SLP_TREE_LANES (node) = ops.length ();
219 return node;
220 }
221
222 /* Create an SLP node for OPS. */
223
224 static slp_tree
225 vect_create_new_slp_node (vec<tree> ops)
226 {
227 return vect_create_new_slp_node (new _slp_tree, ops);
228 }
229
230
231 /* This structure is used in creation of an SLP tree. Each instance
232 corresponds to the same operand in a group of scalar stmts in an SLP
233 node. */
234 typedef struct _slp_oprnd_info
235 {
236 /* Def-stmts for the operands. */
237 vec<stmt_vec_info> def_stmts;
238 /* Operands. */
239 vec<tree> ops;
240 /* Information about the first statement: its vector def-type, the type
241 of its operands, and an indication whether any of the defs comes from
242 a pattern stmt. */
243 tree first_op_type;
244 enum vect_def_type first_dt;
245 bool any_pattern;
246 } *slp_oprnd_info;
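/* One slp_oprnd_info is kept per operand position; across the group it
collects the I-th operand (and its def stmt) of every scalar stmt, i.e. a
transposed view of the group's operand lists. */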
247
248
249 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
250 operand. */
251 static vec<slp_oprnd_info>
252 vect_create_oprnd_info (int nops, int group_size)
253 {
254 int i;
255 slp_oprnd_info oprnd_info;
256 vec<slp_oprnd_info> oprnds_info;
257
258 oprnds_info.create (nops);
259 for (i = 0; i < nops; i++)
260 {
261 oprnd_info = XNEW (struct _slp_oprnd_info);
262 oprnd_info->def_stmts.create (group_size);
263 oprnd_info->ops.create (group_size);
264 oprnd_info->first_dt = vect_uninitialized_def;
265 oprnd_info->first_op_type = NULL_TREE;
266 oprnd_info->any_pattern = false;
267 oprnds_info.quick_push (oprnd_info);
268 }
269
270 return oprnds_info;
271 }
272
273
274 /* Free operands info. */
275
276 static void
277 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
278 {
279 int i;
280 slp_oprnd_info oprnd_info;
281
282 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
283 {
284 oprnd_info->def_stmts.release ();
285 oprnd_info->ops.release ();
286 XDELETE (oprnd_info);
287 }
288
289 oprnds_info.release ();
290 }
291
292
293 /* Return true if STMTS contains a pattern statement. */
294
295 static bool
296 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
297 {
298 stmt_vec_info stmt_info;
299 unsigned int i;
300 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
301 if (is_pattern_stmt_p (stmt_info))
302 return true;
303 return false;
304 }
305
306 /* Return true when all lanes in the external or constant NODE have
307 the same value. */
308
309 static bool
310 vect_slp_tree_uniform_p (slp_tree node)
311 {
312 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
313 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
314
315 /* Pre-existing vectors. */
316 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
317 return false;
318
319 unsigned i;
320 tree op, first = NULL_TREE;
321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
322 if (!first)
323 first = op;
324 else if (!operand_equal_p (first, op, 0))
325 return false;
326
327 return true;
328 }
329
330 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
331 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
332 of the chain. */
333
334 int
335 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
336 stmt_vec_info first_stmt_info)
337 {
338 stmt_vec_info next_stmt_info = first_stmt_info;
339 int result = 0;
340
341 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
342 return -1;
343
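/* DR_GROUP_GAP of each non-first group member is its distance from the
previous member, so accumulating the gaps while walking the chain yields
STMT_INFO's element position. */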
344 do
345 {
346 if (next_stmt_info == stmt_info)
347 return result;
348 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
349 if (next_stmt_info)
350 result += DR_GROUP_GAP (next_stmt_info);
351 }
352 while (next_stmt_info);
353
354 return -1;
355 }
356
357 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
358 using the method implemented by duplicate_and_interleave. Return true
359 if so, returning the number of intermediate vectors in *NVECTORS_OUT
360 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
361 (if nonnull). */
362
363 bool
364 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
365 tree elt_type, unsigned int *nvectors_out,
366 tree *vector_type_out,
367 tree *permutes)
368 {
369 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
370 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
371 return false;
372
373 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
374 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
375 unsigned int nvectors = 1;
376 for (;;)
377 {
378 scalar_int_mode int_mode;
379 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
380 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
381 {
382 /* Get the natural vector type for this SLP group size. */
383 tree int_type = build_nonstandard_integer_type
384 (GET_MODE_BITSIZE (int_mode), 1);
385 tree vector_type
386 = get_vectype_for_scalar_type (vinfo, int_type, count);
387 if (vector_type
388 && VECTOR_MODE_P (TYPE_MODE (vector_type))
389 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
390 GET_MODE_SIZE (base_vector_mode)))
391 {
392 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
393 together into elements of type INT_TYPE and using the result
394 to build NVECTORS vectors. */
395 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
396 vec_perm_builder sel1 (nelts, 2, 3);
397 vec_perm_builder sel2 (nelts, 2, 3);
398 poly_int64 half_nelts = exact_div (nelts, 2);
399 for (unsigned int i = 0; i < 3; ++i)
400 {
401 sel1.quick_push (i);
402 sel1.quick_push (i + nelts);
403 sel2.quick_push (half_nelts + i);
404 sel2.quick_push (half_nelts + i + nelts);
405 }
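/* For a vector with NELTS elements the first permute is the
interleave-low { 0, NELTS, 1, NELTS + 1, ... } and the second the
interleave-high { NELTS/2, NELTS + NELTS/2, ... } of the two inputs. */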
406 vec_perm_indices indices1 (sel1, 2, nelts);
407 vec_perm_indices indices2 (sel2, 2, nelts);
408 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
409 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
410 {
411 if (nvectors_out)
412 *nvectors_out = nvectors;
413 if (vector_type_out)
414 *vector_type_out = vector_type;
415 if (permutes)
416 {
417 permutes[0] = vect_gen_perm_mask_checked (vector_type,
418 indices1);
419 permutes[1] = vect_gen_perm_mask_checked (vector_type,
420 indices2);
421 }
422 return true;
423 }
424 }
425 }
426 if (!multiple_p (elt_bytes, 2, &elt_bytes))
427 return false;
428 nvectors *= 2;
429 }
430 }
431
432 /* Return true if DTA and DTB match. */
433
434 static bool
435 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
436 {
437 return (dta == dtb
438 || ((dta == vect_external_def || dta == vect_constant_def)
439 && (dtb == vect_external_def || dtb == vect_constant_def)));
440 }
441
442 /* Get the defs for the rhs of STMTS[STMT_NUM] (collect them in
443 OPRNDS_INFO), check that they are of a valid type and that they match
444 the defs of the first stmt of the SLP group (stored in OPRNDS_INFO).
445 This function tries to match stmts by swapping operands of
446 STMTS[STMT_NUM] when possible. Non-zero SWAP indicates that a swap
447 is required for cond_expr stmts: SWAP is 1 if STMTS[STMT_NUM] is a
448 cond_expr and the operands of its comparison need to be swapped;
449 SWAP is 2 if it is a cond_expr and the code of its comparison needs
450 to be inverted.
451 If there was a fatal error return -1; if the error could be corrected
452 by swapping operands of the parent node of this one, return 1; if
453 everything is ok return 0. */
454 static int
455 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
456 bool *skip_args,
457 vec<stmt_vec_info> stmts, unsigned stmt_num,
458 vec<slp_oprnd_info> *oprnds_info)
459 {
460 stmt_vec_info stmt_info = stmts[stmt_num];
461 tree oprnd;
462 unsigned int i, number_of_oprnds;
463 enum vect_def_type dt = vect_uninitialized_def;
464 slp_oprnd_info oprnd_info;
465 int first_op_idx = 1;
466 unsigned int commutative_op = -1U;
467 bool first_op_cond = false;
468 bool first = stmt_num == 0;
469
470 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
471 {
472 number_of_oprnds = gimple_call_num_args (stmt);
473 first_op_idx = 3;
474 if (gimple_call_internal_p (stmt))
475 {
476 internal_fn ifn = gimple_call_internal_fn (stmt);
477 commutative_op = first_commutative_argument (ifn);
478
479 /* Masked load, only look at mask. */
480 if (ifn == IFN_MASK_LOAD)
481 {
482 number_of_oprnds = 1;
483 /* Mask operand index. */
484 first_op_idx = 5;
485 }
486 }
487 }
488 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
489 {
490 enum tree_code code = gimple_assign_rhs_code (stmt);
491 number_of_oprnds = gimple_num_ops (stmt) - 1;
492 /* Swapping can only be done for cond_expr if asked to, otherwise we
493 could end up with a different comparison code from the first stmt. */
494 if (code == COND_EXPR
495 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
496 {
497 first_op_cond = true;
498 number_of_oprnds++;
499 }
500 else
501 commutative_op = commutative_tree_code (code) ? 0U : -1U;
502 }
503 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
504 number_of_oprnds = gimple_phi_num_args (stmt);
505 else
506 return -1;
507
508 bool swapped = (swap != 0);
509 bool backedge = false;
510 gcc_assert (!swapped || first_op_cond);
511 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
512 for (i = 0; i < number_of_oprnds; i++)
513 {
514 if (first_op_cond)
515 {
516 /* Map indicating how operands of cond_expr should be swapped. */
517 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
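/* maps[0] is the identity, maps[1] swaps the two comparison operands
(SWAP == 1) and maps[2] swaps the then/else arms to match an inverted
comparison code (SWAP == 2). */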
518 int *map = maps[swap];
519
520 if (i < 2)
521 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
522 first_op_idx), map[i]);
523 else
524 oprnd = gimple_op (stmt_info->stmt, map[i]);
525 }
526 else if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
527 {
528 oprnd = gimple_phi_arg_def (stmt, i);
529 backedge = dominated_by_p (CDI_DOMINATORS,
530 gimple_phi_arg_edge (stmt, i)->src,
531 gimple_bb (stmt_info->stmt));
532 }
533 else
534 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
535 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
536 oprnd = TREE_OPERAND (oprnd, 0);
537
538 oprnd_info = (*oprnds_info)[i];
539
540 stmt_vec_info def_stmt_info;
541 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
542 {
543 if (dump_enabled_p ())
544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
545 "Build SLP failed: can't analyze def for %T\n",
546 oprnd);
547
548 return -1;
549 }
550
551 if (skip_args[i])
552 {
553 oprnd_info->def_stmts.quick_push (NULL);
554 oprnd_info->ops.quick_push (NULL_TREE);
555 oprnd_info->first_dt = vect_uninitialized_def;
556 continue;
557 }
558
559 oprnd_info->def_stmts.quick_push (def_stmt_info);
560 oprnd_info->ops.quick_push (oprnd);
561
562 if (def_stmt_info
563 && is_pattern_stmt_p (def_stmt_info))
564 {
565 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
566 != def_stmt_info)
567 oprnd_info->any_pattern = true;
568 else
569 /* If we promote this to external use the original stmt def. */
570 oprnd_info->ops.last ()
571 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
572 }
573
574 /* If there's an extern def on a backedge make sure we can
575 code-generate at the region start.
576 ??? This is another case that could be fixed by adjusting
577 how we split the function but at the moment we'd have conflicting
578 goals there. */
579 if (backedge
580 && dts[i] == vect_external_def
581 && is_a <bb_vec_info> (vinfo)
582 && TREE_CODE (oprnd) == SSA_NAME
583 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
584 && !dominated_by_p (CDI_DOMINATORS,
585 as_a <bb_vec_info> (vinfo)->bbs[0],
586 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
587 {
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Build SLP failed: extern def %T only defined "
591 "on backedge\n", oprnd);
592 return -1;
593 }
594
595 if (first)
596 {
597 tree type = TREE_TYPE (oprnd);
598 dt = dts[i];
599 if ((dt == vect_constant_def
600 || dt == vect_external_def)
601 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
602 && (TREE_CODE (type) == BOOLEAN_TYPE
603 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
604 type)))
605 {
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "Build SLP failed: invalid type of def "
609 "for variable-length SLP %T\n", oprnd);
610 return -1;
611 }
612
613 /* For the swapping logic below force vect_reduction_def
614 for the reduction op in a SLP reduction group. */
615 if (!STMT_VINFO_DATA_REF (stmt_info)
616 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
617 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
618 && def_stmt_info)
619 dts[i] = dt = vect_reduction_def;
620
621 /* Check the types of the definition. */
622 switch (dt)
623 {
624 case vect_external_def:
625 case vect_constant_def:
626 case vect_internal_def:
627 case vect_reduction_def:
628 case vect_induction_def:
629 case vect_nested_cycle:
630 break;
631
632 default:
633 /* FORNOW: Not supported. */
634 if (dump_enabled_p ())
635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
636 "Build SLP failed: illegal type of def %T\n",
637 oprnd);
638 return -1;
639 }
640
641 oprnd_info->first_dt = dt;
642 oprnd_info->first_op_type = type;
643 }
644 }
645 if (first)
646 return 0;
647
648 /* Now match the operand definition types to that of the first stmt. */
649 for (i = 0; i < number_of_oprnds;)
650 {
651 if (skip_args[i])
652 {
653 ++i;
654 continue;
655 }
656
657 oprnd_info = (*oprnds_info)[i];
658 dt = dts[i];
659 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
660 oprnd = oprnd_info->ops[stmt_num];
661 tree type = TREE_TYPE (oprnd);
662
663 if (!types_compatible_p (oprnd_info->first_op_type, type))
664 {
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
667 "Build SLP failed: different operand types\n");
668 return 1;
669 }
670
671 /* Not first stmt of the group, check that the def-stmt/s match
672 the def-stmt/s of the first stmt. Allow different definition
673 types for reduction chains: the first stmt must be a
674 vect_reduction_def (a phi node), and the rest
675 end in the reduction chain. */
676 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
677 && !(oprnd_info->first_dt == vect_reduction_def
678 && !STMT_VINFO_DATA_REF (stmt_info)
679 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
680 && def_stmt_info
681 && !STMT_VINFO_DATA_REF (def_stmt_info)
682 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
683 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
684 || (!STMT_VINFO_DATA_REF (stmt_info)
685 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
686 && ((!def_stmt_info
687 || STMT_VINFO_DATA_REF (def_stmt_info)
688 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
689 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
690 != (oprnd_info->first_dt != vect_reduction_def))))
691 {
692 /* Try swapping operands if we got a mismatch. For BB
693 vectorization only in case it will clearly improve things. */
694 if (i == commutative_op && !swapped
695 && (!is_a <bb_vec_info> (vinfo)
696 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
697 dts[i+1])
698 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
699 || vect_def_types_match
700 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
701 {
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_NOTE, vect_location,
704 "trying swapped operands\n");
705 std::swap (dts[i], dts[i+1]);
706 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
707 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
708 std::swap ((*oprnds_info)[i]->ops[stmt_num],
709 (*oprnds_info)[i+1]->ops[stmt_num]);
710 swapped = true;
711 continue;
712 }
713
714 if (is_a <bb_vec_info> (vinfo)
715 && !oprnd_info->any_pattern)
716 {
717 /* Now for commutative ops we should see whether we can
718 make the other operand matching. */
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "treating operand as external\n");
722 oprnd_info->first_dt = dt = vect_external_def;
723 }
724 else
725 {
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
728 "Build SLP failed: different types\n");
729 return 1;
730 }
731 }
732
733 /* Make sure to demote the overall operand to external. */
734 if (dt == vect_external_def)
735 oprnd_info->first_dt = vect_external_def;
736 /* For a SLP reduction chain we want to duplicate the reduction to
737 each of the chain members. That gets us a sane SLP graph (still
738 the stmts are not 100% correct wrt the initial values). */
739 else if ((dt == vect_internal_def
740 || dt == vect_reduction_def)
741 && oprnd_info->first_dt == vect_reduction_def
742 && !STMT_VINFO_DATA_REF (stmt_info)
743 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
744 && !STMT_VINFO_DATA_REF (def_stmt_info)
745 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
746 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
747 {
748 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
749 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
750 }
751
752 ++i;
753 }
754
755 /* Swap operands. */
756 if (swapped)
757 {
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_NOTE, vect_location,
760 "swapped operands to match def types in %G",
761 stmt_info->stmt);
762 }
763
764 return 0;
765 }
766
767 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
768 Return true if we can, meaning that this choice doesn't conflict with
769 existing SLP nodes that use STMT_INFO. */
770
771 bool
772 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
773 {
774 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
775 if (old_vectype)
776 return useless_type_conversion_p (vectype, old_vectype);
777
778 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
779 {
780 /* We maintain the invariant that if any statement in the group is
781 used, all other members of the group have the same vector type. */
782 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
783 stmt_vec_info member_info = first_info;
784 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
785 if (is_pattern_stmt_p (member_info)
786 && !useless_type_conversion_p (vectype,
787 STMT_VINFO_VECTYPE (member_info)))
788 break;
789
790 if (!member_info)
791 {
792 for (member_info = first_info; member_info;
793 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
794 STMT_VINFO_VECTYPE (member_info) = vectype;
795 return true;
796 }
797 }
798 else if (!is_pattern_stmt_p (stmt_info))
799 {
800 STMT_VINFO_VECTYPE (stmt_info) = vectype;
801 return true;
802 }
803
804 if (dump_enabled_p ())
805 {
806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
807 "Build SLP failed: incompatible vector"
808 " types for: %G", stmt_info->stmt);
809 dump_printf_loc (MSG_NOTE, vect_location,
810 " old vector type: %T\n", old_vectype);
811 dump_printf_loc (MSG_NOTE, vect_location,
812 " new vector type: %T\n", vectype);
813 }
814 return false;
815 }
816
817 /* Return true if call statements CALL1 and CALL2 are similar enough
818 to be combined into the same SLP group. */
819
820 static bool
821 compatible_calls_p (gcall *call1, gcall *call2)
822 {
823 unsigned int nargs = gimple_call_num_args (call1);
824 if (nargs != gimple_call_num_args (call2))
825 return false;
826
827 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
828 return false;
829
830 if (gimple_call_internal_p (call1))
831 {
832 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
833 TREE_TYPE (gimple_call_lhs (call2))))
834 return false;
835 for (unsigned int i = 0; i < nargs; ++i)
836 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
837 TREE_TYPE (gimple_call_arg (call2, i))))
838 return false;
839 }
840 else
841 {
842 if (!operand_equal_p (gimple_call_fn (call1),
843 gimple_call_fn (call2), 0))
844 return false;
845
846 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
847 return false;
848 }
849 return true;
850 }
851
852 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
853 caller's attempt to find the vector type in STMT_INFO with the narrowest
854 element type. Return true if VECTYPE is nonnull and if it is valid
855 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
856 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
857 vect_build_slp_tree. */
858
859 static bool
860 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
861 unsigned int group_size,
862 tree vectype, poly_uint64 *max_nunits)
863 {
864 if (!vectype)
865 {
866 if (dump_enabled_p ())
867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
868 "Build SLP failed: unsupported data-type in %G\n",
869 stmt_info->stmt);
870 /* Fatal mismatch. */
871 return false;
872 }
873
874 /* If populating the vector type requires unrolling then fail
875 before adjusting *max_nunits for basic-block vectorization. */
876 if (is_a <bb_vec_info> (vinfo)
877 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
878 {
879 if (dump_enabled_p ())
880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
881 "Build SLP failed: unrolling required "
882 "in basic block SLP\n");
883 /* Fatal mismatch. */
884 return false;
885 }
886
887 /* In case of multiple types we need to detect the smallest type. */
888 vect_update_max_nunits (max_nunits, vectype);
889 return true;
890 }
891
892 /* Verify that the scalar stmts STMTS are isomorphic, do not require
893 data permutation and are not of unsupported types of operation.
894 Return true if they are, otherwise return false and indicate in
895 *MATCHES which stmts are not isomorphic to the first one. If
896 MATCHES[0] is false then this indicates the comparison could not be
897 carried out or the stmts will never be vectorized by SLP.
898
899 Note COND_EXPR is possibly isomorphic to another one after swapping its
900 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
901 the first stmt by swapping the two operands of comparison; set SWAP[i]
902 to 2 if stmt I is isomorphic to the first stmt by inverting the code
903 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
904 to (B1 <= A1 ? X1 : Y1) or be inverted to (A1 < B1) ? Y1 : X1. */
905
906 static bool
907 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
908 vec<stmt_vec_info> stmts, unsigned int group_size,
909 poly_uint64 *max_nunits, bool *matches,
910 bool *two_operators, tree *node_vectype)
911 {
912 unsigned int i;
913 stmt_vec_info first_stmt_info = stmts[0];
914 enum tree_code first_stmt_code = ERROR_MARK;
915 enum tree_code alt_stmt_code = ERROR_MARK;
916 enum tree_code rhs_code = ERROR_MARK;
917 enum tree_code first_cond_code = ERROR_MARK;
918 tree lhs;
919 bool need_same_oprnds = false;
920 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
921 optab optab;
922 int icode;
923 machine_mode optab_op2_mode;
924 machine_mode vec_mode;
925 stmt_vec_info first_load = NULL, prev_first_load = NULL;
926 bool first_stmt_load_p = false, load_p = false;
927 bool first_stmt_phi_p = false, phi_p = false;
928 bool maybe_soft_fail = false;
929 tree soft_fail_nunits_vectype = NULL_TREE;
930
931 /* For every stmt in NODE find its def stmt/s. */
932 stmt_vec_info stmt_info;
933 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
934 {
935 gimple *stmt = stmt_info->stmt;
936 swap[i] = 0;
937 matches[i] = false;
938
939 if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
941
942 /* Fail to vectorize statements marked as unvectorizable, statements
943 that can throw internally or that have volatile operands. */
944 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
945 || stmt_can_throw_internal (cfun, stmt)
946 || gimple_has_volatile_ops (stmt))
947 {
948 if (dump_enabled_p ())
949 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
950 "Build SLP failed: unvectorizable statement %G",
951 stmt);
952 /* ??? For BB vectorization we want to commute operands in a way
953 that shuffles all unvectorizable defs into one operand and keeps
954 the other one vectorizable. The following doesn't reliably
955 achieve this but it's the easiest we can do here. */
956 if (is_a <bb_vec_info> (vinfo) && i != 0)
957 continue;
958 /* Fatal mismatch. */
959 matches[0] = false;
960 return false;
961 }
962
963 lhs = gimple_get_lhs (stmt);
964 if (lhs == NULL_TREE)
965 {
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
968 "Build SLP failed: not GIMPLE_ASSIGN nor "
969 "GIMPLE_CALL %G", stmt);
970 if (is_a <bb_vec_info> (vinfo) && i != 0)
971 continue;
972 /* Fatal mismatch. */
973 matches[0] = false;
974 return false;
975 }
976
977 tree nunits_vectype;
978 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
979 &nunits_vectype, group_size))
980 {
981 if (is_a <bb_vec_info> (vinfo) && i != 0)
982 continue;
983 /* Fatal mismatch. */
984 matches[0] = false;
985 return false;
986 }
987 /* Record nunits required but continue analysis, producing matches[]
988 as if nunits was not an issue. This allows splitting of groups
989 to happen. */
990 if (nunits_vectype
991 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
992 nunits_vectype, max_nunits))
993 {
994 gcc_assert (is_a <bb_vec_info> (vinfo));
995 maybe_soft_fail = true;
996 soft_fail_nunits_vectype = nunits_vectype;
997 }
998
999 gcc_assert (vectype);
1000
1001 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1002 if (call_stmt)
1003 {
1004 rhs_code = CALL_EXPR;
1005
1006 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
1007 load_p = true;
1008 else if ((gimple_call_internal_p (call_stmt)
1009 && (!vectorizable_internal_fn_p
1010 (gimple_call_internal_fn (call_stmt))))
1011 || gimple_call_tail_p (call_stmt)
1012 || gimple_call_noreturn_p (call_stmt)
1013 || !gimple_call_nothrow_p (call_stmt)
1014 || gimple_call_chain (call_stmt))
1015 {
1016 if (dump_enabled_p ())
1017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1018 "Build SLP failed: unsupported call type %G",
1019 call_stmt);
1020 if (is_a <bb_vec_info> (vinfo) && i != 0)
1021 continue;
1022 /* Fatal mismatch. */
1023 matches[0] = false;
1024 return false;
1025 }
1026 }
1027 else if (gimple_code (stmt) == GIMPLE_PHI)
1028 {
1029 rhs_code = ERROR_MARK;
1030 phi_p = true;
1031 }
1032 else
1033 {
1034 rhs_code = gimple_assign_rhs_code (stmt);
1035 load_p = gimple_vuse (stmt);
1036 }
1037
1038 /* Check the operation. */
1039 if (i == 0)
1040 {
1041 *node_vectype = vectype;
1042 first_stmt_code = rhs_code;
1043 first_stmt_load_p = load_p;
1044 first_stmt_phi_p = phi_p;
1045
1046 /* Shift arguments should be equal in all the packed stmts for a
1047 vector shift with scalar shift operand. */
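/* E.g. { x0 << 3, x1 << 3 } can use a vector/scalar shift, while
{ x0 << 3, x1 << 5 } would need the shift amounts in a vector operand. */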
1048 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1049 || rhs_code == LROTATE_EXPR
1050 || rhs_code == RROTATE_EXPR)
1051 {
1052 vec_mode = TYPE_MODE (vectype);
1053
1054 /* First see if we have a vector/vector shift. */
1055 optab = optab_for_tree_code (rhs_code, vectype,
1056 optab_vector);
1057
1058 if (!optab
1059 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
1060 {
1061 /* No vector/vector shift, try for a vector/scalar shift. */
1062 optab = optab_for_tree_code (rhs_code, vectype,
1063 optab_scalar);
1064
1065 if (!optab)
1066 {
1067 if (dump_enabled_p ())
1068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1069 "Build SLP failed: no optab.\n");
1070 if (is_a <bb_vec_info> (vinfo) && i != 0)
1071 continue;
1072 /* Fatal mismatch. */
1073 matches[0] = false;
1074 return false;
1075 }
1076 icode = (int) optab_handler (optab, vec_mode);
1077 if (icode == CODE_FOR_nothing)
1078 {
1079 if (dump_enabled_p ())
1080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1081 "Build SLP failed: "
1082 "op not supported by target.\n");
1083 if (is_a <bb_vec_info> (vinfo) && i != 0)
1084 continue;
1085 /* Fatal mismatch. */
1086 matches[0] = false;
1087 return false;
1088 }
1089 optab_op2_mode = insn_data[icode].operand[2].mode;
1090 if (!VECTOR_MODE_P (optab_op2_mode))
1091 {
1092 need_same_oprnds = true;
1093 first_op1 = gimple_assign_rhs2 (stmt);
1094 }
1095 }
1096 }
1097 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1098 {
1099 need_same_oprnds = true;
1100 first_op1 = gimple_assign_rhs2 (stmt);
1101 }
1102 else if (!load_p
1103 && rhs_code == BIT_FIELD_REF)
1104 {
1105 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1106 if (!is_a <bb_vec_info> (vinfo)
1107 || TREE_CODE (vec) != SSA_NAME
1108 || !operand_equal_p (TYPE_SIZE (vectype),
1109 TYPE_SIZE (TREE_TYPE (vec))))
1110 {
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "Build SLP failed: "
1114 "BIT_FIELD_REF not supported\n");
1115 /* Fatal mismatch. */
1116 matches[0] = false;
1117 return false;
1118 }
1119 }
1120 else if (call_stmt
1121 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
1122 {
1123 need_same_oprnds = true;
1124 first_op1 = gimple_call_arg (call_stmt, 1);
1125 }
1126 }
1127 else
1128 {
1129 if (first_stmt_code != rhs_code
1130 && alt_stmt_code == ERROR_MARK)
1131 alt_stmt_code = rhs_code;
1132 if ((first_stmt_code != rhs_code
1133 && (first_stmt_code != IMAGPART_EXPR
1134 || rhs_code != REALPART_EXPR)
1135 && (first_stmt_code != REALPART_EXPR
1136 || rhs_code != IMAGPART_EXPR)
1137 /* Handle mismatches in plus/minus by computing both
1138 and merging the results. */
1139 && !((first_stmt_code == PLUS_EXPR
1140 || first_stmt_code == MINUS_EXPR)
1141 && (alt_stmt_code == PLUS_EXPR
1142 || alt_stmt_code == MINUS_EXPR)
1143 && rhs_code == alt_stmt_code)
1144 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1145 && (first_stmt_code == ARRAY_REF
1146 || first_stmt_code == BIT_FIELD_REF
1147 || first_stmt_code == INDIRECT_REF
1148 || first_stmt_code == COMPONENT_REF
1149 || first_stmt_code == MEM_REF)))
1150 || first_stmt_load_p != load_p
1151 || first_stmt_phi_p != phi_p)
1152 {
1153 if (dump_enabled_p ())
1154 {
1155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1156 "Build SLP failed: different operation "
1157 "in stmt %G", stmt);
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 "original stmt %G", first_stmt_info->stmt);
1160 }
1161 /* Mismatch. */
1162 continue;
1163 }
1164
1165 if (need_same_oprnds)
1166 {
1167 tree other_op1 = (call_stmt
1168 ? gimple_call_arg (call_stmt, 1)
1169 : gimple_assign_rhs2 (stmt));
1170 if (!operand_equal_p (first_op1, other_op1, 0))
1171 {
1172 if (dump_enabled_p ())
1173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1174 "Build SLP failed: different shift "
1175 "arguments in %G", stmt);
1176 /* Mismatch. */
1177 continue;
1178 }
1179 }
1180 if (!load_p
1181 && first_stmt_code == BIT_FIELD_REF
1182 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1183 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1184 {
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "Build SLP failed: different BIT_FIELD_REF "
1188 "arguments in %G", stmt);
1189 /* Mismatch. */
1190 continue;
1191 }
1192
1193 if (!load_p && rhs_code == CALL_EXPR)
1194 {
1195 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1196 as_a <gcall *> (stmt)))
1197 {
1198 if (dump_enabled_p ())
1199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1200 "Build SLP failed: different calls in %G",
1201 stmt);
1202 /* Mismatch. */
1203 continue;
1204 }
1205 }
1206
1207 if (phi_p
1208 && (gimple_bb (first_stmt_info->stmt)
1209 != gimple_bb (stmt_info->stmt)))
1210 {
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "Build SLP failed: different BB for PHI "
1214 "in %G", stmt);
1215 /* Mismatch. */
1216 continue;
1217 }
1218
1219 if (!types_compatible_p (vectype, *node_vectype))
1220 {
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "Build SLP failed: different vector type "
1224 "in %G", stmt);
1225 /* Mismatch. */
1226 continue;
1227 }
1228 }
1229
1230 /* Grouped store or load. */
1231 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1232 {
1233 if (REFERENCE_CLASS_P (lhs))
1234 {
1235 /* Store. */
1236 ;
1237 }
1238 else
1239 {
1240 /* Load. */
1241 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1242 if (prev_first_load)
1243 {
1244 /* Check that there are no loads from different interleaving
1245 chains in the same node. */
1246 if (prev_first_load != first_load)
1247 {
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1250 vect_location,
1251 "Build SLP failed: different "
1252 "interleaving chains in one node %G",
1253 stmt);
1254 /* Mismatch. */
1255 continue;
1256 }
1257 }
1258 else
1259 prev_first_load = first_load;
1260 }
1261 } /* Grouped access. */
1262 else
1263 {
1264 if (load_p)
1265 {
1266 /* Not grouped load. */
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269 "Build SLP failed: not grouped load %G", stmt);
1270
1271 /* FORNOW: Non-grouped loads are not supported. */
1272 if (is_a <bb_vec_info> (vinfo) && i != 0)
1273 continue;
1274 /* Fatal mismatch. */
1275 matches[0] = false;
1276 return false;
1277 }
1278
1279 /* Not memory operation. */
1280 if (!phi_p
1281 && TREE_CODE_CLASS (rhs_code) != tcc_binary
1282 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1283 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1284 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1285 && rhs_code != VIEW_CONVERT_EXPR
1286 && rhs_code != CALL_EXPR
1287 && rhs_code != BIT_FIELD_REF)
1288 {
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "Build SLP failed: operation unsupported %G",
1292 stmt);
1293 if (is_a <bb_vec_info> (vinfo) && i != 0)
1294 continue;
1295 /* Fatal mismatch. */
1296 matches[0] = false;
1297 return false;
1298 }
1299
1300 if (rhs_code == COND_EXPR)
1301 {
1302 tree cond_expr = gimple_assign_rhs1 (stmt);
1303 enum tree_code cond_code = TREE_CODE (cond_expr);
1304 enum tree_code swap_code = ERROR_MARK;
1305 enum tree_code invert_code = ERROR_MARK;
1306
1307 if (i == 0)
1308 first_cond_code = TREE_CODE (cond_expr);
1309 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1310 {
1311 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1312 swap_code = swap_tree_comparison (cond_code);
1313 invert_code = invert_tree_comparison (cond_code, honor_nans);
1314 }
1315
1316 if (first_cond_code == cond_code)
1317 ;
1318 /* Isomorphism can be achieved by swapping. */
1319 else if (first_cond_code == swap_code)
1320 swap[i] = 1;
1321 /* Isomorphism can be achieved by inverting. */
1322 else if (first_cond_code == invert_code)
1323 swap[i] = 2;
1324 else
1325 {
1326 if (dump_enabled_p ())
1327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1328 "Build SLP failed: different"
1329 " operation %G", stmt);
1330 /* Mismatch. */
1331 continue;
1332 }
1333 }
1334 }
1335
1336 matches[i] = true;
1337 }
1338
1339 for (i = 0; i < group_size; ++i)
1340 if (!matches[i])
1341 return false;
1342
1343 /* If we allowed a two-operation SLP node verify the target can cope
1344 with the permute we are going to use. */
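/* E.g. { a0 + b0, a1 - b1, a2 + b2, a3 - b3 }: both a PLUS and a MINUS
vector operation are generated and the wanted lanes of the two results
are blended together afterwards. */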
1345 if (alt_stmt_code != ERROR_MARK
1346 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1347 {
1348 *two_operators = true;
1349 }
1350
1351 if (maybe_soft_fail)
1352 {
1353 unsigned HOST_WIDE_INT const_nunits;
1354 if (!TYPE_VECTOR_SUBPARTS
1355 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1356 || const_nunits > group_size)
1357 matches[0] = false;
1358 else
1359 {
1360 /* With constant vector elements simulate a mismatch at the
1361 point we need to split. */
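/* E.g. GROUP_SIZE 7 with 4-element vectors: clear matches[4..6] so the
caller splits the group after the first full vector's worth of stmts. */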
1362 unsigned tail = group_size & (const_nunits - 1);
1363 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1364 }
1365 return false;
1366 }
1367
1368 return true;
1369 }
1370
1371 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1372 Note we never remove apart from at destruction time so we do not
1373 need a special value for deleted that differs from empty. */
1374 struct bst_traits
1375 {
1376 typedef vec <stmt_vec_info> value_type;
1377 typedef vec <stmt_vec_info> compare_type;
1378 static inline hashval_t hash (value_type);
1379 static inline bool equal (value_type existing, value_type candidate);
1380 static inline bool is_empty (value_type x) { return !x.exists (); }
1381 static inline bool is_deleted (value_type x) { return !x.exists (); }
1382 static const bool empty_zero_p = true;
1383 static inline void mark_empty (value_type &x) { x.release (); }
1384 static inline void mark_deleted (value_type &x) { x.release (); }
1385 static inline void remove (value_type &x) { x.release (); }
1386 };
1387 inline hashval_t
1388 bst_traits::hash (value_type x)
1389 {
1390 inchash::hash h;
1391 for (unsigned i = 0; i < x.length (); ++i)
1392 h.add_int (gimple_uid (x[i]->stmt));
1393 return h.end ();
1394 }
1395 inline bool
1396 bst_traits::equal (value_type existing, value_type candidate)
1397 {
1398 if (existing.length () != candidate.length ())
1399 return false;
1400 for (unsigned i = 0; i < existing.length (); ++i)
1401 if (existing[i] != candidate[i])
1402 return false;
1403 return true;
1404 }
1405
1406 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1407 simple_hashmap_traits <bst_traits, slp_tree> >
1408 scalar_stmts_to_slp_tree_map_t;
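/* Such a map caches, for a set of scalar stmts, the SLP node already built
for it (or NULL if discovery failed), both to share nodes between SLP
instances and to avoid re-attempting failed discovery. */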
1409
1410 static slp_tree
1411 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1412 vec<stmt_vec_info> stmts, unsigned int group_size,
1413 poly_uint64 *max_nunits,
1414 bool *matches, unsigned *limit, unsigned *tree_size,
1415 scalar_stmts_to_slp_tree_map_t *bst_map);
1416
1417 static slp_tree
1418 vect_build_slp_tree (vec_info *vinfo,
1419 vec<stmt_vec_info> stmts, unsigned int group_size,
1420 poly_uint64 *max_nunits,
1421 bool *matches, unsigned *limit, unsigned *tree_size,
1422 scalar_stmts_to_slp_tree_map_t *bst_map)
1423 {
1424 if (slp_tree *leader = bst_map->get (stmts))
1425 {
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1428 *leader ? "" : "failed ", *leader);
1429 if (*leader)
1430 {
1431 SLP_TREE_REF_COUNT (*leader)++;
1432 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1433 stmts.release ();
1434 }
1435 return *leader;
1436 }
1437
1438 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1439 so we can pick up backedge destinations during discovery. */
1440 slp_tree res = new _slp_tree;
1441 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1442 SLP_TREE_SCALAR_STMTS (res) = stmts;
1443 bst_map->put (stmts.copy (), res);
1444
1445 if (*limit == 0)
1446 {
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_NOTE, vect_location,
1449 "SLP discovery limit exceeded\n");
1450 bool existed_p = bst_map->put (stmts, NULL);
1451 gcc_assert (existed_p);
1452 /* Mark the node invalid so we can detect those when still in use
1453 as backedge destinations. */
1454 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1455 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1456 vect_free_slp_tree (res);
1457 memset (matches, 0, sizeof (bool) * group_size);
1458 return NULL;
1459 }
1460 --*limit;
1461
1462 poly_uint64 this_max_nunits = 1;
1463 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1464 &this_max_nunits,
1465 matches, limit, tree_size, bst_map);
1466 if (!res_)
1467 {
1468 bool existed_p = bst_map->put (stmts, NULL);
1469 gcc_assert (existed_p);
1470 /* Mark the node invalid so we can detect those when still in use
1471 as backedge destinations. */
1472 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1473 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1474 vect_free_slp_tree (res);
1475 }
1476 else
1477 {
1478 gcc_assert (res_ == res);
1479 res->max_nunits = this_max_nunits;
1480 vect_update_max_nunits (max_nunits, this_max_nunits);
1481 /* Keep a reference for the bst_map use. */
1482 SLP_TREE_REF_COUNT (res)++;
1483 }
1484 return res_;
1485 }
1486
1487 /* Recursively build an SLP tree starting from NODE.
1488 Fail (and return NULL) if def-stmts are not isomorphic, require data
1489 permutation or are of unsupported types of operation. Otherwise
1490 return the filled-in NODE. MATCHES indicates which lanes of the
1491 group failed to match the first one when discovery did not
1492 succeed. */
1493
1494 static slp_tree
1495 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1496 vec<stmt_vec_info> stmts, unsigned int group_size,
1497 poly_uint64 *max_nunits,
1498 bool *matches, unsigned *limit, unsigned *tree_size,
1499 scalar_stmts_to_slp_tree_map_t *bst_map)
1500 {
1501 unsigned nops, i, this_tree_size = 0;
1502 poly_uint64 this_max_nunits = *max_nunits;
1503
1504 matches[0] = false;
1505
1506 stmt_vec_info stmt_info = stmts[0];
1507 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1508 nops = gimple_call_num_args (stmt);
1509 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1510 {
1511 nops = gimple_num_ops (stmt) - 1;
1512 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1513 nops++;
1514 }
1515 else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
1516 nops = gimple_phi_num_args (phi);
1517 else
1518 return NULL;
1519
1520 /* If the SLP node is a PHI (induction or reduction), terminate
1521 the recursion. */
1522 bool *skip_args = XALLOCAVEC (bool, nops);
1523 memset (skip_args, 0, sizeof (bool) * nops);
1524 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1525 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1526 {
1527 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1528 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1529 group_size);
1530 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1531 max_nunits))
1532 return NULL;
1533
1534 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1535 if (def_type == vect_induction_def)
1536 {
1537 /* Induction PHIs are not cycles but walk the initial
1538 value. Only for inner loops though; for outer loops
1539 we need to pick up the value from the actual PHIs
1540 to more easily support peeling and epilogue vectorization. */
1541 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1542 if (!nested_in_vect_loop_p (loop, stmt_info))
1543 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1544 else
1545 loop = loop->inner;
1546 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1547 }
1548 else if (def_type == vect_reduction_def
1549 || def_type == vect_double_reduction_def
1550 || def_type == vect_nested_cycle)
1551 {
1552 /* Else def types have to match. */
1553 stmt_vec_info other_info;
1554 bool all_same = true;
1555 FOR_EACH_VEC_ELT (stmts, i, other_info)
1556 {
1557 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1558 return NULL;
1559 if (other_info != stmt_info)
1560 all_same = false;
1561 }
1562 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563 /* Reduction initial values are not explicitly represented. */
1564 if (!nested_in_vect_loop_p (loop, stmt_info))
1565 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1566 /* Reduction chain backedge defs are filled manually.
1567 ??? Need a better way to identify a SLP reduction chain PHI.
1568 Or a better overall way to SLP match those. */
1569 if (all_same && def_type == vect_reduction_def)
1570 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1571 }
1572 else if (def_type != vect_internal_def)
1573 return NULL;
1574 }
1575
1576
1577 bool two_operators = false;
1578 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1579 tree vectype = NULL_TREE;
1580 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1581 &this_max_nunits, matches, &two_operators,
1582 &vectype))
1583 return NULL;
1584
1585 /* If the SLP node is a load, terminate the recursion unless masked. */
1586 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1587 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1588 {
1589 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1590 {
1591 /* Masked load. */
1592 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1593 nops = 1;
1594 }
1595 else
1596 {
1597 *max_nunits = this_max_nunits;
1598 (*tree_size)++;
1599 node = vect_create_new_slp_node (node, stmts, 0);
1600 SLP_TREE_VECTYPE (node) = vectype;
1601 /* And compute the load permutation. Whether it is actually
1602 a permutation depends on the unrolling factor which is
1603 decided later. */
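/* E.g. if the group is a[0], a[1], a[2], a[3] and the node's lanes load
a[2], a[0], a[3], a[1] the load permutation is { 2, 0, 3, 1 }. */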
1604 vec<unsigned> load_permutation;
1605 int j;
1606 stmt_vec_info load_info;
1607 load_permutation.create (group_size);
1608 stmt_vec_info first_stmt_info
1609 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1610 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1611 {
1612 int load_place = vect_get_place_in_interleaving_chain
1613 (load_info, first_stmt_info);
1614 gcc_assert (load_place != -1);
1615 load_permutation.safe_push (load_place);
1616 }
1617 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1618 return node;
1619 }
1620 }
1621 else if (gimple_assign_single_p (stmt_info->stmt)
1622 && !gimple_vuse (stmt_info->stmt)
1623 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1624 {
1625 /* vect_build_slp_tree_1 determined that all BIT_FIELD_REFs reference
1626 the same SSA name vector of a compatible type to vectype. */
1627 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1628 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1629 stmt_vec_info estmt_info;
1630 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1631 {
1632 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1633 tree bfref = gimple_assign_rhs1 (estmt);
1634 HOST_WIDE_INT lane;
1635 if (!known_eq (bit_field_size (bfref),
1636 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1637 || !constant_multiple_p (bit_field_offset (bfref),
1638 bit_field_size (bfref), &lane))
1639 {
1640 lperm.release ();
1641 return NULL;
1642 }
1643 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1644 }
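/* Each lane permutation entry is a (operand index, lane) pair; with a
single vector operand the operand index is always 0. */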
1645 slp_tree vnode = vect_create_new_slp_node (vNULL);
1646 /* ??? We record vectype here but hide any punning that may
1647 eventually be necessary, relying on code generation to
1648 materialize VIEW_CONVERT_EXPRs as needed. We should instead make
1649 this explicit somehow. */
1650 SLP_TREE_VECTYPE (vnode) = vectype;
1651 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1652 /* We are always building a permutation node even if it is an identity
1653 permute to shield the rest of the vectorizer from the odd node
1654 representing an actual vector without any scalar ops.
1655 ??? We could hide it completely by making the permute node
1656 external? */
1657 node = vect_create_new_slp_node (node, stmts, 1);
1658 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1659 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1660 SLP_TREE_VECTYPE (node) = vectype;
1661 SLP_TREE_CHILDREN (node).quick_push (vnode);
1662 return node;
1663 }
1664
1665 /* Get at the operands, verifying they are compatible. */
1666 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1667 slp_oprnd_info oprnd_info;
1668 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1669 {
1670 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
1671 stmts, i, &oprnds_info);
1672 if (res != 0)
1673 matches[(res == -1) ? 0 : i] = false;
1674 if (!matches[0])
1675 break;
1676 }
1677 for (i = 0; i < group_size; ++i)
1678 if (!matches[i])
1679 {
1680 vect_free_oprnd_info (oprnds_info);
1681 return NULL;
1682 }
1683 swap = NULL;
1684
1685 auto_vec<slp_tree, 4> children;
1686
1687 stmt_info = stmts[0];
1688
1689 /* Create SLP_TREE nodes for the definition node/s. */
1690 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1691 {
1692 slp_tree child;
1693 unsigned int j;
1694
1695 /* We're skipping certain operands from processing, for example
1696 outer loop reduction initial defs. */
1697 if (skip_args[i])
1698 {
1699 children.safe_push (NULL);
1700 continue;
1701 }
1702
1703 if (oprnd_info->first_dt == vect_uninitialized_def)
1704 {
1705 /* COND_EXPR has one operand slot too many here if the condition
1706 is an SSA name. */
1707 gcc_assert (i == 3 && nops == 4);
1708 continue;
1709 }
1710
1711 if (is_a <bb_vec_info> (vinfo)
1712 && oprnd_info->first_dt == vect_internal_def
1713 && !oprnd_info->any_pattern)
1714 {
1715 /* For BB vectorization, if all defs are the same do not
1716 bother to continue the build along the single-lane
1717 graph but use a splat of the scalar value. */
1718 stmt_vec_info first_def = oprnd_info->def_stmts[0];
1719 for (j = 1; j < group_size; ++j)
1720 if (oprnd_info->def_stmts[j] != first_def)
1721 break;
1722 if (j == group_size
1723 /* But avoid doing this for loads where we may be
1724 able to CSE things, unless the stmt is not
1725 vectorizable. */
1726 && (!STMT_VINFO_VECTORIZABLE (first_def)
1727 || !gimple_vuse (first_def->stmt)))
1728 {
1729 if (dump_enabled_p ())
1730 dump_printf_loc (MSG_NOTE, vect_location,
1731 "Using a splat of the uniform operand\n");
1732 oprnd_info->first_dt = vect_external_def;
1733 }
1734 }
1735
1736 if (oprnd_info->first_dt == vect_external_def
1737 || oprnd_info->first_dt == vect_constant_def)
1738 {
1739 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1740 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1741 oprnd_info->ops = vNULL;
1742 children.safe_push (invnode);
1743 continue;
1744 }
1745
1746 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1747 group_size, &this_max_nunits,
1748 matches, limit,
1749 &this_tree_size, bst_map)) != NULL)
1750 {
1751 oprnd_info->def_stmts = vNULL;
1752 children.safe_push (child);
1753 continue;
1754 }
1755
1756 /* If the SLP build for operand zero failed and operand zero
1757 and one can be commuted, try that for the scalar stmts
1758 that failed the match. */
1759 if (i == 0
1760 /* A first scalar stmt mismatch signals a fatal mismatch. */
1761 && matches[0]
1762 /* ??? For COND_EXPRs we can swap the comparison operands
1763 as well as the arms under some constraints. */
1764 && nops == 2
1765 && oprnds_info[1]->first_dt == vect_internal_def
1766 && is_gimple_assign (stmt_info->stmt)
1767 /* Swapping operands for reductions breaks assumptions later on. */
1768 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1769 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1770 {
1771 /* See whether we can swap the matching or the non-matching
1772 stmt operands. */
1773 bool swap_not_matching = true;
1774 do
1775 {
1776 for (j = 0; j < group_size; ++j)
1777 {
1778 if (matches[j] != !swap_not_matching)
1779 continue;
1780 stmt_vec_info stmt_info = stmts[j];
1781 /* Verify if we can swap operands of this stmt. */
1782 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1783 if (!stmt
1784 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1785 {
1786 if (!swap_not_matching)
1787 goto fail;
1788 swap_not_matching = false;
1789 break;
1790 }
1791 }
1792 }
1793 while (j != group_size);
1794
1795 /* Swap mismatched definition stmts. */
1796 if (dump_enabled_p ())
1797 dump_printf_loc (MSG_NOTE, vect_location,
1798 "Re-trying with swapped operands of stmts ");
1799 for (j = 0; j < group_size; ++j)
1800 if (matches[j] == !swap_not_matching)
1801 {
1802 std::swap (oprnds_info[0]->def_stmts[j],
1803 oprnds_info[1]->def_stmts[j]);
1804 std::swap (oprnds_info[0]->ops[j],
1805 oprnds_info[1]->ops[j]);
1806 if (dump_enabled_p ())
1807 dump_printf (MSG_NOTE, "%d ", j);
1808 }
1809 if (dump_enabled_p ())
1810 dump_printf (MSG_NOTE, "\n");
1811 /* And try again with scratch 'matches' ... */
1812 bool *tem = XALLOCAVEC (bool, group_size);
1813 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1814 group_size, &this_max_nunits,
1815 tem, limit,
1816 &this_tree_size, bst_map)) != NULL)
1817 {
1818 oprnd_info->def_stmts = vNULL;
1819 children.safe_push (child);
1820 continue;
1821 }
1822 }
1823 fail:
1824
1825 /* If the SLP build failed and we analyze a basic-block
1826 simply treat nodes we fail to build as externally defined
1827 (and thus build vectors from the scalar defs).
1828 The cost model will reject outright expensive cases.
1829 ??? This doesn't treat cases where permutation ultimately
1830 fails (or we don't try permutation below). Ideally we'd
1831 even compute a permutation that will end up with the maximum
1832 SLP tree size... */
1833 if (is_a <bb_vec_info> (vinfo)
1834 /* ??? Rejecting patterns this way doesn't work. We'd have to
1835 do extra work to cancel the pattern so the uses see the
1836 scalar version. */
1837 && !is_pattern_stmt_p (stmt_info)
1838 && !oprnd_info->any_pattern)
1839 {
1840 /* But if there's a leading vector sized set of matching stmts
1841 fail here so we can split the group. This matches the condition
1842 vect_analyze_slp_instance uses. */
1843 /* ??? We might want to split here and combine the results to support
1844 multiple vector sizes better. */
1845 for (j = 0; j < group_size; ++j)
1846 if (!matches[j])
1847 break;
1848 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
1849 {
1850 if (dump_enabled_p ())
1851 dump_printf_loc (MSG_NOTE, vect_location,
1852 "Building vector operands from scalars\n");
1853 this_tree_size++;
1854 child = vect_create_new_slp_node (oprnd_info->ops);
1855 children.safe_push (child);
1856 oprnd_info->ops = vNULL;
1857 continue;
1858 }
1859 }
1860
1861 gcc_assert (child == NULL);
1862 FOR_EACH_VEC_ELT (children, j, child)
1863 if (child)
1864 vect_free_slp_tree (child);
1865 vect_free_oprnd_info (oprnds_info);
1866 return NULL;
1867 }
1868
1869 vect_free_oprnd_info (oprnds_info);
1870
1871 /* If all children of this node are built up from uniform scalars,
1872 or if building it requires more than one possibly expensive vector
1873 construction, then just throw the node away so it is built up from
1874 scalars instead. The exception is the SLP node for the vector store. */
1875 if (is_a <bb_vec_info> (vinfo)
1876 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
1877 /* ??? Rejecting patterns this way doesn't work. We'd have to
1878 do extra work to cancel the pattern so the uses see the
1879 scalar version. */
1880 && !is_pattern_stmt_p (stmt_info))
1881 {
1882 slp_tree child;
1883 unsigned j;
1884 bool all_uniform_p = true;
1885 unsigned n_vector_builds = 0;
1886 FOR_EACH_VEC_ELT (children, j, child)
1887 {
1888 if (!child)
1889 ;
1890 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
1891 all_uniform_p = false;
1892 else if (!vect_slp_tree_uniform_p (child))
1893 {
1894 all_uniform_p = false;
1895 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
1896 n_vector_builds++;
1897 }
1898 }
1899 if (all_uniform_p || n_vector_builds > 1)
1900 {
1901 /* Roll back. */
1902 matches[0] = false;
1903 FOR_EACH_VEC_ELT (children, j, child)
1904 if (child)
1905 vect_free_slp_tree (child);
1906
1907 if (dump_enabled_p ())
1908 dump_printf_loc (MSG_NOTE, vect_location,
1909 "Building parent vector operands from "
1910 "scalars instead\n");
1911 return NULL;
1912 }
1913 }
1914
1915 *tree_size += this_tree_size + 1;
1916 *max_nunits = this_max_nunits;
1917
1918 if (two_operators)
1919 {
1920 /* ??? We'd likely want to either cache in bst_map something like
1921 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1922 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1923 explicit stmts to put in so the keying on 'stmts' doesn't
1924 work (but we have the same issue with nodes that use 'ops'). */
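/* For example, for the two-operator group { a+b, a-b, c+d, c-d } node ONE
   below computes all lanes with PLUS_EXPR, node TWO with MINUS_EXPR, and
   the lane permutation on the VEC_PERM parent selects
   { (0,0), (1,1), (0,2), (1,3) }. */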
1925 slp_tree one = new _slp_tree;
1926 slp_tree two = new _slp_tree;
1927 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
1928 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
1929 SLP_TREE_VECTYPE (one) = vectype;
1930 SLP_TREE_VECTYPE (two) = vectype;
1931 SLP_TREE_CHILDREN (one).safe_splice (children);
1932 SLP_TREE_CHILDREN (two).safe_splice (children);
1933 slp_tree child;
1934 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
1935 SLP_TREE_REF_COUNT (child)++;
1936
1937 /* Here we record the original defs since this
1938 node represents the final lane configuration. */
1939 node = vect_create_new_slp_node (node, stmts, 2);
1940 SLP_TREE_VECTYPE (node) = vectype;
1941 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1942 SLP_TREE_CHILDREN (node).quick_push (one);
1943 SLP_TREE_CHILDREN (node).quick_push (two);
1944 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
1945 enum tree_code code0 = gimple_assign_rhs_code (stmt);
1946 enum tree_code ocode = ERROR_MARK;
1947 stmt_vec_info ostmt_info;
1948 unsigned j = 0;
1949 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
1950 {
1951 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
1952 if (gimple_assign_rhs_code (ostmt) != code0)
1953 {
1954 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
1955 ocode = gimple_assign_rhs_code (ostmt);
1956 j = i;
1957 }
1958 else
1959 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
1960 }
1961 SLP_TREE_CODE (one) = code0;
1962 SLP_TREE_CODE (two) = ocode;
1963 SLP_TREE_LANES (one) = stmts.length ();
1964 SLP_TREE_LANES (two) = stmts.length ();
1965 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
1966 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
1967 return node;
1968 }
1969
1970 node = vect_create_new_slp_node (node, stmts, nops);
1971 SLP_TREE_VECTYPE (node) = vectype;
1972 SLP_TREE_CHILDREN (node).splice (children);
1973 return node;
1974 }
1975
1976 /* Dump a single SLP tree NODE. */
1977
1978 static void
1979 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1980 slp_tree node)
1981 {
1982 unsigned i, j;
1983 slp_tree child;
1984 stmt_vec_info stmt_info;
1985 tree op;
1986
1987 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1988 dump_user_location_t user_loc = loc.get_user_location ();
1989 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1990 SLP_TREE_DEF_TYPE (node) == vect_external_def
1991 ? " (external)"
1992 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1993 ? " (constant)"
1994 : ""), node,
1995 estimated_poly_value (node->max_nunits),
1996 SLP_TREE_REF_COUNT (node));
1997 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
1998 {
1999 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2000 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2001 else
2002 dump_printf_loc (metadata, user_loc, "op template: %G",
2003 SLP_TREE_REPRESENTATIVE (node)->stmt);
2004 }
2005 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2006 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2007 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2008 else
2009 {
2010 dump_printf_loc (metadata, user_loc, "\t{ ");
2011 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2012 dump_printf (metadata, "%T%s ", op,
2013 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2014 dump_printf (metadata, "}\n");
2015 }
2016 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2017 {
2018 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2019 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2020 dump_printf (dump_kind, " %u", j);
2021 dump_printf (dump_kind, " }\n");
2022 }
2023 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2024 {
2025 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2026 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2027 dump_printf (dump_kind, " %u[%u]",
2028 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2029 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2030 dump_printf (dump_kind, " }\n");
2031 }
2032 if (SLP_TREE_CHILDREN (node).is_empty ())
2033 return;
2034 dump_printf_loc (metadata, user_loc, "\tchildren");
2035 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2036 dump_printf (dump_kind, " %p", (void *)child);
2037 dump_printf (dump_kind, "\n");
2038 }
2039
2040 DEBUG_FUNCTION void
2041 debug (slp_tree node)
2042 {
2043 debug_dump_context ctx;
2044 vect_print_slp_tree (MSG_NOTE,
2045 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2046 node);
2047 }
2048
2049 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
2050
2051 static void
2052 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2053 slp_tree node, hash_set<slp_tree> &visited)
2054 {
2055 unsigned i;
2056 slp_tree child;
2057
2058 if (visited.add (node))
2059 return;
2060
2061 vect_print_slp_tree (dump_kind, loc, node);
2062
2063 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2064 if (child)
2065 vect_print_slp_graph (dump_kind, loc, child, visited);
2066 }
2067
2068 static void
2069 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2070 slp_tree entry)
2071 {
2072 hash_set<slp_tree> visited;
2073 vect_print_slp_graph (dump_kind, loc, entry, visited);
2074 }
2075
2076 /* Mark the tree rooted at NODE with PURE_SLP. */
2077
2078 static void
2079 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2080 {
2081 int i;
2082 stmt_vec_info stmt_info;
2083 slp_tree child;
2084
2085 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2086 return;
2087
2088 if (visited.add (node))
2089 return;
2090
2091 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2092 STMT_SLP_TYPE (stmt_info) = pure_slp;
2093
2094 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2095 if (child)
2096 vect_mark_slp_stmts (child, visited);
2097 }
2098
2099 static void
2100 vect_mark_slp_stmts (slp_tree node)
2101 {
2102 hash_set<slp_tree> visited;
2103 vect_mark_slp_stmts (node, visited);
2104 }
2105
2106 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2107
2108 static void
2109 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2110 {
2111 int i;
2112 stmt_vec_info stmt_info;
2113 slp_tree child;
2114
2115 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2116 return;
2117
2118 if (visited.add (node))
2119 return;
2120
2121 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2122 {
2123 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2124 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2125 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2126 }
2127
2128 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2129 if (child)
2130 vect_mark_slp_stmts_relevant (child, visited);
2131 }
2132
2133 static void
2134 vect_mark_slp_stmts_relevant (slp_tree node)
2135 {
2136 hash_set<slp_tree> visited;
2137 vect_mark_slp_stmts_relevant (node, visited);
2138 }
2139
2140
2141 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
2142
2143 static void
2144 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2145 hash_set<slp_tree> &visited)
2146 {
2147 if (!node || visited.add (node))
2148 return;
2149
2150 if (SLP_TREE_CHILDREN (node).length () == 0)
2151 {
2152 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2153 return;
2154 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2155 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2156 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2157 loads.safe_push (node);
2158 }
2159 else
2160 {
2161 unsigned i;
2162 slp_tree child;
2163 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2164 vect_gather_slp_loads (loads, child, visited);
2165 }
2166 }
2167
2168
2169 /* Find the last scalar stmt in NODE. */
2170
2171 stmt_vec_info
2172 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2173 {
2174 stmt_vec_info last = NULL;
2175 stmt_vec_info stmt_vinfo;
2176
2177 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2178 {
2179 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2180 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2181 }
2182
2183 return last;
2184 }
2185
2186 /* Find the first scalar stmt in NODE. */
2187
2188 stmt_vec_info
2189 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2190 {
2191 stmt_vec_info first = NULL;
2192 stmt_vec_info stmt_vinfo;
2193
2194 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2195 {
2196 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2197 if (!first
2198 || get_later_stmt (stmt_vinfo, first) == first)
2199 first = stmt_vinfo;
2200 }
2201
2202 return first;
2203 }
2204
2205 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2206 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2207 (also containing the first GROUP1_SIZE stmts, since stores are
2208 consecutive), the second containing the remainder.
2209 Return the first stmt in the second group. */
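/* For example, splitting an 8-element store group with GROUP1_SIZE == 4
   leaves stores 0-3 as the first group and makes store 4 the leader of a
   new group of size 4; each group's DR_GROUP_GAP is then adjusted so it
   skips over the other group's elements. */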
2210
2211 static stmt_vec_info
2212 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2213 {
2214 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2215 gcc_assert (group1_size > 0);
2216 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2217 gcc_assert (group2_size > 0);
2218 DR_GROUP_SIZE (first_vinfo) = group1_size;
2219
2220 stmt_vec_info stmt_info = first_vinfo;
2221 for (unsigned i = group1_size; i > 1; i--)
2222 {
2223 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2224 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2225 }
2226 /* STMT is now the last element of the first group. */
2227 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2228 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2229
2230 DR_GROUP_SIZE (group2) = group2_size;
2231 for (stmt_info = group2; stmt_info;
2232 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2233 {
2234 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2235 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2236 }
2237
2238 /* For the second group, the DR_GROUP_GAP is that before the original group,
2239 plus skipping over the first vector. */
2240 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2241
2242 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2243 DR_GROUP_GAP (first_vinfo) += group2_size;
2244
2245 if (dump_enabled_p ())
2246 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2247 group1_size, group2_size);
2248
2249 return group2;
2250 }
2251
2252 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2253 statements and a vector of NUNITS elements. */
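/* For example, NUNITS == 4 and GROUP_SIZE == 6 gives
   common_multiple (4, 6) / 6 == 2, i.e. an unrolling factor of 2. */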
2254
2255 static poly_uint64
2256 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2257 {
2258 return exact_div (common_multiple (nunits, group_size), group_size);
2259 }
2260
2261 /* Helper that checks to see if a node is a load node. */
2262
2263 static inline bool
2264 vect_is_slp_load_node (slp_tree root)
2265 {
2266 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2267 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2268 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2269 }
2270
2271
2272 /* Helper function of optimize_load_redistribution that performs the operation
2273 recursively. */
2274
2275 static slp_tree
2276 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2277 vec_info *vinfo, unsigned int group_size,
2278 hash_map<slp_tree, slp_tree> *load_map,
2279 slp_tree root)
2280 {
2281 if (slp_tree *leader = load_map->get (root))
2282 return *leader;
2283
2284 load_map->put (root, NULL);
2285
2286 slp_tree node;
2287 unsigned i;
2288
2289 /* For now, we don't know anything about externals so do not do anything. */
2290 if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2291 return NULL;
2292 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2293 {
2294 /* First convert this node into a load node and add it to the leaves
2295 list, flattening the lane permutation into a load permutation. If it's
2296 unneeded it will be elided later. */
2297 vec<stmt_vec_info> stmts;
2298 stmts.create (SLP_TREE_LANES (root));
2299 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2300 for (unsigned j = 0; j < lane_perm.length (); j++)
2301 {
2302 std::pair<unsigned, unsigned> perm = lane_perm[j];
2303 node = SLP_TREE_CHILDREN (root)[perm.first];
2304
2305 if (!vect_is_slp_load_node (node)
2306 || SLP_TREE_CHILDREN (node).exists ())
2307 {
2308 stmts.release ();
2309 goto next;
2310 }
2311
2312 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2313 }
2314
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_NOTE, vect_location,
2317 "converting stmts on permute node %p\n", root);
2318
2319 bool *matches = XALLOCAVEC (bool, group_size);
2320 poly_uint64 max_nunits = 1;
2321 unsigned tree_size = 0, limit = 1;
2322 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2323 matches, &limit, &tree_size, bst_map);
2324 if (!node)
2325 stmts.release ();
2326
2327 load_map->put (root, node);
2328 return node;
2329 }
2330
2331 next:
2332 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2333 {
2334 slp_tree value
2335 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2336 node);
2337 if (value)
2338 {
2339 SLP_TREE_REF_COUNT (value)++;
2340 SLP_TREE_CHILDREN (root)[i] = value;
2341 vect_free_slp_tree (node);
2342 }
2343 }
2344
2345 return NULL;
2346 }
2347
2348 /* Temporary workaround for loads not being CSEd during SLP build. This
2349 function will traverse the SLP tree rooted in ROOT and find
2350 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2351 same DR such that the final operation is equal to a permuted load. Such
2352 NODES are then directly converted into LOADS themselves. The nodes are
2353 CSEd using BST_MAP. */
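/* For instance, a VEC_PERM_EXPR node whose lane permutation only picks
   lanes out of leaf load nodes reading from the same interleaving group
   can be rebuilt as a single load node with a load permutation. */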
2354
2355 static void
2356 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2357 vec_info *vinfo, unsigned int group_size,
2358 hash_map<slp_tree, slp_tree> *load_map,
2359 slp_tree root)
2360 {
2361 slp_tree node;
2362 unsigned i;
2363
2364 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2365 {
2366 slp_tree value
2367 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2368 node);
2369 if (value)
2370 {
2371 SLP_TREE_REF_COUNT (value)++;
2372 SLP_TREE_CHILDREN (root)[i] = value;
2373 vect_free_slp_tree (node);
2374 }
2375 }
2376 }
2377
2378 /* Helper function of vect_match_slp_patterns.
2379
2380 Attempts to match patterns against the slp tree rooted in REF_NODE using
2381 VINFO. Patterns are matched in post-order traversal.
2382
2383 If matching is successful the value in REF_NODE is updated and true is
2384 returned, otherwise false is returned and REF_NODE is left unchanged. */
2385
2386 static bool
2387 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2388 slp_tree_to_load_perm_map_t *perm_cache,
2389 hash_set<slp_tree> *visited)
2390 {
2391 unsigned i;
2392 slp_tree node = *ref_node;
2393 bool found_p = false;
2394 if (!node || visited->add (node))
2395 return false;
2396
2397 slp_tree child;
2398 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2399 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2400 vinfo, perm_cache, visited);
2401
2402 for (unsigned x = 0; x < num__slp_patterns; x++)
2403 {
2404 vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
2405 if (pattern)
2406 {
2407 pattern->build (vinfo);
2408 delete pattern;
2409 found_p = true;
2410 }
2411 }
2412
2413 return found_p;
2414 }
2415
2416 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2417 vec_info VINFO.
2418
2419 Returns true if any pattern matched and modifies the tree in place.
2420 Patterns are tried in order and multiple patterns may match. */
2421
2422 static bool
2423 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2424 hash_set<slp_tree> *visited,
2425 slp_tree_to_load_perm_map_t *perm_cache)
2426 {
2427 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2428 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2429
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_NOTE, vect_location,
2432 "Analyzing SLP tree %p for patterns\n",
2433 SLP_INSTANCE_TREE (instance));
2434
2435 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
2436 }
2437
2438 /* Analyze an SLP instance starting from a group of grouped stores. Call
2439 vect_build_slp_tree to build a tree of packed stmts if possible.
2440 Return FALSE if it's impossible to SLP any stmt in the loop. */
2441
2442 static bool
2443 vect_analyze_slp_instance (vec_info *vinfo,
2444 scalar_stmts_to_slp_tree_map_t *bst_map,
2445 stmt_vec_info stmt_info, slp_instance_kind kind,
2446 unsigned max_tree_size, unsigned *limit);
2447
2448 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2449 of KIND. Return true if successful. */
2450
2451 static bool
2452 vect_build_slp_instance (vec_info *vinfo,
2453 slp_instance_kind kind,
2454 vec<stmt_vec_info> &scalar_stmts,
2455 stmt_vec_info root_stmt_info,
2456 unsigned max_tree_size, unsigned *limit,
2457 scalar_stmts_to_slp_tree_map_t *bst_map,
2458 /* ??? We need stmt_info for group splitting. */
2459 stmt_vec_info stmt_info_)
2460 {
2461 if (dump_enabled_p ())
2462 {
2463 dump_printf_loc (MSG_NOTE, vect_location,
2464 "Starting SLP discovery for\n");
2465 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
2466 dump_printf_loc (MSG_NOTE, vect_location,
2467 " %G", scalar_stmts[i]->stmt);
2468 }
2469
2470 /* Build the tree for the SLP instance. */
2471 unsigned int group_size = scalar_stmts.length ();
2472 bool *matches = XALLOCAVEC (bool, group_size);
2473 poly_uint64 max_nunits = 1;
2474 unsigned tree_size = 0;
2475 unsigned i;
2476 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2477 &max_nunits, matches, limit,
2478 &tree_size, bst_map);
2479 if (node != NULL)
2480 {
2481 /* Calculate the unrolling factor based on the smallest type. */
2482 poly_uint64 unrolling_factor
2483 = calculate_unrolling_factor (max_nunits, group_size);
2484
2485 if (maybe_ne (unrolling_factor, 1U)
2486 && is_a <bb_vec_info> (vinfo))
2487 {
2488 unsigned HOST_WIDE_INT const_max_nunits;
2489 if (!max_nunits.is_constant (&const_max_nunits)
2490 || const_max_nunits > group_size)
2491 {
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494 "Build SLP failed: store group "
2495 "size not a multiple of the vector size "
2496 "in basic block SLP\n");
2497 vect_free_slp_tree (node);
2498 return false;
2499 }
2500 /* Fatal mismatch. */
2501 if (dump_enabled_p ())
2502 dump_printf_loc (MSG_NOTE, vect_location,
2503 "SLP discovery succeeded but node needs "
2504 "splitting\n");
2505 memset (matches, true, group_size);
2506 matches[group_size / const_max_nunits * const_max_nunits] = false;
2507 vect_free_slp_tree (node);
2508 }
2509 else
2510 {
2511 /* Create a new SLP instance. */
2512 slp_instance new_instance = XNEW (class _slp_instance);
2513 SLP_INSTANCE_TREE (new_instance) = node;
2514 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2515 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2516 SLP_INSTANCE_ROOT_STMT (new_instance) = root_stmt_info;
2517 SLP_INSTANCE_KIND (new_instance) = kind;
2518 new_instance->reduc_phis = NULL;
2519 new_instance->cost_vec = vNULL;
2520 new_instance->subgraph_entries = vNULL;
2521
2522 if (dump_enabled_p ())
2523 dump_printf_loc (MSG_NOTE, vect_location,
2524 "SLP size %u vs. limit %u.\n",
2525 tree_size, max_tree_size);
2526
2527 /* Fixup SLP reduction chains. */
2528 if (kind == slp_inst_kind_reduc_chain)
2529 {
2530 /* If this is a reduction chain with a conversion in front
2531 amend the SLP tree with a node for that. */
2532 gimple *scalar_def
2533 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2534 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
2535 {
2536 /* Get at the conversion stmt - we know it's the single use
2537 of the last stmt of the reduction chain. */
2538 use_operand_p use_p;
2539 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
2540 &use_p, &scalar_def);
2541 gcc_assert (r);
2542 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
2543 next_info = vect_stmt_to_vectorize (next_info);
2544 scalar_stmts = vNULL;
2545 scalar_stmts.create (group_size);
2546 for (unsigned i = 0; i < group_size; ++i)
2547 scalar_stmts.quick_push (next_info);
2548 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2549 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2550 SLP_TREE_CHILDREN (conv).quick_push (node);
2551 SLP_INSTANCE_TREE (new_instance) = conv;
2552 /* We also have to fake this conversion stmt as SLP reduction
2553 group so we don't have to mess with too much code
2554 elsewhere. */
2555 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2556 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2557 }
2558 /* Fill the backedge child of the PHI SLP node. The
2559 general matching code cannot find it because the
2560 scalar code does not reflect how we vectorize the
2561 reduction. */
2562 use_operand_p use_p;
2563 imm_use_iterator imm_iter;
2564 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
2565 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
2566 gimple_get_lhs (scalar_def))
2567 /* There are exactly two non-debug uses, the reduction
2568 PHI and the loop-closed PHI node. */
2569 if (!is_gimple_debug (USE_STMT (use_p))
2570 && gimple_bb (USE_STMT (use_p)) == loop->header)
2571 {
2572 auto_vec<stmt_vec_info, 64> phis (group_size);
2573 stmt_vec_info phi_info
2574 = vinfo->lookup_stmt (USE_STMT (use_p));
2575 for (unsigned i = 0; i < group_size; ++i)
2576 phis.quick_push (phi_info);
2577 slp_tree *phi_node = bst_map->get (phis);
2578 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
2579 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
2580 = SLP_INSTANCE_TREE (new_instance);
2581 SLP_INSTANCE_TREE (new_instance)->refcnt++;
2582 }
2583 }
2584
2585 vinfo->slp_instances.safe_push (new_instance);
2586
2587 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2588 the number of scalar stmts in the root in a few places.
2589 Verify that assumption holds. */
2590 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2591 .length () == group_size);
2592
2593 if (dump_enabled_p ())
2594 {
2595 dump_printf_loc (MSG_NOTE, vect_location,
2596 "Final SLP tree for instance %p:\n", new_instance);
2597 vect_print_slp_graph (MSG_NOTE, vect_location,
2598 SLP_INSTANCE_TREE (new_instance));
2599 }
2600
2601 return true;
2602 }
2603 }
2604 else
2605 {
2606 /* Failed to SLP. */
2607 /* Free the allocated memory. */
2608 scalar_stmts.release ();
2609 }
2610
2611 stmt_vec_info stmt_info = stmt_info_;
2612 /* Try to break the group up into pieces. */
2613 if (kind == slp_inst_kind_store)
2614 {
2615 /* ??? We could delay all the actual splitting of store-groups
2616 until after SLP discovery of the original group completed.
2617 Then we can recurse to vect_build_slp_instance directly. */
2618 for (i = 0; i < group_size; i++)
2619 if (!matches[i])
2620 break;
2621
2622 /* For basic block SLP, try to break the group up into multiples of
2623 a vector size. */
2624 if (is_a <bb_vec_info> (vinfo)
2625 && (i > 1 && i < group_size))
2626 {
2627 tree scalar_type
2628 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2629 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2630 1 << floor_log2 (i));
2631 unsigned HOST_WIDE_INT const_nunits;
2632 if (vectype
2633 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
2634 {
2635 /* Split into two groups at the first vector boundary. */
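/* E.g. with const_nunits == 4 and the first mismatch at stmt 6 the
   first group gets the leading 4 stmts. */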
2636 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2637 unsigned group1_size = i & ~(const_nunits - 1);
2638
2639 if (dump_enabled_p ())
2640 dump_printf_loc (MSG_NOTE, vect_location,
2641 "Splitting SLP group at stmt %u\n", i);
2642 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2643 group1_size);
2644 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2645 kind, max_tree_size,
2646 limit);
2647 /* Split the rest at the failure point and possibly
2648 re-analyze the remaining matching part if it has
2649 at least two lanes. */
2650 if (group1_size < i
2651 && (i + 1 < group_size
2652 || i - group1_size > 1))
2653 {
2654 stmt_vec_info rest2 = rest;
2655 rest = vect_split_slp_store_group (rest, i - group1_size);
2656 if (i - group1_size > 1)
2657 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
2658 kind, max_tree_size,
2659 limit);
2660 }
2661 /* Re-analyze the non-matching tail if it has at least
2662 two lanes. */
2663 if (i + 1 < group_size)
2664 res |= vect_analyze_slp_instance (vinfo, bst_map,
2665 rest, kind, max_tree_size,
2666 limit);
2667 return res;
2668 }
2669 }
2670
2671 /* For loop vectorization split into arbitrary pieces of size > 1. */
2672 if (is_a <loop_vec_info> (vinfo)
2673 && (i > 1 && i < group_size))
2674 {
2675 unsigned group1_size = i;
2676
2677 if (dump_enabled_p ())
2678 dump_printf_loc (MSG_NOTE, vect_location,
2679 "Splitting SLP group at stmt %u\n", i);
2680
2681 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2682 group1_size);
2683 /* Loop vectorization cannot handle gaps in stores, make sure
2684 the split group appears as strided. */
2685 STMT_VINFO_STRIDED_P (rest) = 1;
2686 DR_GROUP_GAP (rest) = 0;
2687 STMT_VINFO_STRIDED_P (stmt_info) = 1;
2688 DR_GROUP_GAP (stmt_info) = 0;
2689
2690 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2691 kind, max_tree_size, limit);
2692 if (i + 1 < group_size)
2693 res |= vect_analyze_slp_instance (vinfo, bst_map,
2694 rest, kind, max_tree_size, limit);
2695
2696 return res;
2697 }
2698
2699 /* Even though the first vector did not all match, we might be able to SLP
2700 (some) of the remainder. FORNOW ignore this possibility. */
2701 }
2702
2703 /* Failed to SLP. */
2704 if (dump_enabled_p ())
2705 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
2706 return false;
2707 }
2708
2709
2710 /* Analyze an SLP instance starting from a group of grouped stores. Call
2711 vect_build_slp_tree to build a tree of packed stmts if possible.
2712 Return FALSE if it's impossible to SLP any stmt in the loop. */
2713
2714 static bool
2715 vect_analyze_slp_instance (vec_info *vinfo,
2716 scalar_stmts_to_slp_tree_map_t *bst_map,
2717 stmt_vec_info stmt_info,
2718 slp_instance_kind kind,
2719 unsigned max_tree_size, unsigned *limit)
2720 {
2721 unsigned int i;
2722 vec<stmt_vec_info> scalar_stmts;
2723
2724 if (is_a <bb_vec_info> (vinfo))
2725 vect_location = stmt_info->stmt;
2726
2727 stmt_vec_info next_info = stmt_info;
2728 if (kind == slp_inst_kind_store)
2729 {
2730 /* Collect the stores and store them in scalar_stmts. */
2731 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
2732 while (next_info)
2733 {
2734 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2735 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2736 }
2737 }
2738 else if (kind == slp_inst_kind_reduc_chain)
2739 {
2740 /* Collect the reduction stmts and store them in scalar_stmts. */
2741 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
2742 while (next_info)
2743 {
2744 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
2745 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2746 }
2747 /* Mark the first element of the reduction chain as reduction to properly
2748 transform the node. In the reduction analysis phase only the last
2749 element of the chain is marked as reduction. */
2750 STMT_VINFO_DEF_TYPE (stmt_info)
2751 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2752 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2753 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2754 }
2755 else if (kind == slp_inst_kind_ctor)
2756 {
2757 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2758 tree val;
2759 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
2760 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2761 {
2762 stmt_vec_info def_info = vinfo->lookup_def (val);
2763 def_info = vect_stmt_to_vectorize (def_info);
2764 scalar_stmts.quick_push (def_info);
2765 }
2766 if (dump_enabled_p ())
2767 dump_printf_loc (MSG_NOTE, vect_location,
2768 "Analyzing vectorizable constructor: %G\n",
2769 stmt_info->stmt);
2770 }
2771 else if (kind == slp_inst_kind_reduc_group)
2772 {
2773 /* Collect reduction statements. */
2774 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2775 scalar_stmts.create (reductions.length ());
2776 for (i = 0; reductions.iterate (i, &next_info); i++)
2777 if (STMT_VINFO_RELEVANT_P (next_info)
2778 || STMT_VINFO_LIVE_P (next_info))
2779 scalar_stmts.quick_push (next_info);
2780 /* If less than two were relevant/live there's nothing to SLP. */
2781 if (scalar_stmts.length () < 2)
2782 return false;
2783 }
2784 else
2785 gcc_unreachable ();
2786
2787 /* Build the tree for the SLP instance. */
2788 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
2789 kind == slp_inst_kind_ctor
2790 ? stmt_info : NULL,
2791 max_tree_size, limit, bst_map,
2792 kind == slp_inst_kind_store
2793 ? stmt_info : NULL);
2794
2795 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2796 where we should do store group splitting. */
2797
2798 return res;
2799 }
2800
2801 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2802 trees of packed scalar stmts if SLP is possible. */
2803
2804 opt_result
2805 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2806 {
2807 unsigned int i;
2808 stmt_vec_info first_element;
2809 slp_instance instance;
2810
2811 DUMP_VECT_SCOPE ("vect_analyze_slp");
2812
2813 unsigned limit = max_tree_size;
2814
2815 scalar_stmts_to_slp_tree_map_t *bst_map
2816 = new scalar_stmts_to_slp_tree_map_t ();
2817
2818 /* Find SLP sequences starting from groups of grouped stores. */
2819 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2820 vect_analyze_slp_instance (vinfo, bst_map, first_element,
2821 STMT_VINFO_GROUPED_ACCESS (first_element)
2822 ? slp_inst_kind_store : slp_inst_kind_ctor,
2823 max_tree_size, &limit);
2824
2825 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
2826 {
2827 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
2828 {
2829 vect_location = bb_vinfo->roots[i].root->stmt;
2830 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
2831 bb_vinfo->roots[i].stmts,
2832 bb_vinfo->roots[i].root,
2833 max_tree_size, &limit, bst_map, NULL))
2834 bb_vinfo->roots[i].stmts = vNULL;
2835 }
2836 }
2837
2838 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2839 {
2840 /* Find SLP sequences starting from reduction chains. */
2841 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2842 if (! STMT_VINFO_RELEVANT_P (first_element)
2843 && ! STMT_VINFO_LIVE_P (first_element))
2844 ;
2845 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2846 slp_inst_kind_reduc_chain,
2847 max_tree_size, &limit))
2848 {
2849 /* Dissolve reduction chain group. */
2850 stmt_vec_info vinfo = first_element;
2851 stmt_vec_info last = NULL;
2852 while (vinfo)
2853 {
2854 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2855 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2856 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2857 last = vinfo;
2858 vinfo = next;
2859 }
2860 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2861 /* It can still be vectorized as part of an SLP reduction. */
2862 loop_vinfo->reductions.safe_push (last);
2863 }
2864
2865 /* Find SLP sequences starting from groups of reductions. */
2866 if (loop_vinfo->reductions.length () > 1)
2867 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2868 slp_inst_kind_reduc_group, max_tree_size,
2869 &limit);
2870 }
2871
2872 hash_set<slp_tree> visited_patterns;
2873 slp_tree_to_load_perm_map_t perm_cache;
2874 hash_map<slp_tree, slp_tree> load_map;
2875
2876 /* See if any patterns can be found in the SLP tree. */
2877 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
2878 if (vect_match_slp_patterns (instance, vinfo, &visited_patterns,
2879 &perm_cache))
2880 {
2881 slp_tree root = SLP_INSTANCE_TREE (instance);
2882 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
2883 &load_map, root);
2884 if (dump_enabled_p ())
2885 {
2886 dump_printf_loc (MSG_NOTE, vect_location,
2887 "Pattern matched SLP tree\n");
2888 vect_print_slp_graph (MSG_NOTE, vect_location, root);
2889 }
2890 }
2891
2892
2893
2894 /* The map keeps a reference on SLP nodes built, release that. */
2895 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2896 it != bst_map->end (); ++it)
2897 if ((*it).second)
2898 vect_free_slp_tree ((*it).second);
2899 delete bst_map;
2900
2901 return opt_result::success ();
2902 }
2903
2904 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2905
2906 static void
2907 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
2908 vec<slp_tree> &vertices, vec<int> &leafs)
2909 {
2910 unsigned i;
2911 slp_tree child;
2912
2913 if (visited.add (node))
2914 return;
2915
2916 node->vertex = vertices.length ();
2917 vertices.safe_push (node);
2918
2919 bool leaf = true;
2920 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2921 if (child)
2922 {
2923 leaf = false;
2924 vect_slp_build_vertices (visited, child, vertices, leafs);
2925 }
2926 if (leaf)
2927 leafs.safe_push (node->vertex);
2928 }
2929
2930 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2931
2932 static void
2933 vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices,
2934 vec<int> &leafs)
2935 {
2936 hash_set<slp_tree> visited;
2937 unsigned i;
2938 slp_instance instance;
2939 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
2940 {
2941 unsigned n_v = vertices.length ();
2942 unsigned n_l = leafs.length ();
2943 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
2944 leafs);
2945 /* If we added vertices but no entries to the reverse graph we've
2946 added a cycle that is not backwards-reachable. Push the entry
2947 to mimic a leaf then. */
2948 if (vertices.length () > n_v
2949 && leafs.length () == n_l)
2950 leafs.safe_push (SLP_INSTANCE_TREE (instance)->vertex);
2951 }
2952 }
2953
2954 /* Apply (reverse) bijective PERM to VEC. */
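/* When REVERSE is true element I of VEC ends up at position PERM[I],
   otherwise element PERM[I] ends up at position I. */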
2955
2956 template <class T>
2957 static void
2958 vect_slp_permute (vec<unsigned> perm,
2959 vec<T> &vec, bool reverse)
2960 {
2961 auto_vec<T, 64> saved;
2962 saved.create (vec.length ());
2963 for (unsigned i = 0; i < vec.length (); ++i)
2964 saved.quick_push (vec[i]);
2965
2966 if (reverse)
2967 {
2968 for (unsigned i = 0; i < vec.length (); ++i)
2969 vec[perm[i]] = saved[i];
2970 for (unsigned i = 0; i < vec.length (); ++i)
2971 gcc_assert (vec[perm[i]] == saved[i]);
2972 }
2973 else
2974 {
2975 for (unsigned i = 0; i < vec.length (); ++i)
2976 vec[i] = saved[perm[i]];
2977 for (unsigned i = 0; i < vec.length (); ++i)
2978 gcc_assert (vec[i] == saved[perm[i]]);
2979 }
2980 }
2981
2982 /* Return whether permutations PERM_A and PERM_B as recorded in the
2983 PERMS vector are equal. */
2984
2985 static bool
2986 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
2987 int perm_a, int perm_b)
2988 {
2989 return (perm_a == perm_b
2990 || (perms[perm_a].length () == perms[perm_b].length ()
2991 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
2992 sizeof (unsigned) * perms[perm_a].length ()) == 0));
2993 }
2994
2995 /* Optimize the SLP graph of VINFO. */
2996
2997 void
2998 vect_optimize_slp (vec_info *vinfo)
2999 {
3000 if (vinfo->slp_instances.is_empty ())
3001 return;
3002
3003 slp_tree node;
3004 unsigned i;
3005 auto_vec<slp_tree> vertices;
3006 auto_vec<int> leafs;
3007 vect_slp_build_vertices (vinfo, vertices, leafs);
3008
3009 struct graph *slpg = new_graph (vertices.length ());
3010 FOR_EACH_VEC_ELT (vertices, i, node)
3011 {
3012 unsigned j;
3013 slp_tree child;
3014 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3015 if (child)
3016 add_edge (slpg, i, child->vertex);
3017 }
3018
3019 /* Compute (reverse) postorder on the inverted graph. */
3020 auto_vec<int> ipo;
3021 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3022
3023 auto_sbitmap n_visited (vertices.length ());
3024 auto_sbitmap n_materialize (vertices.length ());
3025 auto_vec<int> n_perm (vertices.length ());
3026 auto_vec<vec<unsigned> > perms;
3027
3028 bitmap_clear (n_visited);
3029 bitmap_clear (n_materialize);
3030 n_perm.quick_grow_cleared (vertices.length ());
3031 perms.safe_push (vNULL); /* zero is no permute */
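/* n_perm[I] is zero (the cleared default) when node I has no permute
   recorded; positive values index the PERMS vector. */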
3032
3033 /* Produce initial permutations. */
3034 for (i = 0; i < leafs.length (); ++i)
3035 {
3036 int idx = leafs[i];
3037 slp_tree node = vertices[idx];
3038
3039 /* Handle externals and constants optimistically throughout the
3040 iteration. */
3041 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3042 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3043 continue;
3044
3045 /* Leafs do not change across iterations. Note leafs also double
3046 as entries to the reverse graph. */
3047 if (!slpg->vertices[idx].succ)
3048 bitmap_set_bit (n_visited, idx);
3049 /* Loads are the only thing generating permutes. */
3050 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3051 continue;
3052
3053 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3054 node unpermuted, record this permute. */
3055 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3056 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3057 continue;
3058 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3059 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3060 bool any_permute = false;
3061 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3062 {
3063 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3064 imin = MIN (imin, idx);
3065 imax = MAX (imax, idx);
3066 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3067 any_permute = true;
3068 }
3069 /* If there's no permute no need to split one out. */
3070 if (!any_permute)
3071 continue;
3072 /* If the span doesn't match we'd disrupt VF computation, avoid
3073 that for now. */
3074 if (imax - imin + 1 != SLP_TREE_LANES (node))
3075 continue;
3076
3077 /* For now only handle true permutes, like
3078 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3079 when permuting constants and invariants, keeping the permute
3080 bijective. */
3081 auto_sbitmap load_index (SLP_TREE_LANES (node));
3082 bitmap_clear (load_index);
3083 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3084 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3085 unsigned j;
3086 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3087 if (!bitmap_bit_p (load_index, j))
3088 break;
3089 if (j != SLP_TREE_LANES (node))
3090 continue;
3091
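/* Record the permute relative to IMIN; e.g. a load permutation
   { 5, 4, 7, 6 } with IMIN == 4 yields the bijective permute
   { 1, 0, 3, 2 }. */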
3092 vec<unsigned> perm = vNULL;
3093 perm.safe_grow (SLP_TREE_LANES (node), true);
3094 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3095 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3096 perms.safe_push (perm);
3097 n_perm[idx] = perms.length () - 1;
3098 }
3099
3100 /* Propagate permutes along the graph and compute materialization points. */
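/* Each node's permute is computed as the meet over its children (the
   graph successors): a common permute propagates up, disagreement or an
   unpermuted child collapses to no permute, and nodes without visited
   children keep their pre-computed value. */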
3101 bool changed;
3102 unsigned iteration = 0;
3103 do
3104 {
3105 changed = false;
3106 ++iteration;
3107
3108 for (i = vertices.length (); i > 0 ; --i)
3109 {
3110 int idx = ipo[i-1];
3111 slp_tree node = vertices[idx];
3112 /* For leafs there's nothing to do - we've seeded permutes
3113 on those above. */
3114 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3115 continue;
3116
3117 bitmap_set_bit (n_visited, idx);
3118
3119 /* We cannot move a permute across a store. */
3120 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))
3121 && DR_IS_WRITE
3122 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
3123 continue;
3124
3125 int perm = -1;
3126 for (graph_edge *succ = slpg->vertices[idx].succ;
3127 succ; succ = succ->succ_next)
3128 {
3129 int succ_idx = succ->dest;
3130 /* Handle unvisited nodes optimistically. */
3131 /* ??? But for constants once we want to handle non-bijective
3132 permutes we have to verify the permute, when unifying lanes,
3133 will not unify different constants. For example see
3134 gcc.dg/vect/bb-slp-14.c for a case that would break. */
3135 if (!bitmap_bit_p (n_visited, succ_idx))
3136 continue;
3137 int succ_perm = n_perm[succ_idx];
3138 /* Once we materialize a successor's permutation its output lanes
3139 appear unpermuted to us. */
3140 if (bitmap_bit_p (n_materialize, succ_idx))
3141 succ_perm = 0;
3142 if (perm == -1)
3143 perm = succ_perm;
3144 else if (succ_perm == 0)
3145 {
3146 perm = 0;
3147 break;
3148 }
3149 else if (!vect_slp_perms_eq (perms, perm, succ_perm))
3150 {
3151 perm = 0;
3152 break;
3153 }
3154 }
3155
3156 if (perm == -1)
3157 /* Pick up pre-computed leaf values. */
3158 perm = n_perm[idx];
3159 else if (!vect_slp_perms_eq (perms, perm, n_perm[idx]))
3160 {
3161 if (iteration > 1)
3162 /* Make sure we eventually converge. */
3163 gcc_checking_assert (perm == 0);
3164 n_perm[idx] = perm;
3165 if (perm == 0)
3166 bitmap_clear_bit (n_materialize, idx);
3167 changed = true;
3168 }
3169
3170 if (perm == 0)
3171 continue;
3172
3173 /* Elide pruning at materialization points in the first
3174 iteration so every node was visited once at least. */
3175 if (iteration == 1)
3176 continue;
3177
3178 /* Decide on permute materialization. Look whether there's
3179 a use (pred) edge that is permuted differently than us.
3180 In that case mark ourselves so the permutation is applied.
3181 For VEC_PERM_EXPRs the permutation doesn't carry along
3182 from children to parents so force materialization at the
3183 point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
3184 are a source of an arbitrary permutation again, similar
3185 to constants/externals - that's something we do not yet
3186 optimally handle. */
3187 bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
3188 && slpg->vertices[idx].pred != NULL);
3189 if (all_preds_permuted)
3190 for (graph_edge *pred = slpg->vertices[idx].pred;
3191 pred; pred = pred->pred_next)
3192 {
3193 gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
3194 int pred_perm = n_perm[pred->src];
3195 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3196 {
3197 all_preds_permuted = false;
3198 break;
3199 }
3200 }
3201 if (!all_preds_permuted)
3202 {
3203 if (!bitmap_bit_p (n_materialize, idx))
3204 changed = true;
3205 bitmap_set_bit (n_materialize, idx);
3206 }
3207 }
3208 }
3209 while (changed || iteration == 1);
3210
3211 /* Materialize. */
3212 for (i = 0; i < vertices.length (); ++i)
3213 {
3214 int perm = n_perm[i];
3215 if (perm <= 0)
3216 continue;
3217
3218 slp_tree node = vertices[i];
3219
3220 /* First permute invariant/external original successors. */
3221 unsigned j;
3222 slp_tree child;
3223 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3224 {
3225 if (!child || SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3226 continue;
3227
3228 /* If the vector is uniform there's nothing to do. */
3229 if (vect_slp_tree_uniform_p (child))
3230 continue;
3231
3232 /* We can end up sharing some externals via two_operator
3233 handling. Be prepared to unshare those. */
3234 if (child->refcnt != 1)
3235 {
3236 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3237 SLP_TREE_CHILDREN (node)[j] = child
3238 = vect_create_new_slp_node
3239 (SLP_TREE_SCALAR_OPS (child).copy ());
3240 }
3241 vect_slp_permute (perms[perm],
3242 SLP_TREE_SCALAR_OPS (child), true);
3243 }
3244
3245 if (bitmap_bit_p (n_materialize, i))
3246 {
3247 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3248 /* For loads simply drop the permutation, the load permutation
3249 already performs the desired permutation. */
3250 ;
3251 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3252 {
3253 /* If the node is already a permute node we can apply
3254 the permutation to the lane selection, effectively
3255 materializing it on the incoming vectors. */
3256 if (dump_enabled_p ())
3257 dump_printf_loc (MSG_NOTE, vect_location,
3258 "simplifying permute node %p\n",
3259 node);
3260
3261 for (unsigned k = 0;
3262 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3263 SLP_TREE_LANE_PERMUTATION (node)[k].second
3264 = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3265 }
3266 else
3267 {
3268 if (dump_enabled_p ())
3269 dump_printf_loc (MSG_NOTE, vect_location,
3270 "inserting permute node in place of %p\n",
3271 node);
3272
3273 /* Make a copy of NODE and in-place change it to a
3274 VEC_PERM node to permute the lanes of the copy. */
3275 slp_tree copy = new _slp_tree;
3276 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3277 SLP_TREE_CHILDREN (node) = vNULL;
3278 SLP_TREE_SCALAR_STMTS (copy)
3279 = SLP_TREE_SCALAR_STMTS (node).copy ();
3280 vect_slp_permute (perms[perm],
3281 SLP_TREE_SCALAR_STMTS (copy), true);
3282 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
3283 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
3284 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
3285 SLP_TREE_LANE_PERMUTATION (copy)
3286 = SLP_TREE_LANE_PERMUTATION (node);
3287 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
3288 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
3289 copy->refcnt = 1;
3290 copy->max_nunits = node->max_nunits;
3291 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
3292 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
3293 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
3294
3295 /* Now turn NODE into a VEC_PERM. */
3296 SLP_TREE_CHILDREN (node).safe_push (copy);
3297 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
3298 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3299 SLP_TREE_LANE_PERMUTATION (node)
3300 .quick_push (std::make_pair (0, perms[perm][j]));
3301 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3302 }
3303 }
3304 else
3305 {
3306 /* Apply the reverse permutation to our stmts. */
3307 vect_slp_permute (perms[perm],
3308 SLP_TREE_SCALAR_STMTS (node), true);
3309 /* And to the load permutation, which we can simply
3310 make regular by design. */
3311 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3312 {
3313 /* ??? When we handle non-bijective permutes the idea
3314 is that we can force the load-permutation to be
3315 { min, min + 1, min + 2, ... max }. But then the
3316 scalar defs might no longer match the lane content
3317 which means wrong-code with live lane vectorization.
3318 So we possibly have to have NULL entries for those. */
3319 vect_slp_permute (perms[perm],
3320 SLP_TREE_LOAD_PERMUTATION (node), true);
3321 }
3322 }
3323 }
3324
3325 /* Free the perms vector used for propagation. */
3326 while (!perms.is_empty ())
3327 perms.pop ().release ();
3328 free_graph (slpg);
3329
3330
3331 /* Now elide load permutations that are not necessary. */
3332 for (i = 0; i < leafs.length (); ++i)
3333 {
3334 node = vertices[leafs[i]];
3335 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3336 continue;
3337
3338 /* In basic block vectorization we allow any subchain of an interleaving
3339 chain.
3340 FORNOW: not in loop SLP because of realignment complications. */
3341 if (is_a <bb_vec_info> (vinfo))
3342 {
3343 bool subchain_p = true;
3344 stmt_vec_info next_load_info = NULL;
3345 stmt_vec_info load_info;
3346 unsigned j;
3347 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3348 {
3349 if (j != 0
3350 && (next_load_info != load_info
3351 || DR_GROUP_GAP (load_info) != 1))
3352 {
3353 subchain_p = false;
3354 break;
3355 }
3356 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
3357 }
3358 if (subchain_p)
3359 {
3360 SLP_TREE_LOAD_PERMUTATION (node).release ();
3361 continue;
3362 }
3363 }
3364 else
3365 {
3366 stmt_vec_info load_info;
3367 bool this_load_permuted = false;
3368 unsigned j;
3369 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
3370 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
3371 {
3372 this_load_permuted = true;
3373 break;
3374 }
3375 stmt_vec_info first_stmt_info
3376 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
3377 if (!this_load_permuted
3378 /* The load requires permutation when unrolling exposes
3379 a gap either because the group is larger than the SLP
3380 group-size or because there is a gap between the groups. */
3381 && (known_eq (LOOP_VINFO_VECT_FACTOR
3382 (as_a <loop_vec_info> (vinfo)), 1U)
3383 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
3384 && DR_GROUP_GAP (first_stmt_info) == 0)))
3385 {
3386 SLP_TREE_LOAD_PERMUTATION (node).release ();
3387 continue;
3388 }
3389 }
3390 }
3391 }
3392
3393 /* Gather loads reachable from the individual SLP graph entries. */
3394
3395 void
3396 vect_gather_slp_loads (vec_info *vinfo)
3397 {
3398 unsigned i;
3399 slp_instance instance;
3400 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
3401 {
3402 hash_set<slp_tree> visited;
3403 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
3404 SLP_INSTANCE_TREE (instance), visited);
3405 }
3406 }
3407
3408
3409 /* For each possible SLP instance decide whether to SLP it and calculate overall
3410 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3411 least one instance. */
3412
3413 bool
3414 vect_make_slp_decision (loop_vec_info loop_vinfo)
3415 {
3416 unsigned int i;
3417 poly_uint64 unrolling_factor = 1;
3418 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3419 slp_instance instance;
3420 int decided_to_slp = 0;
3421
3422 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3423
3424 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3425 {
3426 /* FORNOW: SLP if you can. */
3427 /* All unroll factors have the form:
3428
3429 GET_MODE_SIZE (vinfo->vector_mode) * X
3430
3431 for some rational X, so they must have a common multiple. */
3432 unrolling_factor
3433 = force_common_multiple (unrolling_factor,
3434 SLP_INSTANCE_UNROLLING_FACTOR (instance));
3435
3436 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3437 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3438 loop-based vectorization. Such stmts will be marked as HYBRID. */
3439 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3440 decided_to_slp++;
3441 }
3442
3443 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
3444
3445 if (decided_to_slp && dump_enabled_p ())
3446 {
3447 dump_printf_loc (MSG_NOTE, vect_location,
3448 "Decided to SLP %d instances. Unrolling factor ",
3449 decided_to_slp);
3450 dump_dec (MSG_NOTE, unrolling_factor);
3451 dump_printf (MSG_NOTE, "\n");
3452 }
3453
3454 return (decided_to_slp > 0);
3455 }
3456
3457 /* Private data for vect_detect_hybrid_slp. */
3458 struct vdhs_data
3459 {
3460 loop_vec_info loop_vinfo;
3461 vec<stmt_vec_info> *worklist;
3462 };
3463
3464 /* Walker for walk_gimple_op. */
3465
3466 static tree
3467 vect_detect_hybrid_slp (tree *tp, int *, void *data)
3468 {
3469 walk_stmt_info *wi = (walk_stmt_info *)data;
3470 vdhs_data *dat = (vdhs_data *)wi->info;
3471
3472 if (wi->is_lhs)
3473 return NULL_TREE;
3474
3475 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
3476 if (!def_stmt_info)
3477 return NULL_TREE;
3478 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
3479 if (PURE_SLP_STMT (def_stmt_info))
3480 {
3481 if (dump_enabled_p ())
3482 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
3483 def_stmt_info->stmt);
3484 STMT_SLP_TYPE (def_stmt_info) = hybrid;
3485 dat->worklist->safe_push (def_stmt_info);
3486 }
3487
3488 return NULL_TREE;
3489 }
3490
3491 /* Check whether STMT_INFO is indirectly consumed by SLP and mark it pure_slp
3492 if so, otherwise push it to WORKLIST. */
3493
3494 static void
3495 maybe_push_to_hybrid_worklist (vec_info *vinfo,
3496 vec<stmt_vec_info> &worklist,
3497 stmt_vec_info stmt_info)
3498 {
3499 if (dump_enabled_p ())
3500 dump_printf_loc (MSG_NOTE, vect_location,
3501 "Processing hybrid candidate : %G", stmt_info->stmt);
3502 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
3503 imm_use_iterator iter2;
3504 ssa_op_iter iter1;
3505 use_operand_p use_p;
3506 def_operand_p def_p;
3507 bool any_def = false;
3508 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
3509 {
3510 any_def = true;
3511 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
3512 {
3513 if (is_gimple_debug (USE_STMT (use_p)))
3514 continue;
3515 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
3516 /* An out-of-loop use means this is a loop_vect sink. */
3517 if (!use_info)
3518 {
3519 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 "Found loop_vect sink: %G", stmt_info->stmt);
3522 worklist.safe_push (stmt_info);
3523 return;
3524 }
3525 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
3526 {
3527 if (dump_enabled_p ())
3528 dump_printf_loc (MSG_NOTE, vect_location,
3529 "Found loop_vect use: %G", use_info->stmt);
3530 worklist.safe_push (stmt_info);
3531 return;
3532 }
3533 }
3534 }
3535 /* No def means this is a loop_vect sink. */
3536 if (!any_def)
3537 {
3538 if (dump_enabled_p ())
3539 dump_printf_loc (MSG_NOTE, vect_location,
3540 "Found loop_vect sink: %G", stmt_info->stmt);
3541 worklist.safe_push (stmt_info);
3542 return;
3543 }
3544 if (dump_enabled_p ())
3545 dump_printf_loc (MSG_NOTE, vect_location,
3546 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
3547 STMT_SLP_TYPE (stmt_info) = pure_slp;
3548 }
3549
3550 /* Find stmts that must be both vectorized and SLPed. */
3551
3552 void
3553 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
3554 {
3555 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3556
3557 /* All stmts participating in SLP are marked pure_slp, all other
3558 stmts are loop_vect.
3559 First collect all loop_vect stmts into a worklist.
3560 With SLP patterns not all original scalar stmts appear in
3561 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3562 Rectify this here by doing a backward walk over the IL, considering
3563 a stmt loop_vect only when it is used by a loop_vect stmt and
3564 otherwise marking it pure_slp. */
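/* Hypothetical illustration: a stmt that belongs to an SLP store group
   but whose value also feeds a reduction handled only by loop
   vectorization is reached from a loop_vect use; the backward walk
   below therefore marks the SLP-vectorized producer hybrid.  */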
3565 auto_vec<stmt_vec_info> worklist;
3566 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
3567 {
3568 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3569 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3570 gsi_next (&gsi))
3571 {
3572 gphi *phi = gsi.phi ();
3573 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
3574 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3575 maybe_push_to_hybrid_worklist (loop_vinfo,
3576 worklist, stmt_info);
3577 }
3578 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
3579 gsi_prev (&gsi))
3580 {
3581 gimple *stmt = gsi_stmt (gsi);
3582 if (is_gimple_debug (stmt))
3583 continue;
3584 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
3585 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3586 {
3587 for (gimple_stmt_iterator gsi2
3588 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
3589 !gsi_end_p (gsi2); gsi_next (&gsi2))
3590 {
3591 stmt_vec_info patt_info
3592 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
3593 if (!STMT_SLP_TYPE (patt_info)
3594 && STMT_VINFO_RELEVANT (patt_info))
3595 maybe_push_to_hybrid_worklist (loop_vinfo,
3596 worklist, patt_info);
3597 }
3598 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3599 }
3600 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
3601 maybe_push_to_hybrid_worklist (loop_vinfo,
3602 worklist, stmt_info);
3603 }
3604 }
3605
3606 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3607 mark any SLP vectorized stmt as hybrid.
3608 ??? We're visiting def stmts N times (once for each non-SLP and
3609 once for each hybrid-SLP use). */
3610 walk_stmt_info wi;
3611 vdhs_data dat;
3612 dat.worklist = &worklist;
3613 dat.loop_vinfo = loop_vinfo;
3614 memset (&wi, 0, sizeof (wi));
3615 wi.info = (void *)&dat;
3616 while (!worklist.is_empty ())
3617 {
3618 stmt_vec_info stmt_info = worklist.pop ();
3619 /* Since SSA operands are not set up for pattern stmts we need
3620 to use walk_gimple_op. */
3621 wi.is_lhs = 0;
3622 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
3623 }
3624 }
3625
3626
3627 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3628
3629 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
3630 : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
3631 {
3632 for (unsigned i = 0; i < bbs.length (); ++i)
3633 {
3634 if (i != 0)
3635 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3636 gsi_next (&si))
3637 {
3638 gphi *phi = si.phi ();
3639 gimple_set_uid (phi, 0);
3640 add_stmt (phi);
3641 }
3642 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3643 !gsi_end_p (gsi); gsi_next (&gsi))
3644 {
3645 gimple *stmt = gsi_stmt (gsi);
3646 gimple_set_uid (stmt, 0);
3647 if (is_gimple_debug (stmt))
3648 continue;
3649 add_stmt (stmt);
3650 }
3651 }
3652 }
3653
3654
3655 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3656 stmts in the basic blocks. */
3657
3658 _bb_vec_info::~_bb_vec_info ()
3659 {
3660 /* Reset region marker. */
3661 for (unsigned i = 0; i < bbs.length (); ++i)
3662 {
3663 if (i != 0)
3664 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
3665 gsi_next (&si))
3666 {
3667 gphi *phi = si.phi ();
3668 gimple_set_uid (phi, -1);
3669 }
3670 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3671 !gsi_end_p (gsi); gsi_next (&gsi))
3672 {
3673 gimple *stmt = gsi_stmt (gsi);
3674 gimple_set_uid (stmt, -1);
3675 }
3676 }
3677
3678 for (unsigned i = 0; i < roots.length (); ++i)
3679 roots[i].stmts.release ();
3680 roots.release ();
3681 }
3682
3683 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3684 given that child nodes have already been processed, and that
3685 their def types currently match their SLP node's def type. */
3686
3687 static bool
3688 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
3689 slp_instance node_instance,
3690 stmt_vector_for_cost *cost_vec)
3691 {
3692 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3693 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
3694
3695 /* Calculate the number of vector statements to be created for the
3696 scalar stmts in this node. For SLP reductions it is equal to the
3697 number of vector statements in the children (which has already been
3698 calculated by the recursive call). Otherwise it is the number of
3699 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3700 VF divided by the number of elements in a vector. */
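/* Illustrative example: with VF == 4, two scalar lanes per iteration and
   a V4SI vector type this yields (4 * 2) / 4 == 2 vector stmts.  */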
3701 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3702 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
3703 {
3704 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
3705 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
3706 {
3707 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3708 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
3709 break;
3710 }
3711 }
3712 else
3713 {
3714 poly_uint64 vf;
3715 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3716 vf = loop_vinfo->vectorization_factor;
3717 else
3718 vf = 1;
3719 unsigned int group_size = SLP_TREE_LANES (node);
3720 tree vectype = SLP_TREE_VECTYPE (node);
3721 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
3722 = vect_get_num_vectors (vf * group_size, vectype);
3723 }
3724
3725 /* Handle purely internal nodes. */
3726 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3727 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
3728
3729 if (is_a <bb_vec_info> (vinfo)
3730 && !vect_update_shared_vectype (stmt_info, SLP_TREE_VECTYPE (node)))
3731 {
3732 if (dump_enabled_p ())
3733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3734 "desired vector type conflicts with earlier one "
3735 "for %G", stmt_info->stmt);
3736 return false;
3737 }
3738
3739 bool dummy;
3740 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
3741 node, node_instance, cost_vec);
3742 }
3743
3744 /* Try to build NODE from scalars, returning true on success.
3745 NODE_INSTANCE is the SLP instance that contains NODE. */
3746
3747 static bool
3748 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
3749 slp_instance node_instance)
3750 {
3751 stmt_vec_info stmt_info;
3752 unsigned int i;
3753
3754 if (!is_a <bb_vec_info> (vinfo)
3755 || node == SLP_INSTANCE_TREE (node_instance)
3756 || !SLP_TREE_SCALAR_STMTS (node).exists ()
3757 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
3758 return false;
3759
3760 if (dump_enabled_p ())
3761 dump_printf_loc (MSG_NOTE, vect_location,
3762 "Building vector operands of %p from scalars instead\n", node);
3763
3764 /* Don't remove and free the child nodes here, since they could be
3765 referenced by other structures. The analysis and scheduling phases
3766 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3767 unsigned int group_size = SLP_TREE_LANES (node);
3768 SLP_TREE_DEF_TYPE (node) = vect_external_def;
3769 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
3770 SLP_TREE_LOAD_PERMUTATION (node).release ();
3771 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3772 {
3773 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
3774 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
3775 }
3776 return true;
3777 }
3778
3779 /* Compute the prologue cost for invariant or constant operands represented
3780 by NODE. */
3781
3782 static void
3783 vect_prologue_cost_for_slp (slp_tree node,
3784 stmt_vector_for_cost *cost_vec)
3785 {
3786 /* There's a special case of an existing vector, which costs nothing. */
3787 if (SLP_TREE_SCALAR_OPS (node).length () == 0
3788 && !SLP_TREE_VEC_DEFS (node).is_empty ())
3789 return;
3790 /* Without looking at the actual initializer a vector of
3791 constants can be implemented as a load from the constant pool.
3792 When all elements are the same we can use a splat. */
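/* Illustrative costing (not exhaustive): an external def like
   {x_1, x_1, x_1, x_1} is costed as one scalar_to_vec splat,
   {a_1, b_2, c_3, d_4} as one vec_construct, and a vector of literal
   constants as one vector_load from the constant pool.  */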
3793 tree vectype = SLP_TREE_VECTYPE (node);
3794 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
3795 unsigned num_vects_to_check;
3796 unsigned HOST_WIDE_INT const_nunits;
3797 unsigned nelt_limit;
3798 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
3799 && ! multiple_p (const_nunits, group_size))
3800 {
3801 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
3802 nelt_limit = const_nunits;
3803 }
3804 else
3805 {
3806 /* If either the vector has variable length or the vectors
3807 are composed of repeated whole groups we only need to
3808 cost construction once. All vectors will be the same. */
3809 num_vects_to_check = 1;
3810 nelt_limit = group_size;
3811 }
3812 tree elt = NULL_TREE;
3813 unsigned nelt = 0;
3814 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
3815 {
3816 unsigned si = j % group_size;
3817 if (nelt == 0)
3818 elt = SLP_TREE_SCALAR_OPS (node)[si];
3819 /* ??? We're just tracking whether all operands of a single
3820 vector initializer are the same, ideally we'd check if
3821 we emitted the same one already. */
3822 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
3823 elt = NULL_TREE;
3824 nelt++;
3825 if (nelt == nelt_limit)
3826 {
3827 record_stmt_cost (cost_vec, 1,
3828 SLP_TREE_DEF_TYPE (node) == vect_external_def
3829 ? (elt ? scalar_to_vec : vec_construct)
3830 : vector_load,
3831 NULL, vectype, 0, vect_prologue);
3832 nelt = 0;
3833 }
3834 }
3835 }
3836
3837 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3838 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3839
3840 Return true if the operations are supported. */
3841
3842 static bool
3843 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
3844 slp_instance node_instance,
3845 hash_set<slp_tree> &visited_set,
3846 vec<slp_tree> &visited_vec,
3847 stmt_vector_for_cost *cost_vec)
3848 {
3849 int i, j;
3850 slp_tree child;
3851
3852 /* Assume we can code-generate all invariants. */
3853 if (!node
3854 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
3855 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
3856 return true;
3857
3858 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
3859 {
3860 if (dump_enabled_p ())
3861 dump_printf_loc (MSG_NOTE, vect_location,
3862 "Failed cyclic SLP reference in %p", node);
3863 return false;
3864 }
3865 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
3866
3867 /* If we already analyzed the exact same set of scalar stmts we're done.
3868 We share the generated vector stmts for those. */
3869 if (visited_set.add (node))
3870 return true;
3871 visited_vec.safe_push (node);
3872
3873 bool res = true;
3874 unsigned visited_rec_start = visited_vec.length ();
3875 unsigned cost_vec_rec_start = cost_vec->length ();
3876 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3877 {
3878 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
3879 visited_set, visited_vec,
3880 cost_vec);
3881 if (!res)
3882 break;
3883 }
3884
3885 if (res)
3886 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
3887 cost_vec);
3888 /* If analysis failed we have to pop all recursive visited nodes
3889 plus ourselves. */
3890 if (!res)
3891 {
3892 while (visited_vec.length () >= visited_rec_start)
3893 visited_set.remove (visited_vec.pop ());
3894 cost_vec->truncate (cost_vec_rec_start);
3895 }
3896
3897 /* When the node can be vectorized cost invariant nodes it references.
3898 This is not done in DFS order to allow the referring node's
3899 vectorizable_* calls to nail down the invariant node's vector type
3900 and possibly unshare it if it needs a different vector type than
3901 other referrers. */
3902 if (res)
3903 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3904 if (child
3905 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
3906 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
3907 /* Perform usual caching, note code-generation still
3908 code-gens these nodes multiple times but we expect
3909 to CSE them later. */
3910 && !visited_set.add (child))
3911 {
3912 visited_vec.safe_push (child);
3913 /* ??? After auditing more code paths make a "default"
3914 and push the vector type from NODE to all children
3915 if it is not already set. */
3916 /* Compute the number of vectors to be generated. */
3917 tree vector_type = SLP_TREE_VECTYPE (child);
3918 if (!vector_type)
3919 {
3920 /* For shifts with a scalar argument we don't need
3921 to cost or code-generate anything.
3922 ??? Represent this more explicitly. */
3923 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
3924 == shift_vec_info_type)
3925 && j == 1);
3926 continue;
3927 }
3928 unsigned group_size = SLP_TREE_LANES (child);
3929 poly_uint64 vf = 1;
3930 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3931 vf = loop_vinfo->vectorization_factor;
3932 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
3933 = vect_get_num_vectors (vf * group_size, vector_type);
3934 /* And cost them. */
3935 vect_prologue_cost_for_slp (child, cost_vec);
3936 }
3937
3938 /* If this node or any of its children can't be vectorized, try pruning
3939 the tree here rather than felling the whole thing. */
3940 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
3941 {
3942 /* We'll need to revisit this for invariant costing and number
3943 of vectorized stmt setting. */
3944 res = true;
3945 }
3946
3947 return res;
3948 }
3949
3950
3951 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3952 region and that can be vectorized using vectorizable_live_operation
3953 with STMT_VINFO_LIVE_P. Live operations that are not handled cause
3954 the scalar code computing them to be retained. */
3955
3956 static void
3957 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
3958 slp_instance instance,
3959 stmt_vector_for_cost *cost_vec,
3960 hash_set<stmt_vec_info> &svisited,
3961 hash_set<slp_tree> &visited)
3962 {
3963 if (visited.add (node))
3964 return;
3965
3966 unsigned i;
3967 stmt_vec_info stmt_info;
3968 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
3969 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3970 {
3971 if (svisited.contains (stmt_info))
3972 continue;
3973 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3974 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
3975 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
3976 /* Only the pattern root stmt computes the original scalar value. */
3977 continue;
3978 bool mark_visited = true;
3979 gimple *orig_stmt = orig_stmt_info->stmt;
3980 ssa_op_iter op_iter;
3981 def_operand_p def_p;
3982 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3983 {
3984 imm_use_iterator use_iter;
3985 gimple *use_stmt;
3986 stmt_vec_info use_stmt_info;
3987 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3988 if (!is_gimple_debug (use_stmt))
3989 {
3990 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
3991 if (!use_stmt_info
3992 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3993 {
3994 STMT_VINFO_LIVE_P (stmt_info) = true;
3995 if (vectorizable_live_operation (bb_vinfo, stmt_info,
3996 NULL, node, instance, i,
3997 false, cost_vec))
3998 /* ??? So we know we can vectorize the live stmt
3999 from one SLP node. If we cannot do so from all
4000 or none consistently we'd have to record which
4001 SLP node (and lane) we want to use for the live
4002 operation. So make sure we can code-generate
4003 from all nodes. */
4004 mark_visited = false;
4005 else
4006 STMT_VINFO_LIVE_P (stmt_info) = false;
4007 break;
4008 }
4009 }
4010 /* We have to verify whether we can insert the lane extract
4011 before all uses. The following is a conservative approximation.
4012 We cannot put this into vectorizable_live_operation because
4013 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4014 doesn't work.
4015 Note that while the fact that we emit code for loads at the
4016 first load should make this a non-problem, leafs we construct
4017 from scalars are vectorized after the last scalar def.
4018 ??? If we'd actually compute the insert location during
4019 analysis we could use something less conservative than the last
4020 scalar stmt in the node for the dominance check. */
4021 /* ??? What remains is "live" uses in vector CTORs in the same
4022 SLP graph which is where those uses can end up code-generated
4023 right after their definition instead of close to their original
4024 use. But that would restrict us to code-generate lane-extracts
4025 from the latest stmt in a node. So we compensate for this
4026 during code-generation, simply not replacing uses for those
4027 hopefully rare cases. */
4028 if (STMT_VINFO_LIVE_P (stmt_info))
4029 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4030 if (!is_gimple_debug (use_stmt)
4031 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4032 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4033 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4034 {
4035 if (dump_enabled_p ())
4036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4037 "Cannot determine insertion place for "
4038 "lane extract\n");
4039 STMT_VINFO_LIVE_P (stmt_info) = false;
4040 mark_visited = true;
4041 }
4042 }
4043 if (mark_visited)
4044 svisited.add (stmt_info);
4045 }
4046
4047 slp_tree child;
4048 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4049 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4050 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4051 cost_vec, svisited, visited);
4052 }
4053
4054 /* Analyze statements in SLP instances of VINFO. Return true if the
4055 operations are supported. */
4056
4057 bool
4058 vect_slp_analyze_operations (vec_info *vinfo)
4059 {
4060 slp_instance instance;
4061 int i;
4062
4063 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4064
4065 hash_set<slp_tree> visited;
4066 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4067 {
4068 auto_vec<slp_tree> visited_vec;
4069 stmt_vector_for_cost cost_vec;
4070 cost_vec.create (2);
4071 if (is_a <bb_vec_info> (vinfo))
4072 vect_location = instance->location ();
4073 if (!vect_slp_analyze_node_operations (vinfo,
4074 SLP_INSTANCE_TREE (instance),
4075 instance, visited, visited_vec,
4076 &cost_vec)
4077 /* Instances with a root stmt require vectorized defs for the
4078 SLP tree root. */
4079 || (SLP_INSTANCE_ROOT_STMT (instance)
4080 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4081 != vect_internal_def)))
4082 {
4083 slp_tree node = SLP_INSTANCE_TREE (instance);
4084 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4085 if (dump_enabled_p ())
4086 dump_printf_loc (MSG_NOTE, vect_location,
4087 "removing SLP instance operations starting from: %G",
4088 stmt_info->stmt);
4089 vect_free_slp_instance (instance);
4090 vinfo->slp_instances.ordered_remove (i);
4091 cost_vec.release ();
4092 while (!visited_vec.is_empty ())
4093 visited.remove (visited_vec.pop ());
4094 }
4095 else
4096 {
4097 i++;
4098
4099 /* For BB vectorization remember the SLP graph entry
4100 cost for later. */
4101 if (is_a <bb_vec_info> (vinfo))
4102 instance->cost_vec = cost_vec;
4103 else
4104 {
4105 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
4106 cost_vec.release ();
4107 }
4108 }
4109 }
4110
4111 /* Compute vectorizable live stmts. */
4112 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4113 {
4114 hash_set<stmt_vec_info> svisited;
4115 hash_set<slp_tree> visited;
4116 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
4117 {
4118 vect_location = instance->location ();
4119 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
4120 instance, &instance->cost_vec, svisited,
4121 visited);
4122 }
4123 }
4124
4125 return !vinfo->slp_instances.is_empty ();
4126 }
4127
4128 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
4129 closing any chain of leaders. */
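/* This is essentially the find operation of a union-find structure with
   path compression: the chain of recorded leaders is followed to the
   ultimate leader and every visited entry is then redirected to point
   at it directly.  */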
4130
4131 static slp_instance
4132 get_ultimate_leader (slp_instance instance,
4133 hash_map<slp_instance, slp_instance> &instance_leader)
4134 {
4135 auto_vec<slp_instance *, 8> chain;
4136 slp_instance *tem;
4137 while (*(tem = instance_leader.get (instance)) != instance)
4138 {
4139 chain.safe_push (tem);
4140 instance = *tem;
4141 }
4142 while (!chain.is_empty ())
4143 *chain.pop () = instance;
4144 return instance;
4145 }
4146
4147 /* Worker of vect_bb_partition_graph, recurse on NODE. */
4148
4149 static void
4150 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
4151 slp_instance instance, slp_tree node,
4152 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
4153 hash_map<slp_instance, slp_instance> &instance_leader,
4154 hash_set<slp_tree> &visited)
4155 {
4156 stmt_vec_info stmt_info;
4157 unsigned i;
4158
4159 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4160 {
4161 bool existed_p;
4162 slp_instance &stmt_instance
4163 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
4164 if (!existed_p)
4165 ;
4166 else if (stmt_instance != instance)
4167 {
4168 /* If we're running into a previously marked stmt make us the
4169 leader of the current ultimate leader. This keeps the
4170 leader chain acyclic and works even when the current instance
4171 connects two previously independent graph parts. */
4172 slp_instance stmt_leader
4173 = get_ultimate_leader (stmt_instance, instance_leader);
4174 if (stmt_leader != instance)
4175 instance_leader.put (stmt_leader, instance);
4176 }
4177 stmt_instance = instance;
4178 }
4179
4180 if (visited.add (node))
4181 return;
4182
4183 slp_tree child;
4184 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4185 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4186 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
4187 instance_leader, visited);
4188 }
4189
4190 /* Partition the SLP graph into pieces that can be costed independently. */
4191
4192 static void
4193 vect_bb_partition_graph (bb_vec_info bb_vinfo)
4194 {
4195 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4196
4197 /* First walk the SLP graph assigning each involved scalar stmt a
4198 corresponding SLP graph entry and upon visiting a previously
4199 marked stmt, make the stmt's leader the current SLP graph entry. */
4200 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
4201 hash_map<slp_instance, slp_instance> instance_leader;
4202 hash_set<slp_tree> visited;
4203 slp_instance instance;
4204 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4205 {
4206 instance_leader.put (instance, instance);
4207 vect_bb_partition_graph_r (bb_vinfo,
4208 instance, SLP_INSTANCE_TREE (instance),
4209 stmt_to_instance, instance_leader,
4210 visited);
4211 }
4212
4213 /* Then collect entries to each independent subgraph. */
4214 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
4215 {
4216 slp_instance leader = get_ultimate_leader (instance, instance_leader);
4217 leader->subgraph_entries.safe_push (instance);
4218 if (dump_enabled_p ()
4219 && leader != instance)
4220 dump_printf_loc (MSG_NOTE, vect_location,
4221 "instance %p is leader of %p\n",
4222 leader, instance);
4223 }
4224 }
4225
4226 /* Compute the scalar cost of the SLP node NODE and its children,
4227 recording it in COST_VEC. Do not account defs that are marked in LIFE
4228 and update LIFE according to uses of NODE. */
4229
4230 static void
4231 vect_bb_slp_scalar_cost (vec_info *vinfo,
4232 slp_tree node, vec<bool, va_heap> *life,
4233 stmt_vector_for_cost *cost_vec,
4234 hash_set<slp_tree> &visited)
4235 {
4236 unsigned i;
4237 stmt_vec_info stmt_info;
4238 slp_tree child;
4239
4240 if (visited.add (node))
4241 return;
4242
4243 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4244 {
4245 ssa_op_iter op_iter;
4246 def_operand_p def_p;
4247
4248 if ((*life)[i])
4249 continue;
4250
4251 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4252 gimple *orig_stmt = orig_stmt_info->stmt;
4253
4254 /* If there is a non-vectorized use of the defs then the scalar
4255 stmt is kept live in which case we do not account it or any
4256 required defs in the SLP children in the scalar cost. This
4257 way we make the vectorization more costly when compared to
4258 the scalar cost. */
4259 if (!STMT_VINFO_LIVE_P (stmt_info))
4260 {
4261 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4262 {
4263 imm_use_iterator use_iter;
4264 gimple *use_stmt;
4265 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4266 if (!is_gimple_debug (use_stmt))
4267 {
4268 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
4269 if (!use_stmt_info
4270 || !PURE_SLP_STMT
4271 (vect_stmt_to_vectorize (use_stmt_info)))
4272 {
4273 (*life)[i] = true;
4274 break;
4275 }
4276 }
4277 }
4278 if ((*life)[i])
4279 continue;
4280 }
4281
4282 /* Count scalar stmts only once. */
4283 if (gimple_visited_p (orig_stmt))
4284 continue;
4285 gimple_set_visited (orig_stmt, true);
4286
4287 vect_cost_for_stmt kind;
4288 if (STMT_VINFO_DATA_REF (orig_stmt_info))
4289 {
4290 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
4291 kind = scalar_load;
4292 else
4293 kind = scalar_store;
4294 }
4295 else if (vect_nop_conversion_p (orig_stmt_info))
4296 continue;
4297 else
4298 kind = scalar_stmt;
4299 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
4300 SLP_TREE_VECTYPE (node), 0, vect_body);
4301 }
4302
4303 auto_vec<bool, 20> subtree_life;
4304 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4305 {
4306 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4307 {
4308 /* Do not directly pass LIFE to the recursive call, copy it to
4309 confine changes in the callee to the current child/subtree. */
4310 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4311 {
4312 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
4313 for (unsigned j = 0;
4314 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
4315 {
4316 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
4317 if (perm.first == i)
4318 subtree_life[perm.second] = (*life)[j];
4319 }
4320 }
4321 else
4322 {
4323 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
4324 subtree_life.safe_splice (*life);
4325 }
4326 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
4327 visited);
4328 subtree_life.truncate (0);
4329 }
4330 }
4331 }
4332
4333 /* Check if vectorization of the basic block is profitable for the
4334 subgraph denoted by SLP_INSTANCES. */
4335
4336 static bool
4337 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
4338 vec<slp_instance> slp_instances)
4339 {
4340 slp_instance instance;
4341 int i;
4342 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
4343 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
4344
4345 void *vect_target_cost_data = init_cost (NULL);
4346
4347 /* Calculate scalar cost and sum the cost for the vector stmts
4348 previously collected. */
4349 stmt_vector_for_cost scalar_costs;
4350 scalar_costs.create (0);
4351 hash_set<slp_tree> visited;
4352 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4353 {
4354 auto_vec<bool, 20> life;
4355 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
4356 true);
4357 vect_bb_slp_scalar_cost (bb_vinfo,
4358 SLP_INSTANCE_TREE (instance),
4359 &life, &scalar_costs, visited);
4360 add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
4361 instance->cost_vec.release ();
4362 }
4363 /* Unset visited flag. */
4364 stmt_info_for_cost *si;
4365 FOR_EACH_VEC_ELT (scalar_costs, i, si)
4366 gimple_set_visited (si->stmt_info->stmt, false);
4367
4368 void *scalar_target_cost_data = init_cost (NULL);
4369 add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
4370 scalar_costs.release ();
4371 unsigned dummy;
4372 finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
4373 destroy_cost_data (scalar_target_cost_data);
4374
4375 /* Complete the target-specific vector cost calculation. */
4376 finish_cost (vect_target_cost_data, &vec_prologue_cost,
4377 &vec_inside_cost, &vec_epilogue_cost);
4378 destroy_cost_data (vect_target_cost_data);
4379
4380 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
4381
4382 if (dump_enabled_p ())
4383 {
4384 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4385 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
4386 vec_inside_cost);
4387 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
4388 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
4389 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
4390 }
4391
4392 /* Vectorization is profitable if its cost is no more than the cost of
4393 the scalar version. Note that we err on the vector side for equal cost because
4394 the cost estimate is otherwise quite pessimistic (constant uses are
4395 free on the scalar side but cost a load on the vector side for
4396 example). */
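/* Illustrative numbers: vec_inside_cost 4 plus vec_outside_cost 2
   against scalar_cost 6 still vectorizes, since 4 + 2 is not greater
   than 6.  */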
4397 if (vec_outside_cost + vec_inside_cost > scalar_cost)
4398 return false;
4399
4400 return true;
4401 }
4402
4403 /* qsort comparator for lane defs. */
4404
4405 static int
4406 vld_cmp (const void *a_, const void *b_)
4407 {
4408 auto *a = (const std::pair<unsigned, tree> *)a_;
4409 auto *b = (const std::pair<unsigned, tree> *)b_;
4410 return a->first - b->first;
4411 }
4412
4413 /* Return true if USE_STMT is a vector lane insert into VEC and set
4414 *THIS_LANE to the lane number that is set. */
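/* Illustrative GIMPLE (hypothetical names): for 32-bit vector elements
     vec_2 = BIT_INSERT_EXPR <vec_1, s_3, 64>;
   is a lane insert into vec_1 with *THIS_LANE set to 64 / 32 == 2.  */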
4415
4416 static bool
4417 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
4418 {
4419 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
4420 if (!use_ass
4421 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
4422 || (vec
4423 ? gimple_assign_rhs1 (use_ass) != vec
4424 : ((vec = gimple_assign_rhs1 (use_ass)), false))
4425 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
4426 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
4427 || !constant_multiple_p
4428 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
4429 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
4430 this_lane))
4431 return false;
4432 return true;
4433 }
4434
4435 /* Find any vectorizable constructors and add them to the grouped_stores
4436 array. */
4437
4438 static void
4439 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
4440 {
4441 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
4442 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
4443 !gsi_end_p (gsi); gsi_next (&gsi))
4444 {
4445 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
4446 if (!assign)
4447 continue;
4448
4449 tree rhs = gimple_assign_rhs1 (assign);
4450 if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
4451 {
4452 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
4453 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
4454 CONSTRUCTOR_NELTS (rhs))
4455 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
4456 || uniform_vector_p (rhs))
4457 continue;
4458
4459 unsigned j;
4460 tree val;
4461 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
4462 if (TREE_CODE (val) != SSA_NAME
4463 || !bb_vinfo->lookup_def (val))
4464 break;
4465 if (j != CONSTRUCTOR_NELTS (rhs))
4466 continue;
4467
4468 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
4469 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
4470 }
4471 else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
4472 && VECTOR_TYPE_P (TREE_TYPE (rhs))
4473 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
4474 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
4475 && integer_zerop (gimple_assign_rhs3 (assign))
4476 && useless_type_conversion_p
4477 (TREE_TYPE (TREE_TYPE (rhs)),
4478 TREE_TYPE (gimple_assign_rhs2 (assign)))
4479 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
4480 {
4481 /* We start matching on an insert to lane zero, but since the
4482 inserts need not be ordered we have to search both
4483 the def and the use chains. */
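/* A hypothetical chain this matches for a V4SI destination:
     x_1 = BIT_INSERT_EXPR <x_0(D), a_5, 0>;
     x_2 = BIT_INSERT_EXPR <x_1, b_6, 32>;
     x_3 = BIT_INSERT_EXPR <x_2, c_7, 64>;
     x_4 = BIT_INSERT_EXPR <x_3, d_8, 96>;
   where the defs of a_5 ... d_8 become the SLP lanes and the stmt
   defining x_4 is recorded as the root.  */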
4484 tree vectype = TREE_TYPE (rhs);
4485 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4486 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
4487 auto_sbitmap lanes (nlanes);
4488 bitmap_clear (lanes);
4489 bitmap_set_bit (lanes, 0);
4490 tree def = gimple_assign_lhs (assign);
4491 lane_defs.quick_push
4492 (std::make_pair (0, gimple_assign_rhs2 (assign)));
4493 unsigned lanes_found = 1;
4494 /* Start with the use chains, the last stmt will be the root. */
4495 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
4496 do
4497 {
4498 use_operand_p use_p;
4499 gimple *use_stmt;
4500 if (!single_imm_use (def, &use_p, &use_stmt))
4501 break;
4502 unsigned this_lane;
4503 if (!bb_vinfo->lookup_stmt (use_stmt)
4504 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
4505 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
4506 break;
4507 if (bitmap_bit_p (lanes, this_lane))
4508 break;
4509 lanes_found++;
4510 bitmap_set_bit (lanes, this_lane);
4511 gassign *use_ass = as_a <gassign *> (use_stmt);
4512 lane_defs.quick_push (std::make_pair
4513 (this_lane, gimple_assign_rhs2 (use_ass)));
4514 last = bb_vinfo->lookup_stmt (use_ass);
4515 def = gimple_assign_lhs (use_ass);
4516 }
4517 while (lanes_found < nlanes);
4518 if (lanes_found < nlanes)
4519 {
4520 /* Now search the def chain. */
4521 def = gimple_assign_rhs1 (assign);
4522 do
4523 {
4524 if (TREE_CODE (def) != SSA_NAME
4525 || !has_single_use (def))
4526 break;
4527 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
4528 unsigned this_lane;
4529 if (!bb_vinfo->lookup_stmt (def_stmt)
4530 || !vect_slp_is_lane_insert (def_stmt,
4531 NULL_TREE, &this_lane)
4532 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
4533 break;
4534 if (bitmap_bit_p (lanes, this_lane))
4535 break;
4536 lanes_found++;
4537 bitmap_set_bit (lanes, this_lane);
4538 lane_defs.quick_push (std::make_pair
4539 (this_lane,
4540 gimple_assign_rhs2 (def_stmt)));
4541 def = gimple_assign_rhs1 (def_stmt);
4542 }
4543 while (lanes_found < nlanes);
4544 }
4545 if (lanes_found == nlanes)
4546 {
4547 /* Sort lane_defs by lane index and register the root. */
4548 lane_defs.qsort (vld_cmp);
4549 vec<stmt_vec_info> stmts;
4550 stmts.create (nlanes);
4551 for (unsigned i = 0; i < nlanes; ++i)
4552 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
4553 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
4554 stmts, last));
4555 }
4556 }
4557 }
4558 }
4559
4560 /* Walk the grouped store chains and replace entries with their
4561 pattern variant if any. */
4562
4563 static void
4564 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
4565 {
4566 stmt_vec_info first_element;
4567 unsigned i;
4568
4569 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4570 {
4571 /* We also have CTORs in this array. */
4572 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
4573 continue;
4574 if (STMT_VINFO_IN_PATTERN_P (first_element))
4575 {
4576 stmt_vec_info orig = first_element;
4577 first_element = STMT_VINFO_RELATED_STMT (first_element);
4578 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
4579 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
4580 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
4581 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
4582 vinfo->grouped_stores[i] = first_element;
4583 }
4584 stmt_vec_info prev = first_element;
4585 while (DR_GROUP_NEXT_ELEMENT (prev))
4586 {
4587 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
4588 if (STMT_VINFO_IN_PATTERN_P (elt))
4589 {
4590 stmt_vec_info orig = elt;
4591 elt = STMT_VINFO_RELATED_STMT (elt);
4592 DR_GROUP_NEXT_ELEMENT (prev) = elt;
4593 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
4594 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
4595 }
4596 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
4597 prev = elt;
4598 }
4599 }
4600 }
4601
4602 /* Check if the region described by BB_VINFO can be vectorized, returning
4603 true if so. When returning false, set FATAL to true if the same failure
4604 would prevent vectorization at other vector sizes, false if it is still
4605 worth trying other sizes. N_STMTS is the number of statements in the
4606 region. */
4607
4608 static bool
4609 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
4610 vec<int> *dataref_groups)
4611 {
4612 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4613
4614 slp_instance instance;
4615 int i;
4616 poly_uint64 min_vf = 2;
4617
4618 /* The first group of checks is independent of the vector size. */
4619 fatal = true;
4620
4621 /* Analyze the data references. */
4622
4623 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
4624 {
4625 if (dump_enabled_p ())
4626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4627 "not vectorized: unhandled data-ref in basic "
4628 "block.\n");
4629 return false;
4630 }
4631
4632 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
4633 {
4634 if (dump_enabled_p ())
4635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4636 "not vectorized: unhandled data access in "
4637 "basic block.\n");
4638 return false;
4639 }
4640
4641 vect_slp_check_for_constructors (bb_vinfo);
4642
4643 /* If there are no grouped stores and no constructors in the region
4644 there is no need to continue with pattern recog as vect_analyze_slp
4645 will fail anyway. */
4646 if (bb_vinfo->grouped_stores.is_empty ()
4647 && bb_vinfo->roots.is_empty ())
4648 {
4649 if (dump_enabled_p ())
4650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4651 "not vectorized: no grouped stores in "
4652 "basic block.\n");
4653 return false;
4654 }
4655
4656 /* Whereas the rest of the analysis below depends on the vector size in some way. */
4657 fatal = false;
4658
4659 vect_pattern_recog (bb_vinfo);
4660
4661 /* Update store groups from pattern processing. */
4662 vect_fixup_store_groups_with_patterns (bb_vinfo);
4663
4664 /* Check the SLP opportunities in the basic block, analyze and build SLP
4665 trees. */
4666 if (!vect_analyze_slp (bb_vinfo, n_stmts))
4667 {
4668 if (dump_enabled_p ())
4669 {
4670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4671 "Failed to SLP the basic block.\n");
4672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4673 "not vectorized: failed to find SLP opportunities "
4674 "in basic block.\n");
4675 }
4676 return false;
4677 }
4678
4679 /* Optimize permutations. */
4680 vect_optimize_slp (bb_vinfo);
4681
4682 /* Gather the loads reachable from the SLP graph entries. */
4683 vect_gather_slp_loads (bb_vinfo);
4684
4685 vect_record_base_alignments (bb_vinfo);
4686
4687 /* Analyze and verify the alignment of data references and the
4688 dependence in the SLP instances. */
4689 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
4690 {
4691 vect_location = instance->location ();
4692 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
4693 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
4694 {
4695 slp_tree node = SLP_INSTANCE_TREE (instance);
4696 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4697 if (dump_enabled_p ())
4698 dump_printf_loc (MSG_NOTE, vect_location,
4699 "removing SLP instance operations starting from: %G",
4700 stmt_info->stmt);
4701 vect_free_slp_instance (instance);
4702 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
4703 continue;
4704 }
4705
4706 /* Mark all the statements that we want to vectorize as pure SLP and
4707 relevant. */
4708 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4709 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
4710 if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
4711 {
4712 STMT_SLP_TYPE (root) = pure_slp;
4713 if (is_gimple_assign (root->stmt)
4714 && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
4715 {
4716 /* ??? We should probably record the whole vector of
4717 root stmts so we do not have to back-track here... */
4718 for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
4719 n != 1; --n)
4720 {
4721 root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
4722 STMT_SLP_TYPE (root) = pure_slp;
4723 }
4724 }
4725 }
4726
4727 i++;
4728 }
4729 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
4730 return false;
4731
4732 if (!vect_slp_analyze_operations (bb_vinfo))
4733 {
4734 if (dump_enabled_p ())
4735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4736 "not vectorized: bad operation in basic block.\n");
4737 return false;
4738 }
4739
4740 vect_bb_partition_graph (bb_vinfo);
4741
4742 return true;
4743 }
4744
4745 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4746 basic blocks in BBS, returning true on success.
4747 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4748
4749 static bool
4750 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
4751 vec<int> *dataref_groups, unsigned int n_stmts)
4752 {
4753 bb_vec_info bb_vinfo;
4754 auto_vector_modes vector_modes;
4755
4756 /* Autodetect first vector size we try. */
4757 machine_mode next_vector_mode = VOIDmode;
4758 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
4759 unsigned int mode_i = 0;
4760
4761 vec_info_shared shared;
4762
4763 machine_mode autodetected_vector_mode = VOIDmode;
4764 while (1)
4765 {
4766 bool vectorized = false;
4767 bool fatal = false;
4768 bb_vinfo = new _bb_vec_info (bbs, &shared);
4769
4770 bool first_time_p = shared.datarefs.is_empty ();
4771 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
4772 if (first_time_p)
4773 bb_vinfo->shared->save_datarefs ();
4774 else
4775 bb_vinfo->shared->check_datarefs ();
4776 bb_vinfo->vector_mode = next_vector_mode;
4777
4778 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
4779 {
4780 if (dump_enabled_p ())
4781 {
4782 dump_printf_loc (MSG_NOTE, vect_location,
4783 "***** Analysis succeeded with vector mode"
4784 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
4785 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
4786 }
4787
4788 bb_vinfo->shared->check_datarefs ();
4789
4790 unsigned i;
4791 slp_instance instance;
4792 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
4793 {
4794 if (instance->subgraph_entries.is_empty ())
4795 continue;
4796
4797 vect_location = instance->location ();
4798 if (!unlimited_cost_model (NULL)
4799 && !vect_bb_vectorization_profitable_p
4800 (bb_vinfo, instance->subgraph_entries))
4801 {
4802 if (dump_enabled_p ())
4803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4804 "not vectorized: vectorization is not "
4805 "profitable.\n");
4806 continue;
4807 }
4808
4809 if (!dbg_cnt (vect_slp))
4810 continue;
4811
4812 if (!vectorized && dump_enabled_p ())
4813 dump_printf_loc (MSG_NOTE, vect_location,
4814 "Basic block will be vectorized "
4815 "using SLP\n");
4816 vectorized = true;
4817
4818 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
4819
4820 unsigned HOST_WIDE_INT bytes;
4821 if (dump_enabled_p ())
4822 {
4823 if (GET_MODE_SIZE
4824 (bb_vinfo->vector_mode).is_constant (&bytes))
4825 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4826 "basic block part vectorized using %wu "
4827 "byte vectors\n", bytes);
4828 else
4829 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
4830 "basic block part vectorized using "
4831 "variable length vectors\n");
4832 }
4833 }
4834 }
4835 else
4836 {
4837 if (dump_enabled_p ())
4838 dump_printf_loc (MSG_NOTE, vect_location,
4839 "***** Analysis failed with vector mode %s\n",
4840 GET_MODE_NAME (bb_vinfo->vector_mode));
4841 }
4842
4843 if (mode_i == 0)
4844 autodetected_vector_mode = bb_vinfo->vector_mode;
4845
4846 if (!fatal)
4847 while (mode_i < vector_modes.length ()
4848 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
4849 {
4850 if (dump_enabled_p ())
4851 dump_printf_loc (MSG_NOTE, vect_location,
4852 "***** The result for vector mode %s would"
4853 " be the same\n",
4854 GET_MODE_NAME (vector_modes[mode_i]));
4855 mode_i += 1;
4856 }
4857
4858 delete bb_vinfo;
4859
4860 if (mode_i < vector_modes.length ()
4861 && VECTOR_MODE_P (autodetected_vector_mode)
4862 && (related_vector_mode (vector_modes[mode_i],
4863 GET_MODE_INNER (autodetected_vector_mode))
4864 == autodetected_vector_mode)
4865 && (related_vector_mode (autodetected_vector_mode,
4866 GET_MODE_INNER (vector_modes[mode_i]))
4867 == vector_modes[mode_i]))
4868 {
4869 if (dump_enabled_p ())
4870 dump_printf_loc (MSG_NOTE, vect_location,
4871 "***** Skipping vector mode %s, which would"
4872 " repeat the analysis for %s\n",
4873 GET_MODE_NAME (vector_modes[mode_i]),
4874 GET_MODE_NAME (autodetected_vector_mode));
4875 mode_i += 1;
4876 }
4877
4878 if (vectorized
4879 || mode_i == vector_modes.length ()
4880 || autodetected_vector_mode == VOIDmode
4881 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4882 vector sizes will fail do not bother iterating. */
4883 || fatal)
4884 return vectorized;
4885
4886 /* Try the next biggest vector size. */
4887 next_vector_mode = vector_modes[mode_i++];
4888 if (dump_enabled_p ())
4889 dump_printf_loc (MSG_NOTE, vect_location,
4890 "***** Re-trying analysis with vector mode %s\n",
4891 GET_MODE_NAME (next_vector_mode));
4892 }
4893 }
4894
4895
4896 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4897 true if anything in the basic-block was vectorized. */
4898
4899 static bool
4900 vect_slp_bbs (vec<basic_block> bbs)
4901 {
4902 vec<data_reference_p> datarefs = vNULL;
4903 auto_vec<int> dataref_groups;
4904 int insns = 0;
4905 int current_group = 0;
4906
4907 for (unsigned i = 0; i < bbs.length (); i++)
4908 {
4909 basic_block bb = bbs[i];
4910 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
4911 gsi_next (&gsi))
4912 {
4913 gimple *stmt = gsi_stmt (gsi);
4914 if (is_gimple_debug (stmt))
4915 continue;
4916
4917 insns++;
4918
4919 if (gimple_location (stmt) != UNKNOWN_LOCATION)
4920 vect_location = stmt;
4921
4922 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
4923 &dataref_groups, current_group))
4924 ++current_group;
4925 }
4926 }
4927
4928 return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
4929 }
4930
4931 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4932 true if anything in the basic-block was vectorized. */
4933
4934 bool
4935 vect_slp_bb (basic_block bb)
4936 {
4937 auto_vec<basic_block> bbs;
4938 bbs.safe_push (bb);
4939 return vect_slp_bbs (bbs);
4940 }
4941
4942 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4943 true if anything in the basic-block was vectorized. */
4944
4945 bool
4946 vect_slp_function (function *fun)
4947 {
4948 bool r = false;
4949 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
4950 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
4951
4952 /* For the moment split the function into pieces to avoid making
4953 the iteration on the vector mode moot. Split at points we know
4954 to not handle well, which are CFG merges (SLP discovery doesn't
4955 handle non-loop-header PHIs) and loop exits. Since pattern
4956 recog requires reverse iteration to visit uses before defs,
4957 simply chop the RPO into pieces. */
4958 auto_vec<basic_block> bbs;
4959 for (unsigned i = 0; i < n; i++)
4960 {
4961 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
4962 bool split = false;
4963
4964 /* Split when a BB is not dominated by the first block. */
4965 if (!bbs.is_empty ()
4966 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
4967 {
4968 if (dump_enabled_p ())
4969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4970 "splitting region at dominance boundary bb%d\n",
4971 bb->index);
4972 split = true;
4973 }
4974 /* Split when the loop determined by the first block
4975 is exited. This is because we eventually insert
4976 invariants at the beginning of the region. */
4977 else if (!bbs.is_empty ()
4978 && bbs[0]->loop_father != bb->loop_father
4979 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
4980 {
4981 if (dump_enabled_p ())
4982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4983 "splitting region at loop %d exit at bb%d\n",
4984 bbs[0]->loop_father->num, bb->index);
4985 split = true;
4986 }
4987
4988 if (split && !bbs.is_empty ())
4989 {
4990 r |= vect_slp_bbs (bbs);
4991 bbs.truncate (0);
4992 bbs.quick_push (bb);
4993 }
4994 else
4995 bbs.safe_push (bb);
4996
4997 /* When a stmt ending this block defines a value, inserting a vector
4998 containing its definition after it would require inserting on edges.
4999 Avoid this for now. */
5000 if (gimple *last = last_stmt (bb))
5001 if (gimple_get_lhs (last)
5002 && is_ctrl_altering_stmt (last))
5003 {
5004 if (dump_enabled_p ())
5005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5006 "splitting region at control altering "
5007 "definition %G", last);
5008 r |= vect_slp_bbs (bbs);
5009 bbs.truncate (0);
5010 }
5011 }
5012
5013 if (!bbs.is_empty ())
5014 r |= vect_slp_bbs (bbs);
5015
5016 free (rpo);
5017
5018 return r;
5019 }
5020
5021 /* Build a variable-length vector in which the elements in ELTS are repeated
5022 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
5023 RESULTS and add any new instructions to SEQ.
5024
5025 The approach we use is:
5026
5027 (1) Find a vector mode VM with integer elements of mode IM.
5028
5029 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5030 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
5031 from small vectors to IM.
5032
5033 (3) Duplicate each ELTS'[I] into a vector of mode VM.
5034
5035 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
5036 correct byte contents.
5037
5038 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
5039
5040 We try to find the largest IM for which this sequence works, in order
5041 to cut down on the number of interleaves. */
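/* Illustrative walk-through (the actual mode choice depends on the
   target): for ELTS = {a, b, c, d} of 32-bit ints and a variable-length
   VECTOR_TYPE, step (1) might pick IM = DImode, giving NELTS' == 2.
   {a, b} and {c, d} are each built as a two-element vector,
   VIEW_CONVERT_EXPRed to DImode and duplicated across a DImode vector;
   a single interleaving VEC_PERM_EXPR then yields the byte pattern of
   {a, b, c, d, a, b, c, d, ...}.  */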
5042
5043 void
5044 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
5045 vec<tree> elts, unsigned int nresults,
5046 vec<tree> &results)
5047 {
5048 unsigned int nelts = elts.length ();
5049 tree element_type = TREE_TYPE (vector_type);
5050
5051 /* (1) Find a vector mode VM with integer elements of mode IM. */
5052 unsigned int nvectors = 1;
5053 tree new_vector_type;
5054 tree permutes[2];
5055 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
5056 &nvectors, &new_vector_type,
5057 permutes))
5058 gcc_unreachable ();
5059
5060 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
5061 unsigned int partial_nelts = nelts / nvectors;
5062 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
5063
5064 tree_vector_builder partial_elts;
5065 auto_vec<tree, 32> pieces (nvectors * 2);
5066 pieces.quick_grow_cleared (nvectors * 2);
5067 for (unsigned int i = 0; i < nvectors; ++i)
5068 {
5069 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
5070 ELTS' has mode IM. */
5071 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
5072 for (unsigned int j = 0; j < partial_nelts; ++j)
5073 partial_elts.quick_push (elts[i * partial_nelts + j]);
5074 tree t = gimple_build_vector (seq, &partial_elts);
5075 t = gimple_build (seq, VIEW_CONVERT_EXPR,
5076 TREE_TYPE (new_vector_type), t);
5077
5078 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
5079 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
5080 }
5081
5082 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
5083 correct byte contents.
5084
5085 Conceptually, we need to repeat the following operation log2(nvectors)
5086 times, where hi_start = nvectors / 2:
5087
5088 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
5089 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
5090
5091 However, if each input repeats every N elements and the VF is
5092 a multiple of N * 2, the HI result is the same as the LO result.
5093 This will be true for the first N1 iterations of the outer loop,
5094 followed by N2 iterations for which both the LO and HI results
5095 are needed. I.e.:
5096
5097 N1 + N2 = log2(nvectors)
5098
5099 Each "N1 iteration" doubles the number of redundant vectors and the
5100 effect of the process as a whole is to have a sequence of nvectors/2**N1
5101 vectors that repeats 2**N1 times. Rather than generate these redundant
5102 vectors, we halve the number of vectors for each N1 iteration. */
5103 unsigned int in_start = 0;
5104 unsigned int out_start = nvectors;
5105 unsigned int new_nvectors = nvectors;
5106 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
5107 {
5108 unsigned int hi_start = new_nvectors / 2;
5109 unsigned int out_i = 0;
5110 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
5111 {
5112 if ((in_i & 1) != 0
5113 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
5114 2 * in_repeat))
5115 continue;
5116
5117 tree output = make_ssa_name (new_vector_type);
5118 tree input1 = pieces[in_start + (in_i / 2)];
5119 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
5120 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
5121 input1, input2,
5122 permutes[in_i & 1]);
5123 gimple_seq_add_stmt (seq, stmt);
5124 pieces[out_start + out_i] = output;
5125 out_i += 1;
5126 }
5127 std::swap (in_start, out_start);
5128 new_nvectors = out_i;
5129 }
5130
5131 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
5132 results.reserve (nresults);
5133 for (unsigned int i = 0; i < nresults; ++i)
5134 if (i < new_nvectors)
5135 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
5136 pieces[in_start + i]));
5137 else
5138 results.quick_push (results[i - new_nvectors]);
5139 }
5140
5141
5142 /* For constant and loop invariant defs in OP_NODE this function creates
5143 vector defs that will be used in the vectorized stmts and stores them
5144 to SLP_TREE_VEC_DEFS of OP_NODE. */
5145
5146 static void
5147 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
5148 {
5149 unsigned HOST_WIDE_INT nunits;
5150 tree vec_cst;
5151 unsigned j, number_of_places_left_in_vector;
5152 tree vector_type;
5153 tree vop;
5154 int group_size = op_node->ops.length ();
5155 unsigned int vec_num, i;
5156 unsigned number_of_copies = 1;
5157 bool constant_p;
5158 gimple_seq ctor_seq = NULL;
5159 auto_vec<tree, 16> permute_results;
5160
5161 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
5162 vector_type = SLP_TREE_VECTYPE (op_node);
5163
5164 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
5165 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
5166 auto_vec<tree> voprnds (number_of_vectors);
5167
5168 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5169 created vectors. It is greater than 1 if unrolling is performed.
5170
5171 For example, we have two scalar operands, s1 and s2 (e.g., group of
5172 strided accesses of size two), while NUNITS is four (i.e., four scalars
5173 of this type can be packed in a vector). The output vector will contain
5174 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5175 will be 2).
5176
5177 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5178 containing the operands.
5179
5180 For example, NUNITS is four as before, and the group size is 8
5181 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5182 {s5, s6, s7, s8}. */
5183
5184 /* When using duplicate_and_interleave, we just need one element for
5185 each scalar statement. */
5186 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5187 nunits = group_size;
5188
5189 number_of_copies = nunits * number_of_vectors / group_size;
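 /* Illustrative numbers only: with GROUP_SIZE == 2, NUNITS == 4 and
 NUMBER_OF_VECTORS == 1 this gives NUMBER_OF_COPIES == 4 * 1 / 2 == 2,
 matching the {s1, s2, s1, s2} example above; with GROUP_SIZE == 8,
 NUNITS == 4 and NUMBER_OF_VECTORS == 2 it gives 1, i.e. the two
 vectors {s1, s2, s3, s4} and {s5, s6, s7, s8}.  */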
5190
5191 number_of_places_left_in_vector = nunits;
5192 constant_p = true;
5193 tree_vector_builder elts (vector_type, nunits, 1);
5194 elts.quick_grow (nunits);
5195 stmt_vec_info insert_after = NULL;
5196 for (j = 0; j < number_of_copies; j++)
5197 {
5198 tree op;
5199 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
5200 {
5201 /* Create 'vect_ = {op0,op1,...,opn}'. */
5202 number_of_places_left_in_vector--;
5203 tree orig_op = op;
5204 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
5205 {
5206 if (CONSTANT_CLASS_P (op))
5207 {
5208 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5209 {
5210 /* Can't use VIEW_CONVERT_EXPR for booleans because
5211 of possibly different sizes of scalar value and
5212 vector element. */
5213 if (integer_zerop (op))
5214 op = build_int_cst (TREE_TYPE (vector_type), 0);
5215 else if (integer_onep (op))
5216 op = build_all_ones_cst (TREE_TYPE (vector_type));
5217 else
5218 gcc_unreachable ();
5219 }
5220 else
5221 op = fold_unary (VIEW_CONVERT_EXPR,
5222 TREE_TYPE (vector_type), op);
5223 gcc_assert (op && CONSTANT_CLASS_P (op));
5224 }
5225 else
5226 {
5227 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
5228 gimple *init_stmt;
5229 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
5230 {
5231 tree true_val
5232 = build_all_ones_cst (TREE_TYPE (vector_type));
5233 tree false_val
5234 = build_zero_cst (TREE_TYPE (vector_type));
5235 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
5236 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
5237 op, true_val,
5238 false_val);
5239 }
5240 else
5241 {
5242 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
5243 op);
5244 init_stmt
5245 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
5246 op);
5247 }
5248 gimple_seq_add_stmt (&ctor_seq, init_stmt);
5249 op = new_temp;
5250 }
5251 }
5252 elts[number_of_places_left_in_vector] = op;
5253 if (!CONSTANT_CLASS_P (op))
5254 constant_p = false;
5255 /* For BB vectorization we have to compute an insert location
5256 when a def is inside the analyzed region since we cannot
5257 simply insert at the BB start in this case. */
5258 stmt_vec_info opdef;
5259 if (TREE_CODE (orig_op) == SSA_NAME
5260 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
5261 && is_a <bb_vec_info> (vinfo)
5262 && (opdef = vinfo->lookup_def (orig_op)))
5263 {
5264 if (!insert_after)
5265 insert_after = opdef;
5266 else
5267 insert_after = get_later_stmt (insert_after, opdef);
5268 }
5269
5270 if (number_of_places_left_in_vector == 0)
5271 {
5272 if (constant_p
5273 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5274 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5275 vec_cst = gimple_build_vector (&ctor_seq, &elts);
5276 else
5277 {
5278 if (permute_results.is_empty ())
5279 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
5280 elts, number_of_vectors,
5281 permute_results);
5282 vec_cst = permute_results[number_of_vectors - j - 1];
5283 }
5284 if (!gimple_seq_empty_p (ctor_seq))
5285 {
5286 if (insert_after)
5287 {
5288 gimple_stmt_iterator gsi;
5289 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
5290 {
5291 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
5292 gsi_insert_seq_before (&gsi, ctor_seq,
5293 GSI_CONTINUE_LINKING);
5294 }
5295 else if (!stmt_ends_bb_p (insert_after->stmt))
5296 {
5297 gsi = gsi_for_stmt (insert_after->stmt);
5298 gsi_insert_seq_after (&gsi, ctor_seq,
5299 GSI_CONTINUE_LINKING);
5300 }
5301 else
5302 {
5303 /* When we want to insert after a def whose
5304 defining stmt throws, insert on the fallthru
5305 edge instead. */
5306 edge e = find_fallthru_edge
5307 (gimple_bb (insert_after->stmt)->succs);
5308 basic_block new_bb
5309 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
5310 gcc_assert (!new_bb);
5311 }
5312 }
5313 else
5314 vinfo->insert_seq_on_entry (NULL, ctor_seq);
5315 ctor_seq = NULL;
5316 }
5317 voprnds.quick_push (vec_cst);
5318 insert_after = NULL;
5319 number_of_places_left_in_vector = nunits;
5320 constant_p = true;
5321 elts.new_vector (vector_type, nunits, 1);
5322 elts.quick_grow (nunits);
5323 }
5324 }
5325 }
5326
5327 /* Since the vectors are created in reverse order, push them in reverse
5328 so that the original order is restored. */
5329 vec_num = voprnds.length ();
5330 for (j = vec_num; j != 0; j--)
5331 {
5332 vop = voprnds[j - 1];
5333 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5334 }
5335
5336 /* If VF is greater than the unrolling factor needed for the SLP
5337 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
5338 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5339 to replicate the vectors. */
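 /* For example (hypothetical names): if only the defs V0 and V1 were
 created but NUMBER_OF_VECTORS is 4, the loop below appends them again
 so that SLP_TREE_VEC_DEFS becomes {V0, V1, V0, V1}.  */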
5340 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
5341 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
5342 i++)
5343 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
5344 }
5345
5346 /* Get the Ith vectorized definition from SLP_NODE. */
5347
5348 tree
5349 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
5350 {
5351 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
5352 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
5353 else
5354 return SLP_TREE_VEC_DEFS (slp_node)[i];
5355 }
5356
5357 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5358
5359 void
5360 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
5361 {
5362 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
5363 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
5364 {
5365 unsigned j;
5366 gimple *vec_def_stmt;
5367 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
5368 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
5369 }
5370 else
5371 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
5372 }
5373
5374 /* Get vectorized definitions for the first N children of SLP_NODE. */
5375
5376 void
5377 vect_get_slp_defs (vec_info *,
5378 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
5379 {
5380 if (n == -1U)
5381 n = SLP_TREE_CHILDREN (slp_node).length ();
5382
5383 for (unsigned i = 0; i < n; ++i)
5384 {
5385 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
5386 vec<tree> vec_defs = vNULL;
5387 vect_get_slp_defs (child, &vec_defs);
5388 vec_oprnds->quick_push (vec_defs);
5389 }
5390 }
5391
5392 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5393 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5394 permute statements for the SLP node NODE. Store the number of vector
5395 permute instructions in *N_PERMS and the number of vector load
5396 instructions in *N_LOADS. */
5397
5398 bool
5399 vect_transform_slp_perm_load (vec_info *vinfo,
5400 slp_tree node, vec<tree> dr_chain,
5401 gimple_stmt_iterator *gsi, poly_uint64 vf,
5402 bool analyze_only, unsigned *n_perms,
5403 unsigned int *n_loads)
5404 {
5405 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5406 int vec_index = 0;
5407 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5408 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
5409 unsigned int mask_element;
5410 machine_mode mode;
5411
5412 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
5413 return false;
5414
5415 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
5416
5417 mode = TYPE_MODE (vectype);
5418 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5419
5420 /* Initialize the vect stmts of NODE to properly insert the generated
5421 stmts later. */
5422 if (! analyze_only)
5423 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
5424 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
5425 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
5426
5427 /* Generate permutation masks for every NODE. The number of masks for
5428 each NODE is equal to GROUP_SIZE.
5429 E.g., we have a group of three nodes with three loads from the same
5430 location in each node, and the vector size is 4. I.e., we have an
5431 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5432 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5433 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5434 ...
5435
5436 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5437 The last mask is illegal since a permute operation takes only two input
5438 vectors, so mask element values must be smaller than twice the vector
5439 size (here 8). Hence, the last mask must be converted into {2,5,5,5}.
5440 For the first two permutations we need the first and the second input
5441 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5442 we need the second and the third vectors: {b1,c1,a2,b2} and
5443 {c2,a3,b3,c3}. */
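 /* With illustrative SSA names, the code generated for the a's above
 would be
 vect_a_1 = VEC_PERM_EXPR <v1, v2, {0,0,0,3}>;
 vect_a_2 = VEC_PERM_EXPR <v1, v2, {3,3,6,6}>;
 vect_a_3 = VEC_PERM_EXPR <v2, v3, {2,5,5,5}>;
 where v1, v2 and v3 are the vectorized loads {a0,b0,c0,a1},
 {b1,c1,a2,b2} and {c2,a3,b3,c3}.  */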
5444
5445 int vect_stmts_counter = 0;
5446 unsigned int index = 0;
5447 int first_vec_index = -1;
5448 int second_vec_index = -1;
5449 bool noop_p = true;
5450 *n_perms = 0;
5451
5452 vec_perm_builder mask;
5453 unsigned int nelts_to_build;
5454 unsigned int nvectors_per_build;
5455 unsigned int in_nlanes;
5456 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
5457 && multiple_p (nunits, group_size));
5458 if (repeating_p)
5459 {
5460 /* A single vector contains a whole number of copies of the node, so:
5461 (a) all permutes can use the same mask; and
5462 (b) the permutes only need a single vector input. */
5463 mask.new_vector (nunits, group_size, 3);
5464 nelts_to_build = mask.encoded_nelts ();
5465 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
5466 in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
5467 }
5468 else
5469 {
5470 /* We need to construct a separate mask for each vector statement. */
5471 unsigned HOST_WIDE_INT const_nunits, const_vf;
5472 if (!nunits.is_constant (&const_nunits)
5473 || !vf.is_constant (&const_vf))
5474 return false;
5475 mask.new_vector (const_nunits, const_nunits, 1);
5476 nelts_to_build = const_vf * group_size;
5477 nvectors_per_build = 1;
5478 in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
5479 }
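 /* A made-up example of the !repeating_p case: with const_vf == 2,
 group_size == 4 and const_nunits == 4 we build nelts_to_build == 8
 mask elements, which form two 4-element masks and thus up to two
 vector permutes.  In the repeating_p case a single mask encoded with
 group_size patterns of 3 elements each is shared by all
 nvectors_per_build vector stmts.  */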
5480 auto_sbitmap used_in_lanes (in_nlanes);
5481 bitmap_clear (used_in_lanes);
5482
5483 unsigned int count = mask.encoded_nelts ();
5484 mask.quick_grow (count);
5485 vec_perm_indices indices;
5486
5487 for (unsigned int j = 0; j < nelts_to_build; j++)
5488 {
5489 unsigned int iter_num = j / group_size;
5490 unsigned int stmt_num = j % group_size;
5491 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
5492 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
5493 bitmap_set_bit (used_in_lanes, i);
5494 if (repeating_p)
5495 {
5496 first_vec_index = 0;
5497 mask_element = i;
5498 }
5499 else
5500 {
5501 /* Enforced before the loop when !repeating_p. */
5502 unsigned int const_nunits = nunits.to_constant ();
5503 vec_index = i / const_nunits;
5504 mask_element = i % const_nunits;
5505 if (vec_index == first_vec_index
5506 || first_vec_index == -1)
5507 {
5508 first_vec_index = vec_index;
5509 }
5510 else if (vec_index == second_vec_index
5511 || second_vec_index == -1)
5512 {
5513 second_vec_index = vec_index;
5514 mask_element += const_nunits;
5515 }
5516 else
5517 {
5518 if (dump_enabled_p ())
5519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5520 "permutation requires at "
5521 "least three vectors %G",
5522 stmt_info->stmt);
5523 gcc_assert (analyze_only);
5524 return false;
5525 }
5526
5527 gcc_assert (mask_element < 2 * const_nunits);
5528 }
5529
5530 if (mask_element != index)
5531 noop_p = false;
5532 mask[index++] = mask_element;
5533
5534 if (index == count && !noop_p)
5535 {
5536 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
5537 if (!can_vec_perm_const_p (mode, indices))
5538 {
5539 if (dump_enabled_p ())
5540 {
5541 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5542 vect_location,
5543 "unsupported vect permute { ");
5544 for (i = 0; i < count; ++i)
5545 {
5546 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5547 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5548 }
5549 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5550 }
5551 gcc_assert (analyze_only);
5552 return false;
5553 }
5554
5555 ++*n_perms;
5556 }
5557
5558 if (index == count)
5559 {
5560 if (!analyze_only)
5561 {
5562 tree mask_vec = NULL_TREE;
5563
5564 if (! noop_p)
5565 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5566
5567 if (second_vec_index == -1)
5568 second_vec_index = first_vec_index;
5569
5570 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
5571 {
5572 /* Generate the permute statement if necessary. */
5573 tree first_vec = dr_chain[first_vec_index + ri];
5574 tree second_vec = dr_chain[second_vec_index + ri];
5575 gimple *perm_stmt;
5576 if (! noop_p)
5577 {
5578 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5579 tree perm_dest
5580 = vect_create_destination_var (gimple_assign_lhs (stmt),
5581 vectype);
5582 perm_dest = make_ssa_name (perm_dest);
5583 perm_stmt
5584 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5585 first_vec, second_vec,
5586 mask_vec);
5587 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
5588 gsi);
5589 }
5590 else
5591 /* If mask was NULL_TREE generate the requested
5592 identity transform. */
5593 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
5594
5595 /* Store the vector statement in NODE. */
5596 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
5597 }
5598 }
5599
5600 index = 0;
5601 first_vec_index = -1;
5602 second_vec_index = -1;
5603 noop_p = true;
5604 }
5605 }
5606
5607 if (n_loads)
5608 {
5609 if (repeating_p)
5610 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
5611 else
5612 {
5613 /* Enforced above when !repeating_p. */
5614 unsigned int const_nunits = nunits.to_constant ();
5615 *n_loads = 0;
5616 bool load_seen = false;
5617 for (unsigned i = 0; i < in_nlanes; ++i)
5618 {
5619 if (i % const_nunits == 0)
5620 {
5621 if (load_seen)
5622 *n_loads += 1;
5623 load_seen = false;
5624 }
5625 if (bitmap_bit_p (used_in_lanes, i))
5626 load_seen = true;
5627 }
5628 if (load_seen)
5629 *n_loads += 1;
5630 }
5631 }
5632
5633 return true;
5634 }
5635
5636
5637 /* Vectorize the SLP permutations in NODE as specified
5638 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5639 child number and lane number.
5640 Interleaving of two two-lane two-child SLP subtrees (not supported):
5641 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5642 A blend of two four-lane two-child SLP subtrees:
5643 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5644 Highpart of a four-lane one-child SLP subtree (not supported):
5645 [ { 0, 2 }, { 0, 3 } ]
5646 Currently only a subset of these forms is supported by the code generation below. */
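/* As an illustrative instance of the blend above: with a vectorization
 factor of 1 and four-lane vector children, the lane permutation
 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ] becomes the single
 VEC_PERM_EXPR mask {0, 5, 2, 7}, lanes taken from the second child
 being offset by the number of vector elements (4).  */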
5647
5648 static bool
5649 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
5650 slp_tree node, stmt_vector_for_cost *cost_vec)
5651 {
5652 tree vectype = SLP_TREE_VECTYPE (node);
5653
5654 /* ??? We currently only support the case where all vector input and
5655 output types are the same, while the SLP IL should really do a
5656 concat + select and thus accept arbitrary mismatches. */
5657 slp_tree child;
5658 unsigned i;
5659 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5660 if (!vect_maybe_update_slp_op_vectype (child, vectype)
5661 || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
5662 {
5663 if (dump_enabled_p ())
5664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5665 "Unsupported lane permutation\n");
5666 return false;
5667 }
5668
5669 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
5670 gcc_assert (perm.length () == SLP_TREE_LANES (node));
5671 if (dump_enabled_p ())
5672 {
5673 dump_printf_loc (MSG_NOTE, vect_location,
5674 "vectorizing permutation");
5675 for (unsigned i = 0; i < perm.length (); ++i)
5676 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
5677 dump_printf (MSG_NOTE, "\n");
5678 }
5679
5680 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
5681 if (!nunits.is_constant ())
5682 return false;
5683 unsigned HOST_WIDE_INT vf = 1;
5684 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
5685 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
5686 return false;
5687 unsigned olanes = vf * SLP_TREE_LANES (node);
5688 gcc_assert (multiple_p (olanes, nunits));
5689
5690 /* Compute the { { SLP operand, vector index }, lane } permutation sequence
5691 from the { SLP operand, scalar lane } permutation as recorded in the
5692 SLP node as an intermediate step. This part should already work
5693 with SLP children that have an arbitrary number of lanes. */
5694 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
5695 auto_vec<unsigned> active_lane;
5696 vperm.create (olanes);
5697 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
5698 for (unsigned i = 0; i < vf; ++i)
5699 {
5700 for (unsigned pi = 0; pi < perm.length (); ++pi)
5701 {
5702 std::pair<unsigned, unsigned> p = perm[pi];
5703 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
5704 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
5705 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
5706 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
5707 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
5708 }
5709 /* Advance to the next group. */
5710 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
5711 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
5712 }
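 /* To make the mapping concrete (hypothetical numbers): with vf == 2 and
 a two-lane child 0 whose vector type holds four lanes, scalar lane 1
 of child 0 in the second unrolled group (active_lane[0] == 2) becomes
 { { 0, 0 }, 3 }, i.e. vector 0, lane 3 of that child, printed as
 vops0[0][3] in the dump below.  */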
5713
5714 if (dump_enabled_p ())
5715 {
5716 dump_printf_loc (MSG_NOTE, vect_location, "as");
5717 for (unsigned i = 0; i < vperm.length (); ++i)
5718 {
5719 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
5720 dump_printf (MSG_NOTE, ",");
5721 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
5722 vperm[i].first.first, vperm[i].first.second,
5723 vperm[i].second);
5724 }
5725 dump_printf (MSG_NOTE, "\n");
5726 }
5727
5728 /* We can only handle two-vector permutes; everything else should
5729 be lowered on the SLP level. The following is closely inspired
5730 by vect_transform_slp_perm_load and is supposed to eventually
5731 replace it.
5732 ??? As an intermediate step, do code-gen in the SLP tree
5733 representation somehow? */
5734 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
5735 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
5736 unsigned int const_nunits = nunits.to_constant ();
5737 unsigned int index = 0;
5738 unsigned int mask_element;
5739 vec_perm_builder mask;
5740 mask.new_vector (const_nunits, const_nunits, 1);
5741 unsigned int count = mask.encoded_nelts ();
5742 mask.quick_grow (count);
5743 vec_perm_indices indices;
5744 unsigned nperms = 0;
5745 for (unsigned i = 0; i < vperm.length (); ++i)
5746 {
5747 mask_element = vperm[i].second;
5748 if (first_vec.first == -1U
5749 || first_vec == vperm[i].first)
5750 first_vec = vperm[i].first;
5751 else if (second_vec.first == -1U
5752 || second_vec == vperm[i].first)
5753 {
5754 second_vec = vperm[i].first;
5755 mask_element += const_nunits;
5756 }
5757 else
5758 {
5759 if (dump_enabled_p ())
5760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5761 "permutation requires at "
5762 "least three vectors\n");
5763 gcc_assert (!gsi);
5764 return false;
5765 }
5766
5767 mask[index++] = mask_element;
5768
5769 if (index == count)
5770 {
5771 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
5772 const_nunits);
5773 bool identity_p = indices.series_p (0, 1, 0, 1);
5774 if (!identity_p
5775 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5776 {
5777 if (dump_enabled_p ())
5778 {
5779 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
5780 vect_location,
5781 "unsupported vect permute { ");
5782 for (i = 0; i < count; ++i)
5783 {
5784 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
5785 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
5786 }
5787 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
5788 }
5789 gcc_assert (!gsi);
5790 return false;
5791 }
5792
5793 if (!identity_p)
5794 nperms++;
5795 if (gsi)
5796 {
5797 if (second_vec.first == -1U)
5798 second_vec = first_vec;
5799
5800 /* Generate the permute statement if necessary. */
5801 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
5802 tree first_def
5803 = vect_get_slp_vect_def (first_node, first_vec.second);
5804 /* ??? We SLP match existing vector element extracts but
5805 allow punning which we need to re-instantiate at uses
5806 but have no good way of explicitly representing. */
5807 if (!types_compatible_p (TREE_TYPE (first_def), vectype))
5808 {
5809 gassign *conv_stmt;
5810 conv_stmt = gimple_build_assign (make_ssa_name (vectype),
5811 build1 (VIEW_CONVERT_EXPR,
5812 vectype, first_def));
5813 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5814 first_def = gimple_assign_lhs (conv_stmt);
5815 }
5816 gassign *perm_stmt;
5817 tree perm_dest = make_ssa_name (vectype);
5818 if (!identity_p)
5819 {
5820 slp_tree second_node
5821 = SLP_TREE_CHILDREN (node)[second_vec.first];
5822 tree second_def
5823 = vect_get_slp_vect_def (second_node, second_vec.second);
5824 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
5825 {
5826 gassign *conv_stmt;
5827 conv_stmt = gimple_build_assign (make_ssa_name (vectype),
5828 build1
5829 (VIEW_CONVERT_EXPR,
5830 vectype, second_def));
5831 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
5832 second_def = gimple_assign_lhs (conv_stmt);
5833 }
5834 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
5835 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
5836 first_def, second_def,
5837 mask_vec);
5838 }
5839 else
5840 /* We need a copy here in case the def was external. */
5841 perm_stmt = gimple_build_assign (perm_dest, first_def);
5842 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
5843 /* Store the vector statement in NODE. */
5844 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
5845 }
5846
5847 index = 0;
5848 first_vec = std::make_pair (-1U, -1U);
5849 second_vec = std::make_pair (-1U, -1U);
5850 }
5851 }
5852
5853 if (!gsi)
5854 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
5855
5856 return true;
5857 }
5858
5859 /* Vectorize SLP NODE. */
5860
5861 static void
5862 vect_schedule_slp_node (vec_info *vinfo,
5863 slp_tree node, slp_instance instance)
5864 {
5865 gimple_stmt_iterator si;
5866 int i;
5867 slp_tree child;
5868
5869 /* For existing vectors there's nothing to do. */
5870 if (SLP_TREE_VEC_DEFS (node).exists ())
5871 return;
5872
5873 gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
5874
5875 /* Vectorize externals and constants. */
5876 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5877 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
5878 {
5879 /* ??? vectorizable_shift can end up using a scalar operand which is
5880 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5881 node in this case. */
5882 if (!SLP_TREE_VECTYPE (node))
5883 return;
5884
5885 vect_create_constant_vectors (vinfo, node);
5886 return;
5887 }
5888
5889 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
5890
5891 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
5892 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
5893
5894 if (dump_enabled_p ())
5895 dump_printf_loc (MSG_NOTE, vect_location,
5896 "------>vectorizing SLP node starting from: %G",
5897 stmt_info->stmt);
5898
5899 if (STMT_VINFO_DATA_REF (stmt_info)
5900 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5901 {
5902 /* Vectorized loads go before the first scalar load to make it
5903 ready early; vectorized stores go before the last scalar
5904 stmt, which is where all uses are ready. */
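 /* E.g. (illustrative): for a two-element store group the vector
 store is emitted right before the second (last) scalar store,
 while for a load group the vector load is emitted right before
 the first scalar load of the group.  */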
5905 stmt_vec_info last_stmt_info = NULL;
5906 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
5907 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
5908 else /* DR_IS_WRITE */
5909 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
5910 si = gsi_for_stmt (last_stmt_info->stmt);
5911 }
5912 else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
5913 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
5914 || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
5915 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
5916 {
5917 /* For PHI node vectorization we do not use the insertion iterator. */
5918 si = gsi_none ();
5919 }
5920 else
5921 {
5922 /* Emit other stmts after the children's vectorized defs, which is
5923 the earliest position possible. */
5924 gimple *last_stmt = NULL;
5925 bool seen_vector_def = false;
5926 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5927 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5928 {
5929 /* For fold-left reductions we are retaining the scalar
5930 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5931 set so the representation isn't perfect. Resort to the
5932 last scalar def here. */
5933 if (SLP_TREE_VEC_STMTS (child).is_empty ())
5934 {
5935 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
5936 == cycle_phi_info_type);
5937 gphi *phi = as_a <gphi *>
5938 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
5939 if (!last_stmt
5940 || vect_stmt_dominates_stmt_p (last_stmt, phi))
5941 last_stmt = phi;
5942 }
5943 /* We are emitting all vectorized stmts in the same place, so the
5944 last one emitted is the last one overall.
5945 ??? Unless we have a load permutation applied and that
5946 happens to re-use an earlier generated load. */
5947 unsigned j;
5948 gimple *vstmt;
5949 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
5950 if (!last_stmt
5951 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5952 last_stmt = vstmt;
5953 }
5954 else if (!SLP_TREE_VECTYPE (child))
5955 {
5956 /* For externals used unvectorized, look at all their scalar defs. */
5957 unsigned j;
5958 tree def;
5959 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
5960 if (TREE_CODE (def) == SSA_NAME
5961 && !SSA_NAME_IS_DEFAULT_DEF (def))
5962 {
5963 gimple *stmt = SSA_NAME_DEF_STMT (def);
5964 if (!last_stmt
5965 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
5966 last_stmt = stmt;
5967 }
5968 }
5969 else
5970 {
5971 /* For externals we have to look at all defs since their
5972 insertion place is decided per vector. But beware
5973 of pre-existing vectors where we need to make sure
5974 we do not insert before the region boundary. */
5975 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
5976 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
5977 seen_vector_def = true;
5978 else
5979 {
5980 unsigned j;
5981 tree vdef;
5982 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
5983 if (TREE_CODE (vdef) == SSA_NAME
5984 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
5985 {
5986 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
5987 if (!last_stmt
5988 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
5989 last_stmt = vstmt;
5990 }
5991 }
5992 }
5993 /* This can happen when all children are pre-existing vectors or
5994 constants. */
5995 if (!last_stmt)
5996 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
5997 if (!last_stmt)
5998 {
5999 gcc_assert (seen_vector_def);
6000 si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
6001 }
6002 else if (is_a <gphi *> (last_stmt))
6003 si = gsi_after_labels (gimple_bb (last_stmt));
6004 else
6005 {
6006 si = gsi_for_stmt (last_stmt);
6007 gsi_next (&si);
6008 }
6009 }
6010
6011 bool done_p = false;
6012
6013 /* Handle purely internal nodes. */
6014 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6015 {
6016 /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
6017 be shared with different SLP nodes (but usually it's the same
6018 operation apart from the case where the stmt is only there to denote
6019 the actual scalar lane defs ...). So do not call vect_transform_stmt
6020 but open-code it here (partly). */
6021 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
6022 gcc_assert (done);
6023 done_p = true;
6024 }
6025 if (!done_p)
6026 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
6027 }
6028
6029 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
6030 For loop vectorization this is done in vectorizable_call, but for SLP
6031 it needs to be deferred until the end of vect_schedule_slp, because multiple
6032 SLP instances may refer to the same scalar stmt. */
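/* With illustrative names: a scalar call 'x_1 = foo (a_2);' whose result
 uses have all been vectorized is replaced by the assignment 'x_1 = 0;'
 (a zero constant of the type of the lhs), letting the call itself
 disappear.  */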
6033
6034 static void
6035 vect_remove_slp_scalar_calls (vec_info *vinfo,
6036 slp_tree node, hash_set<slp_tree> &visited)
6037 {
6038 gimple *new_stmt;
6039 gimple_stmt_iterator gsi;
6040 int i;
6041 slp_tree child;
6042 tree lhs;
6043 stmt_vec_info stmt_info;
6044
6045 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6046 return;
6047
6048 if (visited.add (node))
6049 return;
6050
6051 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6052 vect_remove_slp_scalar_calls (vinfo, child, visited);
6053
6054 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6055 {
6056 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
6057 if (!stmt || gimple_bb (stmt) == NULL)
6058 continue;
6059 if (is_pattern_stmt_p (stmt_info)
6060 || !PURE_SLP_STMT (stmt_info))
6061 continue;
6062 lhs = gimple_call_lhs (stmt);
6063 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
6064 gsi = gsi_for_stmt (stmt);
6065 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
6066 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
6067 }
6068 }
6069
6070 static void
6071 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
6072 {
6073 hash_set<slp_tree> visited;
6074 vect_remove_slp_scalar_calls (vinfo, node, visited);
6075 }
6076
6077 /* Vectorize the instance root. */
6078
6079 void
6080 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
6081 {
6082 gassign *rstmt = NULL;
6083
6084 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
6085 {
6086 gimple *child_stmt;
6087 int j;
6088
6089 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6090 {
6091 tree vect_lhs = gimple_get_lhs (child_stmt);
6092 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
6093 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
6094 TREE_TYPE (vect_lhs)))
6095 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
6096 vect_lhs);
6097 rstmt = gimple_build_assign (root_lhs, vect_lhs);
6098 break;
6099 }
6100 }
6101 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
6102 {
6103 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6104 gimple *child_stmt;
6105 int j;
6106 vec<constructor_elt, va_gc> *v;
6107 vec_alloc (v, nelts);
6108
6109 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
6110 {
6111 CONSTRUCTOR_APPEND_ELT (v,
6112 NULL_TREE,
6113 gimple_get_lhs (child_stmt));
6114 }
6115 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
6116 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
6117 tree r_constructor = build_constructor (rtype, v);
6118 rstmt = gimple_build_assign (lhs, r_constructor);
6119 }
6120
6121 gcc_assert (rstmt);
6122
6123 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
6124 gsi_replace (&rgsi, rstmt, true);
6125 }
6126
6127 struct slp_scc_info
6128 {
6129 bool on_stack;
6130 int dfs;
6131 int lowlink;
6132 };
6133
6134 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
6135
6136 static void
6137 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
6138 hash_map<slp_tree, slp_scc_info> &scc_info,
6139 int &maxdfs, vec<slp_tree> &stack)
6140 {
6141 bool existed_p;
6142 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
6143 gcc_assert (!existed_p);
6144 info->dfs = maxdfs;
6145 info->lowlink = maxdfs;
6146 maxdfs++;
6147
6148 /* Leaf. */
6149 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
6150 {
6151 info->on_stack = false;
6152 vect_schedule_slp_node (vinfo, node, instance);
6153 return;
6154 }
6155
6156 info->on_stack = true;
6157 stack.safe_push (node);
6158
6159 unsigned i;
6160 slp_tree child;
6161 /* DFS recurse. */
6162 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6163 {
6164 if (!child)
6165 continue;
6166 slp_scc_info *child_info = scc_info.get (child);
6167 if (!child_info)
6168 {
6169 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
6170 /* Recursion might have re-allocated the scc_info hash map, so re-fetch the entries. */
6171 info = scc_info.get (node);
6172 child_info = scc_info.get (child);
6173 info->lowlink = MIN (info->lowlink, child_info->lowlink);
6174 }
6175 else if (child_info->on_stack)
6176 info->lowlink = MIN (info->lowlink, child_info->dfs);
6177 }
6178 if (info->lowlink != info->dfs)
6179 return;
6180
6181 auto_vec<slp_tree, 4> phis_to_fixup;
6182
6183 /* Singleton. */
6184 if (stack.last () == node)
6185 {
6186 stack.pop ();
6187 info->on_stack = false;
6188 vect_schedule_slp_node (vinfo, node, instance);
6189 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6190 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
6191 phis_to_fixup.quick_push (node);
6192 }
6193 else
6194 {
6195 /* SCC. */
6196 int last_idx = stack.length () - 1;
6197 while (stack[last_idx] != node)
6198 last_idx--;
6199 /* We can break the cycle at PHIs that have at least one child
6200 already code generated. Then we could re-start the DFS walk until
6201 all nodes in the SCC are covered (we might have new entries
6202 for only back-reachable nodes). But it's simpler to just
6203 iterate and schedule those that are ready. */
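 /* E.g. (hypothetical): for a reduction SCC { PHI, add } the PHI is
 ready at once because its initial-value child is already scheduled
 and off the stack; once the PHI is emitted the add's children are
 all off the stack too, so it is scheduled by a later sweep of the
 loop below.  */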
6204 unsigned todo = stack.length () - last_idx;
6205 do
6206 {
6207 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
6208 {
6209 slp_tree entry = stack[idx];
6210 if (!entry)
6211 continue;
6212 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
6213 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
6214 bool ready = !phi;
6215 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
6216 if (!child)
6217 {
6218 gcc_assert (phi);
6219 ready = true;
6220 break;
6221 }
6222 else if (scc_info.get (child)->on_stack)
6223 {
6224 if (!phi)
6225 {
6226 ready = false;
6227 break;
6228 }
6229 }
6230 else
6231 {
6232 if (phi)
6233 {
6234 ready = true;
6235 break;
6236 }
6237 }
6238 if (ready)
6239 {
6240 vect_schedule_slp_node (vinfo, entry, instance);
6241 scc_info.get (entry)->on_stack = false;
6242 stack[idx] = NULL;
6243 todo--;
6244 if (phi)
6245 phis_to_fixup.safe_push (entry);
6246 }
6247 }
6248 }
6249 while (todo != 0);
6250
6251 /* Pop the SCC. */
6252 stack.truncate (last_idx);
6253 }
6254
6255 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6256 slp_tree phi_node;
6257 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
6258 {
6259 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
6260 edge_iterator ei;
6261 edge e;
6262 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
6263 {
6264 unsigned dest_idx = e->dest_idx;
6265 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
6266 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
6267 continue;
6268 /* Simply fill all args. */
6269 for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
6270 add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
6271 vect_get_slp_vect_def (child, i),
6272 e, gimple_phi_arg_location (phi, dest_idx));
6273 }
6274 }
6275 }
6276
6277 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
6278
6279 void
6280 vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
6281 {
6282 slp_instance instance;
6283 unsigned int i;
6284
6285 hash_map<slp_tree, slp_scc_info> scc_info;
6286 int maxdfs = 0;
6287 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6288 {
6289 slp_tree node = SLP_INSTANCE_TREE (instance);
6290 if (dump_enabled_p ())
6291 {
6292 dump_printf_loc (MSG_NOTE, vect_location,
6293 "Vectorizing SLP tree:\n");
6294 if (SLP_INSTANCE_ROOT_STMT (instance))
6295 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
6296 SLP_INSTANCE_ROOT_STMT (instance)->stmt);
6297 vect_print_slp_graph (MSG_NOTE, vect_location,
6298 SLP_INSTANCE_TREE (instance));
6299 }
6300 /* Schedule the tree of INSTANCE, scheduling SCCs so that a PHI is
6301 the node breaking the cycle. */
6302 auto_vec<slp_tree> stack;
6303 if (!scc_info.get (node))
6304 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
6305
6306 if (SLP_INSTANCE_ROOT_STMT (instance))
6307 vectorize_slp_instance_root_stmt (node, instance);
6308
6309 if (dump_enabled_p ())
6310 dump_printf_loc (MSG_NOTE, vect_location,
6311 "vectorizing stmts using SLP.\n");
6312 }
6313
6314 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6315 {
6316 slp_tree root = SLP_INSTANCE_TREE (instance);
6317 stmt_vec_info store_info;
6318 unsigned int j;
6319
6320 /* Remove scalar call stmts. Do not do this for basic-block
6321 vectorization as not all uses may be vectorized.
6322 ??? Why should this be necessary? DCE should be able to
6323 remove the stmts itself.
6324 ??? For BB vectorization we can as well remove scalar
6325 stmts starting from the SLP tree root if they have no
6326 uses. */
6327 if (is_a <loop_vec_info> (vinfo))
6328 vect_remove_slp_scalar_calls (vinfo, root);
6329
6330 /* Remove the vectorized stores' original scalar stmts. */
6331 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
6332 {
6333 if (!STMT_VINFO_DATA_REF (store_info)
6334 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
6335 break;
6336
6337 store_info = vect_orig_stmt (store_info);
6338 /* Free the attached stmt_vec_info and remove the stmt. */
6339 vinfo->remove_stmt (store_info);
6340 }
6341 }
6342 }