1 /* Copyright (C) 2005-2021 Free Software Foundation, Inc.
2 Contributed by Richard Henderson <rth@redhat.com>.
3
4 This file is part of the GNU Offloading and Multi Processing Library
5 (libgomp).
6
7 Libgomp is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 /* This file handles the maintenance of threads in response to team
27 creation and termination. */
28
29 #include "libgomp.h"
30 #include "pool.h"
31 #include <stdlib.h>
32 #include <string.h>
33
34 #ifdef LIBGOMP_USE_PTHREADS
35 pthread_attr_t gomp_thread_attr;
36
37 /* This key is for the thread destructor. */
38 pthread_key_t gomp_thread_destructor;
39
40
41 /* This is the libgomp per-thread data structure. */
42 #if defined HAVE_TLS || defined USE_EMUTLS
43 __thread struct gomp_thread gomp_tls_data;
44 #else
45 pthread_key_t gomp_tls_key;
46 #endif
47
48
49 /* This structure is used to communicate across pthread_create. */
50
51 struct gomp_thread_start_data
52 {
53 void (*fn) (void *);
54 void *fn_data;
55 struct gomp_team_state ts;
56 struct gomp_task *task;
57 struct gomp_thread_pool *thread_pool;
58 unsigned int place;
59 bool nested;
60 pthread_t handle;
61 };
62
63
64 /* This function is a pthread_create entry point. This contains the idle
65 loop in which a thread waits to be called up to become part of a team. */
66
67 static void *
68 gomp_thread_start (void *xdata)
69 {
70 struct gomp_thread_start_data *data = xdata;
71 struct gomp_thread *thr;
72 struct gomp_thread_pool *pool;
73 void (*local_fn) (void *);
74 void *local_data;
75
76 #if defined HAVE_TLS || defined USE_EMUTLS
77 thr = &gomp_tls_data;
78 #else
79 struct gomp_thread local_thr;
80 thr = &local_thr;
81 pthread_setspecific (gomp_tls_key, thr);
82 #endif
83 gomp_sem_init (&thr->release, 0);
84
85 /* Extract what we need from data. */
86 local_fn = data->fn;
87 local_data = data->fn_data;
88 thr->thread_pool = data->thread_pool;
89 thr->ts = data->ts;
90 thr->task = data->task;
91 thr->place = data->place;
92 #ifdef GOMP_NEEDS_THREAD_HANDLE
93 thr->handle = data->handle;
94 #endif
95
96 thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
97
98 /* Make thread pool local. */
99 pool = thr->thread_pool;
100
101 if (data->nested)
102 {
103 struct gomp_team *team = thr->ts.team;
104 struct gomp_task *task = thr->task;
105
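/* A nested team always gets freshly created threads: sync with the
   team startup barrier, run the parallel region, take part in the
   team-end barriers, then fall through and let the thread exit. */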
106 gomp_barrier_wait (&team->barrier);
107
108 local_fn (local_data);
109 gomp_team_barrier_wait_final (&team->barrier);
110 gomp_finish_task (task);
111 gomp_barrier_wait_last (&team->barrier);
112 }
113 else
114 {
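/* Threads in the pool park on pool->threads_dock between parallel
   regions. Each time the dock barrier releases, the master thread has
   already stored the next region's fn/data into thr->fn and thr->data;
   a NULL fn means this thread is no longer needed and should exit. */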
115 pool->threads[thr->ts.team_id] = thr;
116
117 gomp_simple_barrier_wait (&pool->threads_dock);
118 do
119 {
120 struct gomp_team *team = thr->ts.team;
121 struct gomp_task *task = thr->task;
122
123 local_fn (local_data);
124 gomp_team_barrier_wait_final (&team->barrier);
125 gomp_finish_task (task);
126
127 gomp_simple_barrier_wait (&pool->threads_dock);
128
129 local_fn = thr->fn;
130 local_data = thr->data;
131 thr->fn = NULL;
132 }
133 while (local_fn);
134 }
135
136 gomp_sem_destroy (&thr->release);
137 pthread_detach (pthread_self ());
138 thr->thread_pool = NULL;
139 thr->task = NULL;
140 return NULL;
141 }
142 #endif
143
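/* Return the team cached in the current thread's pool, provided the
   thread is not inside a team and the cached team was created for
   exactly NTHREADS threads; otherwise return NULL so that a new team
   is allocated. */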
144 static inline struct gomp_team *
145 get_last_team (unsigned nthreads)
146 {
147 struct gomp_thread *thr = gomp_thread ();
148 if (thr->ts.team == NULL)
149 {
150 struct gomp_thread_pool *pool = gomp_get_thread_pool (thr, nthreads);
151 struct gomp_team *last_team = pool->last_team;
152 if (last_team != NULL && last_team->nthreads == nthreads)
153 {
154 pool->last_team = NULL;
155 return last_team;
156 }
157 }
158 return NULL;
159 }
160
161 /* Create a new team data structure. */
162
163 struct gomp_team *
164 gomp_new_team (unsigned nthreads)
165 {
166 struct gomp_team *team;
167 int i;
168
169 team = get_last_team (nthreads);
170 if (team == NULL)
171 {
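/* The team structure is followed in memory by NTHREADS implicit_task
   entries and then NTHREADS ordered_release pointers (set up below). */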
172 size_t extra = sizeof (team->ordered_release[0])
173 + sizeof (team->implicit_task[0]);
174 team = team_malloc (sizeof (*team) + nthreads * extra);
175
176 #ifndef HAVE_SYNC_BUILTINS
177 gomp_mutex_init (&team->work_share_list_free_lock);
178 #endif
179 gomp_barrier_init (&team->barrier, nthreads);
180 gomp_mutex_init (&team->task_lock);
181
182 team->nthreads = nthreads;
183 }
184
185 team->work_share_chunk = 8;
186 #ifdef HAVE_SYNC_BUILTINS
187 team->single_count = 0;
188 #endif
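/* work_shares[0] holds the team's first work share; the remaining
   embedded entries (1 through 7) are chained into the allocation list
   for reuse by subsequent work-sharing constructs. */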
189 team->work_shares_to_free = &team->work_shares[0];
190 gomp_init_work_share (&team->work_shares[0], 0, nthreads);
191 team->work_shares[0].next_alloc = NULL;
192 team->work_share_list_free = NULL;
193 team->work_share_list_alloc = &team->work_shares[1];
194 for (i = 1; i < 7; i++)
195 team->work_shares[i].next_free = &team->work_shares[i + 1];
196 team->work_shares[i].next_free = NULL;
197
198 gomp_sem_init (&team->master_release, 0);
199 team->ordered_release = (void *) &team->implicit_task[nthreads];
200 team->ordered_release[0] = &team->master_release;
201
202 priority_queue_init (&team->task_queue);
203 team->task_count = 0;
204 team->task_queued_count = 0;
205 team->task_running_count = 0;
206 team->work_share_cancelled = 0;
207 team->team_cancelled = 0;
208
209 priority_queue_init (&team->task_detach_queue);
210 team->task_detach_count = 0;
211
212 return team;
213 }
214
215
216 /* Free a team data structure. */
217
218 static void
219 free_team (struct gomp_team *team)
220 {
221 #ifndef HAVE_SYNC_BUILTINS
222 gomp_mutex_destroy (&team->work_share_list_free_lock);
223 #endif
224 gomp_barrier_destroy (&team->barrier);
225 gomp_mutex_destroy (&team->task_lock);
226 priority_queue_free (&team->task_queue);
227 priority_queue_free (&team->task_detach_queue);
228 team_free (team);
229 }
230
231 static void
232 gomp_free_pool_helper (void *thread_pool)
233 {
234 struct gomp_thread *thr = gomp_thread ();
235 struct gomp_thread_pool *pool
236 = (struct gomp_thread_pool *) thread_pool;
237 gomp_simple_barrier_wait_last (&pool->threads_dock);
238 gomp_sem_destroy (&thr->release);
239 thr->thread_pool = NULL;
240 thr->task = NULL;
241 #ifdef LIBGOMP_USE_PTHREADS
242 pthread_detach (pthread_self ());
243 pthread_exit (NULL);
244 #elif defined(__nvptx__)
245 asm ("exit;");
246 #elif defined(__AMDGCN__)
247 asm ("s_dcache_wb\n\t"
248 "s_endpgm");
249 #else
250 #error gomp_free_pool_helper must terminate the thread
251 #endif
252 }
253
254 /* Free a thread pool and release its threads. */
255
256 void
257 gomp_free_thread (void *arg __attribute__((unused)))
258 {
259 struct gomp_thread *thr = gomp_thread ();
260 struct gomp_thread_pool *pool = thr->thread_pool;
261 if (pool)
262 {
263 if (pool->threads_used > 0)
264 {
265 int i;
266 for (i = 1; i < pool->threads_used; i++)
267 {
268 struct gomp_thread *nthr = pool->threads[i];
269 nthr->fn = gomp_free_pool_helper;
270 nthr->data = pool;
271 }
272 /* This barrier undocks threads docked on pool->threads_dock. */
273 gomp_simple_barrier_wait (&pool->threads_dock);
274 /* And this waits until all threads have called gomp_simple_barrier_wait_last
275 in gomp_free_pool_helper. */
276 gomp_simple_barrier_wait (&pool->threads_dock);
277 /* Now it is safe to destroy the barrier and free the pool. */
278 gomp_simple_barrier_destroy (&pool->threads_dock);
279
280 #ifdef HAVE_SYNC_BUILTINS
281 __sync_fetch_and_add (&gomp_managed_threads,
282 1L - pool->threads_used);
283 #else
284 gomp_mutex_lock (&gomp_managed_threads_lock);
285 gomp_managed_threads -= pool->threads_used - 1L;
286 gomp_mutex_unlock (&gomp_managed_threads_lock);
287 #endif
288 }
289 if (pool->last_team)
290 free_team (pool->last_team);
291 #ifndef __nvptx__
292 team_free (pool->threads);
293 team_free (pool);
294 #endif
295 thr->thread_pool = NULL;
296 }
297 if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))
298 gomp_team_end ();
299 if (thr->task != NULL)
300 {
301 struct gomp_task *task = thr->task;
302 gomp_end_task ();
303 free (task);
304 }
305 }
306
307 /* Launch a team. */
308
309 #ifdef LIBGOMP_USE_PTHREADS
310 void
311 gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
312 unsigned flags, struct gomp_team *team,
313 struct gomp_taskgroup *taskgroup)
314 {
315 struct gomp_thread_start_data *start_data;
316 struct gomp_thread *thr, *nthr;
317 struct gomp_task *task;
318 struct gomp_task_icv *icv;
319 bool nested;
320 struct gomp_thread_pool *pool;
321 unsigned i, n, old_threads_used = 0;
322 pthread_attr_t thread_attr, *attr;
323 unsigned long nthreads_var;
324 char bind, bind_var;
325 unsigned int s = 0, rest = 0, p = 0, k = 0;
326 unsigned int affinity_count = 0;
327 struct gomp_thread **affinity_thr = NULL;
328 bool force_display = false;
329
330 thr = gomp_thread ();
331 nested = thr->ts.level;
332 pool = thr->thread_pool;
333 task = thr->task;
334 icv = task ? &task->icv : &gomp_global_icv;
335 if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
336 {
337 gomp_init_affinity ();
338 if (__builtin_expect (gomp_display_affinity_var, 0) && nthreads == 1)
339 gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
340 thr->place);
341 }
342
343 /* Always save the previous state, even if this isn't a nested team.
344 In particular, we should save any work share state from an outer
345 orphaned work share construct. */
346 team->prev_ts = thr->ts;
347
348 thr->ts.team = team;
349 thr->ts.team_id = 0;
350 ++thr->ts.level;
351 if (nthreads > 1)
352 ++thr->ts.active_level;
353 thr->ts.work_share = &team->work_shares[0];
354 thr->ts.last_work_share = NULL;
355 #ifdef HAVE_SYNC_BUILTINS
356 thr->ts.single_count = 0;
357 #endif
358 thr->ts.static_trip = 0;
359 thr->task = &team->implicit_task[0];
360 #ifdef GOMP_NEEDS_THREAD_HANDLE
361 thr->handle = pthread_self ();
362 #endif
363 nthreads_var = icv->nthreads_var;
364 if (__builtin_expect (gomp_nthreads_var_list != NULL, 0)
365 && thr->ts.level < gomp_nthreads_var_list_len)
366 nthreads_var = gomp_nthreads_var_list[thr->ts.level];
367 bind_var = icv->bind_var;
368 if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
369 bind_var = flags & 7;
370 bind = bind_var;
371 if (__builtin_expect (gomp_bind_var_list != NULL, 0)
372 && thr->ts.level < gomp_bind_var_list_len)
373 bind_var = gomp_bind_var_list[thr->ts.level];
374 gomp_init_task (thr->task, task, icv);
375 thr->task->taskgroup = taskgroup;
376 team->implicit_task[0].icv.nthreads_var = nthreads_var;
377 team->implicit_task[0].icv.bind_var = bind_var;
378
379 if (nthreads == 1)
380 return;
381
382 i = 1;
383
384 if (__builtin_expect (gomp_places_list != NULL, 0))
385 {
386 /* Depending on the chosen proc_bind model, set the subpartition
387 for the master thread and initialize helper variables
388 P and optionally S, K and/or REST used by later place
389 computation for each additional thread. */
390 p = thr->place - 1;
391 switch (bind)
392 {
393 case omp_proc_bind_true:
394 case omp_proc_bind_close:
395 if (nthreads > thr->ts.place_partition_len)
396 {
397 /* T > P. S threads will be placed in each place,
398 and the final REST threads placed one by one
399 into the already occupied places. */
400 s = nthreads / thr->ts.place_partition_len;
401 rest = nthreads % thr->ts.place_partition_len;
402 }
403 else
404 s = 1;
405 k = 1;
406 break;
407 case omp_proc_bind_master:
408 /* Each thread will be bound to master's place. */
409 break;
410 case omp_proc_bind_spread:
411 if (nthreads <= thr->ts.place_partition_len)
412 {
413 /* T <= P. Each subpartition will have in between s
414 and s+1 places (subpartitions starting at or
415 after rest will have s places, earlier s+1 places),
416 each thread will be bound to the first place in
417 its subpartition (except for the master thread
418 that can be bound to another place in its
419 subpartition). */
420 s = thr->ts.place_partition_len / nthreads;
421 rest = thr->ts.place_partition_len % nthreads;
422 rest = (s + 1) * rest + thr->ts.place_partition_off;
423 if (p < rest)
424 {
425 p -= (p - thr->ts.place_partition_off) % (s + 1);
426 thr->ts.place_partition_len = s + 1;
427 }
428 else
429 {
430 p -= (p - rest) % s;
431 thr->ts.place_partition_len = s;
432 }
433 thr->ts.place_partition_off = p;
434 }
435 else
436 {
437 /* T > P. Each subpartition will have just a single
438 place and we'll place between s and s+1
439 threads into each subpartition. */
440 s = nthreads / thr->ts.place_partition_len;
441 rest = nthreads % thr->ts.place_partition_len;
442 thr->ts.place_partition_off = p;
443 thr->ts.place_partition_len = 1;
444 k = 1;
445 }
446 break;
447 }
448 }
449 else
450 bind = omp_proc_bind_false;
451
452 /* We only allow the reuse of idle threads for non-nested PARALLEL
453 regions. This appears to be implied by the semantics of
454 threadprivate variables, but perhaps that's reading too much into
455 things. Certainly it does prevent any locking problems, since
456 only the initial program thread will modify gomp_threads. */
457 if (!nested)
458 {
459 old_threads_used = pool->threads_used;
460
461 if (nthreads <= old_threads_used)
462 n = nthreads;
463 else if (old_threads_used == 0)
464 {
465 n = 0;
466 gomp_simple_barrier_init (&pool->threads_dock, nthreads);
467 }
468 else
469 {
470 n = old_threads_used;
471
472 /* Increase the barrier threshold to make sure all new
473 threads arrive before the team is released. */
474 gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
475 }
476
477 /* Not true yet, but soon will be. We're going to release all
478 threads from the dock, and those that aren't part of the
479 team will exit. */
480 pool->threads_used = nthreads;
481
482 /* If necessary, expand the size of the gomp_threads array. It is
483 expected that changes in the number of threads are rare, thus we
484 make no effort to expand gomp_threads_size geometrically. */
485 if (nthreads >= pool->threads_size)
486 {
487 pool->threads_size = nthreads + 1;
488 pool->threads
489 = gomp_realloc (pool->threads,
490 pool->threads_size
491 * sizeof (struct gomp_thread *));
492 /* Add current (master) thread to threads[]. */
493 pool->threads[0] = thr;
494 }
495
496 /* Release existing idle threads. */
497 for (; i < n; ++i)
498 {
499 unsigned int place_partition_off = thr->ts.place_partition_off;
500 unsigned int place_partition_len = thr->ts.place_partition_len;
501 unsigned int place = 0;
502 if (__builtin_expect (gomp_places_list != NULL, 0))
503 {
504 switch (bind)
505 {
506 case omp_proc_bind_true:
507 case omp_proc_bind_close:
508 if (k == s)
509 {
510 ++p;
511 if (p == (team->prev_ts.place_partition_off
512 + team->prev_ts.place_partition_len))
513 p = team->prev_ts.place_partition_off;
514 k = 1;
515 if (i == nthreads - rest)
516 s = 1;
517 }
518 else
519 ++k;
520 break;
521 case omp_proc_bind_master:
522 break;
523 case omp_proc_bind_spread:
524 if (k == 0)
525 {
526 /* T <= P. */
527 if (p < rest)
528 p += s + 1;
529 else
530 p += s;
531 if (p == (team->prev_ts.place_partition_off
532 + team->prev_ts.place_partition_len))
533 p = team->prev_ts.place_partition_off;
534 place_partition_off = p;
535 if (p < rest)
536 place_partition_len = s + 1;
537 else
538 place_partition_len = s;
539 }
540 else
541 {
542 /* T > P. */
543 if (k == s)
544 {
545 ++p;
546 if (p == (team->prev_ts.place_partition_off
547 + team->prev_ts.place_partition_len))
548 p = team->prev_ts.place_partition_off;
549 k = 1;
550 if (i == nthreads - rest)
551 s = 1;
552 }
553 else
554 ++k;
555 place_partition_off = p;
556 place_partition_len = 1;
557 }
558 break;
559 }
560 if (affinity_thr != NULL
561 || (bind != omp_proc_bind_true
562 && pool->threads[i]->place != p + 1)
563 || pool->threads[i]->place <= place_partition_off
564 || pool->threads[i]->place > (place_partition_off
565 + place_partition_len))
566 {
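/* Either thread re-sorting has already started, or the pooled thread
   in this slot is not on the place we want. Build (once) an array
   indexed by place within the previous partition, holding the
   remaining pooled threads chained through their data fields, and
   pick a suitably placed thread from it instead. */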
567 unsigned int l;
568 force_display = true;
569 if (affinity_thr == NULL)
570 {
571 unsigned int j;
572
573 if (team->prev_ts.place_partition_len > 64)
574 affinity_thr
575 = gomp_malloc (team->prev_ts.place_partition_len
576 * sizeof (struct gomp_thread *));
577 else
578 affinity_thr
579 = gomp_alloca (team->prev_ts.place_partition_len
580 * sizeof (struct gomp_thread *));
581 memset (affinity_thr, '\0',
582 team->prev_ts.place_partition_len
583 * sizeof (struct gomp_thread *));
584 for (j = i; j < old_threads_used; j++)
585 {
586 if (pool->threads[j]->place
587 > team->prev_ts.place_partition_off
588 && (pool->threads[j]->place
589 <= (team->prev_ts.place_partition_off
590 + team->prev_ts.place_partition_len)))
591 {
592 l = pool->threads[j]->place - 1
593 - team->prev_ts.place_partition_off;
594 pool->threads[j]->data = affinity_thr[l];
595 affinity_thr[l] = pool->threads[j];
596 }
597 pool->threads[j] = NULL;
598 }
599 if (nthreads > old_threads_used)
600 memset (&pool->threads[old_threads_used],
601 '\0', ((nthreads - old_threads_used)
602 * sizeof (struct gomp_thread *)));
603 n = nthreads;
604 affinity_count = old_threads_used - i;
605 }
606 if (affinity_count == 0)
607 break;
608 l = p;
609 if (affinity_thr[l - team->prev_ts.place_partition_off]
610 == NULL)
611 {
612 if (bind != omp_proc_bind_true)
613 continue;
614 for (l = place_partition_off;
615 l < place_partition_off + place_partition_len;
616 l++)
617 if (affinity_thr[l - team->prev_ts.place_partition_off]
618 != NULL)
619 break;
620 if (l == place_partition_off + place_partition_len)
621 continue;
622 }
623 nthr = affinity_thr[l - team->prev_ts.place_partition_off];
624 affinity_thr[l - team->prev_ts.place_partition_off]
625 = (struct gomp_thread *) nthr->data;
626 affinity_count--;
627 pool->threads[i] = nthr;
628 }
629 else
630 nthr = pool->threads[i];
631 place = p + 1;
632 }
633 else
634 nthr = pool->threads[i];
635 nthr->ts.team = team;
636 nthr->ts.work_share = &team->work_shares[0];
637 nthr->ts.last_work_share = NULL;
638 nthr->ts.team_id = i;
639 nthr->ts.level = team->prev_ts.level + 1;
640 nthr->ts.active_level = thr->ts.active_level;
641 nthr->ts.place_partition_off = place_partition_off;
642 nthr->ts.place_partition_len = place_partition_len;
643 nthr->ts.def_allocator = thr->ts.def_allocator;
644 #ifdef HAVE_SYNC_BUILTINS
645 nthr->ts.single_count = 0;
646 #endif
647 nthr->ts.static_trip = 0;
648 nthr->task = &team->implicit_task[i];
649 nthr->place = place;
650 gomp_init_task (nthr->task, task, icv);
651 team->implicit_task[i].icv.nthreads_var = nthreads_var;
652 team->implicit_task[i].icv.bind_var = bind_var;
653 nthr->task->taskgroup = taskgroup;
654 nthr->fn = fn;
655 nthr->data = data;
656 team->ordered_release[i] = &nthr->release;
657 }
658
659 if (__builtin_expect (affinity_thr != NULL, 0))
660 {
661 /* If AFFINITY_THR is non-NULL just because we had to
662 permute some threads in the pool, but we've managed
663 to find exactly as many old threads as we'd find
664 without affinity, we don't need to handle this
665 specially anymore. */
666 if (nthreads <= old_threads_used
667 ? (affinity_count == old_threads_used - nthreads)
668 : (i == old_threads_used))
669 {
670 if (team->prev_ts.place_partition_len > 64)
671 free (affinity_thr);
672 affinity_thr = NULL;
673 affinity_count = 0;
674 }
675 else
676 {
677 i = 1;
678 /* We are going to compute the places/subpartitions
679 again from the beginning. So, we need to reinitialize
680 the variables modified inside the loop by the switch (bind)
681 above, to the state they had after the initial
682 switch (bind). */
683 switch (bind)
684 {
685 case omp_proc_bind_true:
686 case omp_proc_bind_close:
687 if (nthreads > thr->ts.place_partition_len)
688 /* T > P. S has been changed, so it needs
689 to be recomputed. */
690 s = nthreads / thr->ts.place_partition_len;
691 k = 1;
692 p = thr->place - 1;
693 break;
694 case omp_proc_bind_master:
695 /* No vars have been changed. */
696 break;
697 case omp_proc_bind_spread:
698 p = thr->ts.place_partition_off;
699 if (k != 0)
700 {
701 /* T > P. */
702 s = nthreads / team->prev_ts.place_partition_len;
703 k = 1;
704 }
705 break;
706 }
707
708 /* Increase the barrier threshold to make sure all new
709 threads and all the threads we're going to let die
710 arrive before the team is released. */
711 if (affinity_count)
712 gomp_simple_barrier_reinit (&pool->threads_dock,
713 nthreads + affinity_count);
714 }
715 }
716
717 if (i == nthreads)
718 goto do_release;
719
720 }
721
722 if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
723 {
724 long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;
725
726 if (old_threads_used == 0)
727 --diff;
728
729 #ifdef HAVE_SYNC_BUILTINS
730 __sync_fetch_and_add (&gomp_managed_threads, diff);
731 #else
732 gomp_mutex_lock (&gomp_managed_threads_lock);
733 gomp_managed_threads += diff;
734 gomp_mutex_unlock (&gomp_managed_threads_lock);
735 #endif
736 }
737
738 attr = &gomp_thread_attr;
739 if (__builtin_expect (gomp_places_list != NULL, 0))
740 {
741 size_t stacksize;
742 pthread_attr_init (&thread_attr);
743 if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize))
744 pthread_attr_setstacksize (&thread_attr, stacksize);
745 attr = &thread_attr;
746 }
747
748 start_data = gomp_alloca (sizeof (struct gomp_thread_start_data)
749 * (nthreads - i));
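/* One start_data slot per thread that still has to be created; each
   pthread_create below consumes the next slot. */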
750
751 /* Launch new threads. */
752 for (; i < nthreads; ++i)
753 {
754 int err;
755
756 start_data->ts.place_partition_off = thr->ts.place_partition_off;
757 start_data->ts.place_partition_len = thr->ts.place_partition_len;
758 start_data->place = 0;
759 if (__builtin_expect (gomp_places_list != NULL, 0))
760 {
761 switch (bind)
762 {
763 case omp_proc_bind_true:
764 case omp_proc_bind_close:
765 if (k == s)
766 {
767 ++p;
768 if (p == (team->prev_ts.place_partition_off
769 + team->prev_ts.place_partition_len))
770 p = team->prev_ts.place_partition_off;
771 k = 1;
772 if (i == nthreads - rest)
773 s = 1;
774 }
775 else
776 ++k;
777 break;
778 case omp_proc_bind_master:
779 break;
780 case omp_proc_bind_spread:
781 if (k == 0)
782 {
783 /* T <= P. */
784 if (p < rest)
785 p += s + 1;
786 else
787 p += s;
788 if (p == (team->prev_ts.place_partition_off
789 + team->prev_ts.place_partition_len))
790 p = team->prev_ts.place_partition_off;
791 start_data->ts.place_partition_off = p;
792 if (p < rest)
793 start_data->ts.place_partition_len = s + 1;
794 else
795 start_data->ts.place_partition_len = s;
796 }
797 else
798 {
799 /* T > P. */
800 if (k == s)
801 {
802 ++p;
803 if (p == (team->prev_ts.place_partition_off
804 + team->prev_ts.place_partition_len))
805 p = team->prev_ts.place_partition_off;
806 k = 1;
807 if (i == nthreads - rest)
808 s = 1;
809 }
810 else
811 ++k;
812 start_data->ts.place_partition_off = p;
813 start_data->ts.place_partition_len = 1;
814 }
815 break;
816 }
817 start_data->place = p + 1;
818 if (affinity_thr != NULL && pool->threads[i] != NULL)
819 continue;
820 gomp_init_thread_affinity (attr, p);
821 }
822
823 start_data->fn = fn;
824 start_data->fn_data = data;
825 start_data->ts.team = team;
826 start_data->ts.work_share = &team->work_shares[0];
827 start_data->ts.last_work_share = NULL;
828 start_data->ts.team_id = i;
829 start_data->ts.level = team->prev_ts.level + 1;
830 start_data->ts.active_level = thr->ts.active_level;
831 start_data->ts.def_allocator = thr->ts.def_allocator;
832 #ifdef HAVE_SYNC_BUILTINS
833 start_data->ts.single_count = 0;
834 #endif
835 start_data->ts.static_trip = 0;
836 start_data->task = &team->implicit_task[i];
837 gomp_init_task (start_data->task, task, icv);
838 team->implicit_task[i].icv.nthreads_var = nthreads_var;
839 team->implicit_task[i].icv.bind_var = bind_var;
840 start_data->task->taskgroup = taskgroup;
841 start_data->thread_pool = pool;
842 start_data->nested = nested;
843
844 attr = gomp_adjust_thread_attr (attr, &thread_attr);
845 err = pthread_create (&start_data->handle, attr, gomp_thread_start,
846 start_data);
847 start_data++;
848 if (err != 0)
849 gomp_fatal ("Thread creation failed: %s", strerror (err));
850 }
851
852 if (__builtin_expect (attr == &thread_attr, 0))
853 pthread_attr_destroy (&thread_attr);
854
855 do_release:
856 if (nested)
857 gomp_barrier_wait (&team->barrier);
858 else
859 gomp_simple_barrier_wait (&pool->threads_dock);
860
861 /* Decrease the barrier threshold to match the number of threads
862 that should arrive back at the end of this team. The extra
863 threads should be exiting. Note that we arrange for this test
864 to never be true for nested teams. If AFFINITY_COUNT is non-zero,
865 the barrier as well as gomp_managed_threads was temporarily
866 set to NTHREADS + AFFINITY_COUNT. For NTHREADS < OLD_THREADS_USED,
867 AFFINITY_COUNT, if non-zero, will always be at least
868 OLD_THREADS_USED - NTHREADS. */
869 if (__builtin_expect (nthreads < old_threads_used, 0)
870 || __builtin_expect (affinity_count, 0))
871 {
872 long diff = (long) nthreads - (long) old_threads_used;
873
874 if (affinity_count)
875 diff = -affinity_count;
876
877 gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
878
879 #ifdef HAVE_SYNC_BUILTINS
880 __sync_fetch_and_add (&gomp_managed_threads, diff);
881 #else
882 gomp_mutex_lock (&gomp_managed_threads_lock);
883 gomp_managed_threads += diff;
884 gomp_mutex_unlock (&gomp_managed_threads_lock);
885 #endif
886 }
887 if (__builtin_expect (gomp_display_affinity_var, 0))
888 {
889 if (nested
890 || nthreads != old_threads_used
891 || force_display)
892 {
893 gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
894 thr->place);
895 if (nested)
896 {
897 start_data -= nthreads - 1;
898 for (i = 1; i < nthreads; ++i)
899 {
900 gomp_display_affinity_thread (
901 #ifdef LIBGOMP_USE_PTHREADS
902 start_data->handle,
903 #else
904 gomp_thread_self (),
905 #endif
906 &start_data->ts,
907 start_data->place);
908 start_data++;
909 }
910 }
911 else
912 {
913 for (i = 1; i < nthreads; ++i)
914 {
915 gomp_thread_handle handle
916 = gomp_thread_to_pthread_t (pool->threads[i]);
917 gomp_display_affinity_thread (handle, &pool->threads[i]->ts,
918 pool->threads[i]->place);
919 }
920 }
921 }
922 }
923 if (__builtin_expect (affinity_thr != NULL, 0)
924 && team->prev_ts.place_partition_len > 64)
925 free (affinity_thr);
926 }
927 #endif
928
929
930 /* Terminate the current team. This is only to be called by the master
931 thread. We assume that we must wait for the other threads. */
932
933 void
934 gomp_team_end (void)
935 {
936 struct gomp_thread *thr = gomp_thread ();
937 struct gomp_team *team = thr->ts.team;
938
939 /* This barrier handles all pending explicit tasks.
940 As #pragma omp cancel parallel might leave the awaited count in
941 team->barrier in an inconsistent state, we need to use a different
942 counter here. */
943 gomp_team_barrier_wait_final (&team->barrier);
944 if (__builtin_expect (team->team_cancelled, 0))
945 {
946 struct gomp_work_share *ws = team->work_shares_to_free;
947 do
948 {
949 struct gomp_work_share *next_ws = gomp_ptrlock_get (&ws->next_ws);
950 if (next_ws == NULL)
951 gomp_ptrlock_set (&ws->next_ws, ws);
952 gomp_fini_work_share (ws);
953 ws = next_ws;
954 }
955 while (ws != NULL);
956 }
957 else
958 gomp_fini_work_share (thr->ts.work_share);
959
960 gomp_end_task ();
961 thr->ts = team->prev_ts;
962
963 if (__builtin_expect (thr->ts.level != 0, 0))
964 {
965 #ifdef HAVE_SYNC_BUILTINS
966 __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
967 #else
968 gomp_mutex_lock (&gomp_managed_threads_lock);
969 gomp_managed_threads -= team->nthreads - 1L;
970 gomp_mutex_unlock (&gomp_managed_threads_lock);
971 #endif
972 /* This barrier has gomp_barrier_wait_last counterparts
973 and ensures the team can be safely destroyed. */
974 gomp_barrier_wait (&team->barrier);
975 }
976
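/* Free any extra blocks of work shares allocated while the team ran;
   they are chained off work_shares[0] via next_alloc. */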
977 if (__builtin_expect (team->work_shares[0].next_alloc != NULL, 0))
978 {
979 struct gomp_work_share *ws = team->work_shares[0].next_alloc;
980 do
981 {
982 struct gomp_work_share *next_ws = ws->next_alloc;
983 free (ws);
984 ws = next_ws;
985 }
986 while (ws != NULL);
987 }
988 gomp_sem_destroy (&team->master_release);
989
990 if (__builtin_expect (thr->ts.team != NULL, 0)
991 || __builtin_expect (team->nthreads == 1, 0))
992 free_team (team);
993 else
994 {
995 struct gomp_thread_pool *pool = thr->thread_pool;
996 if (pool->last_team)
997 free_team (pool->last_team);
998 pool->last_team = team;
999 gomp_release_thread_pool (pool);
1000 }
1001 }
1002
1003 #ifdef LIBGOMP_USE_PTHREADS
1004
1005 /* Constructors for this file. */
1006
1007 static void __attribute__((constructor))
1008 initialize_team (void)
1009 {
1010 #if !defined HAVE_TLS && !defined USE_EMUTLS
1011 static struct gomp_thread initial_thread_tls_data;
1012
1013 pthread_key_create (&gomp_tls_key, NULL);
1014 pthread_setspecific (gomp_tls_key, &initial_thread_tls_data);
1015 #endif
1016
1017 if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0)
1018 gomp_fatal ("could not create thread pool destructor.");
1019 }
1020
1021 static void __attribute__((destructor))
1022 team_destructor (void)
1023 {
1024 /* Without this, dlclose on libgomp could lead to subsequent
1025 crashes. */
1026 pthread_key_delete (gomp_thread_destructor);
1027 }
1028
1029 /* Similar to gomp_free_pool_helper, but doesn't detach the thread;
1030 gomp_pause_host will pthread_join these threads instead. */
1031
1032 static void
1033 gomp_pause_pool_helper (void *thread_pool)
1034 {
1035 struct gomp_thread *thr = gomp_thread ();
1036 struct gomp_thread_pool *pool
1037 = (struct gomp_thread_pool *) thread_pool;
1038 gomp_simple_barrier_wait_last (&pool->threads_dock);
1039 gomp_sem_destroy (&thr->release);
1040 thr->thread_pool = NULL;
1041 thr->task = NULL;
1042 pthread_exit (NULL);
1043 }
1044
1045 /* Stop and join all threads in the current thread pool and free it.
1046 Return non-zero on failure (i.e. when called from within a parallel region). */
1047
1048 int
1049 gomp_pause_host (void)
1050 {
1051 struct gomp_thread *thr = gomp_thread ();
1052 struct gomp_thread_pool *pool = thr->thread_pool;
1053 if (thr->ts.level)
1054 return -1;
1055 if (pool)
1056 {
1057 if (pool->threads_used > 0)
1058 {
1059 int i;
1060 pthread_t *thrs
1061 = gomp_alloca (sizeof (pthread_t) * pool->threads_used);
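/* Record the pthread handles before releasing the dock; once released,
   the pool threads run gomp_pause_pool_helper and exit, and are joined
   below. */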
1062 for (i = 1; i < pool->threads_used; i++)
1063 {
1064 struct gomp_thread *nthr = pool->threads[i];
1065 nthr->fn = gomp_pause_pool_helper;
1066 nthr->data = pool;
1067 thrs[i] = gomp_thread_to_pthread_t (nthr);
1068 }
1069 /* This barrier undocks threads docked on pool->threads_dock. */
1070 gomp_simple_barrier_wait (&pool->threads_dock);
1071 /* And this waits until all threads have called gomp_simple_barrier_wait_last
1072 in gomp_pause_pool_helper. */
1073 gomp_simple_barrier_wait (&pool->threads_dock);
1074 /* Now it is safe to destroy the barrier and free the pool. */
1075 gomp_simple_barrier_destroy (&pool->threads_dock);
1076
1077 #ifdef HAVE_SYNC_BUILTINS
1078 __sync_fetch_and_add (&gomp_managed_threads,
1079 1L - pool->threads_used);
1080 #else
1081 gomp_mutex_lock (&gomp_managed_threads_lock);
1082 gomp_managed_threads -= pool->threads_used - 1L;
1083 gomp_mutex_unlock (&gomp_managed_threads_lock);
1084 #endif
1085 for (i = 1; i < pool->threads_used; i++)
1086 pthread_join (thrs[i], NULL);
1087 }
1088 if (pool->last_team)
1089 free_team (pool->last_team);
1090 #ifndef __nvptx__
1091 team_free (pool->threads);
1092 team_free (pool);
1093 #endif
1094 thr->thread_pool = NULL;
1095 }
1096 return 0;
1097 }
1098 #endif
1099
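/* Create an implicit initial task for the current thread from the
   global ICVs, register the thread destructor (when pthreads are used)
   and return a pointer to the new task's ICVs. */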
1100 struct gomp_task_icv *
1101 gomp_new_icv (void)
1102 {
1103 struct gomp_thread *thr = gomp_thread ();
1104 struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
1105 gomp_init_task (task, NULL, &gomp_global_icv);
1106 thr->task = task;
1107 #ifdef LIBGOMP_USE_PTHREADS
1108 pthread_setspecific (gomp_thread_destructor, thr);
1109 #endif
1110 return &task->icv;
1111 }