omp-low.c (lower_omp_ordered): Add argument to GOMP_SMD_ORDERED_* internal calls...
authorJakub Jelinek <jakub@gcc.gnu.org>
Sat, 14 Nov 2015 18:42:13 +0000 (19:42 +0100)
committerJakub Jelinek <jakub@gcc.gnu.org>
Sat, 14 Nov 2015 18:42:13 +0000 (19:42 +0100)
gcc/
2015-11-14  Jakub Jelinek  <jakub@redhat.com>

* omp-low.c (lower_omp_ordered): Add argument to GOMP_SMD_ORDERED_*
internal calls - 0 if ordered simd and 1 for ordered threads simd.
* tree-vectorizer.c (adjust_simduid_builtins): If GOMP_SIMD_ORDERED_*
argument is 1, replace it with GOMP_ordered_* call instead of removing
it.
gcc/c/
2015-11-14  Jakub Jelinek  <jakub@redhat.com>

* c-typeck.c (c_finish_omp_clauses): Don't mark
GOMP_MAP_FIRSTPRIVATE_POINTER decls addressable.
gcc/cp/
2015-11-14  Jakub Jelinek  <jakub@redhat.com>

* semantics.c (finish_omp_clauses): Don't mark
GOMP_MAP_FIRSTPRIVATE_POINTER decls addressable.
libgomp/
2015-11-14  Jakub Jelinek  <jakub@redhat.com>
    Aldy Hernandez  <aldyh@redhat.com>
    Ilya Verbin  <ilya.verbin@intel.com>

* ordered.c (gomp_doacross_init, GOMP_doacross_post,
GOMP_doacross_wait, gomp_doacross_ull_init, GOMP_doacross_ull_post,
GOMP_doacross_ull_wait): For GFS_GUIDED don't divide number of
iterators or IV by chunk size.
* parallel.c (gomp_resolve_num_threads): Don't assume that
if thr->ts.team is non-NULL, then pool must be non-NULL.
* libgomp-plugin.h (GOMP_PLUGIN_target_task_completion): Declare.
* libgomp.map (GOMP_PLUGIN_1.1): New symbol version, export
GOMP_PLUGIN_target_task_completion.
* Makefile.am (libgomp_la_SOURCES): Add priority_queue.c.
* Makefile.in: Regenerate.
* libgomp.h: Shuffle prototypes and forward definitions around so
priority queues can be defined.
(enum gomp_task_kind): Add GOMP_TASK_ASYNC_RUNNING.
(enum gomp_target_task_state): New enum.
(struct gomp_target_task): Add state, tgt, task and team fields.
(gomp_create_target_task): Change return type to bool, add
state argument.
(gomp_target_task_fn): Change return type to bool.
(struct gomp_device_descr): Add async_run_func.
(struct gomp_task): Remove children, next_child, prev_child,
next_queue, prev_queue, next_taskgroup, prev_taskgroup.
Add pnode field.
(struct gomp_taskgroup): Remove children.
Add taskgroup_queue.
(struct gomp_team): Change task_queue type to a priority queue.
(splay_compare): Define inline.
(priority_queue_offset): New.
(priority_node_to_task): New.
(task_to_priority_node): New.
* oacc-mem.c: Do not include splay-tree.h.
* priority_queue.c: New file.
* priority_queue.h: New file.
* splay-tree.c: Do not include splay-tree.h.
(splay_tree_foreach_internal): New.
(splay_tree_foreach): New.
* splay-tree.h: Become re-entrant if splay_tree_prefix is defined.
(splay_tree_callback): Define typedef.
* target.c (splay_compare): Move to libgomp.h.
(GOMP_target): Don't adjust *thr in any way around running offloaded
task.
(GOMP_target_ext): Likewise.  Handle target nowait.
(GOMP_target_update_ext, GOMP_target_enter_exit_data): Check
return value from gomp_create_target_task, if false, fallthrough
as if no dependencies exist.
(gomp_target_task_fn): Change return type to bool, return true
if the task should have another part scheduled later.  Handle
target nowait.
(gomp_load_plugin_for_device): Initialize async_run.
* task.c (gomp_init_task): Initialize children_queue.
(gomp_clear_parent_in_list): New.
(gomp_clear_parent_in_tree): New.
(gomp_clear_parent): Handle priorities.
(GOMP_task): Likewise.
(priority_queue_move_task_first,
gomp_target_task_completion, GOMP_PLUGIN_target_task_completion):
New functions.
(gomp_create_target_task): Use priority queues.  Change return type
to bool, add state argument, return false if for async
{{enter,exit} data,update} constructs no dependencies need to be
waited for, handle target nowait.  Set task->fn to NULL instead of
gomp_target_task_fn.
(verify_children_queue): Remove.
(priority_list_upgrade_task): New.
(priority_queue_upgrade_task): New.
(verify_task_queue): Remove.
(priority_list_downgrade_task): New.
(priority_queue_downgrade_task): New.
(gomp_task_run_pre): Use priority queues.
Abstract code out to priority_queue_downgrade_task.
(gomp_task_run_post_handle_dependers): Use priority queues.
(gomp_task_run_post_remove_parent): Likewise.
(gomp_task_run_post_remove_taskgroup): Likewise.
(gomp_barrier_handle_tasks): Likewise.  Handle target nowait target
tasks specially.
(GOMP_taskwait): Likewise.
(gomp_task_maybe_wait_for_dependencies): Likewise.  Abstract code to
priority-queue_upgrade_task.
(GOMP_taskgroup_start): Use priority queues.
(GOMP_taskgroup_end): Likewise.  Handle target nowait target tasks
specially.  If taskgroup is NULL, and thr->ts.level is 0, act as a
barrier.
* taskloop.c (GOMP_taskloop): Handle priorities.
* team.c (gomp_new_team): Call priority_queue_init.
(free_team): Call priority_queue_free.
(gomp_free_thread): Call gomp_team_end if thr->ts.team is artificial
team created for target nowait in implicit parallel region.
(gomp_team_start): For nested check, test thr->ts.level instead of
thr->ts.team != NULL.
* testsuite/libgomp.c/doacross-3.c: New test.
* testsuite/libgomp.c/ordered-5.c: New test.
* testsuite/libgomp.c/priority.c: New test.
* testsuite/libgomp.c/target-31.c: New test.
* testsuite/libgomp.c/target-32.c: New test.
* testsuite/libgomp.c/target-33.c: New test.
* testsuite/libgomp.c/target-34.c: New test.
liboffloadmic/
2015-11-14  Ilya Verbin  <ilya.verbin@intel.com>

* runtime/offload_host.cpp (task_completion_callback): New
variable.
(offload_proxy_task_completed_ooo): Call task_completion_callback.
(__offload_register_task_callback): New function.
* runtime/offload_host.h (__offload_register_task_callback): New
declaration.
* plugin/libgomp-plugin-intelmic.cpp (offload): Add async_data
argument, handle async offloading.
(register_main_image): Call register_main_image.
(GOMP_OFFLOAD_init_device, get_target_table, GOMP_OFFLOAD_alloc,
GOMP_OFFLOAD_free, GOMP_OFFLOAD_host2dev, GOMP_OFFLOAD_dev2host,
GOMP_OFFLOAD_dev2dev) Adjust offload callers.
(GOMP_OFFLOAD_async_run): New function.
(GOMP_OFFLOAD_run): Implement using GOMP_OFFLOAD_async_run.

From-SVN: r230381

35 files changed:
gcc/ChangeLog
gcc/c/ChangeLog
gcc/c/c-typeck.c
gcc/cp/ChangeLog
gcc/cp/semantics.c
gcc/omp-low.c
gcc/tree-vectorizer.c
libgomp/ChangeLog
libgomp/Makefile.am
libgomp/Makefile.in
libgomp/libgomp-plugin.h
libgomp/libgomp.h
libgomp/libgomp.map
libgomp/oacc-mem.c
libgomp/ordered.c
libgomp/parallel.c
libgomp/priority_queue.c [new file with mode: 0644]
libgomp/priority_queue.h [new file with mode: 0644]
libgomp/splay-tree.c
libgomp/splay-tree.h
libgomp/target.c
libgomp/task.c
libgomp/taskloop.c
libgomp/team.c
libgomp/testsuite/libgomp.c/doacross-3.c [new file with mode: 0644]
libgomp/testsuite/libgomp.c/ordered-5.c [new file with mode: 0644]
libgomp/testsuite/libgomp.c/priority.c [new file with mode: 0644]
libgomp/testsuite/libgomp.c/target-31.c [new file with mode: 0644]
libgomp/testsuite/libgomp.c/target-32.c [new file with mode: 0644]
libgomp/testsuite/libgomp.c/target-33.c [new file with mode: 0644]
libgomp/testsuite/libgomp.c/target-34.c [new file with mode: 0644]
liboffloadmic/ChangeLog
liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
liboffloadmic/runtime/offload_host.cpp
liboffloadmic/runtime/offload_host.h

index 6e85a5b5781a4733a22f526d832263ea20d2daaf..69a3044dfa38b604a37855788a2ee309a4e4b48b 100644 (file)
@@ -1,3 +1,11 @@
+2015-11-14  Jakub Jelinek  <jakub@redhat.com>
+
+       * omp-low.c (lower_omp_ordered): Add argument to GOMP_SMD_ORDERED_*
+       internal calls - 0 if ordered simd and 1 for ordered threads simd.
+       * tree-vectorizer.c (adjust_simduid_builtins): If GOMP_SIMD_ORDERED_*
+       argument is 1, replace it with GOMP_ordered_* call instead of removing
+       it.
+
 2015-11-13  Rich Felker <dalias@libc.org>
 
        * config/sh/sh.md (symGOT_load): Suppress __stack_chk_guard
index 04fe64da2770276c299e7f45b59cbc9718a30db8..5be190fec6b7192d51e78f93703ed5ea2afa44fc 100644 (file)
@@ -1,3 +1,8 @@
+2015-11-14  Jakub Jelinek  <jakub@redhat.com>
+
+       * c-typeck.c (c_finish_omp_clauses): Don't mark
+       GOMP_MAP_FIRSTPRIVATE_POINTER decls addressable.
+
 2015-11-14  Marek Polacek  <polacek@redhat.com>
 
        * c-decl.c: Use RECORD_OR_UNION_TYPE_P throughout.
index 0215edac3c53ae0525c8d9848b15ffcfdfaca9f4..c18c3076752145d12841782b38c5ec7378faa478 100644 (file)
@@ -12918,7 +12918,10 @@ c_finish_omp_clauses (tree clauses, bool is_omp, bool declare_simd)
                        omp_clause_code_name[OMP_CLAUSE_CODE (c)]);
              remove = true;
            }
-         else if (!c_mark_addressable (t))
+         else if ((OMP_CLAUSE_CODE (c) != OMP_CLAUSE_MAP
+                   || (OMP_CLAUSE_MAP_KIND (c)
+                       != GOMP_MAP_FIRSTPRIVATE_POINTER))
+                  && !c_mark_addressable (t))
            remove = true;
          else if (!(OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
                     && (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_POINTER
index 2ea5448f08ecc38c7b9c29d9a1b4f417a9b350c1..baaabc8761f77c626aa11dda15ce8977cc17afd1 100644 (file)
@@ -1,3 +1,8 @@
+2015-11-14  Jakub Jelinek  <jakub@redhat.com>
+
+       * semantics.c (finish_omp_clauses): Don't mark
+       GOMP_MAP_FIRSTPRIVATE_POINTER decls addressable.
+
 2015-11-13  Kai Tietz  <ktietz70@googlemail.com>
            Marek Polacek  <polacek@redhat.com>
            Jason Merrill  <jason@redhat.com>
index 67f8590f63865a44db4139f1d4371b52c4d2ae3b..e7e5d8ecfaa263aa0c3d8ea7d924dec7c351c844 100644 (file)
@@ -6581,6 +6581,9 @@ finish_omp_clauses (tree clauses, bool allow_fields, bool declare_simd)
            }
          else if (!processing_template_decl
                   && TREE_CODE (TREE_TYPE (t)) != REFERENCE_TYPE
+                  && (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_MAP
+                      || (OMP_CLAUSE_MAP_KIND (c)
+                          != GOMP_MAP_FIRSTPRIVATE_POINTER))
                   && !cxx_mark_addressable (t))
            remove = true;
          else if (!(OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
index 4b2b47772216724814c573015a4afa46d0bc75a5..987bc3ea60243c5471d0ed8bdf78bba2a3257133 100644 (file)
@@ -13951,8 +13951,10 @@ lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx)
   gomp_ordered *ord_stmt = as_a <gomp_ordered *> (stmt);
   gcall *x;
   gbind *bind;
-  bool simd
-    = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt), OMP_CLAUSE_SIMD);
+  bool simd = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt),
+                              OMP_CLAUSE_SIMD);
+  bool threads = find_omp_clause (gimple_omp_ordered_clauses (ord_stmt),
+                                 OMP_CLAUSE_THREADS);
 
   if (find_omp_clause (gimple_omp_ordered_clauses (ord_stmt),
                       OMP_CLAUSE_DEPEND))
@@ -13975,7 +13977,8 @@ lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx)
 
   if (simd)
     {
-      x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_START, 0);
+      x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_START, 1,
+                                     build_int_cst (NULL_TREE, threads));
       cfun->has_simduid_loops = true;
     }
   else
@@ -13989,7 +13992,8 @@ lower_omp_ordered (gimple_stmt_iterator *gsi_p, omp_context *ctx)
   gimple_omp_set_body (stmt, NULL);
 
   if (simd)
-    x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_END, 0);
+    x = gimple_build_call_internal (IFN_GOMP_SIMD_ORDERED_END, 1,
+                                   build_int_cst (NULL_TREE, threads));
   else
     x = gimple_build_call (builtin_decl_explicit (BUILT_IN_GOMP_ORDERED_END),
                           0);
index 08c80253caa8876930689eb97207365729e7d0d5..b721c56d62acb4dc5f9ff285752c5d57cd3bb561 100644 (file)
@@ -177,6 +177,21 @@ adjust_simduid_builtins (hash_table<simduid_to_vf> *htab)
              break;
            case IFN_GOMP_SIMD_ORDERED_START:
            case IFN_GOMP_SIMD_ORDERED_END:
+             if (integer_onep (gimple_call_arg (stmt, 0)))
+               {
+                 enum built_in_function bcode
+                   = (ifn == IFN_GOMP_SIMD_ORDERED_START
+                      ? BUILT_IN_GOMP_ORDERED_START
+                      : BUILT_IN_GOMP_ORDERED_END);
+                 gimple *g
+                   = gimple_build_call (builtin_decl_explicit (bcode), 0);
+                 tree vdef = gimple_vdef (stmt);
+                 gimple_set_vdef (g, vdef);
+                 SSA_NAME_DEF_STMT (vdef) = g;
+                 gimple_set_vuse (g, gimple_vuse (stmt));
+                 gsi_replace (&i, g, true);
+                 continue;
+               }
              gsi_remove (&i, true);
              unlink_stmt_vdef (stmt);
              continue;
index 2f5a4d189daa6dca6d0d571d97d10321808691ff..15be0142a22549b80db643032de4c4f4659c5333 100644 (file)
@@ -1,3 +1,104 @@
+2015-11-14  Jakub Jelinek  <jakub@redhat.com>
+           Aldy Hernandez  <aldyh@redhat.com>
+           Ilya Verbin  <ilya.verbin@intel.com>
+
+       * ordered.c (gomp_doacross_init, GOMP_doacross_post,
+       GOMP_doacross_wait, gomp_doacross_ull_init, GOMP_doacross_ull_post,
+       GOMP_doacross_ull_wait): For GFS_GUIDED don't divide number of
+       iterators or IV by chunk size.
+       * parallel.c (gomp_resolve_num_threads): Don't assume that
+       if thr->ts.team is non-NULL, then pool must be non-NULL.
+       * libgomp-plugin.h (GOMP_PLUGIN_target_task_completion): Declare.
+       * libgomp.map (GOMP_PLUGIN_1.1): New symbol version, export
+       GOMP_PLUGIN_target_task_completion.
+       * Makefile.am (libgomp_la_SOURCES): Add priority_queue.c.
+       * Makefile.in: Regenerate.
+       * libgomp.h: Shuffle prototypes and forward definitions around so
+       priority queues can be defined.
+       (enum gomp_task_kind): Add GOMP_TASK_ASYNC_RUNNING.
+       (enum gomp_target_task_state): New enum.
+       (struct gomp_target_task): Add state, tgt, task and team fields.
+       (gomp_create_target_task): Change return type to bool, add
+       state argument.
+       (gomp_target_task_fn): Change return type to bool.
+       (struct gomp_device_descr): Add async_run_func.
+       (struct gomp_task): Remove children, next_child, prev_child,
+       next_queue, prev_queue, next_taskgroup, prev_taskgroup.
+       Add pnode field.
+       (struct gomp_taskgroup): Remove children.
+       Add taskgroup_queue.
+       (struct gomp_team): Change task_queue type to a priority queue.
+       (splay_compare): Define inline.
+       (priority_queue_offset): New.
+       (priority_node_to_task): New.
+       (task_to_priority_node): New.
+       * oacc-mem.c: Do not include splay-tree.h.
+       * priority_queue.c: New file.
+       * priority_queue.h: New file.
+       * splay-tree.c: Do not include splay-tree.h.
+       (splay_tree_foreach_internal): New.
+       (splay_tree_foreach): New.
+       * splay-tree.h: Become re-entrant if splay_tree_prefix is defined.
+       (splay_tree_callback): Define typedef.
+       * target.c (splay_compare): Move to libgomp.h.
+       (GOMP_target): Don't adjust *thr in any way around running offloaded
+       task.
+       (GOMP_target_ext): Likewise.  Handle target nowait.
+       (GOMP_target_update_ext, GOMP_target_enter_exit_data): Check
+       return value from gomp_create_target_task, if false, fallthrough
+       as if no dependencies exist.
+       (gomp_target_task_fn): Change return type to bool, return true
+       if the task should have another part scheduled later.  Handle
+       target nowait.
+       (gomp_load_plugin_for_device): Initialize async_run.
+       * task.c (gomp_init_task): Initialize children_queue.
+       (gomp_clear_parent_in_list): New.
+       (gomp_clear_parent_in_tree): New.
+       (gomp_clear_parent): Handle priorities.
+       (GOMP_task): Likewise.
+       (priority_queue_move_task_first,
+       gomp_target_task_completion, GOMP_PLUGIN_target_task_completion):
+       New functions.
+       (gomp_create_target_task): Use priority queues.  Change return type
+       to bool, add state argument, return false if for async
+       {{enter,exit} data,update} constructs no dependencies need to be
+       waited for, handle target nowait.  Set task->fn to NULL instead of
+       gomp_target_task_fn.
+       (verify_children_queue): Remove.
+       (priority_list_upgrade_task): New.
+       (priority_queue_upgrade_task): New.
+       (verify_task_queue): Remove.
+       (priority_list_downgrade_task): New.
+       (priority_queue_downgrade_task): New.
+       (gomp_task_run_pre): Use priority queues.
+       Abstract code out to priority_queue_downgrade_task.
+       (gomp_task_run_post_handle_dependers): Use priority queues.
+       (gomp_task_run_post_remove_parent): Likewise.
+       (gomp_task_run_post_remove_taskgroup): Likewise.
+       (gomp_barrier_handle_tasks): Likewise.  Handle target nowait target
+       tasks specially.
+       (GOMP_taskwait): Likewise.
+       (gomp_task_maybe_wait_for_dependencies): Likewise.  Abstract code to
+       priority-queue_upgrade_task.
+       (GOMP_taskgroup_start): Use priority queues.
+       (GOMP_taskgroup_end): Likewise.  Handle target nowait target tasks
+       specially.  If taskgroup is NULL, and thr->ts.level is 0, act as a
+       barrier.
+       * taskloop.c (GOMP_taskloop): Handle priorities.
+       * team.c (gomp_new_team): Call priority_queue_init.
+       (free_team): Call priority_queue_free.
+       (gomp_free_thread): Call gomp_team_end if thr->ts.team is artificial
+       team created for target nowait in implicit parallel region.
+       (gomp_team_start): For nested check, test thr->ts.level instead of
+       thr->ts.team != NULL.
+       * testsuite/libgomp.c/doacross-3.c: New test.
+       * testsuite/libgomp.c/ordered-5.c: New test.
+       * testsuite/libgomp.c/priority.c: New test.
+       * testsuite/libgomp.c/target-31.c: New test.
+       * testsuite/libgomp.c/target-32.c: New test.
+       * testsuite/libgomp.c/target-33.c: New test.
+       * testsuite/libgomp.c/target-34.c: New test.
+
 2015-11-13  Nathan Sidwell  <nathan@codesourcery.com>
 
        * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: New.
index 5411278b0184676c803e74ca0beb85399af28230..a3e1c2b2c0f476bd0a0f4e76fa09be26f85a77a4 100644 (file)
@@ -63,7 +63,7 @@ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
        task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
        time.c fortran.c affinity.c target.c splay-tree.c libgomp-plugin.c \
        oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \
-       oacc-plugin.c oacc-cuda.c
+       oacc-plugin.c oacc-cuda.c priority_queue.c
 
 include $(top_srcdir)/plugin/Makefrag.am
 
index 79745ce93f2fd86170ce45534ee8fe8af7155b96..7a1c976c5c984cdd118f77e09ba349c714264b22 100644 (file)
@@ -168,7 +168,7 @@ am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \
        fortran.lo affinity.lo target.lo splay-tree.lo \
        libgomp-plugin.lo oacc-parallel.lo oacc-host.lo oacc-init.lo \
        oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \
-       $(am__objects_1)
+       priority_queue.lo $(am__objects_1)
 libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
 DEFAULT_INCLUDES = -I.@am__isrc@
 depcomp = $(SHELL) $(top_srcdir)/../depcomp
@@ -415,7 +415,7 @@ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
        bar.c ptrlock.c time.c fortran.c affinity.c target.c \
        splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \
        oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \
-       $(am__append_2)
+       priority_queue.c $(am__append_2)
 
 # Nvidia PTX OpenACC plugin.
 @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
@@ -589,6 +589,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/proc.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ptrlock.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@
index 24fbb9455c934bdf8184d871322e1174fbe344b9..ab22e85cf61c728bbcf5b6821e675d0e0e54fba5 100644 (file)
@@ -63,6 +63,7 @@ struct addr_pair
 extern void *GOMP_PLUGIN_malloc (size_t) __attribute__ ((malloc));
 extern void *GOMP_PLUGIN_malloc_cleared (size_t) __attribute__ ((malloc));
 extern void *GOMP_PLUGIN_realloc (void *, size_t);
+void GOMP_PLUGIN_target_task_completion (void *);
 
 extern void GOMP_PLUGIN_debug (int, const char *, ...)
        __attribute__ ((format (printf, 2, 3)));
index 23b516ee37e3c0c215551012add9e9ed972205f9..c467f97788bd3d38f11bef6e4c41b32c9e2f75e5 100644 (file)
 #include <stdlib.h>
 #include <stdarg.h>
 
+/* Needed for memset in priority_queue.c.  */
+#if _LIBGOMP_CHECKING_
+# ifdef STRING_WITH_STRINGS
+#  include <string.h>
+#  include <strings.h>
+# else
+#  ifdef HAVE_STRING_H
+#   include <string.h>
+#  else
+#   ifdef HAVE_STRINGS_H
+#    include <strings.h>
+#   endif
+#  endif
+# endif
+#endif
+
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility push(hidden)
 #endif
@@ -65,6 +81,44 @@ enum memmodel
   MEMMODEL_SEQ_CST = 5
 };
 
+/* alloc.c */
+
+extern void *gomp_malloc (size_t) __attribute__((malloc));
+extern void *gomp_malloc_cleared (size_t) __attribute__((malloc));
+extern void *gomp_realloc (void *, size_t);
+
+/* Avoid conflicting prototypes of alloca() in system headers by using
+   GCC's builtin alloca().  */
+#define gomp_alloca(x)  __builtin_alloca(x)
+
+/* error.c */
+
+extern void gomp_vdebug (int, const char *, va_list);
+extern void gomp_debug (int, const char *, ...)
+       __attribute__ ((format (printf, 2, 3)));
+#define gomp_vdebug(KIND, FMT, VALIST) \
+  do { \
+    if (__builtin_expect (gomp_debug_var, 0)) \
+      (gomp_vdebug) ((KIND), (FMT), (VALIST)); \
+  } while (0)
+#define gomp_debug(KIND, ...) \
+  do { \
+    if (__builtin_expect (gomp_debug_var, 0)) \
+      (gomp_debug) ((KIND), __VA_ARGS__); \
+  } while (0)
+extern void gomp_verror (const char *, va_list);
+extern void gomp_error (const char *, ...)
+       __attribute__ ((format (printf, 1, 2)));
+extern void gomp_vfatal (const char *, va_list)
+       __attribute__ ((noreturn));
+extern void gomp_fatal (const char *, ...)
+       __attribute__ ((noreturn, format (printf, 1, 2)));
+
+struct gomp_task;
+struct gomp_taskgroup;
+struct htab;
+
+#include "priority_queue.h"
 #include "sem.h"
 #include "mutex.h"
 #include "bar.h"
@@ -298,6 +352,7 @@ extern gomp_mutex_t gomp_managed_threads_lock;
 #endif
 extern unsigned long gomp_max_active_levels_var;
 extern bool gomp_cancel_var;
+extern int gomp_max_task_priority_var;
 extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
 extern unsigned long gomp_available_cpus, gomp_managed_threads;
 extern unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
@@ -318,13 +373,14 @@ enum gomp_task_kind
   /* Task created by GOMP_task and waiting to be run.  */
   GOMP_TASK_WAITING,
   /* Task currently executing or scheduled and about to execute.  */
-  GOMP_TASK_TIED
+  GOMP_TASK_TIED,
+  /* Used for target tasks that have vars mapped and async run started,
+     but not yet completed.  Once that completes, they will be readded
+     into the queues as GOMP_TASK_WAITING in order to perform the var
+     unmapping.  */
+  GOMP_TASK_ASYNC_RUNNING
 };
 
-struct gomp_task;
-struct gomp_taskgroup;
-struct htab;
-
 struct gomp_task_depend_entry
 {
   /* Address of dependency.  */
@@ -352,8 +408,8 @@ struct gomp_taskwait
 {
   bool in_taskwait;
   bool in_depend_wait;
+  /* Number of tasks we are waiting for.  */
   size_t n_depend;
-  struct gomp_task *last_parent_depends_on;
   gomp_sem_t taskwait_sem;
 };
 
@@ -361,26 +417,10 @@ struct gomp_taskwait
 
 struct gomp_task
 {
-  /* Parent circular list.  See children description below.  */
+  /* Parent of this task.  */
   struct gomp_task *parent;
-  /* Circular list representing the children of this task.
-
-     In this list we first have parent_depends_on ready to run tasks,
-     then !parent_depends_on ready to run tasks, and finally already
-     running tasks.  */
-  struct gomp_task *children;
-  struct gomp_task *next_child;
-  struct gomp_task *prev_child;
-  /* Circular task_queue in `struct gomp_team'.
-
-     GOMP_TASK_WAITING tasks come before GOMP_TASK_TIED tasks.  */
-  struct gomp_task *next_queue;
-  struct gomp_task *prev_queue;
-  /* Circular queue in gomp_taskgroup->children.
-
-     GOMP_TASK_WAITING tasks come before GOMP_TASK_TIED tasks.  */
-  struct gomp_task *next_taskgroup;
-  struct gomp_task *prev_taskgroup;
+  /* Children of this task.  */
+  struct priority_queue children_queue;
   /* Taskgroup this task belongs in.  */
   struct gomp_taskgroup *taskgroup;
   /* Tasks that depend on this task.  */
@@ -389,8 +429,19 @@ struct gomp_task
   struct gomp_taskwait *taskwait;
   /* Number of items in DEPEND.  */
   size_t depend_count;
-  /* Number of tasks in the DEPENDERS field above.  */
+  /* Number of tasks this task depends on.  Once this counter reaches
+     0, we have no unsatisfied dependencies, and this task can be put
+     into the various queues to be scheduled.  */
   size_t num_dependees;
+
+  /* Priority of this task.  */
+  int priority;
+  /* The priority node for this task in each of the different queues.
+     We put this here to avoid allocating space for each priority
+     node.  Then we play offsetof() games to convert between pnode[]
+     entries and the gomp_task in which they reside.  */
+  struct priority_node pnode[3];
+
   struct gomp_task_icv icv;
   void (*fn) (void *);
   void *fn_data;
@@ -407,21 +458,32 @@ struct gomp_task
   struct gomp_task_depend_entry depend[];
 };
 
+/* This structure describes a single #pragma omp taskgroup.  */
+
 struct gomp_taskgroup
 {
   struct gomp_taskgroup *prev;
-  /* Circular list of tasks that belong in this taskgroup.
-
-     Tasks are chained by next/prev_taskgroup within gomp_task, and
-     are sorted by GOMP_TASK_WAITING tasks, and then GOMP_TASK_TIED
-     tasks.  */
-  struct gomp_task *children;
+  /* Queue of tasks that belong in this taskgroup.  */
+  struct priority_queue taskgroup_queue;
   bool in_taskgroup_wait;
   bool cancelled;
   gomp_sem_t taskgroup_sem;
   size_t num_children;
 };
 
+/* Various state of OpenMP async offloading tasks.  */
+enum gomp_target_task_state
+{
+  GOMP_TARGET_TASK_DATA,
+  GOMP_TARGET_TASK_BEFORE_MAP,
+  GOMP_TARGET_TASK_FALLBACK,
+  GOMP_TARGET_TASK_READY_TO_RUN,
+  GOMP_TARGET_TASK_RUNNING,
+  GOMP_TARGET_TASK_FINISHED
+};
+
+/* This structure describes a target task.  */
+
 struct gomp_target_task
 {
   struct gomp_device_descr *devicep;
@@ -430,6 +492,10 @@ struct gomp_target_task
   size_t *sizes;
   unsigned short *kinds;
   unsigned int flags;
+  enum gomp_target_task_state state;
+  struct target_mem_desc *tgt;
+  struct gomp_task *task;
+  struct gomp_team *team;
   void *hostaddrs[];
 };
 
@@ -495,9 +561,8 @@ struct gomp_team
   struct gomp_work_share work_shares[8];
 
   gomp_mutex_t task_lock;
-  /* Scheduled tasks.  Chain fields are next/prev_queue within a
-     gomp_task.  */
-  struct gomp_task *task_queue;
+  /* Scheduled tasks.  */
+  struct priority_queue task_queue;
   /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team.  */
   unsigned int task_count;
   /* Number of GOMP_TASK_WAITING tasks currently waiting to be scheduled.  */
@@ -627,39 +692,6 @@ extern bool gomp_affinity_init_level (int, unsigned long, bool);
 extern void gomp_affinity_print_place (void *);
 extern void gomp_get_place_proc_ids_8 (int, int64_t *);
 
-/* alloc.c */
-
-extern void *gomp_malloc (size_t) __attribute__((malloc));
-extern void *gomp_malloc_cleared (size_t) __attribute__((malloc));
-extern void *gomp_realloc (void *, size_t);
-
-/* Avoid conflicting prototypes of alloca() in system headers by using
-   GCC's builtin alloca().  */
-#define gomp_alloca(x)  __builtin_alloca(x)
-
-/* error.c */
-
-extern void gomp_vdebug (int, const char *, va_list);
-extern void gomp_debug (int, const char *, ...)
-       __attribute__ ((format (printf, 2, 3)));
-#define gomp_vdebug(KIND, FMT, VALIST) \
-  do { \
-    if (__builtin_expect (gomp_debug_var, 0)) \
-      (gomp_vdebug) ((KIND), (FMT), (VALIST)); \
-  } while (0)
-#define gomp_debug(KIND, ...) \
-  do { \
-    if (__builtin_expect (gomp_debug_var, 0)) \
-      (gomp_debug) ((KIND), __VA_ARGS__); \
-  } while (0)
-extern void gomp_verror (const char *, va_list);
-extern void gomp_error (const char *, ...)
-       __attribute__ ((format (printf, 1, 2)));
-extern void gomp_vfatal (const char *, va_list)
-       __attribute__ ((noreturn));
-extern void gomp_fatal (const char *, ...)
-       __attribute__ ((noreturn, format (printf, 1, 2)));
-
 /* iter.c */
 
 extern int gomp_iter_static_next (long *, long *);
@@ -715,10 +747,10 @@ extern void gomp_init_task (struct gomp_task *, struct gomp_task *,
 extern void gomp_end_task (void);
 extern void gomp_barrier_handle_tasks (gomp_barrier_state_t);
 extern void gomp_task_maybe_wait_for_dependencies (void **);
-extern void gomp_create_target_task (struct gomp_device_descr *,
+extern bool gomp_create_target_task (struct gomp_device_descr *,
                                     void (*) (void *), size_t, void **,
                                     size_t *, unsigned short *, unsigned int,
-                                    void **);
+                                    void **, enum gomp_target_task_state);
 
 static void inline
 gomp_finish_task (struct gomp_task *task)
@@ -739,8 +771,9 @@ extern void gomp_free_thread (void *);
 
 extern void gomp_init_targets_once (void);
 extern int gomp_get_num_devices (void);
-extern void gomp_target_task_fn (void *);
+extern bool gomp_target_task_fn (void *);
 
+/* Splay tree definitions.  */
 typedef struct splay_tree_node_s *splay_tree_node;
 typedef struct splay_tree_s *splay_tree;
 typedef struct splay_tree_key_s *splay_tree_key;
@@ -800,6 +833,21 @@ struct splay_tree_key_s {
   uintptr_t async_refcount;
 };
 
+/* The comparison function.  */
+
+static inline int
+splay_compare (splay_tree_key x, splay_tree_key y)
+{
+  if (x->host_start == x->host_end
+      && y->host_start == y->host_end)
+    return 0;
+  if (x->host_end <= y->host_start)
+    return -1;
+  if (x->host_start >= y->host_end)
+    return 1;
+  return 0;
+}
+
 #include "splay-tree.h"
 
 typedef struct acc_dispatch_t
@@ -877,6 +925,7 @@ struct gomp_device_descr
   void *(*host2dev_func) (int, void *, const void *, size_t);
   void *(*dev2dev_func) (int, void *, const void *, size_t);
   void (*run_func) (int, void *, void *);
+  void (*async_run_func) (int, void *, void *, void *);
 
   /* Splay tree containing information about mapped memory regions.  */
   struct splay_tree_s mem_map;
@@ -1016,4 +1065,34 @@ extern int gomp_test_nest_lock_25 (omp_nest_lock_25_t *) __GOMP_NOTHROW;
 # define ialias_call(fn) fn
 #endif
 
+/* Helper function for priority_node_to_task() and
+   task_to_priority_node().
+
+   Return the offset from a task to its priority_node entry.  The
+   priority_node entry is has a type of TYPE.  */
+
+static inline size_t
+priority_queue_offset (enum priority_queue_type type)
+{
+  return offsetof (struct gomp_task, pnode[(int) type]);
+}
+
+/* Return the task associated with a priority NODE of type TYPE.  */
+
+static inline struct gomp_task *
+priority_node_to_task (enum priority_queue_type type,
+                      struct priority_node *node)
+{
+  return (struct gomp_task *) ((char *) node - priority_queue_offset (type));
+}
+
+/* Return the priority node of type TYPE for a given TASK.  */
+
+static inline struct priority_node *
+task_to_priority_node (enum priority_queue_type type,
+                      struct gomp_task *task)
+{
+  return (struct priority_node *) ((char *) task
+                                  + priority_queue_offset (type));
+}
 #endif /* LIBGOMP_H */
index d16710f44f4de23bb49ad32afa0301b3d5f3e995..4d42c42f4416a076f2746f3a9db5f180c853ae80 100644 (file)
@@ -407,3 +407,8 @@ GOMP_PLUGIN_1.0 {
        GOMP_PLUGIN_async_unmap_vars;
        GOMP_PLUGIN_acc_thread;
 };
+
+GOMP_PLUGIN_1.1 {
+  global:
+       GOMP_PLUGIN_target_task_completion;
+} GOMP_PLUGIN_1.0;
index 54109061fb1c8712f297edd0af50639b92dc9635..2488480717a6fbb6582847ce9ec184ac4a77ab37 100644 (file)
@@ -31,7 +31,6 @@
 #include "libgomp.h"
 #include "gomp-constants.h"
 #include "oacc-int.h"
-#include "splay-tree.h"
 #include <stdint.h>
 #include <assert.h>
 
index fdac3ee8f5854d677f5481ba7bc7fd6e94485736..52df385e427d2739962d78d68a5980f9a2fe98de 100644 (file)
@@ -297,6 +297,8 @@ gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
 
   if (ws->sched == GFS_STATIC)
     num_ents = team->nthreads;
+  else if (ws->sched == GFS_GUIDED)
+    num_ents = counts[0];
   else
     num_ents = (counts[0] - 1) / chunk_size + 1;
   if (num_bits <= MAX_COLLAPSED_BITS)
@@ -366,6 +368,8 @@ GOMP_doacross_post (long *counts)
 
   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
     ent = thr->ts.team_id;
+  else if (ws->sched == GFS_GUIDED)
+    ent = counts[0];
   else
     ent = counts[0] / doacross->chunk_size;
   unsigned long *array = (unsigned long *) (doacross->array
@@ -426,6 +430,8 @@ GOMP_doacross_wait (long first, ...)
       else
        ent = first / ws->chunk_size % thr->ts.team->nthreads;
     }
+  else if (ws->sched == GFS_GUIDED)
+    ent = first;
   else
     ent = first / doacross->chunk_size;
   unsigned long *array = (unsigned long *) (doacross->array
@@ -520,6 +526,8 @@ gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
 
   if (ws->sched == GFS_STATIC)
     num_ents = team->nthreads;
+  else if (ws->sched == GFS_GUIDED)
+    num_ents = counts[0];
   else
     num_ents = (counts[0] - 1) / chunk_size + 1;
   if (num_bits <= MAX_COLLAPSED_BITS)
@@ -595,6 +603,8 @@ GOMP_doacross_ull_post (gomp_ull *counts)
 
   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
     ent = thr->ts.team_id;
+  else if (ws->sched == GFS_GUIDED)
+    ent = counts[0];
   else
     ent = counts[0] / doacross->chunk_size_ull;
 
@@ -676,6 +686,8 @@ GOMP_doacross_ull_wait (gomp_ull first, ...)
       else
        ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
     }
+  else if (ws->sched == GFS_GUIDED)
+    ent = first;
   else
     ent = first / doacross->chunk_size_ull;
 
index 6d5ef050f94c80babe898727b57fc2bafa35077b..228086f40a2aa65c0b780cda4477aa16b78a378e 100644 (file)
@@ -85,7 +85,7 @@ gomp_resolve_num_threads (unsigned specified, unsigned count)
      nested parallel, so there is just one thread in the
      contention group as well, no need to handle it atomically.  */
   pool = thr->thread_pool;
-  if (thr->ts.team == NULL)
+  if (thr->ts.team == NULL || pool == NULL)
     {
       num_threads = max_num_threads;
       if (num_threads > icv->thread_limit_var)
diff --git a/libgomp/priority_queue.c b/libgomp/priority_queue.c
new file mode 100644 (file)
index 0000000..4bc5f9b
--- /dev/null
@@ -0,0 +1,300 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Aldy Hernandez <aldyh@redhat.com>.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Priority queue implementation of GOMP tasks.  */
+
+#include "libgomp.h"
+
+#if _LIBGOMP_CHECKING_
+#include <stdio.h>
+
+/* Sanity check to verify whether a TASK is in LIST.  Return TRUE if
+   found, FALSE otherwise.
+
+   TYPE is the type of priority queue this task resides in.  */
+
+static inline bool
+priority_queue_task_in_list_p (enum priority_queue_type type,
+                              struct priority_list *list,
+                              struct gomp_task *task)
+{
+  struct priority_node *p = list->tasks;
+  do
+    {
+      if (priority_node_to_task (type, p) == task)
+       return true;
+      p = p->next;
+    }
+  while (p != list->tasks);
+  return false;
+}
+
+/* Tree version of priority_queue_task_in_list_p.  */
+
+static inline bool
+priority_queue_task_in_tree_p (enum priority_queue_type type,
+                              struct priority_queue *head,
+                              struct gomp_task *task)
+{
+  struct priority_list *list
+    = priority_queue_lookup_priority (head, task->priority);
+  if (!list)
+    return false;
+  return priority_queue_task_in_list_p (type, list, task);
+}
+
+/* Generic version of priority_queue_task_in_list_p that works for
+   trees or lists.  */
+
+bool
+priority_queue_task_in_queue_p (enum priority_queue_type type,
+                               struct priority_queue *head,
+                               struct gomp_task *task)
+{
+  if (priority_queue_empty_p (head, MEMMODEL_RELAXED))
+    return false;
+  if (priority_queue_multi_p (head))
+    return priority_queue_task_in_tree_p (type, head, task);
+  else
+    return priority_queue_task_in_list_p (type, &head->l, task);
+}
+
+/* Sanity check LIST to make sure the tasks therein are in the right
+   order.  LIST is a priority list of type TYPE.
+
+   The expected order is that GOMP_TASK_WAITING tasks come before
+   GOMP_TASK_TIED/GOMP_TASK_ASYNC_RUNNING ones.
+
+   If CHECK_DEPS is TRUE, we also check that parent_depends_on WAITING
+   tasks come before !parent_depends_on WAITING tasks.  This is only
+   applicable to the children queue, and the caller is expected to
+   ensure that we are verifying the children queue.  */
+
+static void
+priority_list_verify (enum priority_queue_type type,
+                     struct priority_list *list, bool check_deps)
+{
+  bool seen_tied = false;
+  bool seen_plain_waiting = false;
+  struct priority_node *p = list->tasks;
+  while (1)
+    {
+      struct gomp_task *t = priority_node_to_task (type, p);
+      if (seen_tied && t->kind == GOMP_TASK_WAITING)
+       gomp_fatal ("priority_queue_verify: WAITING task after TIED");
+      if (t->kind >= GOMP_TASK_TIED)
+       seen_tied = true;
+      else if (check_deps && t->kind == GOMP_TASK_WAITING)
+       {
+         if (t->parent_depends_on)
+           {
+             if (seen_plain_waiting)
+               gomp_fatal ("priority_queue_verify: "
+                           "parent_depends_on after !parent_depends_on");
+           }
+         else
+           seen_plain_waiting = true;
+       }
+      p = p->next;
+      if (p == list->tasks)
+       break;
+    }
+}
+
+/* Callback type for priority_tree_verify_callback.  */
+struct cbtype
+{
+  enum priority_queue_type type;
+  bool check_deps;
+};
+
+/* Verify every task in NODE.
+
+   Callback for splay_tree_foreach.  */
+
+static void
+priority_tree_verify_callback (prio_splay_tree_key key, void *data)
+{
+  struct cbtype *cb = (struct cbtype *) data;
+  priority_list_verify (cb->type, &key->l, cb->check_deps);
+}
+
+/* Generic version of priority_list_verify.
+
+   Sanity check HEAD to make sure the tasks therein are in the right
+   order.  The priority_queue holds tasks of type TYPE.
+
+   If CHECK_DEPS is TRUE, we also check that parent_depends_on WAITING
+   tasks come before !parent_depends_on WAITING tasks.  This is only
+   applicable to the children queue, and the caller is expected to
+   ensure that we are verifying the children queue.  */
+
+void
+priority_queue_verify (enum priority_queue_type type,
+                      struct priority_queue *head, bool check_deps)
+{
+  if (priority_queue_empty_p (head, MEMMODEL_RELAXED))
+    return;
+  if (priority_queue_multi_p (head))
+    {
+      struct cbtype cb = { type, check_deps };
+      prio_splay_tree_foreach (&head->t,
+                              priority_tree_verify_callback, &cb);
+    }
+  else
+    priority_list_verify (type, &head->l, check_deps);
+}
+#endif /* _LIBGOMP_CHECKING_ */
+
+/* Remove NODE from priority queue HEAD, wherever it may be inside the
+   tree.  HEAD contains tasks of type TYPE.  */
+
+void
+priority_tree_remove (enum priority_queue_type type,
+                     struct priority_queue *head,
+                     struct priority_node *node)
+{
+  /* ?? The only reason this function is not inlined is because we
+     need to find the priority within gomp_task (which has not been
+     completely defined in the header file).  If the lack of inlining
+     is a concern, we could pass the priority number as a
+     parameter, or we could move this to libgomp.h.  */
+  int priority = priority_node_to_task (type, node)->priority;
+
+  /* ?? We could avoid this lookup by keeping a pointer to the key in
+     the priority_node.  */
+  struct priority_list *list
+    = priority_queue_lookup_priority (head, priority);
+#if _LIBGOMP_CHECKING_
+  if (!list)
+    gomp_fatal ("Unable to find priority %d", priority);
+#endif
+  /* If NODE was the last in its priority, clean up the priority.  */
+  if (priority_list_remove (list, node, MEMMODEL_RELAXED))
+    {
+      prio_splay_tree_remove (&head->t, (prio_splay_tree_key) list);
+      list->tasks = NULL;
+#if _LIBGOMP_CHECKING_
+      memset (list, 0xaf, sizeof (*list));
+#endif
+      free (list);
+    }
+}
+
+/* Return the highest priority WAITING task in a splay tree NODE.  If
+   there are no WAITING tasks available, return NULL.
+
+   NODE is a priority list containing tasks of type TYPE.
+
+   The right most node in a tree contains the highest priority.
+   Recurse down to find such a node.  If the task at that max node is
+   not WAITING, bubble back up and look at the remaining tasks
+   in-order.  */
+
+static struct gomp_task *
+priority_tree_next_task_1 (enum priority_queue_type type,
+                          prio_splay_tree_node node)
+{
+ again:
+  if (!node)
+    return NULL;
+  struct gomp_task *ret = priority_tree_next_task_1 (type, node->right);
+  if (ret)
+    return ret;
+  ret = priority_node_to_task (type, node->key.l.tasks);
+  if (ret->kind == GOMP_TASK_WAITING)
+    return ret;
+  node = node->left;
+  goto again;
+}
+
+/* Return the highest priority WAITING task from within Q1 and Q2,
+   while giving preference to tasks from Q1.  Q1 is a queue containing
+   items of type TYPE1.  Q2 is a queue containing items of type TYPE2.
+
+   Since we are mostly interested in Q1, if there are no WAITING tasks
+   in Q1, we don't bother checking Q2, and just return NULL.
+
+   As a special case, Q2 can be NULL, in which case, we just choose
+   the highest priority WAITING task in Q1.  This is an optimization
+   to speed up looking through only one queue.
+
+   If the returned task is chosen from Q1, *Q1_CHOSEN_P is set to
+   TRUE, otherwise it is set to FALSE.  */
+
+struct gomp_task *
+priority_tree_next_task (enum priority_queue_type type1,
+                        struct priority_queue *q1,
+                        enum priority_queue_type type2,
+                        struct priority_queue *q2,
+                        bool *q1_chosen_p)
+{
+  struct gomp_task *t1 = priority_tree_next_task_1 (type1, q1->t.root);
+  if (!t1
+      /* Special optimization when only searching through one queue.  */
+      || !q2)
+    {
+      *q1_chosen_p = true;
+      return t1;
+    }
+  struct gomp_task *t2 = priority_tree_next_task_1 (type2, q2->t.root);
+  if (!t2 || t1->priority > t2->priority)
+    {
+      *q1_chosen_p = true;
+      return t1;
+    }
+  if (t2->priority > t1->priority)
+    {
+      *q1_chosen_p = false;
+      return t2;
+    }
+  /* If we get here, the priorities are the same, so we must look at
+     parent_depends_on to make our decision.  */
+#if _LIBGOMP_CHECKING_
+  if (t1 != t2)
+    gomp_fatal ("priority_tree_next_task: t1 != t2");
+#endif
+  if (t2->parent_depends_on && !t1->parent_depends_on)
+    {
+      *q1_chosen_p = false;
+      return t2;
+    }
+  *q1_chosen_p = true;
+  return t1;
+}
+
+/* Priority splay trees comparison function.  */
+static inline int
+prio_splay_compare (prio_splay_tree_key x, prio_splay_tree_key y)
+{
+  if (x->l.priority == y->l.priority)
+    return 0;
+  return x->l.priority < y->l.priority ? -1 : 1;
+}
+
+/* Define another splay tree instantiation, for priority_list's.  */
+#define splay_tree_prefix prio
+#define splay_tree_c
+#include "splay-tree.h"
diff --git a/libgomp/priority_queue.h b/libgomp/priority_queue.h
new file mode 100644 (file)
index 0000000..e9c369b
--- /dev/null
@@ -0,0 +1,485 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Aldy Hernandez <aldyh@redhat.com>.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Header file for a priority queue of GOMP tasks.  */
+
+/* ?? Perhaps all the priority_tree_* functions are complex and rare
+   enough to go out-of-line and be moved to priority_queue.c.  ??  */
+
+#ifndef _PRIORITY_QUEUE_H_
+#define _PRIORITY_QUEUE_H_
+
+/* One task.  */
+
+struct priority_node
+{
+  /* Next and previous chains in a circular doubly linked list for
+     tasks within this task's priority.  */
+  struct priority_node *next, *prev;
+};
+
+/* All tasks within the same priority.  */
+
+struct priority_list
+{
+  /* Priority of the tasks in this set.  */
+  int priority;
+
+  /* Tasks.  */
+  struct priority_node *tasks;
+
+  /* This points to the last of the higher priority WAITING tasks.
+     Remember that for the children queue, we have:
+
+       parent_depends_on WAITING tasks.
+       !parent_depends_on WAITING tasks.
+       TIED tasks.
+
+     This is a pointer to the last of the parent_depends_on WAITING
+     tasks which are essentially, higher priority items within their
+     priority.  */
+  struct priority_node *last_parent_depends_on;
+};
+
+/* Another splay tree instantiation, for priority_list's.  */
+typedef struct prio_splay_tree_node_s *prio_splay_tree_node;
+typedef struct prio_splay_tree_s *prio_splay_tree;
+typedef struct prio_splay_tree_key_s *prio_splay_tree_key;
+struct prio_splay_tree_key_s {
+  /* This structure must only containing a priority_list, as we cast
+     prio_splay_tree_key to priority_list throughout.  */
+  struct priority_list l;
+};
+#define splay_tree_prefix prio
+#include "splay-tree.h"
+
+/* The entry point into a priority queue of tasks.
+
+   There are two alternate implementations with which to store tasks:
+   as a balanced tree of sorts, or as a simple list of tasks.  If
+   there are only priority-0 items (ROOT is NULL), we use the simple
+   list, otherwise (ROOT is non-NULL) we use the tree.  */
+
+struct priority_queue
+{
+  /* If t.root != NULL, this is a splay tree of priority_lists to hold
+     all tasks.  This is only used if multiple priorities are in play,
+     otherwise we use the priority_list `l' below to hold all
+     (priority-0) tasks.  */
+  struct prio_splay_tree_s t;
+
+  /* If T above is NULL, only priority-0 items exist, so keep them
+     in a simple list.  */
+  struct priority_list l;
+};
+
+enum priority_insert_type {
+  /* Insert at the beginning of a priority list.  */
+  PRIORITY_INSERT_BEGIN,
+  /* Insert at the end of a priority list.  */
+  PRIORITY_INSERT_END
+};
+
+/* Used to determine in which queue a given priority node belongs in.
+   See pnode field of gomp_task.  */
+
+enum priority_queue_type
+{
+  PQ_TEAM,         /* Node belongs in gomp_team's task_queue.  */
+  PQ_CHILDREN,     /* Node belongs in parent's children_queue.  */
+  PQ_TASKGROUP,            /* Node belongs in taskgroup->taskgroup_queue.  */
+  PQ_IGNORED = 999
+};
+
+/* Priority queue implementation prototypes.  */
+
+extern bool priority_queue_task_in_queue_p (enum priority_queue_type,
+                                           struct priority_queue *,
+                                           struct gomp_task *);
+extern void priority_queue_dump (enum priority_queue_type,
+                                struct priority_queue *);
+extern void priority_queue_verify (enum priority_queue_type,
+                                  struct priority_queue *, bool);
+extern void priority_tree_remove (enum priority_queue_type,
+                                 struct priority_queue *,
+                                 struct priority_node *);
+extern struct gomp_task *priority_tree_next_task (enum priority_queue_type,
+                                                 struct priority_queue *,
+                                                 enum priority_queue_type,
+                                                 struct priority_queue *,
+                                                 bool *);
+
+/* Return TRUE if there is more than one priority in HEAD.  This is
+   used throughout to to choose between the fast path (priority 0 only
+   items) and a world with multiple priorities.  */
+
+static inline bool
+priority_queue_multi_p (struct priority_queue *head)
+{
+  return __builtin_expect (head->t.root != NULL, 0);
+}
+
+/* Initialize a priority queue.  */
+
+static inline void
+priority_queue_init (struct priority_queue *head)
+{
+  head->t.root = NULL;
+  /* To save a few microseconds, we don't initialize head->l.priority
+     to 0 here.  It is implied that priority will be 0 if head->t.root
+     == NULL.
+
+     priority_tree_insert() will fix this when we encounter multiple
+     priorities.  */
+  head->l.tasks = NULL;
+  head->l.last_parent_depends_on = NULL;
+}
+
+static inline void
+priority_queue_free (struct priority_queue *head)
+{
+  /* There's nothing to do, as tasks were freed as they were removed
+     in priority_queue_remove.  */
+}
+
+/* Forward declarations.  */
+static inline size_t priority_queue_offset (enum priority_queue_type);
+static inline struct gomp_task *priority_node_to_task
+                               (enum priority_queue_type,
+                                struct priority_node *);
+static inline struct priority_node *task_to_priority_node
+                                   (enum priority_queue_type,
+                                    struct gomp_task *);
+
+/* Return TRUE if priority queue HEAD is empty.
+
+   MODEL IS MEMMODEL_ACQUIRE if we should use an acquire atomic to
+   read from the root of the queue, otherwise MEMMODEL_RELAXED if we
+   should use a plain load.  */
+
+static inline _Bool
+priority_queue_empty_p (struct priority_queue *head, enum memmodel model)
+{
+  /* Note: The acquire barriers on the loads here synchronize with
+     the write of a NULL in gomp_task_run_post_remove_parent.  It is
+     not necessary that we synchronize with other non-NULL writes at
+     this point, but we must ensure that all writes to memory by a
+     child thread task work function are seen before we exit from
+     GOMP_taskwait.  */
+  if (priority_queue_multi_p (head))
+    {
+      if (model == MEMMODEL_ACQUIRE)
+       return __atomic_load_n (&head->t.root, MEMMODEL_ACQUIRE) == NULL;
+      return head->t.root == NULL;
+    }
+  if (model == MEMMODEL_ACQUIRE)
+    return __atomic_load_n (&head->l.tasks, MEMMODEL_ACQUIRE) == NULL;
+  return head->l.tasks == NULL;
+}
+
+/* Look for a given PRIORITY in HEAD.  Return it if found, otherwise
+   return NULL.  This only applies to the tree variant in HEAD.  There
+   is no point in searching for priorities in HEAD->L.  */
+
+static inline struct priority_list *
+priority_queue_lookup_priority (struct priority_queue *head, int priority)
+{
+  if (head->t.root == NULL)
+    return NULL;
+  struct prio_splay_tree_key_s k;
+  k.l.priority = priority;
+  return (struct priority_list *)
+    prio_splay_tree_lookup (&head->t, &k);
+}
+
+/* Insert task in DATA, with PRIORITY, in the priority list in LIST.
+   LIST contains items of type TYPE.
+
+   If POS is PRIORITY_INSERT_BEGIN, the new task is inserted at the
+   top of its respective priority.  If POS is PRIORITY_INSERT_END, the
+   task is inserted at the end of its priority.
+
+   If ADJUST_PARENT_DEPENDS_ON is TRUE, LIST is a children queue, and
+   we must keep track of higher and lower priority WAITING tasks by
+   keeping the queue's last_parent_depends_on field accurate.  This
+   only applies to the children queue, and the caller must ensure LIST
+   is a children queue in this case.
+
+   If ADJUST_PARENT_DEPENDS_ON is TRUE, TASK_IS_PARENT_DEPENDS_ON is
+   set to the task's parent_depends_on field.  If
+   ADJUST_PARENT_DEPENDS_ON is FALSE, this field is irrelevant.
+
+   Return the new priority_node.  */
+
+static inline void
+priority_list_insert (enum priority_queue_type type,
+                     struct priority_list *list,
+                     struct gomp_task *task,
+                     int priority,
+                     enum priority_insert_type pos,
+                     bool adjust_parent_depends_on,
+                     bool task_is_parent_depends_on)
+{
+  struct priority_node *node = task_to_priority_node (type, task);
+  if (list->tasks)
+    {
+      /* If we are keeping track of higher/lower priority items,
+        but this is a lower priority WAITING task
+        (parent_depends_on != NULL), put it after all ready to
+        run tasks.  See the comment in
+        priority_queue_upgrade_task for a visual on how tasks
+        should be organized.  */
+      if (adjust_parent_depends_on
+         && pos == PRIORITY_INSERT_BEGIN
+         && list->last_parent_depends_on
+         && !task_is_parent_depends_on)
+       {
+         struct priority_node *last_parent_depends_on
+           = list->last_parent_depends_on;
+         node->next = last_parent_depends_on->next;
+         node->prev = last_parent_depends_on;
+       }
+      /* Otherwise, put it at the top/bottom of the queue.  */
+      else
+       {
+         node->next = list->tasks;
+         node->prev = list->tasks->prev;
+         if (pos == PRIORITY_INSERT_BEGIN)
+           list->tasks = node;
+       }
+      node->next->prev = node;
+      node->prev->next = node;
+    }
+  else
+    {
+      node->next = node;
+      node->prev = node;
+      list->tasks = node;
+    }
+  if (adjust_parent_depends_on
+      && list->last_parent_depends_on == NULL
+      && task_is_parent_depends_on)
+    list->last_parent_depends_on = node;
+}
+
+/* Tree version of priority_list_insert.  */
+
+static inline void
+priority_tree_insert (enum priority_queue_type type,
+                     struct priority_queue *head,
+                     struct gomp_task *task,
+                     int priority,
+                     enum priority_insert_type pos,
+                     bool adjust_parent_depends_on,
+                     bool task_is_parent_depends_on)
+{
+  if (__builtin_expect (head->t.root == NULL, 0))
+    {
+      /* The first time around, transfer any priority 0 items to the
+        tree.  */
+      if (head->l.tasks != NULL)
+       {
+         prio_splay_tree_node k = gomp_malloc (sizeof (*k));
+         k->left = NULL;
+         k->right = NULL;
+         k->key.l.priority = 0;
+         k->key.l.tasks = head->l.tasks;
+         k->key.l.last_parent_depends_on = head->l.last_parent_depends_on;
+         prio_splay_tree_insert (&head->t, k);
+         head->l.tasks = NULL;
+       }
+    }
+  struct priority_list *list
+    = priority_queue_lookup_priority (head, priority);
+  if (!list)
+    {
+      prio_splay_tree_node k = gomp_malloc (sizeof (*k));
+      k->left = NULL;
+      k->right = NULL;
+      k->key.l.priority = priority;
+      k->key.l.tasks = NULL;
+      k->key.l.last_parent_depends_on = NULL;
+      prio_splay_tree_insert (&head->t, k);
+      list = &k->key.l;
+    }
+  priority_list_insert (type, list, task, priority, pos,
+                       adjust_parent_depends_on,
+                       task_is_parent_depends_on);
+}
+
+/* Generic version of priority_*_insert.  */
+
+static inline void
+priority_queue_insert (enum priority_queue_type type,
+                      struct priority_queue *head,
+                      struct gomp_task *task,
+                      int priority,
+                      enum priority_insert_type pos,
+                      bool adjust_parent_depends_on,
+                      bool task_is_parent_depends_on)
+{
+#if _LIBGOMP_CHECKING_
+  if (priority_queue_task_in_queue_p (type, head, task))
+    gomp_fatal ("Attempt to insert existing task %p", task);
+#endif
+  if (priority_queue_multi_p (head) || __builtin_expect (priority > 0, 0))
+    priority_tree_insert (type, head, task, priority, pos,
+                         adjust_parent_depends_on,
+                         task_is_parent_depends_on);
+  else
+    priority_list_insert (type, &head->l, task, priority, pos,
+                         adjust_parent_depends_on,
+                         task_is_parent_depends_on);
+}
+
+/* If multiple priorities are in play, return the highest priority
+   task from within Q1 and Q2, while giving preference to tasks from
+   Q1.  If the returned task is chosen from Q1, *Q1_CHOSEN_P is set to
+   TRUE, otherwise it is set to FALSE.
+
+   If multiple priorities are not in play (only 0 priorities are
+   available), the next task is chosen exclusively from Q1.
+
+   As a special case, Q2 can be NULL, in which case, we just choose
+   the highest priority WAITING task in Q1.  This is an optimization
+   to speed up looking through only one queue.
+
+   We assume Q1 has at least one item.  */
+
+static inline struct gomp_task *
+priority_queue_next_task (enum priority_queue_type t1,
+                         struct priority_queue *q1,
+                         enum priority_queue_type t2,
+                         struct priority_queue *q2,
+                         bool *q1_chosen_p)
+{
+#if _LIBGOMP_CHECKING_
+  if (priority_queue_empty_p (q1, MEMMODEL_RELAXED))
+    gomp_fatal ("priority_queue_next_task: Q1 is empty");
+#endif
+  if (priority_queue_multi_p (q1))
+    {
+      struct gomp_task *t
+       = priority_tree_next_task (t1, q1, t2, q2, q1_chosen_p);
+      /* If T is NULL, there are no WAITING tasks in Q1.  In which
+        case, return any old (non-waiting) task which will cause the
+        caller to do the right thing when checking T->KIND ==
+        GOMP_TASK_WAITING.  */
+      if (!t)
+       {
+#if _LIBGOMP_CHECKING_
+         if (*q1_chosen_p == false)
+           gomp_fatal ("priority_queue_next_task inconsistency");
+#endif
+         return priority_node_to_task (t1, q1->t.root->key.l.tasks);
+       }
+      return t;
+    }
+  else
+    {
+      *q1_chosen_p = true;
+      return priority_node_to_task (t1, q1->l.tasks);
+    }
+}
+
+/* Remove NODE from LIST.
+
+   If we are removing the one and only item in the list, and MODEL is
+   MEMMODEL_RELEASE, use an atomic release to clear the list.
+
+   If the list becomes empty after the remove, return TRUE.  */
+
+static inline bool
+priority_list_remove (struct priority_list *list,
+                     struct priority_node *node,
+                     enum memmodel model)
+{
+  bool empty = false;
+  node->prev->next = node->next;
+  node->next->prev = node->prev;
+  if (list->tasks == node)
+    {
+      if (node->next != node)
+       list->tasks = node->next;
+      else
+       {
+         /* We access task->children in GOMP_taskwait outside of
+            the task lock mutex region, so need a release barrier
+            here to ensure memory written by child_task->fn above
+            is flushed before the NULL is written.  */
+         if (model == MEMMODEL_RELEASE)
+           __atomic_store_n (&list->tasks, NULL, MEMMODEL_RELEASE);
+         else
+           list->tasks = NULL;
+         empty = true;
+         goto remove_out;
+       }
+    }
+remove_out:
+#if _LIBGOMP_CHECKING_
+  memset (node, 0xaf, sizeof (*node));
+#endif
+  return empty;
+}
+
+/* This is the generic version of priority_list_remove.
+
+   Remove NODE from priority queue HEAD.  HEAD contains tasks of type TYPE.
+
+   If we are removing the one and only item in the priority queue and
+   MODEL is MEMMODEL_RELEASE, use an atomic release to clear the queue.
+
+   If the queue becomes empty after the remove, return TRUE.  */
+
+static inline bool
+priority_queue_remove (enum priority_queue_type type,
+                      struct priority_queue *head,
+                      struct gomp_task *task,
+                      enum memmodel model)
+{
+#if _LIBGOMP_CHECKING_
+  if (!priority_queue_task_in_queue_p (type, head, task))
+    gomp_fatal ("Attempt to remove missing task %p", task);
+#endif
+  if (priority_queue_multi_p (head))
+    {
+      priority_tree_remove (type, head, task_to_priority_node (type, task));
+      if (head->t.root == NULL)
+       {
+         if (model == MEMMODEL_RELEASE)
+           /* Errr, we store NULL twice, the alternative would be to
+              use an atomic release directly in the splay tree
+              routines.  Worth it?  */
+           __atomic_store_n (&head->t.root, NULL, MEMMODEL_RELEASE);
+         return true;
+       }
+      return false;
+    }
+  else
+    return priority_list_remove (&head->l,
+                                task_to_priority_node (type, task), model);
+}
+
+#endif /* _PRIORITY_QUEUE_H_ */
index 030ca8f6c450fc6ac0dbeda746ce29c85fc965e4..862bbb80aa57a2005a2fabb7ff184fe074d4eb69 100644 (file)
@@ -37,9 +37,6 @@
    are amortized O(log n) time for a tree with n nodes.  */
 
 #include "libgomp.h"
-#include "splay-tree.h"
-
-extern int splay_compare (splay_tree_key, splay_tree_key);
 
 /* Rotate the edge joining the left child N with its parent P.  PP is the
    grandparents' pointer to P.  */
@@ -215,3 +212,27 @@ splay_tree_lookup (splay_tree sp, splay_tree_key key)
   else
     return NULL;
 }
+
+/* Helper function for splay_tree_foreach.
+
+   Run FUNC on every node in KEY.  */
+
+static void
+splay_tree_foreach_internal (splay_tree_node node, splay_tree_callback func,
+                            void *data)
+{
+  if (!node)
+    return;
+  func (&node->key, data);
+  splay_tree_foreach_internal (node->left, func, data);
+  /* Yeah, whatever.  GCC can fix my tail recursion.  */
+  splay_tree_foreach_internal (node->right, func, data);
+}
+
+/* Run FUNC on each of the nodes in SP.  */
+
+attribute_hidden void
+splay_tree_foreach (splay_tree sp, splay_tree_callback func, void *data)
+{
+  splay_tree_foreach_internal (sp->root, func, data);
+}
index 085021cf3d642b6a0987e4d9080b55af149f6ce0..92c51bf930805c520ad871e2282f23f1736d2cc9 100644 (file)
@@ -33,7 +33,17 @@ typedef struct splay_tree_node_s *splay_tree_node;
 typedef struct splay_tree_s *splay_tree;
 typedef struct splay_tree_key_s *splay_tree_key;
    define splay_tree_key_s structure, and define
-   splay_compare inline function.  */
+   splay_compare inline function.
+
+   Alternatively, they can define splay_tree_prefix macro before
+   including this header and then all the above types, the
+   splay_compare function and the splay_tree_{lookup,insert_remove}
+   function will be prefixed by that prefix.  If splay_tree_prefix
+   macro is defined, this header must be included twice: once where
+   you need the header file definitions, and once where you need the
+   .c implementation routines.  In the latter case, you must also
+   define the macro splay_tree_c.  See the include of splay-tree.h in
+   priority_queue.[hc] for an example.  */
 
 /* For an easily readable description of splay-trees, see:
 
@@ -43,8 +53,37 @@ typedef struct splay_tree_key_s *splay_tree_key;
    The major feature of splay trees is that all basic tree operations
    are amortized O(log n) time for a tree with n nodes.  */
 
-#ifndef _SPLAY_TREE_H
-#define _SPLAY_TREE_H 1
+#ifdef splay_tree_prefix
+# define splay_tree_name_1(prefix, name) prefix ## _ ## name
+# define splay_tree_name(prefix, name) splay_tree_name_1 (prefix, name)
+# define splay_tree_node_s     \
+    splay_tree_name (splay_tree_prefix, splay_tree_node_s)
+# define splay_tree_s          \
+    splay_tree_name (splay_tree_prefix, splay_tree_s)
+# define splay_tree_key_s      \
+    splay_tree_name (splay_tree_prefix, splay_tree_key_s)
+# define splay_tree_node       \
+    splay_tree_name (splay_tree_prefix, splay_tree_node)
+# define splay_tree            \
+    splay_tree_name (splay_tree_prefix, splay_tree)
+# define splay_tree_key                \
+    splay_tree_name (splay_tree_prefix, splay_tree_key)
+# define splay_compare         \
+    splay_tree_name (splay_tree_prefix, splay_compare)
+# define splay_tree_lookup     \
+    splay_tree_name (splay_tree_prefix, splay_tree_lookup)
+# define splay_tree_insert     \
+    splay_tree_name (splay_tree_prefix, splay_tree_insert)
+# define splay_tree_remove     \
+    splay_tree_name (splay_tree_prefix, splay_tree_remove)
+# define splay_tree_foreach    \
+    splay_tree_name (splay_tree_prefix, splay_tree_foreach)
+# define splay_tree_callback   \
+    splay_tree_name (splay_tree_prefix, splay_tree_callback)
+#endif
+
+#ifndef splay_tree_c
+/* Header file definitions and prototypes.  */
 
 /* The nodes in the splay tree.  */
 struct splay_tree_node_s {
@@ -59,8 +98,33 @@ struct splay_tree_s {
   splay_tree_node root;
 };
 
+typedef void (*splay_tree_callback) (splay_tree_key, void *);
+
 extern splay_tree_key splay_tree_lookup (splay_tree, splay_tree_key);
 extern void splay_tree_insert (splay_tree, splay_tree_node);
 extern void splay_tree_remove (splay_tree, splay_tree_key);
+extern void splay_tree_foreach (splay_tree, splay_tree_callback, void *);
+#else  /* splay_tree_c */
+#  ifdef splay_tree_prefix
+#    include "splay-tree.c"
+#    undef splay_tree_name_1
+#    undef splay_tree_name
+#    undef splay_tree_node_s
+#    undef splay_tree_s
+#    undef splay_tree_key_s
+#    undef splay_tree_node
+#    undef splay_tree
+#    undef splay_tree_key
+#    undef splay_compare
+#    undef splay_tree_lookup
+#    undef splay_tree_insert
+#    undef splay_tree_remove
+#    undef splay_tree_foreach
+#    undef splay_tree_callback
+#    undef splay_tree_c
+#  endif
+#endif /* #ifndef splay_tree_c */
 
-#endif /* _SPLAY_TREE_H */
+#ifdef splay_tree_prefix
+#  undef splay_tree_prefix
+#endif
index 1bddc6ffcee5fa72477c68844c8256af2764c75f..cf9d0e64c7c241d9870642ee9954da3d74fb6827 100644 (file)
@@ -92,23 +92,6 @@ gomp_realloc_unlock (void *old, size_t size)
   return ret;
 }
 
-/* The comparison function.  */
-
-attribute_hidden int
-splay_compare (splay_tree_key x, splay_tree_key y)
-{
-  if (x->host_start == x->host_end
-      && y->host_start == y->host_end)
-    return 0;
-  if (x->host_end <= y->host_start)
-    return -1;
-  if (x->host_start >= y->host_end)
-    return 1;
-  return 0;
-}
-
-#include "splay-tree.h"
-
 attribute_hidden void
 gomp_init_targets_once (void)
 {
@@ -1365,17 +1348,7 @@ GOMP_target (int device, void (*fn) (void *), const void *unused,
   struct target_mem_desc *tgt_vars
     = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, false,
                     GOMP_MAP_VARS_TARGET);
-  struct gomp_thread old_thr, *thr = gomp_thread ();
-  old_thr = *thr;
-  memset (thr, '\0', sizeof (*thr));
-  if (gomp_places_list)
-    {
-      thr->place = old_thr.place;
-      thr->ts.place_partition_len = gomp_places_list_len;
-    }
   devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start);
-  gomp_free_thread (thr);
-  *thr = old_thr;
   gomp_unmap_vars (tgt_vars, true);
 }
 
@@ -1404,10 +1377,52 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
   (void) num_teams;
   (void) thread_limit;
 
-  /* If there are depend clauses, but nowait is not present,
-     block the parent task until the dependencies are resolved
-     and then just continue with the rest of the function as if it
-     is a merged task.  */
+  if (flags & GOMP_TARGET_FLAG_NOWAIT)
+    {
+      struct gomp_thread *thr = gomp_thread ();
+      /* Create a team if we don't have any around, as nowait
+        target tasks make sense to run asynchronously even when
+        outside of any parallel.  */
+      if (__builtin_expect (thr->ts.team == NULL, 0))
+       {
+         struct gomp_team *team = gomp_new_team (1);
+         struct gomp_task *task = thr->task;
+         struct gomp_task_icv *icv = task ? &task->icv : &gomp_global_icv;
+         team->prev_ts = thr->ts;
+         thr->ts.team = team;
+         thr->ts.team_id = 0;
+         thr->ts.work_share = &team->work_shares[0];
+         thr->ts.last_work_share = NULL;
+#ifdef HAVE_SYNC_BUILTINS
+         thr->ts.single_count = 0;
+#endif
+         thr->ts.static_trip = 0;
+         thr->task = &team->implicit_task[0];
+         gomp_init_task (thr->task, NULL, icv);
+         if (task)
+           {
+             thr->task = task;
+             gomp_end_task ();
+             free (task);
+             thr->task = &team->implicit_task[0];
+           }
+         else
+           pthread_setspecific (gomp_thread_destructor, thr);
+       }
+      if (thr->ts.team
+         && !thr->task->final_task)
+       {
+         gomp_create_target_task (devicep, fn, mapnum, hostaddrs,
+                                  sizes, kinds, flags, depend,
+                                  GOMP_TARGET_TASK_BEFORE_MAP);
+         return;
+       }
+    }
+
+  /* If there are depend clauses, but nowait is not present
+     (or we are in a final task), block the parent task until the
+     dependencies are resolved and then just continue with the rest
+     of the function as if it is a merged task.  */
   if (depend != NULL)
     {
       struct gomp_thread *thr = gomp_thread ();
@@ -1427,17 +1442,7 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
   struct target_mem_desc *tgt_vars
     = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, true,
                     GOMP_MAP_VARS_TARGET);
-  struct gomp_thread old_thr, *thr = gomp_thread ();
-  old_thr = *thr;
-  memset (thr, '\0', sizeof (*thr));
-  if (gomp_places_list)
-    {
-      thr->place = old_thr.place;
-      thr->ts.place_partition_len = gomp_places_list_len;
-    }
   devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start);
-  gomp_free_thread (thr);
-  *thr = old_thr;
   gomp_unmap_vars (tgt_vars, true);
 }
 
@@ -1544,23 +1549,25 @@ GOMP_target_update_ext (int device, size_t mapnum, void **hostaddrs,
              && thr->ts.team
              && !thr->task->final_task)
            {
-             gomp_create_target_task (devicep, (void (*) (void *)) NULL,
-                                      mapnum, hostaddrs, sizes, kinds,
-                                      flags | GOMP_TARGET_FLAG_UPDATE,
-                                      depend);
-             return;
+             if (gomp_create_target_task (devicep, (void (*) (void *)) NULL,
+                                          mapnum, hostaddrs, sizes, kinds,
+                                          flags | GOMP_TARGET_FLAG_UPDATE,
+                                          depend, GOMP_TARGET_TASK_DATA))
+               return;
+           }
+         else
+           {
+             struct gomp_team *team = thr->ts.team;
+             /* If parallel or taskgroup has been cancelled, don't start new
+                tasks.  */
+             if (team
+                 && (gomp_team_barrier_cancelled (&team->barrier)
+                     || (thr->task->taskgroup
+                         && thr->task->taskgroup->cancelled)))
+               return;
+
+             gomp_task_maybe_wait_for_dependencies (depend);
            }
-
-         struct gomp_team *team = thr->ts.team;
-         /* If parallel or taskgroup has been cancelled, don't start new
-            tasks.  */
-         if (team
-             && (gomp_team_barrier_cancelled (&team->barrier)
-                 || (thr->task->taskgroup
-                     && thr->task->taskgroup->cancelled)))
-           return;
-
-         gomp_task_maybe_wait_for_dependencies (depend);
        }
     }
 
@@ -1664,22 +1671,25 @@ GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs,
              && thr->ts.team
              && !thr->task->final_task)
            {
-             gomp_create_target_task (devicep, (void (*) (void *)) NULL,
-                                      mapnum, hostaddrs, sizes, kinds,
-                                      flags, depend);
-             return;
+             if (gomp_create_target_task (devicep, (void (*) (void *)) NULL,
+                                          mapnum, hostaddrs, sizes, kinds,
+                                          flags, depend,
+                                          GOMP_TARGET_TASK_DATA))
+               return;
+           }
+         else
+           {
+             struct gomp_team *team = thr->ts.team;
+             /* If parallel or taskgroup has been cancelled, don't start new
+                tasks.  */
+             if (team
+                 && (gomp_team_barrier_cancelled (&team->barrier)
+                     || (thr->task->taskgroup
+                         && thr->task->taskgroup->cancelled)))
+               return;
+
+             gomp_task_maybe_wait_for_dependencies (depend);
            }
-
-         struct gomp_team *team = thr->ts.team;
-         /* If parallel or taskgroup has been cancelled, don't start new
-            tasks.  */
-         if (team
-             && (gomp_team_barrier_cancelled (&team->barrier)
-                 || (thr->task->taskgroup
-                     && thr->task->taskgroup->cancelled)))
-           return;
-
-         gomp_task_maybe_wait_for_dependencies (depend);
        }
     }
 
@@ -1711,38 +1721,65 @@ GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs,
     gomp_exit_data (devicep, mapnum, hostaddrs, sizes, kinds);
 }
 
-void
+bool
 gomp_target_task_fn (void *data)
 {
   struct gomp_target_task *ttask = (struct gomp_target_task *) data;
+  struct gomp_device_descr *devicep = ttask->devicep;
+
   if (ttask->fn != NULL)
     {
-      /* GOMP_target_ext */
+      if (devicep == NULL
+         || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+       {
+         ttask->state = GOMP_TARGET_TASK_FALLBACK;
+         gomp_target_fallback_firstprivate (ttask->fn, ttask->mapnum,
+                                            ttask->hostaddrs, ttask->sizes,
+                                            ttask->kinds);
+         return false;
+       }
+
+      if (ttask->state == GOMP_TARGET_TASK_FINISHED)
+       {
+         gomp_unmap_vars (ttask->tgt, true);
+         return false;
+       }
+
+      void *fn_addr = gomp_get_target_fn_addr (devicep, ttask->fn);
+      ttask->tgt
+       = gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs, NULL,
+                        ttask->sizes, ttask->kinds, true,
+                        GOMP_MAP_VARS_TARGET);
+      ttask->state = GOMP_TARGET_TASK_READY_TO_RUN;
+
+      devicep->async_run_func (devicep->target_id, fn_addr,
+                              (void *) ttask->tgt->tgt_start, (void *) ttask);
+      return true;
     }
-  else if (ttask->devicep == NULL
-          || !(ttask->devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
-    return;
+  else if (devicep == NULL
+          || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    return false;
 
   size_t i;
   if (ttask->flags & GOMP_TARGET_FLAG_UPDATE)
-    gomp_update (ttask->devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+    gomp_update (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
                 ttask->kinds, true);
   else if ((ttask->flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0)
     for (i = 0; i < ttask->mapnum; i++)
       if ((ttask->kinds[i] & 0xff) == GOMP_MAP_STRUCT)
        {
-         gomp_map_vars (ttask->devicep, ttask->sizes[i] + 1,
-                        &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
-                        &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
+         gomp_map_vars (devicep, ttask->sizes[i] + 1, &ttask->hostaddrs[i],
+                        NULL, &ttask->sizes[i], &ttask->kinds[i], true,
+                        GOMP_MAP_VARS_ENTER_DATA);
          i += ttask->sizes[i];
        }
       else
-       gomp_map_vars (ttask->devicep, 1, &ttask->hostaddrs[i], NULL,
-                      &ttask->sizes[i], &ttask->kinds[i],
-                      true, GOMP_MAP_VARS_ENTER_DATA);
+       gomp_map_vars (devicep, 1, &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
+                      &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
   else
-    gomp_exit_data (ttask->devicep, ttask->mapnum, ttask->hostaddrs,
-                   ttask->sizes, ttask->kinds);
+    gomp_exit_data (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+                   ttask->kinds);
+  return false;
 }
 
 void
@@ -2187,6 +2224,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
     {
       DLSYM (run);
+      DLSYM (async_run);
       DLSYM (dev2dev);
     }
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
index 1246c6ae3187fbeabd21a21bce634cbeef3a7d09..541008d016b39f1c5ce3d1e5c114d564123d8a59 100644 (file)
@@ -65,6 +65,14 @@ void
 gomp_init_task (struct gomp_task *task, struct gomp_task *parent_task,
                struct gomp_task_icv *prev_icv)
 {
+  /* It would seem that using memset here would be a win, but it turns
+     out that partially filling gomp_task allows us to keep the
+     overhead of task creation low.  In the nqueens-1.c test, for a
+     sufficiently large N, we drop the overhead from 5-6% to 1%.
+
+     Note, the nqueens-1.c test in serial mode is a good test to
+     benchmark the overhead of creating tasks as there are millions of
+     tiny tasks created that all run undeferred.  */
   task->parent = parent_task;
   task->icv = *prev_icv;
   task->kind = GOMP_TASK_IMPLICIT;
@@ -73,7 +81,7 @@ gomp_init_task (struct gomp_task *task, struct gomp_task *parent_task,
   task->final_task = false;
   task->copy_ctors_done = false;
   task->parent_depends_on = false;
-  task->children = NULL;
+  priority_queue_init (&task->children_queue);
   task->taskgroup = NULL;
   task->dependers = NULL;
   task->depend_hash = NULL;
@@ -92,24 +100,66 @@ gomp_end_task (void)
   thr->task = task->parent;
 }
 
-/* Orphan the task in CHILDREN and all its siblings.  */
+/* Clear the parent field of every task in LIST.  */
 
 static inline void
-gomp_clear_parent (struct gomp_task *children)
+gomp_clear_parent_in_list (struct priority_list *list)
 {
-  struct gomp_task *task = children;
-
-  if (task)
+  struct priority_node *p = list->tasks;
+  if (p)
     do
       {
-       task->parent = NULL;
-       task = task->next_child;
+       priority_node_to_task (PQ_CHILDREN, p)->parent = NULL;
+       p = p->next;
       }
-    while (task != children);
+    while (p != list->tasks);
 }
 
-/* Helper function for GOMP_task and gomp_create_target_task.  Depend clause
-   handling for undeferred task creation.  */
+/* Splay tree version of gomp_clear_parent_in_list.
+
+   Clear the parent field of every task in NODE within SP, and free
+   the node when done.  */
+
+static void
+gomp_clear_parent_in_tree (prio_splay_tree sp, prio_splay_tree_node node)
+{
+  if (!node)
+    return;
+  prio_splay_tree_node left = node->left, right = node->right;
+  gomp_clear_parent_in_list (&node->key.l);
+#if _LIBGOMP_CHECKING_
+  memset (node, 0xaf, sizeof (*node));
+#endif
+  /* No need to remove the node from the tree.  We're nuking
+     everything, so just free the nodes and our caller can clear the
+     entire splay tree.  */
+  free (node);
+  gomp_clear_parent_in_tree (sp, left);
+  gomp_clear_parent_in_tree (sp, right);
+}
+
+/* Clear the parent field of every task in Q and remove every task
+   from Q.  */
+
+static inline void
+gomp_clear_parent (struct priority_queue *q)
+{
+  if (priority_queue_multi_p (q))
+    {
+      gomp_clear_parent_in_tree (&q->t, q->t.root);
+      /* All the nodes have been cleared in gomp_clear_parent_in_tree.
+        No need to remove anything.  We can just nuke everything.  */
+      q->t.root = NULL;
+    }
+  else
+    gomp_clear_parent_in_list (&q->l);
+}
+
+/* Helper function for GOMP_task and gomp_create_target_task.
+
+   For a TASK with in/out dependencies, fill in the various dependency
+   queues.  PARENT is the parent of said task.  DEPEND is as in
+   GOMP_task.  */
 
 static void
 gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent,
@@ -260,8 +310,8 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
 
   if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0)
     priority = 0;
-  /* FIXME, use priority.  */
-  (void) priority;
+  else if (priority > gomp_max_task_priority_var)
+    priority = gomp_max_task_priority_var;
 
   if (!if_clause || team == NULL
       || (thr->task && thr->task->final_task)
@@ -283,6 +333,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
       task.kind = GOMP_TASK_UNDEFERRED;
       task.final_task = (thr->task && thr->task->final_task)
                        || (flags & GOMP_TASK_FLAG_FINAL);
+      task.priority = priority;
       if (thr->task)
        {
          task.in_tied_task = thr->task->in_tied_task;
@@ -308,10 +359,10 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
         child thread, but seeing a stale non-NULL value is not a
         problem.  Once past the task_lock acquisition, this thread
         will see the real value of task.children.  */
-      if (task.children != NULL)
+      if (!priority_queue_empty_p (&task.children_queue, MEMMODEL_RELAXED))
        {
          gomp_mutex_lock (&team->task_lock);
-         gomp_clear_parent (task.children);
+         gomp_clear_parent (&task.children_queue);
          gomp_mutex_unlock (&team->task_lock);
        }
       gomp_end_task ();
@@ -333,6 +384,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
       arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
                      & ~(uintptr_t) (arg_align - 1));
       gomp_init_task (task, parent, gomp_icv (false));
+      task->priority = priority;
       task->kind = GOMP_TASK_UNDEFERRED;
       task->in_tied_task = parent->in_tied_task;
       task->taskgroup = taskgroup;
@@ -368,53 +420,36 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
          gomp_task_handle_depend (task, parent, depend);
          if (task->num_dependees)
            {
+             /* Tasks that depend on other tasks are not put into the
+                various waiting queues, so we are done for now.  Said
+                tasks are instead put into the queues via
+                gomp_task_run_post_handle_dependers() after their
+                dependencies have been satisfied.  After which, they
+                can be picked up by the various scheduling
+                points.  */
              gomp_mutex_unlock (&team->task_lock);
              return;
            }
        }
-      if (parent->children)
-       {
-         task->next_child = parent->children;
-         task->prev_child = parent->children->prev_child;
-         task->next_child->prev_child = task;
-         task->prev_child->next_child = task;
-       }
-      else
-       {
-         task->next_child = task;
-         task->prev_child = task;
-       }
-      parent->children = task;
+
+      priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
+                            task, priority,
+                            PRIORITY_INSERT_BEGIN,
+                            /*adjust_parent_depends_on=*/false,
+                            task->parent_depends_on);
       if (taskgroup)
-       {
-         /* If applicable, place task into its taskgroup.  */
-         if (taskgroup->children)
-           {
-             task->next_taskgroup = taskgroup->children;
-             task->prev_taskgroup = taskgroup->children->prev_taskgroup;
-             task->next_taskgroup->prev_taskgroup = task;
-             task->prev_taskgroup->next_taskgroup = task;
-           }
-         else
-           {
-             task->next_taskgroup = task;
-             task->prev_taskgroup = task;
-           }
-         taskgroup->children = task;
-       }
-      if (team->task_queue)
-       {
-         task->next_queue = team->task_queue;
-         task->prev_queue = team->task_queue->prev_queue;
-         task->next_queue->prev_queue = task;
-         task->prev_queue->next_queue = task;
-       }
-      else
-       {
-         task->next_queue = task;
-         task->prev_queue = task;
-         team->task_queue = task;
-       }
+       priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
+                              task, priority,
+                              PRIORITY_INSERT_BEGIN,
+                              /*adjust_parent_depends_on=*/false,
+                              task->parent_depends_on);
+
+      priority_queue_insert (PQ_TEAM, &team->task_queue,
+                            task, priority,
+                            PRIORITY_INSERT_END,
+                            /*adjust_parent_depends_on=*/false,
+                            task->parent_depends_on);
+
       ++team->task_count;
       ++team->task_queued_count;
       gomp_team_barrier_set_task_pending (&team->barrier);
@@ -445,13 +480,119 @@ ialias (GOMP_taskgroup_end)
 #undef UTYPE
 #undef GOMP_taskloop
 
-/* Called for nowait target tasks.  */
+static void inline
+priority_queue_move_task_first (enum priority_queue_type type,
+                               struct priority_queue *head,
+                               struct gomp_task *task)
+{
+#if _LIBGOMP_CHECKING_
+  if (!priority_queue_task_in_queue_p (type, head, task))
+    gomp_fatal ("Attempt to move first missing task %p", task);
+#endif
+  struct priority_list *list;
+  if (priority_queue_multi_p (head))
+    {
+      list = priority_queue_lookup_priority (head, task->priority);
+#if _LIBGOMP_CHECKING_
+      if (!list)
+       gomp_fatal ("Unable to find priority %d", task->priority);
+#endif
+    }
+  else
+    list = &head->l;
+  priority_list_remove (list, task_to_priority_node (type, task), 0);
+  priority_list_insert (type, list, task, task->priority,
+                       PRIORITY_INSERT_BEGIN, type == PQ_CHILDREN,
+                       task->parent_depends_on);
+}
+
+/* Actual body of GOMP_PLUGIN_target_task_completion that is executed
+   with team->task_lock held, or is executed in the thread that called
+   gomp_target_task_fn if GOMP_PLUGIN_target_task_completion has been
+   run before it acquires team->task_lock.  */
+
+static void
+gomp_target_task_completion (struct gomp_team *team, struct gomp_task *task)
+{
+  struct gomp_task *parent = task->parent;
+  if (parent)
+    priority_queue_move_task_first (PQ_CHILDREN, &parent->children_queue,
+                                   task);
+
+  struct gomp_taskgroup *taskgroup = task->taskgroup;
+  if (taskgroup)
+    priority_queue_move_task_first (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
+                                   task);
+
+  priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority,
+                        PRIORITY_INSERT_BEGIN, false,
+                        task->parent_depends_on);
+  task->kind = GOMP_TASK_WAITING;
+  if (parent && parent->taskwait)
+    {
+      if (parent->taskwait->in_taskwait)
+       {
+         /* One more task has had its dependencies met.
+            Inform any waiters.  */
+         parent->taskwait->in_taskwait = false;
+         gomp_sem_post (&parent->taskwait->taskwait_sem);
+       }
+      else if (parent->taskwait->in_depend_wait)
+       {
+         /* One more task has had its dependencies met.
+            Inform any waiters.  */
+         parent->taskwait->in_depend_wait = false;
+         gomp_sem_post (&parent->taskwait->taskwait_sem);
+       }
+    }
+  if (taskgroup && taskgroup->in_taskgroup_wait)
+    {
+      /* One more task has had its dependencies met.
+        Inform any waiters.  */
+      taskgroup->in_taskgroup_wait = false;
+      gomp_sem_post (&taskgroup->taskgroup_sem);
+    }
+
+  ++team->task_queued_count;
+  gomp_team_barrier_set_task_pending (&team->barrier);
+  /* I'm afraid this can't be done after releasing team->task_lock,
+     as gomp_target_task_completion is run from unrelated thread and
+     therefore in between gomp_mutex_unlock and gomp_team_barrier_wake
+     the team could be gone already.  */
+  if (team->nthreads > team->task_running_count)
+    gomp_team_barrier_wake (&team->barrier, 1);
+}
+
+/* Signal that a target task TTASK has completed the asynchronously
+   running phase and should be requeued as a task to handle the
+   variable unmapping.  */
 
 void
+GOMP_PLUGIN_target_task_completion (void *data)
+{
+  struct gomp_target_task *ttask = (struct gomp_target_task *) data;
+  struct gomp_task *task = ttask->task;
+  struct gomp_team *team = ttask->team;
+
+  gomp_mutex_lock (&team->task_lock);
+  if (ttask->state == GOMP_TARGET_TASK_READY_TO_RUN)
+    {
+      ttask->state = GOMP_TARGET_TASK_FINISHED;
+      gomp_mutex_unlock (&team->task_lock);
+    }
+  ttask->state = GOMP_TARGET_TASK_FINISHED;
+  gomp_target_task_completion (team, task);
+  gomp_mutex_unlock (&team->task_lock);
+}
+
+/* Called for nowait target tasks.  */
+
+bool
 gomp_create_target_task (struct gomp_device_descr *devicep,
                         void (*fn) (void *), size_t mapnum, void **hostaddrs,
                         size_t *sizes, unsigned short *kinds,
-                        unsigned int flags, void **depend)
+                        unsigned int flags, void **depend,
+                        enum gomp_target_task_state state)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -460,7 +601,7 @@ gomp_create_target_task (struct gomp_device_descr *devicep,
   if (team
       && (gomp_team_barrier_cancelled (&team->barrier)
          || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
-    return;
+    return true;
 
   struct gomp_target_task *ttask;
   struct gomp_task *task;
@@ -468,19 +609,45 @@ gomp_create_target_task (struct gomp_device_descr *devicep,
   struct gomp_taskgroup *taskgroup = parent->taskgroup;
   bool do_wake;
   size_t depend_size = 0;
+  uintptr_t depend_cnt = 0;
+  size_t tgt_align = 0, tgt_size = 0;
 
   if (depend != NULL)
-    depend_size = ((uintptr_t) depend[0]
-                  * sizeof (struct gomp_task_depend_entry));
+    {
+      depend_cnt = (uintptr_t) depend[0];
+      depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry);
+    }
+  if (fn)
+    {
+      /* GOMP_MAP_FIRSTPRIVATE need to be copied first, as they are
+        firstprivate on the target task.  */
+      size_t i;
+      for (i = 0; i < mapnum; i++)
+       if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+         {
+           size_t align = (size_t) 1 << (kinds[i] >> 8);
+           if (tgt_align < align)
+             tgt_align = align;
+           tgt_size = (tgt_size + align - 1) & ~(align - 1);
+           tgt_size += sizes[i];
+         }
+      if (tgt_align)
+       tgt_size += tgt_align - 1;
+      else
+       tgt_size = 0;
+    }
+
   task = gomp_malloc (sizeof (*task) + depend_size
                      + sizeof (*ttask)
                      + mapnum * (sizeof (void *) + sizeof (size_t)
-                                 + sizeof (unsigned short)));
+                                 + sizeof (unsigned short))
+                     + tgt_size);
   gomp_init_task (task, parent, gomp_icv (false));
+  task->priority = 0;
   task->kind = GOMP_TASK_WAITING;
   task->in_tied_task = parent->in_tied_task;
   task->taskgroup = taskgroup;
-  ttask = (struct gomp_target_task *) &task->depend[(uintptr_t) depend[0]];
+  ttask = (struct gomp_target_task *) &task->depend[depend_cnt];
   ttask->devicep = devicep;
   ttask->fn = fn;
   ttask->mapnum = mapnum;
@@ -489,8 +656,29 @@ gomp_create_target_task (struct gomp_device_descr *devicep,
   memcpy (ttask->sizes, sizes, mapnum * sizeof (size_t));
   ttask->kinds = (unsigned short *) &ttask->sizes[mapnum];
   memcpy (ttask->kinds, kinds, mapnum * sizeof (unsigned short));
+  if (tgt_align)
+    {
+      char *tgt = (char *) &ttask->kinds[mapnum];
+      size_t i;
+      uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
+      if (al)
+       tgt += tgt_align - al;
+      tgt_size = 0;
+      for (i = 0; i < mapnum; i++)
+       if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+         {
+           size_t align = (size_t) 1 << (kinds[i] >> 8);
+           tgt_size = (tgt_size + align - 1) & ~(align - 1);
+           memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
+           ttask->hostaddrs[i] = tgt + tgt_size;
+           tgt_size = tgt_size + sizes[i];
+         }
+    }
   ttask->flags = flags;
-  task->fn = gomp_target_task_fn;
+  ttask->state = state;
+  ttask->task = task;
+  ttask->team = team;
+  task->fn = NULL;
   task->fn_data = ttask;
   task->final_task = 0;
   gomp_mutex_lock (&team->task_lock);
@@ -501,62 +689,78 @@ gomp_create_target_task (struct gomp_device_descr *devicep,
       gomp_mutex_unlock (&team->task_lock);
       gomp_finish_task (task);
       free (task);
-      return;
+      return true;
     }
-  if (taskgroup)
-    taskgroup->num_children++;
   if (depend_size)
     {
       gomp_task_handle_depend (task, parent, depend);
       if (task->num_dependees)
        {
+         if (taskgroup)
+           taskgroup->num_children++;
          gomp_mutex_unlock (&team->task_lock);
-         return;
+         return true;
        }
     }
-  if (parent->children)
-    {
-      task->next_child = parent->children;
-      task->prev_child = parent->children->prev_child;
-      task->next_child->prev_child = task;
-      task->prev_child->next_child = task;
-    }
-  else
+  if (state == GOMP_TARGET_TASK_DATA)
     {
-      task->next_child = task;
-      task->prev_child = task;
+      gomp_mutex_unlock (&team->task_lock);
+      gomp_finish_task (task);
+      free (task);
+      return false;
     }
-  parent->children = task;
   if (taskgroup)
+    taskgroup->num_children++;
+  /* For async offloading, if we don't need to wait for dependencies,
+     run the gomp_target_task_fn right away, essentially schedule the
+     mapping part of the task in the current thread.  */
+  if (devicep != NULL
+      && (devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
     {
-      /* If applicable, place task into its taskgroup.  */
-      if (taskgroup->children)
-       {
-         task->next_taskgroup = taskgroup->children;
-         task->prev_taskgroup = taskgroup->children->prev_taskgroup;
-         task->next_taskgroup->prev_taskgroup = task;
-         task->prev_taskgroup->next_taskgroup = task;
-       }
+      priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0,
+                            PRIORITY_INSERT_END,
+                            /*adjust_parent_depends_on=*/false,
+                            task->parent_depends_on);
+      if (taskgroup)
+       priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
+                              task, 0, PRIORITY_INSERT_END,
+                              /*adjust_parent_depends_on=*/false,
+                              task->parent_depends_on);
+      task->pnode[PQ_TEAM].next = NULL;
+      task->pnode[PQ_TEAM].prev = NULL;
+      task->kind = GOMP_TASK_TIED;
+      ++team->task_count;
+      gomp_mutex_unlock (&team->task_lock);
+
+      thr->task = task;
+      gomp_target_task_fn (task->fn_data);
+      thr->task = parent;
+
+      gomp_mutex_lock (&team->task_lock);
+      task->kind = GOMP_TASK_ASYNC_RUNNING;
+      /* If GOMP_PLUGIN_target_task_completion has run already
+        in between gomp_target_task_fn and the mutex lock,
+        perform the requeuing here.  */
+      if (ttask->state == GOMP_TARGET_TASK_FINISHED)
+       gomp_target_task_completion (team, task);
       else
-       {
-         task->next_taskgroup = task;
-         task->prev_taskgroup = task;
-       }
-      taskgroup->children = task;
-    }
-  if (team->task_queue)
-    {
-      task->next_queue = team->task_queue;
-      task->prev_queue = team->task_queue->prev_queue;
-      task->next_queue->prev_queue = task;
-      task->prev_queue->next_queue = task;
-    }
-  else
-    {
-      task->next_queue = task;
-      task->prev_queue = task;
-      team->task_queue = task;
+       ttask->state = GOMP_TARGET_TASK_RUNNING;
+      gomp_mutex_unlock (&team->task_lock);
+      return true;
     }
+  priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0,
+                        PRIORITY_INSERT_BEGIN,
+                        /*adjust_parent_depends_on=*/false,
+                        task->parent_depends_on);
+  if (taskgroup)
+    priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, task, 0,
+                          PRIORITY_INSERT_BEGIN,
+                          /*adjust_parent_depends_on=*/false,
+                          task->parent_depends_on);
+  priority_queue_insert (PQ_TEAM, &team->task_queue, task, 0,
+                        PRIORITY_INSERT_END,
+                        /*adjust_parent_depends_on=*/false,
+                        task->parent_depends_on);
   ++team->task_count;
   ++team->task_queued_count;
   gomp_team_barrier_set_task_pending (&team->barrier);
@@ -565,210 +769,214 @@ gomp_create_target_task (struct gomp_device_descr *devicep,
   gomp_mutex_unlock (&team->task_lock);
   if (do_wake)
     gomp_team_barrier_wake (&team->barrier, 1);
+  return true;
 }
 
-#if _LIBGOMP_CHECKING
-/* Sanity check TASK to make sure it is in its parent's children
-   queue, and that the tasks therein are in the right order.
+/* Given a parent_depends_on task in LIST, move it to the front of its
+   priority so it is run as soon as possible.
 
-   The expected order is:
-       parent_depends_on WAITING tasks
-       !parent_depends_on WAITING tasks
-       TIED tasks
+   Care is taken to update the list's LAST_PARENT_DEPENDS_ON field.
 
-   PARENT is the alleged parent of TASK.  */
+   We rearrange the queue such that all parent_depends_on tasks are
+   first, and last_parent_depends_on points to the last such task we
+   rearranged.  For example, given the following tasks in a queue
+   where PD[123] are the parent_depends_on tasks:
 
-static void
-verify_children_queue (struct gomp_task *task, struct gomp_task *parent)
-{
-  if (task->parent != parent)
-    gomp_fatal ("verify_children_queue: incompatible parents");
-  /* It's OK, Annie was an orphan and she turned out all right.  */
-  if (!parent)
-    return;
+       task->children
+       |
+       V
+       C1 -> C2 -> C3 -> PD1 -> PD2 -> PD3 -> C4
 
-  bool seen_tied = false;
-  bool seen_plain_waiting = false;
-  bool found = false;
-  struct gomp_task *t = parent->children;
-  while (1)
+       We rearrange such that:
+
+       task->children
+       |              +--- last_parent_depends_on
+       |              |
+       V              V
+       PD1 -> PD2 -> PD3 -> C1 -> C2 -> C3 -> C4.  */
+
+static void inline
+priority_list_upgrade_task (struct priority_list *list,
+                           struct priority_node *node)
+{
+  struct priority_node *last_parent_depends_on
+    = list->last_parent_depends_on;
+  if (last_parent_depends_on)
     {
-      if (t == task)
-       found = true;
-      if (seen_tied && t->kind == GOMP_TASK_WAITING)
-       gomp_fatal ("verify_children_queue: WAITING task after TIED");
-      if (t->kind == GOMP_TASK_TIED)
-       seen_tied = true;
-      else if (t->kind == GOMP_TASK_WAITING)
-       {
-         if (t->parent_depends_on)
-           {
-             if (seen_plain_waiting)
-               gomp_fatal ("verify_children_queue: parent_depends_on after "
-                           "!parent_depends_on");
-           }
-         else
-           seen_plain_waiting = true;
-       }
-      t = t->next_child;
-      if (t == parent->children)
-       break;
+      node->prev->next = node->next;
+      node->next->prev = node->prev;
+      node->prev = last_parent_depends_on;
+      node->next = last_parent_depends_on->next;
+      node->prev->next = node;
+      node->next->prev = node;
     }
-  if (!found)
-    gomp_fatal ("verify_children_queue: child not found in parent queue");
+  else if (node != list->tasks)
+    {
+      node->prev->next = node->next;
+      node->next->prev = node->prev;
+      node->prev = list->tasks->prev;
+      node->next = list->tasks;
+      list->tasks = node;
+      node->prev->next = node;
+      node->next->prev = node;
+    }
+  list->last_parent_depends_on = node;
 }
 
-/* Sanity check TASK to make sure it is in its taskgroup queue (if
-   applicable), and that the tasks therein are in the right order.
+/* Given a parent_depends_on TASK in its parent's children_queue, move
+   it to the front of its priority so it is run as soon as possible.
 
-   The expected order is that GOMP_TASK_WAITING tasks must come before
-   GOMP_TASK_TIED tasks.
+   PARENT is passed as an optimization.
 
-   TASK is the task.  */
+   (This function could be defined in priority_queue.c, but we want it
+   inlined, and putting it in priority_queue.h is not an option, given
+   that gomp_task has not been properly defined at that point).  */
 
-static void
-verify_taskgroup_queue (struct gomp_task *task)
+static void inline
+priority_queue_upgrade_task (struct gomp_task *task,
+                            struct gomp_task *parent)
 {
-  struct gomp_taskgroup *taskgroup = task->taskgroup;
-  if (!taskgroup)
-    return;
-
-  bool seen_tied = false;
-  bool found = false;
-  struct gomp_task *t = taskgroup->children;
-  while (1)
+  struct priority_queue *head = &parent->children_queue;
+  struct priority_node *node = &task->pnode[PQ_CHILDREN];
+#if _LIBGOMP_CHECKING_
+  if (!task->parent_depends_on)
+    gomp_fatal ("priority_queue_upgrade_task: task must be a "
+               "parent_depends_on task");
+  if (!priority_queue_task_in_queue_p (PQ_CHILDREN, head, task))
+    gomp_fatal ("priority_queue_upgrade_task: cannot find task=%p", task);
+#endif
+  if (priority_queue_multi_p (head))
     {
-      if (t == task)
-       found = true;
-      if (t->kind == GOMP_TASK_WAITING && seen_tied)
-       gomp_fatal ("verify_taskgroup_queue: WAITING task after TIED");
-      if (t->kind == GOMP_TASK_TIED)
-       seen_tied = true;
-      t = t->next_taskgroup;
-      if (t == taskgroup->children)
-       break;
+      struct priority_list *list
+       = priority_queue_lookup_priority (head, task->priority);
+      priority_list_upgrade_task (list, node);
     }
-  if (!found)
-    gomp_fatal ("verify_taskgroup_queue: child not found in parent queue");
+  else
+    priority_list_upgrade_task (&head->l, node);
 }
 
-/* Verify that TASK is in the team's task queue.  */
+/* Given a CHILD_TASK in LIST that is about to be executed, move it out of
+   the way in LIST so that other tasks can be considered for
+   execution.  LIST contains tasks of type TYPE.
 
-static void
-verify_task_queue (struct gomp_task *task, struct gomp_team *team)
+   Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field
+   if applicable.  */
+
+static void inline
+priority_list_downgrade_task (enum priority_queue_type type,
+                             struct priority_list *list,
+                             struct gomp_task *child_task)
 {
-  struct gomp_task *t = team->task_queue;
-  if (team)
-    while (1)
-      {
-       if (t == task)
-         return;
-       t = t->next_queue;
-       if (t == team->task_queue)
-         break;
-      }
-  gomp_fatal ("verify_team_queue: child not in team");
+  struct priority_node *node = task_to_priority_node (type, child_task);
+  if (list->tasks == node)
+    list->tasks = node->next;
+  else if (node->next != list->tasks)
+    {
+      /* The task in NODE is about to become TIED and TIED tasks
+        cannot come before WAITING tasks.  If we're about to
+        leave the queue in such an indeterminate state, rewire
+        things appropriately.  However, a TIED task at the end is
+        perfectly fine.  */
+      struct gomp_task *next_task = priority_node_to_task (type, node->next);
+      if (next_task->kind == GOMP_TASK_WAITING)
+       {
+         /* Remove from list.  */
+         node->prev->next = node->next;
+         node->next->prev = node->prev;
+         /* Rewire at the end.  */
+         node->next = list->tasks;
+         node->prev = list->tasks->prev;
+         list->tasks->prev->next = node;
+         list->tasks->prev = node;
+       }
+    }
+
+  /* If the current task is the last_parent_depends_on for its
+     priority, adjust last_parent_depends_on appropriately.  */
+  if (__builtin_expect (child_task->parent_depends_on, 0)
+      && list->last_parent_depends_on == node)
+    {
+      struct gomp_task *prev_child = priority_node_to_task (type, node->prev);
+      if (node->prev != node
+         && prev_child->kind == GOMP_TASK_WAITING
+         && prev_child->parent_depends_on)
+       list->last_parent_depends_on = node->prev;
+      else
+       {
+         /* There are no more parent_depends_on entries waiting
+            to run, clear the list.  */
+         list->last_parent_depends_on = NULL;
+       }
+    }
 }
+
+/* Given a TASK in HEAD that is about to be executed, move it out of
+   the way so that other tasks can be considered for execution.  HEAD
+   contains tasks of type TYPE.
+
+   Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field
+   if applicable.
+
+   (This function could be defined in priority_queue.c, but we want it
+   inlined, and putting it in priority_queue.h is not an option, given
+   that gomp_task has not been properly defined at that point).  */
+
+static void inline
+priority_queue_downgrade_task (enum priority_queue_type type,
+                              struct priority_queue *head,
+                              struct gomp_task *task)
+{
+#if _LIBGOMP_CHECKING_
+  if (!priority_queue_task_in_queue_p (type, head, task))
+    gomp_fatal ("Attempt to downgrade missing task %p", task);
 #endif
+  if (priority_queue_multi_p (head))
+    {
+      struct priority_list *list
+       = priority_queue_lookup_priority (head, task->priority);
+      priority_list_downgrade_task (type, list, task);
+    }
+  else
+    priority_list_downgrade_task (type, &head->l, task);
+}
+
+/* Setup CHILD_TASK to execute.  This is done by setting the task to
+   TIED, and updating all relevant queues so that CHILD_TASK is no
+   longer chosen for scheduling.  Also, remove CHILD_TASK from the
+   overall team task queue entirely.
+
+   Return TRUE if task or its containing taskgroup has been
+   cancelled.  */
 
 static inline bool
 gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
                   struct gomp_team *team)
 {
-#if _LIBGOMP_CHECKING
-  verify_children_queue (child_task, parent);
-  verify_taskgroup_queue (child_task);
-  verify_task_queue (child_task, team);
+#if _LIBGOMP_CHECKING_
+  if (child_task->parent)
+    priority_queue_verify (PQ_CHILDREN,
+                          &child_task->parent->children_queue, true);
+  if (child_task->taskgroup)
+    priority_queue_verify (PQ_TASKGROUP,
+                          &child_task->taskgroup->taskgroup_queue, false);
+  priority_queue_verify (PQ_TEAM, &team->task_queue, false);
 #endif
 
+  /* Task is about to go tied, move it out of the way.  */
   if (parent)
-    {
-      /* Adjust children such that it will point to a next child,
-        while the current one is scheduled to be executed.  This way,
-        GOMP_taskwait (and others) can schedule a next task while
-        waiting.
-
-        Do not remove it entirely from the circular list, as it is
-        still a child, though not one we should consider first (say
-        by GOMP_taskwait).  */
-      if (parent->children == child_task)
-       parent->children = child_task->next_child;
-      /* TIED tasks cannot come before WAITING tasks.  If we're about
-        to make this task TIED, rewire things appropriately.
-        However, a TIED task at the end is perfectly fine.  */
-      else if (child_task->next_child->kind == GOMP_TASK_WAITING
-              && child_task->next_child != parent->children)
-       {
-         /* Remove from the list.  */
-         child_task->prev_child->next_child = child_task->next_child;
-         child_task->next_child->prev_child = child_task->prev_child;
-         /* Rewire at the end of its siblings.  */
-         child_task->next_child = parent->children;
-         child_task->prev_child = parent->children->prev_child;
-         parent->children->prev_child->next_child = child_task;
-         parent->children->prev_child = child_task;
-       }
+    priority_queue_downgrade_task (PQ_CHILDREN, &parent->children_queue,
+                                  child_task);
 
-      /* If the current task (child_task) is at the top of the
-        parent's last_parent_depends_on, it's about to be removed
-        from it.  Adjust last_parent_depends_on appropriately.  */
-      if (__builtin_expect (child_task->parent_depends_on, 0)
-         && parent->taskwait->last_parent_depends_on == child_task)
-       {
-         /* The last_parent_depends_on list was built with all
-            parent_depends_on entries linked to the prev_child.  Grab
-            the next last_parent_depends_on head from this prev_child if
-            available...  */
-         if (child_task->prev_child->kind == GOMP_TASK_WAITING
-             && child_task->prev_child->parent_depends_on)
-           parent->taskwait->last_parent_depends_on = child_task->prev_child;
-         else
-           {
-             /* ...otherwise, there are no more parent_depends_on
-                entries waiting to run.  In which case, clear the
-                list.  */
-             parent->taskwait->last_parent_depends_on = NULL;
-           }
-       }
-    }
-
-  /* Adjust taskgroup to point to the next taskgroup.  See note above
-     regarding adjustment of children as to why the child_task is not
-     removed entirely from the circular list.  */
+  /* Task is about to go tied, move it out of the way.  */
   struct gomp_taskgroup *taskgroup = child_task->taskgroup;
   if (taskgroup)
-    {
-      if (taskgroup->children == child_task)
-       taskgroup->children = child_task->next_taskgroup;
-      /* TIED tasks cannot come before WAITING tasks.  If we're about
-        to make this task TIED, rewire things appropriately.
-        However, a TIED task at the end is perfectly fine.  */
-      else if (child_task->next_taskgroup->kind == GOMP_TASK_WAITING
-              && child_task->next_taskgroup != taskgroup->children)
-       {
-         /* Remove from the list.  */
-         child_task->prev_taskgroup->next_taskgroup
-           = child_task->next_taskgroup;
-         child_task->next_taskgroup->prev_taskgroup
-           = child_task->prev_taskgroup;
-         /* Rewire at the end of its taskgroup.  */
-         child_task->next_taskgroup = taskgroup->children;
-         child_task->prev_taskgroup = taskgroup->children->prev_taskgroup;
-         taskgroup->children->prev_taskgroup->next_taskgroup = child_task;
-         taskgroup->children->prev_taskgroup = child_task;
-       }
-    }
+    priority_queue_downgrade_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
+                                  child_task);
 
-  /* Remove child_task from the task_queue.  */
-  child_task->prev_queue->next_queue = child_task->next_queue;
-  child_task->next_queue->prev_queue = child_task->prev_queue;
-  if (team->task_queue == child_task)
-    {
-      if (child_task->next_queue != child_task)
-       team->task_queue = child_task->next_queue;
-      else
-       team->task_queue = NULL;
-    }
+  priority_queue_remove (PQ_TEAM, &team->task_queue, child_task,
+                        MEMMODEL_RELAXED);
+  child_task->pnode[PQ_TEAM].next = NULL;
+  child_task->pnode[PQ_TEAM].prev = NULL;
   child_task->kind = GOMP_TASK_TIED;
 
   if (--team->task_queued_count == 0)
@@ -808,8 +1016,11 @@ gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task)
       }
 }
 
-/* After CHILD_TASK has been run, adjust the various task queues to
-   give higher priority to the tasks that depend on CHILD_TASK.
+/* After a CHILD_TASK has been run, adjust the dependency queue for
+   each task that depends on CHILD_TASK, to record the fact that there
+   is one less dependency to worry about.  If a task that depended on
+   CHILD_TASK now has no dependencies, place it in the various queues
+   so it gets scheduled to run.
 
    TEAM is the team to which CHILD_TASK belongs to.  */
 
@@ -822,99 +1033,60 @@ gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
   for (i = 0; i < count; i++)
     {
       struct gomp_task *task = child_task->dependers->elem[i];
+
+      /* CHILD_TASK satisfies a dependency for TASK.  Keep track of
+        TASK's remaining dependencies.  Once TASK has no other
+        depenencies, put it into the various queues so it will get
+        scheduled for execution.  */
       if (--task->num_dependees != 0)
        continue;
 
       struct gomp_taskgroup *taskgroup = task->taskgroup;
       if (parent)
        {
-         if (parent->children)
-           {
-             /* If parent is in gomp_task_maybe_wait_for_dependencies
-                and it doesn't need to wait for this task, put it after
-                all ready to run tasks it needs to wait for.  */
-             if (parent->taskwait && parent->taskwait->last_parent_depends_on
-                 && !task->parent_depends_on)
-               {
-                 /* Put depender in last_parent_depends_on.  */
-                 struct gomp_task *last_parent_depends_on
-                   = parent->taskwait->last_parent_depends_on;
-                 task->next_child = last_parent_depends_on->next_child;
-                 task->prev_child = last_parent_depends_on;
-               }
-             else
-               {
-                 /* Make depender a sibling of child_task, and place
-                    it at the top of said sibling list.  */
-                 task->next_child = parent->children;
-                 task->prev_child = parent->children->prev_child;
-                 parent->children = task;
-               }
-             task->next_child->prev_child = task;
-             task->prev_child->next_child = task;
-           }
-         else
-           {
-             /* Make depender a sibling of child_task.  */
-             task->next_child = task;
-             task->prev_child = task;
-             parent->children = task;
-           }
+         priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
+                                task, task->priority,
+                                PRIORITY_INSERT_BEGIN,
+                                /*adjust_parent_depends_on=*/true,
+                                task->parent_depends_on);
          if (parent->taskwait)
            {
              if (parent->taskwait->in_taskwait)
                {
+                 /* One more task has had its dependencies met.
+                    Inform any waiters.  */
                  parent->taskwait->in_taskwait = false;
                  gomp_sem_post (&parent->taskwait->taskwait_sem);
                }
              else if (parent->taskwait->in_depend_wait)
                {
+                 /* One more task has had its dependencies met.
+                    Inform any waiters.  */
                  parent->taskwait->in_depend_wait = false;
                  gomp_sem_post (&parent->taskwait->taskwait_sem);
                }
-             if (parent->taskwait->last_parent_depends_on == NULL
-                 && task->parent_depends_on)
-               parent->taskwait->last_parent_depends_on = task;
            }
        }
-      /* If depender is in a taskgroup, put it at the TOP of its
-        taskgroup.  */
       if (taskgroup)
        {
-         if (taskgroup->children)
-           {
-             task->next_taskgroup = taskgroup->children;
-             task->prev_taskgroup = taskgroup->children->prev_taskgroup;
-             task->next_taskgroup->prev_taskgroup = task;
-             task->prev_taskgroup->next_taskgroup = task;
-           }
-         else
-           {
-             task->next_taskgroup = task;
-             task->prev_taskgroup = task;
-           }
-         taskgroup->children = task;
+         priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
+                                task, task->priority,
+                                PRIORITY_INSERT_BEGIN,
+                                /*adjust_parent_depends_on=*/false,
+                                task->parent_depends_on);
          if (taskgroup->in_taskgroup_wait)
            {
+             /* One more task has had its dependencies met.
+                Inform any waiters.  */
              taskgroup->in_taskgroup_wait = false;
              gomp_sem_post (&taskgroup->taskgroup_sem);
            }
        }
-      /* Put depender of child_task at the END of the team's
-        task_queue.  */
-      if (team->task_queue)
-       {
-         task->next_queue = team->task_queue;
-         task->prev_queue = team->task_queue->prev_queue;
-         task->next_queue->prev_queue = task;
-         task->prev_queue->next_queue = task;
-       }
-      else
-       {
-         task->next_queue = task;
-         task->prev_queue = task;
-         team->task_queue = task;
-       }
+      priority_queue_insert (PQ_TEAM, &team->task_queue,
+                            task, task->priority,
+                            PRIORITY_INSERT_END,
+                            /*adjust_parent_depends_on=*/false,
+                            task->parent_depends_on);
       ++team->task_count;
       ++team->task_queued_count;
       ++ret;
@@ -964,27 +1136,15 @@ gomp_task_run_post_remove_parent (struct gomp_task *child_task)
       gomp_sem_post (&parent->taskwait->taskwait_sem);
     }
 
-  /* Remove CHILD_TASK from its sibling list.  */
-  child_task->prev_child->next_child = child_task->next_child;
-  child_task->next_child->prev_child = child_task->prev_child;
-  if (parent->children != child_task)
-    return;
-  if (child_task->next_child != child_task)
-    parent->children = child_task->next_child;
-  else
+  if (priority_queue_remove (PQ_CHILDREN, &parent->children_queue,
+                            child_task, MEMMODEL_RELEASE)
+      && parent->taskwait && parent->taskwait->in_taskwait)
     {
-      /* We access task->children in GOMP_taskwait
-        outside of the task lock mutex region, so
-        need a release barrier here to ensure memory
-        written by child_task->fn above is flushed
-        before the NULL is written.  */
-      __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE);
-      if (parent->taskwait && parent->taskwait->in_taskwait)
-       {
-         parent->taskwait->in_taskwait = false;
-         gomp_sem_post (&parent->taskwait->taskwait_sem);
-       }
+      parent->taskwait->in_taskwait = false;
+      gomp_sem_post (&parent->taskwait->taskwait_sem);
     }
+  child_task->pnode[PQ_CHILDREN].next = NULL;
+  child_task->pnode[PQ_CHILDREN].prev = NULL;
 }
 
 /* Remove CHILD_TASK from its taskgroup.  */
@@ -995,8 +1155,11 @@ gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task)
   struct gomp_taskgroup *taskgroup = child_task->taskgroup;
   if (taskgroup == NULL)
     return;
-  child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup;
-  child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup;
+  bool empty = priority_queue_remove (PQ_TASKGROUP,
+                                     &taskgroup->taskgroup_queue,
+                                     child_task, MEMMODEL_RELAXED);
+  child_task->pnode[PQ_TASKGROUP].next = NULL;
+  child_task->pnode[PQ_TASKGROUP].prev = NULL;
   if (taskgroup->num_children > 1)
     --taskgroup->num_children;
   else
@@ -1008,18 +1171,10 @@ gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task)
         before the NULL is written.  */
       __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
     }
-  if (taskgroup->children != child_task)
-    return;
-  if (child_task->next_taskgroup != child_task)
-    taskgroup->children = child_task->next_taskgroup;
-  else
+  if (empty && taskgroup->in_taskgroup_wait)
     {
-      taskgroup->children = NULL;
-      if (taskgroup->in_taskgroup_wait)
-       {
-         taskgroup->in_taskgroup_wait = false;
-         gomp_sem_post (&taskgroup->taskgroup_sem);
-       }
+      taskgroup->in_taskgroup_wait = false;
+      gomp_sem_post (&taskgroup->taskgroup_sem);
     }
 }
 
@@ -1049,9 +1204,13 @@ gomp_barrier_handle_tasks (gomp_barrier_state_t state)
   while (1)
     {
       bool cancelled = false;
-      if (team->task_queue != NULL)
+      if (!priority_queue_empty_p (&team->task_queue, MEMMODEL_RELAXED))
        {
-         child_task = team->task_queue;
+         bool ignored;
+         child_task
+           = priority_queue_next_task (PQ_TEAM, &team->task_queue,
+                                       PQ_IGNORED, NULL,
+                                       &ignored);
          cancelled = gomp_task_run_pre (child_task, child_task->parent,
                                         team);
          if (__builtin_expect (cancelled, 0))
@@ -1082,7 +1241,29 @@ gomp_barrier_handle_tasks (gomp_barrier_state_t state)
       if (child_task)
        {
          thr->task = child_task;
-         child_task->fn (child_task->fn_data);
+         if (__builtin_expect (child_task->fn == NULL, 0))
+           {
+             if (gomp_target_task_fn (child_task->fn_data))
+               {
+                 thr->task = task;
+                 gomp_mutex_lock (&team->task_lock);
+                 child_task->kind = GOMP_TASK_ASYNC_RUNNING;
+                 team->task_running_count--;
+                 struct gomp_target_task *ttask
+                   = (struct gomp_target_task *) child_task->fn_data;
+                 /* If GOMP_PLUGIN_target_task_completion has run already
+                    in between gomp_target_task_fn and the mutex lock,
+                    perform the requeuing here.  */
+                 if (ttask->state == GOMP_TARGET_TASK_FINISHED)
+                   gomp_target_task_completion (team, child_task);
+                 else
+                   ttask->state = GOMP_TARGET_TASK_RUNNING;
+                 child_task = NULL;
+                 continue;
+               }
+           }
+         else
+           child_task->fn (child_task->fn_data);
          thr->task = task;
        }
       else
@@ -1094,7 +1275,7 @@ gomp_barrier_handle_tasks (gomp_barrier_state_t state)
          size_t new_tasks
            = gomp_task_run_post_handle_depend (child_task, team);
          gomp_task_run_post_remove_parent (child_task);
-         gomp_clear_parent (child_task->children);
+         gomp_clear_parent (&child_task->children_queue);
          gomp_task_run_post_remove_taskgroup (child_task);
          to_free = child_task;
          child_task = NULL;
@@ -1140,15 +1321,16 @@ GOMP_taskwait (void)
      child thread task work function are seen before we exit from
      GOMP_taskwait.  */
   if (task == NULL
-      || __atomic_load_n (&task->children, MEMMODEL_ACQUIRE) == NULL)
+      || priority_queue_empty_p (&task->children_queue, MEMMODEL_ACQUIRE))
     return;
 
   memset (&taskwait, 0, sizeof (taskwait));
+  bool child_q = false;
   gomp_mutex_lock (&team->task_lock);
   while (1)
     {
       bool cancelled = false;
-      if (task->children == NULL)
+      if (priority_queue_empty_p (&task->children_queue, MEMMODEL_RELAXED))
        {
          bool destroy_taskwait = task->taskwait != NULL;
          task->taskwait = NULL;
@@ -1162,9 +1344,12 @@ GOMP_taskwait (void)
            gomp_sem_destroy (&taskwait.taskwait_sem);
          return;
        }
-      if (task->children->kind == GOMP_TASK_WAITING)
+      struct gomp_task *next_task
+       = priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
+                                   PQ_TEAM, &team->task_queue, &child_q);
+      if (next_task->kind == GOMP_TASK_WAITING)
        {
-         child_task = task->children;
+         child_task = next_task;
          cancelled
            = gomp_task_run_pre (child_task, task, team);
          if (__builtin_expect (cancelled, 0))
@@ -1180,8 +1365,10 @@ GOMP_taskwait (void)
        }
       else
        {
-         /* All tasks we are waiting for are already running
-            in other threads.  Wait for them.  */
+       /* All tasks we are waiting for are either running in other
+          threads, or they are tasks that have not had their
+          dependencies met (so they're not even in the queue).  Wait
+          for them.  */
          if (task->taskwait == NULL)
            {
              taskwait.in_depend_wait = false;
@@ -1205,7 +1392,28 @@ GOMP_taskwait (void)
       if (child_task)
        {
          thr->task = child_task;
-         child_task->fn (child_task->fn_data);
+         if (__builtin_expect (child_task->fn == NULL, 0))
+           {
+             if (gomp_target_task_fn (child_task->fn_data))
+               {
+                 thr->task = task;
+                 gomp_mutex_lock (&team->task_lock);
+                 child_task->kind = GOMP_TASK_ASYNC_RUNNING;
+                 struct gomp_target_task *ttask
+                   = (struct gomp_target_task *) child_task->fn_data;
+                 /* If GOMP_PLUGIN_target_task_completion has run already
+                    in between gomp_target_task_fn and the mutex lock,
+                    perform the requeuing here.  */
+                 if (ttask->state == GOMP_TARGET_TASK_FINISHED)
+                   gomp_target_task_completion (team, child_task);
+                 else
+                   ttask->state = GOMP_TARGET_TASK_RUNNING;
+                 child_task = NULL;
+                 continue;
+               }
+           }
+         else
+           child_task->fn (child_task->fn_data);
          thr->task = task;
        }
       else
@@ -1217,21 +1425,16 @@ GOMP_taskwait (void)
          size_t new_tasks
            = gomp_task_run_post_handle_depend (child_task, team);
 
-         /* Remove child_task from children list, and set up the next
-            sibling to be run.  */
-         child_task->prev_child->next_child = child_task->next_child;
-         child_task->next_child->prev_child = child_task->prev_child;
-         if (task->children == child_task)
+         if (child_q)
            {
-             if (child_task->next_child != child_task)
-               task->children = child_task->next_child;
-             else
-               task->children = NULL;
+             priority_queue_remove (PQ_CHILDREN, &task->children_queue,
+                                    child_task, MEMMODEL_RELAXED);
+             child_task->pnode[PQ_CHILDREN].next = NULL;
+             child_task->pnode[PQ_CHILDREN].prev = NULL;
            }
-         /* Orphan all the children of CHILD_TASK.  */
-         gomp_clear_parent (child_task->children);
 
-         /* Remove CHILD_TASK from its taskgroup.  */
+         gomp_clear_parent (&child_task->children_queue);
+
          gomp_task_run_post_remove_taskgroup (child_task);
 
          to_free = child_task;
@@ -1248,8 +1451,16 @@ GOMP_taskwait (void)
     }
 }
 
-/* This is like GOMP_taskwait, but we only wait for tasks that the
-   upcoming task depends on.
+/* An undeferred task is about to run.  Wait for all tasks that this
+   undeferred task depends on.
+
+   This is done by first putting all known ready dependencies
+   (dependencies that have their own dependencies met) at the top of
+   the scheduling queues.  Then we iterate through these imminently
+   ready tasks (and possibly other high priority tasks), and run them.
+   If we run out of ready dependencies to execute, we either wait for
+   the reamining dependencies to finish, or wait for them to get
+   scheduled so we can run them.
 
    DEPEND is as in GOMP_task.  */
 
@@ -1261,7 +1472,6 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
   struct gomp_team *team = thr->ts.team;
   struct gomp_task_depend_entry elem, *ent = NULL;
   struct gomp_taskwait taskwait;
-  struct gomp_task *last_parent_depends_on = NULL;
   size_t ndepend = (uintptr_t) depend[0];
   size_t nout = (uintptr_t) depend[1];
   size_t i;
@@ -1285,54 +1495,11 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
              {
                tsk->parent_depends_on = true;
                ++num_awaited;
-               /* If a task we need to wait for is not already
-                  running and is ready to be scheduled, move it to
-                  front, so that we run it as soon as possible.
-
-                  We rearrange the children queue such that all
-                  parent_depends_on tasks are first, and
-                  last_parent_depends_on points to the last such task
-                  we rearranged.  For example, given the following
-                  children where PD[123] are the parent_depends_on
-                  tasks:
-
-                       task->children
-                       |
-                       V
-                       C1 -> C2 -> C3 -> PD1 -> PD2 -> PD3 -> C4
-
-                  We rearrange such that:
-
-                       task->children
-                       |              +--- last_parent_depends_on
-                       |              |
-                       V              V
-                       PD1 -> PD2 -> PD3 -> C1 -> C2 -> C3 -> C4
-               */
-
+               /* If depenency TSK itself has no dependencies and is
+                  ready to run, move it up front so that we run it as
+                  soon as possible.  */
                if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING)
-                 {
-                   if (last_parent_depends_on)
-                     {
-                       tsk->prev_child->next_child = tsk->next_child;
-                       tsk->next_child->prev_child = tsk->prev_child;
-                       tsk->prev_child = last_parent_depends_on;
-                       tsk->next_child = last_parent_depends_on->next_child;
-                       tsk->prev_child->next_child = tsk;
-                       tsk->next_child->prev_child = tsk;
-                     }
-                   else if (tsk != task->children)
-                     {
-                       tsk->prev_child->next_child = tsk->next_child;
-                       tsk->next_child->prev_child = tsk->prev_child;
-                       tsk->prev_child = task->children->prev_child;
-                       tsk->next_child = task->children;
-                       task->children = tsk;
-                       tsk->prev_child->next_child = tsk;
-                       tsk->next_child->prev_child = tsk;
-                     }
-                   last_parent_depends_on = tsk;
-                 }
+                 priority_queue_upgrade_task (tsk, task);
              }
          }
     }
@@ -1344,7 +1511,6 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
 
   memset (&taskwait, 0, sizeof (taskwait));
   taskwait.n_depend = num_awaited;
-  taskwait.last_parent_depends_on = last_parent_depends_on;
   gomp_sem_init (&taskwait.taskwait_sem, 0);
   task->taskwait = &taskwait;
 
@@ -1363,9 +1529,28 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
          gomp_sem_destroy (&taskwait.taskwait_sem);
          return;
        }
-      if (task->children->kind == GOMP_TASK_WAITING)
+
+      /* Theoretically when we have multiple priorities, we should
+        chose between the highest priority item in
+        task->children_queue and team->task_queue here, so we should
+        use priority_queue_next_task().  However, since we are
+        running an undeferred task, perhaps that makes all tasks it
+        depends on undeferred, thus a priority of INF?  This would
+        make it unnecessary to take anything into account here,
+        but the dependencies.
+
+        On the other hand, if we want to use priority_queue_next_task(),
+        care should be taken to only use priority_queue_remove()
+        below if the task was actually removed from the children
+        queue.  */
+      bool ignored;
+      struct gomp_task *next_task
+       = priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
+                                   PQ_IGNORED, NULL, &ignored);
+
+      if (next_task->kind == GOMP_TASK_WAITING)
        {
-         child_task = task->children;
+         child_task = next_task;
          cancelled
            = gomp_task_run_pre (child_task, task, team);
          if (__builtin_expect (cancelled, 0))
@@ -1380,8 +1565,10 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
            }
        }
       else
-       /* All tasks we are waiting for are already running
-          in other threads.  Wait for them.  */
+       /* All tasks we are waiting for are either running in other
+          threads, or they are tasks that have not had their
+          dependencies met (so they're not even in the queue).  Wait
+          for them.  */
        taskwait.in_depend_wait = true;
       gomp_mutex_unlock (&team->task_lock);
       if (do_wake)
@@ -1398,7 +1585,28 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
       if (child_task)
        {
          thr->task = child_task;
-         child_task->fn (child_task->fn_data);
+         if (__builtin_expect (child_task->fn == NULL, 0))
+           {
+             if (gomp_target_task_fn (child_task->fn_data))
+               {
+                 thr->task = task;
+                 gomp_mutex_lock (&team->task_lock);
+                 child_task->kind = GOMP_TASK_ASYNC_RUNNING;
+                 struct gomp_target_task *ttask
+                   = (struct gomp_target_task *) child_task->fn_data;
+                 /* If GOMP_PLUGIN_target_task_completion has run already
+                    in between gomp_target_task_fn and the mutex lock,
+                    perform the requeuing here.  */
+                 if (ttask->state == GOMP_TARGET_TASK_FINISHED)
+                   gomp_target_task_completion (team, child_task);
+                 else
+                   ttask->state = GOMP_TARGET_TASK_RUNNING;
+                 child_task = NULL;
+                 continue;
+               }
+           }
+         else
+           child_task->fn (child_task->fn_data);
          thr->task = task;
        }
       else
@@ -1412,18 +1620,12 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
          if (child_task->parent_depends_on)
            --taskwait.n_depend;
 
-         /* Remove child_task from sibling list.  */
-         child_task->prev_child->next_child = child_task->next_child;
-         child_task->next_child->prev_child = child_task->prev_child;
-         if (task->children == child_task)
-           {
-             if (child_task->next_child != child_task)
-               task->children = child_task->next_child;
-             else
-               task->children = NULL;
-           }
+         priority_queue_remove (PQ_CHILDREN, &task->children_queue,
+                                child_task, MEMMODEL_RELAXED);
+         child_task->pnode[PQ_CHILDREN].next = NULL;
+         child_task->pnode[PQ_CHILDREN].prev = NULL;
 
-         gomp_clear_parent (child_task->children);
+         gomp_clear_parent (&child_task->children_queue);
          gomp_task_run_post_remove_taskgroup (child_task);
          to_free = child_task;
          child_task = NULL;
@@ -1463,7 +1665,7 @@ GOMP_taskgroup_start (void)
     return;
   taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup));
   taskgroup->prev = task->taskgroup;
-  taskgroup->children = NULL;
+  priority_queue_init (&taskgroup->taskgroup_queue);
   taskgroup->in_taskgroup_wait = false;
   taskgroup->cancelled = false;
   taskgroup->num_children = 0;
@@ -1485,6 +1687,17 @@ GOMP_taskgroup_end (void)
   if (team == NULL)
     return;
   taskgroup = task->taskgroup;
+  if (__builtin_expect (taskgroup == NULL, 0)
+      && thr->ts.level == 0)
+    {
+      /* This can happen if GOMP_taskgroup_start is called when
+        thr->ts.team == NULL, but inside of the taskgroup there
+        is #pragma omp target nowait that creates an implicit
+        team with a single thread.  In this case, we want to wait
+        for all outstanding tasks in this team.  */
+      gomp_team_barrier_wait (&team->barrier);
+      return;
+    }
 
   /* The acquire barrier on load of taskgroup->num_children here
      synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup.
@@ -1495,19 +1708,25 @@ GOMP_taskgroup_end (void)
   if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
     goto finish;
 
+  bool unused;
   gomp_mutex_lock (&team->task_lock);
   while (1)
     {
       bool cancelled = false;
-      if (taskgroup->children == NULL)
+      if (priority_queue_empty_p (&taskgroup->taskgroup_queue,
+                                 MEMMODEL_RELAXED))
        {
          if (taskgroup->num_children)
            {
-             if (task->children == NULL)
+             if (priority_queue_empty_p (&task->children_queue,
+                                         MEMMODEL_RELAXED))
                goto do_wait;
-             child_task = task->children;
-            }
-          else
+             child_task
+               = priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
+                                           PQ_TEAM, &team->task_queue,
+                                           &unused);
+           }
+         else
            {
              gomp_mutex_unlock (&team->task_lock);
              if (to_free)
@@ -1519,7 +1738,9 @@ GOMP_taskgroup_end (void)
            }
        }
       else
-       child_task = taskgroup->children;
+       child_task
+         = priority_queue_next_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
+                                     PQ_TEAM, &team->task_queue, &unused);
       if (child_task->kind == GOMP_TASK_WAITING)
        {
          cancelled
@@ -1539,8 +1760,10 @@ GOMP_taskgroup_end (void)
        {
          child_task = NULL;
         do_wait:
-         /* All tasks we are waiting for are already running
-            in other threads.  Wait for them.  */
+       /* All tasks we are waiting for are either running in other
+          threads, or they are tasks that have not had their
+          dependencies met (so they're not even in the queue).  Wait
+          for them.  */
          taskgroup->in_taskgroup_wait = true;
        }
       gomp_mutex_unlock (&team->task_lock);
@@ -1558,7 +1781,28 @@ GOMP_taskgroup_end (void)
       if (child_task)
        {
          thr->task = child_task;
-         child_task->fn (child_task->fn_data);
+         if (__builtin_expect (child_task->fn == NULL, 0))
+           {
+             if (gomp_target_task_fn (child_task->fn_data))
+               {
+                 thr->task = task;
+                 gomp_mutex_lock (&team->task_lock);
+                 child_task->kind = GOMP_TASK_ASYNC_RUNNING;
+                 struct gomp_target_task *ttask
+                   = (struct gomp_target_task *) child_task->fn_data;
+                 /* If GOMP_PLUGIN_target_task_completion has run already
+                    in between gomp_target_task_fn and the mutex lock,
+                    perform the requeuing here.  */
+                 if (ttask->state == GOMP_TARGET_TASK_FINISHED)
+                   gomp_target_task_completion (team, child_task);
+                 else
+                   ttask->state = GOMP_TARGET_TASK_RUNNING;
+                 child_task = NULL;
+                 continue;
+               }
+           }
+         else
+           child_task->fn (child_task->fn_data);
          thr->task = task;
        }
       else
@@ -1570,7 +1814,7 @@ GOMP_taskgroup_end (void)
          size_t new_tasks
            = gomp_task_run_post_handle_depend (child_task, team);
          gomp_task_run_post_remove_parent (child_task);
-         gomp_clear_parent (child_task->children);
+         gomp_clear_parent (&child_task->children_queue);
          gomp_task_run_post_remove_taskgroup (child_task);
          to_free = child_task;
          child_task = NULL;
index f57a5a16ef22f6ec04af19e52cc6b6cacd7ed463..bcee326670cf24a9f96b726ffa1eabc1728b0a0a 100644 (file)
@@ -155,8 +155,8 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
   else
     ialias_call (GOMP_taskgroup_start) ();
 
-  /* FIXME, use priority.  */
-  (void) priority;
+  if (priority > gomp_max_task_priority_var)
+    priority = gomp_max_task_priority_var;
 
   if ((flags & GOMP_TASK_FLAG_IF) == 0 || team == NULL
       || (thr->task && thr->task->final_task)
@@ -175,6 +175,7 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
          for (i = 0; i < num_tasks; i++)
            {
              gomp_init_task (&task[i], parent, gomp_icv (false));
+             task[i].priority = priority;
              task[i].kind = GOMP_TASK_UNDEFERRED;
              task[i].final_task = (thr->task && thr->task->final_task)
                                   || (flags & GOMP_TASK_FLAG_FINAL);
@@ -198,10 +199,11 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
                task_step -= step;
              fn (arg);
              arg += arg_size;
-             if (task[i].children != NULL)
+             if (!priority_queue_empty_p (&task[i].children_queue,
+                                          MEMMODEL_RELAXED))
                {
                  gomp_mutex_lock (&team->task_lock);
-                 gomp_clear_parent (task[i].children);
+                 gomp_clear_parent (&task[i].children_queue);
                  gomp_mutex_unlock (&team->task_lock);
                }
              gomp_end_task ();
@@ -213,6 +215,7 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
            struct gomp_task task;
 
            gomp_init_task (&task, thr->task, gomp_icv (false));
+           task.priority = priority;
            task.kind = GOMP_TASK_UNDEFERRED;
            task.final_task = (thr->task && thr->task->final_task)
                              || (flags & GOMP_TASK_FLAG_FINAL);
@@ -228,10 +231,11 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
            if (i == nfirst)
              task_step -= step;
            fn (data);
-           if (task.children != NULL)
+           if (!priority_queue_empty_p (&task.children_queue,
+                                        MEMMODEL_RELAXED))
              {
                gomp_mutex_lock (&team->task_lock);
-               gomp_clear_parent (task.children);
+               gomp_clear_parent (&task.children_queue);
                gomp_mutex_unlock (&team->task_lock);
              }
            gomp_end_task ();
@@ -254,6 +258,7 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
          arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1)
                          & ~(uintptr_t) (arg_align - 1));
          gomp_init_task (task, parent, gomp_icv (false));
+         task->priority = priority;
          task->kind = GOMP_TASK_UNDEFERRED;
          task->in_tied_task = parent->in_tied_task;
          task->taskgroup = taskgroup;
@@ -298,48 +303,20 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
       for (i = 0; i < num_tasks; i++)
        {
          struct gomp_task *task = tasks[i];
-         if (parent->children)
-           {
-             task->next_child = parent->children;
-             task->prev_child = parent->children->prev_child;
-             task->next_child->prev_child = task;
-             task->prev_child->next_child = task;
-           }
-         else
-           {
-             task->next_child = task;
-             task->prev_child = task;
-           }
-         parent->children = task;
+         priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
+                                task, priority,
+                                PRIORITY_INSERT_BEGIN,
+                                /*last_parent_depends_on=*/false,
+                                task->parent_depends_on);
          if (taskgroup)
-           {
-             if (taskgroup->children)
-               {
-                 task->next_taskgroup = taskgroup->children;
-                 task->prev_taskgroup = taskgroup->children->prev_taskgroup;
-                 task->next_taskgroup->prev_taskgroup = task;
-                 task->prev_taskgroup->next_taskgroup = task;
-               }
-             else
-               {
-                 task->next_taskgroup = task;
-                 task->prev_taskgroup = task;
-               }
-             taskgroup->children = task;
-           }
-         if (team->task_queue)
-           {
-             task->next_queue = team->task_queue;
-             task->prev_queue = team->task_queue->prev_queue;
-             task->next_queue->prev_queue = task;
-             task->prev_queue->next_queue = task;
-           }
-         else
-           {
-             task->next_queue = task;
-             task->prev_queue = task;
-             team->task_queue = task;
-           }
+           priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
+                                  task, priority, PRIORITY_INSERT_BEGIN,
+                                  /*last_parent_depends_on=*/false,
+                                  task->parent_depends_on);
+         priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority,
+                                PRIORITY_INSERT_END,
+                                /*last_parent_depends_on=*/false,
+                                task->parent_depends_on);
          ++team->task_count;
          ++team->task_queued_count;
        }
index 67e25b3cf7efb235c727d8d4dce99a8d69f8b3c4..34b77c07def75862584394d0f3573bebdd486365 100644 (file)
@@ -193,7 +193,7 @@ gomp_new_team (unsigned nthreads)
   team->ordered_release = (void *) &team->implicit_task[nthreads];
   team->ordered_release[0] = &team->master_release;
 
-  team->task_queue = NULL;
+  priority_queue_init (&team->task_queue);
   team->task_count = 0;
   team->task_queued_count = 0;
   team->task_running_count = 0;
@@ -214,6 +214,7 @@ free_team (struct gomp_team *team)
 #endif
   gomp_barrier_destroy (&team->barrier);
   gomp_mutex_destroy (&team->task_lock);
+  priority_queue_free (&team->task_queue);
   free (team);
 }
 
@@ -271,6 +272,8 @@ gomp_free_thread (void *arg __attribute__((unused)))
       free (pool);
       thr->thread_pool = NULL;
     }
+  if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))
+    gomp_team_end ();
   if (thr->task != NULL)
     {
       struct gomp_task *task = thr->task;
@@ -300,7 +303,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
   struct gomp_thread **affinity_thr = NULL;
 
   thr = gomp_thread ();
-  nested = thr->ts.team != NULL;
+  nested = thr->ts.level;
   pool = thr->thread_pool;
   task = thr->task;
   icv = task ? &task->icv : &gomp_global_icv;
diff --git a/libgomp/testsuite/libgomp.c/doacross-3.c b/libgomp/testsuite/libgomp.c/doacross-3.c
new file mode 100644 (file)
index 0000000..eef0d5e
--- /dev/null
@@ -0,0 +1,225 @@
+extern void abort (void);
+
+#define N 256
+int a[N], b[N / 16][8][4], c[N / 32][8][8], g[N / 16][8][6];
+volatile int d, e;
+volatile unsigned long long f;
+
+int
+main ()
+{
+  unsigned long long i;
+  int j, k, l, m;
+  #pragma omp parallel private (l)
+  {
+    #pragma omp for schedule(guided, 3) ordered (1) nowait
+    for (i = 1; i < N + f; i++)
+      {
+       #pragma omp atomic write
+       a[i] = 1;
+       #pragma omp ordered depend(sink: i - 1)
+       if (i > 1)
+         {
+           #pragma omp atomic read
+           l = a[i - 1];
+           if (l < 2)
+             abort ();
+         }
+       #pragma omp atomic write
+       a[i] = 2;
+       if (i < N - 1)
+         {
+           #pragma omp atomic read
+           l = a[i + 1];
+           if (l == 3)
+             abort ();
+         }
+       #pragma omp ordered depend(source)
+       #pragma omp atomic write
+       a[i] = 3;
+      }
+    #pragma omp for schedule(guided) ordered (3) nowait
+    for (i = 3; i < N / 16 - 1 + f; i++)
+      for (j = 0; j < 8; j += 2)
+       for (k = 1; k <= 3; k++)
+         {
+           #pragma omp atomic write
+           b[i][j][k] = 1;
+           #pragma omp ordered depend(sink: i, j - 2, k - 1) \
+                               depend(sink: i - 2, j - 2, k + 1)
+           #pragma omp ordered depend(sink: i - 3, j + 2, k - 2)
+           if (j >= 2 && k > 1)
+             {
+               #pragma omp atomic read
+               l = b[i][j - 2][k - 1];
+               if (l < 2)
+                 abort ();
+             }
+           #pragma omp atomic write
+           b[i][j][k] = 2;
+           if (i >= 5 && j >= 2 && k < 3)
+             {
+               #pragma omp atomic read
+               l = b[i - 2][j - 2][k + 1];
+               if (l < 2)
+                 abort ();
+             }
+           if (i >= 6 && j < N / 16 - 3 && k == 3)
+             {
+               #pragma omp atomic read
+               l = b[i - 3][j + 2][k - 2];
+               if (l < 2)
+                 abort ();
+             }
+           #pragma omp ordered depend(source)
+           #pragma omp atomic write
+           b[i][j][k] = 3;
+         }
+#define A(n) int n;
+#define B(n) A(n##0) A(n##1) A(n##2) A(n##3)
+#define C(n) B(n##0) B(n##1) B(n##2) B(n##3)
+#define D(n) C(n##0) C(n##1) C(n##2) C(n##3)
+    D(m)
+#undef A
+    #pragma omp for collapse (2) ordered(61) schedule(guided, 15)
+    for (i = 2; i < N / 32 + f; i++)
+      for (j = 7; j > 1; j--)
+       for (k = 6; k >= 0; k -= 2)
+#define A(n) for (n = 4; n < 5; n++)
+         D(m)
+#undef A
+           {
+             #pragma omp atomic write
+             c[i][j][k] = 1;
+#define A(n) ,n
+#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321)
+             #pragma omp ordered depend (sink: i, j, k + 2 E(m)) \
+                                 depend (sink:i - 2, j + 1, k - 4 E(m)) \
+                                 depend(sink: i - 1, j - 2, k - 2 E(m))
+             if (k <= 4)
+               {
+                 l = c[i][j][k + 2];
+                 if (l < 2)
+                   abort ();
+               }
+             #pragma omp atomic write
+             c[i][j][k] = 2;
+             if (i >= 4 && j < 7 && k >= 4)
+               {
+                 l = c[i - 2][j + 1][k - 4];
+                 if (l < 2)
+                   abort ();
+               }
+             if (i >= 3 && j >= 4 && k >= 2)
+               {
+                 l = c[i - 1][j - 2][k - 2];
+                 if (l < 2)
+                   abort ();
+               }
+             #pragma omp ordered depend (source)
+             #pragma omp atomic write
+             c[i][j][k] = 3;
+           }
+    #pragma omp for schedule(guided, 5) ordered (3) nowait
+    for (j = 0; j < N / 16 - 1; j++)
+      for (k = 0; k < 8; k += 2)
+       for (i = 3; i <= 5 + f; i++)
+         {
+           #pragma omp atomic write
+           g[j][k][i] = 1;
+           #pragma omp ordered depend(sink: j, k - 2, i - 1) \
+                               depend(sink: j - 2, k - 2, i + 1)
+           #pragma omp ordered depend(sink: j - 3, k + 2, i - 2)
+           if (k >= 2 && i > 3)
+             {
+               #pragma omp atomic read
+               l = g[j][k - 2][i - 1];
+               if (l < 2)
+                 abort ();
+             }
+           #pragma omp atomic write
+           g[j][k][i] = 2;
+           if (j >= 2 && k >= 2 && i < 5)
+             {
+               #pragma omp atomic read
+               l = g[j - 2][k - 2][i + 1];
+               if (l < 2)
+                 abort ();
+             }
+           if (j >= 3 && k < N / 16 - 3 && i == 5)
+             {
+               #pragma omp atomic read
+               l = g[j - 3][k + 2][i - 2];
+               if (l < 2)
+                 abort ();
+             }
+           #pragma omp ordered depend(source)
+           #pragma omp atomic write
+           g[j][k][i] = 3;
+         }
+    #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k)
+    for (i = 2; i < f + 3; i++)
+      for (j = d + 1; j >= 0; j--)
+       for (k = 0; k < d; k++)
+         for (l = 0; l < d + 2; l++)
+           {
+             #pragma omp ordered depend (source)
+             #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+             if (!e)
+               abort ();
+           }
+    #pragma omp single
+    {
+      if (i != 3 || j != -1 || k != 0)
+       abort ();
+      i = 8; j = 9; k = 10;
+    }
+    #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k, m)
+    for (i = 2; i < f + 3; i++)
+      for (j = d + 1; j >= 0; j--)
+       for (k = 0; k < d + 2; k++)
+         for (m = 0; m < d; m++)
+           {
+             #pragma omp ordered depend (source)
+             #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, m)
+             abort ();
+           }
+    #pragma omp single
+    if (i != 3 || j != -1 || k != 2 || m != 0)
+      abort ();
+    #pragma omp for collapse(2) ordered(4) nowait
+    for (i = 2; i < f + 3; i++)
+      for (j = d; j > 0; j--)
+       for (k = 0; k < d + 2; k++)
+         for (l = 0; l < d + 4; l++)
+           {
+             #pragma omp ordered depend (source)
+             #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+             if (!e)
+               abort ();
+           }
+    #pragma omp for nowait
+    for (i = 0; i < N; i++)
+      if (a[i] != 3 * (i >= 1))
+       abort ();
+    #pragma omp for collapse(2) private(k) nowait
+    for (i = 0; i < N / 16; i++)
+      for (j = 0; j < 8; j++)
+       for (k = 0; k < 4; k++)
+         if (b[i][j][k] != 3 * (i >= 3 && i < N / 16 - 1 && (j & 1) == 0 && k >= 1))
+           abort ();
+    #pragma omp for collapse(3) nowait
+    for (i = 0; i < N / 32; i++)
+      for (j = 0; j < 8; j++)
+       for (k = 0; k < 8; k++)
+         if (c[i][j][k] != 3 * (i >= 2 && j >= 2 && (k & 1) == 0))
+           abort ();
+    #pragma omp for collapse(2) private(k) nowait
+    for (i = 0; i < N / 16; i++)
+      for (j = 0; j < 8; j++)
+       for (k = 0; k < 6; k++)
+         if (g[i][j][k] != 3 * (i < N / 16 - 1 && (j & 1) == 0 && k >= 3))
+           abort ();
+  }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/ordered-5.c b/libgomp/testsuite/libgomp.c/ordered-5.c
new file mode 100644 (file)
index 0000000..fac2440
--- /dev/null
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-additional-options "-msse2" { target sse2_runtime } } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+extern void abort (void);
+int a[1024], b = -1;
+
+int
+main ()
+{
+  int i;
+  #pragma omp parallel for simd ordered
+  for (i = 0; i < 1024; i++)
+    {
+      a[i] = i;
+      #pragma omp ordered threads simd
+      {
+       if (b + 1 != i)
+         abort ();
+       b = i;
+      }
+      a[i] += 3;
+    }
+  if (b != 1023)
+    abort ();
+  for (i = 0; i < 1024; i++)
+    if (a[i] != i + 3)
+      abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/priority.c b/libgomp/testsuite/libgomp.c/priority.c
new file mode 100644 (file)
index 0000000..012f09d
--- /dev/null
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-set-target-env-var OMP_MAX_TASK_PRIORITY "10" } */
+
+/* This test verifies that the "priority" clause of omp task works as
+   advertised.
+
+   Testing the OpenMP task scheduler is a bit tricky, especially when
+   trying to determine what ran first (without explicitly calling
+   time() and/or synchronizing between threads).  What we do here is
+   run in single threaded mode which guarantees that we won't run into
+   data races while accessing the "prio" array.
+
+   We give each task a priority from 0..63, while setting
+   OMP_MAX_TASK_PRIORITY to 10, which basically gives us 10 lower
+   priority tasks, and the rest scheduled to run earlier.  We verify
+   that the priority < 10 tasks run last.  */
+
+#include <omp.h>
+#include <stdlib.h>
+
+#define N 64
+
+int main()
+{
+  int tsknum=0, prio[N];
+  int max_priority = omp_get_max_task_priority ();
+  int saved_tsknum = -1;
+  int i;
+
+#pragma omp parallel num_threads(1)
+#pragma omp single private (i)
+  {
+    for (i = 0; i < N; i++)
+      #pragma omp task priority(i ^ 1)
+      {
+       int t;
+       #pragma omp atomic capture seq_cst
+       t = tsknum++;
+       prio[t] = i ^ 1;
+      }
+    #pragma omp atomic read seq_cst
+    saved_tsknum = tsknum;
+  }
+
+  /* If any of the tasks have run before all tasks were created, don't
+     make any assumption on the task order.  Otherwise, we should have
+     tasks with >= max_priority scheduled first in arbitrary order,
+     followed by the rest of tasks in decreasing priority order, as
+     there is only one thread that can schedule them.  */
+  if (saved_tsknum == 0)
+    {
+      for (i = 0; i < N; i++)
+       if (i < N - max_priority)
+         {
+           if (prio[i] < max_priority)
+             abort ();
+         }
+       else if (i != N - prio[i] - 1)
+         abort ();
+    }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-31.c b/libgomp/testsuite/libgomp.c/target-31.c
new file mode 100644 (file)
index 0000000..255327c
--- /dev/null
@@ -0,0 +1,163 @@
+#include <omp.h>
+#include <stdlib.h>
+
+int a = 1, b = 2, c = 3, d = 4;
+int e[2] = { 5, 6 }, f[2] = { 7, 8 }, g[2] = { 9, 10 }, h[2] = { 11, 12 };
+
+__attribute__((noinline, noclone)) void
+use (int *k, int *l, int *m, int *n, int *o, int *p, int *q, int *r)
+{
+  asm volatile ("" : : "r" (k) : "memory");
+  asm volatile ("" : : "r" (l) : "memory");
+  asm volatile ("" : : "r" (m) : "memory");
+  asm volatile ("" : : "r" (n) : "memory");
+  asm volatile ("" : : "r" (o) : "memory");
+  asm volatile ("" : : "r" (p) : "memory");
+  asm volatile ("" : : "r" (q) : "memory");
+  asm volatile ("" : : "r" (r) : "memory");
+}
+
+#pragma omp declare target to (use)
+
+int
+main ()
+{
+  int err = 0, r = -1, t[4];
+  long s[4] = { -1, -2, -3, -4 };
+  int j = 13, k = 14, l[2] = { 15, 16 }, m[2] = { 17, 18 };
+  #pragma omp target private (a, b, e, f) firstprivate (c, d, g, h) map(from: r, s, t) \
+                    map(tofrom: err, j, l) map(to: k, m)
+  #pragma omp teams num_teams (4) thread_limit (8) private (b, f) firstprivate (d, h, k, m)
+  {
+    int u1 = k, u2[2] = { m[0], m[1] };
+    int u3[64];
+    int i;
+    for (i = 0; i < 64; i++)
+      u3[i] = k + i;
+    #pragma omp parallel num_threads (1)
+    {
+      if (c != 3 || d != 4 || g[0] != 9 || g[1] != 10 || h[0] != 11 || h[1] != 12 || k != 14 || m[0] != 17 || m[1] != 18)
+       #pragma omp atomic write
+         err = 1;
+      b = omp_get_team_num ();
+      if (b >= 4)
+       #pragma omp atomic write
+         err = 1;
+      if (b == 0)
+       {
+         a = omp_get_num_teams ();
+         e[0] = 2 * a;
+         e[1] = 3 * a;
+       }
+      f[0] = 2 * b;
+      f[1] = 3 * b;
+      #pragma omp atomic update
+       c++;
+      #pragma omp atomic update
+       g[0] += 2;
+      #pragma omp atomic update
+       g[1] += 3;
+      d++;
+      h[0] += 2;
+      h[1] += 3;
+      k += b;
+      m[0] += 2 * b;
+      m[1] += 3 * b;
+    }
+    use (&a, &b, &c, &d, e, f, g, h);
+    #pragma omp parallel firstprivate (u1, u2)
+    {
+      int w = omp_get_thread_num ();
+      int x = 19;
+      int y[2] = { 20, 21 };
+      int v = 24;
+      int ll[64];
+      if (u1 != 14 || u2[0] != 17 || u2[1] != 18)
+       #pragma omp atomic write
+         err = 1;
+      u1 += w;
+      u2[0] += 2 * w;
+      u2[1] += 3 * w;
+      use (&u1, u2, &t[b], l, &k, m, &j, h);
+      #pragma omp master
+       t[b] = omp_get_num_threads ();
+      #pragma omp atomic update
+       j++;
+      #pragma omp atomic update
+       l[0] += 2;
+      #pragma omp atomic update
+       l[1] += 3;
+      #pragma omp atomic update
+       k += 4;
+      #pragma omp atomic update
+       m[0] += 5;
+      #pragma omp atomic update
+       m[1] += 6;
+      x += w;
+      y[0] += 2 * w;
+      y[1] += 3 * w;
+      #pragma omp simd safelen(32) private (v)
+      for (i = 0; i < 64; i++)
+       {
+         v = 3 * i;
+         ll[i] = u1 + v * u2[0] + u2[1] + x + y[0] + y[1] + v + h[0] + u3[i];
+       }
+      #pragma omp barrier
+      use (&u1, u2, &t[b], l, &k, m, &x, y);
+      if (w < 0 || w > 8 || w != omp_get_thread_num () || u1 != 14 + w
+         || u2[0] != 17 + 2 * w || u2[1] != 18 + 3 * w
+         || x != 19 + w || y[0] != 20 + 2 * w || y[1] != 21 + 3 * w
+         || v != 24)
+       #pragma omp atomic write
+         err = 1;
+      for (i = 0; i < 64; i++)
+       if (ll[i] != u1 + 3 * i * u2[0] + u2[1] + x + y[0] + y[1] + 3 * i + 13 + 14 + i)
+         #pragma omp atomic write
+           err = 1;
+    }
+    #pragma omp parallel num_threads (1)
+    {
+      if (b == 0)
+       {
+         r = a;
+         if (a != omp_get_num_teams ()
+             || e[0] != 2 * a
+             || e[1] != 3 * a)
+           #pragma omp atomic write
+             err = 1;
+       }
+      int v1, v2, v3;
+      #pragma omp atomic read
+       v1 = c;
+      #pragma omp atomic read
+       v2 = g[0];
+      #pragma omp atomic read
+       v3 = g[1];
+      s[b] = v1 * 65536L + v2 * 256L + v3;
+      if (d != 5 || h[0] != 13 || h[1] != 15
+         || k != 14 + b + 4 * t[b]
+         || m[0] != 17 + 2 * b + 5 * t[b]
+         || m[1] != 18 + 3 * b + 6 * t[b]
+         || b != omp_get_team_num ()
+         || f[0] != 2 * b || f[1] != 3 * b)
+       #pragma omp atomic write
+         err = 1;
+    }
+  }
+  if (err != 0) abort ();
+  if (r < 1 || r > 4) abort ();
+  if (a != 1 || b != 2 || c != 3 || d != 4) abort ();
+  if (e[0] != 5 || e[1] != 6 || f[0] != 7 || f[1] != 8) abort ();
+  if (g[0] != 9 || g[1] != 10 || h[0] != 11 || h[1] != 12) abort ();
+  int i, cnt = 0;
+  for (i = 0; i < r; i++)
+    if ((s[i] >> 16) < 3 + 1 || (s[i] >> 16) > 3 + 4
+       || ((s[i] >> 8) & 0xff) < 9 + 2 * 1 || ((s[i] >> 8) & 0xff) > 9 + 2 * 4
+       || (s[i] & 0xff) < 10 + 3 * 1 || (s[i] & 0xff) > 10 + 3 * 4
+       || t[i] < 1 || t[i] > 8)
+      abort ();
+    else
+      cnt += t[i];
+  if (j != 13 + cnt || l[0] != 15 + 2 * cnt || l[1] != 16 + 3 * cnt) abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-32.c b/libgomp/testsuite/libgomp.c/target-32.c
new file mode 100644 (file)
index 0000000..233877b
--- /dev/null
@@ -0,0 +1,54 @@
+#include <stdlib.h>
+#include <unistd.h>
+
+int main ()
+{
+  int a = 0, b = 0, c = 0, d[7];
+
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task depend(out: d[0])
+      a = 2;
+
+    #pragma omp target enter data nowait map(to: a,b,c) depend(in: d[0]) depend(out: d[1])
+
+    #pragma omp target nowait map(alloc: a) depend(in: d[1]) depend(out: d[2])
+      a++;
+
+    #pragma omp target nowait map(alloc: b) depend(in: d[2]) depend(out: d[3])
+    {
+      usleep (1000);
+      #pragma omp atomic update
+      b |= 4;
+    }
+
+    #pragma omp target nowait map(alloc: b) depend(in: d[2]) depend(out: d[4])
+    {
+      usleep (5000);
+      #pragma omp atomic update
+      b |= 1;
+    }
+
+    #pragma omp target nowait map(alloc: c) depend(in: d[3], d[4]) depend(out: d[5])
+    {
+      usleep (5000);
+      #pragma omp atomic update
+      c |= 8;
+    }
+
+    #pragma omp target nowait map(alloc: c) depend(in: d[3], d[4]) depend(out: d[6])
+    {
+      usleep (1000);
+      #pragma omp atomic update
+      c |= 2;
+    }
+
+    #pragma omp target exit data map(always,from: a,b,c) depend(in: d[5], d[6])
+  }
+
+  if (a != 3 || b != 5 || c != 10)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-33.c b/libgomp/testsuite/libgomp.c/target-33.c
new file mode 100644 (file)
index 0000000..1bed4b6
--- /dev/null
@@ -0,0 +1,93 @@
+extern void abort (void);
+
+int
+main ()
+{
+  int a = 1, b = 2, c = 4, d[7];
+  #pragma omp taskgroup
+  {
+    #pragma omp target enter data nowait map (to: a, b, c) depend(out: d[0])
+    #pragma omp target nowait map (alloc: a, b) depend(in: d[0]) depend(out: d[1])
+    {
+      #pragma omp atomic update
+      a |= 4;
+      #pragma omp atomic update
+      b |= 8;
+    }
+    #pragma omp target nowait map (alloc: a, c) depend(in: d[0]) depend(out: d[2])
+    {
+      #pragma omp atomic update
+      a |= 16;
+      #pragma omp atomic update
+      c |= 32;
+    }
+    #pragma omp target exit data nowait map (from: a, b, c) depend(in: d[1], d[2])
+  }
+  if (a != 21 || b != 10 || c != 36)
+    abort ();
+  #pragma omp target map (tofrom: a, b) nowait
+  {
+    a &= ~16;
+    b &= ~2;
+  }
+  #pragma omp target map (tofrom: c) nowait
+  {
+    c |= 8;
+  }
+  #pragma omp barrier
+  if (a != 5 || b != 8 || c != 44)
+    abort ();
+  #pragma omp target map (tofrom: a, b) nowait
+  {
+    a |= 32;
+    b |= 4;
+  }
+  #pragma omp target map (tofrom: c) nowait
+  {
+    c &= ~4;
+  }
+  #pragma omp taskwait
+  if (a != 37 || b != 12 || c != 40)
+    abort ();
+  #pragma omp target nowait map (tofrom: a, b) depend(out: d[3])
+  {
+    #pragma omp atomic update
+    a = a + 9;
+    b -= 8;
+  }
+  #pragma omp target nowait map (tofrom: a, c) depend(out: d[4])
+  {
+    #pragma omp atomic update
+    a = a + 4;
+    c >>= 1;
+  }
+  #pragma omp task if (0) depend (in: d[3], d[4]) shared (a, b, c)
+  if (a != 50 || b != 4 || c != 20)
+    abort ();
+  #pragma omp task shared (a)
+  a += 50;
+  #pragma omp target nowait map (tofrom: b)
+  b++;
+  #pragma omp target map (tofrom: c) nowait
+  c--;
+  #pragma omp taskwait
+  if (a != 100 || b != 5 || c != 19)
+    abort ();
+  #pragma omp target map (tofrom: a) nowait depend(out: d[5])
+  a++;
+  #pragma omp target map (tofrom: b) nowait depend(out: d[6])
+  b++;
+  #pragma omp target map (tofrom: a, b) depend(in: d[5], d[6])
+  {
+    if (a != 101 || b != 6)
+      a = -9;
+    else
+      {
+       a = 24;
+       b = 38;
+      }
+  }
+  if (a != 24 || b != 38)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-34.c b/libgomp/testsuite/libgomp.c/target-34.c
new file mode 100644 (file)
index 0000000..66d9f54
--- /dev/null
@@ -0,0 +1,112 @@
+extern void abort (void);
+
+int
+main ()
+{
+  int a = 1, b = 2, c = 4, d[7];
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      #pragma omp taskgroup
+      {
+       #pragma omp target enter data nowait map (to: a, b, c) depend(out: d[0])
+       #pragma omp target nowait map (alloc: a, b) depend(in: d[0]) depend(out: d[1])
+       {
+         #pragma omp atomic update
+         a |= 4;
+         #pragma omp atomic update
+         b |= 8;
+       }
+       #pragma omp target nowait map (alloc: a, c) depend(in: d[0]) depend(out: d[2])
+       {
+         #pragma omp atomic update
+         a |= 16;
+         #pragma omp atomic update
+         c |= 32;
+       }
+       #pragma omp target exit data nowait map (from: a, b, c) depend(in: d[1], d[2])
+      }
+      if (a != 21 || b != 10 || c != 36)
+       abort ();
+      #pragma omp target map (tofrom: a, b) nowait
+      {
+       a &= ~16;
+       b &= ~2;
+      }
+      #pragma omp target map (tofrom: c) nowait
+      {
+       c |= 8;
+      }
+    } /* Implicit barrier here.  */
+    #pragma omp single
+    {
+      if (a != 5 || b != 8 || c != 44)
+       abort ();
+      #pragma omp target map (tofrom: a, b) nowait
+      {
+       a |= 32;
+       b |= 4;
+      }
+      #pragma omp target map (tofrom: c) nowait
+      c &= ~4;
+      #pragma omp taskwait
+      if (a != 37 || b != 12 || c != 40)
+       abort ();
+      #pragma omp target nowait map (tofrom: a, b) depend(out: d[3])
+      {
+       #pragma omp atomic update
+       a = a + 9;
+       b -= 8;
+      }
+      #pragma omp target nowait map (tofrom: a, c) depend(out: d[4])
+      {
+       #pragma omp atomic update
+       a = a + 4;
+       c >>= 1;
+      }
+      #pragma omp task if (0) depend (in: d[3], d[4]) shared (a, b, c)
+      if (a != 50 || b != 4 || c != 20)
+       abort ();
+      #pragma omp task shared (a)
+      a += 50;
+      #pragma omp target nowait map (tofrom: b)
+      b++;
+      #pragma omp target map (tofrom: c) nowait
+      c--;
+      #pragma omp taskwait
+      if (a != 100 || b != 5 || c != 19)
+       abort ();
+      #pragma omp target map (tofrom: a) nowait depend(out: d[5])
+      a++;
+      #pragma omp target map (tofrom: b) nowait depend(out: d[6])
+      b++;
+      #pragma omp target map (tofrom: a, b) depend(in: d[5], d[6])
+      {
+       if (a != 101 || b != 6)
+         a = -9;
+       else
+         {
+           a = 24;
+           b = 38;
+         }
+      }
+      if (a != 24 || b != 38)
+       abort ();
+    } /* Implicit barrier here.  */
+    #pragma omp master
+    {
+      #pragma omp target nowait map (tofrom: a, b)
+      {
+       a *= 2;
+       b++;
+      }
+      #pragma omp target map (tofrom: c) nowait
+      c--;
+    }
+    #pragma omp barrier
+    if (a != 48 || b != 39 || c != 18)
+      abort ();
+  }
+  return 0;
+}
index 93419bdfaaf06ad31a32f7158be4e55f71834dc4..4086d548c0dba8f45773f4ec42438ada906c29c4 100644 (file)
@@ -1,3 +1,20 @@
+2015-11-14  Ilya Verbin  <ilya.verbin@intel.com>
+
+       * runtime/offload_host.cpp (task_completion_callback): New
+       variable.
+       (offload_proxy_task_completed_ooo): Call task_completion_callback.
+       (__offload_register_task_callback): New function.
+       * runtime/offload_host.h (__offload_register_task_callback): New
+       declaration.
+       * plugin/libgomp-plugin-intelmic.cpp (offload): Add async_data
+       argument, handle async offloading.
+       (register_main_image): Call register_main_image.
+       (GOMP_OFFLOAD_init_device, get_target_table, GOMP_OFFLOAD_alloc,
+       GOMP_OFFLOAD_free, GOMP_OFFLOAD_host2dev, GOMP_OFFLOAD_dev2host,
+       GOMP_OFFLOAD_dev2dev) Adjust offload callers.
+       (GOMP_OFFLOAD_async_run): New function.
+       (GOMP_OFFLOAD_run): Implement using GOMP_OFFLOAD_async_run.
+
 2015-10-26  Ilya Verbin  <ilya.verbin@intel.com>
            Aleksander Ivanushenko  <aleksander.ivanushenko@intel.com>
 
index 26ac6fed982c70651f5fb1f1aba42dcc822895f4..772e1985dfbbaf75ac6ec296b0e68a98000cc851 100644 (file)
@@ -192,11 +192,23 @@ GOMP_OFFLOAD_get_num_devices (void)
 
 static void
 offload (const char *file, uint64_t line, int device, const char *name,
-        int num_vars, VarDesc *vars, VarDesc2 *vars2)
+        int num_vars, VarDesc *vars, VarDesc2 *vars2, const void **async_data)
 {
   OFFLOAD ofld = __offload_target_acquire1 (&device, file, line);
   if (ofld)
-    __offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL, NULL);
+    {
+      if (async_data == NULL)
+       __offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL,
+                           NULL);
+      else
+       {
+         OffloadFlags flags;
+         flags.flags = 0;
+         flags.bits.omp_async = 1;
+         __offload_offload3 (ofld, name, 0, num_vars, vars, NULL, 0, NULL,
+                             async_data, 0, NULL, flags, NULL);
+       }
+    }
   else
     {
       fprintf (stderr, "%s:%d: Offload target acquire failed\n", file, line);
@@ -208,6 +220,10 @@ static void
 register_main_image ()
 {
   __offload_register_image (&main_target_image);
+
+  /* liboffloadmic will call GOMP_PLUGIN_target_task_completion when
+     asynchronous task on target is completed.  */
+  __offload_register_task_callback (GOMP_PLUGIN_target_task_completion);
 }
 
 /* liboffloadmic loads and runs offload_target_main on all available devices
@@ -218,7 +234,7 @@ GOMP_OFFLOAD_init_device (int device)
   TRACE ("");
   pthread_once (&main_image_is_registered, register_main_image);
   offload (__FILE__, __LINE__, device, "__offload_target_init_proc", 0,
-          NULL, NULL);
+          NULL, NULL, NULL);
 }
 
 extern "C" void
@@ -240,7 +256,7 @@ get_target_table (int device, int &num_funcs, int &num_vars, void **&table)
   VarDesc2 vd1g[2] = { { "num_funcs", 0 }, { "num_vars", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_table_p1", 2,
-          vd1, vd1g);
+          vd1, vd1g, NULL);
 
   int table_size = num_funcs + 2 * num_vars;
   if (table_size > 0)
@@ -254,7 +270,7 @@ get_target_table (int device, int &num_funcs, int &num_vars, void **&table)
       VarDesc2 vd2g = { "table", 0 };
 
       offload (__FILE__, __LINE__, device, "__offload_target_table_p2", 1,
-              &vd2, &vd2g);
+              &vd2, &vd2g, NULL);
     }
 }
 
@@ -401,8 +417,8 @@ GOMP_OFFLOAD_alloc (int device, size_t size)
   vd1[1].size = sizeof (void *);
   VarDesc2 vd1g[2] = { { "size", 0 }, { "tgt_ptr", 0 } };
 
-  offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g);
-
+  offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g,
+          NULL);
   return tgt_ptr;
 }
 
@@ -416,7 +432,8 @@ GOMP_OFFLOAD_free (int device, void *tgt_ptr)
   vd1.size = sizeof (void *);
   VarDesc2 vd1g = { "tgt_ptr", 0 };
 
-  offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g);
+  offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g,
+          NULL);
 }
 
 extern "C" void *
@@ -435,7 +452,7 @@ GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const void *host_ptr,
   VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p1", 2,
-          vd1, vd1g);
+          vd1, vd1g, NULL);
 
   VarDesc vd2 = vd_host2tgt;
   vd2.ptr = (void *) host_ptr;
@@ -443,7 +460,7 @@ GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const void *host_ptr,
   VarDesc2 vd2g = { "var", 0 };
 
   offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p2", 1,
-          &vd2, &vd2g);
+          &vd2, &vd2g, NULL);
 
   return tgt_ptr;
 }
@@ -464,7 +481,7 @@ GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const void *tgt_ptr,
   VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p1", 2,
-          vd1, vd1g);
+          vd1, vd1g, NULL);
 
   VarDesc vd2 = vd_tgt2host;
   vd2.ptr = (void *) host_ptr;
@@ -472,7 +489,7 @@ GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const void *tgt_ptr,
   VarDesc2 vd2g = { "var", 0 };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p2", 1,
-          &vd2, &vd2g);
+          &vd2, &vd2g, NULL);
 
   return host_ptr;
 }
@@ -495,22 +512,32 @@ GOMP_OFFLOAD_dev2dev (int device, void *dst_ptr, const void *src_ptr,
   VarDesc2 vd1g[3] = { { "dst_ptr", 0 }, { "src_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2tgt", 3, vd1,
-          vd1g);
+          vd1g, NULL);
 
   return dst_ptr;
 }
 
 extern "C" void
-GOMP_OFFLOAD_run (int device, void *tgt_fn, void *tgt_vars)
+GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
+                       void *async_data)
 {
-  TRACE ("(tgt_fn = %p, tgt_vars = %p)", tgt_fn, tgt_vars);
+  TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p, async_data = %p)", device,
+        tgt_fn, tgt_vars, async_data);
 
-  VarDesc vd1[2] = { vd_host2tgt, vd_host2tgt };
-  vd1[0].ptr = &tgt_fn;
-  vd1[0].size = sizeof (void *);
-  vd1[1].ptr = &tgt_vars;
-  vd1[1].size = sizeof (void *);
-  VarDesc2 vd1g[2] = { { "tgt_fn", 0 }, { "tgt_vars", 0 } };
+  VarDesc vd[2] = { vd_host2tgt, vd_host2tgt };
+  vd[0].ptr = &tgt_fn;
+  vd[0].size = sizeof (void *);
+  vd[1].ptr = &tgt_vars;
+  vd[1].size = sizeof (void *);
+
+  offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd, NULL,
+          (const void **) async_data);
+}
+
+extern "C" void
+GOMP_OFFLOAD_run (int device, void *tgt_fn, void *tgt_vars)
+{
+  TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p)", device, tgt_fn, tgt_vars);
 
-  offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd1, vd1g);
+  GOMP_OFFLOAD_async_run (device, tgt_fn, tgt_vars, NULL);
 }
index c10dc4efb550cca5c65f34702b066b0dea6a32a0..e52019dfb28e7733cba65fc506d160ccb27e4fba 100644 (file)
@@ -64,6 +64,8 @@ static void __offload_fini_library(void);
 #define GET_OFFLOAD_NUMBER(timer_data) \
     timer_data? timer_data->offload_number : 0
 
+static void (*task_completion_callback)(void *);
+
 extern "C" {
 #ifdef TARGET_WINNT
 // Windows does not support imports from libraries without actually
@@ -2507,7 +2509,7 @@ extern "C" {
         const void *info
     )
     {
-       /* TODO: Call callback function, pass info.  */
+       task_completion_callback ((void *) info);
     }
 }
 
@@ -5669,6 +5671,11 @@ extern "C" void __offload_unregister_image(const void *target_image)
     }
 }
 
+extern "C" void __offload_register_task_callback(void (*cb)(void *))
+{
+    task_completion_callback = cb;
+}
+
 // Runtime trace interface for user programs
 
 void __offload_console_trace(int level)
index afd5c9936ca168e90b3656d221453936bc264166..2a43fd6a68a2eb9b0b219eaf4ed97118bf3d032d 100644 (file)
@@ -376,6 +376,9 @@ extern "C" bool __offload_target_image_is_executable(const void *target_image);
 extern "C" bool __offload_register_image(const void* image);
 extern "C" void __offload_unregister_image(const void* image);
 
+// Registers asynchronous task completion callback
+extern "C" void __offload_register_task_callback(void (*cb)(void *));
+
 // Initializes offload runtime library.
 DLL_LOCAL extern int __offload_init_library(void);