/*
 * kmp_barrier.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_itt.h"
#include "kmp_os.h"
#include "kmp_stats.h"
#include "ompt-specific.h"

#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
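// On KMP_MIC with USE_NGO_STORES, the ngo_* macros above copy a cache line of
// ICVs (or the cache line holding b_go) with 512-bit non-temporal,
// non-globally-ordered stores and fence them with ngo_sync(); elsewhere they
// fall back to plain copy_icvs()/KMP_MEMCPY() and ngo_sync() is a no-op.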

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------
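// Each algorithm below is split into a gather phase, in which workers signal
// arrival toward the master via their b_arrived flags, and a release phase,
// in which the master propagates the b_go flags back out:
//  - linear:       the master waits on every worker's b_arrived directly
//  - tree:         children report to a parent tid chosen by a fixed branch
//                  factor
//  - hyper:        hypercube-embedded tree; release order is reversed when
//                  KMP_REVERSE_HYPER_BAR is defined
//  - hierarchical: topology-aware tree with an on-core fast path when
//                  blocktime is infinite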

// Linear Barrier
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to master thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the master thread at any
       time. */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                       new_state);
      if (cancellable) {
        bool cancelled = flag.wait_cancellable_nosleep(
            this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (cancelled)
          return true;
      } else {
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      ANNOTATE_BARRIER_END(other_threads[i]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        ANNOTATE_BARRIER_BEGIN(other_threads[i]);
        kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                         other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the MASTER thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    if (cancellable) {
      bool cancelled = flag.wait_cancellable_nosleep(
          this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      if (cancelled) {
        return true;
      }
    } else {
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
// The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}

static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}

static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}

// Tree barrier
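// For illustration of the indexing used by the tree gather/release below:
// with branch_bits == 2 (branch_factor == 4), the children of tid t are tids
// (t << 2) + 1 .. (t << 2) + 4 and the parent of tid t (t != 0) is
// (t - 1) >> 2, so tid 0's children are 1..4, tid 1's are 5..8, and the
// parent of tid 7 is tid 1.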
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                          int tid, void (*reduce)(void *, void *)
                                       USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint64 new_state;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64 flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the master thread at any
       time.  */
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the master thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}

static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1;

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      ANNOTATE_BARRIER_BEGIN(child_thr);
      kmp_flag_64 flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// Hyper Barrier
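// For illustration of the hypercube gather below: with branch_factor == 2 and
// 8 threads, at level 0 the odd tids (1,3,5,7) signal their neighbours
// (0,2,4,6); at level 1 tids 2 and 6 signal tids 0 and 4; at level 2 tid 4
// signals tid 0, which then holds the fully gathered state.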
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                           int tid, void (*reduce)(void *, void *)
                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go.  */
  kmp_flag_64 p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing for
         loop), a worker thread may not assume that the team is valid any more
         - it could be deallocated by the master thread at any time.  */
      ANNOTATE_BARRIER_BEGIN(this_thr);
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break;
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        ANNOTATE_REDUCE_AFTER(reduce);
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the master thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
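// For illustration: with branch_factor == 2 and 8 threads, the reverse
// release has tid 0 wake tids 4, 2, 1 (in that order) and tid 4 wake tids 6
// then 5, i.e. the mirror image of the gather above.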
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;

  /* Perform a hypercube-embedded tree release for all of the threads that have
     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
     are released in the reverse order of the corresponding gather, otherwise
     threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // master
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // master already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
  } else { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level at which the
      // parent would be notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
// Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        ANNOTATE_BARRIER_BEGIN(child_thr);
        kmp_flag_64 flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier
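// The hierarchical barrier arranges threads according to the machine
// hierarchy computed by __kmp_get_hierarchy(): each thread records its level,
// its parent tid, and the number of leaf children directly below it. When
// blocktime is infinite and the barrier is not nested, an "on-core" fast path
// lets leaf children signal single bytes of their parent's flags instead of
// spinning on flags of their own.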

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
   Performs the minimum amount of initialization required based on how the team
   has changed. Returns true if leaf children will require both on-core and
   traditional wake-up mechanisms. For example, if the team size increases,
   threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on
   their local b_go. */
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for master
    thr_bar->parent_tid = -1; // default for master
    if (!KMP_MASTER_TID(
            tid)) { // if not master, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the master
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
                   0) { // TODO: can we make this op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      thr_bar->leaf_kids = nproc - tid - 1;
    thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}
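// For illustration of the leaf_state/offset encoding above: the first leaf
// child (tid == parent_tid + 1) is assigned byte 7 of its parent's 64-bit
// flag, the next byte 6, and so on, and leaf_state is the mask with those
// bytes set. In the on-core gather the parent waits until all leaf bytes of
// its b_arrived flag are set and then clears them with an atomic AND (see
// __kmp_hierarchical_barrier_gather below).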

static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state;

  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    kmp_int32 child_tid;
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) {
        // First, wait for leaf children to check-in on my b_arrived flag
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          ANNOTATE_REDUCE_AFTER(reduce);
          OMPT_REDUCTION_DECL(this_thr, gtid);
          OMPT_REDUCTION_BEGIN;
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_BARRIER_END(other_threads[child_tid]);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          OMPT_REDUCTION_END;
          ANNOTATE_REDUCE_BEFORE(reduce);
          ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
        }
        // clear leaf_state bits
        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    } else { // Blocktime is not infinite
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          ANNOTATE_BARRIER_END(child_thr);
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            ANNOTATE_REDUCE_AFTER(reduce);
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
            ANNOTATE_REDUCE_BEFORE(reduce);
            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not master thread

  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
                  gtid, team->t.t_id, tid,
                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the master thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
      // flag; release it
      ANNOTATE_BARRIER_BEGIN(this_thr);
      kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
      flag.release();
    } else {
      // Leaf does special release on "offset" bits of parent's b_arrived flag
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Master thread needs to update the team's b_arrived value
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  // Is the team access below unsafe or just technically invalid?
  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}

static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  bool team_change = false; // indicates on-core barrier shouldn't be used

  if (KMP_MASTER_TID(tid)) {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
                  "entered barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  } else { // Worker threads
    // Wait for parent thread to release me
    if (!thr_bar->use_oncore_barrier ||
        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
        thr_bar->team == NULL) {
      // Use traditional method of waiting on my own b_go flag
      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
      kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(this_thr);
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
      // infinite, not nested
      // Wait on my "offset" bits on parent's b_go flag
      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
                           thr_bar->offset, bt,
                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag ==
          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
        TCW_8(thr_bar->b_go,
              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      } else { // Reset my bits on parent's b_go flag
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
      }
    }
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);

    KA_TRACE(
        20,
        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }

  nproc = this_thr->th.th_team_nproc;
  int level = team->t.t_level;
  if (team->t.t_threads[0]
          ->th.th_teams_microtask) { // are we inside the teams construct?
    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
        this_thr->th.th_teams_level == level)
      ++level; // level was not increased in teams construct for team_of_workers
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  }
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  // If the team size has increased, we still communicate with old leaves via
  // oncore barrier.
  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
                                                       tid, team);
  // But if the entire team changes, we won't use oncore barrier at all
  if (team_change)
    old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs) {
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    if (KMP_MASTER_TID(
            tid)) { // master already has copy in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
        // leaves (on-core children) pull parent's fixed ICVs directly to local
        // ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
      // non-leaves will get ICVs piggybacked with b_go via NGO store
    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs so
        // children can access them
        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
      else // leaves copy parent's fixed ICVs directly to local ICV store
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                  &thr_bar->parent_bar->th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PUSH

  // Now, release my children
  if (thr_bar->my_level) { // not a leaf
    kmp_int32 child_tid;
    kmp_uint32 last;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (KMP_MASTER_TID(tid)) { // do a flat release
        // Set local b_go to bump children via NGO store of the cache line
        // containing ICVs and b_go.
        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
        // the cache line
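        // Note: on MIC builds with USE_NGO_STORES the ngo_* macros map to
        // 64-byte non-globally-ordered stores; elsewhere they fall back to
        // plain copies and ngo_sync() is a no-op.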
        ngo_load(&thr_bar->th_fixed_icvs);
        // This loops over all the threads skipping only the leaf nodes in the
        // hierarchy
        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
             child_tid += thr_bar->skip_per_level[1]) {
          kmp_bstate_t *child_bar =
              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d)"
                        " go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Use ngo store (if available) to both store ICVs and release child
          // via child's b_go
          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
        }
        ngo_sync();
      }
      TCW_8(thr_bar->b_go,
            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
      // Now, release leaf children
      if (thr_bar->leaf_kids) { // if there are any
        // We test team_change on the off-chance that the level 1 team changed.
        if (team_change ||
            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
          if (old_leaf_kids) { // release old leaf kids
            thr_bar->b_go |= old_leaf_state;
          }
          // Release new leaf kids
          last = tid + thr_bar->skip_per_level[1];
          if (last > nproc)
            last = nproc;
          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
               ++child_tid) { // skip_per_level[0]=1
            kmp_info_t *child_thr = team->t.t_threads[child_tid];
            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
            KA_TRACE(
                20,
                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                 " T#%d(%d:%d) go(%p): %u => %u\n",
                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child using child's b_go flag
            ANNOTATE_BARRIER_BEGIN(child_thr);
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
          }
        } else { // Release all children at once with leaf_state bits on my own
          // b_go flag
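          // (each leaf child waits on its own byte of my b_go, so one OR of
          // leaf_state wakes all of them)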
          thr_bar->b_go |= thr_bar->leaf_state;
        }
      }
    } else { // Blocktime is not infinite; do a simple hierarchical release
      for (int d = thr_bar->my_level - 1; d >= 0;
           --d) { // Release highest level threads first
        last = tid + thr_bar->skip_per_level[d + 1];
        kmp_uint32 skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = team->t.t_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_go, child_bar->b_go,
                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
          // Release child using child's b_go flag
          ANNOTATE_BARRIER_BEGIN(child_thr);
          kmp_flag_64 flag(&child_bar->b_go, child_thr);
          flag.release();
        }
      }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid))
      // non-leaves copy ICVs from fixed ICVs to local dest
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
  }
  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}

// End of Barrier Algorithms

// type traits for cancellable value
// if cancellable is true, then is_cancellable is a normal boolean variable
// if cancellable is false, then is_cancellable is a compile time constant
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};
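// Illustration of the intent (not used directly): with cancellable=false the
// value collapses to a compile-time constant, e.g.
//   is_cancellable<false> c;
//   c = true;        // assignment is a no-op
//   if (c) { ... }   // folds to if (false); the branch can be eliminated
// whereas is_cancellable<true> behaves like an ordinary bool. This lets
// __kmp_barrier_template share one body for both variants at no runtime cost.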

// Internal function to do a barrier.
/* If is_split is true, do a split barrier; otherwise, do a plain barrier.
   If reduce is non-NULL, perform the reduction as part of the barrier
   (a split reduction barrier); otherwise no reduction is done.
   When cancellable = false:
     Returns 0 if master thread, 1 if worker thread.
   When cancellable = true:
     Returns 0 if not cancelled, 1 if cancelled.  */
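// In this file the template is instantiated with cancellable=false by
// __kmp_barrier() and with cancellable=true by __kmp_barrier_gomp_cancel().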
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int status = 0;
  is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif

  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    // It is OK to report the barrier state after the barrier begin callback.
    // According to the OMPT specification, a compliant implementation may
    // even delay reporting this state until the barrier begins to wait.
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
  }
#endif

  if (!team->t.t_serialized) {
#if USE_ITT_BUILD
    // This value will be used in itt notify events below.
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
#endif
#endif /* USE_ITT_BUILD */
    if (__kmp_tasking_mode == tskm_extra_barrier) {
      __kmp_tasking_barrier(team, this_thr, gtid);
      KA_TRACE(15,
               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
    }

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
       access it when the team struct is not guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
    // Let the debugger know: the thread has arrived at the barrier and is
    // waiting.
    if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
      team->t.t_bar[bt].b_master_arrived += 1;
    } else {
      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
    } // if
#endif /* USE_DEBUGGER */
    if (reduce != NULL) {
      // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
      this_thr->th.th_local.reduce_data = reduce_data;
    }

    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      // use 0 to set up only the current team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);

    if (cancellable) {
      cancelled = __kmp_linear_barrier_gather_cancellable(
          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
    } else {
      switch (__kmp_barrier_gather_pattern[bt]) {
      case bp_hyper_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(
            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_tree_bar: {
        // don't set branch bits to 0; use linear
        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      default: {
        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
    }

    KMP_MB();

    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: all threads have arrived and are starting to
      // leave the barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif

      if (__kmp_omp_cancellation) {
        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
        // Reset cancellation flag for worksharing constructs
        if (cancel_request == cancel_loop ||
            cancel_request == cancel_sections) {
          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
        }
      }
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, master thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions).  */
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
          this_thr->th.th_teams_microtask == NULL &&
          team->t.t_active_level == 1) {
        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
          // be fixed)
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with master's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
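            // delta now holds the aggregate wait time: the sum over all team
            // threads of (cur_time - arrival time), reported below as the
            // imbalance metric.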
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
    if ((status == 1 || !is_split) && !cancelled) {
      if (cancellable) {
        cancelled = __kmp_linear_barrier_release_cancellable(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      } else {
        switch (__kmp_barrier_release_pattern[bt]) {
        case bp_hyper_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hierarchical_bar: {
          __kmp_hierarchical_barrier_release(
              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_tree_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        default: {
          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the actual
       release time of the threads for split barriers.  */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      if (this_thr->th.th_task_team != NULL) {
#if USE_ITT_NOTIFY
        void *itt_sync_obj = NULL;
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif

        KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
                         TRUE);
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
        __kmp_task_team_setup(this_thr, team, 0);

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif
  ANNOTATE_BARRIER_END(&team->t.t_bar);

  if (cancellable)
    return (int)cancelled;
  return status;
}

// Returns 0 if master thread, 1 if worker thread.
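// Note: typical callers include, e.g., __kmpc_barrier() for an explicit
// barrier; the reduction entry points pass a non-NULL reduce callback.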
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}

#if defined(KMP_GOMP_COMPAT)
// Returns 1 if cancelled, 0 otherwise
int __kmp_barrier_gomp_cancel(int gtid) {
  if (__kmp_omp_cancellation) {
    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
                                                 0, NULL, NULL);
    if (cancelled) {
      int tid = __kmp_tid_from_gtid(gtid);
      kmp_info_t *this_thr = __kmp_threads[gtid];
      if (KMP_MASTER_TID(tid)) {
        // Master does not need to revert anything
      } else {
        // Workers need to revert their private b_arrived flag, which was
        // bumped during the gather phase of the cancelled barrier
        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
            KMP_BARRIER_STATE_BUMP;
      }
    }
    return cancelled;
  }
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
  return FALSE;
}
#endif

void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
  if (!team->t.t_serialized) {
    if (KMP_MASTER_GTID(gtid)) {
      switch (__kmp_barrier_release_pattern[bt]) {
      case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                    FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
                                           FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      default: {
        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(NULL));
      }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
      } // if
    }
  }
  ANNOTATE_BARRIER_END(&team->t.t_bar);
}

void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  kmp_uint nproc;
  kmp_info_t *master_thread;
  int tid;
#ifdef KMP_DEBUG
  int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  nproc = this_thr->th.th_team_nproc;
  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
#endif /* KMP_DEBUG */
  master_thread = this_thr->th.th_team_master;
#ifdef KMP_DEBUG
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
          my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
                  team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
                     team->t.t_task_team[this_thr->th.th_task_state]);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss that slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any time
     by the master thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs.  */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have master thread flag the workers to indicate they are now waiting for
    // the next parallel region. Also wake them up so they switch their timers
    // to idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
                                  team_thread->th.th_sleep_loc);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      int nproc = this_thr->th.th_team_nproc;
      int i;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with master's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          for (i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO: now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

  ANNOTATE_BARRIER_END(&team->t.t_bar);
}

// TODO release worker threads' fork barriers as we are ready instead of all at
// once
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
  if (team)
    ANNOTATE_BARRIER_END(&team->t.t_bar);

  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));

  // th_team pointer only valid for master thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();

    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // 0 indicates: set up the current task team if nthreads > 1
      __kmp_task_team_setup(this_thr, team, 0);
    }

    /* The master thread may have changed its blocktime between the join barrier
       and the fork barrier. Copy the blocktime info to the thread, where
       __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // master

  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     master and propagated to all worker threads. The current thread, however,
     may not be part of the team, so we can't blindly assume that the team
     pointer is non-null.  */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
     struct, because it is not always the case that the threads arrays have
     been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
      // Copy the initial ICVs from the master's thread struct to the implicit
      // task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_set_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
#endif
            ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  ANNOTATE_BARRIER_END(&team->t.t_bar);
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}

void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

/* Master thread's copy of the ICVs was set up on the implicit taskdata in
   __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
   implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy the ICVs into the master thread's th_fixed_icvs (which remains
     untouched afterwards), where all of the worker threads can access them and
     make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}