30#include "kmp_dispatch.h"
32#include "kmp_dispatch_hier.h"
36#include "ompt-specific.h"
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}
// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  int monotonicity = SCHEDULE_MONOTONIC;
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;
  return monotonicity;
}
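// __kmp_dispatch_init_algorithm initializes the per-thread buffer 'pr' for one
// dynamically scheduled loop.  The loop is described by lb (lower bound),
// ub (upper bound) and st (stride); chunk is the schedule's chunk size.
// nproc is the number of threads participating in the schedule and tid is the
// calling thread's id within that group (0 .. nproc-1).  cur_chunk, when
// non-NULL, receives the chunk size used for ITT metadata reporting.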
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
                                   kmp_uint64 *cur_chunk,
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier = false;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or default if not
      // specified)
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE
      chunk = team->t.t_sched.chunk;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#ifdef KMP_DEBUG
      {
        char *buff;
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");
  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    tc = (ub >= lb) ? ub - lb + 1 : 0; // zero-trip if ub < lb
  } else if (st < 0) {
    // cast to unsigned so the division is unsigned regardless of result type
    tc = (lb >= ub) ? (UT)(lb - ub) / (-st) + 1 : 0;
  } else {
    tc = (ub >= lb) ? (UT)(ub - lb) / st + 1 : 0;
  }

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif
  /* NOTE: only the active parallel region(s) has active ordered sections */
  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }
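// The switch below precomputes the schedule-specific parameters in pr->u.p.
// For kmp_sch_static_steal the iteration space is pre-split into ntc chunks;
// each thread starts with a contiguous block of ntc/nproc chunks (the first
// ntc%nproc threads get one extra), recorded as [count, ub).  parm3 bounds the
// number of steal attempts and parm4 remembers which victim to try next.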
  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing, proportional to the
      // number of chunks per thread, capped at nproc
      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      if (traits_t<T>::type_size > 4) {
        // use a dynamically allocated per-thread lock; freed in
        // __kmp_dispatch_next when status == 0
        KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
        pr->u.p.th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.th_steal_lock);
      }
      break;
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      if (pr->u.p.parm1 <= 0)
        pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
      break;
    }
  } // case
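// kmp_sch_static_balanced hands every thread exactly one contiguous range,
// computed here at init time: tc/nproc iterations each, with the remainder
// spread over the first tc%nproc threads (e.g. tc=10, nproc=4 gives 3,3,2,2).
// parm1 records whether this thread owns the last iteration.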
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;
      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will
      // match it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to be a multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
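// For the guided schedules, parm2 is the threshold of remaining iterations at
// which the algorithm falls back to plain dynamic chunks, and parm3 stores the
// per-thread factor (guided_flt_param / nproc) that is multiplied by the
// number of remaining iterations to size each new chunk in
// __kmp_dispatch_next_algorithm.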
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    }
    break;
  } // case
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        DBL x;
#if KMP_USE_X87CONTROL
        // save original FPCW and set precision to 64-bit, as Windows* OS on
        // IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC);
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;
        /* crossover point--chunk indexes equal to or greater than this point
           switch to dynamic-style scheduling */
        UT cross;
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which
           C_i <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          }
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      pr->u.p.parm1 = tc;
    }
  } break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;
    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));
    if (parm2 < 1)
      parm2 = 1;
    /* L : size of the last cycle; not larger than the first cycle */
    if (parm1 < 1)
      parm1 = 1;
    else if (parm1 > parm2)
      parm1 = parm2;
    /* N : number of chunks */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;
    if (parm3 < 2)
      parm3 = 2;
    /* sigma : decreasing increment of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;
  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null); // Variadic argument list terminator
  } break;
  } // switch
  pr->schedule = schedule;
}
#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
      sh->hier = NULL;
    }
  }
}
#endif
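// __kmp_dispatch_init is the template behind the __kmpc_dispatch_init_*
// entry points below.  It picks the dispatch buffer for this loop instance
// (th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]), runs
// __kmp_dispatch_init_algorithm on it, installs the ordered enter/exit
// functions, and waits until the shared buffer with the same index has been
// released by the previous loop that used it.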
// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, else long double
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  // These blocks expand to nothing when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }
#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
  // Hierarchical scheduling does not work with ordered, so if ordered is
  // detected, then revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical for ordered parallel loops and don't use the
    // runtime hierarchy if one was specified in the program
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
                                &cur_chunk, chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
  }
  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary? */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);

    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
        schedtype = 2;
        break;
      default:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
  }
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // The 'static_steal_counter' identifies the loop instance; other threads
  // inspect it when searching for a victim, as a flag that this thread may be
  // stolen from since then.
  if (pr->schedule == kmp_sch_static_steal) {
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
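/* For ordered loops: __kmp_dispatch_finish() is called after every iteration
   (or __kmp_dispatch_finish_chunk() after every chunk under GOMP compat).
   It waits until sh->u.s.ordered_iteration reaches this thread's
   ordered_lower value and then bumps the shared counter so the next thread
   in logical iteration order may enter its ordered region. */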
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  kmp_int32 last = 0;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
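  // static_steal: work first on the chunks assigned at init time; once they
  // are exhausted, scan other threads' buffers (starting at parm4) and steal
  // about a quarter of a victim's remaining chunks (at least one, e.g. 2 of 8
  // remaining).  8-byte induction variables are protected by a per-thread
  // lock, 4-byte ones update the (count, ub) pair with a single 64-bit CAS.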
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte induction variables
      kmp_lock_t *lck = pr->u.p.th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;
        T id = pr->u.p.static_steal_counter; // loop id
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          T remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            continue; // try once more (nproc attempts in total)
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, goto next victim
          }

          lck = victim->u.p.th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
          // by 1
          if (remaining > 3) {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with stolen range but init chunk
          __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      // All operations on 'count' or 'ub' must combine those two in a single
      // operation.
      union_i4 vold, vnew;
      vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
      vnew = vold;
      vnew.p.count++;
      while (!KMP_COMPARE_AND_STORE_ACQ64(
          (volatile kmp_int64 *)&pr->u.p.count,
          *VOLATILE_CAST(kmp_int64 *) & vold.b,
          *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
        KMP_CPU_PAUSE();
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
      }
      vnew = vold;
      init = vnew.p.count;
      status = (init < (UT)vnew.p.ub);

      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;
        T id = pr->u.p.static_steal_counter; // loop id
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            continue; // try once more (nproc attempts in total)
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, goto next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
    break;
  } // case
#endif // KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = pr->u.p.parm1;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } // case
  break;
  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;
    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));
    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } // case
  break;
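  // guided (iterative): while many iterations remain, a thread claims a
  // fraction of them (remaining * parm3, where parm3 = guided_flt_param/nproc)
  // via CAS on sh->u.s.iteration; once fewer than parm2 iterations remain it
  // degenerates to dynamic chunks of size parm1.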
  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          }
        }
        break;
      }
      limit = init + (UT)(remaining * *(double *)&pr->u.p.parm3);
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      }
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;
  case kmp_sch_guided_simd: {
    // same as iterative but curr-chunk adjusted to be multiple of given chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          }
        }
        break;
      }
      // divide by K*nproc
      UT span = remaining * (*(double *)&pr->u.p.parm3);
      UT rem = span % chunk;
      if (rem) // adjust so that span%chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      }
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing original FPCW value for Windows* OS on IA-32 architecture
       8-byte version */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
           calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
/* use exponential-style scheduling */
#if KMP_USE_X87CONTROL
        /* if not already done, save original FPCW and set precision to 64-bit,
           as Windows* OS on IA-32 architecture defaults to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        }
      }
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary; check fpcwSet flag first because oldFpcw can
       be uninitialized here */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;
  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } // case
  break;
  default: {
    status = 0; // to avoid complaints on uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null); // Variadic argument list terminator
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                   \
  if (status == 0) {                                                    \
    if (ompt_enabled.ompt_callback_work) {                              \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);       \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);     \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                 \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),  \
          &(task_info->task_data), 0, codeptr);                         \
    }                                                                   \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                              \
  {                                                                     \
    kmp_int64 u, l, t, i;                                               \
    l = (kmp_int64)(*p_lb);                                             \
    u = (kmp_int64)(*p_ub);                                             \
    i = (kmp_int64)(pr->u.p.st);                                        \
    if (status == 0) {                                                  \
      t = 0;                                                            \
      KMP_POP_PARTITIONED_TIMER();                                      \
    } else if (i == 1) {                                                \
      if (u >= l)                                                       \
        t = u - l + 1;                                                  \
      else                                                              \
        t = 0;                                                          \
    } else if (i < 0) {                                                 \
      if (l >= u)                                                       \
        t = (l - u) / (-i) + 1;                                         \
      else                                                              \
        t = 0;                                                          \
    } else {                                                            \
      if (u >= l)                                                       \
        t = (u - l) / i + 1;                                            \
      else                                                              \
        t = 0;                                                          \
    }                                                                   \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                    \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
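// __kmp_dispatch_next is the template behind __kmpc_dispatch_next_*.  The
// serialized-team path hands out chunks from the thread-local buffer directly;
// otherwise it defers to __kmp_dispatch_next_algorithm (or the hierarchical
// scheduler) and, when every thread has finished, resets the shared buffer and
// advances sh->buffer_index so the buffer can be reused by a later loop.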
template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // schedule(runtime) is timed here even if the actual runtime schedule turns
  // out to be static.
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        }
      }
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    }
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          int idx = (th->th.th_dispatch->th_disp_index - 1) %
                    __kmp_dispatch_num_buffers; // current loop index
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            dispatch_private_info_template<T> *buf =
                reinterpret_cast<dispatch_private_info_template<T> *>(
                    &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
            kmp_lock_t *lck = buf->u.p.th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            buf->u.p.th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates.  */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        KMP_MB(); /* Flush all pending memory write invalidates.  */

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates.  */
      }
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
#if KMP_OS_WINDOWS
    } else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
    }
    if (p_last != NULL && status != 0)
      *p_last = last;
  } // else

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  KMP_STATS_LOOP_END;
  return status;
}
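// __kmp_dist_get_bounds computes this team's slice of a distribute loop:
// it derives the global trip count from *plower/*pupper/incr, then narrows
// the bounds to the chunk owned by team_id (balanced or greedy split,
// mirroring __kmp_static), and sets *plastiter for the team that owns the
// last iteration.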
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal (e.g. zero-trip loops the compiler cannot check).
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy ||
                     __kmp_static ==
                         kmp_sch_static_balanced); // Unknown static scheduling
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper;
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper;
      }
    }
  }
}
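// The __kmpc_* functions below are the compiler-visible entry points for
// dynamically scheduled worksharing loops; they forward to the templated
// __kmp_dispatch_init / __kmp_dist_get_bounds / __kmp_dispatch_next /
// __kmp_dispatch_finish implementations above for each loop index type.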
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                            );
}

int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                             );
}

int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                            );
}

int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                             );
}

void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}
kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
             void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    // if we have waited a bit, or are oversubscribed, yield
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    // if we have waited a bit, or are oversubscribed, yield
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}
#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */