#if (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#include <emmintrin.h>
#include <immintrin.h>
#endif

#define RTUSENEWWIN32APIS 1
#define _WIN32_WINNT 0x0400

#if defined(_AIX) || defined(_CRAY) || defined(__irix) || defined(__linux) || defined(__osf__) || defined(__sun)
/* ... */
#endif

#if defined(__APPLE__) && defined(THR)
#include <sys/types.h>
#include <sys/sysctl.h>      /* CPU count queries via sysctl()          */
#include <Carbon/Carbon.h>   /* legacy Carbon Multiprocessing APIs      */
#endif

#if defined(__linux) && (defined(ARCH_LINUXARM64) || defined(__ARM_ARCH_ISA_A64) || defined(__ARM_NEON))
#include <sys/auxv.h>        /* getauxval() hardware capability queries */
#endif

#if defined(__hpux)
#include <sys/mpctl.h>       /* mpctl() CPU count queries               */
#endif

/* number of processors available, subject to user override */
int rt_thread_numprocessors(void) {
  int a = 1;

#if defined(__APPLE__)
#if 1   /* modern sysctl() CPU count query */
  int rc;
  int mib[2];
  u_int miblen = 2;
  size_t alen = sizeof(a);
  mib[0] = CTL_HW;
  mib[1] = HW_AVAILCPU;
  rc = sysctl(mib, miblen, &a, &alen, NULL, 0);   /* active CPU count */
  if (rc < 0) {
    perror("Error during sysctl() query for CPU count");
    a = 1;
  }
#else   /* legacy Carbon Multiprocessing query */
  a = MPProcessorsScheduled();       /* number of active/running CPUs */
#endif
#endif

#if defined(_MSC_VER)
  struct _SYSTEM_INFO sysinfo;
  GetSystemInfo(&sysinfo);
  a = sysinfo.dwNumberOfProcessors;  /* total CPU count */
#endif

#if defined(__PARAGON__)
  /* ... */
#endif

#if defined(_CRAY)
  a = sysconf(_SC_CRAY_NCPU);        /* total CPU count */
#endif

#if defined(ANDROID) || defined(USEPHYSCPUCOUNT)
  /* Android and similar platforms toggle cores on and off according to */
  /* load, thermal, and battery state, so count the physical cores      */
  /* rather than the momentarily-online core count.                     */
  a = sysconf(_SC_NPROCESSORS_CONF);

  /* parse the kernel's own list of usable core indices */
  {
    FILE *ifp;
    int rc=0, b=1, i=-1, j=-1;
    ifp = fopen("/sys/devices/system/cpu/present", "r");
    if (ifp != NULL) {
      rc = fscanf(ifp, "%d-%d", &i, &j);  /* read range of core indices */
      fclose(ifp);
      if (rc == 2 && i == 0) {
        b = j + 1;   /* parsed a "0-N" range, so N+1 cores are present */
      }
    }
    /* ... */
  }
#else
#if defined(__sun) || defined(__linux) || defined(__osf__) || defined(_AIX)
  a = sysconf(_SC_NPROCESSORS_ONLN); /* active/online CPU count */
#endif

#if defined(__irix)
  a = sysconf(_SC_NPROC_ONLN);       /* active/online CPU count */
#endif

#if defined(__hpux)
  a = mpctl(MPC_GETNUMSPUS, 0, 0);   /* total CPU count */
#endif
#endif

  /* allow the user to override the detected CPU count, e.g. for */
  /* benchmarking and scalability testing purposes               */
  char *forcecount = getenv("RTFORCECPUCOUNT");
  if (forcecount != NULL) {
    if (sscanf(forcecount, "%d", &a) == 1) {
      return a;   /* got a valid count, use it */
    }
  }

  return a;
}
#define RT_USEINTCPUID 1
#if defined(RT_USEINTCPUID) && (defined(__GNUC__) || defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER >= 1916))) && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))

#if 1   /* current implementation */
/* query a CPUID leaf/subleaf, returning EAX/EBX/ECX/EDX in abcd[] */
static void rt_cpuid(unsigned int eax, unsigned int ecx, unsigned int *abcd) {
#if defined(_MSC_VER)
  __cpuidex((int *) abcd, eax, ecx);
#else
  unsigned int ebx=0, edx=0;
#if defined(__i386__) && defined(__PIC__)
  /* under 32-bit PIC builds, EBX is reserved and must be preserved */
  __asm__("movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi"
          : "=D" (ebx),
#else
  __asm__("cpuid"
          : "+b" (ebx),
#endif
            "+a" (eax), "+c" (ecx), "=d" (edx));
  abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
#endif
}
#else   /* older inline assembly variant, retained for reference */
static void rt_cpuid(unsigned int eax, unsigned int ecx, unsigned int *info) {
  __asm__ __volatile__(
    /* ... cpuid sequence, preserving EBX by exchanging it with EDI ... */
    : "=a" (info[0]), "=D" (info[1]), "=c" (info[2]), "=d" (info[3])
    /* ... */
  );
}
#endif

/* read an extended control register (XCR); index 0 yields XCR0,   */
/* the XSAVE feature-enable mask maintained by the operating system */
static unsigned long long rt_xgetbv(unsigned int index) {
#if defined(_MSC_VER)
  return _xgetbv(index);
#else
  unsigned int eax=0, edx=0;
  __asm__ __volatile__(
    "xgetbv"
    : "=a" (eax), "=d" (edx)
    : "c" (index)
  );
  return ((unsigned long long) edx << 32) | eax;
#endif
}
#endif
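/*
 * Example (illustrative sketch, not part of threads.c): reading the
 * 12-byte CPU vendor string with CPUID leaf 0 via rt_cpuid() above.
 * Note the EBX/EDX/ECX register ordering needed to spell the vendor.
 */
static void example_print_vendor(void) {
  unsigned int info[4] = { 0 };
  char vendor[13] = { 0 };
  rt_cpuid(0, 0, info);                    /* leaf 0: max leaf + vendor  */
  ((unsigned int *) vendor)[0] = info[1];  /* EBX: "Genu" / "Auth" */
  ((unsigned int *) vendor)[1] = info[3];  /* EDX: "ineI" / "enti" */
  ((unsigned int *) vendor)[2] = info[2];  /* ECX: "ntel" / "cAMD" */
  printf("CPU vendor: %s\n", vendor);
}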
/* CPU optional instruction set capability flags */
int rt_cpu_capability_flags(rt_cpu_caps_t *cpucaps) {
  int flags = CPU_UNKNOWN;
  int smtdepth = CPU_SMTDEPTH_UNKNOWN;

#if defined(RT_USEINTCPUID) && (defined(__GNUC__) || defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER >= 1916))) && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
#define RT_INTERNAL_ENABLE_CPUCAP_BAILOUT 1
  int havexmmymm = 0;
  int havezmmmask = 0;
  int haveosxsave = 0;
  unsigned int vendcpuinfo[4] = { 0 };
  unsigned int cpuinfo[4] = { 0 };
  unsigned long long xcrFeatureMask = 0;

  rt_cpuid(0, 0, vendcpuinfo);    /* leaf 0: vendor string, max basic leaf */
  if (vendcpuinfo[0] == 0)
    goto nocpuinfo;               /* bail out if CPUID is unusable */

  rt_cpuid(1, 0, cpuinfo);        /* leaf 1: feature flags */
  haveosxsave = (cpuinfo[2] & (1 << 27)) != 0;  /* OS uses XSAVE/XRSTOR */

  flags = 0;
  flags |= ((cpuinfo[2] & (1 << 19)) != 0) * CPU_SSE4_1;
  flags |= ((cpuinfo[2] & (1 << 29)) != 0) * CPU_F16C;
  flags |= ((cpuinfo[3] & (1 << 26)) != 0) * CPU_SSE2;
  flags |= ((cpuinfo[3] & (1 << 28)) != 0) * CPU_HT;

  /* AVX and wider extensions also require OS support for saving the  */
  /* extended register state, so query XCR0 via xgetbv when AVX bits  */
  /* are present                                                      */
  if ((cpuinfo[2] & (1 << 28)) != 0) {
    xcrFeatureMask = rt_xgetbv(0);
    havexmmymm  = (xcrFeatureMask & 0x06) == 0x06;  /* XMM+YMM state saved   */
    havezmmmask = (xcrFeatureMask & 0xE6) == 0xE6;  /* +ZMM and mask regs too */
  }

  flags |= (((cpuinfo[2] & (1 << 12)) != 0) &&
            havexmmymm && haveosxsave) * CPU_FMA;

  flags |= (((cpuinfo[2] & (1 << 28)) != 0) &&
            havexmmymm && haveosxsave) * CPU_AVX;

  /* leaf 7 reports AVX2 and the AVX-512 feature subsets */
  if (cpuinfo[0] >= 0x7) {
    unsigned int extcpuinfo[4] = { 0 };
    rt_cpuid(7, 0, extcpuinfo);

    flags |= (((extcpuinfo[1] & (1 << 5)) != 0) &&
              havexmmymm && haveosxsave) * CPU_AVX2;

    flags |= (((extcpuinfo[1] & (1 << 16)) != 0) &&
              havezmmmask && haveosxsave) * CPU_AVX512F;
    flags |= (((extcpuinfo[1] & (1 << 26)) != 0) &&
              havezmmmask && haveosxsave) * CPU_AVX512PF;
    flags |= (((extcpuinfo[1] & (1 << 27)) != 0) &&
              havezmmmask && haveosxsave) * CPU_AVX512ER;
    flags |= (((extcpuinfo[1] & (1 << 28)) != 0) &&
              havezmmmask && haveosxsave) * CPU_AVX512CD;
  }

  smtdepth = 1;
  if (flags & CPU_HT) {
    int logicalcores = (cpuinfo[1] >> 16) & 0xFF;  /* CPUID leaf 1, EBX[23:16] */
    int physicalcores = logicalcores;
    char vendor[16] = { 0 };
    ((unsigned *) vendor)[0] = vendcpuinfo[1];   /* EBX */
    ((unsigned *) vendor)[1] = vendcpuinfo[3];   /* EDX */
    ((unsigned *) vendor)[2] = vendcpuinfo[2];   /* ECX */

    if (!strcmp(vendor, "GenuineIntel")) {
      unsigned int corecpuinfo[4] = { 0 };
      rt_cpuid(4, 0, corecpuinfo);           /* deterministic cache/core leaf */
      physicalcores = ((corecpuinfo[0] >> 26) & 0x3f) + 1;
    } else if (!strcmp(vendor, "AuthenticAMD")) {
      unsigned int corecpuinfo[4] = { 0 };
      rt_cpuid(0x80000008, 0, corecpuinfo);  /* AMD extended core count leaf */
      physicalcores = (corecpuinfo[2] & 0xFF) + 1;
    }

    printf("cpuinfo: %d / %d vend: %s\n", logicalcores, physicalcores, vendor);
    smtdepth = logicalcores / physicalcores;
  }
#elif defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)
  /* Intel compilers provide _may_i_use_cpu_feature() for runtime checks */
  flags |= _may_i_use_cpu_feature(_FEATURE_SSE2)     * CPU_SSE2;
  flags |= _may_i_use_cpu_feature(_FEATURE_SSE4_1)   * CPU_SSE4_1;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX)      * CPU_AVX;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX2)     * CPU_AVX2;
  flags |= _may_i_use_cpu_feature(_FEATURE_FMA)      * CPU_FMA;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX512F)  * CPU_AVX512F;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX512CD) * CPU_AVX512CD;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX512ER) * CPU_AVX512ER;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX512PF) * CPU_AVX512PF;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
  /* GCC provides __builtin_cpu_supports() for runtime feature checks */
  __builtin_cpu_init();
  flags |= (__builtin_cpu_supports("sse2")     != 0) * CPU_SSE2;
  flags |= (__builtin_cpu_supports("sse4.1")   != 0) * CPU_SSE4_1;
  flags |= (__builtin_cpu_supports("avx")      != 0) * CPU_AVX;
  flags |= (__builtin_cpu_supports("avx2")     != 0) * CPU_AVX2;
  flags |= (__builtin_cpu_supports("fma")      != 0) * CPU_FMA;
  flags |= (__builtin_cpu_supports("avx512f")  != 0) * CPU_AVX512F;
  flags |= (__builtin_cpu_supports("avx512cd") != 0) * CPU_AVX512CD;
  flags |= (__builtin_cpu_supports("avx512er") != 0) * CPU_AVX512ER;
  flags |= (__builtin_cpu_supports("avx512pf") != 0) * CPU_AVX512PF;
#elif defined(__linux) && (defined(ARCH_LINUXARM64) || defined(__ARM_ARCH_ISA_A64) || defined(__ARM_NEON))
  /* on ARM Linux, query hardware capability bits from the ELF aux vector */
  unsigned long auxval1=0;
  auxval1 = getauxval(AT_HWCAP);
  /* ... map HWCAP_xxx bits onto the CPU_ARM64_xxx flags ... */
#endif

#if defined(RT_INTERNAL_ENABLE_CPUCAP_BAILOUT)
nocpuinfo:
#endif
  cpucaps->flags = flags;
  cpucaps->smtdepth = smtdepth;

  return 0;
}
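/*
 * Example (illustrative sketch, not part of threads.c): decoding the
 * capability flags filled in above.  The 'flags' and 'smtdepth' member
 * names and the zero-on-success return convention are assumptions made
 * for this sketch.
 */
static void example_report_caps(void) {
  rt_cpu_caps_t caps;
  if (rt_cpu_capability_flags(&caps) == 0) {
    if (caps.flags & CPU_SSE2)    printf("SSE2 available\n");
    if (caps.flags & CPU_AVX)     printf("AVX available\n");
    if (caps.flags & CPU_AVX2)    printf("AVX2 available\n");
    if (caps.flags & CPU_AVX512F) printf("AVX-512F available\n");
    printf("SMT depth: %d\n", caps.smtdepth);
  }
}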
#if defined(RT_USEINTCPUID) && (defined(__GNUC__) || defined(__INTEL_COMPILER)) && (defined(__i386__) || defined(__x86_64__))
/* ... rt_cpu_smt_depth() obtains the SMT depth via the CPUID query above ... */
#endif

/* query CPU affinity of the calling process (if allowed by host system) */
int * rt_cpu_affinitylist(int *cpuaffinitycount) {
  int *affinitylist = NULL;
  *cpuaffinitycount = -1;   /* return -1 if unimplemented or an error occurs */

/* Win32 process affinity mask query (untested, currently disabled) */
#if 0 && defined(_MSC_VER)
  int i, affinitycount=0, curcount=0;
  HANDLE myproc = GetCurrentProcess();   /* returns a pseudo-handle */
  DWORD affinitymask, sysaffinitymask;

  if (!GetProcessAffinityMask(myproc, &affinitymask, &sysaffinitymask)) {
    return NULL;   /* affinity mask query failed */
  }

  /* count the CPUs this process may use */
  for (i=0; i<31; i++) {
    affinitycount += (affinitymask >> i) & 0x1;
  }

  /* allocate and populate the affinity list */
  if (affinitycount > 0) {
    affinitylist = (int *) malloc(affinitycount * sizeof(int));
    if (affinitylist == NULL)
      return NULL;

    for (i=0; i<CPU_SETSIZE; i++) {
      if (CPU_ISSET(i, &affinitymask)) {
        affinitylist[curcount] = i;
        curcount++;
      }
    }
  }

  *cpuaffinitycount = affinitycount;
#endif

/* Linux process affinity mask query */
#if defined(CPU_SETSIZE)
  int i, affinitycount=0, curcount=0;
  cpu_set_t affinitymask;

  /* PID 0 refers to the current process */
  if (sched_getaffinity(0, sizeof(affinitymask), &affinitymask) < 0) {
    perror("rt_cpu_affinitylist: sched_getaffinity");
    return NULL;
  }

  /* count the CPUs this process may use */
  for (i=0; i<CPU_SETSIZE; i++) {
    affinitycount += CPU_ISSET(i, &affinitymask);
  }

  /* allocate and populate the affinity list */
  if (affinitycount > 0) {
    affinitylist = (int *) malloc(affinitycount * sizeof(int));
    if (affinitylist == NULL)
      return NULL;

    for (i=0; i<CPU_SETSIZE; i++) {
      if (CPU_ISSET(i, &affinitymask)) {
        affinitylist[curcount] = i;
        curcount++;
      }
    }
  }

  *cpuaffinitycount = affinitycount;   /* return the final affinity count */
#endif

  return affinitylist;
}
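/*
 * Example (illustrative sketch, not part of threads.c): listing the CPUs
 * the current process may run on.  Assumes the caller is responsible for
 * freeing the malloc()ed list returned by rt_cpu_affinitylist().
 */
static void example_show_affinity(void) {
  int i, count = 0;
  int *cpulist = rt_cpu_affinitylist(&count);
  if (cpulist != NULL && count > 0) {
    for (i=0; i<count; i++)
      printf("allowed CPU: %d\n", cpulist[i]);
    free(cpulist);
  }
}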
/* set the CPU affinity of the current thread (if allowed by host system) */
int rt_thread_set_self_cpuaffinity(int cpu) {
  int status = -1;   /* unsupported by default */

#if defined(__linux) && defined(CPU_ZERO) && defined(CPU_SET)
  cpu_set_t affinitymask;
  CPU_ZERO(&affinitymask);
  CPU_SET(cpu, &affinitymask);

#if 0   /* one build variant (guard condition elided) pins via pthreads */
  status = pthread_setaffinity_np(pthread_self(),
                                  sizeof(affinitymask), &affinitymask);
#else   /* the other variant pins via sched_setaffinity();   */
        /* PID 0 refers to the current process                */
  if ((status = sched_setaffinity(0, sizeof(affinitymask), &affinitymask)) < 0) {
    perror("rt_thread_set_self_cpuaffinitylist: sched_setaffinity");
    return status;
  }
#endif
#endif

  return status;
}
/* set the concurrency level and scheduling scope for threads */
int rt_thread_setconcurrency(int nthr) {
  int status=0;

#ifdef USEPOSIXTHREADS
  status = pthread_setconcurrency(nthr);
#else
  status = thr_setconcurrency(nthr);   /* Solaris UI threads variant */
#endif

#if defined(__irix) || defined(_AIX)
  status = pthread_setconcurrency(nthr);
#endif

  return status;
}

/* typedef to eliminate compiler warning caused by C/C++ linkage conflict */
typedef void * (*RTTHREAD_START_ROUTINE)(void *);

/* create a new child thread */
int rt_thread_create(rt_thread_t *thr, void *fctn(void *), void *arg) {
  int status=0;

#ifdef _MSC_VER
  DWORD tid;   /* thread ID, MSVC only */
  *thr = CreateThread(NULL, 8192, (LPTHREAD_START_ROUTINE) fctn, arg, 0, &tid);
#endif

#ifdef USEPOSIXTHREADS
  pthread_attr_t attr;
  pthread_attr_init(&attr);
  pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);
  status = pthread_create(thr, &attr, fctn, arg);
  pthread_attr_destroy(&attr);
#elif defined(__PARAGON__)
  status = pthread_create(thr, pthread_attr_default, fctn, arg);
#endif

  return status;
}

/* join (wait for completion of, and merge with) a thread */
int rt_thread_join(rt_thread_t thr, void **stat) {
  int status=0;

#ifdef _MSC_VER
  DWORD wstatus = WAIT_TIMEOUT;
  while (wstatus != WAIT_OBJECT_0) {
    wstatus = WaitForSingleObject(thr, INFINITE);
  }
#endif

#ifdef USEPOSIXTHREADS
  status = pthread_join(thr, stat);
#endif

#ifdef USEUITHREADS
  status = thr_join(thr, NULL, stat);
#endif

  return status;
}
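/*
 * Example (illustrative sketch, not part of threads.c): creating a worker
 * thread and waiting for its completion with the wrappers above.
 */
static void * example_task(void *arg) {
  int *input = (int *) arg;
  *input += 1;          /* trivial unit of work */
  return NULL;
}

static int example_spawn_and_join(void) {
  rt_thread_t thr;
  int value = 41;
  if (rt_thread_create(&thr, example_task, &value))
    return -1;
  rt_thread_join(thr, NULL);
  return value;         /* 42 once the worker has run */
}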
/* initialize a mutex */
int rt_mutex_init(rt_mutex_t *mp) {
  int status=0;

#ifdef _MSC_VER
  InitializeCriticalSection(mp);
#endif
#ifdef USEPOSIXTHREADS
  status = pthread_mutex_init(mp, 0);
#endif
#ifdef USEUITHREADS
  status = mutex_init(mp, USYNC_THREAD, NULL);
#endif

  return status;
}

/* lock a mutex */
int rt_mutex_lock(rt_mutex_t *mp) {
  int status=0;

#ifdef _MSC_VER
  EnterCriticalSection(mp);
#endif
#ifdef USEPOSIXTHREADS
  status = pthread_mutex_lock(mp);
#endif
#ifdef USEUITHREADS
  status = mutex_lock(mp);
#endif

  return status;
}

/* try to lock a mutex */
int rt_mutex_trylock(rt_mutex_t *mp) {
  int status=0;

#ifdef _MSC_VER
#if defined(THRUSENEWWIN32APIS)
  /* TryEnterCriticalSection() is only available on newer Win32 APIs */
  status = (!(TryEnterCriticalSection(mp)));
#endif
#endif
#ifdef USEPOSIXTHREADS
  status = (pthread_mutex_lock(mp) != 0);
#endif

  return status;
}

/* lock a mutex by spinning only */
int rt_mutex_spin_lock(rt_mutex_t *mp) {
  int status=0;

#ifdef _MSC_VER
#if defined(THRUSENEWWIN32APIS)
  while (!TryEnterCriticalSection(mp));
#else
  EnterCriticalSection(mp);
#endif
#endif
#ifdef USEPOSIXTHREADS
  while ((status = pthread_mutex_trylock(mp)) != 0);
#endif

  return status;
}

/* unlock a mutex */
int rt_mutex_unlock(rt_mutex_t *mp) {
  int status=0;

#ifdef _MSC_VER
  LeaveCriticalSection(mp);
#endif
#ifdef USEPOSIXTHREADS
  status = pthread_mutex_unlock(mp);
#endif
#ifdef USEUITHREADS
  status = mutex_unlock(mp);
#endif

  return status;
}

/* destroy a mutex */
int rt_mutex_destroy(rt_mutex_t *mp) {
  int status=0;

#ifdef _MSC_VER
  DeleteCriticalSection(mp);
#endif
#ifdef USEPOSIXTHREADS
  status = pthread_mutex_destroy(mp);
#endif
#ifdef USEUITHREADS
  status = mutex_destroy(mp);
#endif

  return status;
}
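/*
 * Example (illustrative sketch, not part of threads.c): protecting a
 * shared counter with the mutex wrappers above.  Call
 * rt_mutex_init(&counter_lock) once before first use.
 */
static rt_mutex_t counter_lock;
static int counter;

static void example_counter_increment(void) {
  rt_mutex_lock(&counter_lock);
  counter++;                      /* critical section */
  rt_mutex_unlock(&counter_lock);
}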
/* initialize a condition variable */
int rt_cond_init(rt_cond_t *cvp) {
  int status=0;

#ifdef _MSC_VER
#if defined(RTUSEWIN2008CONDVARS)
  InitializeConditionVariable(cvp);
#else
  /* emulate condition variables with a pair of Win32 events */
  cvp->waiters = 0;
  cvp->events[RT_COND_SIGNAL]    = CreateEvent(NULL,  /* no security        */
                                               FALSE, /* auto-reset event   */
                                               FALSE, /* non-signaled start */
                                               NULL); /* unnamed            */
  cvp->events[RT_COND_BROADCAST] = CreateEvent(NULL,  /* no security        */
                                               TRUE,  /* manual-reset event */
                                               FALSE, /* non-signaled start */
                                               NULL); /* unnamed            */
#endif
#endif
#ifdef USEPOSIXTHREADS
  status = pthread_cond_init(cvp, NULL);
#endif
#ifdef USEUITHREADS
  status = cond_init(cvp, USYNC_THREAD, NULL);
#endif

  return status;
}

/* destroy a condition variable */
int rt_cond_destroy(rt_cond_t *cvp) {
  int status=0;

#ifdef _MSC_VER
#if defined(RTUSEWIN2008CONDVARS)
  /* native Win2008+ condition variables require no teardown */
#else
  CloseHandle(cvp->events[RT_COND_SIGNAL]);
  CloseHandle(cvp->events[RT_COND_BROADCAST]);
#endif
#endif
#ifdef USEPOSIXTHREADS
  status = pthread_cond_destroy(cvp);
#endif
#ifdef USEUITHREADS
  status = cond_destroy(cvp);
#endif

  return status;
}

/* wait on a condition variable */
int rt_cond_wait(rt_cond_t *cvp, rt_mutex_t *mp) {
  int status=0;
#if defined(THR) && defined(_MSC_VER)
  LONG last_waiter, my_waiter;
  DWORD result;
#endif

#ifdef _MSC_VER
#if defined(RTUSEWIN2008CONDVARS)
  SleepConditionVariableCS(cvp, mp, INFINITE);
#else
  /* Win32 event-based emulation */
#if !defined(RTUSEINTERLOCKEDATOMICOPS)
  EnterCriticalSection(&cvp->waiters_lock);
  cvp->waiters++;
  LeaveCriticalSection(&cvp->waiters_lock);
#else
  InterlockedIncrement(&cvp->waiters);
#endif

  LeaveCriticalSection(mp);   /* SetEvent() maintains state with no waiters */

  /* wait for either the signal or the broadcast event to fire */
  result = WaitForMultipleObjects(2, cvp->events, FALSE, INFINITE);

#if !defined(RTUSEINTERLOCKEDATOMICOPS)
  EnterCriticalSection(&cvp->waiters_lock);
  cvp->waiters--;
  last_waiter =
    ((result == (WAIT_OBJECT_0 + RT_COND_BROADCAST)) && cvp->waiters == 0);
  LeaveCriticalSection(&cvp->waiters_lock);
#else
  my_waiter = InterlockedDecrement(&cvp->waiters);
  last_waiter =
    ((result == (WAIT_OBJECT_0 + RT_COND_BROADCAST)) && my_waiter == 0);
#endif

  /* the last waiter to wake from a broadcast resets the manual event */
  if (last_waiter)
    ResetEvent(cvp->events[RT_COND_BROADCAST]);

  EnterCriticalSection(mp);   /* reacquire the mutex before returning */
#endif
#endif
#ifdef USEPOSIXTHREADS
  status = pthread_cond_wait(cvp, mp);
#endif
#ifdef USEUITHREADS
  status = cond_wait(cvp, mp);
#endif

  return status;
}

/* signal a condition variable, waking at least one thread */
int rt_cond_signal(rt_cond_t *cvp) {
  int status=0;

#ifdef _MSC_VER
#if defined(RTUSEWIN2008CONDVARS)
  WakeConditionVariable(cvp);
#else
#if !defined(RTUSEINTERLOCKEDATOMICOPS)
  EnterCriticalSection(&cvp->waiters_lock);
  int have_waiters = (cvp->waiters > 0);
  LeaveCriticalSection(&cvp->waiters_lock);
  if (have_waiters)
    SetEvent(cvp->events[RT_COND_SIGNAL]);
#else
  if (InterlockedExchangeAdd(&cvp->waiters, 0) > 0)   /* atomic read */
    SetEvent(cvp->events[RT_COND_SIGNAL]);
#endif
#endif
#endif
#ifdef USEPOSIXTHREADS
  status = pthread_cond_signal(cvp);
#endif
#ifdef USEUITHREADS
  status = cond_signal(cvp);
#endif

  return status;
}

/* signal a condition variable, waking all threads */
int rt_cond_broadcast(rt_cond_t *cvp) {
  int status=0;

#ifdef _MSC_VER
#if defined(RTUSEWIN2008CONDVARS)
  WakeAllConditionVariable(cvp);
#else
#if !defined(RTUSEINTERLOCKEDATOMICOPS)
  EnterCriticalSection(&cvp->waiters_lock);
  int have_waiters = (cvp->waiters > 0);
  LeaveCriticalSection(&cvp->waiters_lock);
  if (have_waiters)
    SetEvent(cvp->events[RT_COND_BROADCAST]);
#else
  if (InterlockedExchangeAdd(&cvp->waiters, 0) > 0)   /* atomic read */
    SetEvent(cvp->events[RT_COND_BROADCAST]);
#endif
#endif
#endif
#ifdef USEPOSIXTHREADS
  status = pthread_cond_broadcast(cvp);
#endif
#ifdef USEUITHREADS
  status = cond_broadcast(cvp);
#endif

  return status;
}
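/*
 * Example (illustrative sketch, not part of threads.c): the canonical
 * predicate-loop usage of the condition variable wrappers.  The
 * 'queue_ready' flag and both helper functions are hypothetical names.
 * Initialize qlock and qcond with rt_mutex_init()/rt_cond_init() first.
 */
static rt_mutex_t qlock;
static rt_cond_t qcond;
static int queue_ready = 0;

static void example_consumer(void) {
  rt_mutex_lock(&qlock);
  while (!queue_ready)              /* re-test: wakeups may be spurious */
    rt_cond_wait(&qcond, &qlock);
  /* ... consume the queued work ... */
  queue_ready = 0;
  rt_mutex_unlock(&qlock);
}

static void example_producer(void) {
  rt_mutex_lock(&qlock);
  queue_ready = 1;                  /* publish work under the lock */
  rt_cond_signal(&qcond);
  rt_mutex_unlock(&qlock);
}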
/* initialize an atomic int variable */
int rt_atomic_int_init(rt_atomic_int_t *atomp, int val) {
#if defined(USEGCCATOMICS)
  /* nothing to do */
#elif defined(USENETBSDATOMICS)
  /* nothing to do */
#elif defined(USESOLARISATOMICS)
  /* nothing to do */
#elif defined(USEWIN32ATOMICS)
  /* nothing to do */
#else
  rt_mutex_init(&atomp->lock);      /* mutex-based fallback */
#endif
  atomp->val = val;
  return 0;
}

/* destroy an atomic int variable */
int rt_atomic_int_destroy(rt_atomic_int_t *atomp) {
#if defined(USEGCCATOMICS)
  /* nothing to do */
#elif defined(USENETBSDATOMICS)
  /* nothing to do */
#elif defined(USESOLARISATOMICS)
  /* nothing to do */
#elif defined(USEWIN32ATOMICS)
  /* nothing to do */
#else
  rt_mutex_destroy(&atomp->lock);   /* mutex-based fallback */
#endif
  return 0;
}

/* set an atomic int variable */
int rt_atomic_int_set(rt_atomic_int_t *atomp, int val) {
  int retval;
#if defined(USEGCCATOMICS)
  /* ... native atomic exchange ... */
  retval = val;
#elif defined(USENETBSDATOMICS)
  /* ... */
  retval = val;
#elif defined(USESOLARISATOMICS)
  /* ... */
  retval = val;
#elif defined(USEWIN32ATOMICS)
  /* ... */
  retval = val;
#else  /* use mutexes */
  rt_mutex_lock(&atomp->lock);
  atomp->val = val;
  retval = atomp->val;
  rt_mutex_unlock(&atomp->lock);
#endif
  return retval;
}

/* get an atomic int variable */
int rt_atomic_int_get(rt_atomic_int_t *atomp) {
  int retval;
#if defined(USEGCCATOMICS)
  retval = atomp->val;
#elif defined(USENETBSDATOMICS)
  retval = atomp->val;
#elif defined(USESOLARISATOMICS)
  retval = atomp->val;
#elif defined(USEWIN32ATOMICS)
  retval = atomp->val;
#else  /* use mutexes */
  rt_mutex_lock(&atomp->lock);
  retval = atomp->val;
  rt_mutex_unlock(&atomp->lock);
#endif
  return retval;
}

/* fetch an atomic int and add inc to it, returning the original value */
int rt_atomic_int_fetch_and_add(rt_atomic_int_t *atomp, int inc) {
#if defined(USEGCCATOMICS)
  return __sync_fetch_and_add(&atomp->val, inc);
#elif defined(USENETBSDATOMICS)
  /* returns the new value, so subtract inc to recover the original */
  return atomic_add_int_nv(&atomp->val, inc) - inc;
#elif defined(USESOLARISATOMICS)
  return atomic_add_int_nv(&atomp->val, inc) - inc;
#elif defined(USEWIN32ATOMICS)
  return InterlockedExchangeAdd(&atomp->val, inc);
#else  /* use mutexes */
  int retval;
  rt_mutex_lock(&atomp->lock);
  retval = atomp->val;
  atomp->val += inc;
  rt_mutex_unlock(&atomp->lock);
  return retval;
#endif
}

/* add inc to an atomic int, returning the new value */
int rt_atomic_int_add_and_fetch(rt_atomic_int_t *atomp, int inc) {
#if defined(USEGCCATOMICS)
  return __sync_add_and_fetch(&atomp->val, inc);
#elif defined(USENETBSDATOMICS)
  return atomic_add_int_nv(&atomp->val, inc);
#elif defined(USESOLARISATOMICS)
  return atomic_add_int_nv(&atomp->val, inc);
#elif defined(USEWIN32ATOMICS)
  /* Win32 returns the original value, so add inc to get the new one */
  return InterlockedExchangeAdd(&atomp->val, inc) + inc;
#else  /* use mutexes */
  int retval;
  rt_mutex_lock(&atomp->lock);
  atomp->val += inc;
  retval = atomp->val;
  rt_mutex_unlock(&atomp->lock);
  return retval;
#endif
}
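/*
 * Example (illustrative sketch, not part of threads.c): handing out
 * unique, monotonically increasing work-item indices from many threads
 * at once.  Initialize with rt_atomic_int_init(&next_item, 0) first.
 */
static rt_atomic_int_t next_item;

static int example_claim_work_item(void) {
  /* returns the pre-increment value, so every caller gets a unique index */
  return rt_atomic_int_fetch_and_add(&next_item, 1);
}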
/* initialize a reader/writer lock */
int rt_rwlock_init(rt_rwlock_t *rwp) {
  int status=0;

#ifdef _MSC_VER
  /* ... Win32 emulation built from the rt_mutex/rt_cond wrappers ... */
  rwp->rwlock = 0;
  rwp->waiting_writers = 0;
#endif
#ifdef USEPOSIXTHREADS
  pthread_mutex_init(&rwp->lock, NULL);
  pthread_cond_init(&rwp->rdrs_ok, NULL);
  pthread_cond_init(&rwp->wrtr_ok, NULL);
  rwp->rwlock = 0;
  rwp->waiting_writers = 0;
#endif
#ifdef USEUITHREADS
  status = rwlock_init(rwp, USYNC_THREAD, NULL);
#endif

  return status;
}

/* set reader lock; rwlock > 0 counts active readers, -1 marks a writer */
int rt_rwlock_readlock(rt_rwlock_t *rwp) {
  int status=0;

#ifdef _MSC_VER
  rt_mutex_lock(&rwp->lock);
  while (rwp->rwlock < 0 || rwp->waiting_writers)
    rt_cond_wait(&rwp->rdrs_ok, &rwp->lock);
  rwp->rwlock++;   /* count this reader */
  rt_mutex_unlock(&rwp->lock);
#endif
#ifdef USEPOSIXTHREADS
  pthread_mutex_lock(&rwp->lock);
  while (rwp->rwlock < 0 || rwp->waiting_writers)
    pthread_cond_wait(&rwp->rdrs_ok, &rwp->lock);
  rwp->rwlock++;   /* count this reader */
  pthread_mutex_unlock(&rwp->lock);
#endif
#ifdef USEUITHREADS
  status = rw_rdlock(rwp);
#endif

  return status;
}

/* set writer lock */
int rt_rwlock_writelock(rt_rwlock_t *rwp) {
  int status=0;

#ifdef _MSC_VER
  rt_mutex_lock(&rwp->lock);
  while (rwp->rwlock != 0) {
    rwp->waiting_writers++;
    rt_cond_wait(&rwp->wrtr_ok, &rwp->lock);
    rwp->waiting_writers--;
  }
  rwp->rwlock = -1;   /* mark exclusive writer ownership */
  rt_mutex_unlock(&rwp->lock);
#endif
#ifdef USEPOSIXTHREADS
  pthread_mutex_lock(&rwp->lock);
  while (rwp->rwlock != 0) {
    rwp->waiting_writers++;
    pthread_cond_wait(&rwp->wrtr_ok, &rwp->lock);
    rwp->waiting_writers--;
  }
  rwp->rwlock = -1;   /* mark exclusive writer ownership */
  pthread_mutex_unlock(&rwp->lock);
#endif
#ifdef USEUITHREADS
  status = rw_wrlock(rwp);
#endif

  return status;
}

/* unlock reader/writer lock */
int rt_rwlock_unlock(rt_rwlock_t *rwp) {
  int status=0;

#ifdef _MSC_VER
  int ww, wr;
  rt_mutex_lock(&rwp->lock);
  if (rwp->rwlock > 0) {
    rwp->rwlock--;    /* a reader is releasing the lock */
  } else {
    rwp->rwlock = 0;  /* the writer is releasing the lock */
  }
  ww = (rwp->waiting_writers && rwp->rwlock == 0);
  wr = (rwp->waiting_writers == 0);
  rt_mutex_unlock(&rwp->lock);
  if (ww)
    rt_cond_signal(&rwp->wrtr_ok);
  else if (wr)
    rt_cond_signal(&rwp->rdrs_ok);
#endif
#ifdef USEPOSIXTHREADS
  int ww, wr;
  pthread_mutex_lock(&rwp->lock);
  if (rwp->rwlock > 0) {
    rwp->rwlock--;    /* a reader is releasing the lock */
  } else {
    rwp->rwlock = 0;  /* the writer is releasing the lock */
  }
  ww = (rwp->waiting_writers && rwp->rwlock == 0);
  wr = (rwp->waiting_writers == 0);
  pthread_mutex_unlock(&rwp->lock);
  if (ww)
    pthread_cond_signal(&rwp->wrtr_ok);   /* wake one waiting writer */
  else if (wr)
    pthread_cond_signal(&rwp->rdrs_ok);   /* wake waiting readers */
#endif
#ifdef USEUITHREADS
  status = rw_unlock(rwp);
#endif

  return status;
}
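/*
 * Example (illustrative sketch, not part of threads.c): a shared lookup
 * table guarded by the reader/writer lock, allowing concurrent readers.
 * Initialize with rt_rwlock_init(&table_rwlock) first.
 */
static rt_rwlock_t table_rwlock;
static int table[256];

static int example_table_read(int idx) {
  int v;
  rt_rwlock_readlock(&table_rwlock);    /* many readers may hold this */
  v = table[idx];
  rt_rwlock_unlock(&table_rwlock);
  return v;
}

static void example_table_write(int idx, int v) {
  rt_rwlock_writelock(&table_rwlock);   /* writers get exclusive access */
  table[idx] = v;
  rt_rwlock_unlock(&table_rwlock);
}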
/* initialize counting barrier primitive */
rt_barrier_t * rt_thread_barrier_init(int n_clients) {
  rt_barrier_t *barrier = (rt_barrier_t *) malloc(sizeof(rt_barrier_t));

  if (barrier != NULL) {
    barrier->n_clients = n_clients;
    barrier->n_waiting = 0;
    barrier->phase = 0;
    barrier->sum = 0;
    rt_mutex_init(&barrier->lock);
    rt_cond_init(&barrier->wait_cv);
  }

  return barrier;
}

/* When rendering in the CAVE we use a special synchronization mode so  */
/* that shared memory mutexes and condition variables work correctly    */
/* when accessed from multiple processes, by applying the               */
/* PTHREAD_PROCESS_SHARED attribute to the mutex and condition variable */
int rt_thread_barrier_init_proc_shared(rt_barrier_t *barrier, int n_clients) {
#ifdef USEPOSIXTHREADS
  if (barrier != NULL) {
    barrier->n_clients = n_clients;
    barrier->n_waiting = 0;
    barrier->phase = 0;
    barrier->sum = 0;

    pthread_mutexattr_t mattr;
    pthread_condattr_t cattr;

    printf("Setting barriers to have system scope...\n");

    pthread_mutexattr_init(&mattr);
    if (pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED) != 0) {
      printf("WARNING: could not set mutex to process shared scope\n");
    }

    pthread_condattr_init(&cattr);
    if (pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED) != 0) {
      printf("WARNING: could not set condition variable to process shared scope\n");
    }

    pthread_mutex_init(&barrier->lock, &mattr);
    pthread_cond_init(&barrier->wait_cv, &cattr);

    pthread_condattr_destroy(&cattr);
    pthread_mutexattr_destroy(&mattr);
  }
#endif
  return 0;
}

/* synchronize on the counting barrier; every thread contributes        */
/* 'increment' to the shared sum, and all threads receive the summed    */
/* result for the completed barrier phase                               */
int rt_thread_barrier(rt_barrier_t *barrier, int increment) {
  int my_phase;
  int my_result;

  rt_mutex_lock(&barrier->lock);
  my_phase = barrier->phase;
  barrier->sum += increment;
  barrier->n_waiting++;

  if (barrier->n_waiting == barrier->n_clients) {
    /* the last arriving thread publishes the result and flips the phase */
    barrier->result = barrier->sum;
    barrier->sum = 0;
    barrier->n_waiting = 0;
    barrier->phase = 1 - my_phase;
    rt_cond_broadcast(&barrier->wait_cv);
  }

  /* wait for the end of this barrier phase */
  while (barrier->phase == my_phase) {
    rt_cond_wait(&barrier->wait_cv, &barrier->lock);
  }

  my_result = barrier->result;
  rt_mutex_unlock(&barrier->lock);

  return my_result;
}
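/*
 * Example (illustrative sketch, not part of threads.c): combining
 * per-thread partial results at the counting barrier.  Each worker
 * contributes its own item count, and every worker receives the total
 * for the completed phase.
 */
static void example_barrier_phase(rt_barrier_t *bar, int my_items_done) {
  int total = rt_thread_barrier(bar, my_items_done);
  printf("all workers done, %d items processed this phase\n", total);
}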
/* destroy thread pool barrier */
void rt_thread_run_barrier_destroy(rt_run_barrier_t *barrier) {
  if (barrier != NULL) {
    /* ... release the barrier's mutex and condition variable ... */
    barrier->fctn = NULL;
  }
}

/* Wait until all threads reach the barrier, and return the function */
/* pointer passed in by the master thread (NULL when shutting down). */
void * (* rt_thread_run_barrier(rt_run_barrier_t *barrier,
                                void * fctn(void*),
                                void * parms,
                                void **rsltparms))(void *) {
#ifdef THR
  int my_phase;
  void * (*my_result)(void*);

  rt_mutex_lock(&barrier->lock);
  my_phase = barrier->phase;
  if (fctn != NULL)
    barrier->fctn = fctn;
  if (parms != NULL)
    barrier->parms = parms;
  barrier->n_waiting++;

  if (barrier->n_waiting == barrier->n_clients) {
    /* the last arriving thread latches the work and flips the phase */
    barrier->rslt = barrier->fctn;
    barrier->rsltparms = barrier->parms;
    barrier->fctn = NULL;
    barrier->parms = NULL;
    barrier->n_waiting = 0;
    barrier->phase = 1 - my_phase;
    rt_cond_broadcast(&barrier->wait_cv);
  }

  /* wait for the end of this barrier phase */
  while (barrier->phase == my_phase) {
    rt_cond_wait(&barrier->wait_cv, &barrier->lock);
  }

  my_result = barrier->rslt;
  if (rsltparms != NULL)
    *rsltparms = barrier->rsltparms;

  rt_mutex_unlock(&barrier->lock);
#else
  /* non-threaded build: hand the function straight back to the caller */
  void * (*my_result)(void*) = fctn;
  if (rsltparms != NULL)
    *rsltparms = parms;
#endif

  return my_result;
}
/* within rt_tilestack_compact(): shrink the stack's buffer if possible */
  int newsize = s->top + 1;
  /* ... realloc() the tile array down to 'newsize' elements ... */

/* within rt_shared_iterator_next_tile(): clamp the final tile so it */
/* never extends past the end of the iterator's half-open interval   */
  if (tile->end > it->end) {
    tile->end = it->end;
  }

/* worker thread procedure: loop on the pool's run barrier, executing  */
/* each function the master hands out, until a NULL fctn says to exit  */
static void * rt_threadpool_workerproc(void *voidparms) {
  void *(*fctn)(void*);
  rt_threadpool_workerdata_t *workerdata = (rt_threadpool_workerdata_t *) voidparms;
  rt_threadpool_t *thrpool = (rt_threadpool_t *) workerdata->thrpool;

  while ((fctn = rt_thread_run_barrier(&thrpool->runbar, NULL, NULL, NULL)) != NULL) {
    (*fctn)(workerdata);   /* run the newly assigned function */
  }

  return NULL;
}

/* no-op function used to synchronize the pool's worker threads */
static void * rt_threadpool_workersync(void *voidparms) {
  /* ... */
  return NULL;
}
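/*
 * Example (illustrative sketch, not part of threads.c): a worker loop
 * pulling fixed-size tiles from a shared iterator until the work runs
 * out.  Assumes rt_shared_iterator_next_tile() returns RT_SCHED_DONE
 * once the interval is exhausted.
 */
static void example_iterate_tiles(rt_shared_iterator_t *it) {
  rt_tasktile_t tile;
  /* request tiles of up to 16 work items at a time */
  while (rt_shared_iterator_next_tile(it, 16, &tile) != RT_SCHED_DONE) {
    int i;
    for (i=tile.start; i<tile.end; i++) {
      /* ... process work item i; 'end' is exclusive ... */
    }
  }
}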
/* create a thread pool with a specified number of worker threads */
rt_threadpool_t * rt_threadpool_create(int workercount, int *devlist) {
  int i;
  rt_threadpool_t *thrpool = (rt_threadpool_t *) malloc(sizeof(rt_threadpool_t));
  if (thrpool == NULL)
    return NULL;

  /* record or default the per-worker CPU/GPU device IDs */
  thrpool->devlist = (int *) malloc(sizeof(int) * workercount);
  if (devlist == NULL) {
    for (i=0; i<workercount; i++)
      thrpool->devlist[i] = -1;   /* mark device IDs as unset */
  } else {
    memcpy(thrpool->devlist, devlist, sizeof(int) * workercount);
  }

  /* ... initialize the scheduler iterator, error stack, run barrier ... */

  for (i=0; i<workercount; i++) {
    /* ... populate per-worker data: IDs, device, pool back-pointer ... */
  }
  for (i=0; i<workercount; i++) {
    /* ... launch each worker thread running rt_threadpool_workerproc ... */
  }

  return thrpool;
}

/* launch threads onto a new function, with associated parms */
int rt_threadpool_launch(rt_threadpool_t *thrpool,
                         void *fctn(void *), void *parms, int blocking) {
  if (thrpool == NULL)
    return -1;

  /* ... dispatch fctn/parms to all workers via the run barrier, and */
  /* optionally wait for completion when 'blocking' is set ...       */
}

/* worker thread can call this to get its ID and number of peers */
int rt_threadpool_worker_getid(void *voiddata, int *threadid, int *threadcount) {
  rt_threadpool_workerdata_t *worker = (rt_threadpool_workerdata_t *) voiddata;
  if (threadid != NULL)
    *threadid = worker->threadid;
  if (threadcount != NULL)
    *threadcount = worker->threadcount;
  return 0;
}

/* worker thread can call this to get its CPU/GPU device ID */
int rt_threadpool_worker_getdevid(void *voiddata, int *devid) {
  rt_threadpool_workerdata_t *worker = (rt_threadpool_workerdata_t *) voiddata;
  if (devid != NULL)
    *devid = worker->devid;
  return 0;
}

/* worker thread calls this to scale its maximum tile size by the */
/* relative speed of the device it is bound to                    */
int rt_threadpool_worker_devscaletile(void *voiddata, int *tilesize) {
  rt_threadpool_workerdata_t *worker = (rt_threadpool_workerdata_t *) voiddata;
  if (tilesize != NULL) {
    int scaledtilesize;
    scaledtilesize = (int) (worker->devspeed * ((float) (*tilesize)));
    if (scaledtilesize < 1)
      scaledtilesize = 1;   /* never scale below one work item */
    *tilesize = scaledtilesize;
  }
  return 0;
}

/* worker thread can call this to get its client data pointer */
int rt_threadpool_worker_getdata(void *voiddata, void **clientdata) {
  rt_threadpool_workerdata_t *worker = (rt_threadpool_workerdata_t *) voiddata;
  if (clientdata != NULL)
    *clientdata = worker->parms;
  return 0;
}

/* wait for all worker threads to complete their work */
int rt_threadpool_wait(rt_threadpool_t *thrpool) {
  if (thrpool == NULL)
    return -1;
  /* ... */
}

/* launch up to numprocs threads using a shared iterator as load balancer */
int rt_threadlaunch(int numprocs, void *clientdata, void *fctn(void *),
                    rt_tasktile_t *tile) {
  rt_shared_iterator_t iter;
  rt_threadlaunch_t *parms=NULL;
  rt_thread_t *threads=NULL;
  int i;

  /* ... initialize the shared iterator over the interval in 'tile' ... */

  threads = (rt_thread_t *) calloc(numprocs, sizeof(rt_thread_t));
  if (threads == NULL)
    return -1;

  parms = (rt_threadlaunch_t *) malloc(numprocs * sizeof(rt_threadlaunch_t));
  if (parms == NULL) {
    free(threads);
    return -1;
  }

  for (i=0; i<numprocs; i++) {
    parms[i].iter = &iter;
    parms[i].threadid = i;
    parms[i].threadcount = numprocs;
    parms[i].clientdata = clientdata;
  }

#if defined(THR)
  if (numprocs == 1) {
    /* single-thread short-circuit: call the function directly */
    fctn((void *) &parms[0]);
  } else {
    for (i=0; i<numprocs; i++) {
      /* ... rt_thread_create(&threads[i], fctn, &parms[i]) ... */
    }
    for (i=0; i<numprocs; i++) {
      /* ... rt_thread_join(threads[i], NULL) ... */
    }
  }
#else
  fctn((void *) &parms[0]);   /* non-threaded build runs serially */
#endif

  /* ... */
}

/* worker thread can call this to get its ID and number of peers */
int rt_threadlaunch_getid(void *voidparms, int *threadid, int *threadcount) {
  rt_threadlaunch_t *parms = (rt_threadlaunch_t *) voidparms;
  if (threadid != NULL)
    *threadid = parms->threadid;
  if (threadcount != NULL)
    *threadcount = parms->threadcount;
  return 0;
}

/* worker thread can call this to get its client data pointer */
int rt_threadlaunch_getdata(void *voidparms, void **clientdata) {
  rt_threadlaunch_t *parms = (rt_threadlaunch_t *) voidparms;
  if (clientdata != NULL)
    *clientdata = parms->clientdata;
  return 0;
}
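/*
 * Example (illustrative sketch, not part of threads.c): a complete
 * rt_threadlaunch() client.  Each worker pulls tiles from the shared
 * iterator through rt_threadlaunch_next_tile() until the work runs out.
 * The RT_SCHED_DONE return convention is an assumption of this sketch.
 */
typedef struct {
  float *data;
  int n;
} example_parms_t;

static void * example_worker(void *voidparms) {
  rt_tasktile_t tile;
  example_parms_t *p=NULL;
  rt_threadlaunch_getdata(voidparms, (void **) &p);

  while (rt_threadlaunch_next_tile(voidparms, 64, &tile) != RT_SCHED_DONE) {
    int i;
    for (i=tile.start; i<tile.end; i++)
      p->data[i] *= 2.0f;   /* process one work item */
  }
  return NULL;
}

static int example_run(float *data, int n, int ncpus) {
  example_parms_t parms;
  rt_tasktile_t tile;
  parms.data = data;
  parms.n = n;
  tile.start = 0;   /* inclusive */
  tile.end = n;     /* exclusive */
  return rt_threadlaunch(ncpus, &parms, example_worker, &tile);
}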
/*
 * threads.h API synopsis
 *
 * Tachyon cross-platform thread creation and management, atomic
 * operations, and CPU feature query APIs ...
 *
 * Routines to generate a pool of threads which then grind through a
 * dynamically load balanced work queue ...
 *
 * If compiling on Linux, enable the GNU CPU affinity functions in both
 * libc and the libpthreads ...
 */

/* CPU feature flag macros */
#define CPU_UNKNOWN            /* Unknown CPU type              */
#define CPU_SMTDEPTH_UNKNOWN   /* Unknown SMT depth             */
#define CPU_HT                 /* x86 Hyperthreading detected   */
#define CPU_HYPERVISOR         /* VM/Hypervisor environment     */
#define CPU_SSE2               /* SSE2 SIMD avail.              */
#define CPU_SSE4_1             /* SSE4.1 SIMD avail.            */
#define CPU_F16C               /* F16C insns avail.             */
#define CPU_FMA                /* FMA insns avail.              */
#define CPU_AVX                /* AVX SIMD avail.               */
#define CPU_AVX2               /* AVX2 SIMD avail.              */
#define CPU_AVX512F            /* AVX-512F SIMD avail.          */
#define CPU_AVX512CD           /* AVX-512CD SIMD avail.         */
#define CPU_AVX512ER           /* AVX-512ER SIMD avail.         */
#define CPU_AVX512PF           /* AVX-512PF SIMD avail.         */
#define CPU_ARM64_FP           /* FP insns avail.               */
#define CPU_ARM64_ASIMD        /* Advanced SIMD avail.          */
#define CPU_ARM64_ASIMDHP      /* Advanced SIMD HP avail.       */
#define CPU_ARM64_ASIMDRDM     /* Advanced SIMD RDM avail.      */
#define CPU_ARM64_ASIMDDP      /* Advanced SIMD DP avail.       */
#define CPU_ARM64_ASIMDFHM     /* Advanced SIMD FHM avail.      */
#define CPU_ARM64_SVE          /* Scalable Vector Extns avail.  */
#define CPU_ARM64_AES          /* AES insns avail.              */
#define CPU_ARM64_CRC32        /* CRC32 insns avail.            */
#define CPU_ARM64_SHA1         /* SHA-1 insns avail.            */
#define CPU_ARM64_SHA2         /* SHA-2 insns avail.            */
#define CPU_ARM64_SHA3         /* SHA-3 insns avail.            */
#define CPU_ARM64_SHA512       /* SHA-512 insns avail.          */

/* work scheduling status macros */
#define RT_SCHED_CONTINUE      /* some work remains in the queue */
#define RT_SCHED_DONE          /* no work remains in the queue   */
#define RT_TILESTACK_EMPTY

/*
 * Task tile struct for stack, iterator, and scheduler routines;
 * 'start' is inclusive, 'end' is exclusive.
 *   rt_tasktile_t:
 *     int start;              starting task ID (inclusive)
 *     int end;                ending task ID (exclusive)
 *
 * Shared iterators intended for trivial CPU/GPU load balancing with no
 * exception handling capability ...
 *   rt_shared_iterator_t:
 *     rt_mutex_t mtx;         mutex lock for the structure
 *     int start;              starting value (inclusive)
 *     int end;                ending value (exclusive)
 *     int fatalerror;         cancel processing immediately for all threads
 *
 * Stack of work tiles, used for error handling.
 *   rt_tilestack_t:
 *     rt_mutex_t mtx;         mutex lock for the structure
 *     int size;               current allocated stack size
 *     int top;                index of top stack element
 *     int growthrate;         stack growth chunk size
 *     rt_tasktile_t *s;       stack of task tiles
 *
 * Atomic int structure with padding to prevent false sharing.
 *   rt_atomic_int_t:
 *     rt_mutex_t lock;        mutex lock for the structure
 *     int val;                integer value to be atomically manipulated
 *
 * Barrier sync object with padding to prevent false sharing.
 *   rt_barrier_t:
 *     int n_clients;          number of threads to wait for at barrier
 *     int n_waiting;          number of currently waiting threads
 *     int phase;              flag to separate waiters from fast workers
 *     int sum;                sum of arguments passed to barrier_wait
 *     int result;             answer to be returned by barrier_wait
 *     rt_mutex_t lock;        mutex lock for the structure
 *     rt_cond_t wait_cv;      clients wait on condition variable to proceed
 *
 * Run-barrier sync object with padding to prevent false sharing.
 *   rt_run_barrier_t:
 *     int n_clients;          number of threads to wait for at barrier
 *     int n_waiting;          number of currently waiting threads
 *     int phase;              flag to separate waiters from fast workers
 *     void *(*fctn)(void *);  fctn ptr to call, or NULL if done
 *     void *parms;            parms for fctn pointer
 *     void *(*rslt)(void *);  fctn ptr to return to barrier wait callers
 *     void *rsltparms;        parms to return to barrier wait callers
 *     rt_mutex_t lock;        mutex lock for the structure
 *     rt_cond_t wait_cv;      clients wait on condition variable to proceed
 *
 * Thread-specific handle data for workers.
 *   rt_threadpool_workerdata_t:
 *     int threadid;                ID of worker thread
 *     int threadcount;             total number of worker threads
 *     int devid;                   worker CPU/GPU device ID
 *     float devspeed;              speed scaling for this device
 *     void *parms;                 fctn parms for this worker
 *     rt_shared_iterator_t *iter;  dynamic work scheduler
 *     rt_tilestack_t *errorstack;  stack of tiles that failed
 *     void *thrpool;               void ptr to thread pool struct
 *
 *   rt_threadpool_t:
 *     int workercount;             number of worker threads
 *     int *devlist;                per-worker CPU/GPU device IDs
 *     rt_shared_iterator_t iter;   iterator used for dynamic load balancing
 *     rt_tilestack_t errorstack;   stack of tiles that failed
 *     rt_thread_t *threads;        worker threads
 *     rt_threadpool_workerdata_t *workerdata;  per-worker data
 *     rt_run_barrier_t runbar;     master/worker run barrier
 *
 *   rt_threadlaunch_t:
 *     rt_shared_iterator_t *iter;  dynamic scheduler iterator
 *     int threadid;                worker thread's ID
 *     int threadcount;             number of workers
 *     void *clientdata;            worker parameters
 */

/* CPU feature and topology queries */
int rt_thread_numprocessors(void);   /* processors available, subject to user override */
int rt_thread_numphysprocessors(void);
int rt_cpu_smt_depth(void);          /* CPU logical processors (SMT depth, aka hyperthreading) */
int rt_cpu_capability_flags(rt_cpu_caps_t *cpucaps);  /* CPU optional instruction set capability flags */
int * rt_cpu_affinitylist(int *cpuaffinitycount);
    /* query CPU affinity of the calling process (if allowed by host system) */
int rt_thread_set_self_cpuaffinity(int cpu);
    /* set the CPU affinity of the current thread (if allowed by host system) */

/* thread management */
typedef void * (*RTTHREAD_START_ROUTINE)(void *);
    /* typedef to eliminate compiler warning caused by C/C++ linkage conflict */
int rt_thread_setconcurrency(int nthr);  /* set concurrency level and scheduling scope for threads */
int rt_thread_create(rt_thread_t *thr, void *fctn(void *), void *arg);  /* create a new child thread */
int rt_thread_join(rt_thread_t thr, void **stat);  /* join (wait for completion of, and merge with) a thread */

/* mutexes */
int rt_mutex_init(rt_mutex_t *mp);       /* initialize a mutex */
int rt_mutex_lock(rt_mutex_t *mp);       /* lock a mutex */
int rt_mutex_trylock(rt_mutex_t *mp);    /* try to lock a mutex */
int rt_mutex_spin_lock(rt_mutex_t *mp);  /* lock a mutex by spinning only */
int rt_mutex_unlock(rt_mutex_t *mp);     /* unlock a mutex */
int rt_mutex_destroy(rt_mutex_t *mp);    /* destroy a mutex */

/* condition variables */
int rt_cond_init(rt_cond_t *cvp);        /* initialize a condition variable */
int rt_cond_destroy(rt_cond_t *cvp);     /* destroy a condition variable */
int rt_cond_wait(rt_cond_t *cvp, rt_mutex_t *mp);  /* wait on a condition variable */
int rt_cond_signal(rt_cond_t *cvp);      /* signal a condition variable, waking at least one thread */
int rt_cond_broadcast(rt_cond_t *cvp);   /* signal a condition variable, waking all threads */

/* atomic integer operations */
int rt_atomic_int_init(rt_atomic_int_t *atomp, int val);  /* initialize an atomic int variable */
int rt_atomic_int_destroy(rt_atomic_int_t *atomp);        /* destroy an atomic int variable */
int rt_atomic_int_get(rt_atomic_int_t *atomp);            /* get an atomic int variable */
int rt_atomic_int_set(rt_atomic_int_t *atomp, int val);   /* set an atomic int variable */
int rt_atomic_int_fetch_and_add(rt_atomic_int_t *atomp, int inc);
    /* fetch an atomic int and add inc to it, returning the original value */
int rt_atomic_int_add_and_fetch(rt_atomic_int_t *atomp, int inc);
    /* fetch an atomic int and add inc to it, returning the new value */

/* reader/writer locks */
int rt_rwlock_init(rt_rwlock_t *rwp);       /* initialize a reader/writer lock */
int rt_rwlock_readlock(rt_rwlock_t *rwp);   /* set reader lock */
int rt_rwlock_writelock(rt_rwlock_t *rwp);  /* set writer lock */
int rt_rwlock_unlock(rt_rwlock_t *rwp);     /* unlock reader/writer lock */

/* counting barriers */
rt_barrier_t * rt_thread_barrier_init(int n_clients);  /* initialize counting barrier primitive */
int rt_thread_barrier_init_proc_shared(rt_barrier_t *barrier, int n_clients);
    /* When rendering in the CAVE we use a special synchronization mode so */
    /* that shared memory mutexes and condition variables ...              */
int rt_thread_barrier(rt_barrier_t *barrier, int increment);
    /* synchronize on counting barrier primitive */
void rt_thread_barrier_destroy(rt_barrier_t *barrier);  /* destroy counting barrier primitive */

/* run barriers (thread pool work dispatch) */
int rt_thread_run_barrier_init(rt_run_barrier_t *barrier, int n_clients);  /* initialize thread pool barrier */
void rt_thread_run_barrier_destroy(rt_run_barrier_t *barrier);             /* destroy thread pool barrier */
int rt_thread_run_barrier_poll(rt_run_barrier_t *barrier);
    /* non-blocking poll to see if peers are already at the barrier */
void * (* rt_thread_run_barrier(rt_run_barrier_t *barrier,
                                void *fctn(void *),
                                void *parms,
                                void **rsltparms))(void *);
    /* wait until all threads reach the barrier, and return the function */
    /* pointer passed in by the master thread ...                        */

/* task tile stacks */
int rt_tilestack_init(rt_tilestack_t *s, int size);  /* initialize task tile stack (to empty) */
void rt_tilestack_destroy(rt_tilestack_t *s);        /* destroy task tile stack */
int rt_tilestack_compact(rt_tilestack_t *s);
    /* shrink memory buffers associated with task tile stack if possible */
int rt_tilestack_push(rt_tilestack_t *s, const rt_tasktile_t *t);  /* push a task tile onto the stack */
int rt_tilestack_pop(rt_tilestack_t *s, rt_tasktile_t *t);         /* pop a task tile off of the stack */
int rt_tilestack_popall(rt_tilestack_t *s);  /* pop all of the task tiles off of the stack */
int rt_tilestack_empty(rt_tilestack_t *s);   /* query if the task tile stack is empty or not */

/* shared iterators */
int rt_shared_iterator_init(rt_shared_iterator_t *it);     /* initialize a shared iterator */
int rt_shared_iterator_destroy(rt_shared_iterator_t *it);  /* destroy a shared iterator */
int rt_shared_iterator_set(rt_shared_iterator_t *it, rt_tasktile_t *tile);  /* set shared iterator parameters */
int rt_shared_iterator_next_tile(rt_shared_iterator_t *it, int reqsize, rt_tasktile_t *tile);
    /* iterate the shared iterator over a requested half-open interval */
int rt_shared_iterator_setfatalerror(rt_shared_iterator_t *it);
    /* worker thread calls this to indicate a fatal error */
int rt_shared_iterator_getfatalerror(rt_shared_iterator_t *it);
    /* master thread calls this to query for fatal errors */

/* thread pools */
rt_threadpool_t * rt_threadpool_create(int workercount, int *devlist);
    /* create a thread pool with a specified number of worker threads */
int rt_threadpool_launch(rt_threadpool_t *thrpool, void *fctn(void *), void *parms, int blocking);
    /* launch threads onto a new function, with associated parms */
int rt_threadpool_wait(rt_threadpool_t *thrpool);  /* wait for all worker threads to complete their work */
int rt_threadpool_poll(rt_threadpool_t *thrpool);
int rt_threadpool_destroy(rt_threadpool_t *thrpool);  /* join all worker threads and free resources */
int rt_threadpool_get_workercount(rt_threadpool_t *thrpool);
    /* return the number of worker threads currently in the pool */
int rt_threadpool_sched_dynamic(rt_threadpool_t *thrpool, rt_tasktile_t *tile);
    /* set shared iterator state to the half-open interval defined by tile */
int rt_threadpool_next_tile(void *voidparms, int reqsize, rt_tasktile_t *tile);
    /* iterate the shared iterator over the requested half-open interval */
int rt_threadpool_tile_failed(void *voidparms, rt_tasktile_t *tile);
    /* worker thread calls this when a failure occurs on a tile it has */
    /* already taken from the scheduler ...                            */
int rt_threadpool_setfatalerror(void *voidparms);
    /* worker thread calls this to indicate that an unrecoverable error occurred */
int rt_threadpool_getfatalerror(void *voidparms);
    /* master thread calls this to query for fatal errors */
int rt_threadpool_worker_getid(void *voiddata, int *threadid, int *threadcount);
    /* worker thread can call this to get its ID and number of peers */
int rt_threadpool_worker_getdevid(void *voiddata, int *devid);
    /* worker thread can call this to get its CPU/GPU device ID */
int rt_threadpool_worker_getdata(void *voiddata, void **clientdata);
    /* worker thread can call this to get its client data pointer */
int rt_threadpool_worker_setdevspeed(void *voiddata, float speed);
    /* worker thread calls this to set the relative speed of this device */
    /* as determined by the SM/core count and ...                        */
int rt_threadpool_worker_getdevspeed(void *voiddata, float *speed);
    /* worker thread calls this to get the relative speed of this device */
    /* as determined by the SM/core count and ...                        */
int rt_threadpool_worker_devscaletile(void *voiddata, int *tilesize);
    /* worker thread calls this to scale the max tile size by worker */
    /* speed as determined by the SM/core count and ...              */

/* threadlaunch routines */
int rt_threadlaunch(int numprocs, void *clientdata, void *fctn(void *), rt_tasktile_t *tile);
    /* launch up to numprocs threads using a shared iterator as a load balancer */
int rt_threadlaunch_getid(void *voidparms, int *threadid, int *threadcount);
    /* worker thread can call this to get its ID and number of peers */
int rt_threadlaunch_getdata(void *voidparms, void **clientdata);
    /* worker thread can call this to get its client data pointer */
int rt_threadlaunch_next_tile(void *voidparms, int reqsize, rt_tasktile_t *tile);
    /* iterate the shared iterator over the requested half-open interval */
int rt_threadlaunch_setfatalerror(void *voidparms);
    /* worker thread calls this to indicate that an unrecoverable error occurred */
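/*
 * Example (illustrative sketch, not part of the Tachyon sources): the
 * typical thread pool lifecycle implied by the synopsis above.  The
 * zero-on-success return conventions and the RT_SCHED_DONE loop
 * termination are assumptions made for this sketch.
 */
static void * pool_task(void *voidparms) {
  rt_tasktile_t tile;
  while (rt_threadpool_next_tile(voidparms, 32, &tile) != RT_SCHED_DONE) {
    int i;
    for (i=tile.start; i<tile.end; i++) {
      /* ... render/process work item i ... */
    }
  }
  return NULL;
}

static int pool_example(int nworkers, rt_tasktile_t *wholejob) {
  rt_threadpool_t *pool = rt_threadpool_create(nworkers, NULL);
  if (pool == NULL)
    return -1;

  rt_threadpool_sched_dynamic(pool, wholejob);   /* set the work interval */
  rt_threadpool_launch(pool, pool_task, NULL, 1 /* blocking */);
  rt_threadpool_wait(pool);                      /* wait for completion   */
  return rt_threadpool_destroy(pool);
}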