/* Non-contiguous excerpts from the Tachyon threads code follow; elided
   regions of the original listing are marked with "..." comments. */
#if (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) || (defined(_MSC_VER) && (_MSC_VER >= 1916))
#include <emmintrin.h>
#include <immintrin.h>
/* ... */
#define RTUSENEWWIN32APIS 1
#define _WIN32_WINNT 0x0400
/* ... */
#if defined(_AIX) || defined(_CRAY) || defined(__irix) || defined(__linux) || defined(__osf__) || defined(__sun)
/* ... */
#if defined(__APPLE__) && defined(THR)
#include <sys/types.h>
#include <sys/sysctl.h>      /* CPU count via sysctl() */
#include <Carbon/Carbon.h>
/* ... */
#if defined(__linux) && (defined(ARCH_LINUXARM64) || defined(__ARM_ARCH_ISA_A64) || defined(__ARM_NEON))
#include <sys/auxv.h>        /* CPU capabilities via getauxval() */
/* ... */
#include <sys/mpctl.h>       /* HP-UX processor queries */

/* rt_thread_numprocessors(): per-platform CPU count queries */
#if defined(__APPLE__)
  size_t alen = sizeof(a);
  mib[1] = HW_AVAILCPU;
  rc = sysctl(mib, miblen, &a, &alen, NULL, 0);
  /* ... */
    perror("Error during sysctl() query for CPU count");
  /* ... */
  a = MPProcessorsScheduled();
/* ... Win32 ... */
  struct _SYSTEM_INFO sysinfo;
  GetSystemInfo(&sysinfo);
  a = sysinfo.dwNumberOfProcessors;
/* ... */
#if defined(__PARAGON__)
  a = sysconf(_SC_CRAY_NCPU);
/* ... */
#if defined(ANDROID) || defined(USEPHYSCPUCOUNT)
  a = sysconf(_SC_NPROCESSORS_CONF);
  /* ... */
    int rc=0, b=1, i=-1, j=-1;
    ifp = fopen("/sys/devices/system/cpu/present", "r");
    /* ... */
      rc = fscanf(ifp, "%d-%d", &i, &j);
      /* ... */
      if (rc == 2 && i == 0) {
/* ... */
#if defined(__sun) || defined(__linux) || defined(__osf__) || defined(_AIX)
  a = sysconf(_SC_NPROCESSORS_ONLN);
/* ... */
  a = sysconf(_SC_NPROC_ONLN);
/* ... */
  a = mpctl(MPC_GETNUMSPUS, 0, 0);
/* ... */
  /* optional user override of the detected CPU count */
  char *forcecount = getenv("RTFORCECPUCOUNT");
  if (forcecount != NULL) {
    if (sscanf(forcecount, "%d", &a) == 1) {
/* x86 CPUID and XGETBV helpers used for CPU feature detection */
#define RT_USEINTCPUID 1
#if defined(RT_USEINTCPUID) && (defined(__GNUC__) || defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER >= 1916))) && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
static void rt_cpuid(unsigned int eax, unsigned int ecx, unsigned int *abcd) {
#if defined(_MSC_VER)
  __cpuidex((int *) abcd, eax, ecx);
/* ... */
  unsigned int ebx=0, edx=0;
#if defined(__i386__) && defined(__PIC__)
  /* PIC builds on i386 must preserve %ebx across the cpuid instruction */
  __asm__("movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
/* ... */
  __asm__("cpuid" : "+b" (ebx),
          /* ... */
          "+a" (eax), "+c" (ecx), "=d" (edx));
  abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
/* ... */
static void rt_cpuid(unsigned int eax, unsigned int ecx, unsigned int *info) {
  __asm__ __volatile__(
    /* ... */
    : "=a" (info[0]), "=D" (info[1]), "=c" (info[2]), "=d" (info[3])
/* ... */
static unsigned long long rt_xgetbv(unsigned int index) {
#if defined(_MSC_VER)
  return _xgetbv(index);
/* ... */
  unsigned int eax=0, edx=0;
  __asm__ __volatile__(
    /* ... */
    : "=a" (eax), "=d" (edx)
/* ... */
  return ((unsigned long long) edx << 32) | eax;
/* rt_cpu_capability_flags(): x86 CPUID feature detection path */
#if defined(RT_USEINTCPUID) && (defined(__GNUC__) || defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER >= 1916))) && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_AMD64))
#define RT_INTERNAL_ENABLE_CPUCAP_BAILOUT 1
/* ... */
  unsigned int vendcpuinfo[4] = { 0 };
  unsigned int cpuinfo[4] = { 0 };
  unsigned long long xcrFeatureMask = 0;
/* ... */
  rt_cpuid(0, 0, vendcpuinfo);    /* leaf 0: vendor string and max leaf */
  if (vendcpuinfo[0] == 0)
/* ... */
  rt_cpuid(1, 0, cpuinfo);        /* leaf 1: basic feature flags */
  haveosxsave = (cpuinfo[2] & (1 << 27)) != 0;
/* ... */
  flags |= ((cpuinfo[2] & (1 << 19)) != 0)           * CPU_SSE4_1;
  flags |= ((cpuinfo[2] & (1 << 29)) != 0)           * CPU_F16C;
/* ... */
  flags |= ((cpuinfo[3] & (1 << 26)) != 0)           * CPU_SSE2;
  flags |= ((cpuinfo[3] & (1 << 28)) != 0)           * CPU_HT;
/* ... */
  if ((cpuinfo[2] & (1 << 28)) != 0) {
    xcrFeatureMask = rt_xgetbv(0);
    havexmmymm  = (xcrFeatureMask & 0x06) == 0x06;
    havezmmmask = (xcrFeatureMask & 0xE6) == 0xE6;
/* ... */
  flags |= (((cpuinfo[2] & (1 << 12)) != 0) &&
            havexmmymm && haveosxsave)               * CPU_FMA;
/* ... */
  flags |= (((cpuinfo[2] & (1 << 28)) != 0) &&
            havexmmymm && haveosxsave)               * CPU_AVX;
/* ... */
  if (cpuinfo[0] >= 0x7) {
    unsigned int extcpuinfo[4] = { 0 };
    rt_cpuid(7, 0, extcpuinfo);   /* leaf 7: extended feature flags */
/* ... */
    flags |= (((extcpuinfo[1] & (1 << 5)) != 0) &&
              havexmmymm && haveosxsave)               * CPU_AVX2;
/* ... */
    flags |= (((extcpuinfo[1] & (1 << 16)) != 0) && /* ... */
    flags |= (((extcpuinfo[1] & (1 << 26)) != 0) && /* ... */
    flags |= (((extcpuinfo[1] & (1 << 27)) != 0) && /* ... */
    flags |= (((extcpuinfo[1] & (1 << 28)) != 0) && /* ... */
/* ... */
    /* derive SMT depth from the logical vs. physical core counts */
    int logicalcores = (cpuinfo[1] >> 16) & 0xFF;
    int physicalcores = logicalcores;
    char vendor[16] = { 0 };
    ((unsigned *) vendor)[0] = vendcpuinfo[1];
    ((unsigned *) vendor)[1] = vendcpuinfo[3];
    ((unsigned *) vendor)[2] = vendcpuinfo[2];
/* ... */
    if (!strcmp(vendor, "GenuineIntel")) {
      unsigned int corecpuinfo[4] = { 0 };
      rt_cpuid(4, 0, corecpuinfo);
      physicalcores = ((corecpuinfo[0] >> 26) & 0x3f) + 1;
    } else if (!strcmp(vendor, "AuthenticAMD")) {
      unsigned int corecpuinfo[4] = { 0 };
      rt_cpuid(0x80000008, 0, corecpuinfo);
      physicalcores = (corecpuinfo[2] & 0xFF) + 1;
/* ... */
    printf("cpuinfo: %d / %d  vend: %s\n", logicalcores, physicalcores, vendor);
/* ... */
    smtdepth = logicalcores / physicalcores;
#elif defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)
/* ... */
  flags |= _may_i_use_cpu_feature(_FEATURE_SSE2)     * CPU_SSE2;
  flags |= _may_i_use_cpu_feature(_FEATURE_SSE4_1)   * CPU_SSE4_1;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX)      * CPU_AVX;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX2)     * CPU_AVX2;
  flags |= _may_i_use_cpu_feature(_FEATURE_FMA)      * CPU_FMA;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX512F)  * CPU_AVX512F;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX512CD) * CPU_AVX512CD;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX512ER) * CPU_AVX512ER;
  flags |= _may_i_use_cpu_feature(_FEATURE_AVX512PF) * CPU_AVX512PF;
/* ... */
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
/* ... */
  __builtin_cpu_init();
  flags |= (__builtin_cpu_supports("sse2")!=0)       * CPU_SSE2;
  flags |= (__builtin_cpu_supports("sse4.1")!=0)     * CPU_SSE4_1;
  flags |= (__builtin_cpu_supports("avx")!=0)        * CPU_AVX;
  flags |= (__builtin_cpu_supports("avx2")!=0)       * CPU_AVX2;
  flags |= (__builtin_cpu_supports("fma")!=0)        * CPU_FMA;
  flags |= (__builtin_cpu_supports("avx512f")!=0)    * CPU_AVX512F;
  flags |= (__builtin_cpu_supports("avx512cd")!=0)   * CPU_AVX512CD;
  flags |= (__builtin_cpu_supports("avx512er")!=0)   * CPU_AVX512ER;
  flags |= (__builtin_cpu_supports("avx512pf")!=0)   * CPU_AVX512PF;
/* ... */
#elif defined(__linux) && (defined(ARCH_LINUXARM64) || defined(__ARM_ARCH_ISA_A64) || defined(__ARM_NEON))
/* ... */
  unsigned long auxval1=0;
/* ... */
  auxval1 = getauxval(AT_HWCAP);
/* ... */
#if defined(RT_INTERNAL_ENABLE_CPUCAP_BAILOUT)
/* ... */
  cpucaps->flags = flags;
#if defined(RT_USEINTCPUID) && (defined(__GNUC__) || defined(__INTEL_COMPILER)) && (defined(__i386__) || defined(__x86_64__))
/* ... */

/* rt_cpu_affinitylist(): query the CPU affinity of the calling process */
  int *affinitylist = NULL;
  *cpuaffinitycount = -1;
/* ... */
#if 0 && defined(_MSC_VER)
  /* Win32 affinity query (currently disabled) */
  HANDLE myproc = GetCurrentProcess();
  DWORD affinitymask, sysaffinitymask;
/* ... */
  if (!GetProcessAffinityMask(myproc, &affinitymask, &sysaffinitymask)) {
/* ... */
    for (i=0; i<31; i++) {
      affinitycount += (affinitymask >> i) & 0x1;
/* ... */
    if (affinitycount > 0) {
      affinitylist = (int *) malloc(affinitycount * sizeof(int));
      if (affinitylist == NULL)
/* ... */
      for (i=0; i<CPU_SETSIZE; i++) {
        if (CPU_ISSET(i, &affinitymask)) {
          affinitylist[curcount] = i;
/* ... */
    *cpuaffinitycount = affinitycount;
/* ... */
#if defined(CPU_SETSIZE)
  /* Linux/glibc affinity query via sched_getaffinity() */
  cpu_set_t affinitymask;
/* ... */
  if (sched_getaffinity(0, sizeof(affinitymask), &affinitymask) < 0) {
    perror("rt_cpu_affinitylist: sched_getaffinity");
/* ... */
  for (i=0; i<CPU_SETSIZE; i++) {
    affinitycount += CPU_ISSET(i, &affinitymask);
/* ... */
  if (affinitycount > 0) {
    affinitylist = (int *) malloc(affinitycount * sizeof(int));
    if (affinitylist == NULL)
/* ... */
    for (i=0; i<CPU_SETSIZE; i++) {
      if (CPU_ISSET(i, &affinitymask)) {
        affinitylist[curcount] = i;
/* ... */
  *cpuaffinitycount = affinitycount;
/* rt_thread_set_self_cpuaffinity(): bind the calling thread to one CPU */
#if defined(__linux) && defined(CPU_ZERO) && defined(CPU_SET)
/* ... */
  cpu_set_t affinitymask;
  CPU_ZERO(&affinitymask);
  CPU_SET(cpu, &affinitymask);
  status = pthread_setaffinity_np(pthread_self(), sizeof(affinitymask), &affinitymask);
/* ... */
  cpu_set_t affinitymask;
  CPU_ZERO(&affinitymask);
  CPU_SET(cpu, &affinitymask);
/* ... */
  if ((status=sched_setaffinity(0, sizeof(affinitymask), &affinitymask)) < 0) {
    perror("rt_thread_set_self_cpuaffinitylist: sched_setaffinity");
/* rt_thread_setconcurrency() */
#ifdef USEPOSIXTHREADS
  status = pthread_setconcurrency(nthr);
/* ... */
  status = thr_setconcurrency(nthr);
/* ... */
#if defined(__irix) || defined(_AIX)
  status = pthread_setconcurrency(nthr);
/* ... */

/* Typedef to eliminate compiler warning caused by C/C++ linkage conflict */
typedef void * (*RTTHREAD_START_ROUTINE)(void *);

/* rt_thread_create() */
  *thr = CreateThread(NULL, 8192, (LPTHREAD_START_ROUTINE) fctn, arg, 0, &tid);
/* ... */
#ifdef USEPOSIXTHREADS
/* ... */
    pthread_attr_init(&attr);
    pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);
/* ... */
    pthread_attr_destroy(&attr);
/* ... */
#elif defined(__PARAGON__)
  status = pthread_create(thr, pthread_attr_default, fctn, arg);
/* ... */

/* rt_thread_join() */
  wstatus = WAIT_TIMEOUT;
/* ... */
  while (wstatus != WAIT_OBJECT_0) {
    wstatus = WaitForSingleObject(thr, INFINITE);
/* ... */
#ifdef USEPOSIXTHREADS
  status = pthread_join(thr, stat);
/* ... */
  status = thr_join(thr, NULL, stat);
/* rt_mutex_init() */
  InitializeCriticalSection(mp);
#ifdef USEPOSIXTHREADS
  status = pthread_mutex_init(mp, 0);
/* ... */
  status = mutex_init(mp, USYNC_THREAD, NULL);

/* rt_mutex_lock() */
  EnterCriticalSection(mp);
#ifdef USEPOSIXTHREADS
  status = pthread_mutex_lock(mp);
/* ... */
  status = mutex_lock(mp);

/* rt_mutex_trylock() */
#if defined(THRUSENEWWIN32APIS)
  status = (!(TryEnterCriticalSection(mp)));
/* ... */
#ifdef USEPOSIXTHREADS
  status = (pthread_mutex_lock(mp) != 0);

/* rt_mutex_spin_lock() */
#if defined(THRUSENEWWIN32APIS)
  while (!TryEnterCriticalSection(mp));
/* ... */
  EnterCriticalSection(mp);
/* ... */
#ifdef USEPOSIXTHREADS
  while ((status = pthread_mutex_trylock(mp)) != 0);

/* rt_mutex_unlock() */
  LeaveCriticalSection(mp);
#ifdef USEPOSIXTHREADS
  status = pthread_mutex_unlock(mp);
/* ... */
  status = mutex_unlock(mp);

/* rt_mutex_destroy() */
  DeleteCriticalSection(mp);
#ifdef USEPOSIXTHREADS
  status = pthread_mutex_destroy(mp);
/* ... */
  status = mutex_destroy(mp);

/* rt_cond_init() */
#if defined(RTUSEWIN2008CONDVARS)
  InitializeConditionVariable(cvp);
/* ... */
  cvp->events[RT_COND_SIGNAL]    = CreateEvent(NULL, /* ... */
  cvp->events[RT_COND_BROADCAST] = CreateEvent(NULL, /* ... */
/* ... */
#ifdef USEPOSIXTHREADS
  status = pthread_cond_init(cvp, NULL);
/* ... */
  status = cond_init(cvp, USYNC_THREAD, NULL);

/* rt_cond_destroy() */
#if defined(RTUSEWIN2008CONDVARS)
/* ... */
  CloseHandle(cvp->events[RT_COND_SIGNAL]);
  CloseHandle(cvp->events[RT_COND_BROADCAST]);
/* ... */
#ifdef USEPOSIXTHREADS
  status = pthread_cond_destroy(cvp);
/* ... */
  status = cond_destroy(cvp);
/* rt_cond_wait(): Win32 event-based emulation and POSIX paths */
#if defined(THR) && defined(_MSC_VER)
#if defined(RTUSEWIN2008CONDVARS)
  SleepConditionVariableCS(cvp, mp, INFINITE);
/* ... */
#if !defined(RTUSEINTERLOCKEDATOMICOPS)
  EnterCriticalSection(&cvp->waiters_lock);
  /* ... */
  LeaveCriticalSection(&cvp->waiters_lock);
/* ... */
  InterlockedIncrement(&cvp->waiters);
/* ... */
  LeaveCriticalSection(mp);
  result = WaitForMultipleObjects(2, cvp->events, FALSE, INFINITE);
/* ... */
#if !defined(RTUSEINTERLOCKEDATOMICOPS)
  EnterCriticalSection(&cvp->waiters_lock);
  /* ... */
    ((result == (WAIT_OBJECT_0 + RT_COND_BROADCAST)) && cvp->waiters == 0);
  LeaveCriticalSection(&cvp->waiters_lock);
/* ... */
  my_waiter = InterlockedDecrement(&cvp->waiters);
  /* ... */
    ((result == (WAIT_OBJECT_0 + RT_COND_BROADCAST)) && my_waiter == 0);
/* ... */
    ResetEvent(cvp->events[RT_COND_BROADCAST]);
/* ... */
  EnterCriticalSection(mp);
/* ... */
#ifdef USEPOSIXTHREADS
  status = pthread_cond_wait(cvp, mp);
/* ... */
  status = cond_wait(cvp, mp);

/* rt_cond_signal() */
#if defined(RTUSEWIN2008CONDVARS)
  WakeConditionVariable(cvp);
/* ... */
#if !defined(RTUSEINTERLOCKEDATOMICOPS)
  EnterCriticalSection(&cvp->waiters_lock);
  int have_waiters = (cvp->waiters > 0);
  LeaveCriticalSection(&cvp->waiters_lock);
  /* ... */
    SetEvent(cvp->events[RT_COND_SIGNAL]);
/* ... */
  if (InterlockedExchangeAdd(&cvp->waiters, 0) > 0)
    SetEvent(cvp->events[RT_COND_SIGNAL]);
/* ... */
#ifdef USEPOSIXTHREADS
  status = pthread_cond_signal(cvp);
/* ... */
  status = cond_signal(cvp);

/* rt_cond_broadcast() */
#if defined(RTUSEWIN2008CONDVARS)
  WakeAllConditionVariable(cvp);
/* ... */
#if !defined(RTUSEINTERLOCKEDATOMICOPS)
  EnterCriticalSection(&cvp->waiters_lock);
  int have_waiters = (cvp->waiters > 0);
  LeaveCriticalSection(&cvp->waiters_lock);
  /* ... */
    SetEvent(cvp->events[RT_COND_BROADCAST]);
/* ... */
  if (InterlockedExchangeAdd(&cvp->waiters, 0) > 0)
    SetEvent(cvp->events[RT_COND_BROADCAST]);
/* ... */
#ifdef USEPOSIXTHREADS
  status = pthread_cond_broadcast(cvp);
/* ... */
  status = cond_broadcast(cvp);
/* rt_atomic_int_init() / rt_atomic_int_destroy() / rt_atomic_int_set():
   per-platform implementation branches (bodies elided in this listing) */
#if defined(USEGCCATOMICS)
#elif defined(USENETBSDATOMICS)
#elif defined(USESOLARISATOMICS)
#elif defined(USEWIN32ATOMICS)
/* ... */
#if defined(USEGCCATOMICS)
#elif defined(USENETBSDATOMICS)
#elif defined(USESOLARISATOMICS)
#elif defined(USEWIN32ATOMICS)
/* ... */
#if defined(USEGCCATOMICS)
#elif defined(USENETBSDATOMICS)
#elif defined(USESOLARISATOMICS)
#elif defined(USEWIN32ATOMICS)
/* ... */
  retval = atomp->val;
/* ... */
  retval = atomp->val;

/* rt_atomic_int_get() */
#if defined(USEGCCATOMICS)
  retval = atomp->val;
#elif defined(USENETBSDATOMICS)
  retval = atomp->val;
#elif defined(USESOLARISATOMICS)
  retval = atomp->val;
#elif defined(USEWIN32ATOMICS)
  retval = atomp->val;
/* ... */
  retval = atomp->val;
/* ... */
  retval = atomp->val;

/* rt_atomic_int_fetch_and_add(): returns the value prior to the addition */
#if defined(USEGCCATOMICS)
  return __sync_fetch_and_add(&atomp->val, inc);
#elif defined(USENETBSDATOMICS)
  return atomic_add_int_nv(&atomp->val, inc) - inc;
#elif defined(USESOLARISATOMICS)
  return atomic_add_int_nv(&atomp->val, inc) - inc;
#elif defined(USEWIN32ATOMICS)
  return InterlockedExchangeAdd(&atomp->val, inc);
/* ... */
  retval = atomp->val;
/* ... */
  int retval = atomp->val;

/* rt_atomic_int_add_and_fetch(): returns the newly updated value */
#if defined(USEGCCATOMICS)
  return __sync_add_and_fetch(&atomp->val, inc);
#elif defined(USENETBSDATOMICS)
  return atomic_add_int_nv(&atomp->val, inc);
#elif defined(USESOLARISATOMICS)
  return atomic_add_int_nv(&atomp->val, inc);
#elif defined(USEWIN32ATOMICS)
  return InterlockedExchangeAdd(&atomp->val, inc) + inc;
/* ... */
  retval = atomp->val;
/* ... */
  retval = atomp->val;
/* rt_rwlock_init() */
  rwp->waiting_writers = 0;
#ifdef USEPOSIXTHREADS
  pthread_mutex_init(&rwp->lock, NULL);
  pthread_cond_init(&rwp->rdrs_ok, NULL);
  pthread_cond_init(&rwp->wrtr_ok, NULL);
/* ... */
  rwp->waiting_writers = 0;
/* ... */
  status = rwlock_init(rwp, USYNC_THREAD, NULL);

/* rt_rwlock_readlock(): wait while a writer holds the lock or writers wait */
  while (rwp->rwlock < 0 || rwp->waiting_writers)
/* ... */
#ifdef USEPOSIXTHREADS
  pthread_mutex_lock(&rwp->lock);
  while (rwp->rwlock < 0 || rwp->waiting_writers)
    pthread_cond_wait(&rwp->rdrs_ok, &rwp->lock);
/* ... */
  pthread_mutex_unlock(&rwp->lock);
/* ... */
  status = rw_rdlock(rwp);

/* rt_rwlock_writelock(): wait until no readers or writer hold the lock */
  while (rwp->rwlock != 0) {
    rwp->waiting_writers++;
/* ... */
    rwp->waiting_writers--;
/* ... */
#ifdef USEPOSIXTHREADS
  pthread_mutex_lock(&rwp->lock);
  while (rwp->rwlock != 0) {
    rwp->waiting_writers++;
    pthread_cond_wait(&rwp->wrtr_ok, &rwp->lock);
    rwp->waiting_writers--;
/* ... */
  pthread_mutex_unlock(&rwp->lock);
/* ... */
  status = rw_wrlock(rwp);

/* rt_rwlock_unlock(): wake a waiting writer if possible, else waiting readers */
  if (rwp->rwlock > 0) {
/* ... */
  ww = (rwp->waiting_writers && rwp->rwlock == 0);
  wr = (rwp->waiting_writers == 0);
/* ... */
#ifdef USEPOSIXTHREADS
  pthread_mutex_lock(&rwp->lock);
  if (rwp->rwlock > 0) {
/* ... */
  ww = (rwp->waiting_writers && rwp->rwlock == 0);
  wr = (rwp->waiting_writers == 0);
  pthread_mutex_unlock(&rwp->lock);
/* ... */
    pthread_cond_signal(&rwp->wrtr_ok);
/* ... */
    pthread_cond_signal(&rwp->rdrs_ok);
/* ... */
  status = rw_unlock(rwp);
/* rt_thread_barrier_init() */
  if (barrier != NULL) {

/* rt_thread_barrier_init_proc_shared(): process-shared barrier used for
   multi-process (e.g. CAVE) rendering */
#ifdef USEPOSIXTHREADS
  if (barrier != NULL) {
/* ... */
    pthread_mutexattr_t mattr;
    pthread_condattr_t  cattr;

    printf("Setting barriers to have system scope...\n");

    pthread_mutexattr_init(&mattr);
    if (pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED) != 0) {
      printf("WARNING: could not set mutex to process shared scope\n");
/* ... */
    pthread_condattr_init(&cattr);
    if (pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED) != 0) {
      printf("WARNING: could not set condition variable to process shared scope\n");
/* ... */
    pthread_mutex_init(&barrier->lock, &mattr);
    pthread_cond_init(&barrier->wait_cv, &cattr);
/* ... */
    pthread_condattr_destroy(&cattr);
    pthread_mutexattr_destroy(&mattr);
/* rt_thread_barrier(): counting barrier; each caller adds its increment to
   barrier->sum, and all callers block until the barrier phase flips */
  my_phase = barrier->phase;
  barrier->sum += increment;
/* ... */
    barrier->phase = 1 - my_phase;
/* ... */
  while (barrier->phase == my_phase) {
/* ... */
  my_result = barrier->result;

/* rt_thread_run_barrier_init() */
  if (barrier != NULL) {
/* ... */
    barrier->fctn = NULL;

/* rt_thread_run_barrier(): wait until all workers reach the barrier, then
   return the function pointer posted by the master thread */
void * (* rt_thread_run_barrier(rt_run_barrier_t *barrier,
                                void * fctn(void*),
                                void * parms,
                                void **rsltparms))(void *) {
/* ... */
  void * (*my_result)(void*);
/* ... */
  my_phase = barrier->phase;
/* ... */
    barrier->fctn = fctn;
    barrier->parms = parms;
/* ... */
    barrier->fctn = NULL;
    barrier->parms = NULL;
/* ... */
    barrier->phase = 1 - my_phase;
/* ... */
  while (barrier->phase == my_phase) {
/* ... */
  my_result = barrier->rslt;
  if (rsltparms != NULL)
/* ... */
  void * (*my_result)(void*) = fctn;
  if (rsltparms != NULL)
/* task tile stack internals */
    int newsize = s->top + 1;

/* rt_shared_iterator internals */
    if (tile->end > it->end) {

/* rt_threadpool_workerproc(): worker thread main loop */
static void * rt_threadpool_workerproc(void *voidparms) {
  void *(*fctn)(void*);
/* ... */
    (*fctn)(workerdata);
/* ... */
static void * rt_threadpool_workersync(void *voidparms) {

/* rt_threadpool_create() */
  if (thrpool == NULL)
/* ... */
  thrpool->devlist = (int *) malloc(sizeof(int) * workercount);
/* ... */
  if (devlist == NULL) {
    for (i=0; i<workercount; i++)
/* ... */
    memcpy(thrpool->devlist, devlist, sizeof(int) * workercount);
/* ... */
  for (i=0; i<workercount; i++) {
/* ... */
  for (i=0; i<workercount; i++) {
/* rt_threadpool_launch() */
int rt_threadpool_launch(rt_threadpool_t *thrpool,
                         void *fctn(void *), void *parms, int blocking) {
  if (thrpool == NULL)

/* rt_threadpool_worker_getid() */
  if (threadid != NULL)
  if (threadcount != NULL)

/* rt_threadpool_worker_getdevid() */
    *devid = worker->devid;

/* rt_threadpool_worker_devscaletile(): scale the max tile size by the
   worker's relative device speed */
  if (tilesize != NULL) {
/* ... */
    scaledtilesize = (int) (worker->devspeed * ((float) (*tilesize)));
    if (scaledtilesize < 1)
/* ... */
    *tilesize = scaledtilesize;

/* rt_threadpool_worker_getdata() */
  if (clientdata != NULL)
    *clientdata = worker->parms;
/* ... */
  if (thrpool == NULL)

/* rt_threadlaunch(): spawn up to numprocs threads sharing one iterator */
  if (threads == NULL)
/* ... */
  if (parms == NULL) {
/* ... */
  for (i=0; i<numprocs; i++) {
    parms[i].iter = &iter;
/* ... */
  if (numprocs == 1) {
/* ... */
    fctn((void *) &parms[0]);
/* ... */
    for (i=0; i<numprocs; i++) {
/* ... */
    for (i=0; i<numprocs; i++) {
/* ... */
  fctn((void *) &parms[0]);

/* rt_threadlaunch_getid() / rt_threadlaunch_getdata() */
  if (threadid != NULL)
  if (threadcount != NULL)
/* ... */
  if (clientdata != NULL)
 #define CPU_ARM64_AES
AES insns avail. 
 
rt_shared_iterator_t * iter
dynamic work scheduler 
 
rt_mutex_t mtx
Mutex lock for the structure. 
 
int * devlist
per-worker CPU/GPU device IDs 
 
iterator used for dynamic load balancing 
 
int n_clients
Number of threads to wait for at barrier. 
 
int rt_cpu_smt_depth(void)
CPU SMT depth (number of logical processors per physical core, aka hyperthreading) 
 
int rt_tilestack_push(rt_tilestack_t *s, const rt_tasktile_t *t)
push a task tile onto the stack 
 
int rt_mutex_lock(rt_mutex_t *mp)
lock a mutex 
 
int rt_atomic_int_get(rt_atomic_int_t *atomp)
get an atomic int variable 
 
int rt_shared_iterator_set(rt_shared_iterator_t *it, rt_tasktile_t *tile)
set shared iterator parameters 
 
int rt_thread_numphysprocessors(void)
number of physical processors, not counting SMT/hyperthreaded logical processors 
 
void * clientdata
worker parameters 
 
int rt_cond_destroy(rt_cond_t *cvp)
destroy a condition variable 
 
int rt_thread_barrier(rt_barrier_t *barrier, int increment)
synchronize on counting barrier primitive 
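A minimal usage sketch for the counting barrier, assuming the declarations live in a header named "threads.h" (name assumed here) and that the returned int is the barrier's 'result' field described elsewhere in this list:
  #include "threads.h"            /* assumed header name for these APIs */

  static rt_barrier_t *bar;       /* created once by the master thread */

  void master_setup(int nworkers) {
    bar = rt_thread_barrier_init(nworkers);
  }

  void * worker(void *arg) {
    /* ... do this phase's work ... */
    /* each worker passes an increment; all callers block until the
       n_clients'th arrival, then each receives the barrier result */
    int result = rt_thread_barrier(bar, 1);
    (void) result;
    return NULL;
  }
When the workers are finished, rt_thread_barrier_destroy(bar) releases the barrier.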
 
int result
Answer to be returned by barrier_wait. 
 
void *(* fctn)(void *)
Fctn ptr to call, or NULL if done. 
 
rt_mutex_t lock
Mutex lock for the structure. 
 
#define CPU_ARM64_CRC32
CRC32 insns avail. 
 
Routines to generate a pool of threads which then grind through a dynamically load balanced work queu...
 
#define CPU_ARM64_FP
FP insns avail. 
 
#define CPU_ARM64_SVE
Scalable Vector Extns avail. 
 
int rt_threadpool_worker_getdevspeed(void *voiddata, float *speed)
worker thread calls this to get relative speed of this device as determined by the SM/core count and ...
 
rt_threadpool_t * rt_threadpool_create(int workercount, int *devlist)
create a thread pool with a specified number of worker threads 
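A sketch of the pool lifecycle under stated assumptions: the header name "threads.h" is assumed, a NULL devlist makes the pool assign default device IDs (as the create excerpt earlier on this page suggests), and blocking=1 is assumed to make the launch wait for completion:
  #include "threads.h"   /* assumed header name */

  static void * pool_task(void *voidparms) {
    int id, count;
    rt_threadpool_worker_getid(voidparms, &id, &count);
    /* ... per-worker work, e.g. via rt_threadpool_next_tile() ... */
    return NULL;
  }

  void run_pool(void) {
    rt_threadpool_t *pool = rt_threadpool_create(rt_thread_numprocessors(), NULL);
    if (pool == NULL)
      return;
    rt_threadpool_launch(pool, pool_task, NULL, 1);  /* 1: assumed blocking */
    rt_threadpool_destroy(pool);                     /* join workers, free */
  }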
 
void rt_thread_barrier_destroy(rt_barrier_t *barrier)
destroy counting barrier primitive 
 
#define CPU_ARM64_SHA2
SHA-2 insns avail. 
 
Task tile struct for stack, iterator, and scheduler routines; 'start' is inclusive, 'end' is exclusive. 
 
#define CPU_ARM64_ASIMD
Advanced SIMD avail. 
 
int rt_rwlock_init(rt_rwlock_t *rwp)
initialize a reader/writer lock 
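A small reader/writer lock sketch using these calls (the header name is an assumption; rt_rwlock_unlock releases whichever lock the caller holds):
  #include "threads.h"   /* assumed header name */

  static rt_rwlock_t maplock;
  static int shared_value = 0;

  void table_setup(void) { rt_rwlock_init(&maplock); }

  int table_read(void) {
    int v;
    rt_rwlock_readlock(&maplock);   /* multiple readers may enter */
    v = shared_value;
    rt_rwlock_unlock(&maplock);
    return v;
  }

  void table_write(int v) {
    rt_rwlock_writelock(&maplock);  /* exclusive access for the writer */
    shared_value = v;
    rt_rwlock_unlock(&maplock);
  }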
 
int rt_thread_set_self_cpuaffinity(int cpu)
set the CPU affinity of the current thread (if allowed by host system) 
 
float devspeed
speed scaling for this device 
 
void * rsltparms
parms to return to barrier wait callers 
 
rt_barrier_t * rt_thread_barrier_init(int n_clients)
initialize counting barrier primitive 
 
#define CPU_HT
x86 Hyperthreading detected 
 
void rt_thread_run_barrier_destroy(rt_run_barrier_t *barrier)
destroy thread pool barrier 
 
int rt_threadlaunch(int numprocs, void *clientdata, void *fctn(void *), rt_tasktile_t *tile)
launch up to numprocs threads using shared iterator as a load balancer 
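A sketch of the load-balanced launch pattern. The header name, the tile request size of 8, and the doubling "work" are illustrative assumptions; the tile fields and the RT_SCHED_CONTINUE convention come from the entries in this list:
  #include "threads.h"   /* assumed header name */

  typedef struct { float *data; } job_t;    /* hypothetical client data */

  static void * job_worker(void *voidparms) {
    rt_tasktile_t tile;
    job_t *job = NULL;
    rt_threadlaunch_getdata(voidparms, (void **) &job);
    while (rt_threadlaunch_next_tile(voidparms, 8, &tile) == RT_SCHED_CONTINUE) {
      int i;
      for (i=tile.start; i<tile.end; i++)
        job->data[i] *= 2.0f;               /* hypothetical work item */
    }
    return NULL;
  }

  int run_jobs(float *data, int n) {
    job_t job;
    rt_tasktile_t tile;
    job.data   = data;
    tile.start = 0;    /* starting task ID (inclusive) */
    tile.end   = n;    /* ending task ID (exclusive) */
    return rt_threadlaunch(rt_thread_numprocessors(), &job, job_worker, &tile);
  }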
 
#define CPU_SSE2
SSE2 SIMD avail. 
 
int rt_tilestack_pop(rt_tilestack_t *s, rt_tasktile_t *t)
pop a task tile off of the stack 
 
int size
current allocated stack size 
 
rt_threadpool_workerdata_t * workerdata
per-worker data 
 
#define CPU_AVX
AVX SIMD avail. 
 
rt_cond_t wait_cv
Clients wait on condition variable to proceed. 
 
int * rt_cpu_affinitylist(int *cpuaffinitycount)
query CPU affinity of the calling process (if allowed by host system) 
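A usage sketch; it assumes (per the excerpt earlier on this page) that the returned list is heap-allocated and owned by the caller, and that *cpuaffinitycount is set to -1 when the query is unsupported:
  #include <stdio.h>
  #include <stdlib.h>
  #include "threads.h"   /* assumed header name */

  void show_affinity(void) {
    int count = 0;
    int *cpus = rt_cpu_affinitylist(&count);
    if (cpus != NULL && count > 0) {
      int i;
      for (i=0; i<count; i++)
        printf("process may run on CPU %d\n", cpus[i]);
      free(cpus);   /* list was malloc'ed by the query */
    } else {
      printf("affinity query unavailable (count=%d)\n", count);
    }
  }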
 
rt_tasktile_t * s
stack of task tiles 
 
int top
index of top stack element 
 
int rt_thread_run_barrier_init(rt_run_barrier_t *barrier, int n_clients)
initialize thread pool barrier 
 
#define RT_SCHED_DONE
no work remains in the queue (shared iterators are intended only for trivial CPU/GPU load balancing, with no exception handling capability) 
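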
 
int rt_mutex_destroy(rt_mutex_t *mp)
destroy a mutex 
 
#define CPU_ARM64_SHA512
SHA-512 insns avail. 
 
int rt_rwlock_readlock(rt_rwlock_t *rwp)
set reader lock 
 
int rt_threadlaunch_getdata(void *voidparms, void **clientdata)
worker thread can call this to get its client data pointer 
 
#define CPU_ARM64_SHA1
SHA-1 insns avail. 
 
int end
ending value (exclusive) 
 
int rt_atomic_int_init(rt_atomic_int_t *atomp, int val)
initialize an atomic int variable 
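A counter sketch using the atomic int APIs listed here (header name assumed):
  #include "threads.h"   /* assumed header name */

  static rt_atomic_int_t tasks_done;

  void tasks_reset(void)    { rt_atomic_int_init(&tasks_done, 0); }
  void tasks_shutdown(void) { rt_atomic_int_destroy(&tasks_done); }

  void task_finished(void) {
    /* fetch_and_add returns the pre-increment value,
       add_and_fetch returns the new value */
    rt_atomic_int_fetch_and_add(&tasks_done, 1);
  }

  int tasks_completed(void) {
    return rt_atomic_int_get(&tasks_done);
  }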
 
#define CPU_UNKNOWN
Unknown CPU type. 
 
int n_clients
Number of threads to wait for at barrier. 
 
int rt_mutex_init(rt_mutex_t *mp)
initialize a mutex 
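A minimal mutex sketch (header name assumed; return codes ignored for brevity):
  #include "threads.h"   /* assumed header name */

  static rt_mutex_t countlock;
  static int counter = 0;

  void counter_setup(void)    { rt_mutex_init(&countlock); }
  void counter_shutdown(void) { rt_mutex_destroy(&countlock); }

  void counter_bump(void) {
    rt_mutex_lock(&countlock);
    counter++;
    rt_mutex_unlock(&countlock);
  }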
 
Tachyon cross-platform thread creation and management, atomic operations, and CPU feature query APIs...
 
int rt_threadpool_launch(rt_threadpool_t *thrpool, void *fctn(void *), void *parms, int blocking)
launch threads onto a new function, with associated parms 
 
int rt_thread_setconcurrency(int nthr)
set the concurrency level and scheduling scope for threads 
 
rt_run_barrier_t runbar
master/worker run barrier 
 
#define RT_SCHED_CONTINUE
some work remains in the queue 
 
int rt_rwlock_unlock(rt_rwlock_t *rwp)
unlock reader/writer lock 
 
#define CPU_ARM64_ASIMDRDM
Advanced SIMD RDM avail. 
 
int rt_atomic_int_fetch_and_add(rt_atomic_int_t *atomp, int inc)
fetch an atomic int and add inc to it, returning original value 
 
int rt_mutex_trylock(rt_mutex_t *mp)
try to lock a mutex 
 
int rt_shared_iterator_getfatalerror(rt_shared_iterator_t *it)
master thread calls this to query for fatal errors 
 
rt_tilestack_t * errorstack
stack of tiles that failed 
 
int rt_threadlaunch_getid(void *voidparms, int *threadid, int *threadcount)
worker thread can call this to get its ID and number of peers 
 
thread-specific handle data for workers 
 
#define RT_TILESTACK_EMPTY
 
int rt_thread_numprocessors(void)
number of processors available, subject to user override 
 
int rt_tilestack_popall(rt_tilestack_t *s)
pop all of the task tiles off of the stack 
 
int rt_threadpool_tile_failed(void *voidparms, rt_tasktile_t *tile)
worker thread calls this when a failure occurs on a tile it has already taken from the scheduler ...
 
#define CPU_ARM64_ASIMDHP
Advanced SIMD HP avail. 
 
#define CPU_ARM64_SHA3
SHA-3 insns avail. 
 
int end
ending task ID (exclusive) 
 
int val
Integer value to be atomically manipulated. 
 
int rt_rwlock_writelock(rt_rwlock_t *rwp)
set writer lock 
 
int rt_threadpool_worker_getdata(void *voiddata, void **clientdata)
worker thread can call this to get its client data pointer 
 
int sum
Sum of arguments passed to barrier_wait. 
 
int fatalerror
cancel processing immediately for all threads 
 
#define CPU_AVX512CD
AVX-512CD SIMD avail. 
 
int rt_cpu_capability_flags(rt_cpu_caps_t *cpucaps)
CPU optional instruction set capability flags. 
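A query sketch, assuming the header name "threads.h", that a zero return indicates success, and that rt_cpu_caps_t exposes the flags bitmask assigned in the excerpt earlier on this page:
  #include <stdio.h>
  #include "threads.h"   /* assumed header name */

  void report_cpu(void) {
    rt_cpu_caps_t caps;
    if (rt_cpu_capability_flags(&caps) == 0) {   /* assumed: 0 on success */
      if (caps.flags & CPU_AVX2)
        printf("AVX2 SIMD available\n");
      if (caps.flags & CPU_HT)
        printf("SMT depth: %d\n", rt_cpu_smt_depth());
    }
    printf("usable processors: %d\n", rt_thread_numprocessors());
  }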
 
int threadcount
total number of worker threads 
 
#define CPU_AVX2
AVX2 SIMD avail. 
 
rt_mutex_t lock
Mutex lock for the structure. 
 
int start
starting task ID (inclusive) 
 
#define CPU_AVX512F
AVX-512F SIMD avail. 
 
void *(* rslt)(void *)
Fctn ptr to return to barrier wait callers. 
 
int rt_tilestack_empty(rt_tilestack_t *s)
query if the task tile stack is empty or not 
 
int rt_threadpool_worker_getid(void *voiddata, int *threadid, int *threadcount)
worker thread can call this to get its ID and number of peers 
 
#define CPU_AVX512ER
AVX-512ER SIMD avail. 
 
void * parms
fctn parms for this worker 
 
int rt_tilestack_compact(rt_tilestack_t *s)
shrink memory buffers associated with task tile stack if possible 
 
int rt_threadpool_get_workercount(rt_threadpool_t *thrpool)
return the number of worker threads currently in the pool 
 
stack of work tiles, for error handling 
 
int rt_thread_run_barrier_poll(rt_run_barrier_t *barrier)
non-blocking poll to see if peers are already at the barrier 
 
#define CPU_ARM64_ASIMDDP
Advanced SIMD DP avail. 
 
int n_waiting
Number of currently waiting threads. 
 
int rt_cond_signal(rt_cond_t *cvp)
signal a condition variable, waking at least one thread 
 
atomic int structure with padding to prevent false sharing 
 
int rt_threadlaunch_next_tile(void *voidparms, int reqsize, rt_tasktile_t *tile)
iterate the shared iterator over the requested half-open interval 
 
int rt_thread_barrier_init_proc_shared(rt_barrier_t *barrier, int n_clients)
When rendering in the CAVE we use a special synchronization mode so that shared memory mutexes and co...
 
rt_shared_iterator_t iter
dynamic work scheduler 
 
int rt_mutex_unlock(rt_mutex_t *mp)
unlock a mutex 
 
void * thrpool
void ptr to thread pool struct 
 
int growthrate
stack growth chunk size 
 
rt_tilestack_t errorstack
stack of tiles that failed 
 
int rt_thread_join(rt_thread_t thr, void **stat)
join (wait for completion of, and merge with) a thread 
 
int rt_threadpool_getfatalerror(void *voidparms)
master thread calls this to query for fatal errors 
 
int rt_threadpool_worker_devscaletile(void *voiddata, int *tilesize)
worker thread calls this to scale max tile size by worker speed as determined by the SM/core count an...
 
int rt_threadlaunch_setfatalerror(void *voidparms)
worker thread calls this to indicate that an unrecoverable error occurred 
 
int n_waiting
Number of currently waiting threads. 
 
int rt_atomic_int_destroy(rt_atomic_int_t *atomp)
destroy an atomic int variable 
 
#define CPU_FMA
FMA insns avail. 
 
barrier sync object with padding to prevent false sharing 
 
int rt_shared_iterator_next_tile(rt_shared_iterator_t *it, int reqsize, rt_tasktile_t *tile)
iterate the shared iterator, over a requested half-open interval 
 
rt_mutex_t lock
Mutex lock for the structure. 
 
int rt_cond_broadcast(rt_cond_t *cvp)
signal a condition variable, waking all threads 
 
int start
starting value (inclusive) 
 
#define CPU_AVX512PF
AVX-512PF SIMD avail. 
 
int workercount
number of worker threads 
 
int rt_threadpool_setfatalerror(void *voidparms)
worker thread calls this to indicate that an unrecoverable error occurred 
 
int rt_threadpool_destroy(rt_threadpool_t *thrpool)
join all worker threads and free resources 
 
int rt_atomic_int_set(rt_atomic_int_t *atomp, int val)
set an atomic int variable 
 
int rt_shared_iterator_init(rt_shared_iterator_t *it)
initialize a shared iterator 
 
#define CPU_SMTDEPTH_UNKNOWN
Unknown SMT depth. 
 
rt_shared_iterator_t * iter
dynamic scheduler iterator 
 
#define CPU_SSE4_1
SSE4.1 SIMD avail. 
 
void rt_tilestack_destroy(rt_tilestack_t *s)
destroy task tile stack 
 
int rt_cond_wait(rt_cond_t *cvp, rt_mutex_t *mp)
wait on a condition variable 
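The usual predicate-loop pattern with these wrappers (header name assumed; the mutex and condition variable are initialized elsewhere with rt_mutex_init and rt_cond_init):
  #include "threads.h"   /* assumed header name */

  static rt_mutex_t qlock;
  static rt_cond_t  qcond;
  static int ready = 0;

  void wait_until_ready(void) {
    rt_mutex_lock(&qlock);
    while (!ready)                   /* re-check the predicate on each wakeup */
      rt_cond_wait(&qcond, &qlock);  /* atomically unlocks, waits, relocks */
    rt_mutex_unlock(&qlock);
  }

  void mark_ready(void) {
    rt_mutex_lock(&qlock);
    ready = 1;
    rt_cond_signal(&qcond);          /* or rt_cond_broadcast() to wake all */
    rt_mutex_unlock(&qlock);
  }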
 
int rt_mutex_spin_lock(rt_mutex_t *mp)
lock a mutex by spinning only 
 
rt_cond_t wait_cv
Clients wait on condition variable to proceed. 
 
int rt_threadpool_next_tile(void *voidparms, int reqsize, rt_tasktile_t *tile)
iterate the shared iterator over the requested half-open interval 
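A dynamic-scheduling sketch: the master posts the half-open work interval with rt_threadpool_sched_dynamic, and each pool worker drains it in tiles. The header name, the tile request size of 16, and the blocking launch flag are assumptions:
  #include "threads.h"   /* assumed header name */

  static void * tile_worker(void *voidparms) {
    rt_tasktile_t tile;
    while (rt_threadpool_next_tile(voidparms, 16, &tile) == RT_SCHED_CONTINUE) {
      int i;
      for (i=tile.start; i<tile.end; i++) {
        /* ... process work unit i ... */
      }
    }
    return NULL;
  }

  void schedule_and_run(rt_threadpool_t *pool, int total) {
    rt_tasktile_t range;
    range.start = 0;        /* inclusive */
    range.end   = total;    /* exclusive */
    rt_threadpool_sched_dynamic(pool, &range);
    rt_threadpool_launch(pool, tile_worker, NULL, 1);  /* 1: assumed blocking */
  }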
 
int phase
Flag to separate waiters from fast workers. 
 
void * parms
parms for fctn pointer 
 
void *(*)(void *) rt_thread_run_barrier(rt_run_barrier_t *barrier, void *fctn(void *), void *parms, void **rsltparms)
Wait until all threads reach barrier, and return the function pointer passed in by the master thread...
 
void *(* RTTHREAD_START_ROUTINE)(void *)
Typedef to eliminate compiler warning caused by C/C++ linkage conflict. 
 
rt_thread_t * threads
worker threads 
 
int rt_threadpool_worker_setdevspeed(void *voiddata, float speed)
worker thread calls this to set relative speed of this device as determined by the SM/core count and ...
 
#define CPU_HYPERVISOR
VM/Hypervisor environment. 
 
int rt_threadpool_worker_getdevid(void *voiddata, int *devid)
worker thread can call this to get its CPU/GPU device ID 
 
int threadid
ID of worker thread. 
 
int threadid
worker thread's id 
 
int rt_shared_iterator_setfatalerror(rt_shared_iterator_t *it)
worker thread calls this to indicate a fatal error 
 
#define CPU_F16C
F16C insns avail. 
 
int rt_atomic_int_add_and_fetch(rt_atomic_int_t *atomp, int inc)
fetch an atomic int and add inc to it, returning new value 
 
int rt_cond_init(rt_cond_t *cvp)
initialize a condition variable 
 
run-barrier sync object with padding to prevent false sharing 
 
int devid
worker CPU/GPU device ID 
 
int rt_shared_iterator_destroy(rt_shared_iterator_t *it)
destroy a shared iterator 
 
int rt_threadpool_sched_dynamic(rt_threadpool_t *thrpool, rt_tasktile_t *tile)
Set shared iterator state to half-open interval defined by tile. 
 
#define CPU_ARM64_ASIMDFHM
Advanced SIMD FHM avail. 
 
int threadcount
number of workers 
 
int phase
Flag to separate waiters from fast workers. 
 
int rt_thread_create(rt_thread_t *thr, void *fctn(void *), void *arg)
create a new child thread 
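A create/join sketch (header name assumed; a zero return from rt_thread_create is assumed to mean success):
  #include <stdio.h>
  #include "threads.h"   /* assumed header name */

  static void * hello(void *arg) {
    printf("worker says: %s\n", (const char *) arg);
    return NULL;
  }

  void spawn_one(void) {
    rt_thread_t thr;
    void *rc = NULL;
    static char msg[] = "hi";
    if (rt_thread_create(&thr, hello, msg) == 0) {
      rt_thread_join(thr, &rc);   /* wait for completion, collect return value */
    }
  }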
 
int rt_threadpool_wait(rt_threadpool_t *thrpool)
wait for all worker threads to complete their work 
 
int rt_tilestack_init(rt_tilestack_t *s, int size)
initialize task tile stack (to empty) 
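A small stack sketch; the header name, the interpretation of the 'size' argument as an initial capacity, and the empty-test convention are assumptions based on the struct members listed here:
  #include "threads.h"   /* assumed header name */

  void stash_and_replay(void) {
    rt_tilestack_t stack;
    rt_tasktile_t t;
    rt_tilestack_init(&stack, 32);   /* 32: assumed initial capacity */
    t.start = 0;
    t.end   = 64;
    rt_tilestack_push(&stack, &t);
    while (!rt_tilestack_empty(&stack)) {  /* assumed: nonzero when empty */
      rt_tilestack_pop(&stack, &t);
      /* ... handle tile [t.start, t.end) ... */
    }
    rt_tilestack_compact(&stack);    /* shrink buffers if possible */
    rt_tilestack_destroy(&stack);
  }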
 
int rt_threadpool_poll(rt_threadpool_t *thrpool)