Tachyon (current)  Current Main Branch
TachyonOptiXShaders.h
Go to the documentation of this file.
1 /*
2  * TachyonOptiXShaders.h - prototypes for OptiX PTX shader routines
3  *
4  * (C) Copyright 2013-2022 John E. Stone
5  * SPDX-License-Identifier: BSD-3-Clause
6  *
7  * $Id: TachyonOptiXShaders.h,v 1.67 2022/04/19 02:54:24 johns Exp $
8  *
9  */
10 
22 //
23 // This is a second generation of the Tachyon implementation for OptiX.
24 // The new implementation favors the strengths of OptiX 7, and uses
25 // OptiX ray payload registers, direct CUDA interoperability and advanced
26 // CUDA features for both performance and maintainability.
27 //
28 // This software and its line of antecedants are described in:
29 // "Multiscale modeling and cinematic visualization of photosynthetic
30 // energy conversion processes from electronic to cell scales"
31 // M. Sener, S. Levy, J. E. Stone, AJ Christensen, B. Isralewitz,
32 // R. Patterson, K. Borkiewicz, J. Carpenter, C. N. Hunter,
33 // Z. Luthey-Schulten, D. Cox.
34 // J. Parallel Computing, 102, pp. 102698, 2021.
35 // https://doi.org/10.1016/j.parco.2020.102698
36 //
37 // "Omnidirectional Stereoscopic Projections for VR"
38 // J. E. Stone. In, William R. Sherman, editor,
39 // VR Developer Gems, Taylor and Francis / CRC Press, Chapter 24, 2019.
40 // https://www.taylorfrancis.com/chapters/edit/10.1201/b21598-24/omnidirectional-stereoscopic-projections-vr-john-stone
41 //
42 // "Interactive Ray Tracing Techniques for
43 // High-Fidelity Scientific Visualization"
44 // J. E. Stone. In, Eric Haines and Tomas Akenine-Möller, editors,
45 // Ray Tracing Gems, Apress, Chapter 27, pp. 493-515, 2019.
46 // https://link.springer.com/book/10.1007/978-1-4842-4427-2
47 //
48 // "A Planetarium Dome Master Camera"
49 // J. E. Stone. In, Eric Haines and Tomas Akenine-Möller, editors,
50 // Ray Tracing Gems, Apress, Chapter 4, pp. 49-60, 2019.
51 // https://link.springer.com/book/10.1007/978-1-4842-4427-2
52 //
53 // "Immersive Molecular Visualization with Omnidirectional
54 // Stereoscopic Ray Tracing and Remote Rendering"
55 // J. E. Stone, W. R. Sherman, and K. Schulten.
56 // High Performance Data Analysis and Visualization Workshop,
57 // 2016 IEEE International Parallel and Distributed Processing
58 // Symposium Workshops (IPDPSW), pp. 1048-1057, 2016.
59 // http://dx.doi.org/10.1109/IPDPSW.2016.121
60 //
61 // "Atomic Detail Visualization of Photosynthetic Membranes with
62 // GPU-Accelerated Ray Tracing"
63 // J. E. Stone, M. Sener, K. L. Vandivort, A. Barragan, A. Singharoy,
64 // I. Teo, J. V. Ribeiro, B. Isralewitz, B. Liu, B.-C. Goh, J. C. Phillips,
65 // C. MacGregor-Chatwin, M. P. Johnson, L. F. Kourkoutis, C. N. Hunter,
66 // K. Schulten
67 // J. Parallel Computing, 55:17-27, 2016.
68 // http://dx.doi.org/10.1016/j.parco.2015.10.015
69 //
70 // "GPU-Accelerated Molecular Visualization on
71 // Petascale Supercomputing Platforms"
72 // J. E. Stone, K. L. Vandivort, and K. Schulten.
73 // UltraVis'13: Proceedings of the 8th International Workshop on
74 // Ultrascale Visualization, pp. 6:1-6:8, 2013.
75 // http://dx.doi.org/10.1145/2535571.2535595
76 //
77 // "An Efficient Library for Parallel Ray Tracing and Animation"
78 // John E. Stone. Master's Thesis, University of Missouri-Rolla,
79 // Department of Computer Science, April 1998
80 // https://scholarsmine.mst.edu/masters_theses/1747
81 //
82 // "Rendering of Numerical Flow Simulations Using MPI"
83 // J. Stone and M. Underwood.
84 // Second MPI Developers Conference, pages 138-141, 1996.
85 // http://dx.doi.org/10.1109/MPIDC.1996.534105
86 //
87 
88 #ifndef TACHYONOPTIXSHADERS_H
89 #define TACHYONOPTIXSHADERS_H
90 
91 #if 0
92 #define TACHYON_RAYSTATS 1
94 #endif
95 
96 #if OPTIX_VERSION >= 70300
97 #define TACHYON_OPTIXDENOISER 1
98 #endif
99 
100 // enable use of geometry flags to accelerate various work
101 #define TACHYON_USE_GEOMFLAGS 1
102 
103 //
104 // Constants shared by both host and device code
105 //
106 #define RT_DEFAULT_MAX 1e27f
107 
108 //
109 // Beginning of OptiX data structures
110 //
111 
112 // Enable reversed traversal of any-hit rays for shadows/AO.
113 // This optimization yields a 20% performance gain in many cases.
114 // #define USE_REVERSE_SHADOW_RAYS 1
115 
116 // Use reverse rays by default rather than only when enabled interactively
117 // #define USE_REVERSE_SHADOW_RAYS_DEFAULT 1
122 };
123 
127 };
128 
136 };
137 
138 enum RayType {
142 };
143 
144 //
145 // OptiX 7.x geometry type-associated "hit kind" enums
146 //
147 enum RtHitKind {
149 
150  // XXX custom prims offset to start at 2 (see below)
157 };
158 
159 
160 // simplify runtime code for OptiX 7.0.0
161 #if defined(OPTIX_PRIMITIVE_TYPE_CUSTOM)
162 #define RT_CUSTPRIM (OPTIX_PRIMITIVE_TYPE_CUSTOM << 16)
163 #define RT_TRI_BUILTIN (OPTIX_PRIMITIVE_TYPE_TRIANGLE << 16)
164 #else
165 #define RT_CUSTPRIM 0 // OptiX 7.0.0
166 #define RT_TRI_BUILTIN OPTIX_HIT_KIND_TRIANGLE_FRONT_FACE
167 #endif
168 
170  //
171  // Tachyon custom primitives:
172  // XXX to prevent the triangle front/back hit kindl ow-bit masking scheme
173  // (see below) from interfering with the custom prim types,
174  // the lowest byte of their enums must start at values above 0x02
180 
181  //
182  // OptiX 7.x built-in primitives
183  //
184  // XXX we handle both front+back face triangles with a single case by
185  // masking off the low bit from the hit kind value and the enums:
187  (0xFE & OPTIX_HIT_KIND_TRIANGLE_FRONT_FACE),
188 
189 #if OPTIX_VERSION >= 70400
190  RT_PRM_CATMULLROM = (OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM << 16),
191 #endif
192 #if OPTIX_VERSION >= 70200
193  RT_PRM_LINEAR = (OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR << 16),
194 #endif
195 };
196 
197 
198 // Enums used for custom primitive PGM indexing in SBT + GAS
199 enum RtCustPrim {
206 };
207 
212 };
213 
219 };
220 
223  RT_MAT_ALPHA = 0x1,
225 };
226 
227 
228 //
229 // Images, Materials, Textures...
230 //
231 
233 typedef struct {
234  int texflags;
235  float3 texgen_origin;
236  float3 texgen_uaxis;
237  float3 texgen_vaxis;
238  float3 texgen_waxis;
239  cudaArray_t d_img;
240  cudaTextureObject_t tex;
241  int userindex;
242 } rt_texture;
243 
244 
246 typedef struct {
247  float opacity;
248  float ambient;
249  float diffuse;
250  float specular;
251  float shininess;
252  float reflectivity;
253  float outline;
254  float outlinewidth;
255  int transmode;
256  cudaTextureObject_t tex;
257  int matflags;
258  int userindex;
259 } rt_material;
260 
261 
262 //
263 // Lighting data structures
264 //
265 typedef struct {
266  float3 dir;
267 // float3 color; // not yet used
269 
270 typedef struct {
271  float3 pos;
272 // float3 color; // not yet used
274 
275 
276 
277 //
278 // Shader Binding Table (SBT) Data Structures
279 //
280 struct ConeArraySBT {
281  float3 *base;
282  float3 *apex;
283  float *baserad;
284  float *apexrad;
285 };
286 
288  float3 *vertices;
289  float *vertradii;
291 };
292 
294  float3 *start;
295  float3 *end;
296  float *radius;
297 };
298 
299 struct QuadMeshSBT {
300  float3 *vertices;
301  int4 *indices;
302  float3 *normals;
303  uint4 *packednormals;
304  float3 *vertcolors3f;
305  uchar4 *vertcolors4u;
306 };
307 
308 struct RingArraySBT {
309  float3 *center;
310  float3 *norm;
311  float *inrad;
312  float *outrad;
313 };
314 
316  float4 *PosRadius;
317 };
318 
319 struct TriMeshSBT {
320  float3 *vertices;
321  int3 *indices;
322  float3 *normals;
323  uint4 *packednormals;
324  float3 *vertcolors3f;
325  uchar4 *vertcolors4u;
326  float2 *tex2d;
327  float3 *tex3d;
328 };
329 
330 struct GeomSBTHG {
331 #if defined(TACHYON_USE_GEOMFLAGS)
332  // XXX alpha/opacity AH optimization flags to skip material fetching
333  int geomflags;
334 #endif
335  float3 *prim_color;
336  float3 uniform_color;
338 
339  union {
347  };
348 };
349 
350 
351 
353 struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) HGRecord {
354  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
355  GeomSBTHG data;
356 };
357 
365  HGRecord radiance;
366  HGRecord shadow;
367 };
368 
369 
371 struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) ExceptionRecord {
372  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
373  void *data; // dummy value
374 };
375 
377 struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) RaygenRecord {
378  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
379  void *data; // dummy value
380 };
381 
383 struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) MissRecord {
384  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
385  void *data; // dummy value
386 };
387 
388 
393  struct {
394  int2 size;
401  uchar4 *framebuffer;
402 
403 #if defined(TACHYON_OPTIXDENOISER)
404  // buffers required for denoising
405  float4 *denoiser_colorbuffer;
406  int denoiser_enabled;
407 #endif
408 
410  float4 *accum_buffer;
411 
412 #if defined(TACHYON_RAYSTATS)
413  uint4 *raystats1_buffer;
414  uint4 *raystats2_buffer;
415 #endif
416  } frame;
417 
418  struct {
419  float3 bg_color;
422  float3 bg_grad_updir;
427  int fog_mode;
428  float fog_start;
429  float fog_end;
430  float fog_density;
431  float epsilon;
432  } scene;
433 
434  struct {
438  float ao_ambient;
439  float ao_direct;
440  float ao_maxdist;
443  float3 *dir_lights;
445  float3 *pos_lights;
446  } lights;
447 
448  struct {
449  float3 pos;
450  float3 U;
451  float3 V;
452  float3 W;
453  float zoom;
460  } cam;
461 
462  // VR HMD fade+clipping plane/sphere
465  float clipview_end;
466 
468 
469  int max_depth;
470  int max_trans;
472 
473  OptixTraversableHandle traversable;
474 };
475 
476 
477 
478 #ifndef M_PI
479 #define M_PI 3.14159265358979323846
480 #endif
481 #ifndef M_PIf
482 #define M_PIf 3.14159265358979323846f
483 #endif
484 
485 //
486 // Eliminate compiler warnings about any unused functions
487 //
488 //#pragma push
489 // suppress "function was declared but never referenced warning"
490 //#pragma nv_diag_suppress 177
491 
492 
493 //
494 // Vector math helper routines
495 //
496 
497 //
498 // float2 vector operators
499 //
500 inline __host__ __device__ float2 operator+(const float2& a, const float2& b) {
501  return make_float2(a.x + b.x, a.y + b.y);
502 }
503 
504 inline __host__ __device__ float2 operator+(const float2& a, const float s) {
505  return make_float2(a.x + s, a.y + s);
506 }
507 
508 inline __host__ __device__ float2 operator-(const float2& a, const float2& b) {
509  return make_float2(a.x - b.x, a.y - b.y);
510 }
511 
512 inline __host__ __device__ float2 operator-(const float2& a, const float s) {
513  return make_float2(a.x - s, a.y - s);
514 }
515 
516 inline __host__ __device__ float2 operator-(const float s, const float2& a) {
517  return make_float2(s - a.x, s - a.y);
518 }
519 
520 inline __host__ __device__ float2 operator*(const float2& a, const float2& b) {
521  return make_float2(a.x * b.x, a.y * b.y);
522 }
523 
524 inline __host__ __device__ float2 operator*(const float s, const float2& a) {
525  return make_float2(a.x * s, a.y * s);
526 }
527 
528 inline __host__ __device__ float2 operator*(const float2& a, const float s) {
529  return make_float2(a.x * s, a.y * s);
530 }
531 
532 inline __host__ __device__ void operator*=(float2& a, const float s) {
533  a.x *= s; a.y *= s;
534 }
535 
536 inline __host__ __device__ float2 operator/(const float s, const float2& a) {
537  return make_float2(s/a.x, s/a.y);
538 }
539 
540 
541 
542 //
543 // float3 vector operators
544 //
545 inline __host__ __device__ float3 make_float3(const float s) {
546  return make_float3(s, s, s);
547 }
548 
549 inline __host__ __device__ float3 make_float3(const float4& a) {
550  return make_float3(a.x, a.y, a.z);
551 }
552 
553 inline __host__ __device__ float3 operator+(float3 a, float3 b) {
554  return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
555 }
556 
557 inline __host__ __device__ float3 operator-(const float3& a, const float3 &b) {
558  return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
559 }
560 
561 inline __host__ __device__ float3 operator-(const float3& a) {
562  return make_float3(-a.x, -a.y, -a.z);
563 }
564 
565 inline __host__ __device__ void operator+=(float3& a, const float3& b) {
566  a.x += b.x; a.y += b.y; a.z += b.z;
567 }
568 
569 inline __host__ __device__ float3 operator+(const float3& a, const float &b) {
570  return make_float3(a.x + b, a.y + b, a.z + b);
571 }
572 
573 inline __host__ __device__ float3 operator*(const float3& a, const float3 &b) {
574  return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
575 }
576 
577 inline __host__ __device__ float3 operator*(float s, const float3 &a) {
578  return make_float3(s * a.x, s * a.y, s * a.z);
579 }
580 
581 inline __host__ __device__ float3 operator*(const float3 &a, const float s) {
582  return make_float3(a.x * s, a.y * s, a.z * s);
583 }
584 
585 inline __host__ __device__ void operator*=(float3& a, const float s) {
586  a.x *= s; a.y *= s; a.z *= s;
587 }
588 
589 inline __host__ __device__ void operator*=(float3& a, const float3 &b) {
590  a.x *= b.x; a.y *= b.y; a.z *= b.z;
591 }
592 
593 inline __host__ __device__ float3 operator/(const float3 &a, const float3 &b) {
594  return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
595 }
596 
597 
598 //
599 // float4 vector operators
600 //
601 inline __host__ __device__ float4 make_float4(const float3 &a, const float &b) {
602  return make_float4(a.x, a.y, a.z, b);
603 }
604 
605 inline __host__ __device__ float4 make_float4(const float a) {
606  return make_float4(a, a, a, a);
607 }
608 
609 inline __host__ __device__ void operator+=(float4& a, const float4& b) {
610  a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
611 }
612 
613 inline __host__ __device__ float4 operator*(const float4& a, const float s) {
614  return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
615 }
616 
617 inline __host__ __device__ void operator*=(float4& a, const float &b) {
618  a.x *= b; a.y *= b; a.z *= b; a.w *= b;
619 }
620 
621 
622 //
623 // operators with subsequent type conversions
624 //
625 inline __host__ __device__ float3 operator*(char4 a, const float s) {
626  return make_float3(s * a.x, s * a.y, s * a.z);
627 }
628 
629 inline __host__ __device__ float3 operator*(uchar4 a, const float s) {
630  return make_float3(s * a.x, s * a.y, s * a.z);
631 }
632 
633 
634 //
635 // math fctns...
636 //
637 inline __host__ __device__ float3 fabsf(const float3& a) {
638  return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
639 }
640 
641 inline __host__ __device__ float3 fmaxf(const float3& a, const float3& b) {
642  return make_float3(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z));
643 }
644 
645 inline __host__ __device__ float fmaxf(const float3& a) {
646  return fmaxf(fmaxf(a.x, a.y), a.z);
647 }
648 
649 inline __host__ __device__ float dot(const float3 & a, const float3 & b) {
650  return a.x*b.x + a.y*b.y + a.z*b.z;
651 }
652 
653 inline __host__ __device__ float dot(const float4 & a, const float4 & b) {
654  return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
655 }
656 
657 inline __host__ __device__ float length(const float3 & v) {
658  return sqrtf(dot(v, v));
659 }
660 
661 
663 inline __host__ __device__ float3 normalize(const float3 & v) {
664 #if defined(__CUDACC__) || defined(__NVCC__)
665  float invlen = rsqrtf(dot(v, v));
666 #else
667  float invlen = 1.0f / sqrtf(dot(v, v));
668 #endif
669  float3 out;
670  out.x = v.x * invlen;
671  out.y = v.y * invlen;
672  out.z = v.z * invlen;
673  return out;
674 }
675 
676 
678 inline __host__ __device__ float3 normalize_len(const float3 v, float &l) {
679  l = length(v);
680  float invlen = 1.0f / l;
681  float3 out;
682  out.x = v.x * invlen;
683  out.y = v.y * invlen;
684  out.z = v.z * invlen;
685  return out;
686 }
687 
688 
691 inline __host__ __device__ float3 normalize_invlen(const float3 v, float &invlen) {
692 #if defined(__CUDACC__) || defined(__NVCC__)
693  invlen = rsqrtf(dot(v, v));
694 #else
695  invlen = 1.0f / sqrtf(dot(v, v));
696 #endif
697  float3 out;
698  out.x = v.x * invlen;
699  out.y = v.y * invlen;
700  out.z = v.z * invlen;
701  return out;
702 }
703 
704 
706 inline __host__ __device__ float3 cross(const float3 & a, const float3 & b) {
707  float3 out;
708  out.x = a.y * b.z - b.y * a.z;
709  out.y = -a.x * b.z + b.x * a.z;
710  out.z = a.x * b.y - b.x * a.y;
711  return out;
712 }
713 
714 
717 inline __host__ __device__ float3 reflect(const float3& i, const float3& n) {
718  return i - 2.0f * n * dot(n, i);
719 }
720 
721 
724 inline __host__ __device__ float3 faceforward(const float3& n, const float3& i,
725  const float3& nref) {
726  return n * copysignf(1.0f, dot(i, nref));
727 }
728 
729 
730 //
731 // PRNGs
732 //
733 
734 //
735 // Various random number routines
736 // https://en.wikipedia.org/wiki/List_of_random_number_generators
737 //
738 
739 #define UINT32_RAND_MAX 4294967296.0f // max uint32 random value
740 #define UINT32_RAND_MAX_INV 2.3283064365e-10f // normalize uint32 RNs
741 
742 //
743 // Survey of parallel RNGS suited to GPUs, by L'Ecuyer et al.:
744 // Random numbers for parallel computers: Requirements and methods,
745 // with emphasis on GPUs.
746 // Pierre L'Ecuyer, David Munger, Boris Oreshkina, and Richard Simard.
747 // Mathematics and Computers in Simulation 135:3-17, 2017.
748 // https://doi.org/10.1016/j.matcom.2016.05.005
749 //
750 // Counter-based RNGs introduced by Salmon @ D.E. Shaw Research:
751 // "Parallel random numbers: as easy as 1, 2, 3", by Salmon et al.,
752 // D. E. Shaw Research:
753 // http://doi.org/10.1145/2063384.2063405
754 // https://www.thesalmons.org/john/random123/releases/latest/docs/index.html
755 // https://en.wikipedia.org/wiki/Counter-based_random_number_generator_(CBRNG)
756 //
757 
758 
759 //
760 // Quick and dirty 32-bit LCG random number generator [Fishman 1990]:
761 // A=1099087573 B=0 M=2^32
762 // Period: 10^9
763 // Fastest gun in the west, but fails many tests after 10^6 samples,
764 // and fails all statistics tests after 10^7 samples.
765 // It fares better than the Numerical Recipes LCG. This is the fastest
766 // power of two rand, and has the best multiplier for 2^32, found by
767 // brute force[Fishman 1990]. Test results:
768 // http://www.iro.umontreal.ca/~lecuyer/myftp/papers/testu01.pdf
769 // http://www.shadlen.org/ichbin/random/
770 //
771 static __host__ __device__ __inline__
772 uint32_t qnd_rng(uint32_t &idum) {
773  idum *= 1099087573;
774  return idum; // already 32-bits, no need to mask result
775 }
776 
777 
778 //
779 // Middle Square Weyl Sequence ("msws")
780 // This is an improved variant of von Neumann's middle square RNG
781 // that uses Weyl sequences to provide a long period. Claimed as
782 // fastest traditional seeded RNG that passes statistical tests.
783 // V5: Bernard Widynski, May 2020.
784 // https://arxiv.org/abs/1704.00358
785 //
786 // Additional notes and commentary:
787 // https://en.wikipedia.org/wiki/Middle-square_method
788 // https://pthree.org/2018/07/30/middle-square-weyl-sequence-prng/
789 //
790 // Reported to passes both BigCrush and PractRand tests:
791 // "An Empirical Study of Non-Cryptographically Secure
792 // Pseudorandom Number Generators," M. Singh, P. Singh and P. Kumar,
793 // 2020 International Conference on Computer Science, Engineering
794 // and Applications (ICCSEA), 2020,
795 // http://doi.org/10.1109/ICCSEA49143.2020.9132873
796 //
797 static __host__ __device__ __inline__
798 uint32_t msws_rng(uint64_t &x, uint64_t &w) {
799  const uint64_t s = 0xb5ad4eceda1ce2a9;
800  x *= x; // square the value per von Neumann's RNG
801  w += s; // add in Weyl sequence for longer period
802  x += w; // apply to x
803  x = (x>>32) | (x<<32); // select "middle square" as per von Neumann's RNG
804  return x; // implied truncation to lower 32-bit result
805 }
806 
807 
808 
809 //
810 // Squares: A Fast Counter-Based RNG
811 // This is a counter-based RNG based on John von Neumann's
812 // Middle Square RNG, with the Weyl sequence added to provide a long period.
813 // V3: Bernard Widynski, Nov 2020.
814 // https://arxiv.org/abs/2004.06278
815 //
816 // This RNG claims to outperform all of the original the counter-based RNGs
817 // in "Parallel random numbers: as easy as 1, 2, 3",
818 // by Salmon et al., http://doi.org/10.1145/2063384.2063405
819 // https://en.wikipedia.org/wiki/Counter-based_random_number_generator_(CBRNG)
820 // That being said, key generation technique is important in this case.
821 //
822 #define SQUARES_RNG_KEY1 0x1235d7fcb4dfec21 // a few good keys...
823 #define SQUARES_RNG_KEY2 0x418627e323f457a1 // a few good keys...
824 #define SQUARES_RNG_KEY3 0x83fc79d43614975f // a few good keys...
825 #define SQUARES_RNG_KEY4 0xc62f73498cb654e3 // a few good keys...
826 
827 // Template to allow compile-time selection of number of rounds (2, 3, 4).
828 // Roughly 5 integer ALU operations per round, 4 rounds is standard.
829 template<unsigned int ROUNDS> static __host__ __device__ __inline__
830 uint32_t squares_rng(uint64_t counter, uint64_t key) {
831  uint64_t x, y, z;
832  y = x = counter * key;
833  z = x + key;
834 
835  x = x*x + y; // round 1, middle square, add Weyl seq
836  x = (x>>32) | (x<<32); // round 1, bit rotation
837 
838  x = x*x + z; // round 2, middle square, add Weyl seq
839  if (ROUNDS == 2) {
840  return x >> 32; // round 2, upper 32-bits are bit-rotated result
841  } else {
842  x = (x>>32) | (x<<32); // round 2, bit rotation
843 
844  x = x*x + y; // round 3, middle square, add Weyl seq
845  if (ROUNDS == 3) {
846  return x >> 32; // round 3, upper 32-bits are bit-rotated result
847  } else {
848  x = (x>>32) | (x<<32); // round 3, bit rotation
849 
850  x = x*x + z; // round 4, middle square, add Weyl seq
851  return x >> 32; // round 4, upper 32-bits are bit-rotated result
852  }
853  }
854 }
855 
856 
857 //
858 // Hashing based PRNGs
859 //
860 
861 
862 //
863 // TEA, a tiny encryption algorithm.
864 // D. Wheeler and R. Needham, 2nd Intl. Workshop Fast Software Encryption,
865 // LNCS, pp. 363-366, 1994.
866 //
867 // GPU Random Numbers via the Tiny Encryption Algorithm
868 // F. Zafar, M. Olano, and A. Curtis.
869 // HPG '10 Proceedings of the Conference on High Performance Graphics,
870 // pp. 133-141, 2010.
871 // https://dl.acm.org/doi/10.5555/1921479.1921500
872 //
873 // Tea has avalanche effect in output from one bit input delta after 6 rounds
874 //
875 template<unsigned int ROUNDS> static __host__ __device__ __inline__
876 unsigned int tea(uint32_t val0, uint32_t val1) {
877  uint32_t v0 = val0;
878  uint32_t v1 = val1;
879  uint32_t s0 = 0;
880 
881  for (unsigned int n = 0; n < ROUNDS; n++) {
882  s0 += 0x9e3779b9;
883  v0 += ((v1<<4)+0xa341316c)^(v1+s0)^((v1>>5)+0xc8013ea4);
884  v1 += ((v0<<4)+0xad90777d)^(v0+s0)^((v0>>5)+0x7e95761e);
885  }
886 
887  return v0;
888 }
889 
890 
891 
892 //
893 // QRNGs
894 //
895 
896 
897 //
898 // Low discrepancy sequences based on the Golden Ratio, described in
899 // Golden Ratio Sequences for Low-Discrepancy Sampling,
900 // Colas Schretter and Leif Kobbelt, pp. 95-104, JGT 16(2), 2012.
901 //
902 // Other useful online references:
903 // http://extremelearning.com.au/unreasonable-effectiveness-of-quasirandom-sequences/
904 //
905 
906 // compute Nth value in 1-D sequence
907 static __device__ __inline__
908 float goldenratioseq1d(int n) {
909  const double g = 1.61803398874989484820458683436563;
910  const double a1 = 1.0 / g;
911  const double seed = 0.5;
912  double ngold;
913  ngold = (seed + (a1 * n));
914  return ngold - trunc(ngold);
915 }
916 
917 
918 // incremental formulation to obtain the next value in the sequence
919 static __device__ __inline__
920 void goldenratioseq1d_incr(float &x) {
921  const double g = 1.61803398874989484820458683436563;
922  const double a1 = 1.0 / g;
923  float ngold = x + a1;
924  x = ngold - truncf(ngold);
925 }
926 
927 
928 // compute Nth point in 2-D sequence
929 static __device__ __inline__
930 void goldenratioseq2d(int n, float2 &xy) {
931  const double g = 1.32471795724474602596;
932  const double a1 = 1.0 / g;
933  const double a2 = 1.0 / (g*g);
934  const double seed = 0.5;
935  double ngold;
936 
937  ngold = (seed + (a1 * n));
938  xy.x = (float) (ngold - trunc(ngold));
939 
940  ngold = (seed + (a2 * n));
941  xy.y = (float) (ngold - trunc(ngold));
942 }
943 
944 
945 // incremental formulation to obtain the next value in the sequence
946 static __device__ __inline__
947 void goldenratioseq2d_incr(float2 &xy) {
948  const float g = 1.32471795724474602596;
949  const float a1 = 1.0 / g;
950  const float a2 = 1.0 / (g*g);
951  float ngold;
952 
953  ngold = xy.x + a1;
954  xy.x = (ngold - trunc(ngold));
955 
956  ngold = xy.y + a2;
957  xy.y = (ngold - trunc(ngold));
958 }
959 
960 
961 // compute Nth point in 3-D sequence
962 static __device__ __inline__
963 void goldenratioseq3d(int n, float3 &xyz) {
964  const double g = 1.22074408460575947536;
965  const double a1 = 1.0 / g;
966  const double a2 = 1.0 / (g*g);
967  const double a3 = 1.0 / (g*g*g);
968  const double seed = 0.5;
969  double ngold;
970 
971  ngold = (seed + (a1 * n));
972  xyz.x = (float) (ngold - trunc(ngold));
973 
974  ngold = (seed + (a2 * n));
975  xyz.y = (float) (ngold - trunc(ngold));
976 
977  ngold = (seed + (a3 * n));
978  xyz.z = (float) (ngold - trunc(ngold));
979 }
980 
981 
982 // incremental formulation to obtain the next value in the sequence
983 static __device__ __inline__
984 void goldenratioseq3d_incr(float3 &xyz) {
985  const float g = 1.22074408460575947536;
986  const float a1 = 1.0 / g;
987  const float a2 = 1.0 / (g*g);
988  const float a3 = 1.0 / (g*g*g);
989  float ngold;
990 
991  ngold = xyz.x + a1;
992  xyz.x = (ngold - trunc(ngold));
993 
994  ngold = xyz.y + a2;
995  xyz.y = (ngold - trunc(ngold));
996 
997  ngold = xyz.z + a3;
998  xyz.z = (ngold - trunc(ngold));
999 }
1000 
1001 
1002 // compute Nth point in 4-D sequence
1003 static __device__ __inline__
1004 void goldenratioseq4d(int n, float2 &xy1, float2 &xy2) {
1005  const double g = 1.167303978261418740;
1006  const double a1 = 1.0 / g;
1007  const double a2 = 1.0 / (g*g);
1008  const double a3 = 1.0 / (g*g*g);
1009  const double a4 = 1.0 / (g*g*g*g);
1010  const double seed = 0.5;
1011  double ngold;
1012 
1013  ngold = (seed + (a1 * n));
1014  xy1.x = (float) (ngold - trunc(ngold));
1015 
1016  ngold = (seed + (a2 * n));
1017  xy1.y = (float) (ngold - trunc(ngold));
1018 
1019  ngold = (seed + (a3 * n));
1020  xy2.x = (float) (ngold - trunc(ngold));
1021 
1022  ngold = (seed + (a4 * n));
1023  xy2.y = (float) (ngold - trunc(ngold));
1024 }
1025 
1026 
1027 // incremental formulation to obtain the next value in the sequence
1028 static __device__ __inline__
1029 void goldenratioseq4d_incr(float2 &xy1, float2 &xy2) {
1030  const double g = 1.167303978261418740;
1031  const float a1 = 1.0 / g;
1032  const float a2 = 1.0 / (g*g);
1033  const float a3 = 1.0 / (g*g*g);
1034  const float a4 = 1.0 / (g*g*g*g);
1035  float ngold;
1036 
1037  ngold = xy1.x + a1;
1038  xy1.x = (ngold - trunc(ngold));
1039 
1040  ngold = xy1.y + a2;
1041  xy1.y = (ngold - trunc(ngold));
1042 
1043  ngold = xy2.x + a3;
1044  xy2.x = (ngold - trunc(ngold));
1045 
1046  ngold = xy2.y + a4;
1047  xy2.y = (ngold - trunc(ngold));
1048 }
1049 
1050 
1051 
1052 //
1053 // stochastic sampling helper routines
1054 //
1055 
1056 // Generate an offset to jitter AA samples in the image plane
1057 static __device__ __inline__
1058 void jitter_offset2f(unsigned int &pval, float2 &xy) {
1059  xy.x = (qnd_rng(pval) * UINT32_RAND_MAX_INV) - 0.5f;
1060  xy.y = (qnd_rng(pval) * UINT32_RAND_MAX_INV) - 0.5f;
1061 }
1062 
1063 
1064 // Generate an offset to jitter DoF samples in the Circle of Confusion
1065 static __device__ __inline__
1066 void jitter_disc2f(unsigned int &pval, float2 &xy, float radius) {
1067 #if 1
1068  // Since the GPU RT currently uses super cheap/sleazy LCG RNGs,
1069  // it is best to avoid using sample picking, which can fail if
1070  // we use a multiply-only RNG and we hit a zero in the PRN sequence.
1071  // The special functions are slow, but have bounded runtime and
1072  // minimal branch divergence.
1073  float r=(qnd_rng(pval) * UINT32_RAND_MAX_INV);
1074  float phi=(qnd_rng(pval) * UINT32_RAND_MAX_INV) * 2.0f * M_PIf;
1075  __sincosf(phi, &xy.x, &xy.y); // fast approximation
1076  xy *= sqrtf(r) * radius;
1077 #else
1078  // Pick uniform samples that fall within the disc --
1079  // this scheme can hang in an endless loop if a poor quality
1080  // RNG is used and it gets stuck in a short PRN sub-sequence
1081  do {
1082  xy.x = 2.0f * (qnd_rng(pval) * UINT32_RAND_MAX_INV) - 1.0f;
1083  xy.y = 2.0f * (qnd_rng(pval) * UINT32_RAND_MAX_INV) - 1.0f;
1084  } while ((xy.x*xy.x + xy.y*xy.y) > 1.0f);
1085  xy *= radius;
1086 #endif
1087 }
1088 
1089 
1090 // Generate an offset to jitter AA samples in the image plane using
1091 // a low-discrepancy sequence
1092 static __device__ __inline__
1093 void jitter_offset2f_qrn(float2 qrnxy, float2 &xy) {
1094  xy = qrnxy - make_float2(0.5f, 0.5f);
1095 }
1096 
1097 
1098 // Generate an offset to jitter DoF samples in the Circle of Confusion,
1099 // using low-discrepancy sequences based on the Golden Ratio
1100 static __device__ __inline__
1101 void jitter_disc2f_qrn(float2 &qrnxy, float2 &xy, float radius) {
1102  goldenratioseq2d_incr(qrnxy);
1103  float r=qrnxy.x;
1104  float phi=qrnxy.y * 2.0f * M_PIf;
1105  __sincosf(phi, &xy.x, &xy.y); // fast approximation
1106  xy *= sqrtf(r) * radius;
1107 }
1108 
1109 
1110 //
1111 // Protect functions that are only GPU-callable, e.g., those that
1112 // use GPU-specific intrinsics such as __saturatef() or others.
1113 //
1114 #if defined(TACHYON_INTERNAL)
1115 
1116 // Generate a randomly oriented ray
1117 static __device__ __inline__
1118 void jitter_sphere3f(unsigned int &pval, float3 &dir) {
1119 #if 1
1120  //
1121  // Use GPU fast/approximate math routines
1122  //
1123  /* Archimedes' cylindrical projection scheme */
1124  /* generate a point on a unit cylinder and project */
1125  /* back onto the sphere. This approach is likely */
1126  /* faster for SIMD hardware, despite the use of */
1127  /* transcendental functions. */
1128  float u1 = qnd_rng(pval) * UINT32_RAND_MAX_INV;
1129  dir.z = 2.0f * u1 - 1.0f;
1130  float R = __fsqrt_rn(1.0f - dir.z*dir.z); // fast approximation
1131  float u2 = qnd_rng(pval) * UINT32_RAND_MAX_INV;
1132  float phi = 2.0f * M_PIf * u2;
1133  float sinphi, cosphi;
1134  __sincosf(phi, &sinphi, &cosphi); // fast approximation
1135  dir.x = R * cosphi;
1136  dir.y = R * sinphi;
1137 #elif 1
1138  /* Archimedes' cylindrical projection scheme */
1139  /* generate a point on a unit cylinder and project */
1140  /* back onto the sphere. This approach is likely */
1141  /* faster for SIMD hardware, despite the use of */
1142  /* transcendental functions. */
1143  float u1 = qnd_rng(pval) * UINT32_RAND_MAX_INV;
1144  dir.z = 2.0f * u1 - 1.0f;
1145  float R = sqrtf(1.0f - dir.z*dir.z);
1146 
1147  float u2 = qnd_rng(pval) * UINT32_RAND_MAX_INV;
1148  float phi = 2.0f * M_PIf * u2;
1149  float sinphi, cosphi;
1150  sincosf(phi, &sinphi, &cosphi);
1151  dir.x = R * cosphi;
1152  dir.y = R * sinphi;
1153 #else
1154  /* Marsaglia's uniform sphere sampling scheme */
1155  /* In order to correctly sample a sphere, using rays */
1156  /* generated randomly within a cube we must throw out */
1157  /* direction vectors longer than 1.0, otherwise we'll */
1158  /* oversample the corners of the cube relative to */
1159  /* a true sphere. */
1160  float len;
1161  float3 d;
1162  do {
1163  d.x = (qnd_rng(pval) * UINT32_RAND_MAX_INV) - 0.5f;
1164  d.y = (qnd_rng(pval) * UINT32_RAND_MAX_INV) - 0.5f;
1165  d.z = (qnd_rng(pval) * UINT32_RAND_MAX_INV) - 0.5f;
1166  len = dot(d, d);
1167  } while (len > 0.250f);
1168  float invlen = rsqrtf(len);
1169 
1170  /* finish normalizing the direction vector */
1171  dir = d * invlen;
1172 #endif
1173 }
1174 
1175 
1176 //
1177 // Spherical Fibonacci pattern to create a uniformly
1178 // distributed sample pattern on a sphere.
1179 // Spherical Fibonacci mapping.
1180 // B. Keinert, M. Innmann, M. Sänger, and M. Stamminger.
1181 // ACM Transactions on Graphics, 34:193:1-193:7, 2015.
1182 // http://doi.org/10.1145/2816795.2818131
1183 //
1184 static __device__ __inline__
1185 float3 sphericalFibonacci(float i, float totaln) {
1186  const float PHI = sqrtf(5.0f) * 0.5f + 0.5f;
1187  float fraction = (i * (PHI - 1.0f)) - floorf(i * (PHI - 1.0f));
1188 
1189  float phi = 2.0f * M_PI * fraction;
1190  float cosTheta = 1.0f - (2.0f * i + 1.0f) * (1.0f / totaln);
1191  float sinTheta = sqrt(__saturatef(1.0f - cosTheta * cosTheta));
1192 
1193  float cosPhi, sinPhi;
1194  sincosf(phi, &cosPhi, &sinPhi);
1195  return make_float3(cosPhi * sinTheta, sinPhi * sinTheta, cosTheta);
1196 }
1197 
1198 #endif // TACHYON_INTERNAL
1199 
1200 
1201 
1202 //
1203 // Convert between 2-D planar coordinates and an octahedral mapping.
1204 // This is useful for both omnidirectional cameras and image formats,
1205 // and for surface normal compression/quantization.
1206 //
1207 // This implementation follows the method described here:
1208 // "A Survey of Efficient Representations for Independent Unit Vectors",
1209 // Cigolle et al., J. Computer Graphics Techniques 3(2), 2014.
1210 // http://jcgt.org/published/0003/02/01/
1211 //
1212 // UNORM: convert internal SNORM output range [-1,1] to UNORM [0,1] range
1213 // UNORM mode costs extra instructions
1214 //
1215 template <int UNORM>
1216 static __host__ __device__ __inline__ float2 OctEncode(float3 n) {
1217  const float invL1Norm = 1.0f / (fabsf(n.x) + fabsf(n.y) + fabsf(n.z));
1218  float2 projected;
1219  if (n.z < 0.0f) {
1220  projected = 1.0f - make_float2(fabsf(n.y), fabsf(n.x)) * invL1Norm;
1221  projected.x = copysignf(projected.x, n.x);
1222  projected.y = copysignf(projected.y, n.y);
1223  } else {
1224  projected = make_float2(n.x, n.y) * invL1Norm;
1225  }
1226 
1227  // convert from SNORM to UNORM
1228  if (UNORM)
1229  projected = projected * 0.5f + 0.5f; // convert to UNORM range [0,1]
1230 
1231  return projected;
1232 }
1233 
1234 
1235 //
1236 // XXX TODO: implement a high-precision OctPEncode() variant, based on
1237 // floored snorms and an error minimization scheme using a
1238 // comparison of internally decoded values for least error
1239 //
1240 
1241 //
1242 // Direct adaptation from Cigolle et al, with optional UNORM mode.
1243 //
1244 // UNORM: convert from UNORM input domain [0,1] to internal SNORM [-1,1] domain
1245 // UNORM mode costs extra instructions
1246 //
1247 template <int UNORM>
1248 static __host__ __device__ __inline__ float3 OctDecode(float2 projected) {
1249  // convert from UNORM input domain to native SNORM internal domain
1250  if (UNORM)
1251  projected *= 2.0f - 1.0f; // convert to SNORM range [-1,1]
1252 
1253  float3 n = make_float3(projected.x,
1254  projected.y,
1255  1.0f - (fabsf(projected.x) + fabsf(projected.y)));
1256  if (n.z < 0.0f) {
1257  float oldX = n.x;
1258  n.x = copysignf(1.0f - fabsf(n.y), oldX);
1259  n.y = copysignf(1.0f - fabsf(oldX), n.y);
1260  }
1261 
1262  return n;
1263 }
1264 
1265 
1266 //
1267 // Protect functions that are only GPU-callable, e.g., those that
1268 // use GPU-specific intrinsics such as __saturatef() or others.
1269 //
1270 #if defined(TACHYON_INTERNAL)
1271 
1272 //
1273 // Faster version by Rune Stubbe (2017) that avoids branching in decode:
1274 // https://twitter.com/Stubbesaurus/status/937994790553227264
1275 // https://twitter.com/Stubbesaurus/status/937994790553227264/photo/1
1276 // https://knarkowicz.wordpress.com/2014/04/16/octahedron-normal-vector-encoding
1277 // Another variant:
1278 // http://johnwhite3d.blogspot.com/2017/10/signed-octahedron-normal-encoding.html
1279 // UNORM: convert from UNORM input domain [0,1] to internal SNORM [-1,1] domain
1280 // UNORM mode costs extra instructions
1281 //
1282 template <int UNORM>
1283 static __device__ __inline__ float3 OctDecode_fast(float2 projected) {
1284  // convert from UNORM input domain to native SNORM internal domain
1285  if (UNORM)
1286  projected *= 2.0f - 1.0f; // convert to SNORM range [-1,1]
1287 
1288  float3 n = make_float3(projected.x,
1289  projected.y,
1290  1.0f - fabsf(projected.x) - fabsf(projected.y));
1291  float t = __saturatef(-n.z); // or max(-n.z, 0.0)
1292  n.x += (n.x >= 0.0f) ? -t : t;
1293  n.y += (n.y >= 0.0f) ? -t : t;
1294 
1295  return n;
1296 }
1297 
1298 #endif // TACHYON_INTERNAL
1299 
1300 
1301 //
1302 // Methods for packing normals into a 4-byte quantity, such as a
1303 // [u]int or [u]char4, and similar. See JCGT article by Cigolle et al.,
1304 // "A Survey of Efficient Representations for Independent Unit Vectors",
1305 // J. Computer Graphics Techniques 3(2), 2014.
1306 // http://jcgt.org/published/0003/02/01/
1307 //
1308 static __host__ __device__ __inline__ uint convfloat2uint32(float2 f2) {
1309  f2 = f2 * 0.5f + 0.5f;
1310  uint packed;
1311  packed = ((uint) (f2.x * 65535)) | ((uint) (f2.y * 65535) << 16);
1312  return packed;
1313 }
1314 
1315 static __host__ __device__ __inline__ float2 convuint32float2(uint packed) {
1316  float2 f2;
1317  f2.x = (float)((packed ) & 0x0000ffff) / 65535;
1318  f2.y = (float)((packed >> 16) & 0x0000ffff) / 65535;
1319  return f2 * 2.0f - 1.0f;
1320 }
1321 
1322 
1323 
1324 #if 1
1325 
1326 //
1327 // oct32: 32-bit octahedral normal encoding using [su]norm16x2 quantization
1328 // Meyer et al., "On Floating Point Normal Vectors", In Proc. 21st
1329 // Eurographics Conference on Rendering.
1330 // http://dx.doi.org/10.1111/j.1467-8659.2010.01737.x
1331 //
1332 static __host__ __device__ __inline__ uint packNormal(const float3& normal) {
1333  float2 octf2 = OctEncode<0>(normal);
1334  return convfloat2uint32(octf2);
1335 }
1336 
1337 static __host__ __device__ __inline__ float3 unpackNormal(uint packed) {
1338  float2 octf2 = convuint32float2(packed);
1339  return OctDecode<0>(octf2);
1340 }
1341 
1342 #elif 0
1343 
1344 //
1345 // snorm10x3: signed 10-bit-per-component scalar unit real representation
1346 // Better representation than unorm.
1347 // Supported by most fixed-function graphics hardware.
1348 // https://www.khronos.org/registry/OpenGL/extensions/EXT/EXT_texture_snorm.txt
1349 // i=round(clamp(r,-1,1) * (2^(b-1) - 1)
1350 // r=clamp(i/(2^(b-1) - 1), -1, 1)
1351 //
1352 
1353 #elif 1
1354 
1355 // OpenGL GLbyte signed quantization scheme
1356 // i = r * (2^b - 1) - 0.5;
1357 // r = (2i + 1)/(2^b - 1)
1358 static __host__ __device__ __inline__ uint packNormal(const float3& normal) {
1359  // conversion to GLbyte format, Table 2.6, p. 44 of OpenGL spec 1.2.1
1360  const float3 N = normal * 127.5f - 0.5f;
1361  const char4 packed = make_char4(N.x, N.y, N.z, 0);
1362  return *((uint *) &packed);
1363 }
1364 
1365 static __host__ __device__ __inline__ float3 unpackNormal(uint packed) {
1366  char4 c4norm = *((char4 *) &packed);
1367 
1368  // conversion from GLbyte format, Table 2.6, p. 44 of OpenGL spec 1.2.1
1369  // float = (2c+1)/(2^8-1)
1370  const float ci2f = 1.0f / 255.0f;
1371  const float cn2f = 1.0f / 127.5f;
1372  float3 N = c4norm * cn2f + ci2f;
1373 
1374  return N;
1375 }
1376 #endif
1377 
1378 
1379 
1380 //
1381 // Device functions to convert between linear and sRGB colorspaces
1382 //
1383 // It's important to note that accurate conversions between
1384 // linear and sRGB color spaces require the use of
1385 // floating point or deep bit depth integer arithmetic.
1386 // We use the CUDA texturing hardware to perform sRGB to
1387 // linear color conversion during texture sampling.
1388 //
1389 // Some useful example results from improper conversion techniques:
1390 // https://blog.demofox.org/2018/03/10/dont-convert-srgb-u8-to-linear-u8/
1391 //
1392 
1393 
1394 //
1395 // Conversion between sRGB and linear using the official equations
1396 //
1397 static __forceinline__ __device__
1398 float4 sRGB_to_linear(const float4 &rgba) {
1399  float4 lin;
1400  if (rgba.x <= 0.0404482362771082f) {
1401  lin.x = rgba.x * 0.0773993f; // divide by 12.92f;
1402  } else {
1403  lin.x = powf(((rgba.x + 0.055f)/1.055f), 2.4f);
1404  }
1405 
1406  if (rgba.y <= 0.0404482362771082f) {
1407  lin.y = rgba.y * 0.0773993f; // divide by 12.92f;
1408  } else {
1409  lin.y = powf(((rgba.y + 0.055f)/1.055f), 2.4f);
1410  }
1411 
1412  if (rgba.z <= 0.0404482362771082f) {
1413  lin.z = rgba.z * 0.0773993f; // divide by 12.92f;
1414  } else {
1415  lin.z = powf(((rgba.z + 0.055f)/1.055f), 2.4f);
1416  }
1417 
1418  lin.w = rgba.w; // alpha remains linear regardless of color space
1419 
1420  return lin;
1421 }
1422 
1423 
1424 //
1425 // Conversion between linear and sRGB using the official equations
1426 //
1427 static __forceinline__ __device__
1428 float4 linear_to_sRGB(const float4 &lin) {
1429  float4 rgba;
1430  if (lin.x > 0.0031308f) {
1431  rgba.x = 1.055f * (powf(lin.x, (1.0f / 2.4f))) - 0.055f;
1432  } else {
1433  rgba.x = 12.92f * lin.x;
1434  }
1435 
1436  if (lin.y > 0.0031308f) {
1437  rgba.y = 1.055f * (powf(lin.y, (1.0f / 2.4f))) - 0.055f;
1438  } else {
1439  rgba.y = 12.92f * lin.y;
1440  }
1441 
1442  if (lin.z > 0.0031308f) {
1443  rgba.z = 1.055f * (powf(lin.z, (1.0f / 2.4f))) - 0.055f;
1444  } else {
1445  rgba.z = 12.92f * lin.z;
1446  }
1447 
1448  rgba.w = lin.w; // alpha remains linear regardless of color space
1449 
1450  return rgba;
1451 }
1452 
1453 
1454 
1455 //
1456 // Fast, approximate conversion between linear and sRGB:
1457 // https://excamera.com/sphinx/article-srgb.html
1458 // http://chilliant.blogspot.com/2012/08/srgb-approximations-for-hlsl.html
1459 //
1460 static __forceinline__ __device__
1461 float4 sRGB_to_linear_approx(const float4 &rgba) {
1462  float3 sRGB = make_float3(rgba);
1463  float3 lin = sRGB * (sRGB * (sRGB * 0.305306011f + 0.682171111f) + 0.012522878f);
1464  return make_float4(lin, rgba.w); // preserve linear alpha
1465 }
1466 
1467 
1468 //
1469 // Fast, approximate conversion between sRGB and linear:
1470 // https://excamera.com/sphinx/article-srgb.html
1471 // http://chilliant.blogspot.com/2012/08/srgb-approximations-for-hlsl.html
1472 //
1473 static __forceinline__ __device__
1474 float4 linear_to_sRGB_approx(const float4 &linear) {
1475  float3 lin = make_float3(linear);
1476  float3 S1 = make_float3(sqrtf(lin.x), sqrtf(lin.y), sqrtf(lin.z));
1477  float3 S2 = make_float3(sqrtf(S1.x), sqrtf(S1.y), sqrtf(S1.z));
1478  float3 S3 = make_float3(sqrtf(S2.x), sqrtf(S2.y), sqrtf(S2.z));
1479  float3 sRGB = 0.662002687f * S1 + 0.684122060f * S2
1480  - 0.323583601f * S3 - 0.0225411470f * lin;
1481  return make_float4(sRGB, linear.w); // preserver linear alpha
1482 }
1483 
1484 
1485 
1486 //
1487 // Fastest low-approximate conversion between linear and sRGB (gamma 2.0):
1488 //
1489 static __forceinline__ __device__
1490 float4 sRGB_to_linear_approx_20(const float4 &rgba) {
1491  float3 sRGB = make_float3(rgba);
1492  return make_float4(sRGB * sRGB, rgba.w); // preserve linear alpha
1493 }
1494 
1495 
1496 //
1497 // Fastest low-approximate conversion between sRGB and linear (gamma 2.0):
1498 //
1499 static __forceinline__ __device__
1500 float4 linear_to_sRGB_approx_20(const float4 &linear) {
1501  float3 lin = make_float3(linear);
1502  float3 sRGB = make_float3(sqrtf(lin.x), sqrtf(lin.y), sqrtf(lin.z));
1503  return make_float4(sRGB, linear.w); // preserver linear alpha
1504 }
1505 
1506 
1507 
1508 //
1509 // Tone mapping and color grading device functions.
1510 // Useful references:
1511 // Photographic Tone Reproduction for Digital Images
1512 // E. Reinhard, M. Stark, P. Shirley, J. Ferwerda
1513 // ACM Transactions on Graphics, 21(3) pp. 267-276, 2002.
1514 // https://doi.org/10.1145/566654.566575
1515 //
1516 // Tone Mapping of HDR Images: A Review
1517 // Y. Salih, W. Md-Esa, A. Malik, N. Saad.
1518 // http://doi.org/10.1109/ICIAS.2012.6306220
1519 //
1520 // Others:
1521 // http://filmicworlds.com/blog/filmic-tonemapping-operators/
1522 // http://filmicworlds.com/blog/filmic-tonemapping-with-piecewise-power-curves/
1523 // http://filmicworlds.com/blog/minimal-color-grading-tools/
1524 // https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/
1525 // https://bartwronski.com/2022/02/28/exposure-fusion-local-tonemapping-for-real-time-rendering/
1526 // https://bartwronski.github.io/local_tonemapping_js_demo/
1527 
1528 
1529 //
1530 // Calculate relative luminance from linear RGB w/ perceptual coefficients:
1531 // https://en.wikipedia.org/wiki/Relative_luminance
1532 //
1533 static __device__ __inline__
1534 float luminance(float3 c) {
1535  return dot(c, make_float3(0.2126f, 0.7152f, 0.0722f));;
1536 }
1537 
1538 
1539 //
1540 // Rescale RGB colors to achieve desired luminance
1541 //
1542 static __device__ __inline__
1543 float3 rescale_luminance(float3 c, float newluminance) {
1544  float l = luminance(c);
1545  return c * (newluminance / l);
1546 }
1547 
1548 
1549 //
1550 // ACES filmic tone mapping approximations:
1551 // https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/
1552 // https://github.com/TheRealMJP/BakingLab/blob/master/BakingLab/ACES.hlsl
1553 //
1554 static __device__ __inline__
1555 float3 ACES_TMO(float3 c) {
1556  float3 num = c * (2.51f * c + make_float3(0.03f));
1557  float3 den = c * (2.43f * c + make_float3(0.59f)) + make_float3(0.14f);
1558  float3 t = num / den;
1559 
1560  return t; // clamping is deferred
1561 }
1562 
1563 
1564 //
1565 // Reinhard style tone mapping
1566 //
1567 static __device__ __inline__
1568 float3 reinhard_TMO(float3 c) {
1569  return c / (make_float3(1.0f) + c);
1570 }
1571 
1572 
1573 //
1574 // Extended Reinhard style tone mapping:
1575 // https://64.github.io/tonemapping/
1576 //
1577 static __device__ __inline__
1578 float3 reinhard_extended_TMO(float3 c, float maxwhite) {
1579  float3 num = c * (make_float3(1.0f) + (c / make_float3(maxwhite * maxwhite)));
1580  return num / (make_float3(1.0f) + c);
1581 }
1582 
1583 
1584 //
1585 // Extended Reinhard style tone mapping applied to luminance:
1586 // https://64.github.io/tonemapping/
1587 //
1588 static __device__ __inline__
1589 float3 reinhard_extended_luminance_TMO(float3 c, float maxL) {
1590  float oldL = luminance(c);
1591  float num = oldL * (1.0f + (oldL / (maxL * maxL)));
1592  float newL = num / (1.0f + oldL);
1593  return rescale_luminance(c, newL);
1594 }
1595 
1596 
1597 //
1598 // Protect functions that are only GPU-callable, e.g., those that
1599 // use GPU-specific intrinsics such as __saturatef() or others.
1600 //
1601 #if defined(TACHYON_INTERNAL)
1602 
1603 // clamp vector to range [0,1] using __saturatef() intrinsic
1604 static __device__ __inline__ float3 clamp_float3(const float3 &a) {
1605  return make_float3(__saturatef(a.x), __saturatef(a.y), __saturatef(a.z));
1606 }
1607 
1608 // clamp vector to range [0,1] using __saturatef() intrinsic
1609 static __device__ __inline__ float4 clamp_float4(const float4 &a) {
1610  return make_float4(__saturatef(a.x), __saturatef(a.y),
1611  __saturatef(a.z), __saturatef(a.w));
1612 }
1613 
1614 
1615 //
1616 // Color conversion operations
1617 //
1618 
1620 static __device__ __inline__ uchar4 make_color_rgb4u(const float3& c) {
1621  return make_uchar4(static_cast<unsigned char>(__saturatef(c.x)*255.99f),
1622  static_cast<unsigned char>(__saturatef(c.y)*255.99f),
1623  static_cast<unsigned char>(__saturatef(c.z)*255.99f),
1624  255u);
1625 }
1626 
1628 static __device__ __inline__ uchar4 make_color_rgb4u(const float4& c) {
1629  return make_uchar4(static_cast<unsigned char>(__saturatef(c.x)*255.99f),
1630  static_cast<unsigned char>(__saturatef(c.y)*255.99f),
1631  static_cast<unsigned char>(__saturatef(c.z)*255.99f),
1632  static_cast<unsigned char>(__saturatef(c.w)*255.99f));
1633 }
1634 
1635 
1636 
1637 //
1638 // HDR tone mapping
1639 //
1640 static __inline__ __device__
1641 float4 tonemap_color(const float4 & colrgba4f, int tonemap_mode,
1642  float tonemap_exposure, int colorspace) {
1643  float alpha = colrgba4f.w; // preserve linear alpha channel
1644  float3 color = make_float3(colrgba4f) * tonemap_exposure;
1645 
1646  switch (tonemap_mode) {
1647  case RT_TONEMAP_ACES:
1648  color = ACES_TMO(color);
1649  break;
1650 
1651  case RT_TONEMAP_REINHARD:
1652  color = reinhard_TMO(color);
1653  break;
1654 
1656  color = reinhard_extended_TMO(color, 1.0f);
1657  break;
1658 
1660  color = reinhard_extended_luminance_TMO(color, 1.0f);
1661  break;
1662 
1663  case RT_TONEMAP_CLAMP:
1664  default:
1665  break;
1666  }
1667 
1668  float4 outcolor = make_float4(color, alpha);
1669 
1670  // range clamping is deferred until storage format conversion
1671  return outcolor;
1672 }
1673 
1674 #endif // TACHYON_INTERNAL
1675 
1676 
1677 
1678 //
1679 // End of potentially unreferenced functions
1680 //
1681 //#pragma pop
1682 
1683 
1684 #endif // TACHYONOPTIXSHADERS_H
static __host__ __device__ __inline__ unsigned int tea(uint32_t val0, uint32_t val1)
__host__ __device__ float3 normalize(const float3 &v)
Normalize input vector to unit length.
float3 U
camera orthonormal U (right) axis
static __device__ __inline__ float3 rescale_luminance(float3 c, float newluminance)
uchar4 * vertcolors4u
unsigned char color representation
float3 texgen_vaxis
world coordinate texgen V axis
uint4 * packednormals
packed normals: ng [n0 n1 n2]
float outlinewidth
width of outline shading effect
static __device__ __inline__ void goldenratioseq4d(int n, float2 &xy1, float2 &xy2)
uchar4 * framebuffer
8-bit unorm RGBA framebuffer
float accum_normalize
precalc 1.0f / subframe_index
denosier on, std. impl.
void * rt_texture(SceneHandle sc, apitexture *apitex)
Translate a texture definition into the internal format used by Tachyon, and returns an opaque pointe...
Definition: api.c:933
shadows on, std. impl.
int matflags
alpha/cutout transparency flags
int headlight_mode
Extra VR camera-located headlight.
static __device__ __inline__ float goldenratioseq1d(int n)
static __device__ __inline__ void jitter_offset2f(unsigned int &pval, float2 &xy)
float bg_grad_invrange
miss background gradient inverse range
float4 * PosRadius
X,Y,Z,Radius packed for coalescing.
float3 * vertcolors3f
float3 bg_grad_updir
miss background gradient up direction
default behavior
int update_colorbuffer
accumulation copyout flag
cudaTextureObject_t tex
texture, non-zero if valid
static __device__ __inline__ void jitter_disc2f_qrn(float2 &qrnxy, float2 &xy, float radius)
int tonemap_mode
output tone mapping mode
__host__ __device__ float3 normalize_len(const float3 v, float &l)
Normalize input vector to unit length, and return its original length.
float ao_lightscale
2.0f/float(ao_samples)
QuadMeshSBT quadmesh
float dof_aperture_rad
DoF (defocus blur) aperture radius.
__host__ __device__ float2 operator+(const float2 &a, const float2 &b)
static __host__ __device__ __inline__ uint32_t msws_rng(uint64_t &x, uint64_t &w)
float reflectivity
mirror reflectance coefficient
float shininess
specular highlight size (exponential)
float ao_maxdist
AO maximum occlusion distance.
static __device__ __inline__ void goldenratioseq3d_incr(float3 &xyz)
__host__ __device__ float4 make_float4(const float3 &a, const float &b)
ring SBT index multiplier
__host__ __device__ float3 make_float3(const float s)
float3 * prim_color
optional per-primitive color array
OptiX 7.x built-in curve prims.
float2 * tex2d
2-D texture coordinate buffer
uint4 * packednormals
packed normals: ng [n0 n1 n2]
float3 W
camera orthonormal W (view) axis
__host__ __device__ float3 fmaxf(const float3 &a, const float3 &b)
float3 pos
camera position
int max_trans
max transparent surface crossing count
RtMergedPrimKind
float tonemap_exposure
tone mapping exposure gain parameter
cylinder SBT index multiplier
shadows disabled
float3 texgen_uaxis
world coordinate texgen U axis
int userindex
material user index, positive if valid
float ao_direct
AO direct lighting scaling factor.
int materialindex
material index for this array
__host__ __device__ float3 faceforward(const float3 &n, const float3 &i, const float3 &nref)
Ensure that an interpolated surface normal n faces in the same direction as dictated by a geometric n...
__host__ __device__ void operator*=(float2 &a, const float s)
Tachyon OptiX global launch parameter structure containing the active camera, framebuffer, materials, and any global scene parameters required for shading.
void jitter_sphere3f(rng_frand_handle *rngh, float *dir)
Definition: util.c:779
int fb_clearall
clear/overwrite all FB components
struct tachyonLaunchParams::@4 lights
__host__ __device__ float3 fabsf(const float3 &a)
custom prim ring
shadow probe/AO rays
int fog_mode
fog type (or off)
static __forceinline__ __device__ float4 sRGB_to_linear(const float4 &rgba)
float3 pos
point light position
cudaTextureObject_t tex
texture, non-zero if valid
#define M_PI
__host__ __device__ float3 reflect(const float3 &i, const float3 &n)
calculate reflection direction from incident direction i, and surface normal n.
static __device__ __inline__ void goldenratioseq4d_incr(float2 &xy1, float2 &xy2)
float bg_grad_topval
miss background gradient top value
__host__ __device__ float2 operator/(const float s, const float2 &a)
default behavior
float dof_focal_dist
DoF focal plane distance.
struct __align__(OPTIX_SBT_RECORD_ALIGNMENT) HGRecord
SBT record for a hitgroup program.
linear rgba, gamma 1.0
static __host__ __device__ __inline__ uint packNormal(const float3 &normal)
enable cutout/transparency
linear rgba, gamma 1.0
TriMeshSBT trimesh
static __device__ __inline__ void jitter_disc2f(unsigned int &pval, float2 &xy, float radius)
static __host__ __device__ __inline__ uint32_t squares_rng(uint64_t counter, uint64_t key)
static __device__ __inline__ void goldenratioseq2d_incr(float2 &xy)
__host__ __device__ void operator+=(float3 &a, const float3 &b)
float outline
outline shading coefficient
Adobe sRGB (gamma 2.2)
custom prim cylinder
static __host__ __device__ __inline__ float2 OctEncode(float3 n)
float3 dir
directional light direction
SphereArraySBT sphere
static __host__ __device__ __inline__ float3 unpackNormal(uint packed)
float3 texgen_waxis
world coordinate texgen W axis
float fog_end
radial/linear fog end/max distance
int shadows_enabled
global shadow flag
int transmode
transparency behavior
void * rt_directional_light(SceneHandle voidscene, void *tex, apivector dir)
Define a directional light with associated texture and direction.
Definition: api.c:1077
static __device__ __inline__ void goldenratioseq2d(int n, float2 &xy)
#define RT_CUSTPRIM
int stereo_enabled
stereo rendering on/off
float3 * dir_lights
list of directional light directions
static __device__ __inline__ float3 reinhard_TMO(float3 c)
custom prim cyliner
float bg_grad_botval
miss background gradient bottom value
struct tachyonLaunchParams::@3 scene
float fog_start
radial/linear fog start distance
float specular
specular reflectance coefficient
enable alpha transparency
__host__ __device__ float dot(const float3 &a, const float3 &b)
static __device__ __inline__ float luminance(float3 c)
only clamp the color values [0,1]
cudaArray_t d_img
GPU allocated image buffer.
float3 * tex3d
3-D texture coordinate buffer
float3 V
camera orthonormal V (up) axis
"Extended" Reinhard style, color
static __device__ __inline__ float3 ACES_TMO(float3 c)
CylinderArraySBT cyl
float stereo_convergence_dist
stereo convergence distance (world)
int2 size
framebuffer size
total count of SBT geometric multipliers
float3 bg_color_grad_bot
miss background gradient (bottom)
static __forceinline__ __device__ float4 linear_to_sRGB_approx_20(const float4 &linear)
any-hit traversal reversal
CurveArraySBT curve
ConeArraySBT cone
static __host__ __device__ __inline__ uint32_t qnd_rng(uint32_t &idum)
int dof_enabled
DoF (defocus blur) on/off.
int ao_samples
number of AO samples per AA ray
static __host__ __device__ __inline__ float3 OctDecode(float2 projected)
int subframe_index
accumulation subframe index
normal radiance rays
__host__ __device__ float3 cross(const float3 &a, const float3 &b)
calculate the cross product between vectors a and b.
denoiser disabled
int userindex
material user index, positive if valid
uchar4 * vertcolors4u
unsigned char color representation
float bg_grad_noisemag
miss background gradient noise magnitude
__host__ __device__ float2 operator-(const float2 &a, const float2 &b)
int aa_samples
AA samples per launch.
static __host__ __device__ __inline__ uint convfloat2uint32(float2 f2)
cone SBT index multiplier
static __forceinline__ __device__ float4 linear_to_sRGB_approx(const float4 &linear)
OptixTraversableHandle traversable
global OptiX scene traversable handle
float3 bg_color_grad_top
miss background gradient (top)
float clipview_end
clipping sphere/plane end coord
quad SBT index multiplier
__host__ __device__ float2 operator*(const float2 &a, const float2 &b)
struct tachyonLaunchParams::@2 frame
static __device__ __inline__ float3 reinhard_extended_TMO(float3 c, float maxwhite)
Store all hitgroup records for a given geometry together for simpler dynamic updates.
custom prim sphere
__host__ __device__ float length(const float3 &v)
float clipview_start
clipping sphere/plane start coord
#define RT_TRI_BUILTIN
float opacity
surface opacity
sphere SBT index multiplier
int colorspace
output colorspace
struct tachyonLaunchParams::@5 cam
int num_dir_lights
directional light count
total count of ray types
float3 * pos_lights
list of positional light positions
#define UINT32_RAND_MAX_INV
static __device__ __inline__ void jitter_offset2f_qrn(float2 qrnxy, float2 &xy)
int texflags
linear/sRGB colorspace | texturing flags
total count of available colorspaces
custom prim quadrilateral
static __forceinline__ __device__ float4 sRGB_to_linear_approx_20(const float4 &rgba)
structure containing Tachyon material properties
rt_material * materials
device memory material array
float diffuse
diffuse reflectance coefficient
static __device__ __inline__ void goldenratioseq3d(int n, float3 &xyz)
Reinhard style, color.
#define M_PIf
custom prim cone
__host__ __device__ float3 normalize_invlen(const float3 v, float &invlen)
Normalize input vector to unit length, and return the reciprocal of its original length.
float fog_density
exponential fog density
ACES style approximation.
custom prim quadrilateral
float3 bg_color
miss background color
custom prim cone
float3 texgen_origin
world coordinate texgen origin
int num_pos_lights
positional light count
static __forceinline__ __device__ float4 sRGB_to_linear_approx(const float4 &rgba)
custom prim ring
float ambient
constant ambient light factor
static __device__ __inline__ float3 reinhard_extended_luminance_TMO(float3 c, float maxL)
static __host__ __device__ __inline__ float2 convuint32float2(uint packed)
float ao_ambient
AO ambient factor.
float3 uniform_color
uniform color for entire sphere array
float4 * accum_buffer
32-bit FP RGBA accumulation buffer
static __device__ __inline__ void goldenratioseq1d_incr(float &x)
RtDenoiserMode
float stereo_eyesep
stereo eye separation, in world coords
Adobe sRGB (gamma 2.2)
int clipview_mode
VR clipping view on/off.
int max_depth
global max ray tracing recursion depth
enable tex cutout transparency
"Extended" Reinhard style, luminance
custom prim sphere
float epsilon
global epsilon value
RingArraySBT ring
total count of ray types
static __forceinline__ __device__ float4 linear_to_sRGB(const float4 &lin)
float zoom
camera zoom factor