/*
 * TachyonOptiXShaders.h - prototypes for OptiX PTX shader routines 
 *
 * (C) Copyright 2013-2022 John E. Stone
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * $Id: TachyonOptiXShaders.h,v 1.55 2022/03/23 04:56:41 johns Exp $
 *
 */

/**
 *  \file TachyonOptiXShaders.h
 *  \brief Tachyon ray tracing engine core routines and data structures
 *         compiled to PTX for runtime JIT to build complete ray tracing 
 *         pipelines.  Key data structures defined here are shared both by
 *         the compiled PTX core ray tracing routines, and by the host code
 *         that assembles the complete ray tracing pipeline and launches
 *         the pipeline kernels.
 *         Written for NVIDIA OptiX 7 and later.
 */

//
// This is a second generation of the Tachyon implementation for OptiX.
// The new implementation favors the strengths of OptiX 7, and uses
// OptiX ray payload registers, direct CUDA interoperability and advanced
// CUDA features for both performance and maintainability.
//
// This software and its line of antecedents are described in:
//   "Multiscale modeling and cinematic visualization of photosynthetic
//    energy conversion processes from electronic to cell scales"
//    M. Sener, S. Levy, J. E. Stone, AJ Christensen, B. Isralewitz,
//    R. Patterson, K. Borkiewicz, J. Carpenter, C. N. Hunter,
//    Z. Luthey-Schulten, D. Cox.
//    J. Parallel Computing, 102, pp. 102698, 2021.
//    https://doi.org/10.1016/j.parco.2020.102698
//
//   "Omnidirectional Stereoscopic Projections for VR"
//    J. E. Stone.  In, William R. Sherman, editor,
//    VR Developer Gems, Taylor and Francis / CRC Press, Chapter 24, 2019.
//    https://www.taylorfrancis.com/chapters/edit/10.1201/b21598-24/omnidirectional-stereoscopic-projections-vr-john-stone
//
//   "Interactive Ray Tracing Techniques for
//    High-Fidelity Scientific Visualization"
//    J. E. Stone. In, Eric Haines and Tomas Akenine-Möller, editors,
//    Ray Tracing Gems, Apress, Chapter 27, pp. 493-515, 2019.
//    https://link.springer.com/book/10.1007/978-1-4842-4427-2
//
//   "A Planetarium Dome Master Camera"
//    J. E. Stone.  In, Eric Haines and Tomas Akenine-Möller, editors,
//    Ray Tracing Gems, Apress, Chapter 4, pp. 49-60, 2019.
//    https://link.springer.com/book/10.1007/978-1-4842-4427-2
//
//   "Immersive Molecular Visualization with Omnidirectional
//    Stereoscopic Ray Tracing and Remote Rendering"
//    J. E. Stone, W. R. Sherman, and K. Schulten.
//    High Performance Data Analysis and Visualization Workshop,
//    2016 IEEE International Parallel and Distributed Processing
//    Symposium Workshops (IPDPSW), pp. 1048-1057, 2016.
//    http://dx.doi.org/10.1109/IPDPSW.2016.121
//
//   "Atomic Detail Visualization of Photosynthetic Membranes with
//    GPU-Accelerated Ray Tracing"
//    J. E. Stone, M. Sener, K. L. Vandivort, A. Barragan, A. Singharoy,
//    I. Teo, J. V. Ribeiro, B. Isralewitz, B. Liu, B.-C. Goh, J. C. Phillips,
//    C. MacGregor-Chatwin, M. P. Johnson, L. F. Kourkoutis, C. N. Hunter,
//    K. Schulten
//    J. Parallel Computing, 55:17-27, 2016.
//    http://dx.doi.org/10.1016/j.parco.2015.10.015
//
//   "GPU-Accelerated Molecular Visualization on
//    Petascale Supercomputing Platforms"
//    J. E. Stone, K. L. Vandivort, and K. Schulten.
//    UltraVis'13: Proceedings of the 8th International Workshop on
//    Ultrascale Visualization, pp. 6:1-6:8, 2013.
//    http://dx.doi.org/10.1145/2535571.2535595
//
//    "An Efficient Library for Parallel Ray Tracing and Animation"
//    John E. Stone.  Master's Thesis, University of Missouri-Rolla,
//    Department of Computer Science, April 1998
//    https://scholarsmine.mst.edu/masters_theses/1747
//
//    "Rendering of Numerical Flow Simulations Using MPI"
//    J. Stone and M. Underwood.
//    Second MPI Developers Conference, pages 138-141, 1996.
//    http://dx.doi.org/10.1109/MPIDC.1996.534105
//

#ifndef TACHYONOPTIXSHADERS_H
#define TACHYONOPTIXSHADERS_H

#if 0
/// Compile-time flag for collection and reporting of ray statistics.
/// When defined, tachyonLaunchParams::frame gains the raystats buffers.
#define TACHYON_RAYSTATS 1
#endif

/// Enable the denoiser-related launch parameters when OPTIX_VERSION is
/// 70300 or newer.  OPTIX_VERSION must already be defined when this header
/// is processed; an undefined macro evaluates to 0 in the test below, in
/// which case the denoiser fields are silently left out.
#if OPTIX_VERSION >= 70300
#define TACHYON_OPTIXDENOISER 1
#endif

//
// Constants shared by both host and device code
//
/// NOTE(review): presumably the default maximum ray distance -- confirm
/// against the code that uses it.
#define RT_DEFAULT_MAX 1e27f

//
// Beginning of OptiX data structures
//

// Enable reversed traversal of any-hit rays for shadows/AO.
// This optimization yields a 20% performance gain in many cases.
// #define USE_REVERSE_SHADOW_RAYS 1

// Use reverse rays by default rather than only when enabled interactively
// #define USE_REVERSE_SHADOW_RAYS_DEFAULT 1
/// Global shadow rendering modes.
enum RtShadowMode {
  RT_SHADOWS_OFF        = 0, ///< shadows disabled
  RT_SHADOWS_ON         = 1, ///< shadows on, standard implementation
  RT_SHADOWS_ON_REVERSE = 2  ///< shadows on, any-hit traversal reversal
};

/// Denoiser operating modes.
enum RtDenoiserMode {
  RT_DENOISER_OFF = 0,       ///< denoiser disabled
  RT_DENOISER_ON  = 1        ///< denoiser on, standard implementation
};

/// Ray types; RT_RAY_TYPE_COUNT is the number of ray types.
enum RayType {
  RT_RAY_TYPE_RADIANCE = 0,  ///< normal radiance rays
  RT_RAY_TYPE_SHADOW   = 1,  ///< shadow probe/AO rays
  RT_RAY_TYPE_COUNT    = 2   ///< total count of ray types
};

//
// OptiX 7.x geometry type-associated "hit kind" enums
//
/// Hit kind identifiers, one per geometry type.
enum RtHitKind {
  RT_HIT_HWTRIANGLE = 0,     ///< RTX triangle
  RT_HIT_CONE       = 1,     ///< custom prim cone
  RT_HIT_CYLINDER   = 2,     ///< custom prim cylinder
  RT_HIT_QUAD       = 3,     ///< custom prim quadrilateral
  RT_HIT_RING       = 4,     ///< custom prim ring
  RT_HIT_SPHERE     = 5,     ///< custom prim sphere
  RT_HIT_CURVE      = 6,     ///< OptiX 7.x built-in curve prims
  RT_HIT_COUNT      = 7      ///< total count of hit kinds
};

// Enums used for custom primitive PGM indexing in SBT + GAS
/// Custom primitive identifiers used as SBT index multipliers.
enum RtCustPrim {
  RT_CUST_PRIM_CONE     = 0, ///< cone SBT index multiplier
  RT_CUST_PRIM_CYLINDER = 1, ///< cylinder SBT index multiplier
  RT_CUST_PRIM_QUAD     = 2, ///< quad SBT index multiplier
  RT_CUST_PRIM_RING     = 3, ///< ring SBT index multiplier
  RT_CUST_PRIM_SPHERE   = 4, ///< sphere SBT index multiplier
  RT_CUST_PRIM_COUNT    = 5  ///< total count of SBT geometric multipliers
};

/// Color space selectors.
enum RtColorSpace {
  RT_COLORSPACE_LINEAR = 0,  ///< linear rgba, gamma 1.0
  RT_COLORSPACE_sRGB   = 1,  ///< sRGB (gamma 2.2)
  RT_COLORSPACE_COUNT  = 2   ///< total count of available colorspaces
};


//
// Images, Materials, Textures...
//

/// Structure containing a Tachyon texture (only used on host side).
/// Bundles the image storage, the texture object, and the texgen
/// coordinate frame (origin plus U/V/W axes).
/// NOTE(review): colorspace presumably holds an RtColorSpace value -- confirm.
typedef struct {
  int colorspace;            ///< linear or sRGB color space
  float3 texgen_origin;      ///< world coordinate texgen origin
  float3 texgen_uaxis;       ///< world coordinate texgen U axis
  float3 texgen_vaxis;       ///< world coordinate texgen V axis
  float3 texgen_waxis;       ///< world coordinate texgen W axis
  cudaArray_t d_img;         ///< GPU allocated image buffer
  cudaTextureObject_t tex;   ///< texture, non-zero if valid
  int userindex;             ///< material user index, positive if valid
} rt_texture;


/// Structure containing Tachyon material properties.
/// tachyonLaunchParams::materials points to an array of these records.
typedef struct {
  float opacity;             ///< surface opacity
  float ambient;             ///< constant ambient light factor
  float diffuse;             ///< diffuse reflectance coefficient
  float specular;            ///< specular reflectance coefficient
  float shininess;           ///< specular highlight size (exponential scale)
  float reflectivity;        ///< mirror reflectance coefficient
  float outline;             ///< outline shading coefficient
  float outlinewidth;        ///< width of outline shading effect
  int transmode;             ///< transparency behavior
  cudaTextureObject_t tex;   ///< texture, non-zero if valid
  int userindex;             ///< material user index, positive if valid
} rt_material;


//
// Lighting data structures
//
/// Directional light description; only a direction is stored at present.
typedef struct {
  float3 dir;                ///< directional light direction
//  float3 color; // not yet used
} rt_directional_light;

/// Positional (point) light description; only a position is stored at present.
typedef struct {
  float3 pos;                ///< point light position
//  float3 color; // not yet used
} rt_positional_light;



//
// Shader Binding Table (SBT) Data Structures
//
/// Cone array parameters, stored in GeomSBTHG as the `cone` union member.
/// Field meanings are inferred from their names -- confirm against the
/// cone intersection program.
struct ConeArraySBT {
  float3 *base;            ///< cone base points (presumably one per cone)
  float3 *apex;            ///< cone apex points
  float  *baserad;         ///< radii at the base
  float  *apexrad;         ///< radii at the apex
};

/// Curve array parameters, stored in GeomSBTHG as the `curve` union member.
/// Field meanings are inferred from their names -- confirm against users.
struct CurveArraySBT {
  float3 *vertices;        ///< curve control vertices
  float  *vertradii;       ///< per-vertex radii
  int    *segindices;      ///< segment index array
};

/// Cylinder array parameters, stored in GeomSBTHG as the `cyl` union member.
/// Field meanings are inferred from their names -- confirm against users.
struct CylinderArraySBT {
  float3 *start;           ///< cylinder start points
  float3 *end;             ///< cylinder end points
  float  *radius;          ///< cylinder radii
};

/// Quad mesh parameters, stored in GeomSBTHG as the `quadmesh` union member.
/// NOTE(review): which of the normal/color arrays may be NULL is not
/// visible here -- confirm against the hit programs.
struct QuadMeshSBT {
  float3 *vertices;        ///< vertex positions
  int4   *indices;         ///< four vertex indices per quad
  float3 *normals;         ///< float3 normals
  uint4  *packednormals;   ///< packed normals: ng [n0 n1 n2]
  float3 *vertcolors3f;    ///< float3 color representation
  uchar4 *vertcolors4u;    ///< unsigned char color representation
};

/// Ring array parameters, stored in GeomSBTHG as the `ring` union member.
/// Field meanings are inferred from their names -- confirm against users.
struct RingArraySBT {
  float3 *center;          ///< ring centers
  float3 *norm;            ///< ring normals
  float  *inrad;           ///< inner radii
  float  *outrad;          ///< outer radii
};

/// Sphere array parameters, stored in GeomSBTHG as the `sphere` union member.
struct SphereArraySBT {
  float4 *PosRadius;       ///< X,Y,Z,Radius packed for coalescing
};

/// Triangle mesh parameters, stored in GeomSBTHG as the `trimesh` union member.
/// NOTE(review): which of the optional arrays may be NULL is not visible
/// here -- confirm against the hit programs.
struct TriMeshSBT {
  float3 *vertices;        ///< vertex positions
  int3   *indices;         ///< three vertex indices per triangle
  float3 *normals;         ///< float3 normals
  uint4  *packednormals;   ///< packed normals: ng [n0 n1 n2]
  float3 *vertcolors3f;    ///< float3 color representation
  uchar4 *vertcolors4u;    ///< unsigned char color representation
  float2 *tex2d;           ///< 2-D texture coordinate buffer
  float3 *tex3d;           ///< 3-D texture coordinate buffer
};

/// Hitgroup SBT payload: color and material fields common to every geometry
/// type, followed by an anonymous union holding the parameters of exactly
/// one geometry type.
struct GeomSBTHG {
  float3 *prim_color;      ///< optional per-primitive color array
  float3 uniform_color;    ///< uniform color for the entire geometry array
  int materialindex;       ///< material index for this array

  // Only one member is meaningful per record.
  // NOTE(review): the active member is presumably determined by the
  // programs bound to the record -- no tag is stored in this struct.
  union {
    ConeArraySBT cone;
    CurveArraySBT curve;
    CylinderArraySBT cyl;
    QuadMeshSBT quadmesh;
    RingArraySBT ring;
    SphereArraySBT sphere;
    TriMeshSBT trimesh;
  };
};



/// SBT record for a hitgroup program: an opaque header of
/// OPTIX_SBT_RECORD_HEADER_SIZE bytes followed by the geometry data.
/// Both the record and its header are aligned to OPTIX_SBT_RECORD_ALIGNMENT.
struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) HGRecord {
  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  GeomSBTHG data;
};

/// Store all hitgroup records for a given geometry together for
/// simpler dynamic updates.  At present, we have pairs of records,
/// for radiance and shadow rays.  Records differ only in their header.
/// Each HGRecordGroup contains RT_RAY_TYPE_COUNT HGRecords, so when querying
/// the size of any vector containers or other data structures to count total
/// hitgroup records, we need to remember to multiply by RT_RAY_TYPE_COUNT.
struct HGRecordGroup {
  HGRecord radiance;       ///< record for radiance rays
  HGRecord shadow;         ///< record for shadow rays
};


/// SBT record for an exception program: an aligned opaque header followed
/// by a placeholder pointer.
struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) ExceptionRecord {
  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  void *data; // dummy value
};

/// SBT record for a raygen program: an aligned opaque header followed
/// by a placeholder pointer.
struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) RaygenRecord {
  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  void *data; // dummy value
};

/// SBT record for a miss program: an aligned opaque header followed
/// by a placeholder pointer.
struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) MissRecord {
  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  void *data; // dummy value
};


/// Tachyon OptiX global launch parameter structure containing the active
/// camera, framebuffer, materials, and any global scene parameters required
/// for shading.
///
/// The layout of the `frame` sub-structure depends on the
/// TACHYON_OPTIXDENOISER and TACHYON_RAYSTATS macros, so every translation
/// unit that shares this structure must be compiled with the same settings
/// for both.
struct tachyonLaunchParams {
  /// Framebuffer and accumulation state
  struct {
    int2 size;                    ///< framebuffer size
    int subframe_index;           ///< accumulation subframe index
    int update_colorbuffer;       ///< accumulation copyout flag 
    int fb_clearall;              ///< clear/overwrite all FB components
    int colorspace;               ///< output colorspace
    uchar4 *framebuffer;          ///< 8-bit unorm RGBA framebuffer

#if defined(TACHYON_OPTIXDENOISER)
    // buffers required for denoising 
    float4 *denoiser_colorbuffer; ///< linear, normalized 32-bit FP RGBA buffer
    int denoiser_enabled;         ///< flag to write to denoiser color buffer
#endif

    float accum_normalize;        ///< precalc 1.0f / subframe_index
    float4 *accum_buffer;         ///< 32-bit FP RGBA accumulation buffer

#if defined(TACHYON_RAYSTATS)
    uint4 *raystats1_buffer;      ///< x=prim, y=shad-dir, z=shad-ao, w=miss
    uint4 *raystats2_buffer;      ///< x=trans, y=trans-skip, z=?, w=refl
#endif
  } frame;

  /// Background, fog, and global epsilon
  struct {
    float3 bg_color;              ///< miss background color
    float3 bg_color_grad_top;     ///< miss background gradient (top)
    float3 bg_color_grad_bot;     ///< miss background gradient (bottom)
    float3 bg_grad_updir;         ///< miss background gradient up direction
    float  bg_grad_topval;        ///< miss background gradient top value
    float  bg_grad_botval;        ///< miss background gradient bottom value
    float  bg_grad_invrange;      ///< miss background gradient inverse range
    float  bg_grad_noisemag;      ///< miss background gradient noise magnitude
    int    fog_mode;              ///< fog type (or off)
    float  fog_start;             ///< radial/linear fog start distance
    float  fog_end;               ///< radial/linear fog end/max distance
    float  fog_density;           ///< exponential fog density
    float  epsilon;               ///< global epsilon value
  } scene;

  /// Shadow, ambient occlusion, and light list parameters
  struct {
    int shadows_enabled;          ///< global shadow flag
    int ao_samples;               ///< number of AO samples per AA ray
    float ao_lightscale;          ///< 2.0f/float(ao_samples)
    float ao_ambient;             ///< AO ambient factor
    float ao_direct;              ///< AO direct lighting scaling factor
    float ao_maxdist;             ///< AO maximum occlusion distance
    int headlight_mode;           ///< Extra VR camera-located headlight
    int num_dir_lights;           ///< directional light count
    float3 *dir_lights;           ///< list of directional light directions
    int num_pos_lights;           ///< positional light count
    float3 *pos_lights;           ///< list of positional light positions
  } lights;

  /// Camera frame, depth of field, and stereo parameters
  struct {
    float3 pos;                   ///< camera position
    float3 U;                     ///< camera orthonormal U (right) axis
    float3 V;                     ///< camera orthonormal V (up) axis
    float3 W;                     ///< camera orthonormal W (view) axis
    float zoom;                   ///< camera zoom factor 
    int   dof_enabled;            ///< DoF (defocus blur) on/off
    float dof_aperture_rad;       ///< DoF (defocus blur) aperture radius
    float dof_focal_dist;         ///< DoF focal plane distance
    int   stereo_enabled;         ///< stereo rendering on/off
    float stereo_eyesep;          ///< stereo eye separation, in world coords
    float stereo_convergence_dist; ///< stereo convergence distance (world)
  } cam;

  // VR HMD fade+clipping plane/sphere
  int clipview_mode;              ///< VR clipping view on/off
  float clipview_start;           ///< clipping sphere/plane start coord
  float clipview_end;             ///< clipping sphere/plane end coord

  rt_material *materials;         ///< device memory material array

  int max_depth;                  ///< global max ray tracing recursion depth
  int max_trans;                  ///< max transparent surface crossing count
  int aa_samples;                 ///< AA samples per launch

  OptixTraversableHandle traversable; ///< global OptiX scene traversable handle
};



//
// Vector math helper routines
//

//
// float2 vector operators
//
/// Component-wise sum of two float2 vectors.
inline __host__ __device__ float2 operator+(const float2& a, const float2& b) {
  float2 sum;
  sum.x = a.x + b.x;
  sum.y = a.y + b.y;
  return sum;
}

/// Add scalar s to each component of a.
inline __host__ __device__ float2 operator+(const float2& a, const float s) {
  float2 sum;
  sum.x = a.x + s;
  sum.y = a.y + s;
  return sum;
}

/// Component-wise difference a - b of two float2 vectors.
inline __host__ __device__ float2 operator-(const float2& a, const float2& b) {
  float2 diff;
  diff.x = a.x - b.x;
  diff.y = a.y - b.y;
  return diff;
}

/// Subtract scalar s from each component of a.
inline __host__ __device__ float2 operator-(const float2& a, const float s) {
  float2 diff;
  diff.x = a.x - s;
  diff.y = a.y - s;
  return diff;
}

/// Subtract each component of a from scalar s.
inline __host__ __device__ float2 operator-(const float s, const float2& a) {
  float2 diff;
  diff.x = s - a.x;
  diff.y = s - a.y;
  return diff;
}

/// Component-wise product of two float2 vectors.
inline __host__ __device__ float2 operator*(const float2& a, const float2& b) {
  float2 prod;
  prod.x = a.x * b.x;
  prod.y = a.y * b.y;
  return prod;
}

/// Scale float2 vector a by scalar s (scalar on the left).
inline __host__ __device__ float2 operator*(const float s, const float2& a) {
  float2 scaled;
  scaled.x = a.x * s;
  scaled.y = a.y * s;
  return scaled;
}

/// Scale float2 vector a by scalar s (scalar on the right).
inline __host__ __device__ float2 operator*(const float2& a, const float s) {
  float2 scaled;
  scaled.x = a.x * s;
  scaled.y = a.y * s;
  return scaled;
}

/// Scale float2 vector a in place by scalar s.
inline __host__ __device__ void operator*=(float2& a, const float s) {
  a.x *= s;
  a.y *= s;
}

/// Divide scalar s by each component of a.
inline __host__ __device__ float2 operator/(const float s, const float2& a) {
  float2 quot;
  quot.x = s/a.x;
  quot.y = s/a.y;
  return quot;
}



//
// float3 vector operators
//
/// Build a float3 with all three components set to s.
inline __host__ __device__ float3 make_float3(const float s) {
  float3 v;
  v.x = s;
  v.y = s;
  v.z = s;
  return v;
}

/// Build a float3 from the x, y, z components of a float4, dropping w.
inline __host__ __device__ float3 make_float3(const float4& a) {
  float3 v;
  v.x = a.x;
  v.y = a.y;
  v.z = a.z;
  return v;
}

/// Component-wise sum of two float3 vectors.
inline __host__ __device__ float3 operator+(float3 a, float3 b) {
  float3 sum;
  sum.x = a.x + b.x;
  sum.y = a.y + b.y;
  sum.z = a.z + b.z;
  return sum;
}

/// Component-wise difference a - b of two float3 vectors.
inline __host__ __device__ float3 operator-(const float3& a, const float3 &b) {
  float3 diff;
  diff.x = a.x-b.x;
  diff.y = a.y-b.y;
  diff.z = a.z-b.z;
  return diff;
}

/// Negate every component of a float3 vector.
inline __host__ __device__ float3 operator-(const float3& a) {
  float3 neg;
  neg.x = -a.x;
  neg.y = -a.y;
  neg.z = -a.z;
  return neg;
}

/// Accumulate float3 vector b into a, component-wise.
inline __host__ __device__ void operator+=(float3& a, const float3& b) {
  a.x += b.x;
  a.y += b.y;
  a.z += b.z;
}

/// Component-wise product of two float3 vectors.
inline __host__ __device__ float3 operator*(const float3& a, const float3 &b) {
  float3 prod;
  prod.x = a.x * b.x;
  prod.y = a.y * b.y;
  prod.z = a.z * b.z;
  return prod;
}

/// Scale float3 vector a by scalar s (scalar on the left).
inline __host__ __device__ float3 operator*(float s, const float3 &a) {
  float3 scaled;
  scaled.x = s * a.x;
  scaled.y = s * a.y;
  scaled.z = s * a.z;
  return scaled;
}

/// Scale float3 vector a by scalar s (scalar on the right).
inline __host__ __device__ float3 operator*(const float3 &a, const float s) {
  float3 scaled;
  scaled.x = a.x * s;
  scaled.y = a.y * s;
  scaled.z = a.z * s;
  return scaled;
}

/// Scale float3 vector a in place by scalar s.
inline __host__ __device__ void operator*=(float3& a, const float s) {
  a.x *= s;
  a.y *= s;
  a.z *= s;
}

/// Multiply float3 vector a in place by b, component-wise.
inline __host__ __device__ void operator*=(float3& a, const float3 &b) {
  a.x *= b.x;
  a.y *= b.y;
  a.z *= b.z;
}

/// Component-wise quotient a / b of two float3 vectors.
inline __host__ __device__ float3 operator/(const float3 &a, const float3 &b) {
  float3 quot;
  quot.x = a.x / b.x;
  quot.y = a.y / b.y;
  quot.z = a.z / b.z;
  return quot;
}


//
// float4 vector operators
//
/// Build a float4 from a float3 (x, y, z) and a scalar b (w).
inline __host__ __device__ float4 make_float4(const float3& a, const float b) {
  float4 v;
  v.x = a.x;
  v.y = a.y;
  v.z = a.z;
  v.w = b;
  return v;
}

/// Build a float4 with all four components set to a.
inline __host__ __device__ float4 make_float4(const float a) {
  float4 v;
  v.x = a;
  v.y = a;
  v.z = a;
  v.w = a;
  return v;
}

/// Accumulate float4 vector b into a, component-wise.
inline __host__ __device__ void operator+=(float4& a, const float4& b) {
  a.x += b.x;
  a.y += b.y;
  a.z += b.z;
  a.w += b.w;
}

/// Scale float4 vector a by scalar s.
inline __host__ __device__ float4 operator*(const float4& a, const float s) {
  float4 scaled;
  scaled.x = a.x * s;
  scaled.y = a.y * s;
  scaled.z = a.z * s;
  scaled.w = a.w * s;
  return scaled;
}

/// Scale float4 vector a in place by scalar b.
inline __host__ __device__ void operator*=(float4& a, const float &b) {
  a.x *= b;
  a.y *= b;
  a.z *= b;
  a.w *= b;
}


//
// operators with subsequent type conversions
//
/// Scale the x, y, z components of a char4 by s, giving a float3
/// (the w component is ignored).
inline __host__ __device__ float3 operator*(char4 a, const float s) {
  float3 scaled;
  scaled.x = s * a.x;
  scaled.y = s * a.y;
  scaled.z = s * a.z;
  return scaled;
}

/// Scale the x, y, z components of a uchar4 by s, giving a float3
/// (the w component is ignored).
inline __host__ __device__ float3 operator*(uchar4 a, const float s) {
  float3 scaled;
  scaled.x = s * a.x;
  scaled.y = s * a.y;
  scaled.z = s * a.z;
  return scaled;
}


//
// math fctns...
//
/// Component-wise absolute value of a float3 vector.
inline __host__ __device__ float3 fabsf(const float3& a) {
  float3 mag;
  mag.x = fabsf(a.x);
  mag.y = fabsf(a.y);
  mag.z = fabsf(a.z);
  return mag;
}

/// Component-wise maximum of two float3 vectors.
inline __host__ __device__ float3 fmaxf(const float3& a, const float3& b) {
  float3 hi;
  hi.x = fmaxf(a.x, b.x);
  hi.y = fmaxf(a.y, b.y);
  hi.z = fmaxf(a.z, b.z);
  return hi;
}

/// Largest of the three components of a float3 vector.
inline __host__ __device__ float fmaxf(const float3& a) {
  const float maxxy = fmaxf(a.x, a.y);
  return fmaxf(maxxy, a.z);
}

/// Dot product of two float3 vectors.
inline __host__ __device__ float dot(const float3 & a, const float3 & b) {
  const float d = a.x*b.x + a.y*b.y + a.z*b.z;
  return d;
}

/// Dot product of two float4 vectors (all four components).
inline __host__ __device__ float dot(const float4 & a, const float4 & b) {
  const float d = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
  return d;
}

/// Euclidean length of a float3 vector.
inline __host__ __device__ float length(const float3 & v) {
  const float lensq = dot(v, v);
  return sqrtf(lensq);
}


/// Return v scaled to unit length.
/// A reciprocal square root (rsqrtf) is used when compiled by nvcc;
/// otherwise a division by sqrtf is used.  No zero-length check is made.
inline __host__ __device__ float3 normalize(const float3 & v) {
  const float lensq = dot(v, v);
#if defined(__CUDACC__) || defined(__NVCC__)
  const float scale = rsqrtf(lensq);
#else
  const float scale = 1.0f / sqrtf(lensq);
#endif
  return make_float3(v.x * scale, v.y * scale, v.z * scale);
}


/// Return v scaled to unit length and store the original length of v in l.
/// No zero-length check is made.
inline __host__ __device__ float3 normalize_len(const float3 v, float &l) {
  l = length(v);
  const float scale = 1.0f / l;
  return make_float3(v.x * scale, v.y * scale, v.z * scale);
}


/// Return v scaled to unit length and store the reciprocal of the
/// original length of v in invlen.  No zero-length check is made.
inline __host__ __device__ float3 normalize_invlen(const float3 v, float &invlen) {
  const float lensq = dot(v, v);
#if defined(__CUDACC__) || defined(__NVCC__)
  invlen = rsqrtf(lensq);
#else
  invlen = 1.0f / sqrtf(lensq);
#endif
  return make_float3(v.x * invlen, v.y * invlen, v.z * invlen);
}


/// Cross product a x b of two float3 vectors.
inline __host__ __device__ float3 cross(const float3 & a, const float3 & b) {
  return make_float3( a.y * b.z - b.y * a.z,
                     -a.x * b.z + b.x * a.z,
                      a.x * b.y - b.x * a.y);
}

/// Reflection direction for incident direction i about surface normal n,
/// computed as i - 2 n (n . i).
inline __host__ __device__ float3 reflect(const float3& i, const float3& n) {
  const float3 twon = 2.0f * n;
  const float ndoti = dot(n, i);
  return i - twon * ndoti;
}


/// Orient normal n using reference normal nref and incident vector i:
/// returns n multiplied by +1 or -1, taking the sign of dot(i, nref).
inline __host__ __device__ float3 faceforward(const float3& n, const float3& i,
                                              const float3& nref) {
  const float sgn = copysignf(1.0f, dot(i, nref));
  return n * sgn;
}


//
// Convert between 2-D planar coordinates and an octahedral mapping.
// This is useful for both omnidirectional cameras and image formats,
// and for surface normal compression/quantization.
//
/// Map 2-D octahedral coordinates back to a 3-D direction.
/// The returned vector is not renormalized here.
static __host__ __device__ __inline__ float3 OctDecode(float2 projected) {
  const float absx = fabsf(projected.x);
  const float absy = fabsf(projected.y);
  float3 n = make_float3(projected.x, projected.y, 1.0f - (absx + absy));

  // a negative z means the point lies on the folded-over half of the map
  if (n.z < 0.0f) {
    n.x = copysignf(1.0f - absy, projected.x);
    n.y = copysignf(1.0f - absx, projected.y);
  }

  return n;
}

//
// XXX TODO: implement a high-precision OctPEncode() variant, based on
//           floored snorms and an error minimization scheme using a
//           comparison of internally decoded values for least error
//

/// Project direction n onto the octahedron |x|+|y|+|z| = 1 and map it to
/// 2-D octahedral coordinates.  n does not need to be unit length, since
/// it is divided by its L1 norm.
static __host__ __device__ __inline__ float2 OctEncode(float3 n) {
  const float invL1Norm = 1.0f / (fabsf(n.x) + fabsf(n.y) + fabsf(n.z));

  // non-negative z: plain projection
  if (!(n.z < 0.0f))
    return make_float2(n.x, n.y) * invL1Norm;

  // negative z: fold over, keeping the original signs of x and y
  float2 folded = 1.0f - make_float2(fabsf(n.y), fabsf(n.x)) * invL1Norm;
  folded.x = copysignf(folded.x, n.x);
  folded.y = copysignf(folded.y, n.y);
  return folded;
}

/// Quantize a float2 from the range [-1,1] into two 16-bit values packed
/// in one 32-bit word: x in the low half, y in the high half.
/// Conversion truncates toward zero.
static __host__ __device__ __inline__ uint convfloat2uint32(float2 f2) {
  const float2 unorm = f2 * 0.5f + 0.5f;  // remap [-1,1] to [0,1]
  const uint lo = (uint) (unorm.x * 65535);
  const uint hi = (uint) (unorm.y * 65535);
  return lo | (hi << 16);
}

/// Unpack two 16-bit values from a 32-bit word (x from the low half, y from
/// the high half) and map them to floats in the range [-1,1].
static __host__ __device__ __inline__ float2 convuint32float2(uint packed) {
  const uint lo = (packed      ) & 0x0000ffff;
  const uint hi = (packed >> 16) & 0x0000ffff;
  float2 unorm;
  unorm.x = (float)(lo) / 65535;
  unorm.y = (float)(hi) / 65535;
  return unorm * 2.0f - 1.0f;
}


//
// Methods for packing normals into a 4-byte quantity, such as a
// [u]int or [u]char4, and similar.  See JCGT article by Cigolle et al.,
// "A Survey of Efficient Representations for Independent Unit Vectors",
// J. Computer Graphics Techniques 3(2), 2014.
// http://jcgt.org/published/0003/02/01/
//

#if 1

//
// oct32: 32-bit octahedral normal encoding using [su]norm16x2 quantization
// Meyer et al., "On Floating Point Normal Vectors", In Proc. 21st
// Eurographics Conference on Rendering.
//   http://dx.doi.org/10.1111/j.1467-8659.2010.01737.x
// Others:
// https://twitter.com/Stubbesaurus/status/937994790553227264
// https://knarkowicz.wordpress.com/2014/04/16/octahedron-normal-vector-encoding
//
/// Pack a normal into 32 bits: octahedral encoding followed by
/// 16-bit-per-component quantization.
static __host__ __device__ __inline__ uint packNormal(const float3& normal) {
  return convfloat2uint32(OctEncode(normal));
}

/// Unpack a 32-bit octahedral-encoded normal produced by packNormal().
/// The result is whatever OctDecode() returns; it is not renormalized here.
static __host__ __device__ __inline__ float3 unpackNormal(uint packed) {
  return OctDecode(convuint32float2(packed));
}

#elif 0

//
// snorm10x3: signed 10-bit-per-component scalar unit real representation
// Better representation than unorm.
// Supported by most fixed-function graphics hardware.
// https://www.khronos.org/registry/OpenGL/extensions/EXT/EXT_texture_snorm.txt
//   i=round(clamp(r,-1,1) * (2^(b-1) - 1))
//   r=clamp(i/(2^(b-1) - 1), -1, 1)
//

#elif 1

// OpenGL GLbyte signed quantization scheme
//   i = r * (2^b - 1) - 0.5;
//   r = (2i + 1)/(2^b - 1)
/// Pack a normal into 32 bits using the GLbyte signed quantization scheme,
/// one byte per component with the fourth byte set to zero.
/// NOTE(review): this branch is never compiled, because the `#if 1` that
/// opens this conditional chain is taken.  It also uses a `float3 - float`
/// operator that is not defined in the visible part of this header.
static __host__ __device__ __inline__ uint packNormal(const float3& normal) {
  // conversion to GLbyte format, Table 2.6, p. 44 of OpenGL spec 1.2.1
  const float3 N = normal * 127.5f - 0.5f;
  const char4 packed = make_char4(N.x, N.y, N.z, 0);
  // reinterpret the four packed bytes as a single 32-bit word
  return *((uint *) &packed);
}

/// Unpack a normal stored by the GLbyte variant of packNormal().
/// NOTE(review): this branch is never compiled, because the `#if 1` that
/// opens this conditional chain is taken.  It also uses a `float3 + float`
/// operator that is not defined in the visible part of this header.
static __host__ __device__ __inline__ float3 unpackNormal(uint packed) {
  // reinterpret the 32-bit word as four signed bytes
  char4 c4norm = *((char4 *) &packed);

  // conversion from GLbyte format, Table 2.6, p. 44 of OpenGL spec 1.2.1
  // float = (2c+1)/(2^8-1)
  const float ci2f = 1.0f / 255.0f;
  const float cn2f = 1.0f / 127.5f;
  float3 N = c4norm * cn2f + ci2f;

  return N;
}
#endif



//
// Tone mapping and color grading device functions.
// Useful references:
//   Photographic Tone Reproduction for Digital Images
//   E. Reinhard, M. Stark, P. Shirley, J. Ferwerda
//   ACM Transactions on Graphics, 21(3) pp. 267-276, 2002.
//   https://doi.org/10.1145/566654.566575
//
//   Tone Mapping of HDR Images: A Review
//   Y. Salih, W. Md-Esa, A. Malik, N. Saad.
//   http://doi.org/10.1109/ICIAS.2012.6306220
//
// Others:
// http://filmicworlds.com/blog/filmic-tonemapping-operators/
// http://filmicworlds.com/blog/filmic-tonemapping-with-piecewise-power-curves/
// http://filmicworlds.com/blog/minimal-color-grading-tools/
// https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/
//

//
// Calculate relative luminance from linear RGB w/ perceptual coefficients:
//   https://en.wikipedia.org/wiki/Relative_luminance
//
/// Weighted sum of the color components:
/// 0.2126 * c.x + 0.7152 * c.y + 0.0722 * c.z.
static __device__ __inline__ 
float luminance(float3 c) {
  const float3 weights = make_float3(0.2126f, 0.7152f, 0.0722f);
  return dot(c, weights);
}


//
// Rescale RGB colors to achieve desired luminance
//
/// Scale color c so that its luminance becomes newluminance.
/// \param c            input color
/// \param newluminance target luminance
/// \return c * (newluminance / luminance(c)); when luminance(c) is exactly
///         zero, c is returned unchanged rather than dividing by zero,
///         which would turn a black pixel into NaN.
static __device__ __inline__ 
float3 rescale_luminance(float3 c, float newluminance) {
  float l = luminance(c);

  // a zero luminance cannot be rescaled; avoid 0/0 and x/0
  if (l == 0.0f)
    return c;

  return c * (newluminance / l);
}


//
// Reinhard style tone mapping
//
/// Apply c / (1 + c) to each color component.
static __device__ __inline__ 
float3 reinhard_TMO(float3 c) {
  const float3 den = make_float3(1.0f) + c;
  return c / den;
}


//
// Extended Reinhard style tone mapping:
//   https://64.github.io/tonemapping/
//
/// Apply c * (1 + c / maxwhite^2) / (1 + c) to each color component.
static __device__ __inline__ 
float3 reinhard_extended_TMO(float3 c, float maxwhite) {
  const float3 one = make_float3(1.0f);
  const float3 whitesq = make_float3(maxwhite * maxwhite);
  const float3 num = c * (one + (c / whitesq));
  const float3 den = make_float3(1.0f) + c;
  return num / den;
}


//
// Extended Reinhard style tone mapping applied to luminance:
//   https://64.github.io/tonemapping/
//
/// Compute a new luminance L * (1 + L / maxL^2) / (1 + L) from the
/// luminance L of c, then rescale c to that luminance.
static __device__ __inline__ 
float3 reinhard_extended_luminance_TMO(float3 c, float maxL) {
  const float lum = luminance(c);
  const float scaled = lum * (1.0f + (lum / (maxL * maxL)));
  const float mapped = scaled / (1.0f + lum);
  return rescale_luminance(c, mapped);
}


//
// Protect functions that are only GPU-callable, e.g., those that
// use GPU-specific intrinsics such as __saturatef() or others.
//
#if defined(TACHYON_INTERNAL)

/// Clamp each component of a to the range [0,1] using the __saturatef()
/// intrinsic.
static __device__ __inline__ float3 clamp_float3(const float3 &a) {
  float3 sat;
  sat.x = __saturatef(a.x);
  sat.y = __saturatef(a.y);
  sat.z = __saturatef(a.z);
  return sat;
}


//
// ACES filmic tone mapping approximations:
//   https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/
//   https://github.com/TheRealMJP/BakingLab/blob/master/BakingLab/ACES.hlsl
//

/// ACES filmic tone mapping curve approximation, applied per component:
///   (c * (2.51 c + 0.03)) / (c * (2.43 c + 0.59) + 0.14)
/// \param c  input color
/// \return   tone mapped color, clamped to [0,1]
static __device__ __inline__ 
float3 ACESFilmToneMappingCurve(float3 c) {
  float3 num = c * (2.51f * c + make_float3(0.03f));
  float3 den = c * (2.43f * c + make_float3(0.59f)) + make_float3(0.14f);
  float3 t = num / den;

  // we clamp here, but this could be done separately
  return clamp_float3(t);
}



//
// Color conversion operations
//

/// Convert float3 rgb data to uchar4 with alpha channel set to 255.
/// Each component is clamped to [0,1] with __saturatef(), scaled by
/// 255.99f, and converted to unsigned char.
static __device__ __inline__ uchar4 make_color_rgb4u(const float3& c) {
  const unsigned char r = static_cast<unsigned char>(__saturatef(c.x)*255.99f);
  const unsigned char g = static_cast<unsigned char>(__saturatef(c.y)*255.99f);
  const unsigned char b = static_cast<unsigned char>(__saturatef(c.z)*255.99f);
  return make_uchar4(r, g, b, 255u);
}

/// Convert float4 rgba data to uchar4 unorm color representation.
/// Each component, alpha included, is clamped to [0,1] with __saturatef(),
/// scaled by 255.99f, and converted to unsigned char.
static __device__ __inline__ uchar4 make_color_rgb4u(const float4& c) {
  const unsigned char r = static_cast<unsigned char>(__saturatef(c.x)*255.99f);
  const unsigned char g = static_cast<unsigned char>(__saturatef(c.y)*255.99f);
  const unsigned char b = static_cast<unsigned char>(__saturatef(c.z)*255.99f);
  const unsigned char a = static_cast<unsigned char>(__saturatef(c.w)*255.99f);
  return make_uchar4(r, g, b, a);
}

#endif

#endif
