/*
 * TachyonOptiXShaders.h - prototypes for OptiX PTX shader routines 
 *
 * (C) Copyright 2013-2022 John E. Stone
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * $Id: TachyonOptiXShaders.h,v 1.34 2022/03/07 04:28:22 johns Exp $
 *
 */

/**
 *  \file TachyonOptiXShaders.h
 *  \brief Tachyon ray tracing engine core routines and data structures
 *         compiled to PTX for runtime JIT to build complete ray tracing 
 *         pipelines.  Key data structures defined here are shared both by
 *         the compiled PTX core ray tracing routines, and by the host code
 *         that assembles the complete ray tracing pipeline and launches
 *         the pipeline kernels.
 *         Written for NVIDIA OptiX 7 and later.
 */

#ifndef TACHYONOPTIXSHADERS_H
#define TACHYONOPTIXSHADERS_H

#if 0
/// Compile-time flag for collection and reporting of ray statistics
#define TACHYON_RAYSTATS 1
#endif


//
// Constants shared by both host and device code
//
#define RT_DEFAULT_MAX 1e27f


//
// Vector math helper routines
//

//
// float2 vector operators
//
/// Component-wise sum of two float2 vectors.
inline __host__ __device__ float2 operator+(const float2& a, const float2& b) {
  float2 sum;
  sum.x = a.x + b.x;
  sum.y = a.y + b.y;
  return sum;
}

/// Add the scalar s to both components of a.
inline __host__ __device__ float2 operator+(const float2& a, const float s) {
  float2 shifted;
  shifted.x = a.x + s;
  shifted.y = a.y + s;
  return shifted;
}

/// Component-wise difference a - b of two float2 vectors.
inline __host__ __device__ float2 operator-(const float2& a, const float2& b) {
  float2 diff;
  diff.x = a.x - b.x;
  diff.y = a.y - b.y;
  return diff;
}

/// Subtract the scalar s from both components of a.
inline __host__ __device__ float2 operator-(const float2& a, const float s) {
  float2 shifted;
  shifted.x = a.x - s;
  shifted.y = a.y - s;
  return shifted;
}

/// Subtract each component of a from the scalar s.
inline __host__ __device__ float2 operator-(const float s, const float2& a) {
  float2 diff;
  diff.x = s - a.x;
  diff.y = s - a.y;
  return diff;
}

/// Component-wise product of two float2 vectors.
inline __host__ __device__ float2 operator*(const float2& a, const float2& b) {
  float2 prod;
  prod.x = a.x * b.x;
  prod.y = a.y * b.y;
  return prod;
}

/// Scale a float2 vector by a scalar (scalar on the left).
inline __host__ __device__ float2 operator*(const float s, const float2& a) {
  float2 scaled;
  scaled.x = a.x * s;
  scaled.y = a.y * s;
  return scaled;
}

/// Scale a float2 vector by a scalar (scalar on the right).
inline __host__ __device__ float2 operator*(const float2& a, const float s) {
  float2 scaled;
  scaled.x = a.x * s;
  scaled.y = a.y * s;
  return scaled;
}

/// Scale both components of a in place by the scalar s.
inline __host__ __device__ void operator*=(float2& a, const float s) {
  a.x *= s;
  a.y *= s;
}

/// Divide the scalar s by each component of a.
inline __host__ __device__ float2 operator/(const float s, const float2& a) {
  float2 quot;
  quot.x = s / a.x;
  quot.y = s / a.y;
  return quot;
}



//
// float3 vector operators
//
/// Build a float3 with all three components set to s.
inline __host__ __device__ float3 make_float3(const float s) {
  float3 v;
  v.x = s;
  v.y = s;
  v.z = s;
  return v;
}

/// Build a float3 from the x, y, z components of a float4; w is dropped.
inline __host__ __device__ float3 make_float3(const float4& a) {
  float3 xyz;
  xyz.x = a.x;
  xyz.y = a.y;
  xyz.z = a.z;
  return xyz;
}

/// Component-wise sum of two float3 vectors.
inline __host__ __device__ float3 operator+(float3 a, float3 b) {
  // a is a by-value copy, so it can be reused as the result
  a.x = a.x + b.x;
  a.y = a.y + b.y;
  a.z = a.z + b.z;
  return a;
}

/// Component-wise difference a - b of two float3 vectors.
inline __host__ __device__ float3 operator-(const float3& a, const float3 &b) {
  float3 diff;
  diff.x = a.x - b.x;
  diff.y = a.y - b.y;
  diff.z = a.z - b.z;
  return diff;
}

/// Unary negation: flip the sign of every component of a.
inline __host__ __device__ float3 operator-(const float3& a) {
  float3 neg;
  neg.x = -a.x;
  neg.y = -a.y;
  neg.z = -a.z;
  return neg;
}

/// Add b to a in place, component by component.
inline __host__ __device__ void operator+=(float3& a, const float3& b) {
  a.x += b.x;
  a.y += b.y;
  a.z += b.z;
}

/// Component-wise product of two float3 vectors.
inline __host__ __device__ float3 operator*(const float3& a, const float3 &b) {
  float3 prod;
  prod.x = a.x * b.x;
  prod.y = a.y * b.y;
  prod.z = a.z * b.z;
  return prod;
}

/// Scale a float3 vector by a scalar (scalar on the left).
inline __host__ __device__ float3 operator*(float s, const float3 & a) {
  float3 scaled;
  scaled.x = s * a.x;
  scaled.y = s * a.y;
  scaled.z = s * a.z;
  return scaled;
}

/// Scale a float3 vector by a scalar (scalar on the right).
inline __host__ __device__ float3 operator*(const float3& a, const float s) {
  float3 scaled;
  scaled.x = a.x * s;
  scaled.y = a.y * s;
  scaled.z = a.z * s;
  return scaled;
}

/// Scale all three components of a in place by the scalar s.
inline __host__ __device__ void operator*=(float3& a, const float s) {
  a.x *= s;
  a.y *= s;
  a.z *= s;
}

/// Multiply a in place by b, component by component.
inline __host__ __device__ void operator*=(float3& a, const float3 &b) {
  a.x *= b.x;
  a.y *= b.y;
  a.z *= b.z;
}


//
// float4 vector operators
//
/// Build a float4 from a float3 (x, y, z) and a scalar b used as w.
inline __host__ __device__ float4 make_float4(const float3& a, const float b) {
  float4 v;
  v.x = a.x;
  v.y = a.y;
  v.z = a.z;
  v.w = b;
  return v;
}

/// Build a float4 with all four components set to a.
inline __host__ __device__ float4 make_float4(const float a) {
  float4 v;
  v.x = a;
  v.y = a;
  v.z = a;
  v.w = a;
  return v;
}

/// Add b to a in place, component by component.
inline __host__ __device__ void operator+=(float4& a, const float4& b) {
  a.x += b.x;
  a.y += b.y;
  a.z += b.z;
  a.w += b.w;
}

/// Scale a float4 vector by a scalar.
inline __host__ __device__ float4 operator*(const float4& a, const float s) {
  float4 scaled;
  scaled.x = a.x * s;
  scaled.y = a.y * s;
  scaled.z = a.z * s;
  scaled.w = a.w * s;
  return scaled;
}

/// Scale all four components of a in place by the scalar b.
/// b is received by const reference, so it may refer to a component of a
/// itself (e.g. "v *= v.x").  The scale factor is therefore copied before any
/// component is written, so that every component sees the same multiplier.
inline __host__ __device__ void operator*=(float4& a, const float &b) {
  const float s = b;  // snapshot: b may alias a.x/a.y/a.z/a.w
  a.x *= s;
  a.y *= s;
  a.z *= s;
  a.w *= s;
}


//
// operators with subsequent type conversions
//
/// Scale the x, y, z components of a signed char4 by s, yielding a float3.
/// The w component is ignored.
inline __host__ __device__ float3 operator*(char4 a, const float s) {
  float3 scaled;
  scaled.x = s * a.x;
  scaled.y = s * a.y;
  scaled.z = s * a.z;
  return scaled;
}

/// Scale the x, y, z components of an unsigned char4 by s, yielding a float3.
/// The w component is ignored.
inline __host__ __device__ float3 operator*(uchar4 a, const float s) {
  float3 scaled;
  scaled.x = s * a.x;
  scaled.y = s * a.y;
  scaled.z = s * a.z;
  return scaled;
}


//
// math functions
//
/// Component-wise absolute value of a float3 vector.
inline __host__ __device__ float3 fabsf(const float3& a) {
  float3 mag;
  mag.x = fabsf(a.x);
  mag.y = fabsf(a.y);
  mag.z = fabsf(a.z);
  return mag;
}

/// Component-wise maximum of two float3 vectors.
inline __host__ __device__ float3 fmaxf(const float3& a, const float3& b) {
  float3 hi;
  hi.x = fmaxf(a.x, b.x);
  hi.y = fmaxf(a.y, b.y);
  hi.z = fmaxf(a.z, b.z);
  return hi;
}

/// Largest of the three components of a.
inline __host__ __device__ float fmaxf(const float3& a) {
  const float maxxy = fmaxf(a.x, a.y);
  return fmaxf(maxxy, a.z);
}

/// Dot product of two float3 vectors.
inline __host__ __device__ float dot(const float3 & a, const float3 & b) {
  // accumulate in x, y, z order, matching left-to-right summation
  float acc = a.x*b.x;
  acc = acc + a.y*b.y;
  acc = acc + a.z*b.z;
  return acc;
}

/// Dot product of two float4 vectors (all four components).
inline __host__ __device__ float dot(const float4 & a, const float4 & b) {
  // accumulate in x, y, z, w order, matching left-to-right summation
  float acc = a.x*b.x;
  acc = acc + a.y*b.y;
  acc = acc + a.z*b.z;
  acc = acc + a.w*b.w;
  return acc;
}

/// Euclidean length of a float3 vector.
inline __host__ __device__ float length(const float3 & v) {
  const float lensq = dot(v, v);
  return sqrtf(lensq);
}


/// Return v scaled to unit length.
/// When compiled by a CUDA compiler the reciprocal length comes from
/// rsqrtf(); otherwise it is computed as 1/sqrtf().  No check is made for a
/// zero-length input.
inline __host__ __device__ float3 normalize(const float3 & v) {
  const float lensq = dot(v, v);
#if defined(__CUDACC__) || defined(__NVCC__)
  const float invlen = rsqrtf(lensq);
#else
  const float invlen = 1.0f / sqrtf(lensq);
#endif
  return make_float3(v.x * invlen, v.y * invlen, v.z * invlen);
}


/// Return v scaled to unit length, and store the original length of v in l.
/// No check is made for a zero-length input.
inline __host__ __device__ float3 normalize_len(const float3 v, float &l) {
  l = length(v);
  const float scale = 1.0f / l;
  return make_float3(v.x * scale, v.y * scale, v.z * scale);
}


/// Return v scaled to unit length, and store the reciprocal of the
/// original length of v in invlen.
/// When compiled by a CUDA compiler the reciprocal length comes from
/// rsqrtf(); otherwise it is computed as 1/sqrtf().
inline __host__ __device__ float3 normalize_invlen(const float3 v, float &invlen) {
  const float lensq = dot(v, v);
#if defined(__CUDACC__) || defined(__NVCC__)
  const float rlen = rsqrtf(lensq);
#else
  const float rlen = 1.0f / sqrtf(lensq);
#endif
  invlen = rlen;
  return make_float3(v.x * rlen, v.y * rlen, v.z * rlen);
}


/// Cross product a x b of two float3 vectors.
inline __host__ __device__ float3 cross(const float3 & a, const float3 & b) {
  return make_float3( a.y * b.z - b.y * a.z,
                     -a.x * b.z + b.x * a.z,
                      a.x * b.y - b.x * a.y);
}

/// Reflection direction for incident direction i about surface normal n,
/// computed as i - 2 n (n . i).
inline __host__ __device__ float3 reflect(const float3& i, const float3& n) {
  const float ndoti = dot(n, i);
  const float3 twon = 2.0f * n;
  return i - twon * ndoti;
}


/// Orient an interpolated surface normal n according to a geometric normal
/// nref and an incident vector i: n is returned unchanged when dot(i, nref)
/// has a positive sign, and negated when it has a negative sign.
inline __host__ __device__ float3 faceforward(const float3& n, const float3& i,
                                              const float3& nref) {
  const float side = copysignf(1.0f, dot(i, nref));
  return n * side;
}



//
// Beginning of OptiX data structures
//

// Enable reversed traversal of any-hit rays for shadows/AO.
// This optimization yields a 20% performance gain in many cases.
// #define USE_REVERSE_SHADOW_RAYS 1

// Use reverse rays by default rather than only when enabled interactively
// #define USE_REVERSE_SHADOW_RAYS_DEFAULT 1
/// Shadow rendering modes.
enum RtShadowMode {
  RT_SHADOWS_OFF=0,          ///< shadows disabled
  RT_SHADOWS_ON=1,           ///< shadows on, standard implementation
  RT_SHADOWS_ON_REVERSE=2    ///< shadows on, any-hit traversal reversal
};

/// Ray types; RT_RAY_TYPE_COUNT follows the last real type and therefore
/// equals the number of ray types.
enum RayType {
  RT_RAY_TYPE_RADIANCE=0,    ///< normal radiance rays
  RT_RAY_TYPE_SHADOW=1,      ///< shadow probe/AO rays
  RT_RAY_TYPE_COUNT          ///< total count of ray types
};

//
// OptiX 7.x geometry type-associated "hit kind" enums
//
enum RtHitKind {
  RT_HIT_HWTRIANGLE=0,       ///< RTX triangle
  RT_HIT_CONE,               ///< custom prim cone
  RT_HIT_CYLINDER,           ///< custom prim cylinder
  RT_HIT_QUAD,               ///< custom prim quadrilateral
  RT_HIT_RING,               ///< custom prim ring
  RT_HIT_SPHERE,             ///< custom prim sphere
  RT_HIT_CURVE,              ///< OptiX 7.x built-in curve prims
  RT_HIT_COUNT               ///< total count of hit kinds
};      

// Enums used for custom primitive PGM indexing in SBT + GAS
// Values start at zero and are consecutive; RT_CUST_PRIM_COUNT is the
// number of custom primitive types.
enum RtCustPrim { 
  RT_CUST_PRIM_CONE=0,       ///< cone SBT index multiplier
  RT_CUST_PRIM_CYLINDER,     ///< cylinder SBT index multiplier
  RT_CUST_PRIM_QUAD,         ///< quad SBT index multiplier
  RT_CUST_PRIM_RING,         ///< ring SBT index multiplier
  RT_CUST_PRIM_SPHERE,       ///< sphere SBT index multiplier
  RT_CUST_PRIM_COUNT         ///< total count of SBT geometric multipliers
};



//
// Materials
//

/// Structure containing Tachyon material properties.
/// The launch parameters carry a pointer to an array of these
/// (tachyonLaunchParams::materials).
typedef struct {
  float ambient;             ///< constant ambient light factor
  float diffuse;             ///< diffuse reflectance coefficient
  float specular;            ///< specular reflectance coefficient
  float shininess;           ///< specular highlight size (exponential scale)
  float reflectivity;        ///< mirror reflectance coefficient
  float opacity;             ///< surface opacity
  float outline;             ///< outline shading coefficient
  float outlinewidth;        ///< width of outline shading effect
  int transmode;             ///< transparency behavior (mode values are not defined in this header)
  int ind;                   ///< material internal index
} rt_material;



//
// Lighting data structures
//
/// Directional light description; currently holds only a direction vector.
typedef struct {
  float3 dir;                ///< directional light direction
//  float3 color; // not yet used
} rt_directional_light;

/// Positional (point) light description; currently holds only a position.
typedef struct {
  float3 pos;                ///< point light position
//  float3 color; // not yet used
} rt_positional_light;



//
// Shader Binding Table (SBT) Data Structures
//
/// Cone array geometry pointers stored in a hitgroup SBT record
/// (one member of the GeomSBTHG union).
/// NOTE(review): arrays are presumably indexed per cone -- confirm in the
/// intersection programs.
struct ConeArraySBT {
  float3 *base;      ///< cone base points
  float3 *apex;      ///< cone apex points
  float  *baserad;   ///< radii at the cone base
  float  *apexrad;   ///< radii at the cone apex
};

/// Curve array geometry pointers stored in a hitgroup SBT record
/// (one member of the GeomSBTHG union).
struct CurveArraySBT {
  float3 *vertices;    ///< curve vertex positions
  float  *vertradii;   ///< per-vertex radii
  int    *segindices;  ///< segment indices (NOTE(review): presumably index into vertices -- confirm)
};

/// Cylinder array geometry pointers stored in a hitgroup SBT record
/// (one member of the GeomSBTHG union).
struct CylinderArraySBT {
  float3 *start;     ///< cylinder start points
  float3 *end;       ///< cylinder end points
  float  *radius;    ///< cylinder radii
};

/// Quadrilateral mesh geometry pointers stored in a hitgroup SBT record
/// (one member of the GeomSBTHG union).
/// NOTE(review): which of the normal/color arrays may be null is not
/// visible in this header -- confirm against the shading code.
struct QuadMeshSBT {
  float3 *vertices;        ///< vertex positions
  int4   *indices;         ///< vertex indices, four per element
  float3 *normals;         ///< float3 normals
  uint4  *packednormals;   ///< packed normals: ng [n0 n1 n2]
  float3 *vertcolors3f;    ///< float3 color representation
  uchar4 *vertcolors4u;    ///< unsigned char color representation
};

/// Ring array geometry pointers stored in a hitgroup SBT record
/// (one member of the GeomSBTHG union).
struct RingArraySBT {
  float3 *center;    ///< ring center points
  float3 *norm;      ///< ring normals
  float  *inrad;     ///< inner radii
  float  *outrad;    ///< outer radii
};

/// Sphere array geometry pointers stored in a hitgroup SBT record
/// (one member of the GeomSBTHG union).
struct SphereArraySBT {
  float3 *center;    ///< sphere center points
  float  *radius;    ///< sphere radii
};

/// Triangle mesh geometry pointers stored in a hitgroup SBT record
/// (one member of the GeomSBTHG union).
/// NOTE(review): which of the normal/color arrays may be null is not
/// visible in this header -- confirm against the shading code.
struct TriMeshSBT {
  float3 *vertices;        ///< vertex positions
  int3   *indices;         ///< vertex indices, three per element
  float3 *normals;         ///< float3 normals
  uint4  *packednormals;   ///< packed normals: ng [n0 n1 n2]
  float3 *vertcolors3f;    ///< float3 color representation
  uchar4 *vertcolors4u;    ///< unsigned char color representation
};

/// Hitgroup SBT record payload: color and material information common to
/// all geometry types, followed by a union of the type-specific geometry
/// pointer structures.
struct GeomSBTHG {
  float3 *prim_color;      ///< optional per-primitive color array
  float3 uniform_color;    ///< uniform color for the entire geometry array
  int materialindex;       ///< material index for this array

  // Only one member is meaningful for a given record.
  // NOTE(review): the active member is presumably implied by the geometry
  // type the record is bound to; no discriminator is stored here.
  union {
    ConeArraySBT cone;
    CurveArraySBT curve;
    CylinderArraySBT cyl;
    QuadMeshSBT quadmesh;
    RingArraySBT ring;
    SphereArraySBT sphere;
    TriMeshSBT trimesh;
  };
};

/// SBT record for a hitgroup program.
/// Layout: a header of OPTIX_SBT_RECORD_HEADER_SIZE bytes, aligned to
/// OPTIX_SBT_RECORD_ALIGNMENT, followed by the GeomSBTHG payload.
/// Both macros must already be defined when this header is included
/// (NOTE(review): presumably by the OptiX headers).
struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) HGRecord {
  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  GeomSBTHG data;
};

/// Store all hitgroup records for a given geometry together for
/// simpler dynamic updates.  At present, we have pairs of records,
/// for radiance and shadow rays.  Records differ only in their header.
/// Each HGRecordGroup contains RT_RAY_TYPE_COUNT HGRecords, so when querying
/// the size of any vector containers or other data structures to count total
/// hitgroup records, we need to remember to multiply by RT_RAY_TYPE_COUNT.
struct HGRecordGroup {
  HGRecord radiance;       ///< record used for radiance rays
  HGRecord shadow;         ///< record used for shadow rays
};


/// SBT record for an exception program.
/// Layout: a header of OPTIX_SBT_RECORD_HEADER_SIZE bytes, aligned to
/// OPTIX_SBT_RECORD_ALIGNMENT, followed by a placeholder payload.
struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) ExceptionRecord {
  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  void *data; // dummy value
};

/// SBT record for a raygen program.
/// Layout: a header of OPTIX_SBT_RECORD_HEADER_SIZE bytes, aligned to
/// OPTIX_SBT_RECORD_ALIGNMENT, followed by a placeholder payload.
struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) RaygenRecord {
  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  void *data; // dummy value
};

/// SBT record for a miss program.
/// Layout: a header of OPTIX_SBT_RECORD_HEADER_SIZE bytes, aligned to
/// OPTIX_SBT_RECORD_ALIGNMENT, followed by a placeholder payload.
struct __align__( OPTIX_SBT_RECORD_ALIGNMENT ) MissRecord {
  __align__( OPTIX_SBT_RECORD_ALIGNMENT ) char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  void *data; // dummy value
};


/// Tachyon OptiX global launch parameter structure containing the active
/// camera, framebuffer, materials, and any global scene parameters required
/// for shading.
/// The structure is grouped into frame, scene, lights, and cam
/// sub-structures, followed by clipping, material, and sampling settings
/// and the scene traversable handle.
struct tachyonLaunchParams {
  /// Framebuffer and accumulation state
  struct {
    int2 size;                    ///< framebuffer size
    int subframe_index { 0 };     ///< accumulation subframe index
    int update_framebuffer { 0 }; ///< accumulation copyout flag 
    uchar4 *framebuffer;          ///< 8-bit unorm RGBA framebuffer
    float4 *accumulation_buffer;  ///< 32-bit FP RGBA accumulation buffer
    // The ray statistics buffers exist only when TACHYON_RAYSTATS is defined.
    // Since this changes the structure layout, every translation unit that
    // includes this header has to agree on that setting.
#if defined(TACHYON_RAYSTATS)
    uint4 *raystats1_buffer;      ///< x=prim, y=shad-dir, z=shad-ao, w=miss
    uint4 *raystats2_buffer;      ///< x=trans, y=trans-skip, z=?, w=refl
#endif
  } frame;

  /// Background, fog, and global epsilon settings
  struct {
    float3 bg_color;              ///< miss background color
    float3 bg_color_grad_top;     ///< miss background gradient (top)
    float3 bg_color_grad_bot;     ///< miss background gradient (bottom)
    float3 gradient;              ///< miss background gradient up direction
    float  gradient_topval;       ///< miss background gradient top value
    float  gradient_botval;       ///< miss background gradient bottom value
    float  gradient_invrange;     ///< miss background gradient inverse range
    float  gradient_noisemag;     ///< miss background gradient noise magnitude
    int    fog_mode;              ///< fog type (or off)
    float  fog_start;             ///< radial/linear fog start distance
    float  fog_end;               ///< radial/linear fog end/max distance
    float  fog_density;           ///< exponential fog density
    float  epsilon;               ///< global epsilon value
  } scene;

  /// Shadow, ambient occlusion (AO), and light list settings
  struct {
    int shadows_enabled;          ///< global shadow flag (NOTE(review): presumably an RtShadowMode value -- confirm)
    int ao_samples;               ///< number of AO samples per AA ray
    float ao_ambient;             ///< AO ambient factor
    float ao_direct;              ///< AO direct lighting scaling factor
    float ao_maxdist;             ///< AO maximum occlusion distance
    int headlight_mode;           ///< Extra VR camera-located headlight
    int num_dir_lights;           ///< directional light count
    float3 *dir_lights;           ///< list of directional light directions
    int num_pos_lights;           ///< positional light count
    float3 *pos_lights;           ///< list of positional light positions
  } lights;

  /// Camera position, basis, stereo, and depth-of-field settings
  struct {
    float3 pos;                   ///< camera position
    float3 U;                     ///< camera orthonormal U (right) axis
    float3 V;                     ///< camera orthonormal V (up) axis
    float3 W;                     ///< camera orthonormal W (view) axis
    float zoom;                   ///< camera zoom factor 
    float stereo_eyesep;          ///< stereo eye separation, in world coords
    float stereo_convergence_dist; ///< stereo convergence distance (world)
    int   dof_enabled;            ///< DoF (defocus blur) on/off
    float dof_aperture_rad;       ///< DoF (defocus blur) aperture radius
    float dof_focal_dist;         ///< DoF focal plane distance
  } cam;

  // VR HMD fade+clipping plane/sphere
  int clipview_mode;              ///< VR clipping view on/off
  float clipview_start;           ///< clipping sphere/plane start coord
  float clipview_end;             ///< clipping sphere/plane end coord

  rt_material *materials;         ///< device memory material array

  int max_depth;                  ///< global max ray tracing recursion depth
  int max_trans;                  ///< max transparent surface crossing count
  int aa_samples;                 ///< AA samples per launch
//  int accum_count { 0 };          // not currently used
  float accumulation_normalization_factor; ///< accum buffer normalization multiplier

  OptixTraversableHandle traversable; ///< global OptiX scene traversable handle
};


//
// Methods for packing normals into a 4-byte quantity, such as a
// [u]int or [u]char4, and similar.  See JCGT article by Cigolle et al.,
// "A Survey of Efficient Representations for Independent Unit Vectors",
// J. Computer Graphics Techniques 3(2), 2014.
// http://jcgt.org/published/0003/02/01/
//

#if 1

//
// oct32: 32-bit octahedral normal encoding using [su]norm16x2 quantization
// Meyer et al., "On Floating Point Normal Vectors", In Proc. 21st
// Eurographics Conference on Rendering.
//   http://dx.doi.org/10.1111/j.1467-8659.2010.01737.x
// Others:
// https://twitter.com/Stubbesaurus/status/937994790553227264
// https://knarkowicz.wordpress.com/2014/04/16/octahedron-normal-vector-encoding
//
/// Decode a 2-D octahedral projection back into a 3-D direction.
/// x and y are taken directly from the projection and z is
/// 1 - (|x| + |y|).  A negative z marks the lower hemisphere, in which case
/// x and y are unfolded while keeping their original signs.
/// The returned vector is not renormalized here.
static __host__ __device__ __inline__ float3 OctDecode(float2 projected) {
  const float absx = fabsf(projected.x);
  const float absy = fabsf(projected.y);

  float3 n;
  n.x = projected.x;
  n.y = projected.y;
  n.z = 1.0f - (absx + absy);

  if (n.z < 0.0f) {
    // lower hemisphere: unfold the outer triangles of the octahedron map
    n.x = copysignf(1.0f - absy, projected.x);
    n.y = copysignf(1.0f - absx, projected.y);
  }

  return n;
}

//
// XXX TODO: implement a high-precision OctPEncode() variant, based on
//           floored snorms and an error minimization scheme using a
//           comparison of internally decoded values for least error
//

/// Encode a 3-D direction as a 2-D octahedral projection.
/// The vector is scaled by the reciprocal of its L1 norm; for directions
/// with negative z the x/y coordinates are folded outward while keeping the
/// signs of the input x and y.  No check is made for a zero-length input.
static __host__ __device__ __inline__ float2 OctEncode(float3 n) {
  const float l1norm = fabsf(n.x) + fabsf(n.y) + fabsf(n.z);
  const float invL1Norm = 1.0f / l1norm;

  if (n.z < 0.0f) {
    // lower hemisphere: fold over the diagonals, then restore the signs
    float2 folded = 1.0f - make_float2(fabsf(n.y), fabsf(n.x)) * invL1Norm;
    folded.x = copysignf(folded.x, n.x);
    folded.y = copysignf(folded.y, n.y);
    return folded;
  }

  // upper hemisphere: plain projection onto the octahedron
  return make_float2(n.x, n.y) * invL1Norm;
}


/// Quantize a float2 with components in [-1,1] into two unorm16 values
/// packed in one 32-bit word: x in the low 16 bits, y in the high 16 bits.
///
/// Each component is remapped to [0,1] and clamped before quantization, so
/// that inputs slightly outside [-1,1] (e.g. from round-off) can neither
/// spill from the low field into the high field nor trigger a negative
/// float-to-unsigned conversion.  Quantization rounds to the nearest
/// representable level.
static __host__ __device__ __inline__ uint convfloat2uint32(float2 f2) {
  // remap [-1,1] -> [0,1] and clamp; a NaN input is mapped to 0 by fmaxf()
  const float ux = fminf(fmaxf(f2.x * 0.5f + 0.5f, 0.0f), 1.0f);
  const float uy = fminf(fmaxf(f2.y * 0.5f + 0.5f, 0.0f), 1.0f);

  // round to nearest unorm16 level
  const uint qx = (uint) (ux * 65535.0f + 0.5f);
  const uint qy = (uint) (uy * 65535.0f + 0.5f);

  return (qx & 0x0000ffffu) | ((qy & 0x0000ffffu) << 16);
}

/// Expand two unorm16 values packed in a 32-bit word (x in the low 16 bits,
/// y in the high 16 bits) into a float2 with components in [-1,1].
static __host__ __device__ __inline__ float2 convuint32float2(uint packed) {
  const uint lo16 = (packed      ) & 0x0000ffff;
  const uint hi16 = (packed >> 16) & 0x0000ffff;

  // unorm16 -> [0,1]
  const float2 unit = make_float2((float) lo16 / 65535, (float) hi16 / 65535);

  // [0,1] -> [-1,1]
  return unit * 2.0f - 1.0f;
}


/// Pack a normal into 32 bits: octahedral projection followed by
/// quantization of the two projected coordinates to 16 bits each.
static __host__ __device__ __inline__ uint packNormal(const float3& normal) {
  return convfloat2uint32(OctEncode(normal));
}

/// Unpack a 32-bit packed normal: expand the two 16-bit fields to floats
/// and decode the octahedral projection.  The result is returned exactly as
/// produced by OctDecode().
static __host__ __device__ __inline__ float3 unpackNormal(uint packed) {
  return OctDecode(convuint32float2(packed));
}

#elif 0

//
// snorm10x3: signed 10-bit-per-component scalar unit real representation
// Better representation than unorm.
// Supported by most fixed-function graphics hardware.
// https://www.khronos.org/registry/OpenGL/extensions/EXT/EXT_texture_snorm.txt
//   i=round(clamp(r,-1,1) * (2^(b-1) - 1)
//   r=clamp(i/(2^(b-1) - 1), -1, 1)
//

#elif 1

// OpenGL GLbyte signed quantization scheme
//   i = (r * (2^b - 1) - 1) / 2
//   r = (2i + 1)/(2^b - 1)
// NOTE(review): this variant sits in an "#elif 1" branch that follows an
// "#if 1", so it is not compiled as the file stands.  It relies on a
// "float3 - float" operator that is not among the operators visible in this
// header -- confirm it is available before enabling this branch.
static __host__ __device__ __inline__ uint packNormal(const float3& normal) {
  // conversion to GLbyte format, Table 2.6, p. 44 of OpenGL spec 1.2.1
  // with b=8 the formula above becomes 127.5*r - 0.5
  const float3 N = normal * 127.5f - 0.5f;
  // float -> char conversion of each component; the fourth byte is zero
  const char4 packed = make_char4(N.x, N.y, N.z, 0);
  // reinterpret the four bytes of the char4 as one 32-bit word
  return *((uint *) &packed);
}

// Inverse of the GLbyte packNormal() variant: expands the three low bytes
// of the packed word back into a float3.
// NOTE(review): like its packNormal() counterpart this code is in an
// inactive preprocessor branch.  It relies on a "float3 + float" operator
// that is not among the operators visible in this header -- confirm it is
// available before enabling this branch.
static __host__ __device__ __inline__ float3 unpackNormal(uint packed) {
  // reinterpret the 32-bit word as four signed bytes
  char4 c4norm = *((char4 *) &packed);

  // conversion from GLbyte format, Table 2.6, p. 44 of OpenGL spec 1.2.1
  // float = (2c+1)/(2^8-1)
  const float ci2f = 1.0f / 255.0f;   // the "+1" term: 1/255
  const float cn2f = 1.0f / 127.5f;   // the "2c" term: 2/255
  float3 N = c4norm * cn2f + ci2f;

  return N;
}

#endif


#endif
