eigen/unsupported/TensorRandom_8h_source.html

 // This file is part of Eigen, a lightweight C++ template library

 // for linear algebra.

 //

 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>

 // Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.

 //

 // This Source Code Form is subject to the terms of the Mozilla

 // Public License v. 2.0. If a copy of the MPL was not distributed

 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.


 #ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

 #define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H


 #include "./InternalHeaderCheck.h"


 namespace Eigen {

 namespace internal {


 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() {

 #if defined(EIGEN_GPU_COMPILE_PHASE)

   // We don't support 3d kernels since we currently only use 1 and

   // 2d kernels.

   gpu_assert(threadIdx.z == 0);

   return blockIdx.x * blockDim.x + threadIdx.x

          + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);

 #else

   // Rely on Eigen's random implementation.

   return random<uint64_t>();

 #endif

 }


 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {

   // TODO: Unify with the implementation in the non blocking thread pool.

   uint64_t current = *state;

   // Update the internal state

   *state = current * 6364136223846793005ULL + (stream << 1 | 1);

   // Generate the random output (using the PCG-XSH-RS scheme)

   return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));

 }


 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {

   seed = seed ? seed : get_random_seed();

   return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;

 }


 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

 T RandomToTypeUniform(uint64_t* state, uint64_t stream) {

   unsigned rnd = PCG_XSH_RS_generator(state, stream);

   return static_cast<T>(rnd);

 }


 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

 Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {

   // Generate 10 random bits for the mantissa, merge with exponent.

   unsigned rnd = PCG_XSH_RS_generator(state, stream);

   const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);

   Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);

   // Return the final result

   return result - Eigen::half(1.0f);

 }


 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

 Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {


   // Generate 7 random bits for the mantissa, merge with exponent.

   unsigned rnd = PCG_XSH_RS_generator(state, stream);

   const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);

   Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);

   // Return the final result

   return result - Eigen::bfloat16(1.0f);

 }


 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

 float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {

   typedef union {

     uint32_t raw;

     float fp;

   } internal;

   internal result;

   // Generate 23 random bits for the mantissa mantissa

   const unsigned rnd = PCG_XSH_RS_generator(state, stream);

   result.raw = rnd & 0x7fffffu;

   // Set the exponent

   result.raw |= (static_cast<uint32_t>(127) << 23);

   // Return the final result

   return result.fp - 1.0f;

 }


 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

 double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {

   typedef union {

     uint64_t raw;

     double dp;

   } internal;

   internal result;

   result.raw = 0;

   // Generate 52 random bits for the mantissa

   // First generate the upper 20 bits

   unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;

   // The generate the lower 32 bits

   unsigned rnd2 = PCG_XSH_RS_generator(state, stream);

   result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;

   // Set the exponent

   result.raw |= (static_cast<uint64_t>(1023) << 52);

   // Return the final result

   return result.dp - 1.0;

 }


 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

 std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {

   return std::complex<float>(RandomToTypeUniform<float>(state, stream),

                              RandomToTypeUniform<float>(state, stream));

 }

 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

 std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {

   return std::complex<double>(RandomToTypeUniform<double>(state, stream),

                               RandomToTypeUniform<double>(state, stream));

 }


 template <typename T> class UniformRandomGenerator {

  public:

   static constexpr bool PacketAccess = true;


   // Uses the given "seed" if non-zero, otherwise uses a random seed.

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(

       uint64_t seed = 0) {

     m_state = PCG_XSH_RS_state(seed);

     #ifdef EIGEN_USE_SYCL

     // In SYCL it is not possible to build PCG_XSH_RS_state in one step.

     // Therefore, we need two steps to initializate the m_state.

     // IN SYCL, the constructor of the functor is s called on the CPU

     // and we get the clock seed here from the CPU. However, This seed is

     //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.

     // and only  available on the Operator() function (which is called on the GPU).

     // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread

     // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds

     // the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction

     // similar to CUDA Therefore, the thread Id injection is not available at this stage.

     //However when the operator() is called the thread ID will be available. So inside the opeator,

     // we add the thrreadID, BlockId,... (which is equivalent of i)

     //to the seed and construct the unique m_state per thead similar to cuda.

     m_exec_once =false;

    #endif

   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(

       const UniformRandomGenerator& other) {

     m_state = other.m_state;

     #ifdef EIGEN_USE_SYCL

      m_exec_once =other.m_exec_once;

     #endif

   }


   template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

   T operator()(Index i) const {

     #ifdef EIGEN_USE_SYCL

       if(!m_exec_once) {

       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread

       // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side

        m_state += (i * 6364136223846793005ULL);

        m_exec_once =true;

       }

     #endif

     T result = RandomToTypeUniform<T>(&m_state, i);

     return result;

   }


   template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

   Packet packetOp(Index i) const {

     const int packetSize = internal::unpacket_traits<Packet>::size;

     EIGEN_ALIGN_MAX T values[packetSize];

       #ifdef EIGEN_USE_SYCL

       if(!m_exec_once) {

       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread

        m_state += (i * 6364136223846793005ULL);

        m_exec_once =true;

       }

     #endif

     EIGEN_UNROLL_LOOP

     for (int j = 0; j < packetSize; ++j) {

       values[j] = RandomToTypeUniform<T>(&m_state, i);

     }

     return internal::pload<Packet>(values);

   }


  private:

   mutable uint64_t m_state;

   #ifdef EIGEN_USE_SYCL

   mutable bool m_exec_once;

   #endif

 };


 template <typename Scalar>

 struct functor_traits<UniformRandomGenerator<Scalar> > {

   enum {

     // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).

     Cost = 12 * NumTraits<Scalar>::AddCost *

            ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),

     PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess

   };

 };


 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

 T RandomToTypeNormal(uint64_t* state, uint64_t stream) {

   // Use the ratio of uniform method to generate numbers following a normal

   // distribution. See for example Numerical Recipes chapter 7.3.9 for the

   // details.

   T u, v, q;

   do {

     u = RandomToTypeUniform<T>(state, stream);

     v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));

     const T x = u - T(0.449871);

     const T y = numext::abs(v) + T(0.386595);

     q = x*x + y * (T(0.196)*y - T(0.25472)*x);

   } while (q > T(0.27597) &&

            (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));


   return v/u;

 }


 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

 std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {

   return std::complex<float>(RandomToTypeNormal<float>(state, stream),

                              RandomToTypeNormal<float>(state, stream));

 }

 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

 std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {

   return std::complex<double>(RandomToTypeNormal<double>(state, stream),

                               RandomToTypeNormal<double>(state, stream));

 }


 template <typename T> class NormalRandomGenerator {

  public:

   static constexpr bool PacketAccess = true;


   // Uses the given "seed" if non-zero, otherwise uses a random seed.

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {

     m_state = PCG_XSH_RS_state(seed);

     #ifdef EIGEN_USE_SYCL

     // In SYCL it is not possible to build PCG_XSH_RS_state in one step.

     // Therefore, we need two steps to initializate the m_state.

     // IN SYCL, the constructor of the functor is s called on the CPU

     // and we get the clock seed here from the CPU. However, This seed is

     //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.

     // and only  available on the Operator() function (which is called on the GPU).

     // Therefore, the thread Id injection is not available at this stage. However when the operator()

     //is called the thread ID will be available. So inside the operator,

     // we add the thrreadID, BlockId,... (which is equivalent of i)

     //to the seed and construct the unique m_state per thead similar to cuda.

     m_exec_once =false;

    #endif

   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(

       const NormalRandomGenerator& other) {

     m_state = other.m_state;

 #ifdef EIGEN_USE_SYCL

     m_exec_once=other.m_exec_once;

 #endif

   }


  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

   T operator()(Index i) const {

     #ifdef EIGEN_USE_SYCL

     if(!m_exec_once) {

       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread

       m_state += (i * 6364136223846793005ULL);

       m_exec_once =true;

     }

     #endif

     T result = RandomToTypeNormal<T>(&m_state, i);

     return result;

   }


   template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE

   Packet packetOp(Index i) const {

     const int packetSize = internal::unpacket_traits<Packet>::size;

     EIGEN_ALIGN_MAX T values[packetSize];

     #ifdef EIGEN_USE_SYCL

     if(!m_exec_once) {

       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread

       m_state += (i * 6364136223846793005ULL);

       m_exec_once =true;

     }

     #endif

     EIGEN_UNROLL_LOOP

     for (int j = 0; j < packetSize; ++j) {

       values[j] = RandomToTypeNormal<T>(&m_state, i);

     }

     return internal::pload<Packet>(values);

   }


  private:

   mutable uint64_t m_state;

    #ifdef EIGEN_USE_SYCL

   mutable bool m_exec_once;

   #endif

 };


 template <typename Scalar>

 struct functor_traits<NormalRandomGenerator<Scalar> > {

   enum {

     // On average, we need to generate about 3 random numbers

     // 15 mul, 8 add, 1.5 logs

     Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +

            15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +

            3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,

     PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess

   };

 };


 } // end namespace internal

 } // end namespace Eigen


 #endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

v
Array< int, Dynamic, 1 > v

i
int i

EIGEN_ALIGN_MAX
#define EIGEN_ALIGN_MAX

operator()
IndexedView_or_VectorBlock operator()(const Indices &indices)

EIGEN_UNROLL_LOOP
#define EIGEN_UNROLL_LOOP

EIGEN_DEVICE_FUNC
#define EIGEN_DEVICE_FUNC

gpu_assert
#define gpu_assert(x)
Definition: Tensor:70

Eigen::Triplet

Eigen::internal::RandomToTypeUniform
T RandomToTypeUniform(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:47

Eigen::internal::PCG_XSH_RS_generator
unsigned PCG_XSH_RS_generator(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:32

Eigen::internal::y
const Scalar & y

Eigen::internal::RandomToTypeUniform< double >
double RandomToTypeUniform< double >(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:91

Eigen::internal::RandomToTypeNormal
T RandomToTypeNormal(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:206

Eigen::internal::RandomToTypeUniform< float >
float RandomToTypeUniform< float >(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:75

Eigen::internal::get_random_seed
uint64_t get_random_seed()
Definition: TensorRandom.h:19

Eigen::internal::PCG_XSH_RS_state
uint64_t PCG_XSH_RS_state(uint64_t seed)
Definition: TensorRandom.h:41

uint16_t
std::uint16_t uint16_t

uint32_t
std::uint32_t uint32_t

Eigen::numext::x
const Scalar & x
Definition: SpecialFunctionsImpl.h:1997

uint64_t
std::uint64_t uint64_t

Eigen::numext::abs
EIGEN_ALWAYS_INLINE std::enable_if_t< NumTraits< T >::IsSigned||NumTraits< T >::IsComplex, typename NumTraits< T >::Real > abs(const T &x)

Eigen::numext::q
const Scalar & q
Definition: SpecialFunctionsImpl.h:1991

Eigen::numext::log
EIGEN_ALWAYS_INLINE T log(const T &x)

Eigen
: TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend

Index
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index

internal

InternalHeaderCheck.h

Eigen::GenericNumTraits::AddCost
AddCost

Eigen::bfloat16

Eigen::half

j
std::ptrdiff_t j