TensorRandom.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 // Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 #ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
12 #define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
13 
14 #include "./InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 namespace internal {
18 
20 #if defined(EIGEN_GPU_COMPILE_PHASE)
21  // We don't support 3d kernels since we currently only use 1 and
22  // 2d kernels.
23  gpu_assert(threadIdx.z == 0);
24  return blockIdx.x * blockDim.x + threadIdx.x
25  + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
26 #else
27  // Rely on Eigen's random implementation.
28  return random<uint64_t>();
29 #endif
30 }
31 
32 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
33  // TODO: Unify with the implementation in the non blocking thread pool.
34  uint64_t current = *state;
35  // Update the internal state
36  *state = current * 6364136223846793005ULL + (stream << 1 | 1);
37  // Generate the random output (using the PCG-XSH-RS scheme)
38  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
39 }
40 
42  seed = seed ? seed : get_random_seed();
43  return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
44 }
45 
46 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
48  unsigned rnd = PCG_XSH_RS_generator(state, stream);
49  return static_cast<T>(rnd);
50 }
51 
52 
53 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
54 Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
55  // Generate 10 random bits for the mantissa, merge with exponent.
56  unsigned rnd = PCG_XSH_RS_generator(state, stream);
57  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
58  Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
59  // Return the final result
60  return result - Eigen::half(1.0f);
61 }
62 
63 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
64 Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
65 
66  // Generate 7 random bits for the mantissa, merge with exponent.
67  unsigned rnd = PCG_XSH_RS_generator(state, stream);
68  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
69  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
70  // Return the final result
71  return result - Eigen::bfloat16(1.0f);
72 }
73 
74 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
76  typedef union {
77  uint32_t raw;
78  float fp;
79  } internal;
80  internal result;
81  // Generate 23 random bits for the mantissa mantissa
82  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
83  result.raw = rnd & 0x7fffffu;
84  // Set the exponent
85  result.raw |= (static_cast<uint32_t>(127) << 23);
86  // Return the final result
87  return result.fp - 1.0f;
88 }
89 
90 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
92  typedef union {
93  uint64_t raw;
94  double dp;
95  } internal;
96  internal result;
97  result.raw = 0;
98  // Generate 52 random bits for the mantissa
99  // First generate the upper 20 bits
100  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
101  // The generate the lower 32 bits
102  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
103  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
104  // Set the exponent
105  result.raw |= (static_cast<uint64_t>(1023) << 52);
106  // Return the final result
107  return result.dp - 1.0;
108 }
109 
110 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
111 std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
112  return std::complex<float>(RandomToTypeUniform<float>(state, stream),
113  RandomToTypeUniform<float>(state, stream));
114 }
115 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
116 std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
117  return std::complex<double>(RandomToTypeUniform<double>(state, stream),
118  RandomToTypeUniform<double>(state, stream));
119 }
120 
121 template <typename T> class UniformRandomGenerator {
122  public:
123  static constexpr bool PacketAccess = true;
124 
125  // Uses the given "seed" if non-zero, otherwise uses a random seed.
126  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
127  uint64_t seed = 0) {
128  m_state = PCG_XSH_RS_state(seed);
129  #ifdef EIGEN_USE_SYCL
130  // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
131  // Therefore, we need two steps to initializate the m_state.
132  // IN SYCL, the constructor of the functor is s called on the CPU
133  // and we get the clock seed here from the CPU. However, This seed is
134  //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
135  // and only available on the Operator() function (which is called on the GPU).
136  // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
137  // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
138  // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
139  // similar to CUDA Therefore, the thread Id injection is not available at this stage.
140  //However when the operator() is called the thread ID will be available. So inside the opeator,
141  // we add the thrreadID, BlockId,... (which is equivalent of i)
142  //to the seed and construct the unique m_state per thead similar to cuda.
143  m_exec_once =false;
144  #endif
145  }
146  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
147  const UniformRandomGenerator& other) {
148  m_state = other.m_state;
149  #ifdef EIGEN_USE_SYCL
150  m_exec_once =other.m_exec_once;
151  #endif
152  }
153 
154  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
155  T operator()(Index i) const {
156  #ifdef EIGEN_USE_SYCL
157  if(!m_exec_once) {
158  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
159  // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
160  m_state += (i * 6364136223846793005ULL);
161  m_exec_once =true;
162  }
163  #endif
164  T result = RandomToTypeUniform<T>(&m_state, i);
165  return result;
166  }
167 
168  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
169  Packet packetOp(Index i) const {
170  const int packetSize = internal::unpacket_traits<Packet>::size;
171  EIGEN_ALIGN_MAX T values[packetSize];
172  #ifdef EIGEN_USE_SYCL
173  if(!m_exec_once) {
174  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
175  m_state += (i * 6364136223846793005ULL);
176  m_exec_once =true;
177  }
178  #endif
180  for (int j = 0; j < packetSize; ++j) {
181  values[j] = RandomToTypeUniform<T>(&m_state, i);
182  }
183  return internal::pload<Packet>(values);
184  }
185 
186  private:
187  mutable uint64_t m_state;
188  #ifdef EIGEN_USE_SYCL
189  mutable bool m_exec_once;
190  #endif
191 };
192 
193 template <typename Scalar>
194 struct functor_traits<UniformRandomGenerator<Scalar> > {
195  enum {
196  // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
197  Cost = 12 * NumTraits<Scalar>::AddCost *
198  ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
199  PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
200  };
201 };
202 
203 
204 
205 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
207  // Use the ratio of uniform method to generate numbers following a normal
208  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
209  // details.
210  T u, v, q;
211  do {
212  u = RandomToTypeUniform<T>(state, stream);
213  v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
214  const T x = u - T(0.449871);
215  const T y = numext::abs(v) + T(0.386595);
216  q = x*x + y * (T(0.196)*y - T(0.25472)*x);
217  } while (q > T(0.27597) &&
218  (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
219 
220  return v/u;
221 }
222 
223 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
224 std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
225  return std::complex<float>(RandomToTypeNormal<float>(state, stream),
226  RandomToTypeNormal<float>(state, stream));
227 }
228 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
229 std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
230  return std::complex<double>(RandomToTypeNormal<double>(state, stream),
231  RandomToTypeNormal<double>(state, stream));
232 }
233 
234 
235 template <typename T> class NormalRandomGenerator {
236  public:
237  static constexpr bool PacketAccess = true;
238 
239  // Uses the given "seed" if non-zero, otherwise uses a random seed.
240  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
241  m_state = PCG_XSH_RS_state(seed);
242  #ifdef EIGEN_USE_SYCL
243  // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
244  // Therefore, we need two steps to initializate the m_state.
245  // IN SYCL, the constructor of the functor is s called on the CPU
246  // and we get the clock seed here from the CPU. However, This seed is
247  //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
248  // and only available on the Operator() function (which is called on the GPU).
249  // Therefore, the thread Id injection is not available at this stage. However when the operator()
250  //is called the thread ID will be available. So inside the operator,
251  // we add the thrreadID, BlockId,... (which is equivalent of i)
252  //to the seed and construct the unique m_state per thead similar to cuda.
253  m_exec_once =false;
254  #endif
255  }
256  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
257  const NormalRandomGenerator& other) {
258  m_state = other.m_state;
259 #ifdef EIGEN_USE_SYCL
260  m_exec_once=other.m_exec_once;
261 #endif
262  }
263 
264  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
265  T operator()(Index i) const {
266  #ifdef EIGEN_USE_SYCL
267  if(!m_exec_once) {
268  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
269  m_state += (i * 6364136223846793005ULL);
270  m_exec_once =true;
271  }
272  #endif
273  T result = RandomToTypeNormal<T>(&m_state, i);
274  return result;
275  }
276 
277  template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
278  Packet packetOp(Index i) const {
279  const int packetSize = internal::unpacket_traits<Packet>::size;
280  EIGEN_ALIGN_MAX T values[packetSize];
281  #ifdef EIGEN_USE_SYCL
282  if(!m_exec_once) {
283  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
284  m_state += (i * 6364136223846793005ULL);
285  m_exec_once =true;
286  }
287  #endif
289  for (int j = 0; j < packetSize; ++j) {
290  values[j] = RandomToTypeNormal<T>(&m_state, i);
291  }
292  return internal::pload<Packet>(values);
293  }
294 
295  private:
296  mutable uint64_t m_state;
297  #ifdef EIGEN_USE_SYCL
298  mutable bool m_exec_once;
299  #endif
300 };
301 
302 
303 template <typename Scalar>
304 struct functor_traits<NormalRandomGenerator<Scalar> > {
305  enum {
306  // On average, we need to generate about 3 random numbers
307  // 15 mul, 8 add, 1.5 logs
308  Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
310  3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
311  PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
312  };
313 };
314 
315 
316 } // end namespace internal
317 } // end namespace Eigen
318 
319 #endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
Array< int, Dynamic, 1 > v
int i
#define EIGEN_ALIGN_MAX
IndexedView_or_VectorBlock operator()(const Indices &indices)
#define EIGEN_UNROLL_LOOP
#define EIGEN_DEVICE_FUNC
#define gpu_assert(x)
Definition: Tensor:70
T RandomToTypeUniform(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:47
unsigned PCG_XSH_RS_generator(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:32
const Scalar & y
double RandomToTypeUniform< double >(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:91
T RandomToTypeNormal(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:206
float RandomToTypeUniform< float >(uint64_t *state, uint64_t stream)
Definition: TensorRandom.h:75
uint64_t get_random_seed()
Definition: TensorRandom.h:19
uint64_t PCG_XSH_RS_state(uint64_t seed)
Definition: TensorRandom.h:41
std::uint16_t uint16_t
std::uint32_t uint32_t
std::uint64_t uint64_t
EIGEN_ALWAYS_INLINE std::enable_if_t< NumTraits< T >::IsSigned||NumTraits< T >::IsComplex, typename NumTraits< T >::Real > abs(const T &x)
EIGEN_ALWAYS_INLINE T log(const T &x)
: TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
std::ptrdiff_t j