TensorReductionGpu.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
12 
13 #include "./InternalHeaderCheck.h"
14 
15 namespace Eigen {
16 namespace internal {
17 
18 
19 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
20 // Full reducers for GPU, don't vectorize for now
21 
22 // Reducer function that enables multiple GPU threads to safely accumulate at the same
23 // output address. It reads the current value of the output variable and attempts to
24 // update it with the new value. If, in the meantime, another GPU thread has updated the
25 // content of the output address, it tries again.
26 template <typename T, typename R>
27 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
28 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
29  if (sizeof(T) == 4)
30  {
31  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
32  unsigned int newval = oldval;
33  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
34  if (newval == oldval) {
35  return;
36  }
37  unsigned int readback;
38  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
39  oldval = readback;
40  newval = oldval;
41  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
42  if (newval == oldval) {
43  return;
44  }
45  }
46  }
47  else if (sizeof(T) == 8) {
48  unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
49  unsigned long long newval = oldval;
50  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
51  if (newval == oldval) {
52  return;
53  }
54  unsigned long long readback;
55  while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
56  oldval = readback;
57  newval = oldval;
58  reducer.reduce(accum, reinterpret_cast<T*>(&newval));
59  if (newval == oldval) {
60  return;
61  }
62  }
63  }
64  else {
65  gpu_assert(0 && "Wordsize not supported");
66  }
67 #else // EIGEN_CUDA_ARCH >= 300
68  EIGEN_UNUSED_VARIABLE(output);
69  EIGEN_UNUSED_VARIABLE(accum);
70  EIGEN_UNUSED_VARIABLE(reducer);
71  gpu_assert(0 && "Shouldn't be called on unsupported device");
72 #endif // EIGEN_CUDA_ARCH >= 300
73 }
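// Illustrative sketch (not part of this file): the same compare-and-swap retry
// pattern, written out by hand for a float max. `atomicMaxFloat` is a
// hypothetical helper name; the float's bit pattern is reinterpreted as an
// unsigned int so that atomicCAS can be used, exactly as atomicReduce does above.
__device__ inline void atomicMaxFloat(float* output, float value) {
  unsigned int* addr = reinterpret_cast<unsigned int*>(output);
  unsigned int old_bits = *addr;
  while (true) {
    float current = __uint_as_float(old_bits);
    float updated = fmaxf(current, value);           // the "reduce" step
    if (updated == current) return;                  // nothing new to publish
    unsigned int readback = atomicCAS(addr, old_bits, __float_as_uint(updated));
    if (readback == old_bits) return;                // CAS succeeded
    old_bits = readback;                             // lost the race, retry
  }
}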
74 
75 // We extend atomicExch to support extra data types
76 template <typename Type>
77 __device__ inline Type atomicExchCustom(Type* address, Type val) {
78  return atomicExch(address, val);
79 }
80 
81 template <>
82 __device__ inline double atomicExchCustom(double* address, double val) {
83  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
84  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
85 }
86 
87 #ifdef EIGEN_HAS_GPU_FP16
88 template <typename R>
89 __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
90  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
91  unsigned int newval = oldval;
92  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
93  if (newval == oldval) {
94  return;
95  }
96  unsigned int readback;
97  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
98  oldval = readback;
99  newval = oldval;
100  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
101  if (newval == oldval) {
102  return;
103  }
104  }
105 }
106 #ifdef EIGEN_GPU_COMPILE_PHASE
107 // The reducer must be associative: the wide-vector reduction is not atomic as a whole,
108 // it is performed as a sequence of atomic half2 sub-reductions.
108 template <typename R>
109 __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
110  half2* houtput=reinterpret_cast<half2*>(output);
111  half2* haccum=reinterpret_cast<half2*>(&accum);
112  for(int i=0;i<4;++i){
113  atomicReduce(houtput+i,*(haccum+i),reducer);
114  }
115 }
116 #endif // EIGEN_GPU_COMPILE_PHASE
117 #endif // EIGEN_HAS_GPU_FP16
118 
119 template <>
120 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
121 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
122  atomicAdd(output, accum);
123 #else // EIGEN_CUDA_ARCH >= 300
124  EIGEN_UNUSED_VARIABLE(output);
125  EIGEN_UNUSED_VARIABLE(accum);
126  gpu_assert(0 && "Shouldn't be called on unsupported device");
127 #endif // EIGEN_CUDA_ARCH >= 300
128 }
129 
130 
131 template <typename CoeffType, typename Index>
132 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
133  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
134  const Index num_threads = blockDim.x * gridDim.x;
135  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
136  output[i] = val;
137  }
138 }
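// Illustrative launch sketch (not part of this file): the grid-stride loop above
// makes ReductionInitKernel correct for any launch geometry. Outside of Eigen's
// LAUNCH_GPU_KERNEL macro, and with hypothetical sizes and names, a plain CUDA
// launch could look like:
//
//   const int block_size = 1024;
//   const int num_blocks = (num_preserved_coeffs + block_size - 1) / block_size;
//   ReductionInitKernel<float, int><<<num_blocks, block_size, 0, stream>>>(
//       0.0f, num_preserved_coeffs, device_output_ptr);
//
// Each thread starts at its global index and advances by the total number of
// launched threads, so every output slot is written exactly once.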
139 
140 
141 template <int BlockSize, int NumPerThread, typename Self,
142  typename Reducer, typename Index>
143 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
144  typename Self::CoeffReturnType* output, unsigned int* semaphore) {
145 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
146  // Initialize the output value
147  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
148  if (gridDim.x == 1) {
149  if (first_index == 0) {
150  *output = reducer.initialize();
151  }
152  }
153  else {
154  if (threadIdx.x == 0) {
155  unsigned int block = atomicCAS(semaphore, 0u, 1u);
156  if (block == 0) {
157  // We're the first block to run, initialize the output value
158  atomicExchCustom(output, reducer.initialize());
159  __threadfence();
160  atomicExch(semaphore, 2u);
161  }
162  else {
163  // Wait for the first block to initialize the output value.
164  // Use atomicCAS here to ensure that the reads aren't cached
165  unsigned int val;
166  do {
167  val = atomicCAS(semaphore, 2u, 2u);
168  }
169  while (val < 2u);
170  }
171  }
172  }
173 
174  __syncthreads();
175 
176  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
177 
178  typename Self::CoeffReturnType accum = reducer.initialize();
179  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
180  for (Index i = 0; i < max_iter; i+=BlockSize) {
181  const Index index = first_index + i;
182  eigen_assert(index < num_coeffs);
183  typename Self::CoeffReturnType val = input.m_impl.coeff(index);
184  reducer.reduce(val, &accum);
185  }
186 
187 #pragma unroll
188  for (int offset = warpSize/2; offset > 0; offset /= 2) {
189  #if defined(EIGEN_HIPCC)
190  // Use std::is_floating_point to determine the type of accum.
191  // This is needed because when CoeffReturnType == double, hipcc gives a "call to __shfl_down is ambiguous" error
192  // and lists the float and int versions of __shfl_down as the candidate functions.
193  if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
194  reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
195  } else {
196  reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
197  }
198  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
199  reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
200  #else
201  reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
202  #endif
203  }
204 
205  if ((threadIdx.x & (warpSize - 1)) == 0) {
206  atomicReduce(output, accum, reducer);
207  }
208 
209  if (gridDim.x > 1 && threadIdx.x == 0) {
210  // Let the last block reset the semaphore
211  atomicInc(semaphore, gridDim.x + 1);
212 #if defined(EIGEN_HIPCC)
213  __threadfence_system();
214 #endif
215  }
216 #else // EIGEN_CUDA_ARCH >= 300
217  EIGEN_UNUSED_VARIABLE(reducer);
218  EIGEN_UNUSED_VARIABLE(input);
219  EIGEN_UNUSED_VARIABLE(num_coeffs);
220  EIGEN_UNUSED_VARIABLE(output);
221  EIGEN_UNUSED_VARIABLE(semaphore);
222  gpu_assert(0 && "Shouldn't be called on unsupported device");
223 #endif // EIGEN_CUDA_ARCH >= 300
224 }
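// Illustrative sketch (not part of this file): the shuffle loop above is the
// classic warp-level tree reduction. In isolation, and assuming the CUDA 9+ path
// (__shfl_down_sync; the HIP and older-CUDA branches above use __shfl_down
// instead), a warp-wide float sum looks like this hypothetical helper:
__device__ inline float warpReduceSum(float val) {
  // Halve the shuffle distance each step; after the loop, lane 0 of the warp
  // holds the sum contributed by all 32 lanes.
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    val += __shfl_down_sync(0xFFFFFFFF, val, offset);
  }
  return val;
}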
225 
226 
227 #ifdef EIGEN_HAS_GPU_FP16
228 template <typename Self,
229  typename Reducer, typename Index>
230 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
231  Reducer reducer, const Self input, Index num_coeffs, half* scratch) {
232  eigen_assert(blockDim.x == 1);
233  eigen_assert(gridDim.x == 1);
234  typedef packet_traits<Eigen::half>::type packet_type;
235  Index packet_remainder =
236  num_coeffs % Index(unpacket_traits<packet_type>::size);
237  if (packet_remainder != 0) {
238  half2* h2scratch = reinterpret_cast<half2*>(scratch);
239  for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
240  *h2scratch =
241  __halves2half2(input.coeff(i), input.coeff(i + 1));
242  h2scratch++;
243  }
244  if ((num_coeffs & 1) != 0) {
245  half lastCoeff = input.coeff(num_coeffs - 1);
246  *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
247  }
248  } else {
249  packet_type reduce = reducer.template initializePacket<packet_type>();
250  internal::pstoreu(scratch, reduce);
251  }
252 }
253 
254 template <typename Self,
255  typename Reducer, typename Index>
256 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/, Index num_coeffs, half* output) {
257  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
258  const Index num_threads = blockDim.x * gridDim.x;
259  typedef typename packet_traits<Eigen::half>::type PacketType;
260 
261  const Index num_packets =
262  num_coeffs / Index(unpacket_traits<PacketType>::size);
263  PacketType* p_output = reinterpret_cast<PacketType*>(output);
264  for (Index i = thread_id; i < num_packets; i += num_threads) {
265  p_output[i] = reducer.template initializePacket<PacketType>();
266  }
267  Index packet_remainder =
268  num_coeffs % Index(unpacket_traits<PacketType>::size);
269  if (thread_id < packet_remainder) {
270  output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
271  }
272 }
273 
274 template <int BlockSize, int NumPerThread, typename Self,
275  typename Reducer, typename Index>
276 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(
277  Reducer reducer, const Self input, Index num_coeffs,
278  half* output, half* scratch) {
279  typedef typename packet_traits<Eigen::half>::type PacketType;
280  const int packet_width = unpacket_traits<PacketType>::size;
281  eigen_assert(NumPerThread % packet_width == 0);
282  const Index first_index =
283  blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
284 
285  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
286 
287  if (gridDim.x == 1) {
288  if (first_index == 0) {
289  int rem = num_coeffs % packet_width;
290  if (rem != 0) {
291  half2* p_scratch = reinterpret_cast<half2*>(scratch);
292  pstoreu(scratch, reducer.template initializePacket<PacketType>());
293  for (int i = 0; i < rem / 2; i++) {
294  *p_scratch = __halves2half2(
295  input.coeff(num_coeffs - packet_width + 2 * i),
296  input.coeff(num_coeffs - packet_width + 2 * i + 1));
297  p_scratch++;
298  }
299  if ((num_coeffs & 1) != 0) {
300  half last = input.coeff(num_coeffs - 1);
301  *p_scratch = __halves2half2(last, reducer.initialize());
302  }
303  } else {
304  PacketType reduce = reducer.template initializePacket<PacketType>();
305  pstoreu(scratch, reduce);
306  }
307  }
308  __syncthreads();
309  }
310 
311  PacketType accum = reducer.template initializePacket<PacketType>();
312  const Index max_iter =
313  numext::mini<Index>((num_coeffs - first_index) / packet_width,
314  NumPerThread * BlockSize / packet_width);
315  for (Index i = 0; i < max_iter; i += BlockSize) {
316  const Index index = first_index + packet_width * i;
317  eigen_assert(index + packet_width < num_coeffs);
318  PacketType val = input.template packet<Unaligned>(index);
319  reducer.reducePacket(val, &accum);
320  }
321 
322 #pragma unroll
323  for (int offset = warpSize/2; offset > 0; offset /= 2) {
324  #if defined(EIGEN_HIPCC)
325  PacketType r1;
326  half2* hr = reinterpret_cast<half2*>(&r1);
327  half2* hacc = reinterpret_cast<half2*>(&accum);
328  for (int i = 0; i < packet_width / 2; i++) {
329  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
330  union { int i; half2 h; } wka_in, wka_out;
331  wka_in.h = hacc[i];
332  wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
333  hr[i] = wka_out.h;
334  }
335  reducer.reducePacket(r1, &accum);
336  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
337  PacketType r1;
338  half2* hr = reinterpret_cast<half2*>(&r1);
339  half2* hacc = reinterpret_cast<half2*>(&accum);
340  for (int i = 0; i < packet_width / 2; i++) {
341  hr[i] = __shfl_down(hacc[i], offset, warpSize);
342  }
343  reducer.reducePacket(r1, &accum);
344  #else
345  PacketType r1;
346  half2* hr = reinterpret_cast<half2*>(&r1);
347  half2* hacc = reinterpret_cast<half2*>(&accum);
348  for (int i = 0; i < packet_width / 2; i++) {
349  hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
350  }
351  reducer.reducePacket(r1, &accum);
352 
353  #endif
354  }
355 
356  if ((threadIdx.x & (warpSize - 1)) == 0) {
357  atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
358  }
359 
360  __syncthreads();
361  half2* rv1 = reinterpret_cast<half2*>(scratch);
362  if (packet_width > 2) {
363  reducer.reducePacket(rv1[2], rv1);
364  reducer.reducePacket(rv1[3], rv1 + 1);
365  reducer.reducePacket(rv1[1], rv1);
366  }
367  if (gridDim.x == 1) {
368  if (first_index == 0) {
369  half tmp = __low2half(*rv1);
370  reducer.reduce(__high2half(*rv1), &tmp);
371  *output = tmp;
372  }
373  }
374 }
375 
376 template <typename Op>
377 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
378  eigen_assert(threadIdx.x == 1);
379  typedef packet_traits<Eigen::half>::type packet_type;
380  if (unpacket_traits<packet_type>::size == 1) {
381  *output = *scratch;
382  } else {
383  half2* pscratch = reinterpret_cast<half2*>(scratch);
384  half tmp = __float2half(0.f);
385  for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
386  reducer.reduce(__low2half(*pscratch), &tmp);
387  reducer.reduce(__high2half(*pscratch), &tmp);
388  pscratch++;
389  }
390  *output = tmp;
391  }
392 }
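// Illustrative sketch (not part of this file): the fp16 kernels above move data
// as half2 pairs. __halves2half2(a, b) packs two scalar halves into one 32-bit
// register, and __low2half / __high2half extract them again. The hypothetical
// helper below combines the two lanes of one pair, which is the same final step
// ReductionCleanupKernelHalfFloat performs through the reducer.
__device__ inline half sumHalf2Lanes(half2 packed) {
  // Convert each 16-bit lane to float, add, and convert back to half.
  float lo = __half2float(__low2half(packed));
  float hi = __half2float(__high2half(packed));
  return __float2half(lo + hi);
}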
393 
394 #endif // EIGEN_HAS_GPU_FP16
395 
396 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
397 struct FullReductionLauncher {
398  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
399  gpu_assert(false && "Should only be called on doubles, floats and half floats");
400  }
401 };
402 
403 // Specialization for float and double
404 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
405 struct FullReductionLauncher<
406  Self, Op, OutputType, PacketAccess,
407  std::enable_if_t<
408  internal::is_same<float, OutputType>::value ||
409  internal::is_same<double, OutputType>::value,
410  void>> {
411  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
412 
413  typedef typename Self::Index Index;
414  const int block_size = 256;
415  const int num_per_thread = 128;
416  const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
417 
418  unsigned int* semaphore = NULL;
419  if (num_blocks > 1) {
420  semaphore = device.semaphore();
421  }
422 
423  LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
424  num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
425  }
426 };
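// Illustrative arithmetic (not part of this file): with block_size = 256 and
// num_per_thread = 128, each block covers 256 * 128 = 32768 coefficients. A
// hypothetical full reduction over 1,000,000 coefficients therefore launches
// divup(1000000, 32768) = 31 blocks, and since num_blocks > 1 a semaphore is
// requested so that FullReductionKernel can coordinate output initialization
// across blocks.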
427 
428 #ifdef EIGEN_HAS_GPU_FP16
429 template <typename Self, typename Op>
430 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
431  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
432  gpu_assert(false && "Should not be called since there is no packet accessor");
433  }
434 };
435 
436 template <typename Self, typename Op>
437 struct FullReductionLauncher<Self, Op, Eigen::half, true> {
438  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
439  typedef typename Self::Index Index;
440 
441  const int block_size = 256;
442  const int num_per_thread = 128;
443  const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
444  half* scratch = static_cast<half*>(device.scratchpad());
445 
446  if (num_blocks > 1) {
447  // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
448  // won't be race conditions between multiple thread blocks.
449  LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
450  1, 1, 0, device, reducer, self, num_coeffs, scratch);
451  }
452 
453  LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
454  num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
455 
456  if (num_blocks > 1) {
457  LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
458  1, 1, 0, device, reducer, output, scratch);
459  }
460  }
461 };
462 #endif // EIGEN_HAS_GPU_FP16
463 
464 
465 template <typename Self, typename Op, bool Vectorizable>
466 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
467  // Unfortunately NVIDIA GPUs don't handle exotic types such as complex well,
468  // so restrict the scope of the optimized version of the code to the simple cases
469  // of doubles, floats and half floats.
470 #ifdef EIGEN_HAS_GPU_FP16
471  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
472  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
473  internal::is_same<typename Self::CoeffReturnType, double>::value ||
474  (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
475 #else // EIGEN_HAS_GPU_FP16
476  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
477  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
478  internal::is_same<typename Self::CoeffReturnType, double>::value);
479 #endif // EIGEN_HAS_GPU_FP16
480 
481  template <typename OutputType>
482  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
483  gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
484  const Index num_coeffs = array_prod(self.m_impl.dimensions());
485  // Don't crash when we're called with an input tensor of size 0.
486  if (num_coeffs == 0) {
487  return;
488  }
489 
490  FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
491  }
492 };
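// Illustrative usage sketch (not part of this file; variable names and sizes are
// assumptions): a full reduction of a float tensor on a GpuDevice is what ends up
// dispatching to FullReducer / FullReductionKernel above.
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   float* d_in  = static_cast<float*>(gpu_device.allocate(100 * 100 * sizeof(float)));
//   float* d_out = static_cast<float*>(gpu_device.allocate(sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, 100, 100);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out(d_out);
//   out.device(gpu_device) = in.sum();   // full reduction to a rank-0 tensor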
493 
494 
495 template <int NumPerThread, typename Self,
496  typename Reducer, typename Index>
497 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
498  typename Self::CoeffReturnType* output) {
499 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
500  typedef typename Self::CoeffReturnType Type;
501  eigen_assert(blockDim.y == 1);
502  eigen_assert(blockDim.z == 1);
503  eigen_assert(gridDim.y == 1);
504  eigen_assert(gridDim.z == 1);
505 
506  const int unroll_times = 16;
507  eigen_assert(NumPerThread % unroll_times == 0);
508 
509  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
510  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
511 
512  const Index num_threads = blockDim.x * gridDim.x;
513  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
514 
515  // Initialize the output values if they weren't initialized by the ReductionInitKernel
516  if (gridDim.x == 1) {
517  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
518  output[i] = reducer.initialize();
519  }
520  __syncthreads();
521  }
522 
523  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
524  const Index row = i / input_col_blocks;
525 
526  if (row < num_preserved_coeffs) {
527  const Index col_block = i % input_col_blocks;
528  const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
529 
530  Type reduced_val = reducer.initialize();
531 
532  for (Index j = 0; j < NumPerThread; j += unroll_times) {
533  const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
534  if (last_col >= num_coeffs_to_reduce) {
535  for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
536  const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
537  reducer.reduce(val, &reduced_val);
538  }
539  break;
540  } else {
541  // Faster version of the loop with no branches after unrolling.
542 #pragma unroll
543  for (int k = 0; k < unroll_times; ++k) {
544  const Index col = col_begin + blockDim.x * (j + k);
545  reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
546  }
547  }
548  }
549 
550 #pragma unroll
551  for (int offset = warpSize/2; offset > 0; offset /= 2) {
552  #if defined(EIGEN_HIPCC)
553  // Use std::is_floating_point to determine the type of reduced_val.
554  // This is needed because when Type == double, hipcc gives a "call to __shfl_down is ambiguous" error
555  // and lists the float and int versions of __shfl_down as the candidate functions.
556  if (std::is_floating_point<Type>::value) {
557  reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
558  } else {
559  reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
560  }
561  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
562  reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
563  #else
564  reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
565  #endif
566  }
567 
568  if ((threadIdx.x & (warpSize - 1)) == 0) {
569  atomicReduce(&(output[row]), reduced_val, reducer);
570  }
571  }
572  }
573 #else // EIGEN_CUDA_ARCH >= 300
574  gpu_assert(0 && "Shouldn't be called on unsupported device");
575 #endif // EIGEN_CUDA_ARCH >= 300
576 }
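// Illustrative arithmetic (not part of this file): with blockDim.x = 256 and
// NumPerThread = 128, one column block covers 256 * 128 = 32768 coefficients of
// a single reduced row. For a hypothetical num_coeffs_to_reduce = 100000 this
// gives input_col_blocks = divup(100000, 32768) = 4, so block index i maps to
// row = i / 4 and col_block = i % 4, and each warp's lane 0 publishes its
// partial result into output[row] via atomicReduce.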
577 
578 #ifdef EIGEN_HAS_GPU_FP16
579 
580 template <int NumPerThread, typename Self,
581  typename Reducer, typename Index>
582 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
583  half* output) {
584  eigen_assert(blockDim.y == 1);
585  eigen_assert(blockDim.z == 1);
586  eigen_assert(gridDim.y == 1);
587  eigen_assert(gridDim.z == 1);
588 
589  typedef typename packet_traits<Eigen::half>::type PacketType;
590  const int packet_width = unpacket_traits<PacketType>::size;
591  const int unroll_times = 16 / packet_width;
592  eigen_assert(NumPerThread % unroll_times == 0);
593  eigen_assert(unroll_times % 2 == 0);
594 
595  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
596  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
597 
598  const Index num_threads = blockDim.x * gridDim.x;
599  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
600 
601  // Initialize the output values if they weren't initialized by the ReductionInitKernel
602  if (gridDim.x == 1) {
603  Index i = packet_width * thread_id;
604  for (; i + packet_width <= num_preserved_coeffs;
605  i += packet_width * num_threads) {
606  PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
607  *poutput = reducer.template initializePacket<PacketType>();
608  }
609  if (i < num_preserved_coeffs) {
610  output[i] = reducer.initialize();
611  }
612  __syncthreads();
613  }
614 
615  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
616  const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows
617 
618  if (row + 1 < num_preserved_coeffs) {
619  const Index col_block = i % input_col_blocks;
620  const Index col_begin =
621  packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
622 
623  PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
624  PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
625 
626  for (Index j = 0; j < NumPerThread; j += unroll_times) {
627  const Index last_col =
628  col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
629  if (last_col >= num_coeffs_to_reduce) {
630  Index col = col_begin + blockDim.x * j;
631  for (; col + packet_width <= num_coeffs_to_reduce;
632  col += blockDim.x) {
633  const PacketType val1 = input.m_impl.template packet<Unaligned>(
634  row * num_coeffs_to_reduce + col);
635  reducer.reducePacket(val1, &reduced_val1);
636  const PacketType val2 = input.m_impl.template packet<Unaligned>(
637  (row + 1) * num_coeffs_to_reduce + col);
638  reducer.reducePacket(val2, &reduced_val2);
639  }
640  if (col < num_coeffs_to_reduce) {
641  PacketType r1 = reducer.template initializePacket<PacketType>();
642  PacketType r2 = reducer.template initializePacket<PacketType>();
643  half2* hr1 = reinterpret_cast<half2*>(&r1);
644  half2* hr2 = reinterpret_cast<half2*>(&r2);
645  while (col + 1 < num_coeffs_to_reduce) {
646  *hr1 = __halves2half2(
647  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
648  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
649  *hr2 = __halves2half2(
650  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
651  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
652  1));
653  hr1++;
654  hr2++;
655  col += 2;
656  }
657  if (col < num_coeffs_to_reduce) {
658  // Peel off the last (odd) coefficient.
659  const half last1 =
660  input.m_impl.coeff(row * num_coeffs_to_reduce + col);
661  *hr1 = __halves2half2(last1, reducer.initialize());
662  const half last2 =
663  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
664  *hr2 = __halves2half2(last2, reducer.initialize());
665  }
666  reducer.reducePacket(r1, &reduced_val1);
667  reducer.reducePacket(r2, &reduced_val2);
668  }
669  break;
670  } else {
671  // Faster version of the loop with no branches after unrolling.
672 #pragma unroll
673  for (int k = 0; k < unroll_times; ++k) {
674  const Index col = col_begin + blockDim.x * (j + k) * packet_width;
675  reducer.reducePacket(input.m_impl.template packet<Unaligned>(
676  row * num_coeffs_to_reduce + col),
677  &reduced_val1);
678  reducer.reducePacket(input.m_impl.template packet<Unaligned>(
679  (row + 1) * num_coeffs_to_reduce + col),
680  &reduced_val2);
681  }
682  }
683  }
684 
685 #pragma unroll
686  for (int offset = warpSize/2; offset > 0; offset /= 2) {
687  #if defined(EIGEN_HIPCC)
688  PacketType r1;
689  PacketType r2;
690  half2* hr1 = reinterpret_cast<half2*>(&r1);
691  half2* hr2 = reinterpret_cast<half2*>(&r2);
692  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
693  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
694  for (int i = 0; i < packet_width / 2; i++) {
695  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
696  union { int i; half2 h; } wka_in1, wka_out1;
697  wka_in1.h = rv1[i];
698  wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
699  hr1[i] = wka_out1.h;
700 
701  union { int i; half2 h; } wka_in2, wka_out2;
702  wka_in2.h = rv2[i];
703  wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
704  hr2[i] = wka_out2.h;
705  }
706  reducer.reducePacket(r1, &reduced_val1);
707  reducer.reducePacket(r2, &reduced_val2);
708  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
709  PacketType r1;
710  PacketType r2;
711  half2* hr1 = reinterpret_cast<half2*>(&r1);
712  half2* hr2 = reinterpret_cast<half2*>(&r2);
713  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
714  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
715  for (int i = 0; i < packet_width / 2; i++) {
716  hr1[i] = __shfl_down(rv1[i], offset, warpSize);
717  hr2[i] = __shfl_down(rv2[i], offset, warpSize);
718  }
719  reducer.reducePacket(r1, &reduced_val1);
720  reducer.reducePacket(r2, &reduced_val2);
721  #else
722  PacketType r1;
723  PacketType r2;
724  half2* hr1 = reinterpret_cast<half2*>(&r1);
725  half2* hr2 = reinterpret_cast<half2*>(&r2);
726  half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
727  half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
728  for (int j = 0; j < packet_width / 2; j++) {
729  hr1[j] =
730  __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
731  hr2[j] =
732  __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
733  }
734  reducer.reducePacket(r1, &reduced_val1);
735  reducer.reducePacket(r2, &reduced_val2);
736 
737  #endif
738  }
739  half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
740  half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
741  half2 val;
742  if (packet_width > 2) {
743  reducer.reducePacket(rv1[2], rv1);
744  reducer.reducePacket(rv1[3], rv1 + 1);
745  reducer.reducePacket(rv1[1], rv1);
746  reducer.reducePacket(rv2[2], rv2);
747  reducer.reducePacket(rv2[3], rv2 + 1);
748  reducer.reducePacket(rv2[1], rv2);
749  }
750  half val1 = __low2half(*rv1);
751  reducer.reduce(__high2half(*rv1), &val1);
752  half val2 = __low2half(*rv2);
753  reducer.reduce(__high2half(*rv2), &val2);
754  val = __halves2half2(val1, val2);
755  if ((threadIdx.x & (warpSize - 1)) == 0) {
756  half* loc = output + row;
757  atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
758  }
759  }
760  }
761 }
762 
763 #endif // EIGEN_HAS_GPU_FP16
764 
765 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
766 struct InnerReductionLauncher {
767  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
768  gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
769  return true;
770  }
771 };
772 
773 // Specialization for float and double
774 template <typename Self, typename Op, typename OutputType, bool PacketAccess>
775 struct InnerReductionLauncher<
776  Self, Op, OutputType, PacketAccess,
777  std::enable_if_t<
778  internal::is_same<float, OutputType>::value ||
779  internal::is_same<double, OutputType>::value,
780  void>> {
781  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
782  typedef typename Self::Index Index;
783 
784  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
785  const int block_size = 256;
786  const int num_per_thread = 128;
787  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
788  const int max_blocks = device.getNumGpuMultiProcessors() *
789  device.maxGpuThreadsPerMultiProcessor() / block_size;
790  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
791 
792  if (num_blocks > 1) {
793  // We initialize the outputs outside the reduction kernel when we can't be sure that there
794  // won't be race conditions between multiple thread blocks.
795  const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
796  const int max_blocks2 = device.getNumGpuMultiProcessors() *
797  device.maxGpuThreadsPerMultiProcessor() / 1024;
798  const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
799  LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
800  num_blocks2, 1024, 0, device, reducer.initialize(),
801  num_preserved_vals, output);
802  }
803 
804  LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
805  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
806 
807  return false;
808  }
809 };
810 
811 #ifdef EIGEN_HAS_GPU_FP16
812 template <typename Self, typename Op>
813 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
814  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
815  gpu_assert(false && "Should not be called since there is no packet accessor");
816  return true;
817  }
818 };
819 
820 template <typename Self, typename Op>
821 struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
822  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
823  typedef typename Self::Index Index;
824 
825  if (num_preserved_vals % 2 != 0) {
826  // Not supported yet, revert to the slower code path
827  return true;
828  }
829 
830  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
831  const int block_size = /*256*/128;
832  const int num_per_thread = /*128*/64;
833  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
834  const int max_blocks = device.getNumGpuMultiProcessors() *
835  device.maxGpuThreadsPerMultiProcessor() / block_size;
836  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
837 
838  if (num_blocks > 1) {
839  // We initialize the outputs outside the reduction kernel when we can't be sure that there
840  // won't be race conditions between multiple thread blocks.
841  LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
842  1, 1, 0, device, reducer, self, num_preserved_vals, output);
843  }
844 
845  LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
846  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
847 
848  return false;
849  }
850 };
851 #endif // EIGEN_HAS_GPU_FP16
852 
853 
854 template <typename Self, typename Op>
855 struct InnerReducer<Self, Op, GpuDevice> {
856  // Unfortunately NVIDIA GPUs don't handle exotic types such as complex well,
857  // so restrict the scope of the optimized version of the code to the simple cases
858  // of doubles, floats and half floats.
859 #ifdef EIGEN_HAS_GPU_FP16
860  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
861  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
862  internal::is_same<typename Self::CoeffReturnType, double>::value ||
863  (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
864 #else // EIGEN_HAS_GPU_FP16
865  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
866  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
867  internal::is_same<typename Self::CoeffReturnType, double>::value);
868 #endif // EIGEN_HAS_GPU_FP16
869 
870  template <typename OutputType>
871  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
872  gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
873  const Index num_coeffs = array_prod(self.m_impl.dimensions());
874  // Don't crash when we're called with an input tensor of size 0.
875  if (num_coeffs == 0) {
876  return true;
877  }
878  // For small reductions it's faster to use the generic (non-GPU-optimized) code path.
879  if (num_coeffs_to_reduce <= 128) {
880  return true;
881  }
882 
883  return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
884  }
885 };
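// Illustrative usage sketch (not part of this file; names are assumptions):
// reducing over the contiguous dimension (dimension 0 in the default ColMajor
// layout) is the case InnerReducer handles; note the early-out above, which falls
// back to the generic code whenever the reduced extent is 128 or less.
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, 1000, 64);  // 1000 > 128
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> out(d_out, 64);
//   Eigen::array<Eigen::Index, 1> red_axis;
//   red_axis[0] = 0;
//   out.device(gpu_device) = in.sum(red_axis);   // inner reduction on the GPU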
886 
887 template <int NumPerThread, typename Self,
888  typename Reducer, typename Index>
889 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
890  typename Self::CoeffReturnType* output) {
891  const Index num_threads = blockDim.x * gridDim.x;
892  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
893  // Initialize the output values if they weren't initialized by the ReductionInitKernel
894  if (gridDim.x == 1) {
895  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
896  output[i] = reducer.initialize();
897  }
898  __syncthreads();
899  }
900 
901  // Do the reduction.
902  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
903  for (Index i = thread_id; i < max_iter; i += num_threads) {
904  const Index input_col = i % num_preserved_coeffs;
905  const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
906  typename Self::CoeffReturnType reduced_val = reducer.initialize();
907  const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
908  for (Index j = input_row; j < max_row; j++) {
909  typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
910  reducer.reduce(val, &reduced_val);
911  }
912  atomicReduce(&(output[input_col]), reduced_val, reducer);
913  }
914 }
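// Illustrative arithmetic (not part of this file): with NumPerThread = 16 and
// hypothetical sizes num_preserved_coeffs = 512, num_coeffs_to_reduce = 1000,
// the loop above covers max_iter = 512 * divup(1000, 16) = 512 * 63 = 32256 work
// items; item i reduces rows [(i / 512) * 16, (i / 512) * 16 + 16) of column
// i % 512 and merges the partial result into output[i % 512] with atomicReduce.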
915 
916 
917 template <typename Self, typename Op>
918 struct OuterReducer<Self, Op, GpuDevice> {
919  // Unfortunately NVIDIA GPUs don't handle exotic types such as complex well,
920  // so restrict the scope of the optimized version of the code to the simple cases
921  // of floats and doubles.
922  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
923  (internal::is_same<typename Self::CoeffReturnType, float>::value ||
924  internal::is_same<typename Self::CoeffReturnType, double>::value);
925  template <typename Device, typename OutputType>
926  static
927  #if !defined(EIGEN_HIPCC)
928  // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
929  // (in the cxx11_tensor_reduction_gpu test)
930  //
931  // terminate called after throwing an instance of 'std::runtime_error'
932  // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
933  //
934  // it is unclear why this happens (and why it is a runtime error instead of a compile-time error)
935  //
936  // this will be fixed by HIP PR#457
937  EIGEN_DEVICE_FUNC
938  #endif
939  bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
940  gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
941  return true;
942  }
943 
944  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
945  typedef typename Self::Index Index;
946 
947  // For small reductions it's faster to use the generic (non-GPU-optimized) code path.
948  if (num_coeffs_to_reduce <= 32) {
949  return true;
950  }
951 
952  const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
953  const int block_size = 256;
954  const int num_per_thread = 16;
955  const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
956  const int max_blocks = device.getNumGpuMultiProcessors() *
957  device.maxGpuThreadsPerMultiProcessor() / block_size;
958  const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
959 
960  if (num_blocks > 1) {
961  // We initialize the outputs outside the reduction kernel when we can't be sure that there
962  // won't be race conditions between multiple thread blocks.
963  const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
964  const int max_blocks2 = device.getNumGpuMultiProcessors() *
965  device.maxGpuThreadsPerMultiProcessor() / 1024;
966  const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
967  LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
968  num_blocks2, 1024, 0, device, reducer.initialize(),
969  num_preserved_vals, output);
970  }
971 
972  LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
973  num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
974 
975  return false;
976  }
977 };
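// Illustrative usage sketch (not part of this file; names are assumptions):
// reducing over the outermost, strided dimension (dimension 1 of a 2-D ColMajor
// tensor) is the case OuterReducer handles, again with a fallback to the generic
// code when the reduced extent is 32 or less.
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, 64, 1000);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> out(d_out, 64);
//   Eigen::array<Eigen::Index, 1> red_axis;
//   red_axis[0] = 1;
//   out.device(gpu_device) = in.sum(red_axis);   // outer reduction on the GPU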
978 
979 #endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
980 
981 
982 } // end namespace internal
983 } // end namespace Eigen
984 
985 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H