#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H

#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

// Reducer function that enables multiple gpu threads to safely accumulate at the same output
// address. It reads the current value of the output variable, attempts to update it with the new
// value, and retries if another gpu thread updated the output in the meantime.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
#else   // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}

// We extend atomicExch to support extra data types.
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
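
// CUDA and HIP provide no double overload of atomicExch, so the specialization above exchanges the
// raw 64-bit pattern instead. A minimal sketch of the same round trip (illustrative only, the
// function name is hypothetical):
//
//   __device__ double example_exchange_double(double* slot, double v) {
//     unsigned long long old_bits =
//         atomicExch(reinterpret_cast<unsigned long long*>(slot), __double_as_longlong(v));
//     return __longlong_as_double(old_bits);   // previous contents of *slot, bit-cast back
//   }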

#ifdef EIGEN_HAS_GPU_FP16
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}

#ifdef EIGEN_GPU_COMPILE_PHASE
// The wide Packet4h2 update is not atomic as a whole, only each half2 lane is, so the reducer is
// assumed to be associative.
template <typename R>
__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
  half2* houtput = reinterpret_cast<half2*>(output);
  half2* haccum = reinterpret_cast<half2*>(&accum);
  for (int i = 0; i < 4; ++i) {
    atomicReduce(houtput + i, *(haccum + i), reducer);
  }
}
#endif  // EIGEN_GPU_COMPILE_PHASE
#endif  // EIGEN_HAS_GPU_FP16

template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  atomicAdd(output, accum);
#else   // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}

template <typename CoeffType, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs,
                                                                  CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  // Grid-stride loop: each thread initializes every num_threads-th output coefficient.
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}
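
// Illustrative walk-through of the grid-stride loop above (assumed numbers): with a single block of
// 4 threads and num_preserved_coeffs == 10, num_threads == 4, so thread 0 writes outputs {0, 4, 8},
// thread 1 writes {1, 5, 9}, thread 2 writes {2, 6} and thread 3 writes {3, 7}; every output slot
// is initialized exactly once no matter how the launch was sized.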

template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                  typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  // Initialize the output value.
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  } else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value.
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      } else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached.
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        } while (val < 2u);
      }
    }
  }

  __syncthreads();

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread * BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    // Use std::is_floating_point to pick a __shfl_down overload: when the coefficient type is
    // double, hipcc reports the call as ambiguous between the float and int overloads.
    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
    } else {
      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
    }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore.
    atomicInc(semaphore, gridDim.x + 1);
#if defined(EIGEN_HIPCC)
    __threadfence_system();
#endif
  }
#else   // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}

#ifdef EIGEN_HAS_GPU_FP16
template <typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
    Reducer reducer, const Self input, Index num_coeffs, half* scratch) {
  typedef packet_traits<Eigen::half>::type packet_type;
  Index packet_remainder =
      num_coeffs % Index(unpacket_traits<packet_type>::size);
  if (packet_remainder != 0) {
    // Pack the trailing coefficients that don't fill a whole packet into the scratch buffer.
    half2* h2scratch = reinterpret_cast<half2*>(scratch);
    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
      *h2scratch = __halves2half2(input.coeff(i), input.coeff(i + 1));
      h2scratch++;
    }
    if ((num_coeffs & 1) != 0) {
      half lastCoeff = input.coeff(num_coeffs - 1);
      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
    }
  } else {
    packet_type reduce = reducer.template initializePacket<packet_type>();
    pstoreu(scratch, reduce);
  }
}

template <typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(
    Reducer reducer, const Self input, Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  typedef typename packet_traits<Eigen::half>::type PacketType;

  const Index num_packets =
      num_coeffs / Index(unpacket_traits<PacketType>::size);
  PacketType* p_output = reinterpret_cast<PacketType*>(output);
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    p_output[i] = reducer.template initializePacket<PacketType>();
  }
  Index packet_remainder =
      num_coeffs % Index(unpacket_traits<PacketType>::size);
  if (thread_id < packet_remainder) {
    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
  }
}

template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(
    Reducer reducer, const Self input, Index num_coeffs,
    half* output, half* scratch) {
  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const Index first_index =
      blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;

  // Initialize the scratch buffer if it wasn't initialized by the ReductionInitKernel.
  if (gridDim.x == 1) {
    if (first_index == 0) {
      int rem = num_coeffs % packet_width;
      if (rem != 0) {
        half2* p_scratch = reinterpret_cast<half2*>(scratch);
        pstoreu(scratch, reducer.template initializePacket<PacketType>());
        for (int i = 0; i < rem / 2; i++) {
          *p_scratch = __halves2half2(
              input.coeff(num_coeffs - packet_width + 2 * i),
              input.coeff(num_coeffs - packet_width + 2 * i + 1));
          p_scratch++;
        }
        if ((num_coeffs & 1) != 0) {
          half last = input.coeff(num_coeffs - 1);
          *p_scratch = __halves2half2(last, reducer.initialize());
        }
      } else {
        PacketType reduce = reducer.template initializePacket<PacketType>();
        pstoreu(scratch, reduce);
      }
    }
    __syncthreads();
  }

  PacketType accum = reducer.template initializePacket<PacketType>();
  const Index max_iter =
      numext::mini<Index>((num_coeffs - first_index) / packet_width,
                          NumPerThread * BlockSize / packet_width);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + packet_width * i;
    PacketType val = input.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      // Workaround for the lack of a native half2 __shfl_down: shuffle the bits as an int.
      union { int i; half2 h; } wka_in, wka_out;
      wka_in.h = hacc[i];
      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
      hr[i] = wka_out.h;
    }
    reducer.reducePacket(r1, &accum);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down(hacc[i], offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#else
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
  }

  __syncthreads();
  half2* rv1 = reinterpret_cast<half2*>(scratch);
  if (packet_width > 2) {
    reducer.reducePacket(rv1[2], rv1);
    reducer.reducePacket(rv1[3], rv1 + 1);
    reducer.reducePacket(rv1[1], rv1);
  }
  if (gridDim.x == 1) {
    if (first_index == 0) {
      half tmp = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &tmp);
      *output = tmp;
    }
  }
}

template <typename Op>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
  typedef packet_traits<Eigen::half>::type packet_type;
  if (unpacket_traits<packet_type>::size == 1) {
    *output = *scratch;
  } else {
    half2* pscratch = reinterpret_cast<half2*>(scratch);
    half tmp = __float2half(0.f);
    for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
      reducer.reduce(__low2half(*pscratch), &tmp);
      reducer.reduce(__high2half(*pscratch), &tmp);
      pscratch++;
    }
    *output = tmp;
  }
}
#endif  // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    gpu_assert(false && "Should only be called on doubles, floats and half floats");
  }
};

// Specialization for float and double.
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    std::enable_if_t<
        internal::is_same<float, OutputType>::value ||
        internal::is_same<double, OutputType>::value,
    void>> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer,
                      self, num_coeffs, output, semaphore);
  }
};
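
// Worked launch-size example (illustrative numbers): for num_coeffs = 1 << 20 each block covers
// block_size * num_per_thread = 256 * 128 = 32768 coefficients, so num_blocks =
// divup(1048576, 32768) = 32 and the semaphore-guarded multi-block path of FullReductionKernel is
// exercised; for num_coeffs <= 32768 a single block suffices and no semaphore is allocated.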

#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
  }
};

template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    half* scratch = static_cast<half*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we can't
      // be sure that there won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                        1, 1, 0, device, reducer, self, num_coeffs, scratch);
    }

    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                        1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif  // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Nvidia devices don't support exotic types such as std::complex well, so restrict the optimized
  // version of the code to the simple cases of doubles, floats and half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else   // EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif  // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(
        self, reducer, device, output, num_coeffs);
  }
};
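
// Usage sketch (illustrative only): FullReducer is what a reduction over every coefficient
// dispatches to when the expression is evaluated on a GpuDevice. Assuming device buffers d_in and
// d_out obtained from gpu_device.allocate() and already populated, something along these lines
// exercises this path:
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_in(static_cast<float*>(d_in), 1024, 1024);
//   Eigen::TensorMap<Eigen::Tensor<float, 0> > gpu_out(static_cast<float*>(d_out));
//   gpu_out.device(gpu_device) = gpu_in.sum();   // rank-0 result: one float for the whole tensor
//
// The launcher above then runs FullReductionKernel with divup(num_coeffs, 256 * 128) blocks.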

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input,
                                                                   Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  typedef typename Self::CoeffReturnType Type;

  const int unroll_times = 16;

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel.
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        // Use std::is_floating_point to pick a __shfl_down overload: when Type == double, hipcc
        // reports the call as ambiguous between the float and int overloads.
        if (std::is_floating_point<Type>::value) {
          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
        } else {
          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
        }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else   // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}

#ifdef EIGEN_HAS_GPU_FP16

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
                                                                            Index num_coeffs_to_reduce,
                                                                            Index num_preserved_coeffs,
                                                                            half* output) {
  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const int unroll_times = 16 / packet_width;

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel.
  if (gridDim.x == 1) {
    Index i = packet_width * thread_id;
    for (; i + packet_width <= num_preserved_coeffs;
         i += packet_width * num_threads) {
      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
      *poutput = reducer.template initializePacket<PacketType>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin =
          packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col =
            col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + packet_width <= num_coeffs_to_reduce; col += blockDim.x) {
            const PacketType val1 = input.m_impl.template packet<Unaligned>(
                row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const PacketType val2 = input.m_impl.template packet<Unaligned>(
                (row + 1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            PacketType r1 = reducer.template initializePacket<PacketType>();
            PacketType r2 = reducer.template initializePacket<PacketType>();
            half2* hr1 = reinterpret_cast<half2*>(&r1);
            half2* hr2 = reinterpret_cast<half2*>(&r2);
            while (col + 1 < num_coeffs_to_reduce) {
              *hr1 = __halves2half2(
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
              *hr2 = __halves2half2(
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1));
              hr1++;
              hr2++;
              col += 2;
            }
            if (col < num_coeffs_to_reduce) {
              // Peel the last odd coefficient of each row.
              const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
              *hr1 = __halves2half2(last1, reducer.initialize());
              const half last2 = input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
              *hr2 = __halves2half2(last2, reducer.initialize());
            }
            reducer.reducePacket(r1, &reduced_val1);
            reducer.reducePacket(r2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
                                     row * num_coeffs_to_reduce + col),
                                 &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
                                     (row + 1) * num_coeffs_to_reduce + col),
                                 &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          // Workaround for the lack of a native half2 __shfl_down: shuffle the bits as an int.
          union { int i; half2 h; } wka_in1, wka_out1;
          wka_in1.h = rv1[i];
          wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
          hr1[i] = wka_out1.h;

          union { int i; half2 h; } wka_in2, wka_out2;
          wka_in2.h = rv2[i];
          wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
          hr2[i] = wka_out2.h;
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#else
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int j = 0; j < packet_width / 2; j++) {
          hr1[j] = __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
          hr2[j] = __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#endif
      }

      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
      half2 val;
      if (packet_width > 2) {
        reducer.reducePacket(rv1[2], rv1);
        reducer.reducePacket(rv1[3], rv1 + 1);
        reducer.reducePacket(rv1[1], rv1);
        reducer.reducePacket(rv2[2], rv2);
        reducer.reducePacket(rv2[3], rv2 + 1);
        reducer.reducePacket(rv2[1], rv2);
      }
      half val1 = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &val1);
      half val2 = __low2half(*rv2);
      reducer.reduce(__high2half(*rv2), &val2);
      val = __halves2half2(val1, val2);
      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
      }
    }
  }
}

#endif  // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
                                    typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};

// Specialization for float and double.
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    std::enable_if_t<
        internal::is_same<float, OutputType>::value ||
        internal::is_same<double, OutputType>::value,
    void>> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumGpuMultiProcessors() *
                              device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
                        num_blocks2, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer,
                      self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};

template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path.
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 128;
    const int num_per_thread = 64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                        1, 1, 0, device, reducer, self, num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer,
                      self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif  // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Nvidia devices don't support exotic types such as std::complex well, so restrict the optimized
  // version of the code to the simple cases of floats, doubles and half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
        reducer_traits<Op, GpuDevice>::PacketAccess));
#else   // EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif  // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code path when there isn't much to reduce.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(
        self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};
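
// Usage sketch (illustrative only): InnerReducer is chosen when the reduced dimensions are the
// innermost, fastest-varying ones, e.g. reducing dimension 0 of a column-major matrix so that one
// value is kept per column. With gpu_device set up as in the FullReducer sketch above:
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_in(static_cast<float*>(d_in), 4096, 512);
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_out(static_cast<float*>(d_out), 512);
//   Eigen::array<int, 1> dims;
//   dims[0] = 0;                                     // reduce along the inner dimension
//   gpu_out.device(gpu_device) = gpu_in.sum(dims);   // 512 preserved values, 4096 reduced each
//
// A return value of true from run() tells the reduction evaluator that this specialization did not
// handle the case (empty input, fewer than 128 coefficients to reduce per output, ...) and that it
// should fall back to the generic evaluation path.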

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input,
                                                                   Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel.
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }

    // Fold the partial result for this strip into the output slot of its column.
    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
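
// Illustrative work split for the kernel above (assumed numbers): here the reduced data is strided,
// element (j, input_col) living at j * num_preserved_coeffs + input_col. With
// num_coeffs_to_reduce = 1000, num_preserved_coeffs = 512 and NumPerThread = 16, there are
// max_iter = 512 * divup(1000, 16) = 512 * 63 = 32256 strips of at most 16 rows each; every strip
// is reduced privately and then folded into output[input_col] with a single atomicReduce call.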

template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Restrict the optimized code path to the simple cases of floats and doubles.
  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);

  template <typename Device, typename OutputType>
  static
#if !defined(EIGEN_HIPCC)
      // Marking this overload EIGEN_DEVICE_FUNC triggers a runtime "No device code available"
      // error under hipcc, so the attribute is only applied for non-HIP builds.
      EIGEN_DEVICE_FUNC
#endif
      bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code path when there isn't much to reduce.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race condition between multiple thread blocks.
      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumGpuMultiProcessors() *
                              device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
                        num_blocks2, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer,
                      self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#endif  // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H