10 #ifndef EIGEN_PACKET_MATH_GPU_H
11 #define EIGEN_PACKET_MATH_GPU_H
13 #include "../../InternalHeaderCheck.h"
20 #if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
21 #define EIGEN_GPU_HAS_LDG 1
25 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
26 #define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
29 #if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
30 #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
36 #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
38 template<>
struct is_arithmetic<float4> {
enum { value =
true }; };
39 template<>
struct is_arithmetic<double2> {
enum { value =
true }; };
41 template<>
struct packet_traits<float> : default_packet_traits
67 HasGammaSampleDerAlpha = 1,
76 template<>
struct packet_traits<double> : default_packet_traits
100 HasGammaSampleDerAlpha = 1,
110 template<>
struct unpacket_traits<float4> {
typedef float type;
enum {
size=4, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef float4 half; };
111 template<>
struct unpacket_traits<double2> {
typedef double type;
enum {
size=2, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef double2 half; };
113 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(
const float& from) {
114 return make_float4(from, from, from, from);
116 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(
const double& from) {
117 return make_double2(from, from);
123 #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
127 return __int_as_float(__float_as_int(
a) & __float_as_int(
b));
131 return __longlong_as_double(__double_as_longlong(
a) &
132 __double_as_longlong(
b));
137 return __int_as_float(__float_as_int(
a) | __float_as_int(
b));
141 return __longlong_as_double(__double_as_longlong(
a) |
142 __double_as_longlong(
b));
147 return __int_as_float(__float_as_int(
a) ^ __float_as_int(
b));
151 return __longlong_as_double(__double_as_longlong(
a) ^
152 __double_as_longlong(
b));
157 return __int_as_float(__float_as_int(
a) & ~__float_as_int(
b));
161 return __longlong_as_double(__double_as_longlong(
a) &
162 ~__double_as_longlong(
b));
166 return __int_as_float(
a ==
b ? 0xffffffffu : 0u);
170 return __longlong_as_double(
a ==
b ? 0xffffffffffffffffull : 0ull);
175 return __int_as_float(
a <
b ? 0xffffffffu : 0u);
180 return __longlong_as_double(
a <
b ? 0xffffffffffffffffull : 0ull);
185 return __int_as_float(
a <=
b ? 0xffffffffu : 0u);
190 return __longlong_as_double(
a <=
b ? 0xffffffffffffffffull : 0ull);
196 return make_float4(bitwise_and(
a.x,
b.x), bitwise_and(
a.y,
b.y),
197 bitwise_and(
a.z,
b.z), bitwise_and(
a.w,
b.w));
202 return make_double2(bitwise_and(
a.x,
b.x), bitwise_and(
a.y,
b.y));
208 return make_float4(bitwise_or(
a.x,
b.x), bitwise_or(
a.y,
b.y),
209 bitwise_or(
a.z,
b.z), bitwise_or(
a.w,
b.w));
214 return make_double2(bitwise_or(
a.x,
b.x), bitwise_or(
a.y,
b.y));
220 return make_float4(bitwise_xor(
a.x,
b.x), bitwise_xor(
a.y,
b.y),
221 bitwise_xor(
a.z,
b.z), bitwise_xor(
a.w,
b.w));
226 return make_double2(bitwise_xor(
a.x,
b.x), bitwise_xor(
a.y,
b.y));
232 return make_float4(bitwise_andnot(
a.x,
b.x), bitwise_andnot(
a.y,
b.y),
233 bitwise_andnot(
a.z,
b.z), bitwise_andnot(
a.w,
b.w));
237 pandnot<double2>(
const double2&
a,
const double2&
b) {
238 return make_double2(bitwise_andnot(
a.x,
b.x), bitwise_andnot(
a.y,
b.y));
244 return make_float4(eq_mask(
a.x,
b.x), eq_mask(
a.y,
b.y), eq_mask(
a.z,
b.z),
250 return make_float4(lt_mask(
a.x,
b.x), lt_mask(
a.y,
b.y), lt_mask(
a.z,
b.z),
256 return make_float4(le_mask(
a.x,
b.x), le_mask(
a.y,
b.y), le_mask(
a.z,
b.z),
261 pcmp_eq<double2>(
const double2&
a,
const double2&
b) {
262 return make_double2(eq_mask(
a.x,
b.x), eq_mask(
a.y,
b.y));
266 pcmp_lt<double2>(
const double2&
a,
const double2&
b) {
267 return make_double2(lt_mask(
a.x,
b.x), lt_mask(
a.y,
b.y));
271 pcmp_le<double2>(
const double2&
a,
const double2&
b) {
272 return make_double2(le_mask(
a.x,
b.x), le_mask(
a.y,
b.y));
276 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(
const float&
a) {
277 return make_float4(
a,
a+1,
a+2,
a+3);
279 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(
const double&
a) {
280 return make_double2(
a,
a+1);
283 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(
const float4&
a,
const float4&
b) {
284 return make_float4(
a.x+
b.x,
a.y+
b.y,
a.z+
b.z,
a.w+
b.w);
286 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(
const double2&
a,
const double2&
b) {
287 return make_double2(
a.x+
b.x,
a.y+
b.y);
290 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(
const float4&
a,
const float4&
b) {
291 return make_float4(
a.x-
b.x,
a.y-
b.y,
a.z-
b.z,
a.w-
b.w);
293 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(
const double2&
a,
const double2&
b) {
294 return make_double2(
a.x-
b.x,
a.y-
b.y);
298 return make_float4(-
a.x, -
a.y, -
a.z, -
a.w);
301 return make_double2(-
a.x, -
a.y);
307 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(
const float4&
a,
const float4&
b) {
308 return make_float4(
a.x*
b.x,
a.y*
b.y,
a.z*
b.z,
a.w*
b.w);
310 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(
const double2&
a,
const double2&
b) {
311 return make_double2(
a.x*
b.x,
a.y*
b.y);
314 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(
const float4&
a,
const float4&
b) {
315 return make_float4(
a.x/
b.x,
a.y/
b.y,
a.z/
b.z,
a.w/
b.w);
317 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(
const double2&
a,
const double2&
b) {
318 return make_double2(
a.x/
b.x,
a.y/
b.y);
321 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(
const float4&
a,
const float4&
b) {
322 return make_float4(fminf(
a.x,
b.x), fminf(
a.y,
b.y), fminf(
a.z,
b.z), fminf(
a.w,
b.w));
324 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(
const double2&
a,
const double2&
b) {
328 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(
const float4&
a,
const float4&
b) {
329 return make_float4(fmaxf(
a.x,
b.x), fmaxf(
a.y,
b.y), fmaxf(
a.z,
b.z), fmaxf(
a.w,
b.w));
331 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(
const double2&
a,
const double2&
b) {
335 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(
const float* from) {
336 return *
reinterpret_cast<const float4*
>(from);
339 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(
const double* from) {
340 return *
reinterpret_cast<const double2*
>(from);
343 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(
const float* from) {
344 return make_float4(from[0], from[1], from[2], from[3]);
346 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(
const double* from) {
347 return make_double2(from[0], from[1]);
350 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(
const float* from) {
351 return make_float4(from[0], from[0], from[1], from[1]);
353 template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(
const double* from) {
354 return make_double2(from[0], from[0]);
358 *
reinterpret_cast<float4*
>(to) = from;
362 *
reinterpret_cast<double2*
>(to) = from;
379 #if defined(EIGEN_GPU_HAS_LDG)
380 return __ldg(
reinterpret_cast<const float4*
>(from));
382 return make_float4(from[0], from[1], from[2], from[3]);
387 #if defined(EIGEN_GPU_HAS_LDG)
388 return __ldg(
reinterpret_cast<const double2*
>(from));
390 return make_double2(from[0], from[1]);
396 #if defined(EIGEN_GPU_HAS_LDG)
397 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
399 return make_float4(from[0], from[1], from[2], from[3]);
404 #if defined(EIGEN_GPU_HAS_LDG)
405 return make_double2(__ldg(from+0), __ldg(from+1));
407 return make_double2(from[0], from[1]);
412 return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
416 return make_double2(from[0*stride], from[1*stride]);
419 template<>
EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(
float* to,
const float4& from,
Index stride) {
420 to[stride*0] = from.x;
421 to[stride*1] = from.y;
422 to[stride*2] = from.z;
423 to[stride*3] = from.w;
425 template<>
EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(
double* to,
const double2& from,
Index stride) {
426 to[stride*0] = from.x;
427 to[stride*1] = from.y;
438 return a.x +
a.y +
a.z +
a.w;
445 return fmaxf(fmaxf(
a.x,
a.y), fmaxf(
a.z,
a.w));
452 return fminf(fminf(
a.x,
a.y), fminf(
a.z,
a.w));
459 return a.x *
a.y *
a.z *
a.w;
466 return make_float4(fabsf(
a.x), fabsf(
a.y), fabsf(
a.z), fabsf(
a.w));
469 return make_double2(fabs(
a.x), fabs(
a.y));
473 return make_float4(floorf(
a.x), floorf(
a.y), floorf(
a.z), floorf(
a.w));
481 float tmp = kernel.packet[0].y;
482 kernel.packet[0].y = kernel.packet[1].x;
483 kernel.packet[1].x = tmp;
485 tmp = kernel.packet[0].z;
486 kernel.packet[0].z = kernel.packet[2].x;
487 kernel.packet[2].x = tmp;
489 tmp = kernel.packet[0].w;
490 kernel.packet[0].w = kernel.packet[3].x;
491 kernel.packet[3].x = tmp;
493 tmp = kernel.packet[1].z;
494 kernel.packet[1].z = kernel.packet[2].y;
495 kernel.packet[2].y = tmp;
497 tmp = kernel.packet[1].w;
498 kernel.packet[1].w = kernel.packet[3].y;
499 kernel.packet[3].y = tmp;
501 tmp = kernel.packet[2].w;
502 kernel.packet[2].w = kernel.packet[3].z;
503 kernel.packet[3].z = tmp;
508 double tmp = kernel.packet[0].y;
509 kernel.packet[0].y = kernel.packet[1].x;
510 kernel.packet[1].x = tmp;
518 #if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
520 typedef ulonglong2 Packet4h2;
521 template<>
struct unpacket_traits<Packet4h2> {
typedef Eigen::half type;
enum {
size=8, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef Packet4h2 half; };
522 template<>
struct is_arithmetic<Packet4h2> {
enum { value =
true }; };
524 template<>
struct unpacket_traits<half2> {
typedef Eigen::half type;
enum {
size=2, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef half2 half; };
525 template<>
struct is_arithmetic<half2> {
enum { value =
true }; };
527 template<>
struct packet_traits<
Eigen::half> : default_packet_traits
529 typedef Packet4h2 type;
530 typedef Packet4h2 half;
550 return __half2half2(from);
557 half2* p_alias =
reinterpret_cast<half2*
>(&r);
558 p_alias[0] = pset1<half2>(from);
559 p_alias[1] = pset1<half2>(from);
560 p_alias[2] = pset1<half2>(from);
561 p_alias[3] = pset1<half2>(from);
568 return *
reinterpret_cast<const half2*
>(from);
572 return __halves2half2(from[0], from[1]);
576 return __halves2half2(from[0], from[0]);
581 *
reinterpret_cast<half2*
>(to) = from;
586 to[0] = __low2half(from);
587 to[1] = __high2half(from);
593 #if defined(EIGEN_GPU_HAS_LDG)
595 return __ldg(
reinterpret_cast<const half2*
>(from));
597 return __halves2half2(*(from+0), *(from+1));
603 #if defined(EIGEN_GPU_HAS_LDG)
604 return __halves2half2(__ldg(from+0), __ldg(from+1));
606 return __halves2half2(*(from+0), *(from+1));
612 return __halves2half2(from[0*stride], from[1*stride]);
617 to[stride*0] = __low2half(from);
618 to[stride*1] = __high2half(from);
622 return __low2half(
a);
626 half a1 = __low2half(
a);
627 half a2 = __high2half(
a);
630 return __halves2half2(result1, result2);
635 return pset1<half2>(true_half);
640 return pset1<half2>(false_half);
645 __half a1 = __low2half(kernel.packet[0]);
646 __half a2 = __high2half(kernel.packet[0]);
647 __half b1 = __low2half(kernel.packet[1]);
648 __half b2 = __high2half(kernel.packet[1]);
649 kernel.packet[0] = __halves2half2(a1, b1);
650 kernel.packet[1] = __halves2half2(a2, b2);
654 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
655 return __halves2half2(
a, __hadd(
a, __float2half(1.0f)));
657 float f = __half2float(
a) + 1.0f;
658 return __halves2half2(
a, __float2half(f));
665 half mask_low = __low2half(mask);
666 half mask_high = __high2half(mask);
667 half result_low = mask_low == half(0) ? __low2half(
b) : __low2half(
a);
668 half result_high = mask_high == half(0) ? __high2half(
b) : __high2half(
a);
669 return __halves2half2(result_low, result_high);
676 half a1 = __low2half(
a);
677 half a2 = __high2half(
a);
678 half b1 = __low2half(
b);
679 half b2 = __high2half(
b);
680 half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
681 half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
682 return __halves2half2(eq1, eq2);
689 half a1 = __low2half(
a);
690 half a2 = __high2half(
a);
691 half b1 = __low2half(
b);
692 half b2 = __high2half(
b);
693 half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
694 half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
695 return __halves2half2(eq1, eq2);
702 half a1 = __low2half(
a);
703 half a2 = __high2half(
a);
704 half b1 = __low2half(
b);
705 half b2 = __high2half(
b);
706 half eq1 = __half2float(a1) <= __half2float(b1) ? true_half : false_half;
707 half eq2 = __half2float(a2) <= __half2float(b2) ? true_half : false_half;
708 return __halves2half2(eq1, eq2);
713 half a1 = __low2half(
a);
714 half a2 = __high2half(
a);
715 half b1 = __low2half(
b);
716 half b2 = __high2half(
b);
719 return __halves2half2(result1, result2);
724 half a1 = __low2half(
a);
725 half a2 = __high2half(
a);
726 half b1 = __low2half(
b);
727 half b2 = __high2half(
b);
730 return __halves2half2(result1, result2);
735 half a1 = __low2half(
a);
736 half a2 = __high2half(
a);
737 half b1 = __low2half(
b);
738 half b2 = __high2half(
b);
741 return __halves2half2(result1, result2);
746 half a1 = __low2half(
a);
747 half a2 = __high2half(
a);
748 half b1 = __low2half(
b);
749 half b2 = __high2half(
b);
752 return __halves2half2(result1, result2);
757 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
758 return __hadd2(
a,
b);
760 float a1 = __low2float(
a);
761 float a2 = __high2float(
a);
762 float b1 = __low2float(
b);
763 float b2 = __high2float(
b);
766 return __floats2half2_rn(r1, r2);
772 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
773 return __hsub2(
a,
b);
775 float a1 = __low2float(
a);
776 float a2 = __high2float(
a);
777 float b1 = __low2float(
b);
778 float b2 = __high2float(
b);
781 return __floats2half2_rn(r1, r2);
786 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
789 float a1 = __low2float(
a);
790 float a2 = __high2float(
a);
791 return __floats2half2_rn(-a1, -a2);
799 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
800 return __hmul2(
a,
b);
802 float a1 = __low2float(
a);
803 float a2 = __high2float(
a);
804 float b1 = __low2float(
b);
805 float b2 = __high2float(
b);
808 return __floats2half2_rn(r1, r2);
815 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
816 return __hfma2(
a,
b,
c);
818 float a1 = __low2float(
a);
819 float a2 = __high2float(
a);
820 float b1 = __low2float(
b);
821 float b2 = __high2float(
b);
822 float c1 = __low2float(
c);
823 float c2 = __high2float(
c);
824 float r1 = a1 * b1 + c1;
825 float r2 = a2 * b2 + c2;
826 return __floats2half2_rn(r1, r2);
832 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
833 return __h2div(
a,
b);
835 float a1 = __low2float(
a);
836 float a2 = __high2float(
a);
837 float b1 = __low2float(
b);
838 float b2 = __high2float(
b);
841 return __floats2half2_rn(r1, r2);
847 float a1 = __low2float(
a);
848 float a2 = __high2float(
a);
849 float b1 = __low2float(
b);
850 float b2 = __high2float(
b);
851 __half r1 = a1 < b1 ? __low2half(
a) : __low2half(
b);
852 __half r2 = a2 < b2 ? __high2half(
a) : __high2half(
b);
853 return __halves2half2(r1, r2);
858 float a1 = __low2float(
a);
859 float a2 = __high2float(
a);
860 float b1 = __low2float(
b);
861 float b2 = __high2float(
b);
862 __half r1 = a1 > b1 ? __low2half(
a) : __low2half(
b);
863 __half r2 = a2 > b2 ? __high2half(
a) : __high2half(
b);
864 return __halves2half2(r1, r2);
868 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
869 return __hadd(__low2half(
a), __high2half(
a));
871 float a1 = __low2float(
a);
872 float a2 = __high2float(
a);
878 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
879 __half
first = __low2half(
a);
880 __half second = __high2half(
a);
883 float a1 = __low2float(
a);
884 float a2 = __high2float(
a);
885 return a1 > a2 ? __low2half(
a) : __high2half(
a);
890 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
891 __half
first = __low2half(
a);
892 __half second = __high2half(
a);
895 float a1 = __low2float(
a);
896 float a2 = __high2float(
a);
897 return a1 < a2 ? __low2half(
a) : __high2half(
a);
902 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
903 return __hmul(__low2half(
a), __high2half(
a));
905 float a1 = __low2float(
a);
906 float a2 = __high2float(
a);
912 float a1 = __low2float(
a);
913 float a2 = __high2float(
a);
914 float r1 = log1pf(a1);
915 float r2 = log1pf(a2);
916 return __floats2half2_rn(r1, r2);
920 float a1 = __low2float(
a);
921 float a2 = __high2float(
a);
922 float r1 = expm1f(a1);
923 float r2 = expm1f(a2);
924 return __floats2half2_rn(r1, r2);
927 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
928 defined(EIGEN_HIP_DEVICE_COMPILE)
931 half2
plog(
const half2&
a) {
936 half2
pexp(
const half2&
a) {
941 half2
psqrt(
const half2&
a) {
953 float a1 = __low2float(
a);
954 float a2 = __high2float(
a);
957 return __floats2half2_rn(r1, r2);
961 float a1 = __low2float(
a);
962 float a2 = __high2float(
a);
965 return __floats2half2_rn(r1, r2);
969 float a1 = __low2float(
a);
970 float a2 = __high2float(
a);
971 float r1 = sqrtf(a1);
972 float r2 = sqrtf(a2);
973 return __floats2half2_rn(r1, r2);
977 float a1 = __low2float(
a);
978 float a2 = __high2float(
a);
979 float r1 = rsqrtf(a1);
980 float r2 = rsqrtf(a2);
981 return __floats2half2_rn(r1, r2);
989 return *
reinterpret_cast<const Packet4h2*
>(from);
997 half2* p_alias =
reinterpret_cast<half2*
>(&r);
998 p_alias[0] =
ploadu(from + 0);
999 p_alias[1] =
ploadu(from + 2);
1000 p_alias[2] =
ploadu(from + 4);
1001 p_alias[3] =
ploadu(from + 6);
1009 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1020 *
reinterpret_cast<Packet4h2*
>(to) = from;
1026 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1027 pstoreu(to + 0,from_alias[0]);
1028 pstoreu(to + 2,from_alias[1]);
1029 pstoreu(to + 4,from_alias[2]);
1030 pstoreu(to + 6,from_alias[3]);
1035 ploadt_ro<Packet4h2, Aligned>(
const Eigen::half* from) {
1036 #if defined(EIGEN_GPU_HAS_LDG)
1038 r = __ldg(
reinterpret_cast<const Packet4h2*
>(from));
1042 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1043 r_alias[0] = ploadt_ro_aligned(from + 0);
1044 r_alias[1] = ploadt_ro_aligned(from + 2);
1045 r_alias[2] = ploadt_ro_aligned(from + 4);
1046 r_alias[3] = ploadt_ro_aligned(from + 6);
1053 ploadt_ro<Packet4h2, Unaligned>(
const Eigen::half* from) {
1055 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1056 r_alias[0] = ploadt_ro_unaligned(from + 0);
1057 r_alias[1] = ploadt_ro_unaligned(from + 2);
1058 r_alias[2] = ploadt_ro_unaligned(from + 4);
1059 r_alias[3] = ploadt_ro_unaligned(from + 6);
1067 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1068 p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
1069 p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
1070 p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
1071 p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
1078 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1079 pscatter(to + stride * 0, from_alias[0], stride);
1080 pscatter(to + stride * 2, from_alias[1], stride);
1081 pscatter(to + stride * 4, from_alias[2], stride);
1082 pscatter(to + stride * 6, from_alias[3], stride);
1087 const Packet4h2&
a) {
1088 return pfirst(*(
reinterpret_cast<const half2*
>(&
a)));
1093 const Packet4h2&
a) {
1095 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1096 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1097 p_alias[0] =
pabs(a_alias[0]);
1098 p_alias[1] =
pabs(a_alias[1]);
1099 p_alias[2] =
pabs(a_alias[2]);
1100 p_alias[3] =
pabs(a_alias[3]);
1106 const Packet4h2& ) {
1108 return pset1<Packet4h2>(true_half);
1112 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(
const Packet4h2& ) {
1114 return pset1<Packet4h2>(false_half);
1118 double* d_row0,
double* d_row1,
double* d_row2,
double* d_row3,
1119 double* d_row4,
double* d_row5,
double* d_row6,
double* d_row7) {
1122 d_row0[1] = d_row4[0];
1126 d_row1[1] = d_row5[0];
1130 d_row2[1] = d_row6[0];
1134 d_row3[1] = d_row7[0];
1139 half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
1142 f_row0[1] = f_row2[0];
1146 f_row1[1] = f_row3[0];
1151 ptranspose_half(half2& f0, half2& f1) {
1152 __half a1 = __low2half(f0);
1153 __half a2 = __high2half(f0);
1154 __half b1 = __low2half(f1);
1155 __half b2 = __high2half(f1);
1156 f0 = __halves2half2(a1, b1);
1157 f1 = __halves2half2(a2, b2);
1161 ptranspose(PacketBlock<Packet4h2,8>& kernel) {
1162 double* d_row0 =
reinterpret_cast<double*
>(&kernel.packet[0]);
1163 double* d_row1 =
reinterpret_cast<double*
>(&kernel.packet[1]);
1164 double* d_row2 =
reinterpret_cast<double*
>(&kernel.packet[2]);
1165 double* d_row3 =
reinterpret_cast<double*
>(&kernel.packet[3]);
1166 double* d_row4 =
reinterpret_cast<double*
>(&kernel.packet[4]);
1167 double* d_row5 =
reinterpret_cast<double*
>(&kernel.packet[5]);
1168 double* d_row6 =
reinterpret_cast<double*
>(&kernel.packet[6]);
1169 double* d_row7 =
reinterpret_cast<double*
>(&kernel.packet[7]);
1170 ptranspose_double(d_row0, d_row1, d_row2, d_row3,
1171 d_row4, d_row5, d_row6, d_row7);
1174 half2* f_row0 =
reinterpret_cast<half2*
>(d_row0);
1175 half2* f_row1 =
reinterpret_cast<half2*
>(d_row1);
1176 half2* f_row2 =
reinterpret_cast<half2*
>(d_row2);
1177 half2* f_row3 =
reinterpret_cast<half2*
>(d_row3);
1178 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1179 ptranspose_half(f_row0[0], f_row1[0]);
1180 ptranspose_half(f_row0[1], f_row1[1]);
1181 ptranspose_half(f_row2[0], f_row3[0]);
1182 ptranspose_half(f_row2[1], f_row3[1]);
1184 f_row0 =
reinterpret_cast<half2*
>(d_row0 + 1);
1185 f_row1 =
reinterpret_cast<half2*
>(d_row1 + 1);
1186 f_row2 =
reinterpret_cast<half2*
>(d_row2 + 1);
1187 f_row3 =
reinterpret_cast<half2*
>(d_row3 + 1);
1188 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1189 ptranspose_half(f_row0[0], f_row1[0]);
1190 ptranspose_half(f_row0[1], f_row1[1]);
1191 ptranspose_half(f_row2[0], f_row3[0]);
1192 ptranspose_half(f_row2[1], f_row3[1]);
1194 f_row0 =
reinterpret_cast<half2*
>(d_row4);
1195 f_row1 =
reinterpret_cast<half2*
>(d_row5);
1196 f_row2 =
reinterpret_cast<half2*
>(d_row6);
1197 f_row3 =
reinterpret_cast<half2*
>(d_row7);
1198 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1199 ptranspose_half(f_row0[0], f_row1[0]);
1200 ptranspose_half(f_row0[1], f_row1[1]);
1201 ptranspose_half(f_row2[0], f_row3[0]);
1202 ptranspose_half(f_row2[1], f_row3[1]);
1204 f_row0 =
reinterpret_cast<half2*
>(d_row4 + 1);
1205 f_row1 =
reinterpret_cast<half2*
>(d_row5 + 1);
1206 f_row2 =
reinterpret_cast<half2*
>(d_row6 + 1);
1207 f_row3 =
reinterpret_cast<half2*
>(d_row7 + 1);
1208 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1209 ptranspose_half(f_row0[0], f_row1[0]);
1210 ptranspose_half(f_row0[1], f_row1[1]);
1211 ptranspose_half(f_row2[0], f_row3[0]);
1212 ptranspose_half(f_row2[1], f_row3[1]);
1219 #if defined(EIGEN_HIP_DEVICE_COMPILE)
1222 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1223 p_alias[0] = __halves2half2(
a, __hadd(
a, __float2half(1.0f)));
1224 p_alias[1] = __halves2half2(__hadd(
a, __float2half(2.0f)),
1225 __hadd(
a, __float2half(3.0f)));
1226 p_alias[2] = __halves2half2(__hadd(
a, __float2half(4.0f)),
1227 __hadd(
a, __float2half(5.0f)));
1228 p_alias[3] = __halves2half2(__hadd(
a, __float2half(6.0f)),
1229 __hadd(
a, __float2half(7.0f)));
1231 #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1233 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1235 half2
b = pset1<half2>(
a);
1237 half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
1238 half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
1240 c = __hadd2(
b, half_offset0);
1241 r_alias[0] =
plset(__low2half(
c));
1242 r_alias[1] =
plset(__high2half(
c));
1244 c = __hadd2(
b, half_offset1);
1245 r_alias[2] =
plset(__low2half(
c));
1246 r_alias[3] =
plset(__high2half(
c));
1251 float f = __half2float(
a);
1253 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1254 p_alias[0] = __halves2half2(
a, __float2half(f + 1.0f));
1255 p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
1256 p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
1257 p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
1264 pselect<Packet4h2>(
const Packet4h2& mask,
const Packet4h2&
a,
1265 const Packet4h2&
b) {
1267 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1268 const half2* mask_alias =
reinterpret_cast<const half2*
>(&mask);
1269 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1270 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1271 r_alias[0] =
pselect(mask_alias[0], a_alias[0], b_alias[0]);
1272 r_alias[1] =
pselect(mask_alias[1], a_alias[1], b_alias[1]);
1273 r_alias[2] =
pselect(mask_alias[2], a_alias[2], b_alias[2]);
1274 r_alias[3] =
pselect(mask_alias[3], a_alias[3], b_alias[3]);
1280 pcmp_eq<Packet4h2>(
const Packet4h2&
a,
const Packet4h2&
b) {
1282 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1283 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1284 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1285 r_alias[0] =
pcmp_eq(a_alias[0], b_alias[0]);
1286 r_alias[1] =
pcmp_eq(a_alias[1], b_alias[1]);
1287 r_alias[2] =
pcmp_eq(a_alias[2], b_alias[2]);
1288 r_alias[3] =
pcmp_eq(a_alias[3], b_alias[3]);
1294 pcmp_lt<Packet4h2>(
const Packet4h2&
a,
const Packet4h2&
b) {
1296 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1297 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1298 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1299 r_alias[0] =
pcmp_lt(a_alias[0], b_alias[0]);
1300 r_alias[1] =
pcmp_lt(a_alias[1], b_alias[1]);
1301 r_alias[2] =
pcmp_lt(a_alias[2], b_alias[2]);
1302 r_alias[3] =
pcmp_lt(a_alias[3], b_alias[3]);
1308 pcmp_le<Packet4h2>(
const Packet4h2&
a,
const Packet4h2&
b) {
1310 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1311 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1312 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1313 r_alias[0] =
pcmp_le(a_alias[0], b_alias[0]);
1314 r_alias[1] =
pcmp_le(a_alias[1], b_alias[1]);
1315 r_alias[2] =
pcmp_le(a_alias[2], b_alias[2]);
1316 r_alias[3] =
pcmp_le(a_alias[3], b_alias[3]);
1322 const Packet4h2&
a,
const Packet4h2&
b) {
1324 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1325 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1326 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1327 r_alias[0] =
pand(a_alias[0], b_alias[0]);
1328 r_alias[1] =
pand(a_alias[1], b_alias[1]);
1329 r_alias[2] =
pand(a_alias[2], b_alias[2]);
1330 r_alias[3] =
pand(a_alias[3], b_alias[3]);
1336 const Packet4h2&
a,
const Packet4h2&
b) {
1338 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1339 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1340 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1341 r_alias[0] =
por(a_alias[0], b_alias[0]);
1342 r_alias[1] =
por(a_alias[1], b_alias[1]);
1343 r_alias[2] =
por(a_alias[2], b_alias[2]);
1344 r_alias[3] =
por(a_alias[3], b_alias[3]);
1350 const Packet4h2&
a,
const Packet4h2&
b) {
1352 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1353 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1354 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1355 r_alias[0] =
pxor(a_alias[0], b_alias[0]);
1356 r_alias[1] =
pxor(a_alias[1], b_alias[1]);
1357 r_alias[2] =
pxor(a_alias[2], b_alias[2]);
1358 r_alias[3] =
pxor(a_alias[3], b_alias[3]);
1364 pandnot<Packet4h2>(
const Packet4h2&
a,
const Packet4h2&
b) {
1366 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1367 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1368 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1369 r_alias[0] =
pandnot(a_alias[0], b_alias[0]);
1370 r_alias[1] =
pandnot(a_alias[1], b_alias[1]);
1371 r_alias[2] =
pandnot(a_alias[2], b_alias[2]);
1372 r_alias[3] =
pandnot(a_alias[3], b_alias[3]);
1378 const Packet4h2&
a,
const Packet4h2&
b) {
1380 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1381 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1382 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1383 r_alias[0] =
padd(a_alias[0], b_alias[0]);
1384 r_alias[1] =
padd(a_alias[1], b_alias[1]);
1385 r_alias[2] =
padd(a_alias[2], b_alias[2]);
1386 r_alias[3] =
padd(a_alias[3], b_alias[3]);
1392 const Packet4h2&
a,
const Packet4h2&
b) {
1394 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1395 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1396 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1397 r_alias[0] =
psub(a_alias[0], b_alias[0]);
1398 r_alias[1] =
psub(a_alias[1], b_alias[1]);
1399 r_alias[2] =
psub(a_alias[2], b_alias[2]);
1400 r_alias[3] =
psub(a_alias[3], b_alias[3]);
1407 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1408 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1409 r_alias[0] =
pnegate(a_alias[0]);
1410 r_alias[1] =
pnegate(a_alias[1]);
1411 r_alias[2] =
pnegate(a_alias[2]);
1412 r_alias[3] =
pnegate(a_alias[3]);
1423 const Packet4h2&
a,
const Packet4h2&
b) {
1425 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1426 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1427 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1428 r_alias[0] =
pmul(a_alias[0], b_alias[0]);
1429 r_alias[1] =
pmul(a_alias[1], b_alias[1]);
1430 r_alias[2] =
pmul(a_alias[2], b_alias[2]);
1431 r_alias[3] =
pmul(a_alias[3], b_alias[3]);
1437 const Packet4h2&
a,
const Packet4h2&
b,
const Packet4h2&
c) {
1439 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1440 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1441 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1442 const half2* c_alias =
reinterpret_cast<const half2*
>(&
c);
1443 r_alias[0] =
pmadd(a_alias[0], b_alias[0], c_alias[0]);
1444 r_alias[1] =
pmadd(a_alias[1], b_alias[1], c_alias[1]);
1445 r_alias[2] =
pmadd(a_alias[2], b_alias[2], c_alias[2]);
1446 r_alias[3] =
pmadd(a_alias[3], b_alias[3], c_alias[3]);
1452 const Packet4h2&
a,
const Packet4h2&
b) {
1454 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1455 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1456 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1457 r_alias[0] =
pdiv(a_alias[0], b_alias[0]);
1458 r_alias[1] =
pdiv(a_alias[1], b_alias[1]);
1459 r_alias[2] =
pdiv(a_alias[2], b_alias[2]);
1460 r_alias[3] =
pdiv(a_alias[3], b_alias[3]);
1466 const Packet4h2&
a,
const Packet4h2&
b) {
1468 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1469 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1470 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1471 r_alias[0] =
pmin(a_alias[0], b_alias[0]);
1472 r_alias[1] =
pmin(a_alias[1], b_alias[1]);
1473 r_alias[2] =
pmin(a_alias[2], b_alias[2]);
1474 r_alias[3] =
pmin(a_alias[3], b_alias[3]);
1480 const Packet4h2&
a,
const Packet4h2&
b) {
1482 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1483 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1484 const half2* b_alias =
reinterpret_cast<const half2*
>(&
b);
1485 r_alias[0] =
pmax(a_alias[0], b_alias[0]);
1486 r_alias[1] =
pmax(a_alias[1], b_alias[1]);
1487 r_alias[2] =
pmax(a_alias[2], b_alias[2]);
1488 r_alias[3] =
pmax(a_alias[3], b_alias[3]);
1494 const Packet4h2&
a) {
1495 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1503 const Packet4h2&
a) {
1504 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1505 half2 m0 = __halves2half2(
predux_max(a_alias[0]),
1511 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1512 return (__hgt(
first, second) ?
first : second);
1514 float ffirst = __half2float(
first);
1515 float fsecond = __half2float(second);
1516 return (ffirst > fsecond)?
first: second;
1522 const Packet4h2&
a) {
1523 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1524 half2 m0 = __halves2half2(
predux_min(a_alias[0]),
1530 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1531 return (__hlt(
first, second) ?
first : second);
1533 float ffirst = __half2float(
first);
1534 float fsecond = __half2float(second);
1535 return (ffirst < fsecond)?
first: second;
1542 const Packet4h2&
a) {
1543 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1545 pmul(a_alias[2], a_alias[3])));
1550 plog1p<Packet4h2>(
const Packet4h2&
a) {
1552 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1553 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1554 r_alias[0] =
plog1p(a_alias[0]);
1555 r_alias[1] =
plog1p(a_alias[1]);
1556 r_alias[2] =
plog1p(a_alias[2]);
1557 r_alias[3] =
plog1p(a_alias[3]);
1563 pexpm1<Packet4h2>(
const Packet4h2&
a) {
1565 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1566 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1567 r_alias[0] =
pexpm1(a_alias[0]);
1568 r_alias[1] =
pexpm1(a_alias[1]);
1569 r_alias[2] =
pexpm1(a_alias[2]);
1570 r_alias[3] =
pexpm1(a_alias[3]);
1575 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(
const Packet4h2&
a) {
1577 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1578 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1579 r_alias[0] =
plog(a_alias[0]);
1580 r_alias[1] =
plog(a_alias[1]);
1581 r_alias[2] =
plog(a_alias[2]);
1582 r_alias[3] =
plog(a_alias[3]);
1587 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(
const Packet4h2&
a) {
1589 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1590 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1591 r_alias[0] =
pexp(a_alias[0]);
1592 r_alias[1] =
pexp(a_alias[1]);
1593 r_alias[2] =
pexp(a_alias[2]);
1594 r_alias[3] =
pexp(a_alias[3]);
1599 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(
const Packet4h2&
a) {
1601 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1602 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1603 r_alias[0] =
psqrt(a_alias[0]);
1604 r_alias[1] =
psqrt(a_alias[1]);
1605 r_alias[2] =
psqrt(a_alias[2]);
1606 r_alias[3] =
psqrt(a_alias[3]);
1612 prsqrt<Packet4h2>(
const Packet4h2&
a) {
1614 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1615 const half2* a_alias =
reinterpret_cast<const half2*
>(&
a);
1616 r_alias[0] =
prsqrt(a_alias[0]);
1617 r_alias[1] =
prsqrt(a_alias[1]);
1618 r_alias[2] =
prsqrt(a_alias[2]);
1619 r_alias[3] =
prsqrt(a_alias[3]);
1628 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1629 return __hadd2(
a,
b);
1631 float a1 = __low2float(
a);
1632 float a2 = __high2float(
a);
1633 float b1 = __low2float(
b);
1634 float b2 = __high2float(
b);
1637 return __floats2half2_rn(r1, r2);
1644 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1645 return __hmul2(
a,
b);
1647 float a1 = __low2float(
a);
1648 float a2 = __high2float(
a);
1649 float b1 = __low2float(
b);
1650 float b2 = __high2float(
b);
1653 return __floats2half2_rn(r1, r2);
1660 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1661 return __h2div(
a,
b);
1663 float a1 = __low2float(
a);
1664 float a2 = __high2float(
a);
1665 float b1 = __low2float(
b);
1666 float b2 = __high2float(
b);
1669 return __floats2half2_rn(r1, r2);
1676 float a1 = __low2float(
a);
1677 float a2 = __high2float(
a);
1678 float b1 = __low2float(
b);
1679 float b2 = __high2float(
b);
1680 __half r1 = a1 < b1 ? __low2half(
a) : __low2half(
b);
1681 __half r2 = a2 < b2 ? __high2half(
a) : __high2half(
b);
1682 return __halves2half2(r1, r2);
1688 float a1 = __low2float(
a);
1689 float a2 = __high2float(
a);
1690 float b1 = __low2float(
b);
1691 float b2 = __high2float(
b);
1692 __half r1 = a1 > b1 ? __low2half(
a) : __low2half(
b);
1693 __half r2 = a2 > b2 ? __high2half(
a) : __high2half(
b);
1694 return __halves2half2(r1, r2);
1699 #undef EIGEN_GPU_HAS_LDG
1700 #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
1701 #undef EIGEN_GPU_HAS_FP16_ARITHMETIC
#define EIGEN_ALWAYS_INLINE
#define EIGEN_DEVICE_FUNC
bfloat16 fmax(const bfloat16 &a, const bfloat16 &b)
bfloat16 fmin(const bfloat16 &a, const bfloat16 &b)
EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x)
Packet pmin(const Packet &a, const Packet &b)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexpm1(const Packet &a)
Packet padd(const Packet &a, const Packet &b)
Packet8f pzero(const Packet8f &)
void pstore(Scalar *to, const Packet &from)
void pstore< float >(float *to, const Packet4f &from)
unpacket_traits< Packet >::type predux(const Packet &a)
Packet8h ptrue(const Packet8h &a)
Packet ploadu(const typename unpacket_traits< Packet >::type *from)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog(const Packet &a)
Packet8h pandnot(const Packet8h &a, const Packet8h &b)
Packet4f pabs(const Packet4f &a)
Packet pmax(const Packet &a, const Packet &b)
Packet2cf pnegate(const Packet2cf &a)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp(const Packet &a)
Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
void pstoreu(Scalar *to, const Packet &from)
Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
bfloat16 pfirst(const Packet8bf &a)
void pstoreu< double >(double *to, const Packet4d &from)
Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
Packet pmul(const Packet &a, const Packet &b)
void pscatter(Scalar *to, const Packet &from, Index stride, typename unpacket_traits< Packet >::mask_t umask)
void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Packet4f psqrt(const Packet4f &a)
Packet psub(const Packet &a, const Packet &b)
Packet pgather(const Packet &src, const Scalar *from, Index stride, typename unpacket_traits< Packet >::mask_t umask)
unpacket_traits< Packet >::type predux_mul(const Packet &a)
Packet8h pand(const Packet8h &a, const Packet8h &b)
EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog1p(const Packet &a)
void pstoreu< float >(float *to, const Packet4f &from)
EIGEN_CONSTEXPR Index first(const T &x) EIGEN_NOEXCEPT
Packet8h pxor(const Packet8h &a, const Packet8h &b)
Packet ploaddup(const typename unpacket_traits< Packet >::type *from)
Packet pdiv(const Packet &a, const Packet &b)
unpacket_traits< Packet >::type predux_max(const Packet &a)
Packet2cf pconj(const Packet2cf &a)
Packet plset(const typename unpacket_traits< Packet >::type &a)
Packet8h por(const Packet8h &a, const Packet8h &b)
Packet4i pcmp_lt(const Packet4i &a, const Packet4i &b)
Packet pload(const typename unpacket_traits< Packet >::type *from)
unpacket_traits< Packet >::type predux_min(const Packet &a)
void pstore< double >(double *to, const Packet4d &from)
Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
Packet4f prsqrt(const Packet4f &a)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_floor_op< typename Derived::Scalar >, const Derived > floor(const Eigen::ArrayBase< Derived > &x)