#ifndef EIGEN_PACKET_MATH_SSE_H
#define EIGEN_PACKET_MATH_SSE_H

#include "../../InternalHeaderCheck.h"

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
// 8 registers on a 32-bit target, 16 on a 64-bit target.
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif

#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
// With this ABI version __m128 and __m256 mangle identically, so the 128-bit
// packets are wrapped to keep overloads distinct.
typedef eigen_packet_wrapper<__m128>  Packet4f;
typedef eigen_packet_wrapper<__m128d> Packet2d;
#else
typedef __m128  Packet4f;
typedef __m128d Packet2d;
#endif

typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
typedef eigen_packet_wrapper<__m128i, 4> Packet4ui;
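// Example (illustration only, not part of the original header): a Packet4f holds
// four floats in a single SSE register and is manipulated through the p* primitives
// defined in this file, e.g.
//   alignas(16) float data[4] = {1.f, 2.f, 3.f, 4.f};
//   Packet4f p = pload<Packet4f>(data);   // aligned load
//   p = padd(p, pset1<Packet4f>(1.f));    // lane-wise add of a broadcast value
//   pstore(data, p);                      // data is now {2, 3, 4, 5}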
template<> struct is_arithmetic<__m128>    { enum { value = true }; };
template<> struct is_arithmetic<__m128i>   { enum { value = true }; };
template<> struct is_arithmetic<__m128d>   { enum { value = true }; };
template<> struct is_arithmetic<Packet4i>  { enum { value = true }; };
template<> struct is_arithmetic<Packet4ui> { enum { value = false }; };
template<> struct is_arithmetic<Packet16b> { enum { value = true }; };
template<int p, int q, int r, int s>
struct shuffle_mask {
  enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) };
};
#define vec4f_swizzle1(v,p,q,r,s) \
  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))

#define vec4i_swizzle1(v,p,q,r,s) \
  Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))

#define vec4ui_swizzle1(v,p,q,r,s) \
  Packet4ui(vec4i_swizzle1(v,p,q,r,s))

#define vec2d_swizzle1(v,p,q) \
  Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))

#define vec4f_swizzle2(a,b,p,q,r,s) \
  Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))

#define vec4i_swizzle2(a,b,p,q,r,s) \
  Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))

#define vec4ui_swizzle2(a,b,p,q,r,s) \
  Packet4ui(vec4i_swizzle2(a,b,p,q,r,s))

#define vec4f_duplane(a,p) \
  vec4f_swizzle2(a,a,p,p,p,p)

#define vec2d_swizzle2(a,b,mask) \
  Packet2d(_mm_shuffle_pd(a,b,mask))

#define vec2d_duplane(a,p) \
  vec2d_swizzle2(a,a,(p<<1)|p)
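// Example (illustration only): with shuffle_mask's convention, lane i of the result
// comes from the input lane named by the i-th index, so for a Packet4f v = {v0, v1, v2, v3}:
//   vec4f_swizzle1(v, 3, 2, 1, 0)   // {v3, v2, v1, v0} -- lane reversal
//   vec4f_duplane(v, 2)             // {v2, v2, v2, v2} -- broadcast lane 2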
#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
  const Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)

#define EIGEN_DECLARE_CONST_Packet4ui(NAME,X) \
  const Packet4ui p4ui_##NAME = pset1<Packet4ui>(X)
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<float> : default_packet_traits {
#ifdef EIGEN_VECTORIZE_SSE4_1

template<> struct packet_traits<double> : default_packet_traits {
#ifdef EIGEN_VECTORIZE_SSE4_1

template<> struct packet_traits<int> : default_packet_traits

template<> struct packet_traits<uint32_t> : default_packet_traits

template<> struct packet_traits<bool> : default_packet_traits
template<> struct unpacket_traits<Packet4f> {
  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet2d> {
  enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet4i> {
  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet4ui> {
  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet16b> {
  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
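// Example (illustration only): generic code maps scalar types to packets and back
// through these traits, e.g.
//   typedef packet_traits<float>::type PacketF;                     // Packet4f on SSE
//   static_assert(unpacket_traits<PacketF>::size == 4, "4 float lanes");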
#ifndef EIGEN_VECTORIZE_AVX
template<> struct scalar_div_cost<float,true>  { enum { value = 7 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
#endif
#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)

template<typename Packet> EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
// paddsub (Packet4f): per-lane alternating subtract/add, i.e. {a0-b0, a1+b1, a2-b2, a3+b3}.
#ifdef EIGEN_VECTORIZE_SSE3
  return _mm_addsub_ps(a,b);
#else
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0));

// paddsub (Packet2d): {a0-b0, a1+b1}.
#ifdef EIGEN_VECTORIZE_SSE3
  return _mm_addsub_pd(a,b);
#else
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0));

// pnegate: flip the sign bit of every lane.
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
  return _mm_xor_ps(a,mask);

  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
  return _mm_xor_pd(a,mask);
// pmul (Packet4i / Packet4ui): 32-bit lane-wise multiply (SSE4.1 provides it directly).
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a,b);

#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a,b);

// pdiv (Packet4i): divide by converting lane pairs to double, dividing, and truncating back.
#ifdef EIGEN_VECTORIZE_AVX
  return _mm256_cvttpd_epi32(_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
#else
  __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
  __m128i q_hi = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)),
                                             _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
  return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
#endif
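// Example (illustration only): the double-precision round trip above produces
// C-style truncated integer division, e.g. dividing {7, -7, 8, -8} by {2, 2, 3, 3}
// with pdiv<Packet4i> yields {3, -3, 2, -2}.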
#ifdef EIGEN_VECTORIZE_FMA

template<typename Packet> EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
// pselect: lane-wise select, returning bits of `a` where `mask` is set and bits of `b` elsewhere.
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_ps(b,a,mask);

  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));

  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));

  return _mm_blendv_epi8(b,a,mask);

  return _mm_or_si128(a_part, b_part);

// ptrue: all bits set (x == x holds for every integer lane).
  return _mm_castsi128_ps(_mm_cmpeq_epi32(b,b));

  return _mm_castsi128_pd(_mm_cmpeq_epi32(b,b));
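// Example (illustration only): with full-lane comparison masks, pselect acts as a
// lane-wise ternary operator:
//   Packet4f m = pcmp_le(x, y);     // all-ones lanes where x <= y
//   Packet4f r = pselect(m, x, y);  // lane-wise min(x, y) expressed via select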
// pmin (Packet4f). The inline asm below works around an optimization bug in old
// GCC (< 6.3) that could swap the operands of _mm_min_ps.
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
#ifdef EIGEN_VECTORIZE_AVX
  Packet4f res;
  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
#else
  Packet4f res = b;
  asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
#endif
  return res;
#else
  // Arguments are swapped to match the NaN-propagation behavior of std::min.
  return _mm_min_ps(b, a);
#endif
// pmin (Packet2d): same GCC workaround as above.
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
#ifdef EIGEN_VECTORIZE_AVX
  Packet2d res;
  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
#else
  Packet2d res = b;
  asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
#endif
  return res;
#else
  // Arguments are swapped to match the NaN-propagation behavior of std::min.
  return _mm_min_pd(b, a);
#endif
// pmin (Packet4i / Packet4ui).
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epi32(a,b);
#else
  // SSE2 fallback: build a lane mask from the comparison and pick accordingly.
  Packet4i mask = _mm_cmplt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif

#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epu32(a,b);
// pmax (Packet4f): same GCC workaround as for pmin.
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
#ifdef EIGEN_VECTORIZE_AVX
  Packet4f res;
  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
#else
  Packet4f res = b;
  asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
#endif
  return res;
#else
  // Arguments are swapped to match the NaN-propagation behavior of std::max.
  return _mm_max_ps(b, a);
#endif
// pmax (Packet2d).
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
#ifdef EIGEN_VECTORIZE_AVX
  Packet2d res;
  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
#else
  Packet2d res = b;
  asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
#endif
  return res;
#else
  // Arguments are swapped to match the NaN-propagation behavior of std::max.
  return _mm_max_pd(b, a);
#endif
// pmax (Packet4i / Packet4ui).
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epi32(a,b);
#else
  Packet4i mask = _mm_cmpgt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif

#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epu32(a,b);
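// Example (illustration only): the swapped operand order _mm_min_ps(b, a) /
// _mm_max_ps(b, a) matters for NaN inputs. The SSE min/max instructions return their
// second operand when either input is NaN, so pmin(a, b) returns `a` in that case,
// which reproduces std::min's behavior of returning its first argument whenever the
// comparison is false.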
#ifdef EIGEN_VECTORIZE_SSE4_1

#ifdef EIGEN_VECTORIZE_SSE4_1

// pminmax_propagate_numbers: if one input is NaN, the result is the other (non-NaN) input.
template <typename Packet, typename Op>
  return pselect<Packet>(not_nan_mask_a, m, b);

// pminmax_propagate_nan: if either input is NaN, the result is NaN.
template <typename Packet, typename Op>
  return pselect<Packet>(not_nan_mask_a, m, a);
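// Example (illustration only): these wrappers implement the two NaN policies exposed
// as pmin<PropagateNumbers, Packet4f> and pmin<PropagateNaN, Packet4f>: with inputs
// {NaN, 2.f} the former returns 2.f (numbers win), the latter returns NaN.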
// pabs: clear the sign bit of each lane.
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm_and_ps(a,mask);

  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm_and_pd(a,mask);

// pabs (Packet4i).
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
#else
  Packet4i aux = _mm_srai_epi32(a,31);
  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
#endif
template<> EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31)); }

// psignbit (Packet2d): reuse the float version and replicate the high word of each double.
  Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
#ifdef EIGEN_VECTORIZE_AVX
  return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
#else
  return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
#endif
// pround: with SSE4.1, bias `a` by the largest value below 0.5 carrying the sign of `a`,
// then round toward zero; this rounds halfway cases away from zero.
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);

  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);

// print: round using the current rounding mode.
template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
// pfloor / pceil fallbacks (without SSE4.1): round first, then correct by one
// where the rounded value overshot (floor) or undershot (ceil).
  mask = pand(mask, cst_1);
  return psub(tmp, mask);

  mask = pand(mask, cst_1);
  return psub(tmp, mask);

  mask = pand(mask, cst_1);
  return padd(tmp, mask);

  mask = pand(mask, cst_1);
  return padd(tmp, mask);
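// Example (illustration only): pround rounds halfway cases away from zero to match
// std::round (2.5f becomes 3.f, -2.5f becomes -3.f), while print uses the current
// rounding mode (ties to even by default); the fallbacks above derive pfloor/pceil
// from a rounded value plus a correction of one.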
// ploadu: unaligned loads for Packet4f/Packet2d and the three integer packets.
  return _mm_loadu_ps(from);

  return _mm_loadu_ps(from);

  return _mm_loadu_pd(from);

  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));

  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));

  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
template<typename Packet> EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);

template<typename Packet> EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
// ploaddup: load size/2 values and duplicate each one into adjacent lanes.
  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);

  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));

  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));

  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
  return _mm_unpacklo_epi8(tmp, tmp);

// ploadquad (Packet16b): load four bools and replicate each one four times.
  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
  tmp = _mm_unpacklo_epi8(tmp, tmp);
  return _mm_unpacklo_epi16(tmp, tmp);
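// Example (illustration only): ploaddup<Packet4f> reads two floats {x, y} and returns
// {x, x, y, y}; ploadquad<Packet16b> reads four bools and repeats each one four times
// to fill the 16-lane packet.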
template<typename Scalar, typename Packet> EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);

template<typename Scalar, typename Packet> EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
// pgather: strided loads assembled with _mm_set_*.
  return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);

  return _mm_set_pd(from[1*stride], from[0*stride]);

  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);

  return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
                       numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));

  return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride],
                      from[11*stride], from[10*stride], from[9*stride], from[8*stride],
                      from[7*stride], from[6*stride], from[5*stride], from[4*stride],
                      from[3*stride], from[2*stride], from[1*stride], from[0*stride]);

// pscatter: strided stores, extracting one lane at a time.
  to[stride*0] = _mm_cvtss_f32(from);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));

  to[stride*0] = _mm_cvtsd_f64(from);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));

  to[stride*0] = _mm_cvtsi128_si32(from);
  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));

  to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
  to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
  to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
  to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));

  to[4*stride*0] = _mm_cvtsi128_si32(from);
  to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
  to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
  to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
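// Example (illustration only, `data` and `stride` are hypothetical): pgather/pscatter
// move lanes to and from strided memory locations:
//   Packet4f p = pgather<float, Packet4f>(data, stride);  // lanes = data[0], data[stride], ...
//   pscatter<float, Packet4f>(data, p, stride);           // strided write-back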
#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900

#ifndef EIGEN_VECTORIZE_AVX

#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64

#elif EIGEN_COMP_MSVC_STRICT
// preverse (Packet16b): reverse the 16 bytes.
#ifdef EIGEN_VECTORIZE_SSSE3
  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return _mm_shuffle_epi8(a, mask);
#else
  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
#endif
// pfrexp (Packet2d): extract the biased exponent field (bits 62..52).
  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);

// pldexp (Packet2d): scale by 2^e, adding the exponent into the IEEE exponent
// field in several steps to extend the supported range.
  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
  Packet4i b = parithmetic_shift_right<2>(ei);
  Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));

  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));
// pbroadcast4: load four consecutive scalars and broadcast each one into its own packet.
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)

template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
#ifdef EIGEN_VECTORIZE_SSE3
  a0 = _mm_loaddup_pd(a+0);
  a1 = _mm_loaddup_pd(a+1);
  a2 = _mm_loaddup_pd(a+2);
  a3 = _mm_loaddup_pd(a+3);

// punpackp: spread the four lanes of vecs[0] into four broadcast packets vecs[0..3].
  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
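// Example (illustration only): for float a[4] = {1, 2, 3, 4},
// pbroadcast4<Packet4f>(a, a0, a1, a2, a3) yields a0 = {1,1,1,1}, a1 = {2,2,2,2},
// a2 = {3,3,3,3} and a3 = {4,4,4,4}.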
#ifdef EIGEN_VECTORIZE_SSSE3

// predux (Packet4i / Packet4ui): sum the lanes by folding the upper half onto the lower half.
  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));

  Packet4ui tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));

// predux (Packet16b): boolean reduction via OR.
  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));

// predux_mul: product of all lanes, finished in scalar code.
  return (aux[0] * aux[1]) * (aux[2] * aux[3]);

  return (aux[0] * aux[1]) * (aux[2] * aux[3]);

// predux_mul (Packet16b): boolean reduction via AND.
  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a));
// predux_min / predux_max: with SSE4.1 fold with packed min/max, otherwise finish
// the reduction with scalar comparisons.
#ifdef EIGEN_VECTORIZE_SSE4_1
  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
#else
  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
  return aux0<aux2 ? aux0 : aux2;
#endif

#ifdef EIGEN_VECTORIZE_SSE4_1
  Packet4ui tmp = _mm_min_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
#else
  uint32_t aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
  uint32_t aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
  return aux0<aux2 ? aux0 : aux2;
#endif

#ifdef EIGEN_VECTORIZE_SSE4_1
  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
#else
  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
  return aux0>aux2 ? aux0 : aux2;
#endif

#ifdef EIGEN_VECTORIZE_SSE4_1
  Packet4ui tmp = _mm_max_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
#else
  uint32_t aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
  uint32_t aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
  return aux0>aux2 ? aux0 : aux2;
#endif
// predux_any: true if any lane has its most-significant (mask) bit set.
  return _mm_movemask_ps(x) != 0x0;

  return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;

  return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
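// Example (illustration only): the reductions above fold a packet into one scalar;
// for a Packet4i holding {1, 2, 3, 4}:
//   predux(a)     == 10   // sum
//   predux_mul(a) == 24   // product
//   predux_min(a) == 1
//   predux_max(a) == 4
// and predux_any(m) is true if any lane of the mask m has its sign bit set.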
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
}

ptranspose(PacketBlock<Packet2d,2>& kernel) {
  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[1] = tmp;
}

ptranspose(PacketBlock<Packet4i,4>& kernel) {
  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);

  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}

  ptranspose((PacketBlock<Packet4i, 4>&)kernel);

  __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
  __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
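// Example (illustration only): ptranspose works in place on a PacketBlock, so if a
// PacketBlock<Packet4f,4> initially holds the four rows of a 4x4 tile, its packets
// afterwards hold the four columns.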
  __m128i t0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i t1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i t2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
  __m128i t3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
  __m128i t4 = _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]);
  __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]);
  __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
  __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
  __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
  __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
  __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
  __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
  __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
  __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
  __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
  __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);

  __m128i s0 = _mm_unpacklo_epi16(t0, t2);
  __m128i s1 = _mm_unpackhi_epi16(t0, t2);
  __m128i s2 = _mm_unpacklo_epi16(t1, t3);
  __m128i s3 = _mm_unpackhi_epi16(t1, t3);
  __m128i s4 = _mm_unpacklo_epi16(t4, t6);
  __m128i s5 = _mm_unpackhi_epi16(t4, t6);
  __m128i s6 = _mm_unpacklo_epi16(t5, t7);
  __m128i s7 = _mm_unpackhi_epi16(t5, t7);
  __m128i s8 = _mm_unpacklo_epi16(t8, ta);
  __m128i s9 = _mm_unpackhi_epi16(t8, ta);
  __m128i sa = _mm_unpacklo_epi16(t9, tb);
  __m128i sb = _mm_unpackhi_epi16(t9, tb);
  __m128i sc = _mm_unpacklo_epi16(tc, te);
  __m128i sd = _mm_unpackhi_epi16(tc, te);
  __m128i se = _mm_unpacklo_epi16(td, tf);
  __m128i sf = _mm_unpackhi_epi16(td, tf);

  __m128i u0 = _mm_unpacklo_epi32(s0, s4);
  __m128i u1 = _mm_unpackhi_epi32(s0, s4);
  __m128i u2 = _mm_unpacklo_epi32(s1, s5);
  __m128i u3 = _mm_unpackhi_epi32(s1, s5);
  __m128i u4 = _mm_unpacklo_epi32(s2, s6);
  __m128i u5 = _mm_unpackhi_epi32(s2, s6);
  __m128i u6 = _mm_unpacklo_epi32(s3, s7);
  __m128i u7 = _mm_unpackhi_epi32(s3, s7);
  __m128i u8 = _mm_unpacklo_epi32(s8, sc);
  __m128i u9 = _mm_unpackhi_epi32(s8, sc);
  __m128i ua = _mm_unpacklo_epi32(s9, sd);
  __m128i ub = _mm_unpackhi_epi32(s9, sd);
  __m128i uc = _mm_unpacklo_epi32(sa, se);
  __m128i ud = _mm_unpackhi_epi32(sa, se);
  __m128i ue = _mm_unpacklo_epi32(sb, sf);
  __m128i uf = _mm_unpackhi_epi32(sb, sf);

  kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
  kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
  kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
  kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
  kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
  kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
  kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
  kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
  kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
  kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
  kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
  kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
  kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
  kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
  kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
  kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
// pblend: select whole lanes from `thenPacket` where ifPacket.select[i] is non-zero,
// and from `elsePacket` otherwise.
  const __m128i zero = _mm_setzero_si128();
  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
#endif

  const __m128 zero = _mm_setzero_ps();
  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m128 false_mask = _mm_cmpeq_ps(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
#endif

  const __m128d zero = _mm_setzero_pd();
  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
  __m128d false_mask = _mm_cmpeq_pd(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
#endif
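// Example (illustration only): pblend picks whole lanes according to a runtime Selector:
//   Selector<4> s; s.select[0] = 1; s.select[1] = 0; s.select[2] = 1; s.select[3] = 0;
//   Packet4f r = pblend(s, t, e);   // lanes 0 and 2 from t, lanes 1 and 3 from e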
#ifdef EIGEN_VECTORIZE_FMA
// Scalar fused multiply-add overloads forwarding to the C99 fma routines.
template<> EIGEN_STRONG_INLINE float  pmadd(const float& a, const float& b, const float& c) { return ::fmaf(a, b, c); }
template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) { return ::fma(a, b, c); }
template<> EIGEN_STRONG_INLINE float  pmsub(const float& a, const float& b, const float& c) { return ::fmaf(a, b, -c); }
template<> EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) { return ::fma(a, b, -c); }
template<> EIGEN_STRONG_INLINE float  pnmadd(const float& a, const float& b, const float& c) { return ::fmaf(-a, b, c); }
template<> EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) { return ::fma(-a, b, c); }
template<> EIGEN_STRONG_INLINE float  pnmsub(const float& a, const float& b, const float& c) { return ::fmaf(-a, b, -c); }
template<> EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) { return ::fma(-a, b, -c); }
#endif
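// Example (illustration only): these overloads route Eigen's scalar fused operations
// to the C99 fma routines, so each computes its result with a single rounding, e.g.
// pmadd(2.0, 3.0, 1.0) == 7.0, pmsub(2.0, 3.0, 1.0) == 5.0, pnmadd(2.0, 3.0, 1.0) == -5.0.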
#ifdef EIGEN_VECTORIZE_SSE4_1

// half2floatsse: convert four fp16 values (in the low 64 bits of h) to single
// precision without F16C, handling inf/NaN and subnormal inputs explicitly.
EIGEN_STRONG_INLINE __m128i half2floatsse(__m128i h) {
  __m128i input = _mm_cvtepu16_epi32(h);

  // Shift exponent and mantissa into their float positions, dropping the sign.
  __m128i shifted_exp = _mm_set1_epi32(0x7c00 << 13);
  __m128i ou = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x7fff)), 13);
  __m128i exp = _mm_and_si128(ou, shifted_exp);
  // Rebias the exponent from fp16 to fp32.
  ou = _mm_add_epi32(ou, _mm_set1_epi32((127 - 15) << 23));

  // Inf/NaN inputs (fp16 exponent all ones) need an extra exponent adjustment.
  __m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp);
  __m128i naninf_adj =
      _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
  ou = _mm_add_epi32(ou, naninf_adj);

  // Zero and subnormal inputs are renormalized with a float subtraction trick.
  __m128i zeroden_mask = _mm_cmpeq_epi32(exp, _mm_setzero_si128());
  __m128i zeroden_adj = _mm_and_si128(zeroden_mask, _mm_set1_epi32(1 << 23));
  ou = _mm_add_epi32(ou, zeroden_adj);
  __m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23));
  ou = _mm_castps_si128(
      _mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));

  // Reinstate the sign bit.
  __m128i sign =
      _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
  ou = _mm_or_si128(ou, sign);

// float2half: convert four floats to fp16 (round to nearest even) without F16C.
EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
  __m128i o = _mm_setzero_si128();

  // Remember the signs and work on absolute values.
  __m128i sign = _mm_set1_epi32(0x80000000u);
  sign = _mm_and_si128(sign, _mm_castps_si128(f));
  f = _mm_xor_ps(f, _mm_castsi128_ps(sign));

  __m128i fu = _mm_castps_si128(f);

  __m128i f16max = _mm_set1_epi32((127 + 16) << 23);
  __m128i f32infty = _mm_set1_epi32(255 << 23);
  // Classify values that do not fit in fp16 by comparing raw bit patterns.
  __m128i infnan_mask = _mm_cmplt_epi32(f16max, _mm_castps_si128(f));
  __m128i inf_mask = _mm_cmpgt_epi32(_mm_castps_si128(f), f32infty);
  __m128i nan_mask = _mm_andnot_si128(inf_mask, infnan_mask);
  __m128i inf_value = _mm_and_si128(inf_mask, _mm_set1_epi32(0x7e00));
  __m128i nan_value = _mm_and_si128(nan_mask, _mm_set1_epi32(0x7c00));
  // Combine the inf/NaN bit patterns.
  __m128i naninf_value = _mm_or_si128(inf_value, nan_value);

  // Subnormal outputs are produced via a float addition with a "denorm magic" constant.
  __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
  __m128i subnorm_mask =
      _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
  f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
  o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic);
  o = _mm_and_si128(o, subnorm_mask);
  o = _mm_or_si128(o, naninf_value);

  __m128i mask = _mm_or_si128(infnan_mask, subnorm_mask);
  o = _mm_and_si128(o, mask);

  // Normal outputs: rebias the exponent and shift the mantissa with round-to-nearest-even.
  __m128i mand_odd = _mm_and_si128(_mm_srli_epi32(fu, 13), _mm_set1_epi32(0x1));
  fu = _mm_add_epi32(fu, _mm_set1_epi32(0xc8000fffU));
  fu = _mm_add_epi32(fu, mand_odd);
  fu = _mm_andnot_si128(mask, fu);
  fu = _mm_srli_epi32(fu, 13);
  o = _mm_or_si128(fu, o);

  // Put the signs back.
  o = _mm_or_si128(o, _mm_srli_epi32(sign, 16));

  // Keep only the low 16 bits of each 32-bit lane.
  return _mm_and_si128(o, _mm_set1_epi32(0xffff));
}
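// Example (illustration only; the exact internal names may differ between Eigen
// versions): the scalar counterparts of the two helpers above live in Eigen::half_impl
// (float_to_half_rtne / half_to_float), and a round trip such as float(Eigen::half(1.5f))
// is exact for every value representable in fp16.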
template<> struct is_arithmetic<Packet4h> { enum { value = true }; };

template<>
struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet4h type;
  typedef Packet4h half;
    AlignedOnScalar = 1,

template<> struct unpacket_traits<Packet4h> {
  typedef Eigen::half type;
  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
  typedef Packet4h half;
};
template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
  Packet4h result;
  result.x = _mm_set1_pi16(from.x);
  return result;
}

template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {

template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);

  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);

template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);

  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);

template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);

  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);

template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);

  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
  Packet4h result;
  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
  return result;
}

template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
  Packet4h result;
  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
  return result;
}

template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
  __int64_t r = _mm_cvtm64_si64(from.x);
  *(reinterpret_cast<__int64_t*>(to)) = r;
}

template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
  __int64_t r = _mm_cvtm64_si64(from.x);
  *(reinterpret_cast<__int64_t*>(to)) = r;
}
template<> EIGEN_STRONG_INLINE Packet4h
  return pset1<Packet4h>(*from);
template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
{
  Packet4h result;
  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
  return result;
}

template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
{
  __int64_t a = _mm_cvtm64_si64(from.x);
  to[stride*0].x = static_cast<unsigned short>(a);
  to[stride*1].x = static_cast<unsigned short>(a >> 16);
  to[stride*2].x = static_cast<unsigned short>(a >> 32);
  to[stride*3].x = static_cast<unsigned short>(a >> 48);
}
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet4h,4>& kernel) {
  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);

  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
}
#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
// PGI++ does not define the following cast intrinsics.
static inline __m128  _mm_castpd_ps   (__m128d x) { return reinterpret_cast<__m128&>(x);  }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128d _mm_castps_pd   (__m128  x) { return reinterpret_cast<__m128d&>(x); }
static inline __m128i _mm_castps_si128(__m128  x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128  _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x);  }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif