SSE/PacketMath.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_SSE_H
#define EIGEN_PACKET_MATH_SSE_H

#include <cstdint>
#include "../../InternalHeaderCheck.h"

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
// 32 bits => 8 registers
// 64 bits => 16 registers
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif

#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
// With GCC's default ABI version, __m128 and __m256 are the same type, and therefore we
// cannot have overloads for both types without a linking error.
// One solution is to increase the ABI version using -fabi-version=4 (or greater).
// Otherwise, we work around this inconvenience by wrapping the 128-bit types in the
// following helper structure:
typedef eigen_packet_wrapper<__m128> Packet4f;
typedef eigen_packet_wrapper<__m128d> Packet2d;
#else
typedef __m128 Packet4f;
typedef __m128d Packet2d;
#endif

typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
typedef eigen_packet_wrapper<__m128i, 4> Packet4ui;

template<> struct is_arithmetic<__m128> { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
template<> struct is_arithmetic<__m128d> { enum { value = true }; };
template<> struct is_arithmetic<Packet4i> { enum { value = true }; };
// Note that `Packet4ui` uses the underlying type `__m128i`, which is
// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
// operations used in `GenericPacketMath.h`.
template<> struct is_arithmetic<Packet4ui> { enum { value = false }; };
template<> struct is_arithmetic<Packet16b> { enum { value = true }; };

template<int p, int q, int r, int s>
struct shuffle_mask {
  enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) };
};
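
// Illustrative check added for clarity (not part of the original source), assuming
// C++11 static_assert is available: with p=0, q=1, r=2, s=3 the encoding above
// reproduces the identity shuffle immediate _MM_SHUFFLE(3, 2, 1, 0) == 0xE4.
static_assert(shuffle_mask<0, 1, 2, 3>::mask == 0xE4,
              "shuffle_mask packs p,q,r,s into an SSE shuffle immediate");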

// TODO: change the implementation of all swizzle* ops from macros to templates.
#define vec4f_swizzle1(v,p,q,r,s) \
  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))

#define vec4i_swizzle1(v,p,q,r,s) \
  Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))

#define vec4ui_swizzle1(v, p, q, r, s) \
  Packet4ui(vec4i_swizzle1(v,p,q,r,s))

#define vec2d_swizzle1(v,p,q) \
  Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))

#define vec4f_swizzle2(a,b,p,q,r,s) \
  Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))

#define vec4i_swizzle2(a,b,p,q,r,s) \
  Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))

// Note: wraps in Packet4ui (the listing said Packet4i, which looks like a copy-paste slip;
// the unsigned pmul below relies on this macro yielding an unsigned packet).
#define vec4ui_swizzle2(a,b,p,q,r,s) \
  Packet4ui(vec4i_swizzle2(a,b,p,q,r,s))

EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
{
  return Packet4f(_mm_movelh_ps(a,b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
{
  return Packet4f(_mm_movehl_ps(a,b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
{
  return Packet4f(_mm_unpacklo_ps(a,b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
{
  return Packet4f(_mm_unpackhi_ps(a,b));
}
#define vec4f_duplane(a,p) \
  vec4f_swizzle2(a,a,p,p,p,p)

#define vec2d_swizzle2(a,b,mask) \
  Packet2d(_mm_shuffle_pd(a,b,mask))

EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b)
{
  return Packet2d(_mm_unpacklo_pd(a,b));
}
EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b)
{
  return Packet2d(_mm_unpackhi_pd(a,b));
}
#define vec2d_duplane(a,p) \
  vec2d_swizzle2(a,a,(p<<1)|p)

#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
  const Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)

#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = pset1<Packet4ui>(X)

// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template <>
struct packet_traits<float> : default_packet_traits {
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

    HasCmp = 1,
    HasDiv = 1,
    HasReciprocal = EIGEN_FAST_MATH,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasACos = 1,
    HasASin = 1,
    HasATan = 1,
    HasATanh = 1,
    HasLog = 1,
    HasLog1p = 1,
    HasExpm1 = 1,
    HasNdtri = 1,
    HasExp = 1,
    HasBessel = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasBlend = 1,
    HasCeil = 1,
    HasFloor = 1,
#ifdef EIGEN_VECTORIZE_SSE4_1
    HasRound = 1,
#endif
    HasRint = 1,
    HasSign = 0  // The manually vectorized version is slightly slower for SSE.
  };
};
template <>
struct packet_traits<double> : default_packet_traits {
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,

    HasCmp = 1,
    HasDiv = 1,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasATan = 1,
    HasBlend = 1,
    HasFloor = 1,
    HasCeil = 1,
#ifdef EIGEN_VECTORIZE_SSE4_1
    HasRound = 1,
#endif
    HasRint = 1
  };
};
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    HasCmp = 1,
    HasDiv = 1,
    size = 4,

    HasShift = 1,
    HasBlend = 1
  };
};
template<> struct packet_traits<uint32_t> : default_packet_traits
{
  typedef Packet4ui type;
  typedef Packet4ui half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

    HasDiv = 0,
    HasNegate = 0,
    HasSqrt = 0,
    HasCmp = 1,
    HasMin = 1,
    HasMax = 1,
    HasShift = 1,
    HasBlend = 1
  };
};
#endif
template<> struct packet_traits<bool> : default_packet_traits
{
  typedef Packet16b type;
  typedef Packet16b half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,

    HasAdd = 1,
    HasSub = 1,
    HasCmp = 1,  // note -- only pcmp_eq is defined
    HasShift = 0,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 0,
    HasMax = 0,
    HasConj = 0,
    HasSqrt = 1,
    HasSign = 0  // Don't try to vectorize psign<bool> = identity.
  };
};

template<> struct unpacket_traits<Packet4f> {
  typedef float type;
  typedef Packet4f half;
  typedef Packet4i integer_packet;
  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet2d> {
  typedef double type;
  typedef Packet2d half;
  enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet4i> {
  typedef int type;
  typedef Packet4i half;
  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet4ui> {
  typedef uint32_t type;
  typedef Packet4ui half;
  enum {size = 4, alignment = Aligned16, vectorizable = true, masked_load_available = false, masked_store_available = false};
};
template<> struct unpacket_traits<Packet16b> {
  typedef bool type;
  typedef Packet16b half;
  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};

#ifndef EIGEN_VECTORIZE_AVX
template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
#endif

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return _mm_set1_epi32(numext::bit_cast<int32_t>(from)); }
template<> EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) { return _mm_set1_epi8(static_cast<char>(from)); }

template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); }

template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); }
template<> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }
template<> EIGEN_STRONG_INLINE Packet4ui peven_mask(const Packet4ui& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }
template<> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); }

template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
template<> EIGEN_STRONG_INLINE Packet4ui pzero(const Packet4ui& /*a*/) { return _mm_setzero_si128(); }

// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using intrinsics for pset1 makes gcc generate poor code in some cases (see bug 203).
// Using inline assembly is also not an option because then gcc fails to properly reorder the instructions.
// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
// Also note that with AVX, we want it to generate a vbroadcastss.
#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
}
#endif

template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) { return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0)); }

template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_add_epi32(a, b); }

template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }

template<typename Packet> EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
template<> EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ss(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_sd(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_sub_epi32(a, b); }
template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef EIGEN_VECTORIZE_SSE3
  return _mm_addsub_ps(a,b);
#else
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0));
  return padd(a, pxor(mask, b));
#endif
}

template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b)
{
#ifdef EIGEN_VECTORIZE_SSE3
  return _mm_addsub_pd(a,b);
#else
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0));
  return padd(a, pxor(mask, b));
#endif
}
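
// Worked example (comment added for clarity, not in the original source): for
// a = {1, 2, 3, 4} and b = {10, 20, 30, 40}, the SSE2 fallbacks above flip the
// sign bit of b in the even lanes and then add, giving {-9, 22, -27, 44} for
// Packet4f: subtract in even lanes, add in odd lanes, matching _mm_addsub_ps.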

template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
{
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
  return _mm_xor_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
  return _mm_xor_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
{
  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
}

template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a)
{
  return psub(pset1<Packet16b>(false), a);
}

template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a,b);
#else
  // this version is slightly faster than 4 scalar products
  return vec4i_swizzle1(
            vec4i_swizzle2(
              _mm_mul_epu32(a,b),
              _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
                            vec4i_swizzle1(b,1,0,3,2)),
              0,2,0,2),
            0,2,1,3);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a,b);
#else
  // this version is slightly faster than 4 scalar products
  return vec4ui_swizzle1(
            vec4ui_swizzle2(
              _mm_mul_epu32(a,b),
              _mm_mul_epu32(vec4ui_swizzle1(a,1,0,3,2),
                            vec4ui_swizzle1(b,1,0,3,2)),
              0,2,0,2),
            0,2,1,3);
#endif
}
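
// How the SSE2 fallbacks above work (explanatory comment, not in the original
// source): _mm_mul_epu32 multiplies only lanes 0 and 2, producing two 64-bit
// products whose low 32 bits are the exact lane results. Swizzling a and b by
// (1,0,3,2) brings lanes 1 and 3 into the even positions for a second
// _mm_mul_epu32, and the final swizzles interleave the four low words back
// into order {a0*b0, a1*b1, a2*b2, a3*b3}.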

template<> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }

template <>
EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_AVX
  return _mm256_cvttpd_epi32(_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
#else
  __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
  __m128i q_hi = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)),
                                             _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
  return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
#endif
}
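
// Note on correctness (added explanation, not in the original source): the
// integer division above is emulated with double-precision division plus
// truncation. This is exact because every int32 is exactly representable as a
// double, and the true quotient always fits in int32 except for INT_MIN / -1,
// which overflows for scalar int32 division as well.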


// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) { return padd(pmul(a, b), c); }
#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmsub_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmsub_pd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fnmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fnmadd_pd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fnmsub_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fnmsub_pd(a,b,c); }

template<typename Packet> EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
template<> EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ss(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_sd(a,b,c); }
#endif

#ifdef EIGEN_VECTORIZE_SSE4_1
template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
  return _mm_blendv_ps(b,a,mask);
}

template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
}

template<> EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
}

template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { return _mm_blendv_pd(b,a,mask); }

template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
  return _mm_blendv_epi8(b,a,mask);
}
#else
template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
  Packet16b a_part = _mm_and_si128(mask, a);
  Packet16b b_part = _mm_andnot_si128(mask, b);
  return _mm_or_si128(a_part, b_part);
}
#endif
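
// The SSE2 fallback above implements the branchless select identity
// pselect(mask, a, b) == (mask & a) | (~mask & b): lanes where the mask is
// all-ones take a, lanes where it is all-zeros take b. (Explanatory comment,
// not part of the original source.)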

template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
template<> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); }
template<> EIGEN_STRONG_INLINE Packet4f ptrue<Packet4f>(const Packet4f& a) {
  Packet4i b = _mm_castps_si128(a);
  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
}
template<> EIGEN_STRONG_INLINE Packet2d ptrue<Packet2d>(const Packet2d& a) {
  Packet4i b = _mm_castpd_si128(a);
  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
}


template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_and_si128(a, b); }
template<> EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_or_si128(a, b); }
template<> EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_xor_si128(a, b); }
template<> EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_andnot_si128(b, a); }

template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) { return _mm_cmpeq_epi32(a, b); }
template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); }

template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
  Packet4f res;
  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
#else
  Packet4f res = b;
  asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
#endif
  return res;
#else
  // Arguments are reversed to match NaN propagation behavior of std::min.
  return _mm_min_ps(b, a);
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
  Packet2d res;
  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
#else
  Packet2d res = b;
  asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
#endif
  return res;
#else
  // Arguments are reversed to match NaN propagation behavior of std::min.
  return _mm_min_pd(b, a);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epi32(a,b);
#else
  // after some benchmarking, this version *is* faster than a scalar implementation
  Packet4i mask = _mm_cmplt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif
}
template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epu32(a, b);
#else
  return padd((Packet4ui)pmin((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                              (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
              pset1<Packet4ui>(0x80000000UL));
#endif
}
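
// The 0x80000000 bias in the SSE2 path above (added explanation, not in the
// original source) maps unsigned values onto signed ones while preserving
// order: x < y as uint32 iff x - 0x80000000 < y - 0x80000000 as int32, so the
// signed pmin can stand in for the missing unsigned one. The same trick is
// reused below for pmax and the unsigned comparisons.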


template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
  Packet4f res;
  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
#else
  Packet4f res = b;
  asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
#endif
  return res;
#else
  // Arguments are reversed to match NaN propagation behavior of std::max.
  return _mm_max_ps(b, a);
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
  Packet2d res;
  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
#else
  Packet2d res = b;
  asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
#endif
  return res;
#else
  // Arguments are reversed to match NaN propagation behavior of std::max.
  return _mm_max_pd(b, a);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epi32(a,b);
#else
  // after some benchmarking, this version *is* faster than a scalar implementation
  Packet4i mask = _mm_cmpgt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif
}
template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epu32(a, b);
#else
  return padd((Packet4ui)pmax((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                              (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
              pset1<Packet4ui>(0x80000000UL));
#endif
}

template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
#else
  return (Packet4ui)pcmp_lt((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                            (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
#endif
}
template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return pcmp_eq(a, pmin(a, b));
#else
  return (Packet4ui)pcmp_le((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                            (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
#endif
}

template <typename Packet, typename Op>
EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) {
  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
  // always return a if either a or b is NaN.
  Packet not_nan_mask_a = pcmp_eq(a, a);
  Packet m = op(a, b);
  return pselect<Packet>(not_nan_mask_a, m, b);
}

template <typename Packet, typename Op>
EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) {
  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
  // always return a if either a or b is NaN.
  Packet not_nan_mask_a = pcmp_eq(a, a);
  Packet m = op(b, a);
  return pselect<Packet>(not_nan_mask_a, m, a);
}
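
// Worked NaN cases for the helpers above (explanatory comment, not in the
// original source), with op = pmin: if a is NaN, propagate_numbers selects b
// while propagate_nan selects a (the NaN); if instead b is NaN, op(a, b)
// already returns a in the first helper, and the reversed call op(b, a)
// returns b (the NaN) in the second, so numbers respectively NaNs win.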

// Add specializations for min/max with prescribed NaN propagation.
template<>
EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
}
template<>
EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
}
template<>
EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
}
template<>
EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
}
template<>
EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_nan(a, b, pmin<Packet4f>);
}
template<>
EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_nan(a, b, pmin<Packet2d>);
}
template<>
EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_nan(a, b, pmax<Packet4f>);
}
template<>
EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_nan(a, b, pmax<Packet2d>);
}

template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return _mm_srai_epi32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right   (const Packet4i& a) { return _mm_srli_epi32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left    (const Packet4i& a) { return _mm_slli_epi32(a,N); }

template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) { return _mm_srli_epi32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right   (const Packet4ui& a) { return _mm_srli_epi32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left    (const Packet4ui& a) { return _mm_slli_epi32(a,N); }

template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm_and_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
#else
  Packet4i aux = _mm_srai_epi32(a,31);
  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
#endif
}
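
// Worked example for the SSE2 branch above (explanatory comment, not in the
// original source): for a = -5, aux = a >> 31 = -1 (all ones), so
// (a ^ aux) - aux = 4 - (-1) = 5; for non-negative a, aux = 0 and the
// expression is the identity. This is the classic branchless two's-complement
// absolute value.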
template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31)); }
template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a)
{
  Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
#ifdef EIGEN_VECTORIZE_AVX
  return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
#else
  return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
#endif  // EIGEN_VECTORIZE_AVX
}
template<> EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) { return pzero(a); }

#ifdef EIGEN_VECTORIZE_SSE4_1
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
{
  // Unfortunately _mm_round_ps doesn't have a rounding mode to implement numext::round.
  const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
  const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}

template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
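
// Why this works (added explanation, not in the original source): prev0dot5 is
// the largest float (resp. double) strictly less than 0.5. Adding
// copysign(prev0dot5, a) and then truncating toward zero rounds halfway cases
// away from zero, i.e. numext::round semantics, while leaving values that are
// already integers untouched.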

template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); }

template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }

template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
#else
template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
  // Adds and subtracts signum(a) * 2^23 to force rounding.
  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
  const Packet4f abs_a = pabs(a);
  Packet4f r = padd(abs_a, limit);
  // Don't compile-away addition and subtraction.
  EIGEN_OPTIMIZATION_BARRIER(r)
  r = psub(r, limit);
  // If greater than limit, simply return a. Otherwise, account for sign.
  r = pselect(pcmp_lt(abs_a, limit),
              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
  return r;
}

template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
  // Adds and subtracts signum(a) * 2^52 to force rounding.
  const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52));
  const Packet2d abs_a = pabs(a);
  Packet2d r = padd(abs_a, limit);
  // Don't compile-away addition and subtraction.
  EIGEN_OPTIMIZATION_BARRIER(r)
  r = psub(r, limit);
  // If greater than limit, simply return a. Otherwise, account for sign.
  r = pselect(pcmp_lt(abs_a, limit),
              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
  return r;
}
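
// The 2^23 / 2^52 trick above (added explanation, not in the original source):
// once |a| + 2^23 is computed in float, all fraction bits below the binary
// point have been shifted out, so subtracting 2^23 again yields |a| rounded to
// an integer in the current rounding mode; inputs with |a| >= 2^23 are already
// integers and are returned unchanged by the final select.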

template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
{
  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
  Packet4f tmp = print<Packet4f>(a);
  // If greater, subtract one.
  Packet4f mask = _mm_cmpgt_ps(tmp, a);
  mask = pand(mask, cst_1);
  return psub(tmp, mask);
}

template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
{
  const Packet2d cst_1 = pset1<Packet2d>(1.0);
  Packet2d tmp = print<Packet2d>(a);
  // If greater, subtract one.
  Packet2d mask = _mm_cmpgt_pd(tmp, a);
  mask = pand(mask, cst_1);
  return psub(tmp, mask);
}

template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
{
  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
  Packet4f tmp = print<Packet4f>(a);
  // If smaller, add one.
  Packet4f mask = _mm_cmplt_ps(tmp, a);
  mask = pand(mask, cst_1);
  return padd(tmp, mask);
}

template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
{
  const Packet2d cst_1 = pset1<Packet2d>(1.0);
  Packet2d tmp = print<Packet2d>(a);
  // If smaller, add one.
  Packet2d mask = _mm_cmplt_pd(tmp, a);
  mask = pand(mask, cst_1);
  return padd(tmp, mask);
}
#endif

template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
template<> EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }

#if EIGEN_COMP_MSVC
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_ps(from);
}
#else
// NOTE: with the code below, MSVC's compiler crashes!

template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_ps(from);
}
#endif

template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_pd(from);
}
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
template<> EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}

// Load the lower half of the packet, zero-extending the upper half.
template<typename Packet> EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
template<> EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))); }
template<> EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from); }

// Load scalar
template<typename Packet> EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
template<> EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from); }
template<> EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from); }

template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{ return pset1<Packet2d>(from[0]); }
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
  Packet4i tmp;
  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}
template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
{
  Packet4ui tmp;
  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
  return vec4ui_swizzle1(tmp, 0, 0, 1, 1);
}

// Loads 8 bools from memory and returns the packet
// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
template<> EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from)
{
  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
  return _mm_unpacklo_epi8(tmp, tmp);
}

// Loads 4 bools from memory and returns the packet
// {b0, b0, b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
template<> EIGEN_STRONG_INLINE Packet16b ploadquad<Packet16b>(const bool* from) {
  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
  tmp = _mm_unpacklo_epi8(tmp, tmp);
  return _mm_unpacklo_epi16(tmp, tmp);
}

template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }

template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
// Note: the listing used EIGEN_DEBUG_ALIGNED_STORE here, which looks like a copy-paste slip for an unaligned store.
template<> EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }

template<typename Scalar, typename Packet> EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
template<> EIGEN_STRONG_INLINE void pstorel(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstorel(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from); }

template<typename Scalar, typename Packet> EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
template<> EIGEN_STRONG_INLINE void pstores(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from); }
template<> EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from); }

template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
  return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
  return _mm_set_pd(from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
{
  return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
                       numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
}

template<> EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride)
{
  return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride],
                      from[11*stride], from[10*stride], from[9*stride], from[8*stride],
                      from[7*stride], from[6*stride], from[5*stride], from[4*stride],
                      from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
  to[stride*0] = _mm_cvtss_f32(from);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
  to[stride*0] = _mm_cvtsd_f64(from);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
  to[stride*0] = _mm_cvtsi128_si32(from);
  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
{
  to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
  to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
  to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
  to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride)
{
  to[4*stride*0] = _mm_cvtsi128_si32(from);
  to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
  to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
  to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
}


// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
{
  Packet4f pa = _mm_set_ss(a);
  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
}
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
{
  Packet2d pa = _mm_set_sd(a);
  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
}

#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
typedef const void * SsePrefetchPtrType;
#else
typedef const char * SsePrefetchPtrType;
#endif

#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif

#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
// Direct access to the struct members fixed bug #62.
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a)); return x; }
#elif EIGEN_COMP_MSVC_STRICT
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a)); return x; }
#else
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a)); }
#endif
template<> EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }

template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return _mm_shuffle_ps(a,a,0x1B); }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); }
template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) { return _mm_shuffle_epi32(a, 0x1B); }
template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
#ifdef EIGEN_VECTORIZE_SSSE3
  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return _mm_shuffle_epi8(a, mask);
#else
  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
#endif
}
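
// The SSE2 fallback above, step by step (explanatory comment, not in the
// original source): first reverse the four 32-bit groups, then swap the two
// 16-bit halves inside each group, then swap the two bytes inside each 16-bit
// half; the composition reverses all 16 bytes.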

template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
  return pfrexp_generic(a,exponent);
}

// Extract exponent without existence of Packet2l.
template<>
EIGEN_STRONG_INLINE
Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
  const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
  return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
}

template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
  return pfrexp_generic(a, exponent);
}

template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
  return pldexp_generic(a,exponent);
}

// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
// supported by SSE, and has more range than is needed for exponents.
template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
  // Clamp exponent to [-2099, 2099]
  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
  const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);

  // Convert e to integer and swizzle to low-order bits.
  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);

  // Split 2^e into four factors and multiply:
  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
  Packet4i b = parithmetic_shift_right<2>(ei);                       // floor(e/4)
  Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^b
  Packet2d out = pmul(pmul(pmul(a, c), c), c);                       // a * 2^(3b)
  b = psub(psub(psub(ei, b), b), b);                                 // e - 3b
  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));           // 2^(e - 3b)
  out = pmul(out, c);                                                // a * 2^e
  return out;
}
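
// Numeric sanity check for the factor splitting above (added explanation, not
// in the original source): for e = 2099, b = floor(e/4) = 524, so each factor
// 2^b has biased exponent 524 + 1023 = 1547 < 2047 and the residual factor
// 2^(e - 3b) = 2^527 is representable too; multiplying in four steps therefore
// covers the full clamped range [-2099, 2099] without intermediate overflow.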

// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  a3 = pload<Packet4f>(a);
  a0 = vec4f_swizzle1(a3, 0,0,0,0);
  a1 = vec4f_swizzle1(a3, 1,1,1,1);
  a2 = vec4f_swizzle1(a3, 2,2,2,2);
  a3 = vec4f_swizzle1(a3, 3,3,3,3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
#ifdef EIGEN_VECTORIZE_SSE3
  a0 = _mm_loaddup_pd(a+0);
  a1 = _mm_loaddup_pd(a+1);
  a2 = _mm_loaddup_pd(a+2);
  a3 = _mm_loaddup_pd(a+3);
#else
  a1 = pload<Packet2d>(a);
  a0 = vec2d_swizzle1(a1, 0,0);
  a1 = vec2d_swizzle1(a1, 1,1);
  a3 = pload<Packet2d>(a+2);
  a2 = vec2d_swizzle1(a3, 0,0);
  a3 = vec2d_swizzle1(a3, 1,1);
#endif
}
#endif

1169 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
1170 {
1171  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
1172  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
1173  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
1174  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
1175 }
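
// punpackp broadcasts element i of vecs[0] into every lane of vecs[i]: the
// immediates 0x55, 0xAA, 0xFF and 0x00 replicate lanes 1, 2, 3 and 0.
// vecs[0] is overwritten last so the preceding shuffles read the original.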
1176 
1177 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
1178 {
1179  // Disable SSE3 _mm_hadd_ps, which is extremely slow on all existing Intel architectures
1180  // (from Nehalem to Haswell).
1181  // #ifdef EIGEN_VECTORIZE_SSE3
1182  // Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
1183  // return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
1184  // #else
1185  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
1186  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
1187  // #endif
1188 }
1189 
1190 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
1191 {
1192  // Disable SSE3 _mm_hadd_pd, which is extremely slow on all existing Intel architectures
1193  // (from Nehalem to Haswell).
1194  // #ifdef EIGEN_VECTORIZE_SSE3
1195  // return pfirst<Packet2d>(_mm_hadd_pd(a, a));
1196  // #else
1197  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
1198  // #endif
1199 }
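
// Both reductions above use the usual log2(N) fold: add the high half onto
// the low half, then repeat. For Packet4f this computes
//   tmp = {a0+a2, a1+a3, ..};  result = (a0+a2) + (a1+a3).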
1200 
1201 #ifdef EIGEN_VECTORIZE_SSSE3
1202 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
1203 {
1204  Packet4i tmp0 = _mm_hadd_epi32(a,a);
1205  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
1206 }
1207 template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
1208 {
1209  Packet4ui tmp0 = _mm_hadd_epi32(a, a);
1210  return pfirst<Packet4ui>(_mm_hadd_epi32(tmp0, tmp0));
1211 }
1212 
1213 #else
1214 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
1215 {
1216  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
1217  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
1218 }
1219 template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
1220  Packet4ui tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
1221  return pfirst(tmp) + pfirst<Packet4ui>(_mm_shuffle_epi32(tmp, 1));
1222 }
1223 #endif
1224 
1225 template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
1226  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
1227  return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
1228 }
1229 
1230 // Other reduction functions:
1231 
1232 
1233 // mul
1234 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
1235 {
1236  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
1237  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
1238 }
1239 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
1240 {
1241  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
1242 }
1243 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
1244 {
1245  // After some experiments, it seems this is the fastest way to implement it
1246  // for GCC (e.g., reusing pmul is very slow!)
1247  // TODO try to call _mm_mul_epu32 directly
1248  EIGEN_ALIGN16 int aux[4];
1249  pstore(aux, a);
1250  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
1251 }
1252 template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
1253 {
1254  // After some experiments, it seems this is the fastest way to implement it
1255  // for GCC (e.g., reusing pmul is very slow!)
1256  // TODO try to call _mm_mul_epu32 directly
1257  EIGEN_ALIGN16 uint32_t aux[4];
1258  pstore(aux, a);
1259  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
1260 }
1261 
1262 template<> EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
1263  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a));
1264  return ((pfirst<Packet4i>(tmp) == 0x01010101) &&
1265  (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
1266 }
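
// Packet16b encodes booleans as the bytes 0x00/0x01, so the bitwise AND above
// is a per-byte logical AND, and comparing a 32-bit chunk against 0x01010101
// tests four lanes at once.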
1267 
1268 // min
1269 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
1270 {
1271  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
1272  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
1273 }
1274 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
1275 {
1276  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
1277 }
1278 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
1279 {
1280 #ifdef EIGEN_VECTORIZE_SSE4_1
1281  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
1282  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
1283 #else
1284  // After some experiments, it seems this is the fastest way to implement it
1285  // for GCC (e.g., it does not like using std::min after the pstore!)
1286  EIGEN_ALIGN16 int aux[4];
1287  pstore(aux, a);
1288  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
1289  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
1290  return aux0<aux2 ? aux0 : aux2;
1291 #endif // EIGEN_VECTORIZE_SSE4_1
1292 }
1293 template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
1294 {
1295 #ifdef EIGEN_VECTORIZE_SSE4_1
1296  Packet4ui tmp = _mm_min_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
1297  return pfirst<Packet4ui>(_mm_min_epu32(tmp,_mm_shuffle_epi32(tmp, 1)));
1298 #else
1299  // After some experiments, it seems this is the fastest way to implement it
1300  // for GCC (e.g., it does not like using std::min after the pstore!)
1301  EIGEN_ALIGN16 uint32_t aux[4];
1302  pstore(aux, a);
1303  uint32_t aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
1304  uint32_t aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
1305  return aux0<aux2 ? aux0 : aux2;
1306 #endif // EIGEN_VECTORIZE_SSE4_1
1307 }
1308 
1309 // max
1310 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
1311 {
1312  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
1313  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
1314 }
1315 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
1316 {
1317  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
1318 }
1319 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
1320 {
1321 #ifdef EIGEN_VECTORIZE_SSE4_1
1322  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
1323  return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
1324 #else
1325  // After some experiments, it seems this is the fastest way to implement it
1326  // for GCC (e.g., it does not like using std::max after the pstore!)
1327  EIGEN_ALIGN16 int aux[4];
1328  pstore(aux, a);
1329  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
1330  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
1331  return aux0>aux2 ? aux0 : aux2;
1332 #endif // EIGEN_VECTORIZE_SSE4_1
1333 }
1334 template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
1335 {
1336 #ifdef EIGEN_VECTORIZE_SSE4_1
1337  Packet4ui tmp = _mm_max_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
1338  return pfirst<Packet4ui>(_mm_max_epu32(tmp,_mm_shuffle_epi32(tmp, 1)));
1339 #else
1340  // After some experiments, it seems this is the fastest way to implement it
1341  // for GCC (e.g., it does not like using std::max after the pstore!)
1342  EIGEN_ALIGN16 uint32_t aux[4];
1343  pstore(aux, a);
1344  uint32_t aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
1345  uint32_t aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
1346  return aux0>aux2 ? aux0 : aux2;
1347 #endif // EIGEN_VECTORIZE_SSE4_1
1348 }
1349 
1350 // not needed yet
1351 // template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
1352 // {
1353 // return _mm_movemask_ps(x) == 0xF;
1354 // }
1355 
1356 template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
1357 {
1358  return _mm_movemask_ps(x) != 0x0;
1359 }
1360 
1361 template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x)
1362 {
1363  return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
1364 }
1365 template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x)
1366 {
1367  return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
1368 }
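
// Usage sketch for the reductions above (illustrative, not part of Eigen):
//   Packet4f v = pset1<Packet4f>(2.f);
//   float sum  = predux(v);       // 2+2+2+2 == 8.f
//   float prod = predux_mul(v);   // 2*2*2*2 == 16.f
//   bool any   = predux_any(pcmp_le(pset1<Packet4f>(1.f), v));  // true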
1369 
1370 EIGEN_DEVICE_FUNC inline void
1371 ptranspose(PacketBlock<Packet4f,4>& kernel) {
1372  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
1373 }
1374 
1375 EIGEN_DEVICE_FUNC inline void
1376 ptranspose(PacketBlock<Packet2d,2>& kernel) {
1377  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
1378  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
1379  kernel.packet[1] = tmp;
1380 }
1381 
1382 EIGEN_DEVICE_FUNC inline void
1383 ptranspose(PacketBlock<Packet4i,4>& kernel) {
1384  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
1385  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
1386  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
1387  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
1388 
1389  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
1390  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
1391  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
1392  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
1393 }
1394 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
1395  ptranspose((PacketBlock<Packet4i, 4>&)kernel);
1396 }
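
// The 4x4 transposes above take two unpack stages: unpacklo/unpackhi at
// 32-bit granularity interleave rows pairwise, and the 64-bit unpacks then
// recombine the halves into transposed rows. E.g. rows {0,1,2,3}, {4,5,6,7},
// {8,9,10,11}, {12,13,14,15} become {0,4,8,12}, {1,5,9,13}, {2,6,10,14},
// {3,7,11,15}.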
1397 
1398 EIGEN_DEVICE_FUNC inline void
1399 ptranspose(PacketBlock<Packet16b,4>& kernel) {
1400  __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
1401  __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
1402  __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
1403  __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
1404  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
1405  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
1406  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
1407  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
1408 }
1409 
1410 EIGEN_DEVICE_FUNC inline void
1411 ptranspose(PacketBlock<Packet16b,16>& kernel) {
1412  // If we number the elements in the input thus:
1413  // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
1414  // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
1415  // ...
1416  // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff},
1417  //
1418  // the desired output is:
1419  // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0}
1420  // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
1421  // ...
1422  // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
1423  __m128i t0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
1424  __m128i t1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
1425  __m128i t2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ... 27 37
1426  __m128i t3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ... 2f 3f
1427  __m128i t4 = _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52 47 57
1428  __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a
1429  __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
1430  __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
1431  __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
1432  __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
1433  __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
1434  __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
1435  __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
1436  __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
1437  __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
1438  __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
1439 
1440  __m128i s0 = _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
1441  __m128i s1 = _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
1442  __m128i s2 = _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
1443  __m128i s3 = _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
1444  __m128i s4 = _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
1445  __m128i s5 = _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
1446  __m128i s6 = _mm_unpacklo_epi16(t5, t7);
1447  __m128i s7 = _mm_unpackhi_epi16(t5, t7);
1448  __m128i s8 = _mm_unpacklo_epi16(t8, ta);
1449  __m128i s9 = _mm_unpackhi_epi16(t8, ta);
1450  __m128i sa = _mm_unpacklo_epi16(t9, tb);
1451  __m128i sb = _mm_unpackhi_epi16(t9, tb);
1452  __m128i sc = _mm_unpacklo_epi16(tc, te);
1453  __m128i sd = _mm_unpackhi_epi16(tc, te);
1454  __m128i se = _mm_unpacklo_epi16(td, tf);
1455  __m128i sf = _mm_unpackhi_epi16(td, tf);
1456 
1457  __m128i u0 = _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
1458  __m128i u1 = _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
1459  __m128i u2 = _mm_unpacklo_epi32(s1, s5);
1460  __m128i u3 = _mm_unpackhi_epi32(s1, s5);
1461  __m128i u4 = _mm_unpacklo_epi32(s2, s6);
1462  __m128i u5 = _mm_unpackhi_epi32(s2, s6);
1463  __m128i u6 = _mm_unpacklo_epi32(s3, s7);
1464  __m128i u7 = _mm_unpackhi_epi32(s3, s7);
1465  __m128i u8 = _mm_unpacklo_epi32(s8, sc);
1466  __m128i u9 = _mm_unpackhi_epi32(s8, sc);
1467  __m128i ua = _mm_unpacklo_epi32(s9, sd);
1468  __m128i ub = _mm_unpackhi_epi32(s9, sd);
1469  __m128i uc = _mm_unpacklo_epi32(sa, se);
1470  __m128i ud = _mm_unpackhi_epi32(sa, se);
1471  __m128i ue = _mm_unpacklo_epi32(sb, sf);
1472  __m128i uf = _mm_unpackhi_epi32(sb, sf);
1473 
1474  kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
1475  kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
1476  kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
1477  kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
1478  kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
1479  kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
1480  kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
1481  kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
1482  kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
1483  kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
1484  kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
1485  kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
1486  kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
1487  kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
1488  kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
1489  kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
1490 }
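
// The 16x16 byte transpose above is four unpack stages, at 8-, 16-, 32- and
// 64-bit granularity; each stage doubles the number of source rows
// interleaved in every register, so after log2(16) = 4 stages each register
// holds one column of the input block.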
1491 
1492 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
1493  const __m128i zero = _mm_setzero_si128();
1494  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
1495  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
1496 #ifdef EIGEN_VECTORIZE_SSE4_1
1497  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
1498 #else
1499  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
1500 #endif
1501 }
1502 template<> EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket,
1503  const Packet4ui& elsePacket) {
1504  return (Packet4ui)pblend(ifPacket, (Packet4i)thenPacket, (Packet4i)elsePacket);
1505 }
1506 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
1507  const __m128 zero = _mm_setzero_ps();
1508  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
1509  __m128 false_mask = _mm_cmpeq_ps(select, zero);
1510 #ifdef EIGEN_VECTORIZE_SSE4_1
1511  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
1512 #else
1513  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
1514 #endif
1515 }
1516 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
1517  const __m128d zero = _mm_setzero_pd();
1518  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
1519  __m128d false_mask = _mm_cmpeq_pd(select, zero);
1520 #ifdef EIGEN_VECTORIZE_SSE4_1
1521  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
1522 #else
1523  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
1524 #endif
1525 }
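
// Usage sketch for pblend (illustrative, not part of Eigen): lane i takes
// thenPacket where ifPacket.select[i] is non-zero, elsePacket otherwise; the
// mask is built from select == 0, which is why elsePacket is the blended-in
// operand above.
//   Selector<4> s;
//   s.select[0] = 1; s.select[1] = 0; s.select[2] = 1; s.select[3] = 0;
//   Packet4f r = pblend(s, pset1<Packet4f>(1.f), pset1<Packet4f>(2.f));
//   // r == {1.f, 2.f, 1.f, 2.f}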
1526 
1527 // Scalar path for pmadd with FMA to ensure consistency with vectorized path.
1528 #ifdef EIGEN_VECTORIZE_FMA
1529 template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
1530  return ::fmaf(a,b,c);
1531 }
1532 template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
1533  return ::fma(a,b,c);
1534 }
1535 template<> EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
1536  return ::fmaf(a,b,-c);
1537 }
1538 template<> EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
1539  return ::fma(a,b,-c);
1540 }
1541 template<> EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
1542  return ::fmaf(-a,b,c);
1543 }
1544 template<> EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
1545  return ::fma(-a,b,c);
1546 }
1547 template<> EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
1548  return ::fmaf(-a,b,-c);
1549 }
1550 template<> EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
1551  return ::fma(-a,b,-c);
1552 }
1553 #endif
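
// Routing the scalar overloads through fmaf/fma makes a*b+c round once,
// matching the vectorized FMA path bit-for-bit instead of the two-rounding
// result of a plain a*b + c.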
1554 
1555 #ifdef EIGEN_VECTORIZE_SSE4_1
1556 // Helpers for half->float and float->half conversions.
1557 // Currently only used by the AVX code.
1558 EIGEN_STRONG_INLINE __m128i half2floatsse(__m128i h) {
1559  __m128i input = _mm_cvtepu16_epi32(h);
1560 
1561  // Direct vectorization of half_to_float; the corresponding scalar C code appears in the comments.
1562  __m128i shifted_exp = _mm_set1_epi32(0x7c00 << 13);
1563  // o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
1564  __m128i ou = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x7fff)), 13);
1565  // exp = shifted_exp & o.u; // just the exponent
1566  __m128i exp = _mm_and_si128(ou, shifted_exp);
1567  // o.u += (127 - 15) << 23;
1568  ou = _mm_add_epi32(ou, _mm_set1_epi32((127 - 15) << 23));
1569 
1570  // Inf/NaN?
1571  __m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp);
1572  // Inf/NaN adjust
1573  __m128i naninf_adj =
1574  _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
1575  // extra exp adjust for Inf/NaN
1576  ou = _mm_add_epi32(ou, naninf_adj);
1577 
1578  // Zero/Denormal?
1579  __m128i zeroden_mask = _mm_cmpeq_epi32(exp, _mm_setzero_si128());
1580  __m128i zeroden_adj = _mm_and_si128(zeroden_mask, _mm_set1_epi32(1 << 23));
1581  // o.u += 1 << 23;
1582  ou = _mm_add_epi32(ou, zeroden_adj);
1583  // magic.u = 113 << 23
1584  __m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23));
1585  // o.f -= magic.f
1586  ou = _mm_castps_si128(
1587  _mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));
1588 
1589  __m128i sign =
1590  _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
1591  // o.u |= (h.x & 0x8000) << 16; // sign bit
1592  ou = _mm_or_si128(ou, sign);
1593  // return o.f;
1594  // We actually return the uint version, to make
1595  // _mm256_insertf128_si256 work.
1596  return ou;
1597 }
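
// Scalar reference for half2floatsse above (an editorial sketch of the same
// bit manipulation; the name is illustrative and not part of Eigen):
static inline float half2float_scalar(uint16_t h) {
  union FP32 { uint32_t u; float f; };
  const FP32 magic = { 113u << 23 };
  const uint32_t shifted_exp = 0x7c00u << 13;      // exponent mask, shifted
  FP32 o;
  o.u = (h & 0x7fffu) << 13;                       // exponent/mantissa bits
  uint32_t exp = shifted_exp & o.u;                // just the exponent
  o.u += (127 - 15) << 23;                         // exponent adjust
  if (exp == shifted_exp) {                        // Inf/NaN?
    o.u += (128 - 16) << 23;                       // extra exponent adjust
  } else if (exp == 0) {                           // Zero/Denormal?
    o.u += 1 << 23;                                // extra exponent adjust
    o.f -= magic.f;                                // renormalize
  }
  o.u |= (uint32_t)(h & 0x8000u) << 16;            // sign bit
  return o.f;
}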
1598 
1599 EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
1600  __m128i o = _mm_setzero_si128();
1601 
1602  // unsigned int sign_mask = 0x80000000u;
1603  __m128i sign = _mm_set1_epi32(0x80000000u);
1604  // unsigned int sign = f.u & sign_mask;
1605  sign = _mm_and_si128(sign, _mm_castps_si128(f));
1606  // f.u ^= sign;
1607  f = _mm_xor_ps(f, _mm_castsi128_ps(sign));
1608 
1609  __m128i fu = _mm_castps_si128(f);
1610 
1611  __m128i f16max = _mm_set1_epi32((127 + 16) << 23);
1612  __m128i f32infty = _mm_set1_epi32(255 << 23);
1613  // if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
1614  // there is no _mm_cmpge_epi32, so use lt and swap operands
1615  __m128i infnan_mask = _mm_cmplt_epi32(f16max, _mm_castps_si128(f));
1616  __m128i nan_mask = _mm_cmpgt_epi32(_mm_castps_si128(f), f32infty);   // f.u > f32infty.u => NaN
1617  __m128i inf_mask = _mm_andnot_si128(nan_mask, infnan_mask);          // finite overflow or Inf
1618  __m128i nan_value = _mm_and_si128(nan_mask, _mm_set1_epi32(0x7e00));
1619  __m128i inf_value = _mm_and_si128(inf_mask, _mm_set1_epi32(0x7c00));
1620  // o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
1621  __m128i naninf_value = _mm_or_si128(inf_value, nan_value);
1622 
1623  __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
1624  __m128i subnorm_mask =
1625  _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
1626  // f.f += denorm_magic.f;
1627  f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
1628  // f.u - denorm_magic.u
1629  o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic);
1630  o = _mm_and_si128(o, subnorm_mask);
1631  // Correct result for inf/nan/zero/subnormal, 0 otherwise
1632  o = _mm_or_si128(o, naninf_value);
1633 
1634  __m128i mask = _mm_or_si128(infnan_mask, subnorm_mask);
1635  o = _mm_and_si128(o, mask);
1636 
1637  // mant_odd = (f.u >> 13) & 1;
1638  __m128i mant_odd = _mm_and_si128(_mm_srli_epi32(fu, 13), _mm_set1_epi32(0x1));
1639  // f.u += 0xc8000fffU;
1640  fu = _mm_add_epi32(fu, _mm_set1_epi32(0xc8000fffU));
1641  // f.u += mant_odd;
1642  fu = _mm_add_epi32(fu, mant_odd);
1643  fu = _mm_andnot_si128(mask, fu);
1644  // f.u >> 13
1645  fu = _mm_srli_epi32(fu, 13);
1646  o = _mm_or_si128(fu, o);
1647 
1648  // o.x |= static_cast<numext::uint16_t>(sign >> 16);
1649  o = _mm_or_si128(o, _mm_srli_epi32(sign, 16));
1650 
1651  // 16 bit values
1652  return _mm_and_si128(o, _mm_set1_epi32(0xffff));
1653 }
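
// The mant_odd/0xc8000fffU steps above implement round-to-nearest-even for
// the normalized case: 0xc8000fffU equals ((15 - 127) << 23) + 0xfff, i.e.
// the exponent rebias plus the rounding bias, and adding mant_odd makes
// exact ties round to an even 10-bit mantissa before the final >> 13.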
1654 #endif
1655 
1656 // Packet math for Eigen::half
1657 // Disable the following code since it's broken on too many platforms / compilers.
1658 //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
1659 #if 0
1660 
1661 typedef struct {
1662  __m64 x;
1663 } Packet4h;
1664 
1665 
1666 template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
1667 
1668 template <>
1669 struct packet_traits<Eigen::half> : default_packet_traits {
1670  typedef Packet4h type;
1671  // There is no half-size packet for Packet4h.
1672  typedef Packet4h half;
1673  enum {
1674  Vectorizable = 1,
1675  AlignedOnScalar = 1,
1676  size = 4,
1677  HasAdd = 1,
1678  HasSub = 1,
1679  HasMul = 1,
1680  HasDiv = 1,
1681  HasNegate = 0,
1682  HasAbs = 0,
1683  HasAbs2 = 0,
1684  HasMin = 0,
1685  HasMax = 0,
1686  HasConj = 0,
1687  HasSetLinear = 0,
1688  HasSqrt = 0,
1689  HasRsqrt = 0,
1690  HasExp = 0,
1691  HasLog = 0,
1692  HasBlend = 0
1693  };
1694 };
1695 
1696 
1697 template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
1698 
1699 template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
1700  Packet4h result;
1701  result.x = _mm_set1_pi16(from.x);
1702  return result;
1703 }
1704 
1705 template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
1706  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
1707 }
1708 
1709 template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
1710 
1711 template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
1712  __int64_t a64 = _mm_cvtm64_si64(a.x);
1713  __int64_t b64 = _mm_cvtm64_si64(b.x);
1714 
1715  Eigen::half h[4];
1716 
1717  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1718  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1719  h[0] = ha + hb;
1720  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1721  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1722  h[1] = ha + hb;
1723  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1724  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1725  h[2] = ha + hb;
1726  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1727  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1728  h[3] = ha + hb;
1729  Packet4h result;
1730  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1731  return result;
1732 }
1733 
1734 template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
1735  __int64_t a64 = _mm_cvtm64_si64(a.x);
1736  __int64_t b64 = _mm_cvtm64_si64(b.x);
1737 
1738  Eigen::half h[4];
1739 
1740  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1741  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1742  h[0] = ha - hb;
1743  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1744  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1745  h[1] = ha - hb;
1746  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1747  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1748  h[2] = ha - hb;
1749  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1750  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1751  h[3] = ha - hb;
1752  Packet4h result;
1753  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1754  return result;
1755 }
1756 
1757 template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
1758  __int64_t a64 = _mm_cvtm64_si64(a.x);
1759  __int64_t b64 = _mm_cvtm64_si64(b.x);
1760 
1761  Eigen::half h[4];
1762 
1763  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1764  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1765  h[0] = ha * hb;
1766  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1767  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1768  h[1] = ha * hb;
1769  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1770  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1771  h[2] = ha * hb;
1772  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1773  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1774  h[3] = ha * hb;
1775  Packet4h result;
1776  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1777  return result;
1778 }
1779 
1780 template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
1781  __int64_t a64 = _mm_cvtm64_si64(a.x);
1782  __int64_t b64 = _mm_cvtm64_si64(b.x);
1783 
1784  Eigen::half h[4];
1785 
1786  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1787  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1788  h[0] = ha / hb;
1789  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1790  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1791  h[1] = ha / hb;
1792  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1793  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1794  h[2] = ha / hb;
1795  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1796  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1797  h[3] = ha / hb;
1798  Packet4h result;
1799  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1800  return result;
1801 }
1802 
1803 template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
1804  Packet4h result;
1805  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1806  return result;
1807 }
1808 
1809 template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
1810  Packet4h result;
1811  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1812  return result;
1813 }
1814 
1815 template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1816  __int64_t r = _mm_cvtm64_si64(from.x);
1817  *(reinterpret_cast<__int64_t*>(to)) = r;
1818 }
1819 
1820 template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1821  __int64_t r = _mm_cvtm64_si64(from.x);
1822  *(reinterpret_cast<__int64_t*>(to)) = r;
1823 }
1824 
1825 template<> EIGEN_STRONG_INLINE Packet4h
1826 ploadquad<Packet4h>(const Eigen::half* from) {
1827  return pset1<Packet4h>(*from);
1828 }
1829 
1830 template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
1831 {
1832  Packet4h result;
1833  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
1834  return result;
1835 }
1836 
1837 template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
1838 {
1839  __int64_t a = _mm_cvtm64_si64(from.x);
1840  to[stride*0].x = static_cast<unsigned short>(a);
1841  to[stride*1].x = static_cast<unsigned short>(a >> 16);
1842  to[stride*2].x = static_cast<unsigned short>(a >> 32);
1843  to[stride*3].x = static_cast<unsigned short>(a >> 48);
1844 }
1845 
1846 EIGEN_STRONG_INLINE void
1847 ptranspose(PacketBlock<Packet4h,4>& kernel) {
1848  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
1849  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
1850  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
1851  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
1852 
1853  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
1854  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
1855  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
1856  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
1857 }
1858 
1859 #endif
1860 
1861 
1862 } // end namespace internal
1863 
1864 } // end namespace Eigen
1865 
1866 #if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
1867 // PGI++ does not define the following intrinsics in C++ mode.
1868 static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
1869 static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
1870 static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
1871 static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
1872 static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
1873 static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
1874 #endif
1875 
1876 #endif // EIGEN_PACKET_MATH_SSE_H