AVX/PacketMath.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_AVX_H
11 #define EIGEN_PACKET_MATH_AVX_H
12 
13 #include "../../InternalHeaderCheck.h"
14 
15 namespace Eigen {
16 
17 namespace internal {
18 
19 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
20 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
21 #endif
22 
23 #if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
24 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
25 #endif
26 
27 #ifdef EIGEN_VECTORIZE_FMA
28 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
29 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
30 #endif
31 #endif
32 
33 typedef __m256 Packet8f;
34 typedef eigen_packet_wrapper<__m256i, 0> Packet8i;
35 typedef __m256d Packet4d;
36 #ifndef EIGEN_VECTORIZE_AVX512FP16
37 typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
38 #endif
39 typedef eigen_packet_wrapper<__m128i, 3> Packet8bf;
40 typedef eigen_packet_wrapper<__m256i, 4> Packet8ui;
41 
42 #ifdef EIGEN_VECTORIZE_AVX2
43 // Start from 3 to be compatible with AVX512
44 typedef eigen_packet_wrapper<__m256i, 3> Packet4l;
45 typedef eigen_packet_wrapper<__m256i, 5> Packet4ul;
46 #endif
47 
// Mark the raw AVX vector types and their Eigen wrappers as "arithmetic" so
// generic packet code treats them as plain value types.
template<> struct is_arithmetic<__m256> { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };
template<> struct is_arithmetic<Packet8i> { enum { value = true }; };
// Note that `Packet8ui` uses the underlying type `__m256i`, which is
// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
// operations used in `GenericPacketMath.h`.
template<> struct is_arithmetic<Packet8ui> { enum { value = false }; };
#ifndef EIGEN_VECTORIZE_AVX512FP16
template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
#endif
template<> struct is_arithmetic<Packet8bf> { enum { value = true }; };
#ifdef EIGEN_VECTORIZE_AVX2
template<> struct is_arithmetic<Packet4l> { enum { value = true }; };
// Note that `Packet4ul` uses the underlying type `__m256i`, which is
// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
// operations used in `GenericPacketMath.h`.
template<> struct is_arithmetic<Packet4ul> { enum { value = false }; };
#endif
67 
68 // Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
69 // to leverage AVX512 instructions.
70 #ifndef EIGEN_VECTORIZE_AVX512
// AVX packet traits for float: 8 floats per packet, SSE Packet4f as the
// half-size packet.  EIGEN_FAST_MATH-gated flags enable kernels that trade
// last-bit accuracy for speed.
template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet8f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,

    HasCmp = 1,
    HasDiv = 1,
    HasReciprocal = EIGEN_FAST_MATH,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasACos = 1,
    HasASin = 1,
    HasATan = 1,
    HasATanh = 1,
    HasLog = 1,
    HasLog1p = 1,
    HasExpm1 = 1,
    HasExp = 1,
    HasNdtri = 1,
    HasBessel = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1,
    HasRint = 1
  };
};
// AVX packet traits for double: 4 doubles per packet, SSE Packet2d as the
// half-size packet.
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet4d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,

    HasCmp = 1,
    HasDiv = 1,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasATan = 1,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1,
    HasRint = 1
  };
};
129 
// Packet traits for half precision: Packet8h stores 8 fp16 values in a 128-bit
// register; most math is done by converting to/from Packet8f elsewhere.
template <>
struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet8h type;
  // There is no half-size packet for Packet8h.
  typedef Packet8h half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,

    HasCmp = 1,
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasNegate = 1,
    HasAbs = 1,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasLog = 1,
    HasLog1p = 1,
    HasExpm1 = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasBlend = 0,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1,
    HasRint = 1,
    HasBessel = 1,
    HasNdtri = 1
  };
};
171 
// Packet traits for bfloat16: Packet8bf stores 8 bf16 values in a 128-bit
// register; flags mirror the Eigen::half traits above.
template <>
struct packet_traits<bfloat16> : default_packet_traits {
  typedef Packet8bf type;
  // There is no half-size packet for current Packet8bf.
  // TODO: support as SSE path.
  typedef Packet8bf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,

    HasCmp = 1,
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasNegate = 1,
    HasAbs = 1,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasLog = 1,
    HasLog1p = 1,
    HasExpm1 = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasBlend = 0,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1,
    HasRint = 1,
    HasBessel = 1,
    HasNdtri = 1
  };
};
214 
// AVX packet traits for 32-bit signed integers.
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet8i type;
  typedef Packet4i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    HasCmp = 1,
    HasDiv = 1,
    size=8
  };
};
// AVX packet traits for 32-bit unsigned integers.  Division, negation and
// sqrt are disabled: the underlying __m256i is treated as signed int32.
template<> struct packet_traits<uint32_t> : default_packet_traits
{
  typedef Packet8ui type;
  typedef Packet4ui half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,

    HasDiv = 0,
    HasNegate = 0,
    HasSqrt = 0,

    HasCmp = 1,
    HasMin = 1,
    HasMax = 1,
    HasShift = 1
  };
};
246 
247 #ifdef EIGEN_VECTORIZE_AVX2
// AVX2-only packet traits for 64-bit integers (4 lanes per 256-bit register).
template<> struct packet_traits<int64_t> : default_packet_traits
{
  typedef Packet4l type;
  // There is no half-size packet for current Packet4l.
  // TODO: support as SSE path.
  typedef Packet4l half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    HasCmp = 1,
    size=4
  };
};
// Unsigned 64-bit traits: most arithmetic beyond add/sub/cmp/shift is
// disabled because the underlying __m256i is interpreted as signed.
template<> struct packet_traits<uint64_t> : default_packet_traits
{
  typedef Packet4ul type;
  // There is no half-size packet for current Packet4ul.
  // TODO: support as SSE path.
  typedef Packet4ul half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

    // HasMin = 0,
    // HasMax = 0,
    HasDiv = 0,
    HasBlend = 0,
    HasTranspose = 0,
    HasNegate = 0,
    HasSqrt = 0,
    HasCmp = 1,
    HasShift = 1
  };
};
283 #endif
284 
285 #endif
286 
// Approximate cost (in pseudo-instructions) of a vectorized scalar division,
// used by Eigen's cost model when deciding whether to vectorize.
template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
289 
// unpacket_traits: map each packet type back to its scalar type, half-size
// packet, lane count, and required alignment.
template<> struct unpacket_traits<Packet8f> {
  typedef float type;
  typedef Packet4f half;
  typedef Packet8i integer_packet;
  // Bitmask type for masked loads/stores (one bit per lane).
  typedef uint8_t mask_t;
  enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true
#ifdef EIGEN_VECTORIZE_AVX512
  // Masked floating-point ops need AVX-512 mask registers.
  , masked_fpops_available=true
#endif
  };
};
template<> struct unpacket_traits<Packet4d> {
  typedef double type;
  typedef Packet2d half;
  enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet8i> {
  typedef int type;
  typedef Packet4i half;
  enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet8ui> {
  typedef uint32_t type;
  typedef Packet4ui half;
  enum {size = 8, alignment = Aligned32, vectorizable = true, masked_load_available = false, masked_store_available = false};
};
#ifdef EIGEN_VECTORIZE_AVX2
template<> struct unpacket_traits<Packet4l> {
  typedef int64_t type;
  typedef Packet4l half;
  enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet4ul> {
  typedef uint64_t type;
  typedef Packet4ul half;
  enum {size = 4, alignment = Aligned32, vectorizable = true, masked_load_available = false, masked_store_available = false};
};
#endif
// Packet8bf only occupies a 128-bit register, hence the smaller alignment.
template<> struct unpacket_traits<Packet8bf> {
  typedef bfloat16 type;
  typedef Packet8bf half;
  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
333 
// Helper function for bit packing snippet of low precision comparison.
// It packs the flags from 16x16 to 8x16.
EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) {
  // `rf` holds 8 x 32-bit comparison masks (all-ones / all-zeros).  The
  // saturating pack narrows each 32-bit flag to 16 bits, yielding 8 x 16-bit
  // flags in a single __m128i.
  return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
                         _mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
}
340 
341 #ifdef EIGEN_VECTORIZE_AVX2
342 template <>
343 EIGEN_STRONG_INLINE Packet4l pset1<Packet4l>(const int64_t& from) {
344  return _mm256_set1_epi64x(from);
345 }
346 template <>
347 EIGEN_STRONG_INLINE Packet4ul pset1<Packet4ul>(const uint64_t& from) {
348  return _mm256_set1_epi64x(numext::bit_cast<uint64_t>(from));
349 }
// All-zero packet (input only fixes the overload).
template <>
EIGEN_STRONG_INLINE Packet4l pzero(const Packet4l& /*a*/) {
  return _mm256_setzero_si256();
}
template <>
EIGEN_STRONG_INLINE Packet4ul pzero(const Packet4ul& /*a*/) {
  return _mm256_setzero_si256();
}
// Mask selecting the even lanes (lane 0 and 2 all-ones).
template <>
EIGEN_STRONG_INLINE Packet4l peven_mask(const Packet4l& /*a*/) {
  return _mm256_set_epi64x(0ll, -1ll, 0ll, -1ll);
}
template <>
EIGEN_STRONG_INLINE Packet4ul peven_mask(const Packet4ul& /*a*/) {
  return _mm256_set_epi64x(0ll, -1ll, 0ll, -1ll);
}
// Broadcast the pointed-to scalar to all lanes.
template <>
EIGEN_STRONG_INLINE Packet4l pload1<Packet4l>(const int64_t* from) {
  return _mm256_set1_epi64x(*from);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pload1<Packet4ul>(const uint64_t* from) {
  return _mm256_set1_epi64x(*from);
}
// Lane-wise 64-bit addition (same instruction for signed and unsigned:
// two's-complement wrap-around).
template <>
EIGEN_STRONG_INLINE Packet4l padd<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_add_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul padd<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_add_epi64(a, b);
}
// Linear sequence {a, a+1, a+2, a+3}.
template<>
EIGEN_STRONG_INLINE Packet4l plset<Packet4l>(const int64_t& a) {
  return padd(pset1<Packet4l>(a), Packet4l(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
}
template <>
EIGEN_STRONG_INLINE Packet4ul plset<Packet4ul>(const uint64_t& a) {
  return padd(pset1<Packet4ul>(a), Packet4ul(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
}
// Lane-wise 64-bit subtraction.
template <>
EIGEN_STRONG_INLINE Packet4l psub<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_sub_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul psub<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_sub_epi64(a, b);
}
// Negation as 0 - a; AVX2 has no dedicated 64-bit negate instruction.
template <>
EIGEN_STRONG_INLINE Packet4l pnegate(const Packet4l& a) {
  return psub(pzero(a), a);
}
// Conjugation is the identity for real (integer) packets.
template <>
EIGEN_STRONG_INLINE Packet4l pconj(const Packet4l& a) {
  return a;
}
// a <= b computed as NOT(a > b): invert the cmpgt mask by XOR with all-ones.
template <>
EIGEN_STRONG_INLINE Packet4l pcmp_le(const Packet4l& a, const Packet4l& b) {
  return _mm256_xor_si256(_mm256_cmpgt_epi64(a, b), _mm256_set1_epi32(-1));
}
// Unsigned compare via the signed one: bias both operands by 2^63 so the
// signed ordering matches the unsigned ordering.
template <>
EIGEN_STRONG_INLINE Packet4ul pcmp_le(const Packet4ul& a, const Packet4ul& b) {
  return (Packet4ul)pcmp_le((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                            (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL)));
}
// a < b computed as b > a.
template <>
EIGEN_STRONG_INLINE Packet4l pcmp_lt(const Packet4l& a, const Packet4l& b) {
  return _mm256_cmpgt_epi64(b, a);
}
// Unsigned less-than via the bias trick (see pcmp_le above).
template <>
EIGEN_STRONG_INLINE Packet4ul pcmp_lt(const Packet4ul& a, const Packet4ul& b) {
  return (Packet4ul)pcmp_lt((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                            (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL)));
}
// Bitwise equality is sign-agnostic, so both variants use cmpeq directly.
template <>
EIGEN_STRONG_INLINE Packet4l pcmp_eq(const Packet4l& a, const Packet4l& b) {
  return _mm256_cmpeq_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pcmp_eq(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_cmpeq_epi64(a, b);
}
// All-ones packet: x == x is true in every lane.
template <>
EIGEN_STRONG_INLINE Packet4l ptrue<Packet4l>(const Packet4l& a) {
  return _mm256_cmpeq_epi64(a, a);
}
template <>
EIGEN_STRONG_INLINE Packet4ul ptrue<Packet4ul>(const Packet4ul& a) {
  return _mm256_cmpeq_epi64(a, a);
}
// Bitwise logic ops.
template <>
EIGEN_STRONG_INLINE Packet4l pand<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_and_si256(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4l por<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_or_si256(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4l pxor<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_xor_si256(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pxor<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_xor_si256(a, b);
}
// Eigen's pandnot(a, b) == a & ~b, while the intrinsic computes
// (~first) & second — hence the swapped operand order.
template <>
EIGEN_STRONG_INLINE Packet4l pandnot<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_andnot_si256(b, a);
}
// Logical (zero-filling) shifts by the compile-time count N.
template <int N>
EIGEN_STRONG_INLINE Packet4l plogical_shift_right(Packet4l a) {
  return _mm256_srli_epi64(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4l plogical_shift_left(Packet4l a) {
  return _mm256_slli_epi64(a, N);
}
#ifdef EIGEN_VECTORIZE_AVX512FP16
// Native 64-bit arithmetic shift (an AVX-512 instruction; NOTE(review): this
// guard presumably implies the required AVX512VL support — confirm).
template <int N>
EIGEN_STRONG_INLINE Packet4l parithmetic_shift_right(Packet4l a) { return _mm256_srai_epi64(a, N); }
#else
// AVX2 has no 64-bit arithmetic shift; emulate it with 32-bit shifts and
// blends, dispatching on the compile-time count N via enable_if.
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t< (N == 0), Packet4l> parithmetic_shift_right(Packet4l a) {
  return a;
}
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t< (N > 0) && (N < 32), Packet4l> parithmetic_shift_right(Packet4l a) {
  // Upper 32-bit words: arithmetic 32-bit shift preserves the sign.
  // Lower words: logical 64-bit shift pulls in bits from the upper half.
  // Blend mask 0b01010101 picks the low words from lo_word.
  __m256i hi_word = _mm256_srai_epi32(a, N);
  __m256i lo_word = _mm256_srli_epi64(a, N);
  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
}
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t< (N >= 32) && (N < 63), Packet4l> parithmetic_shift_right(Packet4l a) {
  // Upper words become pure sign extension; the shifted upper word is moved
  // into the low position by the shuffle.
  __m256i hi_word = _mm256_srai_epi32(a, 31);
  __m256i lo_word = _mm256_shuffle_epi32(_mm256_srai_epi32(a, N - 32), (shuffle_mask<1, 1, 3, 3>::mask));
  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
}
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t< (N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
  // Result is 0 or -1 per lane: broadcast the sign word into both halves.
  return _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), (shuffle_mask<1, 1, 3, 3>::mask));
}
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t< (N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
  // Out-of-range counts wrap modulo 64, matching hardware shift semantics.
  return parithmetic_shift_right<int(N&63)>(a);
}
#endif
// Aligned (32-byte) loads.
template <>
EIGEN_STRONG_INLINE Packet4l pload<Packet4l>(const int64_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4ul pload<Packet4ul>(const uint64_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
}
// Unaligned loads.
template <>
EIGEN_STRONG_INLINE Packet4l ploadu<Packet4l>(const int64_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4ul ploadu<Packet4ul>(const uint64_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
}
// Loads 2 int64_ts from memory a returns the packet {a0, a0, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet4l ploaddup<Packet4l>(const int64_t* from) {
  // Load {a0, a1} into the low half, then duplicate each 64-bit element by
  // permuting its two 32-bit words.
  const Packet4l a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
}
// Loads 2 uint64_ts from memory a returns the packet {a0, a0, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet4ul ploaddup<Packet4ul>(const uint64_t* from) {
  const Packet4ul a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
}
// Aligned stores.
template<>
EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet4l& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet4ul& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
}
// Unaligned stores.
template <>
EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet4l& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet4ul& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
}
// Strided gather: reads 4 scalars `stride` elements apart.
template <>
EIGEN_DEVICE_FUNC inline Packet4l pgather<int64_t, Packet4l>(const int64_t* from, Index stride) {
  return _mm256_set_epi64x(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_DEVICE_FUNC inline Packet4ul pgather<uint64_t, Packet4ul>(const uint64_t* from, Index stride) {
  return _mm256_set_epi64x(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
}
// Strided scatter: writes each lane `stride` elements apart.
template <>
EIGEN_DEVICE_FUNC inline void pscatter<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index stride) {
  __m128i low = _mm256_extractf128_si256(from, 0);
  to[stride * 0] = _mm_extract_epi64(low, 0);
  to[stride * 1] = _mm_extract_epi64(low, 1);

  __m128i high = _mm256_extractf128_si256(from, 1);
  to[stride * 2] = _mm_extract_epi64(high, 0);
  to[stride * 3] = _mm_extract_epi64(high, 1);
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index stride) {
  __m128i low = _mm256_extractf128_si256(from, 0);
  to[stride * 0] = _mm_extract_epi64(low, 0);
  to[stride * 1] = _mm_extract_epi64(low, 1);

  __m128i high = _mm256_extractf128_si256(from, 1);
  to[stride * 2] = _mm_extract_epi64(high, 0);
  to[stride * 3] = _mm_extract_epi64(high, 1);
}
// Broadcast-then-store of a single scalar.
template <>
EIGEN_STRONG_INLINE void pstore1<Packet4l>(int64_t* to, const int64_t& a) {
  Packet4l pa = pset1<Packet4l>(a);
  pstore(to, pa);
}
template <>
EIGEN_STRONG_INLINE void pstore1<Packet4ul>(uint64_t* to, const uint64_t& a) {
  Packet4ul pa = pset1<Packet4ul>(a);
  pstore(to, pa);
}
// Extract lane 0.
template<>
EIGEN_STRONG_INLINE int64_t pfirst<Packet4l>(const Packet4l& a) {
  return _mm_cvtsi128_si64(_mm256_castsi256_si128(a));
}
template <>
EIGEN_STRONG_INLINE uint64_t pfirst<Packet4ul>(const Packet4ul& a) {
  return _mm_cvtsi128_si64(_mm256_castsi256_si128(a));
}
// Horizontal sum: fold the two 128-bit halves, then add the remaining lanes.
template <>
EIGEN_STRONG_INLINE int64_t predux<Packet4l>(const Packet4l& a) {
  __m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
  return _mm_extract_epi64(r, 0) + _mm_extract_epi64(r, 1);
}
592 template <>
593 EIGEN_STRONG_INLINE uint64_t predux<Packet4ul>(const Packet4ul& a) {
594  __m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
595  return numext::bit_cast<uint64_t>(_mm_extract_epi64(r, 0) + _mm_extract_epi64(r, 1));
596 }
// Shuffle 64-bit lanes of two integer packets by reusing the double-precision
// shuffle (the bit pattern is what matters, not the lane type).
#define MM256_SHUFFLE_EPI64(A, B, M) _mm256_shuffle_pd(_mm256_castsi256_pd(A), _mm256_castsi256_pd(B), M)
// In-register transpose of a 4x4 block of int64 values.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4l, 4>& kernel) {
  // Interleave rows pairwise within 128-bit halves...
  __m256d T0 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 15);
  __m256d T1 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 0);
  __m256d T2 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 15);
  __m256d T3 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 0);

  // ...then recombine the 128-bit halves across packets.
  kernel.packet[1] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 32));
  kernel.packet[3] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 49));
  kernel.packet[0] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 32));
  kernel.packet[2] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 49));
}
// The unsigned transpose is bit-identical to the signed one.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4ul, 4>& kernel) {
  ptranspose((PacketBlock<Packet4l, 4>&)kernel);
}
// Blendless min: use the a>b mask to select a where it is smaller and b
// otherwise (AVX2 has no 64-bit integer min instruction).
template <>
EIGEN_STRONG_INLINE Packet4l pmin<Packet4l>(const Packet4l& a, const Packet4l& b) {
  __m256i cmp = _mm256_cmpgt_epi64(a, b);
  __m256i a_min = _mm256_andnot_si256(cmp, a);
  __m256i b_min = _mm256_and_si256(cmp, b);
  return Packet4l(_mm256_or_si256(a_min, b_min));
}
// Unsigned min via the signed one: bias by 2^63, compare, then un-bias.
template <>
EIGEN_STRONG_INLINE Packet4ul pmin<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return padd((Packet4ul)pmin((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                              (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL))),
              pset1<Packet4ul>(0x8000000000000000UL));
}
// Blendless max, mirroring pmin.
template <>
EIGEN_STRONG_INLINE Packet4l pmax<Packet4l>(const Packet4l& a, const Packet4l& b) {
  __m256i cmp = _mm256_cmpgt_epi64(a, b);
  __m256i a_min = _mm256_and_si256(cmp, a);
  __m256i b_min = _mm256_andnot_si256(cmp, b);
  return Packet4l(_mm256_or_si256(a_min, b_min));
}
// Unsigned max via the bias trick (see pmin above).
template <>
EIGEN_STRONG_INLINE Packet4ul pmax<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return padd((Packet4ul)pmax((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                              (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL))),
              pset1<Packet4ul>(0x8000000000000000UL));
}
// abs(a): cmp is all-ones where a > 0, zero otherwise, so
// psub(cmp, pxor(a, cmp)) yields a for positive lanes and -a for the rest.
template <>
EIGEN_STRONG_INLINE Packet4l pabs<Packet4l>(const Packet4l& a) {
  Packet4l pz = pzero<Packet4l>(a);
  Packet4l cmp = _mm256_cmpgt_epi64(a, pz);
  return psub(cmp, pxor(a, cmp));
}
// Unsigned values are their own absolute value.
template <>
EIGEN_STRONG_INLINE Packet4ul pabs<Packet4ul>(const Packet4ul& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet4l pmul<Packet4l>(const Packet4l& a, const Packet4l& b) {
  // 64-bit mul requires avx512, so do this with 32-bit multiplication
  __m256i upper32_a = _mm256_srli_epi64(a, 32);
  __m256i upper32_b = _mm256_srli_epi64(b, 32);

  // upper * lower
  __m256i mul1 = _mm256_mul_epu32(upper32_a, b);
  __m256i mul2 = _mm256_mul_epu32(upper32_b, a);
  // Gives us both upper*upper and lower*lower
  __m256i mul3 = _mm256_mul_epu32(a, b);

  // Cross terms contribute only to the upper 32 bits of each lane.
  __m256i high = _mm256_slli_epi64(_mm256_add_epi64(mul1, mul2), 32);
  return _mm256_add_epi64(high, mul3);
}
// The low 64 bits of the product are identical for signed and unsigned.
template <>
EIGEN_STRONG_INLINE Packet4ul pmul<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return (Packet4ul)pmul<Packet4l>((Packet4l)a, (Packet4l)b);
}
667 #endif
668 
// Broadcast a scalar to all lanes.
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
template<> EIGEN_STRONG_INLINE Packet8ui pset1<Packet8ui>(const uint32_t& from) { return _mm256_set1_epi32(from); }

// Broadcast a raw bit pattern, reinterpreted as float/double lanes.
template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) { return _mm256_castsi256_ps(pset1<Packet8i>(from)); }
template<> EIGEN_STRONG_INLINE Packet4d pset1frombits<Packet4d>(uint64_t from) { return _mm256_castsi256_pd(_mm256_set1_epi64x(from)); }

// All-zero packets (argument only fixes the overload).
template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); }
template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
template<> EIGEN_STRONG_INLINE Packet8ui pzero(const Packet8ui& /*a*/) { return _mm256_setzero_si256(); }


// Mask selecting the even-indexed lanes (all-ones in lanes 0, 2, ...).
template<> EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) { return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)); }
template<> EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); }
template<> EIGEN_STRONG_INLINE Packet8ui peven_mask(const Packet8ui& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); }
template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1)); }

// Broadcast the pointed-to scalar (single broadcast instruction).
template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }

// Lane-wise addition.
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
#ifdef EIGEN_VECTORIZE_AVX512
// Masked add: only lanes whose bit is set in `umask` are computed; the rest
// are zeroed.  Implemented by widening to 512 bits to use mask registers.
template <>
EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b, uint8_t umask) {
  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
  return _mm512_castps512_ps256(_mm512_maskz_add_ps(
      mask,
      _mm512_castps256_ps512(a),
      _mm512_castps256_ps512(b)));
}
#endif
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
// Integer add: AVX1 lacks 256-bit integer arithmetic, so fall back to two
// 128-bit halves when AVX2 is unavailable.
template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_add_epi32(a,b);
#else
  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8ui padd<Packet8ui>(const Packet8ui& a, const Packet8ui& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_add_epi32(a, b);
#else
  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
722 
// Linear sequence {a, a+1, ..., a+size-1}.
template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return padd(pset1<Packet8f>(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return padd(pset1<Packet4d>(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet8i plset<Packet8i>(const int& a) { return padd(pset1<Packet8i>(a), (Packet8i)_mm256_set_epi32(7,6,5,4,3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet8ui plset<Packet8ui>(const uint32_t& a) { return padd(pset1<Packet8ui>(a), (Packet8ui)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); }

// Lane-wise subtraction; integer versions split into 128-bit halves pre-AVX2.
template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_sub_epi32(a,b);
#else
  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8ui psub<Packet8ui>(const Packet8ui& a, const Packet8ui& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_sub_epi32(a, b);
#else
  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}

// Floating-point negate: flip only the sign bit (preserves -0.0 and NaN payloads).
template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
  return _mm256_xor_ps(a, mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL));
  return _mm256_xor_pd(a, mask);
}
// Integer negate as 0 - a.
template<> EIGEN_STRONG_INLINE Packet8i pnegate(const Packet8i& a)
{
  return psub(pzero(a), a);
}

// Conjugation is the identity for real packets.
template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
768 
// Lane-wise multiplication; integer versions split into 128-bit halves
// pre-AVX2 (no 256-bit integer mul in AVX1).
template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_mullo_epi32(a,b);
#else
  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8ui pmul<Packet8ui>(const Packet8ui& a, const Packet8ui& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_mullo_epi32(a, b);
#else
  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}

// Lane-wise division.
template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }

// Integer division has no AVX instruction.  With AVX-512, widen to double
// (exact for 32-bit ints) and divide; otherwise defer to the SSE pdiv on
// each 128-bit half.
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX512
  return _mm512_cvttpd_epi32(_mm512_div_pd(_mm512_cvtepi32_pd(a), _mm512_cvtepi32_pd(b)));
#else
  Packet4i lo = pdiv<Packet4i>(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  Packet4i hi = pdiv<Packet4i>(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
804 
#ifdef EIGEN_VECTORIZE_FMA
// Fused multiply-add family (single instruction, single rounding):
//   pmadd:  a*b + c    pmsub:  a*b - c
//   pnmadd: -(a*b) + c pnmsub: -(a*b) - c
template <>
EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fmadd_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fmadd_pd(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet8f pmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fmsub_ps(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet4d pmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fmsub_pd(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet8f pnmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fnmadd_ps(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet4d pnmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fnmadd_pd(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet8f pnmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fnmsub_ps(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet4d pnmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fnmsub_pd(a, b, c);
}

#endif
846 
// Element-wise comparisons, returning all-ones / all-zeros masks per lane.
// The *_OQ predicates are "ordered, quiet": they return false when either
// operand is NaN and do not raise exceptions on quiet NaNs.
template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
// NGE_UQ is true when a < b OR either operand is NaN (unordered).
template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
// A lane is NaN iff it compares unordered with itself.
template<> EIGEN_STRONG_INLINE Packet8f pisnan(const Packet8f& a) { return _mm256_cmp_ps(a,a,_CMP_UNORD_Q); }

template<> EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LE_OQ); }
template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LT_OQ); }
template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); }
template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }

// Integer comparisons. There is no native "less-equal", so a <= b is
// computed as NOT(a > b) by XOR-ing with all-ones.
template<> EIGEN_STRONG_INLINE Packet8i pcmp_le(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_xor_si256(_mm256_cmpgt_epi32(a,b), _mm256_set1_epi32(-1));
#else
  __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  lo = _mm_xor_si128(lo, _mm_set1_epi32(-1));
  __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  hi = _mm_xor_si128(hi, _mm_set1_epi32(-1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
// a < b expressed with the available "greater-than" as b > a.
template<> EIGEN_STRONG_INLINE Packet8i pcmp_lt(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_cmpgt_epi32(b,a);
#else
  __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 0), _mm256_extractf128_si256(a, 0));
  __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 1), _mm256_extractf128_si256(a, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_cmpeq_epi32(a,b);
#else
  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
// Equality is sign-agnostic, so the signed compare works for unsigned too.
template<> EIGEN_STRONG_INLINE Packet8ui pcmp_eq(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_cmpeq_epi32(a, b);
#else
  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
896 
// Element-wise min with std::min-style NaN semantics: if one operand is NaN
// the second operand of the underlying vmin instruction is returned, hence
// the swapped arguments below.
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
  // There appears to be a bug in GCC, by which the optimizer may flip
  // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
  Packet8f res;
  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::min.
  return _mm256_min_ps(b,a);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
  // See pmin above
  Packet4d res;
  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::min.
  return _mm256_min_pd(b,a);
#endif
}
// Signed 32-bit min; AVX1 falls back to per-half SSE4.1 operations.
template<> EIGEN_STRONG_INLINE Packet8i pmin<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_min_epi32(a, b);
#else
  __m128i lo = _mm_min_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_min_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
// Unsigned 32-bit min (note the epu32 variants).
template<> EIGEN_STRONG_INLINE Packet8ui pmin<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_min_epu32(a, b);
#else
  __m128i lo = _mm_min_epu32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_min_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}

// Element-wise max; mirrors pmin above, including the GCC workaround and
// the argument swap for std::max-style NaN semantics.
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
  // See pmin above
  Packet8f res;
  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::max.
  return _mm256_max_ps(b,a);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
  // See pmin above
  Packet4d res;
  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::max.
  return _mm256_max_pd(b,a);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8i pmax<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_max_epi32(a, b);
#else
  __m128i lo = _mm_max_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_max_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8ui pmax<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_max_epu32(a, b);
#else
  __m128i lo = _mm_max_epu32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_max_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
981 
#ifdef EIGEN_VECTORIZE_AVX2
// Per-lane sign: vpsignd yields -1 where a<0, 0 where a==0, +1 where a>0
// (it negates/zeros/keeps the constant 1 according to the sign of a).
template<> EIGEN_STRONG_INLINE Packet8i psign(const Packet8i& a) {
  return _mm256_sign_epi32(_mm256_set1_epi32(1), a);
}
#endif
987 
988 // Add specializations for min/max with prescribed NaN progation.
989 template<>
990 EIGEN_STRONG_INLINE Packet8f pmin<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
992 }
993 template<>
994 EIGEN_STRONG_INLINE Packet4d pmin<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
996 }
997 template<>
998 EIGEN_STRONG_INLINE Packet8f pmax<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
1000 }
1001 template<>
1002 EIGEN_STRONG_INLINE Packet4d pmax<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
1004 }
1005 template<>
1006 EIGEN_STRONG_INLINE Packet8f pmin<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
1008 }
1009 template<>
1010 EIGEN_STRONG_INLINE Packet4d pmin<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
1012 }
1013 template<>
1014 EIGEN_STRONG_INLINE Packet8f pmax<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
1016 }
1017 template<>
1018 EIGEN_STRONG_INLINE Packet4d pmax<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
1020 }
1021 
// Rounding to integral values.
// print rounds according to the current MXCSR rounding mode.
template<> EIGEN_STRONG_INLINE Packet8f print<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
template<> EIGEN_STRONG_INLINE Packet4d print<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }

// Round toward +infinity.
template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }

// Round toward -infinity.
template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
1030 
1031 
// Produce an all-ones mask. The argument value is irrelevant; comparing a
// register with itself (equal / always-true predicate) sets every bit.
template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  // vpcmpeqd has lower latency than the more general vcmpps
  return _mm256_cmpeq_epi32(a,a);
#else
  const __m256 b = _mm256_castsi256_ps(a);
  return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
#endif
}

template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  // vpcmpeqd has lower latency than the more general vcmpps
  const __m256i b = _mm256_castps_si256(a);
  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
#else
  return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
#endif
}

template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  // vpcmpeqq has lower latency than the more general vcmppd
  const __m256i b = _mm256_castpd_si256(a);
  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
#else
  return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
#endif
}
1061 
// Bitwise AND. On AVX1 the 256-bit integer ops are emulated through the
// float domain (the bit pattern is preserved by the casts).
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_and_si256(a,b);
#else
  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
#endif
}
template<> EIGEN_STRONG_INLINE Packet8ui pand<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_and_si256(a,b);
#else
  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
#endif
}

// Bitwise OR.
template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_or_si256(a,b);
#else
  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
#endif
}
template<> EIGEN_STRONG_INLINE Packet8ui por<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_or_si256(a,b);
#else
  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
#endif
}

// Bitwise XOR.
template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_xor_si256(a,b);
#else
  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
#endif
}
template<> EIGEN_STRONG_INLINE Packet8ui pxor<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_xor_si256(a, b);
#else
  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}

// pandnot(a,b) computes a & ~b. Note the operand swap: the andnot
// instruction itself computes (~first) & second.
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_andnot_si256(b,a);
#else
  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
#endif
}
template<> EIGEN_STRONG_INLINE Packet8ui pandnot<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_andnot_si256(b,a);
#else
  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
#endif
}
1129 
// Unsigned comparisons built from unsigned min/max, which exist natively:
// a < b  <=>  NOT(a == max(a,b));  a <= b  <=>  a == min(a,b).
template<> EIGEN_STRONG_INLINE Packet8ui pcmp_lt(const Packet8ui& a, const Packet8ui& b) {
  return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
}
template<> EIGEN_STRONG_INLINE Packet8ui pcmp_le(const Packet8ui& a, const Packet8ui& b) {
  return pcmp_eq(a, pmin(a, b));
}
1136 
// Round half away from zero (the behavior of std::round), which the
// hardware does not provide directly: add the largest float strictly
// smaller than 0.5 with the sign of a, then truncate toward zero.
template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a)
{
  const Packet8f mask = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x80000000u));   // sign bit
  const Packet8f prev0dot5 = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x3EFFFFFFu)); // nextafter(0.5f, 0)
  return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a)
{
  const Packet4d mask = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x8000000000000000ull));   // sign bit
  const Packet4d prev0dot5 = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull)); // nextafter(0.5, 0)
  return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
1149 
// Per-lane select: returns a where the mask is set, b elsewhere.
// blendv picks its *second* source where the mask is set, hence (b, a, mask).
template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
{ return _mm256_blendv_ps(b,a,mask); }
// Integer selects reuse the float blend through bit-preserving casts.
template<> EIGEN_STRONG_INLINE Packet8i pselect<Packet8i>(const Packet8i& mask, const Packet8i& a, const Packet8i& b)
{ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))); }
template<> EIGEN_STRONG_INLINE Packet8ui pselect<Packet8ui>(const Packet8ui& mask, const Packet8ui& a, const Packet8ui& b)
{ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))); }

template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
{ return _mm256_blendv_pd(b,a,mask); }
1159 
// Shifts by a compile-time immediate N. AVX1 lacks 256-bit integer shifts,
// so the fallback shifts each 128-bit half with SSE2 and recombines.

// Arithmetic (sign-extending) right shift.
template<int N> EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_srai_epi32(a, N);
#else
  __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N);
  __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}

// Logical (zero-filling) right shift.
template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_srli_epi32(a, N);
#else
  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}

// Left shift.
template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_slli_epi32(a, N);
#else
  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}

// For unsigned lanes an arithmetic right shift is the same as a logical one.
template<int N> EIGEN_STRONG_INLINE Packet8ui parithmetic_shift_right(Packet8ui a) {
  return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
}
template<int N> EIGEN_STRONG_INLINE Packet8ui plogical_shift_right(Packet8ui a) {
  return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
}
template<int N> EIGEN_STRONG_INLINE Packet8ui plogical_shift_left(Packet8ui a) {
  return (Packet8ui)plogical_shift_left<N>((Packet8i)a);
}
1199 
// Aligned loads (address must be 32-byte aligned).
template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
template<> EIGEN_STRONG_INLINE Packet8ui pload<Packet8ui>(const uint32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }

// Unaligned loads.
template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
template<> EIGEN_STRONG_INLINE Packet8ui ploadu<Packet8ui>(const uint32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }

// Masked unaligned load: lane i is loaded iff bit i of umask is set,
// other lanes are zeroed.
template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
#ifdef EIGEN_VECTORIZE_AVX512
  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_castps512_ps256(_mm512_maskz_loadu_ps(mask, from));
#else
  // Expand the 8-bit mask into a full-width lane mask: broadcast umask to
  // every byte, then OR lane i with ~(1 << i) in its low byte; the lane
  // becomes all-ones exactly when bit i of umask is set.
  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
  mask = por<Packet8i>(mask, bit_mask);
  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
#endif
}
1222 
// Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
  // TODO try to find a way to avoid the need of a temporary register
  // Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
  // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
  // return _mm256_unpacklo_ps(tmp,tmp);

  // _mm256_insertf128_ps is very slow on Haswell, thus:
  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "inplace" permutation of the lower 128bits using a blend
  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
  // then we can perform a consistent permutation on the global register to get everything in shape:
  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
// Loads 2 doubles from memory a returns the packet {a0, a0, a1, a1}
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
  // Broadcast {a0, a1} into both 128-bit halves, then duplicate a0 in the
  // low half and a1 in the high half (immediate 3<<2 selects {lo,lo,hi,hi}).
  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  return _mm256_permute_pd(tmp, 3<<2);
}
// Loads 4 integers from memory a returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8i ploaddup<Packet8i>(const int* from)
{
#ifdef EIGEN_VECTORIZE_AVX2
  // A single cross-lane permute duplicates each element.
  const Packet8i a = _mm256_castsi128_si256(ploadu<Packet4i>(from));
  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
#else
  // AVX1: reuse the float-domain trick from ploaddup<Packet8f> above.
  __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "inplace" permutation of the lower 128bits using a blend
  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
  // then we can perform a consistent permutation on the global register to get everything in shape:
  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)));
#endif
}
// Unsigned variant: identical bit-level operation as Packet8i.
template<> EIGEN_STRONG_INLINE Packet8ui ploaddup<Packet8ui>(const uint32_t* from) {
#ifdef EIGEN_VECTORIZE_AVX2
  const Packet8ui a = _mm256_castsi128_si256(ploadu<Packet4ui>(from));
  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
#else
  __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "inplace" permutation of the lower 128bits using a blend
  tmp = _mm256_blend_ps(
      tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
  // then we can perform a consistent permutation on the global register to get
  // everything in shape:
  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2)));
#endif
}
1272 
// Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1}
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
{
  // Broadcast from[0] into the low half and from[1] into the high half.
  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
}
// Integer analogue: {a0 x4, a1 x4}.
template<> EIGEN_STRONG_INLINE Packet8i ploadquad<Packet8i>(const int* from)
{
  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from+1)), 1);
}
template<> EIGEN_STRONG_INLINE Packet8ui ploadquad<Packet8ui>(const uint32_t* from) {
  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
}
1286 
// Aligned stores (address must be 32-byte aligned).
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet8ui& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }

// Unaligned stores.
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet8ui& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }

// Masked unaligned store: lane i is written iff bit i of umask is set.
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from, uint8_t umask) {
#ifdef EIGEN_VECTORIZE_AVX512
  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from));
#else
  // Expand the 8-bit mask into a full lane mask: broadcast umask to every
  // byte, OR every byte of lane i with ~(1 << i), then compare against
  // all-ones — lane i is all-ones exactly when bit i of umask is set.
  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
  const Packet8i bit_mask = _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe);
  mask = por<Packet8i>(mask, bit_mask);
  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
#if EIGEN_COMP_MSVC
  // MSVC sometimes seems to use a bogus mask with maskstore.
  const __m256i ifrom = _mm256_castps_si256(from);
  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0), reinterpret_cast<char*>(to));
  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1), reinterpret_cast<char*>(to + 4));
#else
  EIGEN_DEBUG_UNALIGNED_STORE _mm256_maskstore_ps(to, mask, from);
#endif
#endif
}
1316 
// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
// Strided gathers: element i is read from from[i*stride] (stride in elements).
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
{
  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
{
  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet8i pgather<int, Packet8i>(const int* from, Index stride)
{
  return _mm256_set_epi32(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
                          from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
1334  return (Packet8ui)pgather<int, Packet8i>((int*)from, stride);
1335 }
1336 
// Strided scatters: element i is written to to[i*stride] (stride in
// elements). Each 128-bit half is extracted once and its lanes stored
// individually.
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
{
  __m128 low = _mm256_extractf128_ps(from, 0);
  to[stride*0] = _mm_cvtss_f32(low);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));

  __m128 high = _mm256_extractf128_ps(from, 1);
  to[stride*4] = _mm_cvtss_f32(high);
  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
{
  __m128d low = _mm256_extractf128_pd(from, 0);
  to[stride*0] = _mm_cvtsd_f64(low);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
  __m128d high = _mm256_extractf128_pd(from, 1);
  to[stride*2] = _mm_cvtsd_f64(high);
  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet8i>(int* to, const Packet8i& from, Index stride)
{
  __m128i low = _mm256_extractf128_si256(from, 0);
  to[stride*0] = _mm_extract_epi32(low, 0);
  to[stride*1] = _mm_extract_epi32(low, 1);
  to[stride*2] = _mm_extract_epi32(low, 2);
  to[stride*3] = _mm_extract_epi32(low, 3);

  __m128i high = _mm256_extractf128_si256(from, 1);
  to[stride*4] = _mm_extract_epi32(high, 0);
  to[stride*5] = _mm_extract_epi32(high, 1);
  to[stride*6] = _mm_extract_epi32(high, 2);
  to[stride*7] = _mm_extract_epi32(high, 3);
}
// Unsigned variant: identical bit-level operation as Packet8i.
template<> EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index stride) {
  pscatter<int, Packet8i>((int*)to, (Packet8i)from, stride);
}
1377 
// Broadcast a scalar and store the resulting packet (aligned).
template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
  Packet8f pa = pset1<Packet8f>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
{
  Packet4d pa = pset1<Packet4d>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
{
  Packet8i pa = pset1<Packet8i>(a);
  pstore(to, pa);
}
1393 
// Software prefetch into all cache levels (T0 hint). Guarded because the
// AVX512 header provides its own specializations.
#ifndef EIGEN_VECTORIZE_AVX512
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
1400 
// Extract the first (lowest) element of a packet.
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet8ui>(const Packet8ui& a) {
  // The extract intrinsic returns a signed int; reinterpret the bits.
  return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm256_castsi256_si128(a)));
}
1413 
1414 
1415 template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
1416 {
1417  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
1418  return _mm256_permute2f128_ps(tmp, tmp, 1);
1419 }
// Reverse the 4 doubles: swap the pair inside each lane (mask 5), then
// swap the two 128-bit lanes.
template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
  __m256d tmp = _mm256_shuffle_pd(a,a,5);
  return _mm256_permute2f128_pd(tmp, tmp, 1);
#if 0
  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
  return _mm256_permute_pd(swap_halves,5);
#endif
}
// Integer reversals reuse the float/double implementations via bitwise
// casts (no value conversion takes place).
template<> EIGEN_STRONG_INLINE Packet8i preverse(const Packet8i& a)
{
  return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
}
template<> EIGEN_STRONG_INLINE Packet8ui preverse(const Packet8ui& a) {
  return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
}

#ifdef EIGEN_VECTORIZE_AVX2
// 64-bit integer packets piggyback on the double reversal.
template<> EIGEN_STRONG_INLINE Packet4l preverse(const Packet4l& a)
 {
  return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
}
template<> EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
  return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
}
#endif
1448 
// pabs should be ok
// Absolute value: clear the IEEE sign bit with a bitwise AND mask.
template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
{
  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm256_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
{
  // Each 64-bit lane is built from two 32-bit words (little endian):
  // low word all-ones, high word 0x7FFFFFFF => 0x7FFFFFFFFFFFFFFF.
  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm256_and_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet8i pabs(const Packet8i& a)
{
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_abs_epi32(a);
#else
  // AVX1 has no 256-bit integer abs: process the two 128-bit halves with SSSE3.
  __m128i lo = _mm_abs_epi32(_mm256_extractf128_si256(a, 0));
  __m128i hi = _mm_abs_epi32(_mm256_extractf128_si256(a, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
// Unsigned values are already non-negative.
template<> EIGEN_STRONG_INLINE Packet8ui pabs(const Packet8ui& a) { return a; }
1471 
// psignbit: arithmetic right shift replicates the sign bit across the
// whole element, yielding all-ones for negative values and zero otherwise.
template<> EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) { return _mm_srai_epi16(a, 15); }
template<> EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) { return _mm_srai_epi16(a, 15); }
template<> EIGEN_STRONG_INLINE Packet8f psignbit(const Packet8f& a) { return _mm256_castsi256_ps(parithmetic_shift_right<31>((Packet8i)_mm256_castps_si256(a))); }
// Unsigned packets never have a sign bit set.
template<> EIGEN_STRONG_INLINE Packet8ui psignbit(const Packet8ui& a) { return pzero(a); }
#ifdef EIGEN_VECTORIZE_AVX2
template<> EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) { return _mm256_castsi256_pd(parithmetic_shift_right<63>((Packet4l)_mm256_castpd_si256(a))); }
template<> EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& a) { return pzero(a); }
#endif
1480 
// frexp: delegate to the generic implementation shared across backends.
template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
  return pfrexp_generic(a,exponent);
}
1484 
1485 // Extract exponent without existence of Packet4l.
1486 template<>
1487 EIGEN_STRONG_INLINE
1489  const Packet4d cst_exp_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
1490  __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
1491 #ifdef EIGEN_VECTORIZE_AVX2
1492  a_expo = _mm256_srli_epi64(a_expo, 52);
1493  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
1494  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
1495 #else
1496  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
1497  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
1498  lo = _mm_srli_epi64(lo, 52);
1499  hi = _mm_srli_epi64(hi, 52);
1500 #endif
1501  Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3));
1502  Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3));
1503  Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0);
1504  exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1);
1505  return exponent;
1506 }
1507 
1508 
// frexp/ldexp wrappers that forward to the shared generic implementations.
template<> EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
  return pfrexp_generic(a, exponent);
}

template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
  return pldexp_generic(a, exponent);
}
1516 
// ldexp for doubles: computes a * 2^exponent without a 64-bit integer
// packet, by splitting the exponent into four factors so each partial
// power of two stays representable.
template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
  // Clamp exponent to [-2099, 2099]
  const Packet4d max_exponent = pset1<Packet4d>(2099.0);
  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));

  // Split 2^e into four factors and multiply.
  const Packet4i bias = pset1<Packet4i>(1023);
  Packet4i b = parithmetic_shift_right<2>(e); // floor(e/4)

  // 2^b: place (b + bias) into the exponent field of each double.
  Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
  Packet4i lo = _mm_slli_epi64(hi, 52);
  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
  Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
  Packet4d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)

  // 2^(e - 3b): the remainder factor applied once.
  b = psub(psub(psub(e, b), b), b); // e - 3b
  hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
  lo = _mm_slli_epi64(hi, 52);
  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
  c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
  out = pmul(out, c); // a * 2^e
  return out;
}
1542 
// Horizontal sum: add the two 128-bit halves, then reduce the SSE packet.
template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
{
  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
}
template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
}
template<> EIGEN_STRONG_INLINE int predux<Packet8i>(const Packet8i& a)
{
  return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a),_mm256_extractf128_si256(a,1))));
}
template<> EIGEN_STRONG_INLINE uint32_t predux<Packet8ui>(const Packet8ui& a) {
  return predux(Packet4ui(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
}
1558 
// Reduce an 8-lane packet to 4 lanes by adding the two 128-bit halves.
template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
{
  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}
template<> EIGEN_STRONG_INLINE Packet4i predux_half_dowto4<Packet8i>(const Packet8i& a)
{
  return _mm_add_epi32(_mm256_castsi256_si128(a),_mm256_extractf128_si256(a,1));
}
template<> EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a) {
  return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
}
1570 
// Horizontal product via log2(N) fold steps: lane swap, pair swap,
// adjacent swap, reading the result from lane 0.
template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
  Packet8f tmp;
  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
{
  Packet4d tmp;
  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
}
1584 
// Horizontal min/max: same fold pattern as predux_mul but with min/max ops.
template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
{
  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
{
  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}

template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
{
  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}

template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
{
  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
1609 
1610 // not needed yet
1611 // template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
1612 // {
1613 // return _mm256_movemask_ps(x)==0xFF;
1614 // }
1615 
// True if any lane has its sign/mask bit set; movemask collects the
// top bit of every 32-bit lane into an 8-bit scalar.
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
{
  return _mm256_movemask_ps(x) != 0;
}

template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x)
{
  return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
}
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& x)
{
  return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
}
1629 
// 8x8 float transpose: unpack pairs of rows (T*), shuffle 4-wide columns
// together (S*), then interleave the 128-bit lanes to finish.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,8>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
  __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
  __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
  __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
  __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
  // 0x20 selects (low(A), low(B)); 0x31 selects (high(A), high(B)).
  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
}
1657 
// Transpose a 4x8 block of floats (4 packets of 8): unpack, shuffle,
// then recombine 128-bit lanes.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,4>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));

  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
}
1675 
// Integer shuffle expressed through the float unit (works on AVX1 too).
#define MM256_SHUFFLE_EPI32(A, B, M) \
 _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B), M))

#ifndef EIGEN_VECTORIZE_AVX2
// AVX1 has no 256-bit integer unpack: emulate via float unpacks.
#define MM256_UNPACKLO_EPI32(A, B) \
 _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B)))
#define MM256_UNPACKHI_EPI32(A, B) \
 _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B)))
#else
// AVX2: use the native integer unpack instructions.
#define MM256_UNPACKLO_EPI32(A, B) _mm256_unpacklo_epi32(A, B)
#define MM256_UNPACKHI_EPI32(A, B) _mm256_unpackhi_epi32(A, B)
#endif
1688 
1689 
// 8x8 int32 transpose, mirroring the float version through the
// MM256_* macros so it also builds on AVX1.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8i,8>& kernel) {
  __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
  __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
  __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
  __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);
  __m256i T4 = MM256_UNPACKLO_EPI32(kernel.packet[4], kernel.packet[5]);
  __m256i T5 = MM256_UNPACKHI_EPI32(kernel.packet[4], kernel.packet[5]);
  __m256i T6 = MM256_UNPACKLO_EPI32(kernel.packet[6], kernel.packet[7]);
  __m256i T7 = MM256_UNPACKHI_EPI32(kernel.packet[6], kernel.packet[7]);
  __m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256i S2 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256i S3 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(3,2,3,2));
  __m256i S4 = MM256_SHUFFLE_EPI32(T4,T6,_MM_SHUFFLE(1,0,1,0));
  __m256i S5 = MM256_SHUFFLE_EPI32(T4,T6,_MM_SHUFFLE(3,2,3,2));
  __m256i S6 = MM256_SHUFFLE_EPI32(T5,T7,_MM_SHUFFLE(1,0,1,0));
  __m256i S7 = MM256_SHUFFLE_EPI32(T5,T7,_MM_SHUFFLE(3,2,3,2));
  kernel.packet[0] = _mm256_permute2f128_si256(S0, S4, 0x20);
  kernel.packet[1] = _mm256_permute2f128_si256(S1, S5, 0x20);
  kernel.packet[2] = _mm256_permute2f128_si256(S2, S6, 0x20);
  kernel.packet[3] = _mm256_permute2f128_si256(S3, S7, 0x20);
  kernel.packet[4] = _mm256_permute2f128_si256(S0, S4, 0x31);
  kernel.packet[5] = _mm256_permute2f128_si256(S1, S5, 0x31);
  kernel.packet[6] = _mm256_permute2f128_si256(S2, S6, 0x31);
  kernel.packet[7] = _mm256_permute2f128_si256(S3, S7, 0x31);
}
// uint32 packets share the same bit layout; reuse the signed transpose.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8ui, 8>& kernel) {
  ptranspose((PacketBlock<Packet8i, 8>&)kernel);
}
1720 
// Transpose a 4x8 block of int32, mirroring the Packet8f,4 version.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8i,4>& kernel) {
  __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
  __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
  __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
  __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);

  __m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256i S2 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256i S3 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(3,2,3,2));

  kernel.packet[0] = _mm256_permute2f128_si256(S0, S1, 0x20);
  kernel.packet[1] = _mm256_permute2f128_si256(S2, S3, 0x20);
  kernel.packet[2] = _mm256_permute2f128_si256(S0, S1, 0x31);
  kernel.packet[3] = _mm256_permute2f128_si256(S2, S3, 0x31);
}
// Unsigned variant reuses the signed transpose (identical bit layout).
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8ui, 4>& kernel) {
  ptranspose((PacketBlock<Packet8i, 4>&)kernel);
}
1741 
// 4x4 double transpose: shuffle_pd with masks 15/0 picks the high/low
// elements of each lane pair, then permute2f128 merges the lanes.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4d,4>& kernel) {
  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);

  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
1754 
// Select elsePacket where the selector is 0, thenPacket otherwise.
// The comparison builds a "false" mask, so blendv's operand order is
// (then, else, false_mask).
template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
#ifdef EIGEN_VECTORIZE_AVX2
  const __m256i zero = _mm256_setzero_si256();
  const __m256i select = _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256i false_mask = _mm256_cmpeq_epi32(zero, select);
  return _mm256_blendv_ps(thenPacket, elsePacket, _mm256_castsi256_ps(false_mask));
#else
  // AVX1 lacks 256-bit integer compares: compare as floats instead.
  const __m256 zero = _mm256_setzero_ps();
  const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
  return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
#endif
}

template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
#ifdef EIGEN_VECTORIZE_AVX2
  const __m256i zero = _mm256_setzero_si256();
  const __m256i select = _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256i false_mask = _mm256_cmpeq_epi64(select, zero);
  return _mm256_blendv_pd(thenPacket, elsePacket, _mm256_castsi256_pd(false_mask));
#else
  const __m256d zero = _mm256_setzero_pd();
  const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
  return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
#endif
}
1782 
// Packet math for Eigen::half
#ifndef EIGEN_VECTORIZE_AVX512FP16
// Traits for the 8-lane half packet (stored in a 128-bit register).
template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; };
#endif
1787 
// Broadcast one half value to all 8 lanes (halves are moved as raw uint16 bits).
template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
}

// Extract lane 0 as an Eigen::half.
template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
  return numext::bit_cast<Eigen::half>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
}

// Aligned / unaligned 128-bit loads and stores of 8 halves.
template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}

template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}

template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}

template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
1811 
// Load 4 halves and duplicate each: {a,a,b,b,c,c,d,d}.
template<> EIGEN_STRONG_INLINE Packet8h
ploaddup<Packet8h>(const Eigen::half* from) {
  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
  return _mm_set_epi16(d, d, c, c, b, b, a, a);
}

// Load 2 halves and replicate each four times: {a,a,a,a,b,b,b,b}.
template<> EIGEN_STRONG_INLINE Packet8h
ploadquad<Packet8h>(const Eigen::half* from) {
  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
  return _mm_set_epi16(b, b, b, b, a, a, a, a);
}
1827 
// All-ones mask: comparing a register with itself sets every bit.
template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
  return _mm_cmpeq_epi32(a, a);
}

// Absolute value of halves: clear the sign bit (bit 15) of each lane.
template <>
EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) {
  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
  return _mm_andnot_si128(sign_mask, a);
}
1837 
// Widen 8 halves to 8 floats. Uses the F16C instruction when available,
// otherwise a software conversion per 128-bit half.
EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
#ifdef EIGEN_HAS_FP16_C
  return _mm256_cvtph_ps(a);
#else
  Eigen::internal::Packet8f pp = _mm256_castsi256_ps(_mm256_insertf128_si256(
 _mm256_castsi128_si256(half2floatsse(a)), half2floatsse(_mm_srli_si128(a, 8)), 1));
  return pp;
#endif
}

// Narrow 8 floats to 8 halves, rounding to nearest even.
EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
#ifdef EIGEN_HAS_FP16_C
  return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT);
#else
  __m128i lo = float2half(_mm256_extractf128_ps(a, 0));
  __m128i hi = float2half(_mm256_extractf128_ps(a, 1));
  // Each half already fits in 16 bits; packus merges the two groups of 4.
  return _mm_packus_epi32(lo, hi);
#endif
}
1857 
1858 template <>
1859 EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a,
1860  const Packet8h& b) {
1862 }
1863 
1864 template <>
1865 EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a,
1866  const Packet8h& b) {
1868 }
1869 
// {a, a+1, ..., a+7}: build the ramp in float, then narrow to half.
template <>
EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
  return float2half(plset<Packet8f>(static_cast<float>(a)));
}
1874 
// Bitwise ops on half packets operate directly on the raw 128-bit register.
template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
  // in some cases Packet4i is a wrapper around __m128i, so we either need to
  // cast to Packet4i to directly call the intrinsics as below:
  return _mm_or_si128(a,b);
}
template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
  return _mm_xor_si128(a,b);
}
template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
  return _mm_and_si128(a,b);
}
// pandnot(a,b) == a & ~b; note the intrinsic negates its FIRST operand.
template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
  return _mm_andnot_si128(b,a);
}

// Per-byte select: lanes of `mask` with the high bit set take `a`, else `b`.
template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
  return _mm_blendv_epi8(b, a, mask);
}
1893 
1894 template<> EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
1896 }
1897 
1898 template<> EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
1900 }
1901 
1902 template<> EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
1904 }
1905 
1906 template<> EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
1908 }
1909 
// Comparisons: compare in float, then Pack16To8 narrows the 32-bit masks
// back to 16-bit all-ones/all-zeros lanes.
template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
  return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
}

template<> EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a,const Packet8h& b) {
  return Pack16To8(pcmp_le(half2float(a), half2float(b)));
}

template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a,const Packet8h& b) {
  return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
}
1921 
1922 template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a,const Packet8h& b) {
1924 }
1925 
// Conjugate of a real type is the identity.
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }

// Negate by flipping the IEEE sign bit (bit 15) of every lane.
template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
  Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
  return _mm_xor_si128(a, sign_mask);
}
1932 
#ifndef EIGEN_VECTORIZE_AVX512FP16
// Arithmetic on half packets: no native fp16 ALU here, so widen to
// float, operate, and narrow back (round-trip per operation).
template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
  Packet8f af = half2float(a);
  Packet8f bf = half2float(b);
  Packet8f rf = padd(af, bf);
  return float2half(rf);
}

template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
  Packet8f af = half2float(a);
  Packet8f bf = half2float(b);
  Packet8f rf = psub(af, bf);
  return float2half(rf);
}

template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
  Packet8f af = half2float(a);
  Packet8f bf = half2float(b);
  Packet8f rf = pmul(af, bf);
  return float2half(rf);
}

template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
  Packet8f af = half2float(a);
  Packet8f bf = half2float(b);
  Packet8f rf = pdiv(af, bf);
  return float2half(rf);
}
#endif
1962 
// Strided gather of 8 halves; values are moved as raw uint16 bits.
template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
{
  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
  // _mm_set_epi16 takes arguments high-to-low, so s0 lands in lane 0.
  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
}

// Strided scatter: spill the packet to an aligned buffer, then copy out.
template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
{
  EIGEN_ALIGN32 Eigen::half aux[8];
  pstore(aux, from);
  to[stride*0] = aux[0];
  to[stride*1] = aux[1];
  to[stride*2] = aux[2];
  to[stride*3] = aux[3];
  to[stride*4] = aux[4];
  to[stride*5] = aux[5];
  to[stride*6] = aux[6];
  to[stride*7] = aux[7];
}
1989 
1990 
#ifndef EIGEN_VECTORIZE_AVX512FP16
// Horizontal reductions for halves: widen to float, reduce there, and
// convert the scalar result back to half.
template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux<Packet8f>(af);
  return Eigen::half(reduced);
}
#endif

template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux_max<Packet8f>(af);
  return Eigen::half(reduced);
}

template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux_min<Packet8f>(af);
  return Eigen::half(reduced);
}

template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
  Packet8f af = half2float(a);
  float reduced = predux_mul<Packet8f>(af);
  return Eigen::half(reduced);
}
2016 
// Reverse the 8 halves with a byte shuffle: each pair of bytes (one
// half) is moved to the mirrored lane.
template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
{
  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
  return _mm_shuffle_epi8(a,m);
}
2022 
// 8x8 transpose of half packets via three rounds of unpack (16-, 32-,
// then 64-bit granularity). Variable names encode which source rows and
// element indices each register holds, e.g. a03b03 = elements 0..3 of
// rows a and b interleaved.
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet8h,8>& kernel) {
  __m128i a = kernel.packet[0];
  __m128i b = kernel.packet[1];
  __m128i c = kernel.packet[2];
  __m128i d = kernel.packet[3];
  __m128i e = kernel.packet[4];
  __m128i f = kernel.packet[5];
  __m128i g = kernel.packet[6];
  __m128i h = kernel.packet[7];

  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
  __m128i g47h47 = _mm_unpackhi_epi16(g, h);

  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);

  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);

  kernel.packet[0] = a0b0c0d0e0f0g0h0;
  kernel.packet[1] = a1b1c1d1e1f1g1h1;
  kernel.packet[2] = a2b2c2d2e2f2g2h2;
  kernel.packet[3] = a3b3c3d3e3f3g3h3;
  kernel.packet[4] = a4b4c4d4e4f4g4h4;
  kernel.packet[5] = a5b5c5d5e5f5g5h5;
  kernel.packet[6] = a6b6c6d6e6f6g6h6;
  kernel.packet[7] = a7b7c7d7e7f7g7h7;
}
2070 
// 4x8 half transpose done through memory: spill to aligned scratch,
// permute with scalar loops, reload. Simpler than a shuffle network and
// not on a hot path.
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet8h,4>& kernel) {
  EIGEN_ALIGN32 Eigen::half in[4][8];
  pstore<Eigen::half>(in[0], kernel.packet[0]);
  pstore<Eigen::half>(in[1], kernel.packet[1]);
  pstore<Eigen::half>(in[2], kernel.packet[2]);
  pstore<Eigen::half>(in[3], kernel.packet[3]);

  EIGEN_ALIGN32 Eigen::half out[4][8];

  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      out[i][j] = in[j][2*i];
    }
    for (int j = 0; j < 4; ++j) {
      out[i][j+4] = in[j][2*i+1];
    }
  }

  kernel.packet[0] = pload<Packet8h>(out[0]);
  kernel.packet[1] = pload<Packet8h>(out[1]);
  kernel.packet[2] = pload<Packet8h>(out[2]);
  kernel.packet[3] = pload<Packet8h>(out[3]);
}
2095 
2096 // BFloat16 implementation.
2097 
// Widen 8 bfloat16 values to floats: a bfloat16 is the top 16 bits of a
// float, so zero-extend to 32 bits and shift left by 16.
EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  __m256i extend = _mm256_cvtepu16_epi32(a);
  return _mm256_castsi256_ps(_mm256_slli_epi32(extend, 16));
#else
  // AVX1: extend and shift each 128-bit half separately, then recombine.
  __m128i lo = _mm_cvtepu16_epi32(a);
  __m128i hi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
  __m128i lo_shift = _mm_slli_epi32(lo, 16);
  __m128i hi_shift = _mm_slli_epi32(hi, 16);
  return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo_shift), hi_shift, 1));
#endif
}
2110 
// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
// NaN inputs are canonicalized to 0x7fc0 before the truncation, since
// rounding could otherwise turn a NaN payload into infinity.
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) {

  __m256i input = _mm256_castps_si256(a);

#ifdef EIGEN_VECTORIZE_AVX2
  // uint32_t lsb = (input >> 16);
  __m256i t = _mm256_srli_epi32(input, 16);
  // uint32_t lsb = lsb & 1;
  t = _mm256_and_si256(t, _mm256_set1_epi32(1));
  // uint32_t rounding_bias = 0x7fff + lsb;
  t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff));
  // input += rounding_bias;
  t = _mm256_add_epi32(t, input);
  // input = input >> 16;
  t = _mm256_srli_epi32(t, 16);
  // Check NaN before converting back to bf16
  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
  __m256i nan = _mm256_set1_epi32(0x7fc0);
  t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
  // output = numext::bit_cast<uint16_t>(input);
  return _mm_packus_epi32(_mm256_extractf128_si256(t, 0),
 _mm256_extractf128_si256(t, 1));
#else
  // AVX1 path: the same algorithm applied to each 128-bit half with SSE.
  // uint32_t lsb = (input >> 16);
  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16);
  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16);
  // uint32_t lsb = lsb & 1;
  lo = _mm_and_si128(lo, _mm_set1_epi32(1));
  hi = _mm_and_si128(hi, _mm_set1_epi32(1));
  // uint32_t rounding_bias = 0x7fff + lsb;
  lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff));
  hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff));
  // input += rounding_bias;
  lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0));
  hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1));
  // input = input >> 16;
  lo = _mm_srli_epi32(lo, 16);
  hi = _mm_srli_epi32(hi, 16);
  // Check NaN before converting back to bf16
  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
  __m128i nan = _mm_set1_epi32(0x7fc0);
  lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask)));
  hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1)));
  // output = numext::bit_cast<uint16_t>(input);
  return _mm_packus_epi32(lo, hi);
#endif
}
2159 
2160 template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
2161  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
2162 }
2163 
2164 template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
2165  return numext::bit_cast<bfloat16>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
2166 }
2167 
2168 template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
2169  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
2170 }
2171 
2172 template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
2173  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
2174 }
2175 
2176 template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
2177  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
2178 }
2179 
2180 template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
2181  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
2182 }
2183 
2184 template<> EIGEN_STRONG_INLINE Packet8bf
2185 ploaddup<Packet8bf>(const bfloat16* from) {
2186  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
2187  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
2188  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
2189  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
2190  return _mm_set_epi16(d, d, c, c, b, b, a, a);
2191 }
2192 
2193 template<> EIGEN_STRONG_INLINE Packet8bf
2194 ploadquad<Packet8bf>(const bfloat16* from) {
2195  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
2196  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
2197  return _mm_set_epi16(b, b, b, b, a, a, a, a);
2198 }
2199 
2200 template<> EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
2201  return _mm_cmpeq_epi32(a, a);
2202 }
2203 
2204 template <>
2205 EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
2206  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
2207  return _mm_andnot_si128(sign_mask, a);
2208 }
2209 
2210 template <>
2211 EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a,
2212  const Packet8bf& b) {
2214 }
2215 
2216 template <>
2217 EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a,
2218  const Packet8bf& b) {
2220 }
2221 
2222 template <>
2223 EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
2224  return F32ToBf16(plset<Packet8f>(static_cast<float>(a)));
2225 }
2226 
2227 template<> EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a,const Packet8bf& b) {
2228  return _mm_or_si128(a,b);
2229 }
2230 template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a,const Packet8bf& b) {
2231  return _mm_xor_si128(a,b);
2232 }
2233 template<> EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a,const Packet8bf& b) {
2234  return _mm_and_si128(a,b);
2235 }
2236 template<> EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a,const Packet8bf& b) {
2237  return _mm_andnot_si128(b,a);
2238 }
2239 
2240 template<> EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) {
2241  return _mm_blendv_epi8(b, a, mask);
2242 }
2243 
2244 template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a)
2245 {
2247 }
2248 
2249 template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
2251 }
2252 
2253 template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
2255 }
2256 
2257 template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
2259 }
2260 
2261 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a,const Packet8bf& b) {
2262  return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
2263 }
2264 
2265 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a,const Packet8bf& b) {
2266  return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
2267 }
2268 
2269 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a,const Packet8bf& b) {
2270  return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
2271 }
2272 
2273 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a,const Packet8bf& b) {
2275 }
2276 
// bfloat16 is a real scalar type, so conjugation is the identity.
2277 template<> EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) { return a; }
2278 
2279 template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
2280  Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
2281  return _mm_xor_si128(a, sign_mask);
2282 }
2283 
2284 template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2286 }
2287 
2288 template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2290 }
2291 
2292 template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2294 }
2295 
2296 template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2298 }
2299 
2300 
2301 template<> EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
2302 {
2303  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
2304  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
2305  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
2306  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
2307  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
2308  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
2309  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
2310  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
2311  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
2312 }
2313 
2314 template<> EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
2315 {
2316  EIGEN_ALIGN32 bfloat16 aux[8];
2317  pstore(aux, from);
2318  to[stride*0] = aux[0];
2319  to[stride*1] = aux[1];
2320  to[stride*2] = aux[2];
2321  to[stride*3] = aux[3];
2322  to[stride*4] = aux[4];
2323  to[stride*5] = aux[5];
2324  to[stride*6] = aux[6];
2325  to[stride*7] = aux[7];
2326 }
2327 
2328 template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
2329  return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
2330 }
2331 
2332 template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
2333  return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
2334 }
2335 
2336 template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
2337  return static_cast<bfloat16>(predux_min<Packet8f>(Bf16ToF32(a)));
2338 }
2339 
2340 template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
2341  return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
2342 }
2343 
2344 template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
2345 {
2346  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
2347  return _mm_shuffle_epi8(a,m);
2348 }
2349 
// In-place 8x8 transpose of eight Packet8bf rows, done as three rounds of
// interleaves at growing granularity (16-bit, 32-bit, 64-bit). Variable
// names encode contents: e.g. a03b03 holds elements 0..3 of rows a and b
// interleaved.
2350 EIGEN_STRONG_INLINE void
2351 ptranspose(PacketBlock<Packet8bf,8>& kernel) {
2352  __m128i a = kernel.packet[0];
2353  __m128i b = kernel.packet[1];
2354  __m128i c = kernel.packet[2];
2355  __m128i d = kernel.packet[3];
2356  __m128i e = kernel.packet[4];
2357  __m128i f = kernel.packet[5];
2358  __m128i g = kernel.packet[6];
2359  __m128i h = kernel.packet[7];
2360 
// Round 1: interleave 16-bit elements of row pairs.
2361  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
2362  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
2363  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
2364  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
2365  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
2366  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
2367  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
2368  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
2369 
// Round 2: interleave 32-bit pairs, grouping four rows per column pair.
2370  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
2371  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
2372  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
2373  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
2374  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
2375  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
2376  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
2377  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
2378 
// Round 3: combine 64-bit halves; packet[i] ends up holding column i of
// the original 8x8 block, i.e. {a_i, b_i, ..., h_i}.
2379  kernel.packet[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
2380  kernel.packet[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
2381  kernel.packet[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
2382  kernel.packet[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
2383  kernel.packet[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
2384  kernel.packet[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
2385  kernel.packet[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
2386  kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
2387 }
2388 
// Transpose of a 4x8 block held in four Packet8bf rows a,b,c,d. Two rounds
// of interleaves; by the unpack semantics, output packet[k] holds columns
// 2k and 2k+1 of the input as {a,b,c,d, a,b,c,d}.
2389 EIGEN_STRONG_INLINE void
2390 ptranspose(PacketBlock<Packet8bf,4>& kernel) {
2391  __m128i a = kernel.packet[0];
2392  __m128i b = kernel.packet[1];
2393  __m128i c = kernel.packet[2];
2394  __m128i d = kernel.packet[3];
2395 
// Round 1: interleave 16-bit elements 0..3 (lo) and 4..7 (hi) of row pairs.
2396  __m128i ab_03 = _mm_unpacklo_epi16(a, b);
2397  __m128i cd_03 = _mm_unpacklo_epi16(c, d);
2398  __m128i ab_47 = _mm_unpackhi_epi16(a, b);
2399  __m128i cd_47 = _mm_unpackhi_epi16(c, d);
2400 
// Round 2: interleave 32-bit pairs to form {a_j,b_j,c_j,d_j} quadruples.
2401  kernel.packet[0] = _mm_unpacklo_epi32(ab_03, cd_03);
2402  kernel.packet[1] = _mm_unpackhi_epi32(ab_03, cd_03);
2403  kernel.packet[2] = _mm_unpacklo_epi32(ab_47, cd_47);
2404  kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
2405 }
2406 
2407 } // end namespace internal
2408 
2409 } // end namespace Eigen
2410 
2411 #endif // EIGEN_PACKET_MATH_AVX_H
#define MM256_UNPACKLO_EPI32(A, B)
#define MM256_SHUFFLE_EPI32(A, B, M)
#define MM256_UNPACKHI_EPI32(A, B)
Matrix3f m
Array< int, 3, 1 > b
#define EIGEN_ALIGN32
Array< double, 1, 3 > e(1./3., 0.5, 2.)
Array33i c
#define EIGEN_DEBUG_ALIGNED_STORE
#define EIGEN_DEBUG_ALIGNED_LOAD
#define EIGEN_DEBUG_UNALIGNED_STORE
#define EIGEN_DEBUG_UNALIGNED_LOAD
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:883
#define EIGEN_FAST_MATH
Definition: Macros.h:50
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
#define vec4i_swizzle1(v, p, q, r, s)
@ Aligned32
Definition: Constants.h:238
@ Aligned16
Definition: Constants.h:237
Packet4d pmax< PropagateNaN, Packet4d >(const Packet4d &a, const Packet4d &b)
Packet4d ploadu< Packet4d >(const double *from)
Packet4d ploaddup< Packet4d >(const double *from)
uint32_t pfirst< Packet8ui >(const Packet8ui &a)
Packet8h ploaddup< Packet8h >(const Eigen::half *from)
Packet pmin(const Packet &a, const Packet &b)
Packet8i pmin< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet pnmsub(const Packet &a, const Packet &b, const Packet &c)
Packet8h pround< Packet8h >(const Packet8h &a)
Packet4d pmul< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet padd(const Packet &a, const Packet &b)
Packet4i predux_half_dowto4< Packet8i >(const Packet8i &a)
Packet8h pdiv< Packet8h >(const Packet8h &a, const Packet8h &b)
Packet8f pzero(const Packet8f &)
void pstore(Scalar *to, const Packet &from)
Packet8bf ploadquad< Packet8bf >(const bfloat16 *from)
void pstore< float >(float *to, const Packet4f &from)
Packet4f predux_half_dowto4< Packet8f >(const Packet8f &a)
Packet4d pload< Packet4d >(const double *from)
Packet8f pmax< PropagateNaN, Packet8f >(const Packet8f &a, const Packet8f &b)
Packet4d pceil< Packet4d >(const Packet4d &a)
double predux_min< Packet4d >(const Packet4d &a)
__vector int Packet4i
Packet8h pset1< Packet8h >(const Eigen::half &from)
Packet8h pfloor< Packet8h >(const Packet8h &a)
Packet8ui ploaddup< Packet8ui >(const uint32_t *from)
Packet8f psub< Packet8f >(const Packet8f &a, const Packet8f &b)
unpacket_traits< Packet >::type predux(const Packet &a)
Packet8h ptrue(const Packet8h &a)
Packet4f pcmp_lt_or_nan(const Packet4f &a, const Packet4f &b)
void pstore1< Packet4d >(double *to, const double &a)
void pstore< int >(int *to, const Packet4i &from)
Packet8ui pload< Packet8ui >(const uint32_t *from)
Packet8bf F32ToBf16(Packet4f p4f)
Packet8f pgather< float, Packet8f >(const float *from, Index stride)
Packet4d padd< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8f pand< Packet8f >(const Packet8f &a, const Packet8f &b)
void pscatter< uint32_t, Packet8ui >(uint32_t *to, const Packet8ui &from, Index stride)
Packet4d pdiv< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8h print< Packet8h >(const Packet8h &a)
Packet8i pdiv< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet8f ploadquad< Packet8f >(const float *from)
Packet8h padd< Packet8h >(const Packet8h &a, const Packet8h &b)
Packet8i ploadu< Packet8i >(const int *from)
float pfirst< Packet8f >(const Packet8f &a)
Packet8ui pand< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet4d pandnot< Packet4d >(const Packet4d &a, const Packet4d &b)
float predux_max< Packet8f >(const Packet8f &a)
void pstoreu< uint32_t >(uint32_t *to, const Packet8ui &from)
Packet8ui pselect< Packet8ui >(const Packet8ui &mask, const Packet8ui &a, const Packet8ui &b)
Packet4d pload1< Packet4d >(const double *from)
Packet8bf pdiv< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet8h pceil< Packet8h >(const Packet8h &a)
Packet8i ploadquad< Packet8i >(const int *from)
bfloat16 predux_max< Packet8bf >(const Packet8bf &a)
Packet8h pandnot(const Packet8h &a, const Packet8h &b)
Packet8i pxor< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet4f pabs(const Packet4f &a)
Packet8f pround< Packet8f >(const Packet8f &a)
Packet pmax(const Packet &a, const Packet &b)
Packet8f Bf16ToF32(const Packet8bf &a)
Packet4ui predux_half_dowto4< Packet8ui >(const Packet8ui &a)
Packet4d pfrexp< Packet4d >(const Packet4d &a, Packet4d &exponent)
void pscatter< double, Packet4d >(double *to, const Packet4d &from, Index stride)
Packet2cf pnegate(const Packet2cf &a)
Packet4d pround< Packet4d >(const Packet4d &a)
Packet8f pmul< Packet8f >(const Packet8f &a, const Packet8f &b)
Packet8bf pfloor< Packet8bf >(const Packet8bf &a)
Packet8i psub< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet8i pmul< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet4d pmin< PropagateNumbers, Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8bf print< Packet8bf >(const Packet8bf &a)
Packet8ui pmul< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet8bf pload< Packet8bf >(const bfloat16 *from)
Packet8f pload1< Packet8f >(const float *from)
Packet4i plogical_shift_right(const Packet4i &a)
Packet8f por< Packet8f >(const Packet8f &a, const Packet8f &b)
Packet4ui ploadu< Packet4ui >(const uint32_t *from)
int predux< Packet8i >(const Packet8i &a)
Packet pminmax_propagate_nan(const Packet &a, const Packet &b, Op op)
Packet8f ploaddup< Packet8f >(const float *from)
Packet4d print< Packet4d >(const Packet4d &a)
float predux< Packet8f >(const Packet8f &a)
Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
void pstoreu< uint64_t >(uint64_t *to, const Packet2ul &from)
Packet8f pfloor< Packet8f >(const Packet8f &a)
Packet8bf pmul< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet4d pand< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8ui ploadquad< Packet8ui >(const uint32_t *from)
Packet8f ploadu< Packet8f >(const float *from)
int pfirst< Packet8i >(const Packet8i &a)
Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
bfloat16 pfirst(const Packet8bf &a)
Packet4d pfloor< Packet4d >(const Packet4d &a)
Packet8f pxor< Packet8f >(const Packet8f &a, const Packet8f &b)
Packet8f pldexp< Packet8f >(const Packet8f &a, const Packet8f &exponent)
Packet psign(const Packet &a)
void pstoreu< double >(double *to, const Packet4d &from)
Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
__vector unsigned int Packet4ui
double predux_max< Packet4d >(const Packet4d &a)
Packet pmul(const Packet &a, const Packet &b)
void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Packet4d pfrexp_generic_get_biased_exponent(const Packet4d &a)
Packet pmsub(const Packet &a, const Packet &b, const Packet &c)
Packet8bf psub< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet8i pandnot< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet4i ploadu< Packet4i >(const int *from)
Packet8ui pset1< Packet8ui >(const uint32_t &from)
Packet8bf pceil< Packet8bf >(const Packet8bf &a)
Packet8f print< Packet8f >(const Packet8f &a)
Packet pfrexp_generic(const Packet &a, Packet &exponent)
Packet4d pset1< Packet4d >(const double &from)
Packet pldexp_generic(const Packet &a, const Packet &exponent)
void pscatter< float, Packet8f >(float *to, const Packet8f &from, Index stride)
Packet8h plset< Packet8h >(const half &a)
Eigen::half predux< Packet8h >(const Packet8h &a)
bfloat16 predux_mul< Packet8bf >(const Packet8bf &a)
void pstore< uint32_t >(uint32_t *to, const Packet8ui &from)
Packet8h psub< Packet8h >(const Packet8h &a, const Packet8h &b)
void pstoreu< bfloat16 >(bfloat16 *to, const Packet8bf &from)
Packet8ui pandnot< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
eigen_packet_wrapper< __vector unsigned short int, 0 > Packet8bf
Packet8bf pmin< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
__m128i Pack16To8(Packet8f rf)
Packet pminmax_propagate_numbers(const Packet &a, const Packet &b, Op op)
Packet4i pset1< Packet4i >(const int &from)
Packet4d pldexp< Packet4d >(const Packet4d &a, const Packet4d &exponent)
void prefetch< uint32_t >(const uint32_t *addr)
Packet8i pand< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet8f pload< Packet8f >(const float *from)
Packet8h float2half(const Packet8f &a)
void pstore1< Packet8f >(float *to, const float &a)
Packet8i plset< Packet8i >(const int &a)
Packet4d pmin< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8f pmin< Packet8f >(const Packet8f &a, const Packet8f &b)
float predux_mul< Packet8f >(const Packet8f &a)
Packet8f pset1frombits< Packet8f >(unsigned int from)
Packet8f pfrexp< Packet8f >(const Packet8f &a, Packet8f &exponent)
Packet8f pmin< PropagateNumbers, Packet8f >(const Packet8f &a, const Packet8f &b)
Eigen::half pfirst< Packet8h >(const Packet8h &from)
Packet pnmadd(const Packet &a, const Packet &b, const Packet &c)
Packet psub(const Packet &a, const Packet &b)
Eigen::half predux_mul< Packet8h >(const Packet8h &a)
bfloat16 predux_min< Packet8bf >(const Packet8bf &a)
Packet8f pset1< Packet8f >(const float &from)
void pstoreu< int64_t >(int64_t *to, const Packet2l &from)
void prefetch< float >(const float *addr)
void prefetch< double >(const double *addr)
Packet8h ploadquad< Packet8h >(const Eigen::half *from)
Packet4i pdiv< Packet4i >(const Packet4i &a, const Packet4i &b)
bfloat16 predux< Packet8bf >(const Packet8bf &a)
Packet8f half2float(const Packet8h &a)
double pfirst< Packet4d >(const Packet4d &a)
void pscatter< int, Packet8i >(int *to, const Packet8i &from, Index stride)
Packet8h pand(const Packet8h &a, const Packet8h &b)
void pstore< int64_t >(int64_t *to, const Packet2l &from)
const char * SsePrefetchPtrType
Packet8h ploadu< Packet8h >(const Eigen::half *from)
void pstoreu< float >(float *to, const Packet4f &from)
Packet4d plset< Packet4d >(const double &a)
Packet4d pmax< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8ui padd< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet4d pmin< PropagateNaN, Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8i pmax< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet4d ptrue< Packet4d >(const Packet4d &a)
eigen_packet_wrapper< __m256i, 0 > Packet8i
Packet8ui pmax< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet8f pmax< PropagateNumbers, Packet8f >(const Packet8f &a, const Packet8f &b)
Packet8h pmax< Packet8h >(const Packet8h &a, const Packet8h &b)
float predux_min< Packet8f >(const Packet8f &a)
Packet8bf plset< Packet8bf >(const bfloat16 &a)
Packet8h pxor(const Packet8h &a, const Packet8h &b)
Packet8bf padd< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet8bf ploadu< Packet8bf >(const bfloat16 *from)
Packet8i pset1< Packet8i >(const int &from)
Packet8i pselect< Packet8i >(const Packet8i &mask, const Packet8i &a, const Packet8i &b)
Packet8bf pset1< Packet8bf >(const bfloat16 &from)
Packet8i pload< Packet8i >(const int *from)
Packet8bf psignbit(const Packet8bf &a)
Packet4d pmax< PropagateNumbers, Packet4d >(const Packet4d &a, const Packet4d &b)
Packet pdiv(const Packet &a, const Packet &b)
Packet8f pmin< PropagateNaN, Packet8f >(const Packet8f &a, const Packet8f &b)
Packet4i pblend(const Selector< 4 > &ifPacket, const Packet4i &thenPacket, const Packet4i &elsePacket)
Packet8f pmax< Packet8f >(const Packet8f &a, const Packet8f &b)
uint32_t predux< Packet8ui >(const Packet8ui &a)
Packet8i pgather< int, Packet8i >(const int *from, Index stride)
void pstore< uint64_t >(uint64_t *to, const Packet2ul &from)
Packet8ui por< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet4d pxor< Packet4d >(const Packet4d &a, const Packet4d &b)
eigen_packet_wrapper< __m256i, 4 > Packet8ui
Packet2cf pconj(const Packet2cf &a)
EIGEN_ALWAYS_INLINE void pscatter< bfloat16, Packet8bf >(bfloat16 *to, const Packet8bf &from, Index stride)
void pstoreu< int >(int *to, const Packet4i &from)
double predux_mul< Packet4d >(const Packet4d &a)
Packet8ui psub< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet8bf pmax< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet4d pset1frombits< Packet4d >(uint64_t from)
Packet8f ptrue< Packet8f >(const Packet8f &a)
Packet4i plogical_shift_left(const Packet4i &a)
Packet8h pload< Packet8h >(const Eigen::half *from)
Packet8i ptrue< Packet8i >(const Packet8i &a)
Packet2cf preverse(const Packet2cf &a)
Packet4i parithmetic_shift_right(const Packet4i &a)
Packet8h pmul< Packet8h >(const Packet8h &a, const Packet8h &b)
bfloat16 pfirst< Packet8bf >(const Packet8bf &from)
Packet8h por(const Packet8h &a, const Packet8h &b)
Eigen::half predux_min< Packet8h >(const Packet8h &a)
Packet4i pcmp_lt(const Packet4i &a, const Packet4i &b)
Packet8ui pmin< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet8f pselect< Packet8f >(const Packet8f &mask, const Packet8f &a, const Packet8f &b)
Packet8f pisnan(const Packet8f &a)
Packet8f plset< Packet8f >(const float &a)
Packet8f pandnot< Packet8f >(const Packet8f &a, const Packet8f &b)
__vector float Packet4f
Packet8ui plset< Packet8ui >(const uint32_t &a)
Packet4d psub< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8ui pgather< uint32_t, Packet8ui >(const uint32_t *from, Index stride)
double predux< Packet4d >(const Packet4d &a)
Packet8i padd< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet8f padd< Packet8f >(const Packet8f &a, const Packet8f &b)
Packet8i por< Packet8i >(const Packet8i &a, const Packet8i &b)
void pstore< bfloat16 >(bfloat16 *to, const Packet8bf &from)
void prefetch< int >(const int *addr)
Packet4d por< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet4d pselect< Packet4d >(const Packet4d &mask, const Packet4d &a, const Packet4d &b)
Packet8i ploaddup< Packet8i >(const int *from)
EIGEN_ALWAYS_INLINE Packet8bf pgather< bfloat16, Packet8bf >(const bfloat16 *from, Index stride)
Packet8f pceil< Packet8f >(const Packet8f &a)
Packet8h pmin< Packet8h >(const Packet8h &a, const Packet8h &b)
Packet8ui pxor< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
bool predux_any(const Packet4f &x)
eigen_packet_wrapper< __m128i, 2 > Packet8h
Packet8bf ploaddup< Packet8bf >(const bfloat16 *from)
void pstore< double >(double *to, const Packet4d &from)
Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
Packet4d pgather< double, Packet4d >(const double *from, Index stride)
Packet8bf pround< Packet8bf >(const Packet8bf &a)
void pstore1< Packet8i >(int *to, const int &a)
Packet8ui ploadu< Packet8ui >(const uint32_t *from)
Eigen::half predux_max< Packet8h >(const Packet8h &a)
Packet8f pdiv< Packet8f >(const Packet8f &a, const Packet8f &b)
Packet8f peven_mask(const Packet8f &)
std::uint8_t uint8_t
Definition: Meta.h:35
std::int64_t int64_t
Definition: Meta.h:42
std::uint16_t uint16_t
Definition: Meta.h:37
std::uint32_t uint32_t
Definition: Meta.h:39
std::uint64_t uint64_t
Definition: Meta.h:41
: InteropHeaders
Definition: Core:139
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82
std::ptrdiff_t j