#ifndef EIGEN_PACKET_MATH_AVX_H
#define EIGEN_PACKET_MATH_AVX_H

#include "../../InternalHeaderCheck.h"

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif

#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif
typedef eigen_packet_wrapper<__m256i, 0> Packet8i;
#ifndef EIGEN_VECTORIZE_AVX512FP16
typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
#endif
typedef eigen_packet_wrapper<__m128i, 3> Packet8bf;

#ifdef EIGEN_VECTORIZE_AVX2
typedef eigen_packet_wrapper<__m256i, 3> Packet4l;
typedef eigen_packet_wrapper<__m256i, 5> Packet4ul;
#endif
template <>
struct is_arithmetic<__m256> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m256i> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m256d> {
  enum { value = true };
};
template <>
struct is_arithmetic<Packet8i> {
  enum { value = true };
};
template <>
struct is_arithmetic<Packet8ui> {
  enum { value = false };
};
#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
struct is_arithmetic<Packet8h> {
  enum { value = true };
};
#endif
template <>
struct is_arithmetic<Packet8bf> {
  enum { value = true };
};
#ifdef EIGEN_VECTORIZE_AVX2
template <>
struct is_arithmetic<Packet4l> {
  enum { value = true };
};
template <>
struct is_arithmetic<Packet4ul> {
  enum { value = false };
};
#endif
#ifndef EIGEN_VECTORIZE_AVX512
template <>
struct packet_traits<float> : default_packet_traits {
  // ... (members elided in this excerpt)
};
template <>
struct packet_traits<double> : default_packet_traits {
  // ... (members elided in this excerpt)
};
template <>
struct packet_traits<Eigen::half> : default_packet_traits {
  // ... (members elided in this excerpt)
};
template <>
struct packet_traits<bfloat16> : default_packet_traits {
  // ... (members elided in this excerpt)
};
template <>
struct packet_traits<int> : default_packet_traits {
  // ... (members elided in this excerpt)
};
template <>
struct packet_traits<uint32_t> : default_packet_traits {
  // ... (members elided in this excerpt)
};
#ifdef EIGEN_VECTORIZE_AVX2
template <>
struct packet_traits<int64_t> : default_packet_traits {
  typedef Packet4l type;
  typedef Packet4l half;
  // ... (members elided in this excerpt)
};
template <>
struct packet_traits<uint64_t> : default_packet_traits {
  typedef Packet4ul type;
  typedef Packet4ul half;
  // ... (members elided in this excerpt)
};
#endif
#endif  // !EIGEN_VECTORIZE_AVX512
template <>
struct scalar_div_cost<float, true> {
  enum { value = 14 };
};
template <>
struct scalar_div_cost<double, true> {
  enum { value = 16 };
};
template <>
struct unpacket_traits<Packet8f> {
  typedef float type;
  typedef Packet4f half;
  enum {
    size = 8,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = true,
    masked_store_available = true
#ifdef EIGEN_VECTORIZE_AVX512
    ,
    masked_fpops_available = true
#endif
  };
};
template <>
struct unpacket_traits<Packet4d> {
  typedef double type;
  typedef Packet2d half;
  enum {
    size = 4,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet8i> {
  typedef int type;
  typedef Packet4i half;
  enum {
    size = 8,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet8ui> {
  typedef uint32_t type;
  typedef Packet4ui half;
  enum {
    size = 8,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
#ifdef EIGEN_VECTORIZE_AVX2
template <>
struct unpacket_traits<Packet4l> {
  typedef int64_t type;
  typedef Packet4l half;
  enum {
    size = 4,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet4ul> {
  typedef uint64_t type;
  typedef Packet4ul half;
  enum {
    size = 4,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
#endif
template <>
struct unpacket_traits<Packet8bf> {
  typedef bfloat16 type;
  typedef Packet8bf half;
  enum {
    size = 8,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) {
  return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
                         _mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
}
#ifdef EIGEN_VECTORIZE_AVX2
template <>
EIGEN_STRONG_INLINE Packet4l pset1<Packet4l>(const int64_t& from) {
  return _mm256_set1_epi64x(from);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pset1<Packet4ul>(const uint64_t& from) {
  return _mm256_set1_epi64x(numext::bit_cast<int64_t>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4l pzero(const Packet4l& /*a*/) {
  return _mm256_setzero_si256();
}
template <>
EIGEN_STRONG_INLINE Packet4ul pzero(const Packet4ul& /*a*/) {
  return _mm256_setzero_si256();
}
template <>
EIGEN_STRONG_INLINE Packet4l peven_mask(const Packet4l& /*a*/) {
  return _mm256_set_epi64x(0ll, -1ll, 0ll, -1ll);
}
template <>
EIGEN_STRONG_INLINE Packet4ul peven_mask(const Packet4ul& /*a*/) {
  return _mm256_set_epi64x(0ll, -1ll, 0ll, -1ll);
}
template <>
EIGEN_STRONG_INLINE Packet4l pload1<Packet4l>(const int64_t* from) {
  return _mm256_set1_epi64x(*from);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pload1<Packet4ul>(const uint64_t* from) {
  return _mm256_set1_epi64x(*from);
}
template <>
EIGEN_STRONG_INLINE Packet4l padd<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_add_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul padd<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_add_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4l plset<Packet4l>(const int64_t& a) {
  return padd(pset1<Packet4l>(a), Packet4l(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
}
template <>
EIGEN_STRONG_INLINE Packet4ul plset<Packet4ul>(const uint64_t& a) {
  return padd(pset1<Packet4ul>(a), Packet4ul(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
}
template <>
EIGEN_STRONG_INLINE Packet4l psub<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_sub_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul psub<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_sub_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4l pnegate(const Packet4l& a) {
  return psub(pzero(a), a);
}
template <>
EIGEN_STRONG_INLINE Packet4l pconj(const Packet4l& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet4l pcmp_le(const Packet4l& a, const Packet4l& b) {
  // a <= b  <=>  !(a > b)
  return _mm256_xor_si256(_mm256_cmpgt_epi64(a, b), _mm256_set1_epi32(-1));
}
template <>
EIGEN_STRONG_INLINE Packet4ul pcmp_le(const Packet4ul& a, const Packet4ul& b) {
  // There is no unsigned 64-bit compare instruction; flip the sign bits and compare signed.
  return (Packet4ul)pcmp_le((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                            (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL)));
}
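// Illustrative sketch of the sign-flip trick used above (added note, not from
// the upstream sources): subtracting 2^63 maps unsigned order onto signed
// order. E.g. for a = 1 and b = 0x8000000000000001:
//   a - 2^63 = INT64_MIN + 1,   b - 2^63 = 1,
// and the signed comparison INT64_MIN + 1 <= 1 holds, matching the unsigned
// fact 1 <= 0x8000000000000001.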
template <>
EIGEN_STRONG_INLINE Packet4l pcmp_lt(const Packet4l& a, const Packet4l& b) {
  return _mm256_cmpgt_epi64(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pcmp_lt(const Packet4ul& a, const Packet4ul& b) {
  return (Packet4ul)pcmp_lt((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                            (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL)));
}
template <>
EIGEN_STRONG_INLINE Packet4l pcmp_eq(const Packet4l& a, const Packet4l& b) {
  return _mm256_cmpeq_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pcmp_eq(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_cmpeq_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4l ptrue<Packet4l>(const Packet4l& a) {
  return _mm256_cmpeq_epi64(a, a);
}
template <>
EIGEN_STRONG_INLINE Packet4ul ptrue<Packet4ul>(const Packet4ul& a) {
  return _mm256_cmpeq_epi64(a, a);
}
template <>
EIGEN_STRONG_INLINE Packet4l pand<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_and_si256(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4l por<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_or_si256(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4l pxor<Packet4l>(const Packet4l& a, const Packet4l& b) {
  return _mm256_xor_si256(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pxor<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return _mm256_xor_si256(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4l pandnot<Packet4l>(const Packet4l& a, const Packet4l& b) {
  // pandnot(a, b) == a & ~b; note the reversed operand order of the intrinsic.
  return _mm256_andnot_si256(b, a);
}
template <int N>
EIGEN_STRONG_INLINE Packet4l plogical_shift_right(const Packet4l& a) {
  return _mm256_srli_epi64(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4l plogical_shift_left(const Packet4l& a) {
  return _mm256_slli_epi64(a, N);
}
#ifdef EIGEN_VECTORIZE_AVX512FP16
// AVX512FP16 implies AVX512VL, which provides a native 64-bit arithmetic shift.
template <int N>
EIGEN_STRONG_INLINE Packet4l parithmetic_shift_right(const Packet4l& a) {
  return _mm256_srai_epi64(a, N);
}
#else
// AVX2 has no 64-bit arithmetic shift; emulate it from 32-bit shifts.
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N >= 0) && (N < 32), Packet4l> parithmetic_shift_right(const Packet4l& a) {
  __m256i hi_word = _mm256_srai_epi32(a, N);
  __m256i lo_word = _mm256_srli_epi64(a, N);
  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
}
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N >= 32) && (N < 63), Packet4l> parithmetic_shift_right(const Packet4l& a) {
  __m256i hi_word = _mm256_srai_epi32(a, 31);
  __m256i lo_word = _mm256_shuffle_epi32(_mm256_srai_epi32(a, N - 32), (shuffle_mask<1, 1, 3, 3>::mask));
  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
}
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N == 63), Packet4l> parithmetic_shift_right(const Packet4l& a) {
  return _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), (shuffle_mask<1, 1, 3, 3>::mask));
}
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N < 0) || (N > 63), Packet4l> parithmetic_shift_right(const Packet4l& a) {
  return parithmetic_shift_right<int(N & 63)>(a);
}
#endif
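// Sketch of why the emulation above works, for N < 32 (added note): each
// 64-bit lane is viewed as a pair of 32-bit words. _mm256_srai_epi32 shifts
// the high word with sign extension, while _mm256_srli_epi64 shifts the whole
// lane so the low word receives bits flowing down from the high word.
// Blending with 0b01010101 keeps the logically-shifted low words (even 32-bit
// lanes) and the arithmetically-shifted high words (odd lanes).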
template <>
EIGEN_STRONG_INLINE Packet4l pload<Packet4l>(const int64_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4ul pload<Packet4ul>(const uint64_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4l ploadu<Packet4l>(const int64_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4ul ploadu<Packet4ul>(const uint64_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
}
// Loads 2 int64_ts from memory and returns the packet {a0, a0, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet4l ploaddup<Packet4l>(const int64_t* from) {
  const Packet4l a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
}
// Loads 2 uint64_ts from memory and returns the packet {a0, a0, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet4ul ploaddup<Packet4ul>(const uint64_t* from) {
  const Packet4ul a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
}
template <>
EIGEN_DEVICE_FUNC inline Packet4l pgather<int64_t, Packet4l>(const int64_t* from, Index stride) {
  return _mm256_set_epi64x(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_DEVICE_FUNC inline Packet4ul pgather<uint64_t, Packet4ul>(const uint64_t* from, Index stride) {
  return _mm256_set_epi64x(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index stride) {
  __m128i low = _mm256_extractf128_si256(from, 0);
  to[stride * 0] = _mm_extract_epi64(low, 0);
  to[stride * 1] = _mm_extract_epi64(low, 1);

  __m128i high = _mm256_extractf128_si256(from, 1);
  to[stride * 2] = _mm_extract_epi64(high, 0);
  to[stride * 3] = _mm_extract_epi64(high, 1);
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index stride) {
  __m128i low = _mm256_extractf128_si256(from, 0);
  to[stride * 0] = _mm_extract_epi64(low, 0);
  to[stride * 1] = _mm_extract_epi64(low, 1);

  __m128i high = _mm256_extractf128_si256(from, 1);
  to[stride * 2] = _mm_extract_epi64(high, 0);
  to[stride * 3] = _mm_extract_epi64(high, 1);
}
template <>
EIGEN_STRONG_INLINE void pstore1<Packet4l>(int64_t* to, const int64_t& a) {
  Packet4l pa = pset1<Packet4l>(a);
  pstore(to, pa);
}
template <>
EIGEN_STRONG_INLINE void pstore1<Packet4ul>(uint64_t* to, const uint64_t& a) {
  Packet4ul pa = pset1<Packet4ul>(a);
  pstore(to, pa);
}
template <>
EIGEN_STRONG_INLINE int64_t pfirst<Packet4l>(const Packet4l& a) {
  return _mm_cvtsi128_si64(_mm256_castsi256_si128(a));
}
template <>
EIGEN_STRONG_INLINE uint64_t pfirst<Packet4ul>(const Packet4ul& a) {
  return _mm_cvtsi128_si64(_mm256_castsi256_si128(a));
}
template <>
EIGEN_STRONG_INLINE int64_t predux<Packet4l>(const Packet4l& a) {
  __m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
  return _mm_extract_epi64(r, 0) + _mm_extract_epi64(r, 1);
}
template <>
EIGEN_STRONG_INLINE uint64_t predux<Packet4ul>(const Packet4ul& a) {
  __m128i r = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
  return numext::bit_cast<uint64_t>(_mm_extract_epi64(r, 0) + _mm_extract_epi64(r, 1));
}
#define MM256_SHUFFLE_EPI64(A, B, M) _mm256_shuffle_pd(_mm256_castsi256_pd(A), _mm256_castsi256_pd(B), M)
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4l, 4>& kernel) {
  __m256d T0 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 15);
  __m256d T1 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 0);
  __m256d T2 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 15);
  __m256d T3 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 0);

  kernel.packet[1] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 32));
  kernel.packet[3] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 49));
  kernel.packet[0] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 32));
  kernel.packet[2] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 49));
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4ul, 4>& kernel) {
  ptranspose((PacketBlock<Packet4l, 4>&)kernel);
}
template <>
EIGEN_STRONG_INLINE Packet4l pmin<Packet4l>(const Packet4l& a, const Packet4l& b) {
  __m256i cmp = _mm256_cmpgt_epi64(a, b);
  __m256i a_min = _mm256_andnot_si256(cmp, a);
  __m256i b_min = _mm256_and_si256(cmp, b);
  return Packet4l(_mm256_or_si256(a_min, b_min));
}
template <>
EIGEN_STRONG_INLINE Packet4ul pmin<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return padd((Packet4ul)pmin((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                              (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL))),
              pset1<Packet4ul>(0x8000000000000000UL));
}
template <>
EIGEN_STRONG_INLINE Packet4l pmax<Packet4l>(const Packet4l& a, const Packet4l& b) {
  __m256i cmp = _mm256_cmpgt_epi64(a, b);
  __m256i a_max = _mm256_and_si256(cmp, a);
  __m256i b_max = _mm256_andnot_si256(cmp, b);
  return Packet4l(_mm256_or_si256(a_max, b_max));
}
template <>
EIGEN_STRONG_INLINE Packet4ul pmax<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  return padd((Packet4ul)pmax((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
                              (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL))),
              pset1<Packet4ul>(0x8000000000000000UL));
}
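// Illustrative note (added, not from the upstream sources): with no 64-bit
// min/max instruction in AVX2, pmin builds a full-width mask per lane
// (cmp = a > b ? ~0 : 0) and selects with (~cmp & a) | (cmp & b), i.e. a when
// a <= b and b otherwise. E.g. for lanes a = {1, 9} and b = {5, 3}:
// cmp = {0, ~0}, and the blend yields {1, 3}.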
template <>
EIGEN_STRONG_INLINE Packet4l pabs<Packet4l>(const Packet4l& a) {
  Packet4l pz = pzero<Packet4l>(a);
  Packet4l cmp = _mm256_cmpgt_epi64(a, pz);
  return psub(cmp, pxor(a, cmp));
}
template <>
EIGEN_STRONG_INLINE Packet4ul pabs<Packet4ul>(const Packet4ul& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet4l pmul<Packet4l>(const Packet4l& a, const Packet4l& b) {
  // A full 64-bit multiply needs AVX512; assemble it from 32x32->64-bit partial products.
  __m256i upper32_a = _mm256_srli_epi64(a, 32);
  __m256i upper32_b = _mm256_srli_epi64(b, 32);

  // upper * lower
  __m256i mul1 = _mm256_mul_epu32(upper32_a, b);
  __m256i mul2 = _mm256_mul_epu32(upper32_b, a);
  // lower * lower
  __m256i mul3 = _mm256_mul_epu32(a, b);

  __m256i high = _mm256_slli_epi64(_mm256_add_epi64(mul1, mul2), 32);
  return _mm256_add_epi64(high, mul3);
}
template <>
EIGEN_STRONG_INLINE Packet4ul pmul<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
  // Two's-complement multiplication is sign-agnostic modulo 2^64.
  return (Packet4ul)pmul<Packet4l>((Packet4l)a, (Packet4l)b);
}
#endif  // EIGEN_VECTORIZE_AVX2
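// A hedged sketch of the decomposition used by pmul above (added note):
// writing each lane as a = 2^32*ah + al and b = 2^32*bh + bl, the product
// modulo 2^64 is
//   a*b = ((ah*bl + al*bh) << 32) + al*bl,
// since the ah*bh term is shifted out entirely. For example, with
// a = 2^32 + 2 and b = 3: ah = 1, al = 2, bh = 0, bl = 3, so
// ((1*3 + 2*0) << 32) + 2*3 = 3*2^32 + 6 = a*b.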
template <>
EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) {
  return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1));
}
template <>
EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) {
  return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1));
}
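// Illustration (added note, not from the upstream sources): peven_mask
// returns a mask that is all-ones in the even-indexed lanes, so for Packet8f
// it selects elements 0, 2, 4 and 6. _mm256_set_epi32 lists lanes from
// highest to lowest, which is why the -1 entries appear on the right.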
#ifdef EIGEN_VECTORIZE_AVX512
template <>
EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b, uint8_t umask) {
  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
  return _mm512_castps512_ps256(_mm512_maskz_add_ps(mask,
                                                    _mm512_castps256_ps512(a),
                                                    _mm512_castps256_ps512(b)));
}
#endif
template <>
EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_add_epi32(a, b);
#else
  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8ui padd<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_add_epi32(a, b);
#else
  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_sub_epi32(a, b);
#else
  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8ui psub<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_sub_epi32(a, b);
#else
  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
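// General note on the recurring pattern above (added for clarity): AVX1 only
// provides 256-bit floating-point operations, so each 256-bit integer op must
// be emulated by extracting the two 128-bit halves, applying the SSE
// instruction to each, and re-inserting the high half. The AVX2 path is a
// single instruction; the fallback costs two extracts, two ops and an insert.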
template <>
EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
  return _mm256_xor_ps(a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) {
  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL));
  return _mm256_xor_pd(a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_mullo_epi32(a, b);
#else
  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8ui pmul<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_mullo_epi32(a, b);
#else
  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX512
  return _mm512_cvttpd_epi32(_mm512_div_pd(_mm512_cvtepi32_pd(a), _mm512_cvtepi32_pd(b)));
#else
  Packet4i lo = pdiv<Packet4i>(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  Packet4i hi = pdiv<Packet4i>(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
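// Note on the AVX512 branch above (added explanation): int32 values convert
// to double exactly (53-bit significand), and since |a|, |b| < 2^31 the
// rounded double quotient can never be pulled across an integer boundary
// (that would require products on the order of 2^53), so the truncating
// conversion back reproduces C++ integer division exactly.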
#ifdef EIGEN_VECTORIZE_FMA
template <>
EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fmadd_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fmadd_pd(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet8f pmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fmsub_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4d pmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fmsub_pd(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet8f pnmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fnmadd_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4d pnmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fnmadd_pd(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet8f pnmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
  return _mm256_fnmsub_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4d pnmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
  return _mm256_fnmsub_pd(a, b, c);
}
#endif
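// Semantics of the four fused ops above, for reference (added note); each
// performs a single rounding at the end:
//   pmadd(a, b, c)  =  a*b + c      pmsub(a, b, c)  =  a*b - c
//   pnmadd(a, b, c) = -a*b + c      pnmsub(a, b, c) = -a*b - c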
template <>
EIGEN_STRONG_INLINE Packet8i pcmp_le(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_xor_si256(_mm256_cmpgt_epi32(a, b), _mm256_set1_epi32(-1));
#else
  __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  lo = _mm_xor_si128(lo, _mm_set1_epi32(-1));
  __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  hi = _mm_xor_si128(hi, _mm_set1_epi32(-1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i pcmp_lt(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_cmpgt_epi32(b, a);
#else
  __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 0), _mm256_extractf128_si256(a, 0));
  __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 1), _mm256_extractf128_si256(a, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_cmpeq_epi32(a, b);
#else
  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8ui pcmp_eq(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_cmpeq_epi32(a, b);
#else
  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
  // There appears to be a bug in GCC, by which the optimizer may flip
  // the argument order of min/max intrinsics, so we resort to inline ASM
  // here. This is supposed to be fixed in gcc6.3,
  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
  Packet8f res;
  asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::min.
  return _mm256_min_ps(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
  // See pmin above
  Packet4d res;
  asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::min.
  return _mm256_min_pd(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i pmin<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_min_epi32(a, b);
#else
  __m128i lo = _mm_min_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_min_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8ui pmin<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_min_epu32(a, b);
#else
  __m128i lo = _mm_min_epu32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_min_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
  // See pmin above
  Packet8f res;
  asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::max.
  return _mm256_max_ps(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
  // See pmin above
  Packet4d res;
  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
  return res;
#else
  // Arguments are swapped to match NaN propagation behavior of std::max.
  return _mm256_max_pd(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i pmax<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_max_epi32(a, b);
#else
  __m128i lo = _mm_max_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_max_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8ui pmax<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_max_epu32(a, b);
#else
  __m128i lo = _mm_max_epu32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_max_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i psign(const Packet8i& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  // _mm256_sign_epi32(x, a) copies the sign of a onto x, so with x = 1 this
  // yields -1, 0 or 1 per lane.
  return _mm256_sign_epi32(_mm256_set1_epi32(1), a);
#else
  __m128i lo = _mm_sign_epi32(_mm_set1_epi32(1), _mm256_extractf128_si256(a, 0));
  __m128i hi = _mm_sign_epi32(_mm_set1_epi32(1), _mm256_extractf128_si256(a, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  // vpcmpeqd has lower latency than the more general vcmpps
  return _mm256_cmpeq_epi32(a, a);
#else
  const __m256 b = _mm256_castsi256_ps(a);
  return _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_TRUE_UQ));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  // vpcmpeqd has lower latency than the more general vcmpps
  const __m256i b = _mm256_castps_si256(a);
  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b, b));
#else
  return _mm256_cmp_ps(a, a, _CMP_TRUE_UQ);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  // vpcmpeqq has lower latency than the more general vcmppd
  const __m256i b = _mm256_castpd_si256(a);
  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b, b));
#else
  return _mm256_cmp_pd(a, a, _CMP_TRUE_UQ);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_and_si256(a, b);
#else
  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8ui pand<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_and_si256(a, b);
#else
  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_or_si256(a, b);
#else
  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8ui por<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_or_si256(a, b);
#else
  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_xor_si256(a, b);
#else
  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8ui pxor<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_xor_si256(a, b);
#else
  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_andnot_si256(b, a);
#else
  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8ui pandnot<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_andnot_si256(b, a);
#else
  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) {
  const Packet8f mask = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x80000000u));
  const Packet8f prev0dot5 = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
  return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
template <>
EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) {
  const Packet4d mask = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
  const Packet4d prev0dot5 = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
  return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
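// How pround implements round-half-away-from-zero (illustrative note, not
// from the upstream sources): pand extracts the sign bit of a, por stamps
// that sign onto the largest float strictly below 0.5 (0x3EFFFFFF), and the
// truncating round finishes the job. E.g. for a = 2.5f the sum
// 2.5f + 0.49999997f rounds up to 3.0f in float arithmetic and truncates to
// 3; for a = -2.5f the biased sum is -3.0f, giving -3.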
template <>
EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
  return _mm256_blendv_ps(b, a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet8i pselect<Packet8i>(const Packet8i& mask, const Packet8i& a, const Packet8i& b) {
  return _mm256_castps_si256(
      _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
}
template <>
EIGEN_STRONG_INLINE Packet8ui pselect<Packet8ui>(const Packet8ui& mask, const Packet8ui& a, const Packet8ui& b) {
  return _mm256_castps_si256(
      _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
}
template <>
EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b) {
  return _mm256_blendv_pd(b, a, mask);
}
template <int N>
EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_srai_epi32(a, N);
#else
  __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N);
  __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <int N>
EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_srli_epi32(a, N);
#else
  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <int N>
EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_slli_epi32(a, N);
#else
  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
#ifdef EIGEN_VECTORIZE_AVX512
  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_castps512_ps256(_mm512_maskz_loadu_ps(mask, from));
#else
  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
  const Packet8i bit_mask =
      _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
  mask = por<Packet8i>(mask, bit_mask);
  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
#endif
}
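// How the umask expansion above works (added sketch): _mm256_set1_epi8 copies
// the 8-bit user mask into every byte, and bit_mask sets every bit of 32-bit
// lane i except bit i. After the por, lane i is all-ones iff bit i of umask
// was set, and pcmp_eq turns that into a proper per-lane load mask. E.g. for
// umask = 0b00000010 only lane 1 survives the compare.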
// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) {
  // _mm256_insertf128_ps is very slow on Haswell, thus use a broadcast instead:
  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "inplace" permutation of the lower 128bits using a blend
  tmp = _mm256_blend_ps(
      tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
  // then we can perform a consistent permutation on the global register to get everything in shape:
  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2));
}
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) {
  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  return _mm256_permute_pd(tmp, 3 << 2);
}
// Loads 4 ints from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet8i ploaddup<Packet8i>(const int* from) {
#ifdef EIGEN_VECTORIZE_AVX2
  const Packet8i a = _mm256_castsi128_si256(ploadu<Packet4i>(from));
  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
#else
  __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "inplace" permutation of the lower 128bits using a blend
  tmp = _mm256_blend_ps(
      tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8ui ploaddup<Packet8ui>(const uint32_t* from) {
#ifdef EIGEN_VECTORIZE_AVX2
  const Packet8ui a = _mm256_castsi128_si256(ploadu<Packet4ui>(from));
  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
#else
  __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "inplace" permutation of the lower 128bits using a blend
  tmp = _mm256_blend_ps(
      tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2)));
#endif
}
// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from) {
  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from + 1), 1);
}
template <>
EIGEN_STRONG_INLINE Packet8i ploadquad<Packet8i>(const int* from) {
  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
}
template <>
EIGEN_STRONG_INLINE Packet8ui ploadquad<Packet8ui>(const uint32_t* from) {
  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from, uint8_t umask) {
#ifdef EIGEN_VECTORIZE_AVX512
  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from));
#else
  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
  const Packet8i bit_mask =
      _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe);
  mask = por<Packet8i>(mask, bit_mask);
  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
  const __m256i ifrom = _mm256_castps_si256(from);
  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0),
                                                  _mm256_extractf128_si256(mask, 0), reinterpret_cast<char*>(to));
  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1),
                                                  _mm256_extractf128_si256(mask, 1), reinterpret_cast<char*>(to + 4));
#endif
}
template <>
EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
  return _mm256_set_ps(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
                       from[2 * stride], from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride) {
  return _mm256_set_pd(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_DEVICE_FUNC inline Packet8i pgather<int, Packet8i>(const int* from, Index stride) {
  return _mm256_set_epi32(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
                          from[2 * stride], from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
  __m128 low = _mm256_extractf128_ps(from, 0);
  to[stride * 0] = _mm_cvtss_f32(low);
  to[stride * 1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
  to[stride * 2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
  to[stride * 3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));

  __m128 high = _mm256_extractf128_ps(from, 1);
  to[stride * 4] = _mm_cvtss_f32(high);
  to[stride * 5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
  to[stride * 6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
  to[stride * 7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride) {
  __m128d low = _mm256_extractf128_pd(from, 0);
  to[stride * 0] = _mm_cvtsd_f64(low);
  to[stride * 1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
  __m128d high = _mm256_extractf128_pd(from, 1);
  to[stride * 2] = _mm_cvtsd_f64(high);
  to[stride * 3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<int, Packet8i>(int* to, const Packet8i& from, Index stride) {
  __m128i low = _mm256_extractf128_si256(from, 0);
  to[stride * 0] = _mm_extract_epi32(low, 0);
  to[stride * 1] = _mm_extract_epi32(low, 1);
  to[stride * 2] = _mm_extract_epi32(low, 2);
  to[stride * 3] = _mm_extract_epi32(low, 3);

  __m128i high = _mm256_extractf128_si256(from, 1);
  to[stride * 4] = _mm_extract_epi32(high, 0);
  to[stride * 5] = _mm_extract_epi32(high, 1);
  to[stride * 6] = _mm_extract_epi32(high, 2);
  to[stride * 7] = _mm_extract_epi32(high, 3);
}
#ifndef EIGEN_VECTORIZE_AVX512
template <>
EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
}
template <>
EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
}
template <>
EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
}
template <>
EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
}
#endif

template <>
EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
template <>
EIGEN_STRONG_INLINE uint32_t pfirst<Packet8ui>(const Packet8ui& a) {
  return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm256_castsi256_si128(a)));
}
template <>
EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
  __m256 tmp = _mm256_shuffle_ps(a, a, 0x1b);
  return _mm256_permute2f128_ps(tmp, tmp, 1);
}
template <>
EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) {
  __m256d tmp = _mm256_shuffle_pd(a, a, 5);
  return _mm256_permute2f128_pd(tmp, tmp, 1);
#if 0
  // Alternative with the same instruction count; kept for reference/benchmarking:
  __m256d swap_halves = _mm256_permute2f128_pd(a, a, 1);
  return _mm256_permute_pd(swap_halves, 5);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8i preverse(const Packet8i& a) {
  return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
}
template <>
EIGEN_STRONG_INLINE Packet8ui preverse(const Packet8ui& a) {
  return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
}
#ifdef EIGEN_VECTORIZE_AVX2
template <>
EIGEN_STRONG_INLINE Packet4l preverse(const Packet4l& a) {
  return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
}
template <>
EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
  return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
}
#endif
template <>
EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
                                                              0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF));
  return _mm256_and_ps(a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) {
  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF,
                                                              0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF));
  return _mm256_and_pd(a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet8i pabs(const Packet8i& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_abs_epi32(a);
#else
  __m128i lo = _mm_abs_epi32(_mm256_extractf128_si256(a, 0));
  __m128i hi = _mm_abs_epi32(_mm256_extractf128_si256(a, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}

#ifdef EIGEN_VECTORIZE_AVX2
template <>
EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) {
  return _mm256_castsi256_pd(parithmetic_shift_right<63>((Packet4l)_mm256_castpd_si256(a)));
}
template <>
EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& a) {
  // Unsigned values have no sign bit.
  return pzero(a);
}
#endif
// Extract the biased exponent without relying on the existence of Packet4l.
template <>
EIGEN_STRONG_INLINE Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) {
  const Packet4d cst_exp_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
  __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
#ifdef EIGEN_VECTORIZE_AVX2
  a_expo = _mm256_srli_epi64(a_expo, 52);
  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
#else
  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
  lo = _mm_srli_epi64(lo, 52);
  hi = _mm_srli_epi64(hi, 52);
#endif
  Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3));
  Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3));
  Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0);
  exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1);
  return exponent;
}

// pldexp scales a by 2^exponent, assembling each power of two directly in the
// exponent field of a double. The exponent is split as e = 3*b + r so that
// each partial factor 2^b stays representable even for subnormal results.
template <>
EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
  // Clamp exponent to [-2099, 2099]
  const Packet4d max_exponent = pset1<Packet4d>(2099.0);
  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));

  // Split 2^e into four factors and multiply.
  const Packet4i bias = pset1<Packet4i>(1023);
  Packet4i b = parithmetic_shift_right<2>(e);  // floor(e/4)

  // 2^b
  Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
  Packet4i lo = _mm_slli_epi64(hi, 52);
  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
  Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
  Packet4d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)

  // 2^(e - 3b)
  b = psub(psub(psub(e, b), b), b);  // e - 3b
  hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
  lo = _mm_slli_epi64(hi, 52);
  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
  c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
  out = pmul(out, c);  // a * 2^e
  return out;
}
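// Worked illustration of the 2^b construction above (added note): a double
// with zero mantissa and biased exponent E encodes 2^(E - 1023), so packing
// (b + 1023) << 52 into the upper bits of a 64-bit lane yields exactly 2^b.
// E.g. b = 3 gives biased exponent 1026, whose bit pattern is 2^3 = 8.0.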
template <>
EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a) {
  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a, 1))));
}
template <>
EIGEN_STRONG_INLINE int predux<Packet8i>(const Packet8i& a) {
  return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
}
template <>
EIGEN_STRONG_INLINE uint32_t predux<Packet8ui>(const Packet8ui& a) {
  return predux(Packet4ui(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
}

template <>
EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a) {
  return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1));
}
template <>
EIGEN_STRONG_INLINE Packet4i predux_half_dowto4<Packet8i>(const Packet8i& a) {
  return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
}
template <>
EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a) {
  return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
}
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a) {
  Packet8f tmp;
  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a, a, 1));
  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a) {
  Packet4d tmp;
  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a, a, 1));
  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a) {
  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a, a, 1));
  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a) {
  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a, a, 1));
  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a) {
  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a, a, 1));
  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a) {
  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a, a, 1));
  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) {
  return _mm256_movemask_ps(x) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x) {
  return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& x) {
  return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
  __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
  __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
  __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
  __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
  __m256 S4 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
  __m256 S5 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
  __m256 S6 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
  __m256 S7 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
}
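// Structure of the 8x8 transpose above (added note): stage 1 (unpacklo/hi)
// interleaves pairs of rows within each 128-bit lane, stage 2 (shuffle)
// gathers 4-element column fragments, and stage 3 (permute2f128) exchanges
// the 128-bit halves, which unpack/shuffle alone cannot do because those
// instructions never cross the lane boundary.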
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

  __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
  __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
  __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
  __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));

  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
}
#define MM256_SHUFFLE_EPI32(A, B, M) \
  _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B), M))

#ifndef EIGEN_VECTORIZE_AVX2
#define MM256_UNPACKLO_EPI32(A, B) \
  _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B)))
#define MM256_UNPACKHI_EPI32(A, B) \
  _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B)))
#else
#define MM256_UNPACKLO_EPI32(A, B) _mm256_unpacklo_epi32(A, B)
#define MM256_UNPACKHI_EPI32(A, B) _mm256_unpackhi_epi32(A, B)
#endif
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8i, 8>& kernel) {
  __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
  __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
  __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
  __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);
  __m256i T4 = MM256_UNPACKLO_EPI32(kernel.packet[4], kernel.packet[5]);
  __m256i T5 = MM256_UNPACKHI_EPI32(kernel.packet[4], kernel.packet[5]);
  __m256i T6 = MM256_UNPACKLO_EPI32(kernel.packet[6], kernel.packet[7]);
  __m256i T7 = MM256_UNPACKHI_EPI32(kernel.packet[6], kernel.packet[7]);
  __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
  __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
  __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
  __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
  __m256i S4 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
  __m256i S5 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
  __m256i S6 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
  __m256i S7 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
  kernel.packet[0] = _mm256_permute2f128_si256(S0, S4, 0x20);
  kernel.packet[1] = _mm256_permute2f128_si256(S1, S5, 0x20);
  kernel.packet[2] = _mm256_permute2f128_si256(S2, S6, 0x20);
  kernel.packet[3] = _mm256_permute2f128_si256(S3, S7, 0x20);
  kernel.packet[4] = _mm256_permute2f128_si256(S0, S4, 0x31);
  kernel.packet[5] = _mm256_permute2f128_si256(S1, S5, 0x31);
  kernel.packet[6] = _mm256_permute2f128_si256(S2, S6, 0x31);
  kernel.packet[7] = _mm256_permute2f128_si256(S3, S7, 0x31);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8ui, 8>& kernel) {
  ptranspose((PacketBlock<Packet8i, 8>&)kernel);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8i, 4>& kernel) {
  __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
  __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
  __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
  __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);

  __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
  __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
  __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
  __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));

  kernel.packet[0] = _mm256_permute2f128_si256(S0, S1, 0x20);
  kernel.packet[1] = _mm256_permute2f128_si256(S2, S3, 0x20);
  kernel.packet[2] = _mm256_permute2f128_si256(S0, S1, 0x31);
  kernel.packet[3] = _mm256_permute2f128_si256(S2, S3, 0x31);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8ui, 4>& kernel) {
  ptranspose((PacketBlock<Packet8i, 4>&)kernel);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4d, 4>& kernel) {
  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);

  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
template <>
EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket,
                                    const Packet8f& elsePacket) {
#ifdef EIGEN_VECTORIZE_AVX2
  const __m256i zero = _mm256_setzero_si256();
  const __m256i select =
      _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
                       ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256i false_mask = _mm256_cmpeq_epi32(zero, select);
  return _mm256_blendv_ps(thenPacket, elsePacket, _mm256_castsi256_ps(false_mask));
#else
  const __m256 zero = _mm256_setzero_ps();
  const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
                                      ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
  return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket,
                                    const Packet4d& elsePacket) {
#ifdef EIGEN_VECTORIZE_AVX2
  const __m256i zero = _mm256_setzero_si256();
  const __m256i select =
      _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256i false_mask = _mm256_cmpeq_epi64(select, zero);
  return _mm256_blendv_pd(thenPacket, elsePacket, _mm256_castsi256_pd(false_mask));
#else
  const __m256d zero = _mm256_setzero_pd();
  const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
  return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
#endif
}
// Packet math for Eigen::half
#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
struct unpacket_traits<Packet8h> {
  typedef Eigen::half type;
  typedef Packet8h half;
  enum {
    size = 8,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
#endif
template <>
EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
}
template <>
EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
  return numext::bit_cast<Eigen::half>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
}
template <>
EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) {
  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
  return _mm_set_epi16(d, d, c, c, b, b, a, a);
}
template <>
EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
  return _mm_set_epi16(b, b, b, b, a, a, a, a);
}
template <>
EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
  return _mm_cmpeq_epi32(a, a);
}
template <>
EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) {
  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
  return _mm_andnot_si128(sign_mask, a);
}
EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
#ifdef EIGEN_HAS_FP16_C
  return _mm256_cvtph_ps(a);
#else
  return _mm256_castsi256_ps(_mm256_insertf128_si256(
      _mm256_castsi128_si256(half2floatsse(a)), half2floatsse(_mm_srli_si128(a, 8)), 1));
#endif
}

EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
#ifdef EIGEN_HAS_FP16_C
  return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT);
#else
  __m128i lo = float2half(_mm256_extractf128_ps(a, 0));
  __m128i hi = float2half(_mm256_extractf128_ps(a, 1));
  return _mm_packus_epi32(lo, hi);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
  return _mm_or_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
  return _mm_xor_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
  return _mm_and_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
  return _mm_andnot_si128(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
  return _mm_blendv_epi8(b, a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
  Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
  return _mm_xor_si128(a, sign_mask);
}
#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
}
#endif

template <>
EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
  EIGEN_ALIGN32 Eigen::half aux[8];
  pstore(aux, from);
  to[stride * 0] = aux[0];
  to[stride * 1] = aux[1];
  to[stride * 2] = aux[2];
  to[stride * 3] = aux[3];
  to[stride * 4] = aux[4];
  to[stride * 5] = aux[5];
  to[stride * 6] = aux[6];
  to[stride * 7] = aux[7];
}
#ifndef EIGEN_VECTORIZE_AVX512FP16
template <>
EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
  return _mm_shuffle_epi8(a, m);
}
#endif
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) {
  __m128i a = kernel.packet[0];
  __m128i b = kernel.packet[1];
  __m128i c = kernel.packet[2];
  __m128i d = kernel.packet[3];
  __m128i e = kernel.packet[4];
  __m128i f = kernel.packet[5];
  __m128i g = kernel.packet[6];
  __m128i h = kernel.packet[7];

  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
  __m128i g47h47 = _mm_unpackhi_epi16(g, h);

  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);

  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);

  kernel.packet[0] = a0b0c0d0e0f0g0h0;
  kernel.packet[1] = a1b1c1d1e1f1g1h1;
  kernel.packet[2] = a2b2c2d2e2f2g2h2;
  kernel.packet[3] = a3b3c3d3e3f3g3h3;
  kernel.packet[4] = a4b4c4d4e4f4g4h4;
  kernel.packet[5] = a5b5c5d5e5f5g5h5;
  kernel.packet[6] = a6b6c6d6e6f6g6h6;
  kernel.packet[7] = a7b7c7d7e7f7g7h7;
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
  EIGEN_ALIGN32 Eigen::half in[4][8];
  pstore<Eigen::half>(in[0], kernel.packet[0]);
  pstore<Eigen::half>(in[1], kernel.packet[1]);
  pstore<Eigen::half>(in[2], kernel.packet[2]);
  pstore<Eigen::half>(in[3], kernel.packet[3]);

  EIGEN_ALIGN32 Eigen::half out[4][8];

  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      out[i][j] = in[j][2 * i];
    }
    for (int j = 0; j < 4; ++j) {
      out[i][j + 4] = in[j][2 * i + 1];
    }
  }

  kernel.packet[0] = pload<Packet8h>(out[0]);
  kernel.packet[1] = pload<Packet8h>(out[1]);
  kernel.packet[2] = pload<Packet8h>(out[2]);
  kernel.packet[3] = pload<Packet8h>(out[3]);
}
EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  __m256i extend = _mm256_cvtepu16_epi32(a);
  return _mm256_castsi256_ps(_mm256_slli_epi32(extend, 16));
#else
  __m128i lo = _mm_cvtepu16_epi32(a);
  __m128i hi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
  __m128i lo_shift = _mm_slli_epi32(lo, 16);
  __m128i hi_shift = _mm_slli_epi32(hi, 16);
  return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo_shift), hi_shift, 1));
#endif
}
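// Background for the conversion above (added note): bfloat16 is exactly the
// upper 16 bits of an IEEE float32 (same sign and 8-bit exponent, truncated
// mantissa), so widening to 32 bits and shifting left by 16 reconstructs a
// valid float with no arithmetic. E.g. bf16 0x3F80 becomes 0x3F800000 = 1.0f.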
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) {
  __m256i input = _mm256_castps_si256(a);

#ifdef EIGEN_VECTORIZE_AVX2
  // uint32_t lsb = (input >> 16);
  __m256i t = _mm256_srli_epi32(input, 16);
  // uint32_t lsb = lsb & 1;
  t = _mm256_and_si256(t, _mm256_set1_epi32(1));
  // uint32_t rounding_bias = 0x7fff + lsb;
  t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff));
  // input += rounding_bias;
  t = _mm256_add_epi32(t, input);
  // input = input >> 16;
  t = _mm256_srli_epi32(t, 16);
  // Check NaN before converting back to bf16
  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
  __m256i nan = _mm256_set1_epi32(0x7fc0);
  t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
  // output = static_cast<uint16_t>(input);
  return _mm_packus_epi32(_mm256_extractf128_si256(t, 0), _mm256_extractf128_si256(t, 1));
#else
  // uint32_t lsb = (input >> 16);
  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16);
  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16);
  // uint32_t lsb = lsb & 1;
  lo = _mm_and_si128(lo, _mm_set1_epi32(1));
  hi = _mm_and_si128(hi, _mm_set1_epi32(1));
  // uint32_t rounding_bias = 0x7fff + lsb;
  lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff));
  hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff));
  // input += rounding_bias;
  lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0));
  hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1));
  // input = input >> 16;
  lo = _mm_srli_epi32(lo, 16);
  hi = _mm_srli_epi32(hi, 16);
  // Check NaN before converting back to bf16
  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
  __m128i nan = _mm_set1_epi32(0x7fc0);
  lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask)));
  hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1)));
  // output = static_cast<uint16_t>(input);
  return _mm_packus_epi32(lo, hi);
#endif
}
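// Round-to-nearest-even in integer form (added worked example): adding
// 0x7fff + lsb to the float bits and truncating rounds the mantissa to the
// nearest 16-bit prefix, with ties going to the even result. E.g. low bits
// ending in 0x18000 (an exact tie with odd kept lsb) receive 0x8000 and carry
// up to the even neighbor, while 0x08000 (a tie with even kept lsb) receives
// only 0x7fff and rounds down, staying even.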
template <>
EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
}
template <>
EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
  return numext::bit_cast<bfloat16>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
}
template <>
EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
  return _mm_set_epi16(d, d, c, c, b, b, a, a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
  return _mm_set_epi16(b, b, b, b, a, a, a, a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
  return _mm_cmpeq_epi32(a, a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
  return _mm_andnot_si128(sign_mask, a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a, const Packet8bf& b) {
  return _mm_or_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a, const Packet8bf& b) {
  return _mm_xor_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a, const Packet8bf& b) {
  return _mm_and_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a, const Packet8bf& b) {
  return _mm_andnot_si128(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) {
  return _mm_blendv_epi8(b, a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
  Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
  return _mm_xor_si128(a, sign_mask);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
}
template <>
EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride) {
  EIGEN_ALIGN32 bfloat16 aux[8];
  pstore(aux, from);
  to[stride * 0] = aux[0];
  to[stride * 1] = aux[1];
  to[stride * 2] = aux[2];
  to[stride * 3] = aux[3];
  to[stride * 4] = aux[4];
  to[stride * 5] = aux[5];
  to[stride * 6] = aux[6];
  to[stride * 7] = aux[7];
}
template <>
EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
  return _mm_shuffle_epi8(a, m);
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
  __m128i a = kernel.packet[0];
  __m128i b = kernel.packet[1];
  __m128i c = kernel.packet[2];
  __m128i d = kernel.packet[3];
  __m128i e = kernel.packet[4];
  __m128i f = kernel.packet[5];
  __m128i g = kernel.packet[6];
  __m128i h = kernel.packet[7];

  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
  __m128i g47h47 = _mm_unpackhi_epi16(g, h);

  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);

  kernel.packet[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
  kernel.packet[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
  kernel.packet[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
  kernel.packet[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
  kernel.packet[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
  kernel.packet[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
  kernel.packet[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
  kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
  __m128i a = kernel.packet[0];
  __m128i b = kernel.packet[1];
  __m128i c = kernel.packet[2];
  __m128i d = kernel.packet[3];

  __m128i ab_03 = _mm_unpacklo_epi16(a, b);
  __m128i cd_03 = _mm_unpacklo_epi16(c, d);
  __m128i ab_47 = _mm_unpackhi_epi16(a, b);
  __m128i cd_47 = _mm_unpackhi_epi16(c, d);

  kernel.packet[0] = _mm_unpacklo_epi32(ab_03, cd_03);
  kernel.packet[1] = _mm_unpackhi_epi32(ab_03, cd_03);
  kernel.packet[2] = _mm_unpacklo_epi32(ab_47, cd_47);
  kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
}
#define MM256_UNPACKLO_EPI32(A, B)
#define MM256_SHUFFLE_EPI32(A, B, M)
#define MM256_UNPACKHI_EPI32(A, B)
Array< double, 1, 3 > e(1./3., 0.5, 2.)
#define EIGEN_DEBUG_ALIGNED_STORE
#define EIGEN_DEBUG_ALIGNED_LOAD
#define EIGEN_DEBUG_UNALIGNED_STORE
#define EIGEN_DEBUG_UNALIGNED_LOAD
#define EIGEN_DEVICE_FUNC
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
#define vec4i_swizzle1(v, p, q, r, s)
Packet4d pmax< PropagateNaN, Packet4d >(const Packet4d &a, const Packet4d &b)
Packet4d ploadu< Packet4d >(const double *from)
Packet4d ploaddup< Packet4d >(const double *from)
uint32_t pfirst< Packet8ui >(const Packet8ui &a)
Packet8h ploaddup< Packet8h >(const Eigen::half *from)
Packet pmin(const Packet &a, const Packet &b)
Packet8i pmin< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet pnmsub(const Packet &a, const Packet &b, const Packet &c)
Packet8h pround< Packet8h >(const Packet8h &a)
Packet4d pmul< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet padd(const Packet &a, const Packet &b)
Packet4i predux_half_dowto4< Packet8i >(const Packet8i &a)
Packet8h pdiv< Packet8h >(const Packet8h &a, const Packet8h &b)
Packet8f pzero(const Packet8f &)
void pstore(Scalar *to, const Packet &from)
Packet8bf ploadquad< Packet8bf >(const bfloat16 *from)
void pstore< float >(float *to, const Packet4f &from)
Packet4f predux_half_dowto4< Packet8f >(const Packet8f &a)
Packet4d pload< Packet4d >(const double *from)
Packet8f pmax< PropagateNaN, Packet8f >(const Packet8f &a, const Packet8f &b)
Packet4d pceil< Packet4d >(const Packet4d &a)
double predux_min< Packet4d >(const Packet4d &a)
Packet8h pset1< Packet8h >(const Eigen::half &from)
Packet8h pfloor< Packet8h >(const Packet8h &a)
Packet8ui ploaddup< Packet8ui >(const uint32_t *from)
Packet8f psub< Packet8f >(const Packet8f &a, const Packet8f &b)
unpacket_traits< Packet >::type predux(const Packet &a)
Packet8h ptrue(const Packet8h &a)
Packet4f pcmp_lt_or_nan(const Packet4f &a, const Packet4f &b)
void pstore1< Packet4d >(double *to, const double &a)
void pstore< int >(int *to, const Packet4i &from)
Packet8ui pload< Packet8ui >(const uint32_t *from)
Packet8bf F32ToBf16(Packet4f p4f)
Packet8f pgather< float, Packet8f >(const float *from, Index stride)
Packet4d padd< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8f pand< Packet8f >(const Packet8f &a, const Packet8f &b)
void pscatter< uint32_t, Packet8ui >(uint32_t *to, const Packet8ui &from, Index stride)
Packet4d pdiv< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8h print< Packet8h >(const Packet8h &a)
Packet8i pdiv< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet8f ploadquad< Packet8f >(const float *from)
Packet8h padd< Packet8h >(const Packet8h &a, const Packet8h &b)
Packet8i ploadu< Packet8i >(const int *from)
float pfirst< Packet8f >(const Packet8f &a)
Packet8ui pand< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet4d pandnot< Packet4d >(const Packet4d &a, const Packet4d &b)
float predux_max< Packet8f >(const Packet8f &a)
void pstoreu< uint32_t >(uint32_t *to, const Packet8ui &from)
Packet8ui pselect< Packet8ui >(const Packet8ui &mask, const Packet8ui &a, const Packet8ui &b)
Packet4d pload1< Packet4d >(const double *from)
Packet8bf pdiv< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet8h pceil< Packet8h >(const Packet8h &a)
Packet8i ploadquad< Packet8i >(const int *from)
bfloat16 predux_max< Packet8bf >(const Packet8bf &a)
Packet8h pandnot(const Packet8h &a, const Packet8h &b)
Packet8i pxor< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet4f pabs(const Packet4f &a)
Packet8f pround< Packet8f >(const Packet8f &a)
Packet pmax(const Packet &a, const Packet &b)
Packet8f Bf16ToF32(const Packet8bf &a)
Packet4ui predux_half_dowto4< Packet8ui >(const Packet8ui &a)
Packet4d pfrexp< Packet4d >(const Packet4d &a, Packet4d &exponent)
void pscatter< double, Packet4d >(double *to, const Packet4d &from, Index stride)
Packet2cf pnegate(const Packet2cf &a)
Packet4d pround< Packet4d >(const Packet4d &a)
Packet8f pmul< Packet8f >(const Packet8f &a, const Packet8f &b)
Packet8bf pfloor< Packet8bf >(const Packet8bf &a)
Packet8i psub< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet8i pmul< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet4d pmin< PropagateNumbers, Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8bf print< Packet8bf >(const Packet8bf &a)
Packet8ui pmul< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet8bf pload< Packet8bf >(const bfloat16 *from)
Packet8f pload1< Packet8f >(const float *from)
Packet4i plogical_shift_right(const Packet4i &a)
Packet8f por< Packet8f >(const Packet8f &a, const Packet8f &b)
Packet4ui ploadu< Packet4ui >(const uint32_t *from)
int predux< Packet8i >(const Packet8i &a)
Packet pminmax_propagate_nan(const Packet &a, const Packet &b, Op op)
Packet8f ploaddup< Packet8f >(const float *from)
Packet4d print< Packet4d >(const Packet4d &a)
float predux< Packet8f >(const Packet8f &a)
Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
void pstoreu< uint64_t >(uint64_t *to, const Packet2ul &from)
Packet8f pfloor< Packet8f >(const Packet8f &a)
Packet8bf pmul< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet4d pand< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8ui ploadquad< Packet8ui >(const uint32_t *from)
Packet8f ploadu< Packet8f >(const float *from)
int pfirst< Packet8i >(const Packet8i &a)
Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
bfloat16 pfirst(const Packet8bf &a)
Packet4d pfloor< Packet4d >(const Packet4d &a)
Packet8f pxor< Packet8f >(const Packet8f &a, const Packet8f &b)
Packet8f pldexp< Packet8f >(const Packet8f &a, const Packet8f &exponent)
Packet psign(const Packet &a)
void pstoreu< double >(double *to, const Packet4d &from)
Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
__vector unsigned int Packet4ui
double predux_max< Packet4d >(const Packet4d &a)
Packet pmul(const Packet &a, const Packet &b)
void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
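ptranspose transposes a small register-resident matrix in place: a PacketBlock<Packet, N> holds N packets as rows, and after the call packet[i] holds what was column i. A sketch using the 4x4 float specialization (the Packet2cf overload above does the same for pairs of complex packets):

#include <Eigen/Core>
#include <iostream>
using namespace Eigen::internal;

int main() {
  PacketBlock<Packet4f, 4> block;
  float rows[4][4] = {{0,1,2,3},{4,5,6,7},{8,9,10,11},{12,13,14,15}};
  for (int i = 0; i < 4; ++i) block.packet[i] = ploadu<Packet4f>(rows[i]);
  ptranspose(block);                 // rows become columns
  float out[4];
  pstoreu(out, block.packet[1]);     // was column 1: {1,5,9,13}
  for (float x : out) std::cout << x << " ";
  std::cout << "\n";
}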
Packet4d pfrexp_generic_get_biased_exponent(const Packet4d &a)
Packet pmsub(const Packet &a, const Packet &b, const Packet &c)
Packet8bf psub< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet8i pandnot< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet4i ploadu< Packet4i >(const int *from)
Packet8ui pset1< Packet8ui >(const uint32_t &from)
Packet8bf pceil< Packet8bf >(const Packet8bf &a)
Packet8f print< Packet8f >(const Packet8f &a)
Packet pfrexp_generic(const Packet &a, Packet &exponent)
Packet4d pset1< Packet4d >(const double &from)
Packet pldexp_generic(const Packet &a, const Packet &exponent)
void pscatter< float, Packet8f >(float *to, const Packet8f &from, Index stride)
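pgather and pscatter move packets to and from strided memory: pgather<float, Packet8f>(from, stride) loads from[0], from[stride], ..., from[7*stride], and pscatter writes the eight lanes back at the same spacing. This is how Eigen vectorizes along a non-unit-stride matrix dimension. Sketch:

#include <Eigen/Core>
#include <iostream>
using Eigen::Index;
using namespace Eigen::internal;

int main() {
  float src[16];
  for (int i = 0; i < 16; ++i) src[i] = float(i);
  Packet8f v = pgather<float, Packet8f>(src, Index(2)); // {0,2,4,...,14}
  float dst[16] = {};
  pscatter<float, Packet8f>(dst, v, Index(2));          // dst[0],dst[2],...,dst[14]
  std::cout << predux(v) << " " << dst[14] << "\n";     // 56 14
}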
Packet8h plset< Packet8h >(const half &a)
Eigen::half predux< Packet8h >(const Packet8h &a)
bfloat16 predux_mul< Packet8bf >(const Packet8bf &a)
void pstore< uint32_t >(uint32_t *to, const Packet8ui &from)
Packet8h psub< Packet8h >(const Packet8h &a, const Packet8h &b)
void pstoreu< bfloat16 >(bfloat16 *to, const Packet8bf &from)
Packet8ui pandnot< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
eigen_packet_wrapper< __m128i, 3 > Packet8bf
Packet8bf pmin< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
__m128i Pack16To8(Packet8f rf)
Packet pminmax_propagate_numbers(const Packet &a, const Packet &b, Op op)
Packet4i pset1< Packet4i >(const int &from)
Packet4d pldexp< Packet4d >(const Packet4d &a, const Packet4d &exponent)
void prefetch< uint32_t >(const uint32_t *addr)
Packet8i pand< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet8f pload< Packet8f >(const float *from)
Packet8h float2half(const Packet8f &a)
void pstore1< Packet8f >(float *to, const float &a)
Packet8i plset< Packet8i >(const int &a)
Packet4d pmin< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8f pmin< Packet8f >(const Packet8f &a, const Packet8f &b)
float predux_mul< Packet8f >(const Packet8f &a)
Packet8f pset1frombits< Packet8f >(unsigned int from)
Packet8f pfrexp< Packet8f >(const Packet8f &a, Packet8f &exponent)
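pfrexp and pldexp are the packet analogues of std::frexp/std::ldexp: pfrexp splits a into a mantissa in [0.5, 1) and an exponent (returned in a same-typed packet) such that a == mantissa * 2^exponent, and pldexp(a, e) computes a * 2^e. They are the building blocks of the vectorized log/exp implementations. Sketch:

#include <Eigen/Core>
#include <iostream>
using namespace Eigen::internal;

int main() {
  Packet8f a = pset1<Packet8f>(24.0f);
  Packet8f e;
  Packet8f m = pfrexp(a, e);          // 24 = 0.75 * 2^5
  std::cout << pfirst(m) << " * 2^" << pfirst(e) << "\n";
  Packet8f back = pldexp(m, e);       // reassemble the original value
  std::cout << pfirst(back) << "\n";  // 24
}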
Packet8f pmin< PropagateNumbers, Packet8f >(const Packet8f &a, const Packet8f &b)
Eigen::half pfirst< Packet8h >(const Packet8h &from)
Packet pnmadd(const Packet &a, const Packet &b, const Packet &c)
Packet psub(const Packet &a, const Packet &b)
Eigen::half predux_mul< Packet8h >(const Packet8h &a)
bfloat16 predux_min< Packet8bf >(const Packet8bf &a)
Packet8f pset1< Packet8f >(const float &from)
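The initialization kernels cover the common broadcast patterns: pset1 splats one scalar into every lane, pload1 does the same from memory, pset1frombits reinterprets a raw bit pattern (handy for sign masks and other constants), and plset(a) builds the affine sequence {a, a+1, ..., a+7}. A minimal sketch:

#include <Eigen/Core>
#include <iostream>
using namespace Eigen::internal;

int main() {
  Packet8f ones  = pset1<Packet8f>(1.0f);                // {1,1,...,1}
  Packet8f seq   = plset<Packet8f>(10.0f);               // {10,11,...,17}
  Packet8f signs = pset1frombits<Packet8f>(0x80000000u); // sign-bit mask
  Packet8f neg   = pxor(seq, signs);                     // flip signs: {-10,...,-17}
  std::cout << pfirst(padd(ones, neg)) << "\n";          // 1 + (-10) = -9
}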
void pstoreu< int64_t >(int64_t *to, const Packet2l &from)
void prefetch< float >(const float *addr)
void prefetch< double >(const double *addr)
Packet8h ploadquad< Packet8h >(const Eigen::half *from)
Packet4i pdiv< Packet4i >(const Packet4i &a, const Packet4i &b)
bfloat16 predux< Packet8bf >(const Packet8bf &a)
Packet8f half2float(const Packet8h &a)
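Packet8h stores eight fp16 values in an __m128i; without AVX512-FP16, arithmetic on them is emulated by widening: half2float expands to a Packet8f, the work happens in fp32, and float2half narrows the result back (the Packet8h padd/pmul overloads listed here are implemented exactly this way). A sketch of that round trip:

#include <Eigen/Core>
#include <iostream>
using namespace Eigen::internal;

int main() {
  Eigen::half h[8];
  for (int i = 0; i < 8; ++i) h[i] = Eigen::half(float(i));
  Packet8h p = ploadu<Packet8h>(h);
  Packet8f f = half2float(p);             // widen to fp32
  Packet8h r = float2half(pmul(f, f));    // square in fp32, narrow back
  std::cout << float(pfirst<Packet8h>(r)) << " "   // 0
            << float(predux<Packet8h>(r)) << "\n"; // 140 (sum of squares 0..7)
}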
double pfirst< Packet4d >(const Packet4d &a)
void pscatter< int, Packet8i >(int *to, const Packet8i &from, Index stride)
Packet8h pand(const Packet8h &a, const Packet8h &b)
void pstore< int64_t >(int64_t *to, const Packet2l &from)
const char * SsePrefetchPtrType
Packet8h ploadu< Packet8h >(const Eigen::half *from)
void pstoreu< float >(float *to, const Packet4f &from)
Packet4d plset< Packet4d >(const double &a)
Packet4d pmax< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8ui padd< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet4d pmin< PropagateNaN, Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8i pmax< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet4d ptrue< Packet4d >(const Packet4d &a)
eigen_packet_wrapper< __m256i, 0 > Packet8i
Packet8ui pmax< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet8f pmax< PropagateNumbers, Packet8f >(const Packet8f &a, const Packet8f &b)
Packet8h pmax< Packet8h >(const Packet8h &a, const Packet8h &b)
float predux_min< Packet8f >(const Packet8f &a)
Packet8bf plset< Packet8bf >(const bfloat16 &a)
Packet8h pxor(const Packet8h &a, const Packet8h &b)
Packet8bf padd< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet8bf ploadu< Packet8bf >(const bfloat16 *from)
Packet8i pset1< Packet8i >(const int &from)
Packet8i pselect< Packet8i >(const Packet8i &mask, const Packet8i &a, const Packet8i &b)
Packet8bf pset1< Packet8bf >(const bfloat16 &from)
Packet8i pload< Packet8i >(const int *from)
Packet8bf psignbit(const Packet8bf &a)
Packet4d pmax< PropagateNumbers, Packet4d >(const Packet4d &a, const Packet4d &b)
Packet pdiv(const Packet &a, const Packet &b)
Packet8f pmin< PropagateNaN, Packet8f >(const Packet8f &a, const Packet8f &b)
Packet4i pblend(const Selector< 4 > &ifPacket, const Packet4i &thenPacket, const Packet4i &elsePacket)
Packet8f pmax< Packet8f >(const Packet8f &a, const Packet8f &b)
uint32_t predux< Packet8ui >(const Packet8ui &a)
Packet8i pgather< int, Packet8i >(const int *from, Index stride)
void pstore< uint64_t >(uint64_t *to, const Packet2ul &from)
Packet8ui por< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet4d pxor< Packet4d >(const Packet4d &a, const Packet4d &b)
eigen_packet_wrapper< __m256i, 4 > Packet8ui
Packet2cf pconj(const Packet2cf &a)
void pscatter< bfloat16, Packet8bf >(bfloat16 *to, const Packet8bf &from, Index stride)
void pstoreu< int >(int *to, const Packet4i &from)
double predux_mul< Packet4d >(const Packet4d &a)
Packet8ui psub< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet8bf pmax< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet4d pset1frombits< Packet4d >(uint64_t from)
Packet8f ptrue< Packet8f >(const Packet8f &a)
Packet4i plogical_shift_left(const Packet4i &a)
Packet8h pload< Packet8h >(const Eigen::half *from)
Packet8i ptrue< Packet8i >(const Packet8i &a)
Packet2cf preverse(const Packet2cf &a)
Packet4i parithmetic_shift_right(const Packet4i &a)
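The shift kernels take the shift count as a compile-time template parameter, which the index above elides: plogical_shift_left/right shift in zeros, while parithmetic_shift_right replicates the sign bit, matching >> on signed integers. Sketch:

#include <Eigen/Core>
#include <iostream>
using namespace Eigen::internal;

int main() {
  Packet4i v = pset1<Packet4i>(-16);
  std::cout << pfirst(parithmetic_shift_right<2>(v)) << "\n"; // -4 (sign extended)
  std::cout << pfirst(plogical_shift_right<2>(v))    << "\n"; // 1073741820 (zeros shifted in)
  std::cout << pfirst(plogical_shift_left<2>(v))     << "\n"; // -64
}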
Packet8h pmul< Packet8h >(const Packet8h &a, const Packet8h &b)
bfloat16 pfirst< Packet8bf >(const Packet8bf &from)
Packet8h por(const Packet8h &a, const Packet8h &b)
Eigen::half predux_min< Packet8h >(const Packet8h &a)
Packet4i pcmp_lt(const Packet4i &a, const Packet4i &b)
Packet8ui pmin< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
Packet8f pselect< Packet8f >(const Packet8f &mask, const Packet8f &a, const Packet8f &b)
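The comparison kernels (pcmp_eq, pcmp_le, pcmp_lt, ...) return all-ones in lanes where the predicate holds and all-zeros elsewhere; pselect(mask, a, b) then blends bitwise, taking lanes from a where the mask is set and from b otherwise. Together they express branchless per-lane conditionals. Sketch:

#include <Eigen/Core>
#include <iostream>
using namespace Eigen::internal;

int main() {
  Packet8f x    = plset<Packet8f>(0.0f);    // {0,...,7}
  Packet8f four = pset1<Packet8f>(4.0f);
  Packet8f mask = pcmp_lt(x, four);         // all-ones where x < 4
  Packet8f y    = pselect(mask, x, four);   // branchless min(x, 4)
  std::cout << predux(y) << "\n";           // 0+1+2+3+4+4+4+4 = 22
}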
Packet8f pisnan(const Packet8f &a)
Packet8f plset< Packet8f >(const float &a)
Packet8f pandnot< Packet8f >(const Packet8f &a, const Packet8f &b)
Packet8ui plset< Packet8ui >(const uint32_t &a)
Packet4d psub< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet8ui pgather< uint32_t, Packet8ui >(const uint32_t *from, Index stride)
double predux< Packet4d >(const Packet4d &a)
Packet8i padd< Packet8i >(const Packet8i &a, const Packet8i &b)
Packet8f padd< Packet8f >(const Packet8f &a, const Packet8f &b)
Packet8i por< Packet8i >(const Packet8i &a, const Packet8i &b)
void pstore< bfloat16 >(bfloat16 *to, const Packet8bf &from)
void prefetch< int >(const int *addr)
Packet4d por< Packet4d >(const Packet4d &a, const Packet4d &b)
Packet4d pselect< Packet4d >(const Packet4d &mask, const Packet4d &a, const Packet4d &b)
Packet8i ploaddup< Packet8i >(const int *from)
Packet8bf pgather< bfloat16, Packet8bf >(const bfloat16 *from, Index stride)
Packet8f pceil< Packet8f >(const Packet8f &a)
Packet8h pmin< Packet8h >(const Packet8h &a, const Packet8h &b)
Packet8ui pxor< Packet8ui >(const Packet8ui &a, const Packet8ui &b)
bool predux_any(const Packet4f &x)
eigen_packet_wrapper< __m128i, 2 > Packet8h
Packet8bf ploaddup< Packet8bf >(const bfloat16 *from)
void pstore< double >(double *to, const Packet4d &from)
Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
Packet4d pgather< double, Packet4d >(const double *from, Index stride)
Packet8bf pround< Packet8bf >(const Packet8bf &a)
void pstore1< Packet8i >(int *to, const int &a)
Packet8ui ploadu< Packet8ui >(const uint32_t *from)
Eigen::half predux_max< Packet8h >(const Packet8h &a)
Packet8f pdiv< Packet8f >(const Packet8f &a, const Packet8f &b)
Packet8f peven_mask(const Packet8f &)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index (the index type used for sizes and indices throughout the API)