13 #ifndef EIGEN_PACKET_MATH_MSA_H
14 #define EIGEN_PACKET_MATH_MSA_H
19 #include "../../InternalHeaderCheck.h"
25 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
26 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
29 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
30 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
33 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
34 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
38 #define EIGEN_MSA_DEBUG \
39 static bool firstTime = true; \
42 std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
47 #define EIGEN_MSA_DEBUG
50 #define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))
56 #define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }
57 #define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }
58 #define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }
61 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
66 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
71 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
76 struct packet_traits<float> : default_packet_traits {
101 struct packet_traits<
int32_t> : default_packet_traits {
117 enum {
size = 4, alignment =
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false };
124 enum {
size = 4, alignment =
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false };
140 return __builtin_msa_fill_w(from);
156 return __builtin_msa_fill_w(*from);
163 return __builtin_msa_fadd_w(
a,
b);
170 return __builtin_msa_addv_w(
a,
b);
177 static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f };
185 static const Packet4i countdown = { 0, 1, 2, 3 };
193 return __builtin_msa_fsub_w(
a,
b);
200 return __builtin_msa_subv_w(
a,
b);
207 return (
Packet4f)__builtin_msa_bnegi_w((v4u32)
a, 31);
214 return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)
a, 0), 1);
235 return __builtin_msa_fmul_w(
a,
b);
242 return __builtin_msa_mulv_w(
a,
b);
249 return __builtin_msa_fdiv_w(
a,
b);
256 return __builtin_msa_div_s_w(
a,
b);
263 return __builtin_msa_fmadd_w(
c,
a,
b);
272 __asm__(
"maddv.w %w[value], %w[a], %w[b]\n"
274 : [value]
"+f"(value)
276 : [
a]
"f"(
a), [
b]
"f"(
b));
284 return (
Packet4f)__builtin_msa_and_v((v16u8)
a, (v16u8)
b);
291 return (
Packet4i)__builtin_msa_and_v((v16u8)
a, (v16u8)
b);
298 return (
Packet4f)__builtin_msa_or_v((v16u8)
a, (v16u8)
b);
305 return (
Packet4i)__builtin_msa_or_v((v16u8)
a, (v16u8)
b);
312 return (
Packet4f)__builtin_msa_xor_v((v16u8)
a, (v16u8)
b);
319 return (
Packet4i)__builtin_msa_xor_v((v16u8)
a, (v16u8)
b);
342 return __builtin_msa_fmin_w(
a,
b);
347 return (
Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)
b, (v16u8)
a);
355 return __builtin_msa_min_s_w(
a,
b);
364 return __builtin_msa_fmax_w(
a,
b);
369 return (
Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)
b, (v16u8)
a);
377 return __builtin_msa_max_s_w(
a,
b);
412 float f0 = from[0], f1 = from[1];
415 return (
Packet4f)__builtin_msa_ilvr_d((v2i64)
v1, (v2i64)v0);
422 int32_t i0 = from[0], i1 = from[1];
425 return (
Packet4i)__builtin_msa_ilvr_d((v2i64)
v1, (v2i64)v0);
463 v[2] = from[2 * stride];
464 v[3] = from[3 * stride];
475 v[2] = from[2 * stride];
476 v[3] = from[3 * stride];
512 __builtin_prefetch(addr);
519 __builtin_prefetch(addr);
554 return (
Packet4f)__builtin_msa_bclri_w((v4u32)
a, 31);
561 Packet4i zero = __builtin_msa_ldi_w(0);
562 return __builtin_msa_add_a_w(zero,
a);
614 v16u8 unord = (v16u8)__builtin_msa_fcun_w(
a, swapped);
616 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
619 Packet4f v = __builtin_msa_fmin_w(
a, swapped);
620 v = __builtin_msa_fmin_w(
624 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
625 v = (
Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)
v);
649 v16u8 unord = (v16u8)__builtin_msa_fcun_w(
a, swapped);
651 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
654 Packet4f v = __builtin_msa_fmax_w(
a, swapped);
655 v = __builtin_msa_fmax_w(
659 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
660 v = (
Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)
v);
674 inline std::ostream&
operator<<(std::ostream& os,
const PacketBlock<Packet4f, 4>& value) {
675 os <<
"[ " << value.packet[0] <<
"," << std::endl
676 <<
" " << value.packet[1] <<
"," << std::endl
677 <<
" " << value.packet[2] <<
"," << std::endl
678 <<
" " << value.packet[3] <<
" ]";
685 v4i32 tmp1, tmp2, tmp3, tmp4;
687 tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
688 tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
689 tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
690 tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
692 kernel.packet[0] = (
Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
693 kernel.packet[1] = (
Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
694 kernel.packet[2] = (
Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
695 kernel.packet[3] = (
Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
698 inline std::ostream&
operator<<(std::ostream& os,
const PacketBlock<Packet4i, 4>& value) {
699 os <<
"[ " << value.packet[0] <<
"," << std::endl
700 <<
" " << value.packet[1] <<
"," << std::endl
701 <<
" " << value.packet[2] <<
"," << std::endl
702 <<
" " << value.packet[3] <<
" ]";
709 v4i32 tmp1, tmp2, tmp3, tmp4;
711 tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
712 tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
713 tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
714 tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
716 kernel.packet[0] = (
Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
717 kernel.packet[1] = (
Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
718 kernel.packet[2] = (
Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
719 kernel.packet[3] = (
Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
726 return __builtin_msa_fsqrt_w(
a);
734 return __builtin_msa_frsqrt_w(
a);
736 Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
746 "cfcmsa %[old_mode], $1\n"
747 "ori %[new_mode], %[old_mode], 3\n"
748 "ctcmsa $1, %[new_mode]\n"
749 "frint.w %w[v], %w[v]\n"
750 "ctcmsa $1, %[old_mode]\n"
752 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
765 "cfcmsa %[old_mode], $1\n"
766 "ori %[new_mode], %[old_mode], 3\n"
767 "xori %[new_mode], %[new_mode], 1\n"
768 "ctcmsa $1, %[new_mode]\n"
769 "frint.w %w[v], %w[v]\n"
770 "ctcmsa $1, %[old_mode]\n"
772 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
785 "cfcmsa %[old_mode], $1\n"
786 "ori %[new_mode], %[old_mode], 3\n"
787 "xori %[new_mode], %[new_mode], 3\n"
788 "ctcmsa $1, %[new_mode]\n"
789 "frint.w %w[v], %w[v]\n"
790 "ctcmsa $1, %[old_mode]\n"
792 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
803 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
804 ifPacket.select[3] };
806 return (
Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
812 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
813 ifPacket.select[3] };
815 return (
Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
824 #define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }
825 #define EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }
826 #define EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }
829 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
834 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
839 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
844 struct packet_traits<double> : default_packet_traits {
866 enum {
size = 2, alignment =
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false };
882 return __builtin_msa_fadd_d(
a,
b);
889 static const Packet2d countdown = { 0.0, 1.0 };
897 return __builtin_msa_fsub_d(
a,
b);
904 return (
Packet2d)__builtin_msa_bnegi_d((v2u64)
a, 63);
918 return __builtin_msa_fmul_d(
a,
b);
925 return __builtin_msa_fdiv_d(
a,
b);
932 return __builtin_msa_fmadd_d(
c,
a,
b);
941 return (
Packet2d)__builtin_msa_and_v((v16u8)
a, (v16u8)
b);
948 return (
Packet2d)__builtin_msa_or_v((v16u8)
a, (v16u8)
b);
955 return (
Packet2d)__builtin_msa_xor_v((v16u8)
a, (v16u8)
b);
978 return __builtin_msa_fmin_d(
a,
b);
981 v2i64 aNaN = __builtin_msa_fcun_d(
a,
a);
982 v2i64 aMinOrNaN =
por(__builtin_msa_fclt_d(
a,
b), aNaN);
983 return (
Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)
b, (v16u8)
a);
993 return __builtin_msa_fmax_d(
a,
b);
996 v2i64 aNaN = __builtin_msa_fcun_d(
a,
a);
997 v2i64 aMaxOrNaN =
por(__builtin_msa_fclt_d(
b,
a), aNaN);
998 return (
Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)
b, (v16u8)
a);
1056 __builtin_prefetch(addr);
1077 return (
Packet2d)__builtin_msa_bclri_d((v2u64)
a, 63);
1105 Packet2d v = __builtin_msa_fmin_d(
a, swapped);
1108 double a0 =
a[0], a1 =
a[1];
1120 Packet2d v = __builtin_msa_fmax_d(
a, swapped);
1123 double a0 =
a[0], a1 =
a[1];
1132 return __builtin_msa_fsqrt_d(
a);
1140 return __builtin_msa_frsqrt_d(
a);
1142 Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
1147 inline std::ostream&
operator<<(std::ostream& os,
const PacketBlock<Packet2d, 2>& value) {
1148 os <<
"[ " << value.packet[0] <<
"," << std::endl <<
" " << value.packet[1] <<
" ]";
1155 Packet2d trn1 = (
Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1156 Packet2d trn2 = (
Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1157 kernel.packet[0] = trn1;
1158 kernel.packet[1] = trn2;
1166 "cfcmsa %[old_mode], $1\n"
1167 "ori %[new_mode], %[old_mode], 3\n"
1168 "ctcmsa $1, %[new_mode]\n"
1169 "frint.d %w[v], %w[v]\n"
1170 "ctcmsa $1, %[old_mode]\n"
1172 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
1185 "cfcmsa %[old_mode], $1\n"
1186 "ori %[new_mode], %[old_mode], 3\n"
1187 "xori %[new_mode], %[new_mode], 1\n"
1188 "ctcmsa $1, %[new_mode]\n"
1189 "frint.d %w[v], %w[v]\n"
1190 "ctcmsa $1, %[old_mode]\n"
1192 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
1205 "cfcmsa %[old_mode], $1\n"
1206 "ori %[new_mode], %[old_mode], 3\n"
1207 "xori %[new_mode], %[new_mode], 3\n"
1208 "ctcmsa $1, %[new_mode]\n"
1209 "frint.d %w[v], %w[v]\n"
1210 "ctcmsa $1, %[old_mode]\n"
1212 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
1223 Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
1225 return (
Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
Array< int, Dynamic, 1 > v
#define EIGEN_DEBUG_ALIGNED_STORE
#define EIGEN_DEBUG_ALIGNED_LOAD
#define EIGEN_DEBUG_UNALIGNED_STORE
#define EIGEN_DEBUG_UNALIGNED_LOAD
#define EIGEN_MSA_SHF_I8(a, b, c, d)
#define EIGEN_DEVICE_FUNC
M1<< 1, 2, 3, 4, 5, 6, 7, 8, 9;Map< RowVectorXf > v1(M1.data(), M1.size())
std::ostream & operator<<(std::ostream &s, const Packet16c &v)
Packet pmin(const Packet &a, const Packet &b)
Packet2d pdiv< Packet2d >(const Packet2d &a, const Packet2d &b)
Packet padd(const Packet &a, const Packet &b)
double predux_max< Packet2d >(const Packet2d &a)
Packet4i pload1< Packet4i >(const int32_t *from)
void pstore< float >(float *to, const Packet4f &from)
float predux_max< Packet4f >(const Packet4f &a)
double predux_min< Packet2d >(const Packet2d &a)
void pstoreu< int32_t >(int32_t *to, const Packet4i &from)
Packet2d plset< Packet2d >(const double &a)
Packet2d pmin< Packet2d >(const Packet2d &a, const Packet2d &b)
Packet2d pmul< Packet2d >(const Packet2d &a, const Packet2d &b)
Packet4f pload1< Packet4f >(const float *from)
Packet4f pxor< Packet4f >(const Packet4f &a, const Packet4f &b)
int predux< Packet4i >(const Packet4i &a)
Packet4f pmin< Packet4f >(const Packet4f &a, const Packet4f &b)
float predux_min< Packet4f >(const Packet4f &a)
Packet2d pceil< Packet2d >(const Packet2d &a)
Packet4f por< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet4f pandnot< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet4f pabs(const Packet4f &a)
Packet pmax(const Packet &a, const Packet &b)
float predux< Packet4f >(const Packet4f &a)
Packet4f plset< Packet4f >(const float &a)
Packet2cf pnegate(const Packet2cf &a)
double predux< Packet2d >(const Packet2d &a)
Packet2d pround< Packet2d >(const Packet2d &a)
Packet4f pand< Packet4f >(const Packet4f &a, const Packet4f &b)
int predux_max< Packet4i >(const Packet4i &a)
Packet2d pand< Packet2d >(const Packet2d &a, const Packet2d &b)
double predux_mul< Packet2d >(const Packet2d &a)
Packet4f pdiv< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Packet2d ploaddup< Packet2d >(const double *from)
Packet4i pmul< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet2d pgather< double, Packet2d >(const double *from, Index stride)
void pstoreu< double >(double *to, const Packet4d &from)
__vector unsigned int Packet4ui
Packet pmul(const Packet &a, const Packet &b)
void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Packet4i ploadu< Packet4i >(const int *from)
float pfirst< Packet4f >(const Packet4f &a)
double pfirst< Packet2d >(const Packet2d &a)
Packet4i pxor< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet4i pandnot< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet4i pset1< Packet4i >(const int &from)
Packet4f padd< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet4i pload< Packet4i >(const int *from)
Packet4f ploadu< Packet4f >(const float *from)
float predux_mul< Packet4f >(const Packet4f &a)
Packet2d padd< Packet2d >(const Packet2d &a, const Packet2d &b)
Packet4f psqrt(const Packet4f &a)
Packet2d pandnot< Packet2d >(const Packet2d &a, const Packet2d &b)
EIGEN_ALWAYS_INLINE void pscatter< float, Packet4f >(float *to, const Packet4f &from, Index stride)
Packet4i pgather< int32_t, Packet4i >(const int32_t *from, Index stride)
Packet4i ploaddup< Packet4i >(const int *from)
void prefetch< float >(const float *addr)
void prefetch< double >(const double *addr)
Packet4f pmul< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet2d pset1< Packet2d >(const double &from)
Packet4i pdiv< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet8h pand(const Packet8h &a, const Packet8h &b)
Packet2d ploadu< Packet2d >(const double *from)
Packet4i plset< Packet4i >(const int &a)
void pstoreu< float >(float *to, const Packet4f &from)
void prefetch< int32_t >(const int32_t *addr)
Packet2d psub< Packet2d >(const Packet2d &a, const Packet2d &b)
Packet2d pfloor< Packet2d >(const Packet2d &a)
void pstore< int32_t >(int32_t *to, const Packet4i &from)
EIGEN_ALWAYS_INLINE Packet4f pgather< float, Packet4f >(const float *from, Index stride)
int predux_min< Packet4i >(const Packet4i &a)
Packet4i padd< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet4i psub< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet2d pxor< Packet2d >(const Packet2d &a, const Packet2d &b)
int predux_mul< Packet4i >(const Packet4i &a)
Packet pdiv(const Packet &a, const Packet &b)
Packet4i pblend(const Selector< 4 > &ifPacket, const Packet4i &thenPacket, const Packet4i &elsePacket)
int pfirst< Packet4i >(const Packet4i &a)
Packet4i pmax< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet4i por< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet2cf pconj(const Packet2cf &a)
Packet2d por< Packet2d >(const Packet2d &a, const Packet2d &b)
Packet4f pmax< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet4i pmin< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet4i pand< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet2d pload< Packet2d >(const double *from)
Packet4f psub< Packet4f >(const Packet4f &a, const Packet4f &b)
void pscatter< int32_t, Packet4i >(int32_t *to, const Packet4i &from, Index stride)
Packet2cf preverse(const Packet2cf &a)
Packet4f pload< Packet4f >(const float *from)
Packet8h por(const Packet8h &a, const Packet8h &b)
Packet4f pset1< Packet4f >(const float &from)
Packet4f ploaddup< Packet4f >(const float *from)
Packet4f pround< Packet4f >(const Packet4f &a)
Packet4f pfloor< Packet4f >(const Packet4f &a)
void pscatter< double, Packet2d >(double *to, const Packet2d &from, Index stride)
void pstore< double >(double *to, const Packet4d &from)
Packet4f pceil< Packet4f >(const Packet4f &a)
Packet4f prsqrt(const Packet4f &a)
Packet2d pmax< Packet2d >(const Packet2d &a, const Packet2d &b)
EIGEN_ALWAYS_INLINE bool() isnan(const Eigen::bfloat16 &h)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.