10 #ifndef EIGEN_PACKET_MATH_SVE_H
11 #define EIGEN_PACKET_MATH_SVE_H
13 #include "../../InternalHeaderCheck.h"
19 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
20 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
23 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
24 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
27 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
// Compile-time helper: `size` is the number of Scalar elements that fit in a
// vector of SVEVectorLength bits (total bits divided by the bits per Scalar).
// The closing brace of the struct is not part of this listing.
29 template <
typename Scalar,
int SVEVectorLength>
30 struct sve_packet_size_selector {
31 enum {
size = SVEVectorLength / (
sizeof(Scalar) * CHAR_BIT) };
// PacketXi: SVE vector of signed 32-bit integers whose width is fixed to
// EIGEN_ARM64_SVE_VL bits through the arm_sve_vector_bits attribute.
35 typedef svint32_t PacketXi
__attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
// Traits for numext::int32_t: both the full packet type and the "half" packet
// type are PacketXi. The remaining trait members are not part of this listing.
38 struct packet_traits<numext::
int32_t> : default_packet_traits {
39 typedef PacketXi type;
40 typedef PacketXi
half;
// Reverse mapping for PacketXi: `half` is again PacketXi, and masked loads and
// masked stores are both flagged as unavailable.
64 struct unpacket_traits<PacketXi> {
66 typedef PacketXi half;
71 masked_load_available =
false,
72 masked_store_available =
false
// Prefetch hint for `addr` using the SV_PLDL1KEEP operation, issued under an
// all-true 32-bit predicate.
79 svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
// Broadcast the scalar `from` into every 32-bit lane.
85 return svdup_n_s32(from);
// Broadcast `a` to all lanes and add, lane by lane, the vector loaded from `c`.
// NOTE(review): the definition of `c` is not part of this listing.
93 return svadd_s32_z(svptrue_b32(),
pset1<PacketXi>(
a), svld1_s32(svptrue_b32(),
c));
// Lane-wise a + b. svptrue_b32() makes every 32-bit lane active, so the
// zeroing (_z) form never actually clears a lane here.
99 return svadd_s32_z(svptrue_b32(),
a,
b);
// Lane-wise a - b, again with all lanes active.
105 return svsub_s32_z(svptrue_b32(),
a,
b);
// pnegate: returns -a in every lane (svneg with an all-true predicate).
109 EIGEN_STRONG_INLINE PacketXi
pnegate(
const PacketXi&
a)
111 return svneg_s32_z(svptrue_b32(),
a);
// pconj: only the signature survives in this listing; the body is not shown.
115 EIGEN_STRONG_INLINE PacketXi
pconj(
const PacketXi&
a)
// Lane-wise a * b over all 32-bit lanes.
123 return svmul_s32_z(svptrue_b32(),
a,
b);
// Lane-wise signed integer division a / b over all 32-bit lanes.
129 return svdiv_s32_z(svptrue_b32(),
a,
b);
// pmadd: multiply-accumulate. svmla takes the accumulator first, so the call
// below evaluates c + a * b in every lane.
133 EIGEN_STRONG_INLINE PacketXi
pmadd(
const PacketXi&
a,
const PacketXi&
b,
const PacketXi&
c)
135 return svmla_s32_z(svptrue_b32(),
c,
a,
b);
// Lane-wise signed minimum of a and b.
141 return svmin_s32_z(svptrue_b32(),
a,
b);
// Lane-wise signed maximum of a and b.
147 return svmax_s32_z(svptrue_b32(),
a,
b);
// Comparison masks: the compare yields a predicate, and the zeroing svdup turns
// it into a vector with all bits set where the predicate holds and 0 elsewhere.
// a <= b
153 return svdup_n_s32_z(svcmple_s32(svptrue_b32(),
a,
b), 0xffffffffu);
// a < b
159 return svdup_n_s32_z(svcmplt_s32(svptrue_b32(),
a,
b), 0xffffffffu);
// a == b
165 return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(),
a,
b), 0xffffffffu);
// Constant vector with every bit of every lane set.
171 return svdup_n_s32_z(svptrue_b32(), 0xffffffffu);
// Constant vector with every lane equal to 0.
177 return svdup_n_s32_z(svptrue_b32(), 0);
// Bitwise a & b.
183 return svand_s32_z(svptrue_b32(),
a,
b);
// Bitwise a | b.
189 return svorr_s32_z(svptrue_b32(),
a,
b);
// Bitwise a ^ b.
195 return sveor_s32_z(svptrue_b32(),
a,
b);
// Bit clear: a & ~b.
201 return svbic_s32_z(svptrue_b32(),
a,
b);
// Arithmetic (sign-preserving) right shift of every 32-bit lane by N bits.
// The plain ASR intrinsic is used on purpose: svasrd is "shift right for
// divide", which rounds negative lanes toward zero (-1 shifted by 1 would give
// 0 instead of -1) and only accepts immediates in [1, 32], so it neither
// matches a >> N nor compiles for N == 0.
// NOTE(review): the enclosing signature is not part of this listing.
207 return svasr_n_s32_z(svptrue_b32(),
a, N);
// Logical right shift by N: the lanes are reinterpreted as unsigned so that
// zeros are shifted in, then reinterpreted back to signed.
213 return svreinterpret_s32_u32(svlsr_n_u32_z(svptrue_b32(), svreinterpret_u32_s32(
a), N));
// Left shift of every lane by N bits.
219 return svlsl_n_s32_z(svptrue_b32(),
a, N);
// Duplicating load: start from indices 0,1,2,...; interleaving the index vector
// with itself gives 0,0,1,1,2,2,..., so the gather reads each element of
// `from` twice in a row.
237 svuint32_t indices = svindex_u32(0, 1);
238 indices = svzip1_u32(indices, indices);
239 return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
// Same idea with two interleave steps: indices become 0,0,0,0,1,1,1,1,..., so
// each element of `from` is replicated four times.
245 svuint32_t indices = svindex_u32(0, 1);
246 indices = svzip1_u32(indices, indices);
247 indices = svzip1_u32(indices, indices);
248 return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
// pstore<int32_t>: stores the packet `from` to `to`. Only the signature is
// present in this listing; the body is not shown.
252 EIGEN_STRONG_INLINE
void pstore<numext::int32_t>(
numext::int32_t* to,
const PacketXi& from)
// pstoreu<int32_t>: same parameters as pstore. Only the signature is present
// in this listing; the body is not shown.
258 EIGEN_STRONG_INLINE
void pstoreu<numext::int32_t>(
numext::int32_t* to,
const PacketXi& from)
// Strided gather: element indices 0, stride, 2*stride, ... are read from `from`.
// NOTE(review): svindex_s32 takes 32-bit arguments; if `stride` has a wider
// type it is narrowed here - confirm against the (not shown) signature.
267 svint32_t indices = svindex_s32(0, stride);
268 return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
// Strided scatter: lane k of `from` is written to to[k * stride].
275 svint32_t indices = svindex_s32(0, stride);
276 svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
// Extract lane 0: svlasta returns the element after the last active one, and
// with an all-false predicate that is the first element of the vector.
283 return svlasta_s32(svpfalse_b(),
a);
// preverse: only the signature is present in this listing; the body is not shown.
287 EIGEN_STRONG_INLINE PacketXi
preverse(
const PacketXi&
a)
// pabs: lane-wise absolute value of a.
293 EIGEN_STRONG_INLINE PacketXi
pabs(
const PacketXi&
a)
295 return svabs_s32_z(svptrue_b32(),
a);
// Horizontal product of all lanes (fragment). The line below is the tail of a
// static assertion whose condition is not part of this listing.
308 EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
// Multiply the vector by its own reversal, so lane i holds a[i] * a[last - i].
311 svint32_t prod = svmul_s32_z(svptrue_b32(),
a, svrev_s32(
a));
// Folding steps: svtbl with indices k, k+1, ... moves the lanes starting at k
// down to lane 0 (out-of-range indices yield 0), and the result is multiplied
// into `prod`. The steps for k = 32, 16, 8, 4 are guarded by the vector length.
// NOTE(review): the declaration of half_prod and the closing braces of the
// if-blocks are not part of this listing.
315 if (EIGEN_ARM64_SVE_VL >= 2048) {
316 half_prod = svtbl_s32(prod, svindex_u32(32, 1));
317 prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
319 if (EIGEN_ARM64_SVE_VL >= 1024) {
320 half_prod = svtbl_s32(prod, svindex_u32(16, 1));
321 prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
323 if (EIGEN_ARM64_SVE_VL >= 512) {
324 half_prod = svtbl_s32(prod, svindex_u32(8, 1));
325 prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
327 if (EIGEN_ARM64_SVE_VL >= 256) {
328 half_prod = svtbl_s32(prod, svindex_u32(4, 1));
329 prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
// Last fold with k = 2.
// NOTE(review): presumably outside the conditionals above - confirm.
332 half_prod = svtbl_s32(prod, svindex_u32(2, 1));
333 prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
// Horizontal minimum over all lanes of a.
342 return svminv_s32(svptrue_b32(),
a);
// Horizontal maximum over all lanes of a.
348 return svmaxv_s32(svptrue_b32(),
a);
// Transpose fragment. stride_index holds 0, N, 2N, ..., so scattering packet i
// to `buffer + i` places its lane j at buffer[i + j * N].
// NOTE(review): the declarations of `buffer` and `i`, and the body of the
// second loop, are not part of this listing.
356 PacketXi stride_index = svindex_s32(0, N);
358 for (
i = 0;
i < N;
i++) {
359 svst1_scatter_s32index_s32(svptrue_b32(), buffer +
i, stride_index, kernel.packet[
i]);
// Second pass over the N packets (body not shown).
361 for (
i = 0;
i < N;
i++) {
// PacketXf: SVE vector of 32-bit floats whose width is fixed to
// EIGEN_ARM64_SVE_VL bits through the arm_sve_vector_bits attribute.
368 typedef svfloat32_t PacketXf
__attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
// Traits for float: both the full packet type and the "half" packet type are
// PacketXf. The remaining trait members are not part of this listing.
371 struct packet_traits<float> : default_packet_traits {
372 typedef PacketXf type;
373 typedef PacketXf
half;
// Reverse mapping for PacketXf: `half` is PacketXf, the matching integer packet
// is PacketXi, and masked loads/stores are flagged as unavailable.
409 struct unpacket_traits<PacketXf> {
411 typedef PacketXf half;
412 typedef PacketXi integer_packet;
418 masked_load_available =
false,
419 masked_store_available =
false
// Broadcast the float `from` into every lane.
426 return svdup_n_f32(from);
// Broadcast the 32-bit pattern `from` into every lane and reinterpret the bits
// as float (no numeric conversion).
432 return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from));
// Broadcast `a` to all lanes and add, lane by lane, the vector loaded from `c`.
// NOTE(review): the definition of `c` is not part of this listing.
440 return svadd_f32_z(svptrue_b32(),
pset1<PacketXf>(
a), svld1_f32(svptrue_b32(),
c));
// Lane-wise a + b with every 32-bit lane active.
446 return svadd_f32_z(svptrue_b32(),
a,
b);
// Lane-wise a - b with every 32-bit lane active.
452 return svsub_f32_z(svptrue_b32(),
a,
b);
// pnegate: returns -a in every lane.
456 EIGEN_STRONG_INLINE PacketXf
pnegate(
const PacketXf&
a)
458 return svneg_f32_z(svptrue_b32(),
a);
// pconj: only the signature survives in this listing; the body is not shown.
462 EIGEN_STRONG_INLINE PacketXf
pconj(
const PacketXf&
a)
// Lane-wise a * b.
470 return svmul_f32_z(svptrue_b32(),
a,
b);
// Lane-wise a / b.
476 return svdiv_f32_z(svptrue_b32(),
a,
b);
// pmadd: multiply-accumulate. svmla takes the accumulator first, so the call
// below evaluates c + a * b in every lane.
480 EIGEN_STRONG_INLINE PacketXf
pmadd(
const PacketXf&
a,
const PacketXf&
b,
const PacketXf&
c)
482 return svmla_f32_z(svptrue_b32(),
c,
a,
b);
// Lane-wise minimum using svmin.
488 return svmin_f32_z(svptrue_b32(),
a,
b);
// Lane-wise minimum using svminnm, the "minimum number" form: when exactly one
// operand is a quiet NaN the numeric operand is returned.
500 return svminnm_f32_z(svptrue_b32(),
a,
b);
// Lane-wise maximum using svmax.
506 return svmax_f32_z(svptrue_b32(),
a,
b);
// Lane-wise maximum using svmaxnm, the "maximum number" counterpart of the
// svminnm case above.
518 return svmaxnm_f32_z(svptrue_b32(),
a,
b);
// Comparison masks: the compare yields a predicate, the zeroing svdup expands
// it to all-ones / zero lanes, and the bits are reinterpreted as float.
// a <= b
526 return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(),
a,
b), 0xffffffffu));
// a < b
532 return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(),
a,
b), 0xffffffffu));
// a == b
538 return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(),
a,
b), 0xffffffffu));
// NOT (a >= b): the >= compare is false for NaN operands, so the negated
// predicate is set both when a < b and when either input is NaN.
547 return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(),
a,
b)), 0xffffffffu));
// Round every lane toward minus infinity (floor).
553 return svrintm_f32_z(svptrue_b32(),
a);
// Constant vector with every bit of every lane set, viewed as float.
559 return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu));
// Bitwise operations on floats: both operands are reinterpreted as unsigned
// 32-bit lanes, combined, and the result is reinterpreted back to float.
// a & b
566 return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(
a), svreinterpret_u32_f32(
b)));
// a | b
572 return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(
a), svreinterpret_u32_f32(
b)));
// a ^ b
578 return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(
a), svreinterpret_u32_f32(
b)));
// a & ~b (bit clear)
584 return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(
a), svreinterpret_u32_f32(
b)));
// Duplicating load: indices 0,1,2,... interleaved with themselves become
// 0,0,1,1,2,2,..., so the gather reads each element of `from` twice in a row.
602 svuint32_t indices = svindex_u32(0, 1);
603 indices = svzip1_u32(indices, indices);
604 return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
// Two interleave steps give 0,0,0,0,1,1,1,1,..., replicating each element of
// `from` four times.
610 svuint32_t indices = svindex_u32(0, 1);
611 indices = svzip1_u32(indices, indices);
612 indices = svzip1_u32(indices, indices);
613 return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
// Strided gather: element indices 0, stride, 2*stride, ... are read from `from`.
// NOTE(review): svindex_s32 takes 32-bit arguments; if `stride` has a wider
// type it is narrowed here - confirm against the (not shown) signature.
632 svint32_t indices = svindex_s32(0, stride);
633 return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
// Strided scatter: lane k of `from` is written to to[k * stride].
640 svint32_t indices = svindex_s32(0, stride);
641 svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
// Extract lane 0: svlasta returns the element after the last active one, and
// with an all-false predicate that is the first element of the vector.
648 return svlasta_f32(svpfalse_b(),
a);
// preverse: only the signature is present in this listing; the body is not shown.
652 EIGEN_STRONG_INLINE PacketXf
preverse(
const PacketXf&
a)
// pabs: lane-wise absolute value of a.
658 EIGEN_STRONG_INLINE PacketXf
pabs(
const PacketXf&
a)
660 return svabs_f32_z(svptrue_b32(),
a);
// Horizontal sum of all lanes of a.
674 return svaddv_f32(svptrue_b32(),
a);
// Horizontal product of all lanes (fragment). The line below is the tail of a
// static assertion whose condition is not part of this listing.
684 EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
// Multiply the vector by its own reversal, so lane i holds a[i] * a[last - i].
686 svfloat32_t prod = svmul_f32_z(svptrue_b32(),
a, svrev_f32(
a));
// Scratch vector for the lanes pulled down in each folding step.
687 svfloat32_t half_prod;
// Folding steps: svtbl with indices k, k+1, ... moves the lanes starting at k
// down to lane 0 (out-of-range indices yield 0), and the result is multiplied
// into `prod`. The steps for k = 32, 16, 8, 4 are guarded by the vector length.
// NOTE(review): the closing braces of the if-blocks are not part of this listing.
690 if (EIGEN_ARM64_SVE_VL >= 2048) {
691 half_prod = svtbl_f32(prod, svindex_u32(32, 1));
692 prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
694 if (EIGEN_ARM64_SVE_VL >= 1024) {
695 half_prod = svtbl_f32(prod, svindex_u32(16, 1));
696 prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
698 if (EIGEN_ARM64_SVE_VL >= 512) {
699 half_prod = svtbl_f32(prod, svindex_u32(8, 1));
700 prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
702 if (EIGEN_ARM64_SVE_VL >= 256) {
703 half_prod = svtbl_f32(prod, svindex_u32(4, 1));
704 prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
// Last fold with k = 2.
// NOTE(review): presumably outside the conditionals above - confirm.
707 half_prod = svtbl_f32(prod, svindex_u32(2, 1));
708 prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
// Horizontal minimum over all lanes of a.
717 return svminv_f32(svptrue_b32(),
a);
// Horizontal maximum over all lanes of a.
723 return svmaxv_f32(svptrue_b32(),
a);
// Transpose fragment. stride_index holds 0, N, 2N, ..., so scattering packet i
// to `buffer + i` places its lane j at buffer[i + j * N].
// NOTE(review): the declarations of `buffer` and `i`, and the body of the
// second loop, are not part of this listing.
732 PacketXi stride_index = svindex_s32(0, N);
734 for (
i = 0;
i < N;
i++) {
735 svst1_scatter_s32index_f32(svptrue_b32(), buffer +
i, stride_index, kernel.packet[
i]);
// Second pass over the N packets (body not shown).
738 for (
i = 0;
i < N;
i++) {
#define EIGEN_DEBUG_ALIGNED_STORE
#define EIGEN_DEBUG_ALIGNED_LOAD
#define EIGEN_DEBUG_UNALIGNED_STORE
#define EIGEN_DEBUG_UNALIGNED_LOAD
#define EIGEN_DEVICE_FUNC
#define EIGEN_STATIC_ASSERT(X, MSG)
PacketXf pmax< PropagateNumbers, PacketXf >(const PacketXf &a, const PacketXf &b)
PacketXf pgather< float, PacketXf >(const float *from, Index stride)
void pstore< float >(float *to, const Packet4f &from)
PacketXi pand< PacketXi >(const PacketXi &a, const PacketXi &b)
PacketXf pmin< PacketXf >(const PacketXf &a, const PacketXf &b)
PacketXi pzero< PacketXi >(const PacketXi &)
PacketXi psub< PacketXi >(const PacketXi &a, const PacketXi &b)
PacketXf psub< PacketXf >(const PacketXf &a, const PacketXf &b)
numext::int32_t predux_mul< PacketXi >(const PacketXi &a)
PacketXi pmax< PacketXi >(const PacketXi &a, const PacketXi &b)
PacketXf pmul< PacketXf >(const PacketXf &a, const PacketXf &b)
PacketXi plset< PacketXi >(const numext::int32_t &a)
PacketXf pmax< PropagateNaN, PacketXf >(const PacketXf &a, const PacketXf &b)
PacketXf pmin< PropagateNaN, PacketXf >(const PacketXf &a, const PacketXf &b)
Packet4f pabs(const Packet4f &a)
PacketXf pmax< PacketXf >(const PacketXf &a, const PacketXf &b)
float predux_min< PacketXf >(const PacketXf &a)
Packet2cf pnegate(const Packet2cf &a)
PacketXf pset1< PacketXf >(const float &from)
PacketXf pcmp_lt_or_nan< PacketXf >(const PacketXf &a, const PacketXf &b)
PacketXi por< PacketXi >(const PacketXi &a, const PacketXi &b)
PacketXf pset1frombits< PacketXf >(numext::uint32_t from)
Packet4i plogical_shift_right(const Packet4i &a)
Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
PacketXf ploadquad< PacketXf >(const float *from)
float predux_max< PacketXf >(const PacketXf &a)
PacketXi pcmp_lt< PacketXi >(const PacketXi &a, const PacketXi &b)
PacketXf pldexp< PacketXf >(const PacketXf &a, const PacketXf &exponent)
PacketXi pcmp_eq< PacketXi >(const PacketXi &a, const PacketXi &b)
void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
PacketXf pfrexp< PacketXf >(const PacketXf &a, PacketXf &exponent)
float predux_mul< PacketXf >(const PacketXf &a)
Packet pfrexp_generic(const Packet &a, Packet &exponent)
Packet pldexp_generic(const Packet &a, const Packet &exponent)
PacketXf pcmp_eq< PacketXf >(const PacketXf &a, const PacketXf &b)
PacketXi ploadquad< PacketXi >(const numext::int32_t *from)
PacketXi pset1< PacketXi >(const numext::int32_t &from)
PacketXi pcmp_le< PacketXi >(const PacketXi &a, const PacketXi &b)
PacketXf pcmp_le< PacketXf >(const PacketXf &a, const PacketXf &b)
PacketXi pandnot< PacketXi >(const PacketXi &a, const PacketXi &b)
PacketXi pdiv< PacketXi >(const PacketXi &a, const PacketXi &b)
PacketXf pmin< PropagateNumbers, PacketXf >(const PacketXf &a, const PacketXf &b)
void pstoreu< float >(float *to, const Packet4f &from)
PacketXi ptrue< PacketXi >(const PacketXi &)
PacketXf pxor< PacketXf >(const PacketXf &a, const PacketXf &b)
float pfirst< PacketXf >(const PacketXf &a)
PacketXf ploaddup< PacketXf >(const float *from)
PacketXf padd< PacketXf >(const PacketXf &a, const PacketXf &b)
numext::int32_t pfirst< PacketXi >(const PacketXi &a)
Packet2cf pconj(const Packet2cf &a)
PacketXf plset< PacketXf >(const float &a)
PacketXf pfloor< PacketXf >(const PacketXf &a)
PacketXf ptrue< PacketXf >(const PacketXf &)
PacketXi padd< PacketXi >(const PacketXi &a, const PacketXi &b)
Packet4i plogical_shift_left(const Packet4i &a)
Packet2cf preverse(const Packet2cf &a)
PacketXi pmul< PacketXi >(const PacketXi &a, const PacketXi &b)
PacketXi ploaddup< PacketXi >(const numext::int32_t *from)
Packet4i parithmetic_shift_right(const Packet4i &a)
PacketXi pload< PacketXi >(const numext::int32_t *from)
svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)))
numext::int32_t predux< PacketXi >(const PacketXi &a)
void pscatter< float, PacketXf >(float *to, const PacketXf &from, Index stride)
PacketXf pload< PacketXf >(const float *from)
PacketXi pmin< PacketXi >(const PacketXi &a, const PacketXi &b)
PacketXf pand< PacketXf >(const PacketXf &a, const PacketXf &b)
float predux< PacketXf >(const PacketXf &a)
PacketXf pandnot< PacketXf >(const PacketXf &a, const PacketXf &b)
numext::int32_t predux_max< PacketXi >(const PacketXi &a)
PacketXf por< PacketXf >(const PacketXf &a, const PacketXf &b)
PacketXf pdiv< PacketXf >(const PacketXf &a, const PacketXf &b)
PacketXi pxor< PacketXi >(const PacketXi &a, const PacketXi &b)
PacketXf pcmp_lt< PacketXf >(const PacketXf &a, const PacketXf &b)
PacketXi ploadu< PacketXi >(const numext::int32_t *from)
numext::int32_t predux_min< PacketXi >(const PacketXi &a)
PacketXf ploadu< PacketXf >(const float *from)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.