#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

#include "../InternalHeaderCheck.h"
template<typename LhsScalar_, typename RhsScalar_, bool ConjLhs_=false, bool ConjRhs_=false,
         int Arch=Architecture::Target, int PacketSize_=GEBPPacketFull>
class gebp_traits;
#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
#endif
#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
#endif
#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
#endif
#if EIGEN_ARCH_i386_OR_x86_64

#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
CacheSizes() : m_l1(-1), m_l2(-1), m_l3(-1) {

static CacheSizes m_cacheSizes;
m_cacheSizes.m_l1 = *l1;
m_cacheSizes.m_l2 = *l2;
m_cacheSizes.m_l3 = *l3;

*l1 = m_cacheSizes.m_l1;
*l2 = m_cacheSizes.m_l2;
*l3 = m_cacheSizes.m_l3;
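// Sketch of how these cached values are read and written (hedged: assumes the
// surrounding manage_caching_sizes(Action, l1, l2, l3) helper these statements
// belong to, and Eigen's public setCpuCacheSizes()/l1CacheSize() wrappers):
//
//   std::ptrdiff_t l1, l2, l3;
//   internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);   // read current sizes
//   // or force sizes for a CPU that is not detected correctly:
//   // Eigen::setCpuCacheSizes(32*1024, 512*1024, 8*1024*1024);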
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>

typedef gebp_traits<LhsScalar,RhsScalar> Traits;

std::ptrdiff_t l1, l2, l3;
#ifdef EIGEN_VECTORIZE_AVX512

if (num_threads > 1) {
  typedef typename Traits::ResScalar ResScalar;
  enum {
    kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
    ksub = Traits::mr * Traits::nr * sizeof(ResScalar)
  };

  const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
  k = k_cache - (k_cache % kr);
  const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
  if (n_cache <= n_per_thread) {
    n = n_cache - (n_cache % nr);
  } else {
    n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
  }

  const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
  if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
    m = m_cache - (m_cache % mr);
  } else {
    m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
  }
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS

typedef typename Traits::ResScalar ResScalar;
enum {
  k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
  k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
};
const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
const Index old_k = k;

k = (k%max_kc)==0 ? max_kc
  : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));

eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
const Index actual_l2 = l3;
#else
const Index actual_l2 = 1572864; // == 1.5 MB
#endif
const Index lhs_bytes = m * k * sizeof(LhsScalar);
const Index remaining_l1 = l1 - k_sub - lhs_bytes;
if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
{
  // L1 blocking
  max_nc = remaining_l1 / (k*sizeof(RhsScalar));
}
else
{
  // L2 blocking
  max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
}
Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
  : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));

Index problem_size = k*n*sizeof(LhsScalar);
Index actual_lm = actual_l2;
if(problem_size<=1024)
{
  // problem is small enough to keep in L1
  actual_lm = l1;
}
else if(l3!=0 && problem_size<=32768)
{
  // we have both L2 and L3, and problem is small enough to stay in L2
  actual_lm = l2;
  max_mc = (numext::mini<Index>)(576,max_mc);
}
Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
if (mc > Traits::mr) mc -= mc % Traits::mr;
else if (mc==0) return;
m = (m%mc)==0 ? mc
  : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
template <typename Index>

#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
  k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
  m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
  n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
}
#endif
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>

  evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
template<typename LhsScalar, typename RhsScalar, typename Index>

  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
struct RhsPanelHelper {

  typedef std::conditional_t<remaining_registers >= 4, RhsPacketx4, RhsPacket> type;
template <typename Packet>
struct QuadPacket
{
  Packet B_0, B1, B2, B3;
  const Packet& get(const FixedInt<0>&) const { return B_0; }
  const Packet& get(const FixedInt<1>&) const { return B1; }
  const Packet& get(const FixedInt<2>&) const { return B2; }
  const Packet& get(const FixedInt<3>&) const { return B3; }
};
template <int N, typename T1, typename T2, typename T3>
struct packet_conditional { typedef T3 type; };

template <typename T1, typename T2, typename T3>
struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };

template <typename T1, typename T2, typename T3>
struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size)                         \
  typedef typename packet_conditional<packet_size,                                   \
                                      typename packet_traits<name ## Scalar>::type,  \
                                      typename packet_traits<name ## Scalar>::half,  \
                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  name ## Packet ## postfix

#define PACKET_DECL_COND(name, packet_size)                                          \
  typedef typename packet_conditional<packet_size,                                   \
                                      typename packet_traits<name ## Scalar>::type,  \
                                      typename packet_traits<name ## Scalar>::half,  \
                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  name ## Packet

#define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size)                        \
  typedef typename packet_conditional<packet_size,                                   \
                                      typename packet_traits<Scalar>::type,          \
                                      typename packet_traits<Scalar>::half,          \
                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
  ScalarPacket ## postfix

#define PACKET_DECL_COND_SCALAR(packet_size)                                         \
  typedef typename packet_conditional<packet_size,                                   \
                                      typename packet_traits<Scalar>::type,          \
                                      typename packet_traits<Scalar>::half,          \
                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
  ScalarPacket
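// Expansion example: inside a traits class that declares `typedef float LhsScalar;`,
//   PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
// expands to a `LhsPacket_` typedef naming the full, half or quarter float packet
// depending on the PacketSize_ tag, via the packet_conditional selector above.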
template<typename LhsScalar_, typename RhsScalar_, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>

typedef LhsScalar_ LhsScalar;
typedef RhsScalar_ RhsScalar;
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,

default_mr = (plain_enum_min(16, NumberOfRegisters)/2/nr)*LhsPacketSize,

mr = Vectorizable ? 3*LhsPacketSize : default_mr,

LhsProgress = LhsPacketSize,

typedef std::conditional_t<Vectorizable,LhsPacket_,LhsScalar> LhsPacket;
typedef std::conditional_t<Vectorizable,RhsPacket_,RhsScalar> RhsPacket;
typedef std::conditional_t<Vectorizable,ResPacket_,ResScalar> ResPacket;
typedef LhsPacket LhsPacket4Packing;

typedef QuadPacket<RhsPacket> RhsPacketx4;
typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
{
  p = pset1<ResPacket>(ResScalar(0));
}

template<typename RhsPacketType>
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
{
  dest = pset1<RhsPacketType>(*b);
}

EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
template<typename RhsPacketType>
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const

EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
{}

EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
{
  dest = ploadquad<RhsPacket>(b);
}
template<typename LhsPacketType>
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
{
  dest = pload<LhsPacketType>(a);
}

template<typename LhsPacketType>
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
{
  dest = ploadu<LhsPacketType>(a);
}
template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
{
  conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
  c = cj.pmadd(a,b,c);
#else
  tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
#endif
}
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
{
  madd(a, b.get(lane), c, tmp, lane);
}

EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
template<typename ResPacketHalf>
EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
template<typename RealScalar, bool ConjLhs_, int Arch, int PacketSize_>
class gebp_traits<std::complex<RealScalar>, RealScalar, ConjLhs_, false, Arch, PacketSize_>

typedef std::complex<RealScalar> LhsScalar;
typedef RealScalar RhsScalar;
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,

#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
mr = 3*LhsPacketSize,

LhsProgress = LhsPacketSize,

typedef std::conditional_t<Vectorizable,LhsPacket_,LhsScalar> LhsPacket;
typedef std::conditional_t<Vectorizable,RhsPacket_,RhsScalar> RhsPacket;
typedef std::conditional_t<Vectorizable,ResPacket_,ResScalar> ResPacket;
typedef LhsPacket LhsPacket4Packing;

typedef QuadPacket<RhsPacket> RhsPacketx4;

typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
{
  p = pset1<ResPacket>(ResScalar(0));
}

template<typename RhsPacketType>
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
{
  dest = pset1<RhsPacketType>(*b);
}

EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
template<typename RhsPacketType>
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const

EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const

EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
{
  loadRhsQuad_impl(b,dest, std::conditional_t<RhsPacketSize==16,true_type,false_type>());
}
EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
{
  // duplicate each real rhs coefficient so it spans one complex lhs entry
  RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
  dest = ploadquad<RhsPacket>(tmp);
}

EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
{
  dest = pset1<RhsPacket>(*b);
}
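// Rationale for the {b[0],b[0],b[1],b[1]} staging above (complex lhs * real rhs):
// each complex lhs entry occupies two real lanes, so every real rhs coefficient
// must appear twice in the packet for the lane-wise product to line up with the
// interleaved (real,imag) layout of the lhs.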
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
  dest = pload<LhsPacket>(a);
}

template<typename LhsPacketType>
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
{
  dest = ploadu<LhsPacketType>(a);
}
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
{
  madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable,true_type,false_type>());
}
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
{
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD

EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar&, const false_type&) const
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
{
  madd(a, b.get(lane), c, tmp, lane);
}

template <typename ResPacketType, typename AccPacketType>
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
{
  conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
  r = cj.pmadd(c,alpha,r);
}
template<typename Packet>
struct DoublePacket
{
  Packet first;
  Packet second;
};

template<typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
{
  DoublePacket<Packet> res;
  res.first  = padd(a.first,  b.first);
  res.second = padd(a.second, b.second);
  return res;
}
template<typename Packet>
const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet> &a,
                                               std::enable_if_t<unpacket_traits<Packet>::size<=8>* = 0)

template<typename Packet>
DoublePacket<typename unpacket_traits<Packet>::half>
predux_half_dowto4(const DoublePacket<Packet> &a)
{
  DoublePacket<typename unpacket_traits<Packet>::half> res;
  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
  typedef typename packet_traits<Cplx>::type CplxPacket;
template<typename Scalar, typename RealPacket>
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                            std::enable_if_t<unpacket_traits<RealPacket>::size<=8>* = 0)

template<typename Scalar, typename RealPacket>

  dest.first  = ploadquad<RealPacket>(r);
  dest.second = ploadquad<RealPacket>(i);
template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
};
template<typename RealScalar, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_, ConjRhs_, Arch, PacketSize_ >

typedef std::complex<RealScalar> Scalar;
typedef std::complex<RealScalar> LhsScalar;
typedef std::complex<RealScalar> RhsScalar;
typedef std::complex<RealScalar> ResScalar;

Vectorizable = unpacket_traits<RealPacket>::vectorizable
            && unpacket_traits<ScalarPacket>::vectorizable,

LhsProgress = ResPacketSize,

typedef DoublePacket<RealPacket> DoublePacketType;

typedef std::conditional_t<Vectorizable,ScalarPacket,Scalar> LhsPacket4Packing;
typedef std::conditional_t<Vectorizable,RealPacket, Scalar> LhsPacket;
typedef std::conditional_t<Vectorizable,DoublePacketType,Scalar> RhsPacket;
typedef std::conditional_t<Vectorizable,ScalarPacket,Scalar> ResPacket;
typedef std::conditional_t<Vectorizable,DoublePacketType,Scalar> AccPacket;

typedef QuadPacket<RhsPacket> RhsPacketx4;
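// Accumulation scheme for complex*complex: AccPacket is a DoublePacket whose
// `first` half accumulates products of the lhs with the broadcast real parts of
// the rhs and whose `second` half accumulates products with the imaginary parts;
// the two halves are only recombined, with conjugation-dependent signs, in acc().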
EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
{
  p.first  = pset1<RealPacket>(RealScalar(0));
  p.second = pset1<RealPacket>(RealScalar(0));
}

EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
{
  dest = pset1<ScalarPacket>(*b);
}
template<typename RealPacketType>
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const

EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
{
  loadRhs(b, dest.B_0);
  loadRhs(b + 1, dest.B1);
  loadRhs(b + 2, dest.B2);
  loadRhs(b + 3, dest.B3);
}
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const

template<typename RealPacketType>
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const

EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}

EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const

EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
}

template<typename LhsPacketType>
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
{
  dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
}
template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
std::enable_if_t<!is_same<RhsPacketType,RhsPacketx4>::value>
madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType&, const LaneIdType&) const

template<typename LaneIdType>
EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket&, const LaneIdType&) const
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
{
  madd(a, b.get(lane), c, tmp, lane);
}

EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
template<typename RealPacketType, typename ResPacketType>
EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
{
  if((!ConjLhs)&&(!ConjRhs))
  {
    tmp = padd(ResPacketType(c.first),tmp);
  }
  else if((!ConjLhs)&&(ConjRhs))
  {
    tmp = padd(ResPacketType(c.first),tmp);
  }
  else if((ConjLhs)&&(!ConjRhs))
  {
    tmp = padd(pconj(ResPacketType(c.first)),tmp);
  }
  else if((ConjLhs)&&(ConjRhs))
  {
    tmp = psub(pconj(ResPacketType(c.first)),tmp);
  }

  r = pmadd(tmp,alpha,r);
}
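// Recombination sketch: with the lhs stored as interleaved (re,im) lanes,
// c.first holds (re(a)*re(b), im(a)*re(b)) and c.second holds
// (re(a)*im(b), im(a)*im(b)) lane-wise. Each branch first conjugate-flips
// c.second (via pcplxflip/pconj) so that the padd/psub above yields
// (re(a)re(b) - im(a)im(b), im(a)re(b) + re(a)im(b)) in the no-conjugation
// case, i.e. the complex product, before the final scaling by alpha.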
conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;

template<typename RealScalar, bool ConjRhs_, int Arch, int PacketSize_>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, ConjRhs_, Arch, PacketSize_ >

typedef std::complex<RealScalar> Scalar;
typedef RealScalar LhsScalar;
typedef Scalar RhsScalar;
typedef Scalar ResScalar;
#undef PACKET_DECL_COND_SCALAR_POSTFIX
#undef PACKET_DECL_COND_POSTFIX
#undef PACKET_DECL_COND_SCALAR
#undef PACKET_DECL_COND

Vectorizable = unpacket_traits<RealPacket_>::vectorizable
            && unpacket_traits<ScalarPacket_>::vectorizable,

LhsProgress = ResPacketSize,

typedef std::conditional_t<Vectorizable,LhsPacket_,LhsScalar> LhsPacket;
typedef std::conditional_t<Vectorizable,RhsPacket_,RhsScalar> RhsPacket;
typedef std::conditional_t<Vectorizable,ResPacket_,ResScalar> ResPacket;
typedef LhsPacket LhsPacket4Packing;
typedef QuadPacket<RhsPacket> RhsPacketx4;
typedef ResPacket AccPacket;
EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
{
  p = pset1<ResPacket>(ResScalar(0));
}

template<typename RhsPacketType>
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
{
  dest = pset1<RhsPacketType>(*b);
}

EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
template<typename RhsPacketType>
EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const

EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const

EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
{
  dest = ploaddup<LhsPacket>(a);
}

EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
{
  dest = ploadquad<RhsPacket>(b);
}

template<typename LhsPacketType>
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
{
  dest = ploaddup<LhsPacketType>(a);
}
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
{
  madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable,true_type,false_type>());
}
template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
{
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD

EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar&, const false_type&) const
template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
{
  madd(a, b.get(lane), c, tmp, lane);
}

template <typename ResPacketType, typename AccPacketType>
EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
{
  conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
  r = cj.pmadd(alpha,c,r);
}
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>

typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;

typedef typename Traits::ResScalar ResScalar;
typedef typename Traits::LhsPacket LhsPacket;
typedef typename Traits::RhsPacket RhsPacket;
typedef typename Traits::ResPacket ResPacket;
typedef typename Traits::AccPacket AccPacket;
typedef typename Traits::RhsPacketx4 RhsPacketx4;

typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 27>::type RhsPanel27;

typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;

typedef typename SwappedTraits::ResScalar SResScalar;
typedef typename SwappedTraits::LhsPacket SLhsPacket;
typedef typename SwappedTraits::RhsPacket SRhsPacket;
typedef typename SwappedTraits::ResPacket SResPacket;
typedef typename SwappedTraits::AccPacket SAccPacket;

typedef typename HalfTraits::LhsPacket LhsPacketHalf;
typedef typename HalfTraits::RhsPacket RhsPacketHalf;
typedef typename HalfTraits::ResPacket ResPacketHalf;
typedef typename HalfTraits::AccPacket AccPacketHalf;

typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
typedef typename QuarterTraits::ResPacket ResPacketQuarter;
typedef typename QuarterTraits::AccPacket AccPacketQuarter;

typedef typename DataMapper::LinearMapper LinearMapper;

enum {
  Vectorizable = Traits::Vectorizable,
  LhsProgress = Traits::LhsProgress,
  LhsProgressHalf = HalfTraits::LhsProgress,
  LhsProgressQuarter = QuarterTraits::LhsProgress,
  RhsProgress = Traits::RhsProgress,
  RhsProgressHalf = HalfTraits::RhsProgress,
  RhsProgressQuarter = QuarterTraits::RhsProgress,
  ResPacketSize = Traits::ResPacketSize
};
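// The kernel carries full-, half- and quarter-packet traits side by side so the
// row tail of the destination (rows not divisible by the full packet width) can
// still be processed with progressively narrower SIMD loads instead of dropping
// straight to scalar code.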
void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
         int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
struct last_row_process_16_packets
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;

typedef typename Traits::ResScalar ResScalar;
typedef typename SwappedTraits::LhsPacket SLhsPacket;
typedef typename SwappedTraits::RhsPacket SRhsPacket;
typedef typename SwappedTraits::ResPacket SResPacket;
typedef typename SwappedTraits::AccPacket SAccPacket;

EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
                                    ResScalar alpha, SAccPacket &C0)
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;

typedef typename Traits::ResScalar ResScalar;
typedef typename SwappedTraits::LhsPacket SLhsPacket;
typedef typename SwappedTraits::RhsPacket SRhsPacket;
typedef typename SwappedTraits::ResPacket SResPacket;
typedef typename SwappedTraits::AccPacket SAccPacket;

EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
                                    ResScalar alpha, SAccPacket &C0)
{
  typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
  typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
  typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
  typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;

  SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
  SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
if (depth - endk > 0)
{
  for (Index kk = endk; kk < depth; kk++)
  {
    SLhsPacketQuarter a0;
    SRhsPacketQuarter b0;
    straits.loadLhsUnaligned(blB, a0);
    straits.loadRhs(blA, b0);
    straits.madd(a0,b0,c0,b0, fix<0>);
    blB += SwappedTraits::LhsProgress/4;
  }
  straits.acc(c0, alphav, R);
}

res.scatterPacket(i, j2, R);
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
         typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
         typename LinearMapper, typename DataMapper>
struct lhs_process_one_packet
{
  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
                                           LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0,
                                           AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
{
  traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
  traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
  traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
  traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
  traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
  traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
#if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
  __asm__ ("" : "+x,m" (*A0));
#endif
}
const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,

Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
{
#if EIGEN_ARCH_ARM64

  for(Index j2=0; j2<packet_cols8; j2+=8)
  {
    const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
AccPacket C0, C1, C2, C3, C4, C5, C6, C7;

LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
r0.prefetch(prefetch_res_offset);
r1.prefetch(prefetch_res_offset);
r2.prefetch(prefetch_res_offset);
r3.prefetch(prefetch_res_offset);
r4.prefetch(prefetch_res_offset);
r5.prefetch(prefetch_res_offset);
r6.prefetch(prefetch_res_offset);
r7.prefetch(prefetch_res_offset);

const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
for(Index k=0; k<peeled_kc; k+=pk)
{
  RhsPacketx4 rhs_panel;

#define EIGEN_GEBGP_ONESTEP(K) \
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX8"); \
    traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], A0); \
    traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
    traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
    traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
    traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
    traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
    traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
    traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
    traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
    traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
    traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
    traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
    traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
    traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
    traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
    traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
    traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8");
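// Note the loadRhs/updateRhs pairing above: when RhsPacketx4 really holds four
// registers, loadRhs fills all four and updateRhs is a no-op; when register
// pressure forced RhsPanelHelper to fall back to a single RhsPacket, each
// updateRhs re-broadcasts the next rhs coefficient in place. The same macro
// text therefore serves both register budgets.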
  blB += pk*8*RhsProgress;
  blA += pk*(1*LhsProgress);
for(Index k=peeled_kc; k<depth; k++)
{
  RhsPacketx4 rhs_panel;

  blB += 8*RhsProgress;
  blA += 1*LhsProgress;
}
#undef EIGEN_GEBGP_ONESTEP

ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.template loadPacket<ResPacket>(0);
R1 = r1.template loadPacket<ResPacket>(0);
traits.acc(C0, alphav, R0);
traits.acc(C1, alphav, R1);
r0.storePacket(0, R0);
r1.storePacket(0, R1);

R0 = r2.template loadPacket<ResPacket>(0);
R1 = r3.template loadPacket<ResPacket>(0);
traits.acc(C2, alphav, R0);
traits.acc(C3, alphav, R1);
r2.storePacket(0, R0);
r3.storePacket(0, R1);

R0 = r4.template loadPacket<ResPacket>(0);
R1 = r5.template loadPacket<ResPacket>(0);
traits.acc(C4, alphav, R0);
traits.acc(C5, alphav, R1);
r4.storePacket(0, R0);
r5.storePacket(0, R1);

R0 = r6.template loadPacket<ResPacket>(0);
R1 = r7.template loadPacket<ResPacket>(0);
traits.acc(C6, alphav, R0);
traits.acc(C7, alphav, R1);
r6.storePacket(0, R0);
r7.storePacket(0, R1);
for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
{
  const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];

  AccPacket C0, C1, C2, C3;
  AccPacket D0, D1, D2, D3;

  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

  r0.prefetch(prefetch_res_offset);
  r1.prefetch(prefetch_res_offset);
  r2.prefetch(prefetch_res_offset);
  r3.prefetch(prefetch_res_offset);

  const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
for(Index k=0; k<peeled_kc; k+=pk)
{
  RhsPacketx4 rhs_panel;

  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
  peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
  peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
  peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
  peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
  peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
  peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
  peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);

  blB += pk*4*RhsProgress;
  blA += pk*LhsProgress;
}
for(Index k=peeled_kc; k<depth; k++)
{
  RhsPacketx4 rhs_panel;
  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
  blB += 4*RhsProgress;
}

ResPacket alphav = pset1<ResPacket>(alpha);

R0 = r0.template loadPacket<ResPacket>(0);
R1 = r1.template loadPacket<ResPacket>(0);
traits.acc(C0, alphav, R0);
traits.acc(C1, alphav, R1);
r0.storePacket(0, R0);
r1.storePacket(0, R1);

R0 = r2.template loadPacket<ResPacket>(0);
R1 = r3.template loadPacket<ResPacket>(0);
traits.acc(C2, alphav, R0);
traits.acc(C3, alphav, R1);
r2.storePacket(0, R0);
r3.storePacket(0, R1);
for(Index j2=packet_cols4; j2<cols; j2++)
{
  const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];

  LinearMapper r0 = res.getLinearMapper(i, j2);

  const RhsScalar* blB = &blockB[j2*strideB+offsetB];

  for(Index k= 0; k<peeled_kc; k+=pk)
  {
#define EIGEN_GEBGP_ONESTEP(K) \
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
    traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
    traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
    traits.madd(A0, B_0, C0, B_0, fix<0>); \
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1");

    blB += pk*RhsProgress;
    blA += pk*LhsProgress;
  }

  for(Index k=peeled_kc; k<depth; k++)

#undef EIGEN_GEBGP_ONESTEP

  ResPacket alphav = pset1<ResPacket>(alpha);
  R0 = r0.template loadPacket<ResPacket>(0);
  traits.acc(C0, alphav, R0);
  r0.storePacket(0, R0);
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
         typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
         typename LinearMapper, typename DataMapper>
struct lhs_process_fraction_of_packet
  : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
                           RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
{
  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
                                             LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3,
                                             AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
  {
    traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
    traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
    traits.madd(*A0, *B_0, *C0, *B_0);
    traits.madd(*A0, *B1, *C1, *B1);
    traits.madd(*A0, *B2, *C2, *B2);
    traits.madd(*A0, *B3, *C3, *B3);
  }
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>

::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,

  SwappedTraits straits;

  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;

const Index peeled_kc = depth & ~(pk-1);
const int prefetch_res_offset = 32/sizeof(ResScalar);
if(mr>=3*Traits::LhsProgress)
{
  const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
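// Worked example of the panel-height bound (illustrative numbers: AVX floats,
// l1 = 32 KiB, mr = 24, nr = 4, depth = 256, 3*LhsProgress = 24):
//   32768 - 4*24*4 - 256*4*4 = 28288 bytes of L1 left for the lhs panel,
//   28288 / (256*4*24)       = 1,
// so actual_panel_rows = 24, i.e. one 3-packet-high stripe per L1-resident panel.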
for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
{
  const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
#if EIGEN_ARCH_ARM64

  for(Index j2=0; j2<packet_cols8; j2+=8)
  {
    for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
AccPacket C0,  C1,  C2,  C3,  C4,  C5,  C6,  C7,
          C8,  C9,  C10, C11, C12, C13, C14, C15,
          C16, C17, C18, C19, C20, C21, C22, C23;
traits.initAcc(C0);  traits.initAcc(C1);  traits.initAcc(C2);  traits.initAcc(C3);
traits.initAcc(C4);  traits.initAcc(C5);  traits.initAcc(C6);  traits.initAcc(C7);
traits.initAcc(C8);  traits.initAcc(C9);  traits.initAcc(C10); traits.initAcc(C11);
traits.initAcc(C12); traits.initAcc(C13); traits.initAcc(C14); traits.initAcc(C15);
traits.initAcc(C16); traits.initAcc(C17); traits.initAcc(C18); traits.initAcc(C19);
traits.initAcc(C20); traits.initAcc(C21); traits.initAcc(C22); traits.initAcc(C23);
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
LinearMapper r7 = res.getLinearMapper(i, j2 + 7);

const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];

for(Index k=0; k<peeled_kc; k+=pk)
{
  RhsPanel27 rhs_panel;
#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
#else
#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND
#endif

#define EIGEN_GEBP_ONESTEP(K) \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX8"); \
  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
  EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND \
  traits.loadRhs(blB + (0 + 8 * K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
  traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
  traits.madd(A2, rhs_panel, C16, T0, fix<0>); \
  traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
  traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
  traits.madd(A2, rhs_panel, C17, T0, fix<1>); \
  traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
  traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
  traits.madd(A2, rhs_panel, C18, T0, fix<2>); \
  traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
  traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
  traits.madd(A2, rhs_panel, C19, T0, fix<3>); \
  traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
  traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
  traits.madd(A2, rhs_panel, C20, T0, fix<0>); \
  traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
  traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
  traits.madd(A2, rhs_panel, C21, T0, fix<1>); \
  traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
  traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
  traits.madd(A2, rhs_panel, C22, T0, fix<2>); \
  traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
  traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
  traits.madd(A2, rhs_panel, C23, T0, fix<3>); \
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8");
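// Register budget of one 3pX8 step: 24 accumulators (C0..C23) plus three lhs
// packets (A0..A2) plus the rhs panel and temporary essentially exhaust a
// 32-register target, which is why the rhs values are refreshed in place with
// updateRhs instead of being kept in dedicated registers, and why the NEON
// workaround above pins A0..A2 for older gcc.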
  blB += pk * 8 * RhsProgress;
  blA += pk * 3 * Traits::LhsProgress;
}

for (Index k = peeled_kc; k < depth; k++)
{
  RhsPanel27 rhs_panel;

  blB += 8 * RhsProgress;
  blA += 3 * Traits::LhsProgress;
}
#undef EIGEN_GEBP_ONESTEP

ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C8, alphav, R1);
traits.acc(C16, alphav, R2);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r0.storePacket(1 * Traits::ResPacketSize, R1);
r0.storePacket(2 * Traits::ResPacketSize, R2);

R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C1, alphav, R0);
traits.acc(C9, alphav, R1);
traits.acc(C17, alphav, R2);
r1.storePacket(0 * Traits::ResPacketSize, R0);
r1.storePacket(1 * Traits::ResPacketSize, R1);
r1.storePacket(2 * Traits::ResPacketSize, R2);

R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C10, alphav, R1);
traits.acc(C18, alphav, R2);
r2.storePacket(0 * Traits::ResPacketSize, R0);
r2.storePacket(1 * Traits::ResPacketSize, R1);
r2.storePacket(2 * Traits::ResPacketSize, R2);

R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C3, alphav, R0);
traits.acc(C11, alphav, R1);
traits.acc(C19, alphav, R2);
r3.storePacket(0 * Traits::ResPacketSize, R0);
r3.storePacket(1 * Traits::ResPacketSize, R1);
r3.storePacket(2 * Traits::ResPacketSize, R2);

R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r4.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C4, alphav, R0);
traits.acc(C12, alphav, R1);
traits.acc(C20, alphav, R2);
r4.storePacket(0 * Traits::ResPacketSize, R0);
r4.storePacket(1 * Traits::ResPacketSize, R1);
r4.storePacket(2 * Traits::ResPacketSize, R2);

R0 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r5.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C5, alphav, R0);
traits.acc(C13, alphav, R1);
traits.acc(C21, alphav, R2);
r5.storePacket(0 * Traits::ResPacketSize, R0);
r5.storePacket(1 * Traits::ResPacketSize, R1);
r5.storePacket(2 * Traits::ResPacketSize, R2);

R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r6.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C6, alphav, R0);
traits.acc(C14, alphav, R1);
traits.acc(C22, alphav, R2);
r6.storePacket(0 * Traits::ResPacketSize, R0);
r6.storePacket(1 * Traits::ResPacketSize, R1);
r6.storePacket(2 * Traits::ResPacketSize, R2);

R0 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r7.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C7, alphav, R0);
traits.acc(C15, alphav, R1);
traits.acc(C23, alphav, R2);
r7.storePacket(0 * Traits::ResPacketSize, R0);
r7.storePacket(1 * Traits::ResPacketSize, R1);
r7.storePacket(2 * Traits::ResPacketSize, R2);
for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
{
  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
  {
    const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
AccPacket C0, C1, C2, C3,
          C4, C5, C6, C7,
          C8, C9, C10, C11;
traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2);  traits.initAcc(C3);
traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6);  traits.initAcc(C7);
traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];

for(Index k=0; k<peeled_kc; k+=pk)
{
  RhsPanel15 rhs_panel;
#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
#else
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
#endif

#define EIGEN_GEBP_ONESTEP(K) \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
  internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
  if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
    internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
  } \
  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
  EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
  traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
  traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
  traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
  traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
  traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
  traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
  traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
  traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
  traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
  traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
  traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
  traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");
  blB += pk*4*RhsProgress;
  blA += pk*3*Traits::LhsProgress;
}

for(Index k=peeled_kc; k<depth; k++)
{
  RhsPanel15 rhs_panel;

  blB += 4*RhsProgress;
  blA += 3*Traits::LhsProgress;
}
#undef EIGEN_GEBP_ONESTEP

ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C8, alphav, R2);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r0.storePacket(1 * Traits::ResPacketSize, R1);
r0.storePacket(2 * Traits::ResPacketSize, R2);

R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C1, alphav, R0);
traits.acc(C5, alphav, R1);
traits.acc(C9, alphav, R2);
r1.storePacket(0 * Traits::ResPacketSize, R0);
r1.storePacket(1 * Traits::ResPacketSize, R1);
r1.storePacket(2 * Traits::ResPacketSize, R2);

R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C6, alphav, R1);
traits.acc(C10, alphav, R2);
r2.storePacket(0 * Traits::ResPacketSize, R0);
r2.storePacket(1 * Traits::ResPacketSize, R1);
r2.storePacket(2 * Traits::ResPacketSize, R2);

R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C3, alphav, R0);
traits.acc(C7, alphav, R1);
traits.acc(C11, alphav, R2);
r3.storePacket(0 * Traits::ResPacketSize, R0);
r3.storePacket(1 * Traits::ResPacketSize, R1);
r3.storePacket(2 * Traits::ResPacketSize, R2);
for(Index j2=packet_cols4; j2<cols; j2++)
{
  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
  {
    const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];

    AccPacket C0, C4, C8;

    LinearMapper r0 = res.getLinearMapper(i, j2);

    const RhsScalar* blB = &blockB[j2*strideB+offsetB];
    LhsPacket A0, A1, A2;
for(Index k=0; k<peeled_kc; k+=pk)
{
#define EIGEN_GEBGP_ONESTEP(K) \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
  traits.madd(A0, B_0, C0, B_0, fix<0>); \
  traits.madd(A1, B_0, C4, B_0, fix<0>); \
  traits.madd(A2, B_0, C8, B_0, fix<0>); \
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");
  blB += int(pk) * int(RhsProgress);
  blA += int(pk) * 3 * int(Traits::LhsProgress);
}

for(Index k=peeled_kc; k<depth; k++)
{
  blA += 3*Traits::LhsProgress;
}
#undef EIGEN_GEBGP_ONESTEP
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);

R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C8, alphav, R2);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r0.storePacket(1 * Traits::ResPacketSize, R1);
r0.storePacket(2 * Traits::ResPacketSize, R2);
if(mr>=2*Traits::LhsProgress)
{
  Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
{
  Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
#if EIGEN_ARCH_ARM64

  for(Index j2=0; j2<packet_cols8; j2+=8)
  {
    for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
    {
      const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
AccPacket C0, C1, C2,  C3,  C4,  C5,  C6,  C7,
          C8, C9, C10, C11, C12, C13, C14, C15;
traits.initAcc(C0);  traits.initAcc(C1);  traits.initAcc(C2);  traits.initAcc(C3);
traits.initAcc(C4);  traits.initAcc(C5);  traits.initAcc(C6);  traits.initAcc(C7);
traits.initAcc(C8);  traits.initAcc(C9);  traits.initAcc(C10); traits.initAcc(C11);
traits.initAcc(C12); traits.initAcc(C13); traits.initAcc(C14); traits.initAcc(C15);
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
r0.prefetch(prefetch_res_offset);
r1.prefetch(prefetch_res_offset);
r2.prefetch(prefetch_res_offset);
r3.prefetch(prefetch_res_offset);
r4.prefetch(prefetch_res_offset);
r5.prefetch(prefetch_res_offset);
r6.prefetch(prefetch_res_offset);
r7.prefetch(prefetch_res_offset);

const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];

for(Index k=0; k<peeled_kc; k+=pk)
{
  RhsPacketx4 rhs_panel;
#if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE)
#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
#else
#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
#endif

#define EIGEN_GEBGP_ONESTEP(K) \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8"); \
  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
  traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
  traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
  traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
  traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
  traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
  traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
  traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
  traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
  traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
  traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
  traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
  traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
  traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
  traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
  traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
  traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
  traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
  traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
  traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
  traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
  EIGEN_GEBP_2Px8_SPILLING_WORKAROUND \
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8");
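// Compared with the 3pX8 step, this 2pX8 variant trades one lhs packet for
// spare registers: 16 accumulators (C0..C15) and two lhs packets leave room for
// the rhs quadruple, so it is used for the rows left over once the 3-packet
// panels have been peeled off.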
  blB += pk*8*RhsProgress;
  blA += pk*(2*Traits::LhsProgress);
}

for(Index k=peeled_kc; k<depth; k++)
{
  RhsPacketx4 rhs_panel;

  blB += 8*RhsProgress;
  blA += 2*Traits::LhsProgress;
}
#undef EIGEN_GEBGP_ONESTEP

ResPacket R0, R1, R2, R3;
ResPacket alphav = pset1<ResPacket>(alpha);
2153 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2154 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2155 R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2156 R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2157 traits.acc(C0, alphav, R0);
2158 traits.acc(C8, alphav, R1);
2159 traits.acc(C1, alphav, R2);
2160 traits.acc(C9, alphav, R3);
2161 r0.storePacket(0 * Traits::ResPacketSize, R0);
2162 r0.storePacket(1 * Traits::ResPacketSize, R1);
2163 r1.storePacket(0 * Traits::ResPacketSize, R2);
2164 r1.storePacket(1 * Traits::ResPacketSize, R3);
2166 R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2167 R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2168 R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2169 R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2170 traits.acc(C2, alphav, R0);
2171 traits.acc(C10, alphav, R1);
2172 traits.acc(C3, alphav, R2);
2173 traits.acc(C11, alphav, R3);
2174 r2.storePacket(0 * Traits::ResPacketSize, R0);
2175 r2.storePacket(1 * Traits::ResPacketSize, R1);
2176 r3.storePacket(0 * Traits::ResPacketSize, R2);
2177 r3.storePacket(1 * Traits::ResPacketSize, R3);
2179 R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2180 R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2181 R2 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2182 R3 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2183 traits.acc(C4, alphav, R0);
2184 traits.acc(C12, alphav, R1);
2185 traits.acc(C5, alphav, R2);
2186 traits.acc(C13, alphav, R3);
2187 r4.storePacket(0 * Traits::ResPacketSize, R0);
2188 r4.storePacket(1 * Traits::ResPacketSize, R1);
2189 r5.storePacket(0 * Traits::ResPacketSize, R2);
2190 r5.storePacket(1 * Traits::ResPacketSize, R3);
2192 R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2193 R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2194 R2 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2195 R3 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2196 traits.acc(C6, alphav, R0);
2197 traits.acc(C14, alphav, R1);
2198 traits.acc(C7, alphav, R2);
2199 traits.acc(C15, alphav, R3);
2200 r6.storePacket(0 * Traits::ResPacketSize, R0);
2201 r6.storePacket(1 * Traits::ResPacketSize, R1);
2202 r7.storePacket(0 * Traits::ResPacketSize, R2);
2203 r7.storePacket(1 * Traits::ResPacketSize, R3);
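// Note (illustrative): the load/acc/store sequence above computes
// R <- R + alpha*C for all 16 result packets. In the generic gebp_traits,
// traits.acc(c, alphav, r) is a single fused multiply-add, conceptually
//
//   r = pmadd(c, alphav, r);   // r += alpha * c
//
// so each accumulator costs one load, one FMA and one store of the result.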
2208 for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2210 for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
2216 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
2220 AccPacket C0, C1, C2, C3,
2221           C4, C5, C6, C7;
2222 traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
2223 traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
2225 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
2226 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
2227 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
2228 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
2230 r0.prefetch(prefetch_res_offset);
2231 r1.prefetch(prefetch_res_offset);
2232 r2.prefetch(prefetch_res_offset);
2233 r3.prefetch(prefetch_res_offset);
2236 const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
2240 for(Index k=0; k<peeled_kc; k+=pk)
2243 RhsPacketx4 rhs_panel;
2248 #if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
2249 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
2250 #else
2251 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
2252 #endif
2253 #define EIGEN_GEBGP_ONESTEP(K) \
2254 do { \
2255 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
2256 traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
2257 traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
2258 traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
2259 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
2260 traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
2261 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
2262 traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
2263 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
2264 traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
2265 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
2266 traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
2267 EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
2268 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
2269 } while (false);
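// Illustrative sketch: the 2pX4 variant is the same register blocking with
// nr == 4, i.e. two lhs packets against four rhs values and 8 accumulators
// instead of 16. A single rhs_panel load feeds all four columns per step:
//
//   // rhs_panel = { b0, b1, b2, b3 }, loaded once per K
//   C0 += A0*b0;  C4 += A1*b0;   // fix<0>
//   C1 += A0*b1;  C5 += A1*b1;   // fix<1>
//   C2 += A0*b2;  C6 += A1*b2;   // fix<2>
//   C3 += A0*b3;  C7 += A1*b3;   // fix<3>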
2282 blB += pk*4*RhsProgress;
2283 blA += pk*(2*Traits::LhsProgress);
2288 for(Index k=peeled_kc; k<depth; k++)
2290 RhsPacketx4 rhs_panel;
2291 RhsPacket T0;
2292 EIGEN_GEBGP_ONESTEP(0);
2293 blB += 4*RhsProgress;
2294 blA += 2*Traits::LhsProgress;
2296 #undef EIGEN_GEBGP_ONESTEP
2298 ResPacket R0, R1, R2, R3;
2299 ResPacket alphav = pset1<ResPacket>(alpha);
2301 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2302 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2303 R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2304 R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2305 traits.acc(C0, alphav, R0);
2306 traits.acc(C4, alphav, R1);
2307 traits.acc(C1, alphav, R2);
2308 traits.acc(C5, alphav, R3);
2309 r0.storePacket(0 * Traits::ResPacketSize, R0);
2310 r0.storePacket(1 * Traits::ResPacketSize, R1);
2311 r1.storePacket(0 * Traits::ResPacketSize, R2);
2312 r1.storePacket(1 * Traits::ResPacketSize, R3);
2314 R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2315 R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2316 R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2317 R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2318 traits.acc(C2, alphav, R0);
2319 traits.acc(C6, alphav, R1);
2320 traits.acc(C3, alphav, R2);
2321 traits.acc(C7, alphav, R3);
2322 r2.storePacket(0 * Traits::ResPacketSize, R0);
2323 r2.storePacket(1 * Traits::ResPacketSize, R1);
2324 r3.storePacket(0 * Traits::ResPacketSize, R2);
2325 r3.storePacket(1 * Traits::ResPacketSize, R3);
2330 for(Index j2=packet_cols4; j2<cols; j2++)
2332 for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
2335 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
2343 LinearMapper r0 = res.getLinearMapper(i, j2);
2344 r0.prefetch(prefetch_res_offset);
2347 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2350 for(Index k=0; k<peeled_kc; k+=pk)
2355 #define EIGEN_GEBGP_ONESTEP(K) \
2356 do { \
2357 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
2358 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
2359 traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
2360 traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
2361 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
2362 traits.madd(A0, B_0, C0, B1, fix<0>); \
2363 traits.madd(A1, B_0, C4, B_0, fix<0>); \
2364 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
2365 } while (false);
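// Note (illustrative): the 2pX1 step is simply C0 += A0*B_0 and C4 += A1*B_0;
// the extra B1/B_0 arguments passed to madd act as scratch operands which,
// together with the asm comments, keep the compiler from reordering and
// spilling the loads (Eigen bug 935, referenced above).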
2376 blB += int(pk) * int(RhsProgress);
2377 blA += int(pk) * 2 * int(Traits::LhsProgress);
2383 for(Index k=peeled_kc; k<depth; k++)
2388 blA += 2*Traits::LhsProgress;
2390 #undef EIGEN_GEBGP_ONESTEP
2391 ResPacket R0, R1;
2392 ResPacket alphav = pset1<ResPacket>(alpha);
2394 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2395 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2396 traits.acc(C0, alphav, R0);
2397 traits.acc(C4, alphav, R1);
2398 r0.storePacket(0 * Traits::ResPacketSize, R0);
2399 r0.storePacket(1 * Traits::ResPacketSize, R1);
2405 if(mr>=1*Traits::LhsProgress)
2407 lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
2408 p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2411 if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
2413 lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
2414 p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2417 if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
2419 lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
2420 p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2423 if(peeled_mc_quarter<rows)
2425 #if EIGEN_ARCH_ARM64
2428 for(Index j2=0; j2<packet_cols8; j2+=8)
2433 const LhsScalar* blA = &blockA[i*strideA+offsetA];
2436 ResScalar C0(0),C1(0),C2(0),C3(0),C4(0),C5(0),C6(0),C7(0);
2437 const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
2438 for(Index k=0; k<depth; k++)
2440 LhsScalar A0 = blA[k];
2441 RhsScalar B_0;
2443 B_0 = blB[0];
2444 C0 = cj.pmadd(A0, B_0, C0);
2446 B_0 = blB[1];
2447 C1 = cj.pmadd(A0, B_0, C1);
2449 B_0 = blB[2];
2450 C2 = cj.pmadd(A0, B_0, C2);
2452 B_0 = blB[3];
2453 C3 = cj.pmadd(A0, B_0, C3);
2455 B_0 = blB[4];
2456 C4 = cj.pmadd(A0, B_0, C4);
2458 B_0 = blB[5];
2459 C5 = cj.pmadd(A0, B_0, C5);
2461 B_0 = blB[6];
2462 C6 = cj.pmadd(A0, B_0, C6);
2464 B_0 = blB[7];
2465 C7 = cj.pmadd(A0, B_0, C7);
2467 blB += 8;
2469 res(i, j2 + 0) += alpha * C0;
2470 res(i, j2 + 1) += alpha * C1;
2471 res(i, j2 + 2) += alpha * C2;
2472 res(i, j2 + 3) += alpha * C3;
2473 res(i, j2 + 4) += alpha * C4;
2474 res(i, j2 + 5) += alpha * C5;
2475 res(i, j2 + 6) += alpha * C6;
2476 res(i, j2 + 7) += alpha * C7;
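// Note (illustrative): this residual path keeps eight scalar accumulators per
// leftover row, so the 8-column packed rhs layout produced for the vectorized
// kernels is consumed unchanged, one scalar at a time.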
2482 for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2487 const LhsScalar* blA = &blockA[i*strideA+offsetA];
2489 const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
2494 const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
2495 const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
2501 if (kCanLoadSRhsQuad &&
2502 (SwappedTraits::LhsProgress % 4) == 0 &&
2503 (SwappedTraits::LhsProgress<=16) &&
2504 (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
2505 (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
2507 SAccPacket C0, C1, C2, C3;
2508 straits.initAcc(C0);
2509 straits.initAcc(C1);
2510 straits.initAcc(C2);
2511 straits.initAcc(C3);
2513 const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
2514 const Index endk = (depth/spk)*spk;
2515 const Index endk4 = (depth/(spk*4))*(spk*4);
2517 Index k=0;
2518 for(; k<endk4; k+=4*spk)
2520 SLhsPacket A0, A1;
2521 SRhsPacket B_0, B_1;
2523 straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
2524 straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
2526 straits.loadRhsQuad(blA+0*spk, B_0);
2527 straits.loadRhsQuad(blA+1*spk, B_1);
2528 straits.madd(A0,B_0,C0,B_0, fix<0>);
2529 straits.madd(A1,B_1,C1,B_1, fix<0>);
2531 straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
2532 straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
2533 straits.loadRhsQuad(blA+2*spk, B_0);
2534 straits.loadRhsQuad(blA+3*spk, B_1);
2535 straits.madd(A0,B_0,C2,B_0, fix<0>);
2536 straits.madd(A1,B_1,C3,B_1, fix<0>);
2538 blB += 4*SwappedTraits::LhsProgress;
2539 blA += 4*spk;
2541 C0 = padd(padd(C0,C1),padd(C2,C3));
2542 for(; k<endk; k+=spk)
2544 SLhsPacket A0;
2545 SRhsPacket B_0;
2547 straits.loadLhsUnaligned(blB, A0);
2548 straits.loadRhsQuad(blA, B_0);
2549 straits.madd(A0,B_0,C0,B_0, fix<0>);
2551 blB += SwappedTraits::LhsProgress;
2552 blA += spk;
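// Note (illustrative): for these last rows the kernel works on swapped
// operands (straits): the packed rhs panel plays the role of the lhs so that
// vectorization runs along the depth dimension, while loadRhsQuad() loads lhs
// entries replicated to packet width. The 4x-unrolled loop above keeps four
// independent accumulators C0..C3 in flight to hide FMA latency; they are
// combined by the padd reduction before the remainder loop.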
2554 if(SwappedTraits::LhsProgress==8)
2557 typedef std::conditional_t<SwappedTraits::LhsProgress>=8, typename unpacket_traits<SResPacket>::half,SResPacket> SResPacketHalf;
2558 typedef std::conditional_t<SwappedTraits::LhsProgress>=8, typename unpacket_traits<SLhsPacket>::half,SLhsPacket> SLhsPacketHalf;
2559 typedef std::conditional_t<SwappedTraits::LhsProgress>=8, typename unpacket_traits<SRhsPacket>::half,SRhsPacket> SRhsPacketHalf;
2560 typedef std::conditional_t<SwappedTraits::LhsProgress>=8, typename unpacket_traits<SAccPacket>::half,SAccPacket> SAccPacketHalf;
2562 SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
2563 SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
2565 if(depth-endk>0)
2568 SLhsPacketHalf a0;
2569 SRhsPacketHalf b0;
2570 straits.loadLhsUnaligned(blB, a0);
2571 straits.loadRhs(blA, b0);
2572 SAccPacketHalf c0 = predux_half_dowto4(C0);
2573 straits.madd(a0,b0,c0,b0, fix<0>);
2574 straits.acc(c0, alphav, R);
2576 else
2578 straits.acc(predux_half_dowto4(C0), alphav, R);
2580 res.scatterPacket(i, j2, R);
2582 else if (SwappedTraits::LhsProgress==16)
2588 last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2589 p(res, straits, blA, blB, depth, endk, i, j2, alpha, C0);
2593 SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2594 SResPacket alphav = pset1<SResPacket>(alpha);
2595 straits.acc(C0, alphav, R);
2596 res.scatterPacket(i, j2, R);
2602 ResScalar C0(0), C1(0), C2(0), C3(0);
2604 for(Index k=0; k<depth; k++)
2606 LhsScalar A0;
2607 RhsScalar B_0, B_1;
2609 A0 = blA[k];
2611 B_0 = blB[0];
2612 B_1 = blB[1];
2613 C0 = cj.pmadd(A0,B_0,C0);
2614 C1 = cj.pmadd(A0,B_1,C1);
2616 B_0 = blB[2];
2617 B_1 = blB[3];
2618 C2 = cj.pmadd(A0,B_0,C2);
2619 C3 = cj.pmadd(A0,B_1,C3);
2621 blB += 4;
2623 res(i, j2 + 0) += alpha * C0;
2624 res(i, j2 + 1) += alpha * C1;
2625 res(i, j2 + 2) += alpha * C2;
2626 res(i, j2 + 3) += alpha * C3;
2631 for(Index j2=packet_cols4; j2<cols; j2++)
2636 const LhsScalar* blA = &blockA[i*strideA+offsetA];
2639 ResScalar C0(0);
2640 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2641 for(Index k=0; k<depth; k++)
2643 LhsScalar A0 = blA[k];
2644 RhsScalar B_0 = blB[k];
2645 C0 = cj.pmadd(A0, B_0, C0);
2647 res(i, j2) += alpha * C0;
2668 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2669 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2671 typedef typename DataMapper::LinearMapper LinearMapper;
2675 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2679 typedef typename unpacket_traits<Packet>::half HalfPacket;
2680 typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2684 HasHalf = (int)HalfPacketSize < (int)PacketSize,
2685 HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2690 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2691 eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
2692 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2695 const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
2696 const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
2697 const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
2698 const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
2699 const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
2700 const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2701 const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
2702                        : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
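// Worked example (assuming PacketSize==16, HalfPacketSize==8,
// QuarterPacketSize==4, e.g. AVX-512 floats, with Pack1==48 and rows==100):
//   peeled_mc3        = (100/48)*48           = 96
//   peeled_mc2        = 96 + ((100-96)/32)*32 = 96
//   peeled_mc1        = 96 + ((100-96)/16)*16 = 96
//   peeled_mc_half    = 96 + ((100-96)/8)*8   = 96
//   peeled_mc_quarter = (100/4)*4             = 100
// so rows 0..95 go through the 3-packet loop below, rows 96..99 through the
// quarter-packet loop, and nothing is left for the scalar tail.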
2707 if(Pack1>=3*PacketSize)
2709 for(; i<peeled_mc3; i+=3*PacketSize)
2711 if(PanelMode) count += (3*PacketSize) * offset;
2713 for(Index k=0; k<depth; k++)
2715 Packet A, B, C;
2716 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2717 B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2718 C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
2719 pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2720 pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2721 pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
2723 if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
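// Note (illustrative, assuming PacketSize==4): this Pack1>=3*PacketSize path
// leaves blockA holding, for each k, three consecutive packets of lhs rows,
//
//   blockA = { a(i..i+3, 0), a(i+4..i+7, 0), a(i+8..i+11, 0),
//              a(i..i+3, 1), a(i+4..i+7, 1), a(i+8..i+11, 1), ... }
//
// which is exactly the order a 3-packet micro kernel consumes with aligned
// pstore/pload accesses.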
2727 if(Pack1>=2*PacketSize)
2729 for(; i<peeled_mc2; i+=2*PacketSize)
2731 if(PanelMode) count += (2*PacketSize) * offset;
2733 for(Index k=0; k<depth; k++)
2735 Packet A, B;
2736 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2737 B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2738 pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2739 pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2741 if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
2745 if(Pack1>=1*PacketSize)
2747 for(; i<peeled_mc1; i+=1*PacketSize)
2749 if(PanelMode) count += (1*PacketSize) * offset;
2751 for(Index k=0; k<depth; k++)
2753 Packet A;
2754 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2755 pstore(blockA+count, cj.pconj(A));
2756 count+=PacketSize;
2758 if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
2762 if(HasHalf && Pack1>=HalfPacketSize)
2764 for(; i<peeled_mc_half; i+=HalfPacketSize)
2766 if(PanelMode) count += (HalfPacketSize) * offset;
2768 for(Index k=0; k<depth; k++)
2770 HalfPacket A;
2771 A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
2772 pstoreu(blockA+count, cj.pconj(A));
2773 count+=HalfPacketSize;
2775 if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
2779 if(HasQuarter && Pack1>=QuarterPacketSize)
2781 for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
2783 if(PanelMode) count += (QuarterPacketSize) * offset;
2785 for(Index k=0; k<depth; k++)
2787 QuarterPacket A;
2788 A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
2789 pstoreu(blockA+count, cj.pconj(A));
2790 count+=QuarterPacketSize;
2792 if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
2801 if(Pack2<PacketSize && Pack2>1)
2803 for(; i<peeled_mc0; i+=last_lhs_progress)
2805 if(PanelMode) count += last_lhs_progress * offset;
2807 for(Index k=0; k<depth; k++)
2808 for(Index w=0; w<last_lhs_progress; w++)
2809 blockA[count++] = cj(lhs(i+w, k));
2811 if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
2817 if(PanelMode) count += offset;
2818 for(Index k=0; k<depth; k++)
2819 blockA[count++] = cj(lhs(i, k));
2820 if(PanelMode) count += (stride-offset-depth);
2824 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2825 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2827 typedef typename DataMapper::LinearMapper LinearMapper;
2831 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2835 typedef typename unpacket_traits<Packet>::half HalfPacket;
2836 typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2840 HasHalf = (int)HalfPacketSize < (int)PacketSize,
2841 HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2846 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2847 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2849 bool gone_half = false, gone_quarter = false, gone_last = false;
2853 Index psize = PacketSize;
2857 Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
2859 for(; i<peeled_mc; i+=pack)
2861 if(PanelMode) count += pack * offset;
2864 if(pack>=psize && psize >= QuarterPacketSize)
2866 const Index peeled_k = (depth/psize)*psize;
2867 for(; k<peeled_k; k+=psize)
2869 for (Index m = 0; m < pack; m += psize)
2871 if (psize == PacketSize) {
2872 PacketBlock<Packet> kernel;
2873 for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
2874 ptranspose(kernel);
2875 for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2876 } else if (HasHalf && psize == HalfPacketSize) {
2878 PacketBlock<HalfPacket> kernel_half;
2879 for (Index p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
2880 ptranspose(kernel_half);
2881 for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
2882 } else if (HasQuarter && psize == QuarterPacketSize) {
2883 gone_quarter = true;
2884 PacketBlock<QuarterPacket> kernel_quarter;
2885 for (Index p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
2886 ptranspose(kernel_quarter);
2887 for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
2890 count += psize*pack;
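// Note (illustrative): because the lhs is RowMajor here, a packet loaded from
// row i+p is a row fragment, while blockA must receive column fragments; the
// PacketBlock plus ptranspose above performs that psize x psize in-register
// transposition before the packets are stored.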
2897 for(; w<pack-3; w+=4)
2899 Scalar a(cj(lhs(i+w+0, k))),
2900        b(cj(lhs(i+w+1, k))),
2901        c(cj(lhs(i+w+2, k))),
2902        d(cj(lhs(i+w+3, k)));
2903 blockA[count++] = a;
2904 blockA[count++] = b;
2905 blockA[count++] = c;
2906 blockA[count++] = d;
2910 blockA[count++] = cj(lhs(i+w, k));
2913 if(PanelMode) count += pack * (stride-offset-depth);
2920 (starting_pos == i || left >= psize/2 || left >= psize/4) &&
2921 ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
2922 (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2933 if (Pack2 < PacketSize && !gone_last) {
2934 gone_last = true;
2935 psize = pack = left & ~1;
2942 if(PanelMode) count += offset;
2943 for(Index k=0; k<depth; k++)
2944 blockA[count++] = cj(lhs(i, k));
2945 if(PanelMode) count += (stride-offset-depth);
2956 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2957 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2959 typedef typename packet_traits<Scalar>::type Packet;
2960 typedef typename DataMapper::LinearMapper LinearMapper;
2965 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2972 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2973 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2974 Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2975 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2977 const Index peeled_k = (depth/PacketSize)*PacketSize;
2979 #if EIGEN_ARCH_ARM64
2982 for(Index j2=0; j2<packet_cols8; j2+=8)
2985 if(PanelMode) count += 8 * offset;
2986 const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2987 const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2988 const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2989 const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2990 const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4);
2991 const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5);
2992 const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
2993 const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);
2994 Index k = 0;
2995 if (PacketSize % 2 == 0 && PacketSize <= 8)
2997 for (; k < peeled_k; k += PacketSize)
2999 if (PacketSize == 2)
3001 PacketBlock<Packet, PacketSize==2 ?2:PacketSize> kernel0, kernel1, kernel2, kernel3;
3002 kernel0.packet[0%PacketSize] = dm0.template loadPacket<Packet>(k);
3003 kernel0.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
3004 kernel1.packet[0%PacketSize] = dm2.template loadPacket<Packet>(k);
3005 kernel1.packet[1%PacketSize] = dm3.template loadPacket<Packet>(k);
3006 kernel2.packet[0%PacketSize] = dm4.template loadPacket<Packet>(k);
3007 kernel2.packet[1%PacketSize] = dm5.template loadPacket<Packet>(k);
3008 kernel3.packet[0%PacketSize] = dm6.template loadPacket<Packet>(k);
3009 kernel3.packet[1%PacketSize] = dm7.template loadPacket<Packet>(k);
3011 ptranspose(kernel0);
3012 ptranspose(kernel1);
3013 ptranspose(kernel2);
3014 ptranspose(kernel3);
3015 pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
3016 pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
3017 pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel2.packet[0 % PacketSize]));
3018 pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel3.packet[0 % PacketSize]));
3020 pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
3021 pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
3022 pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel2.packet[1 % PacketSize]));
3023 pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel3.packet[1 % PacketSize]));
3024 count+=8*PacketSize;
3026 else if (PacketSize == 4)
3028 PacketBlock<Packet, PacketSize == 4?4:PacketSize> kernel0, kernel1;
3030 kernel0.packet[0%PacketSize] = dm0.template loadPacket<Packet>(k);
3031 kernel0.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
3032 kernel0.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
3033 kernel0.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
3034 kernel1.packet[0%PacketSize] = dm4.template loadPacket<Packet>(k);
3035 kernel1.packet[1%PacketSize] = dm5.template loadPacket<Packet>(k);
3036 kernel1.packet[2%PacketSize] = dm6.template loadPacket<Packet>(k);
3037 kernel1.packet[3%PacketSize] = dm7.template loadPacket<Packet>(k);
3039 ptranspose(kernel0);
3040 ptranspose(kernel1);
3041 pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel0.packet[0%PacketSize]));
3042 pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel1.packet[0%PacketSize]));
3043 pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel0.packet[1%PacketSize]));
3044 pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel1.packet[1%PacketSize]));
3045 pstoreu(blockB+count+4*PacketSize, cj.pconj(kernel0.packet[2%PacketSize]));
3046 pstoreu(blockB+count+5*PacketSize, cj.pconj(kernel1.packet[2%PacketSize]));
3047 pstoreu(blockB+count+6*PacketSize, cj.pconj(kernel0.packet[3%PacketSize]));
3048 pstoreu(blockB+count+7*PacketSize, cj.pconj(kernel1.packet[3%PacketSize]));
3049 count+=8*PacketSize;
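// Note (illustrative): after the two 4x4 transposes, kernel0.packet[p] holds
// row k+p of columns j2..j2+3 and kernel1.packet[p] holds row k+p of columns
// j2+4..j2+7, so the alternating stores emit, for each of the four depths,
// the eight column values b(k+p, j2..j2+7) contiguously -- the rhs layout the
// 8-column kernel expects.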
3051 else if (PacketSize == 8)
3053 PacketBlock<Packet, PacketSize==8?8:PacketSize> kernel0;
3055 kernel0.packet[0%PacketSize] = dm0.template loadPacket<Packet>(k);
3056 kernel0.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
3057 kernel0.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
3058 kernel0.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
3059 kernel0.packet[4%PacketSize] = dm4.template loadPacket<Packet>(k);
3060 kernel0.packet[5%PacketSize] = dm5.template loadPacket<Packet>(k);
3061 kernel0.packet[6%PacketSize] = dm6.template loadPacket<Packet>(k);
3062 kernel0.packet[7%PacketSize] = dm7.template loadPacket<Packet>(k);
3064 ptranspose(kernel0);
3065 pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel0.packet[0%PacketSize]));
3066 pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel0.packet[1%PacketSize]));
3067 pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel0.packet[2%PacketSize]));
3068 pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel0.packet[3%PacketSize]));
3069 pstoreu(blockB+count+4*PacketSize, cj.pconj(kernel0.packet[4%PacketSize]));
3070 pstoreu(blockB+count+5*PacketSize, cj.pconj(kernel0.packet[5%PacketSize]));
3071 pstoreu(blockB+count+6*PacketSize, cj.pconj(kernel0.packet[6%PacketSize]));
3072 pstoreu(blockB+count+7*PacketSize, cj.pconj(kernel0.packet[7%PacketSize]));
3073 count+=8*PacketSize;
3078 for(; k<depth; k++)
3080 blockB[count+0] = cj(dm0(k));
3081 blockB[count+1] = cj(dm1(k));
3082 blockB[count+2] = cj(dm2(k));
3083 blockB[count+3] = cj(dm3(k));
3084 blockB[count+4] = cj(dm4(k));
3085 blockB[count+5] = cj(dm5(k));
3086 blockB[count+6] = cj(dm6(k));
3087 blockB[count+7] = cj(dm7(k));
3088 count += 8;
3091 if(PanelMode) count += 8 * (stride-offset-depth);
3098 for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
3101 if(PanelMode) count += 4 * offset;
3102 const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
3103 const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
3104 const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
3105 const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
3107 Index k = 0;
3108 if((PacketSize%4)==0)
3110 for(; k<peeled_k; k+=PacketSize) {
3111 PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
3112 kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
3113 kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
3114 kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
3115 kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
3116 ptranspose(kernel);
3117 pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
3118 pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
3119 pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
3120 pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
3121 count+=4*PacketSize;
3124 for(; k<depth; k++)
3126 blockB[count+0] = cj(dm0(k));
3127 blockB[count+1] = cj(dm1(k));
3128 blockB[count+2] = cj(dm2(k));
3129 blockB[count+3] = cj(dm3(k));
3130 count += 4;
3133 if(PanelMode) count += 4 * (stride-offset-depth);
3138 for(Index j2=packet_cols4; j2<cols; ++j2)
3140 if(PanelMode) count += offset;
3141 const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
3142 for(Index k=0; k<depth; k++)
3144 blockB[count] = cj(dm0(k));
3145 count += 1;
3147 if(PanelMode) count += (stride-offset-depth);
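// Note (illustrative): the PanelMode bookkeeping used throughout these
// packers gives each column group a fixed pitch of `stride` scalars:
// `count += n*offset` skips the part of the panel written by a previous
// sweep and `count += n*(stride-offset-depth)` skips the part that follows,
// so offset + depth + (stride-offset-depth) == stride entries per column.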
3152 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3153 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3155 typedef typename packet_traits<Scalar>::type Packet;
3156 typedef typename unpacket_traits<Packet>::half HalfPacket;
3157 typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
3158 typedef typename DataMapper::LinearMapper LinearMapper;
3167 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
3168 const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
3169 const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
3170 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
3171 Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
3172 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
3175 #if EIGEN_ARCH_ARM64
3178 for(Index j2=0; j2<packet_cols8; j2+=8)
3181 if(PanelMode) count += 8 * offset;
3182 for(Index k=0; k<depth; k++)
3184 if (PacketSize==8) {
3185 Packet A = rhs.template loadPacket<Packet>(k, j2);
3186 pstoreu(blockB+count, cj.pconj(A));
3187 count += PacketSize;
3188 } else if (PacketSize==4) {
3189 Packet A = rhs.template loadPacket<Packet>(k, j2);
3190 Packet B = rhs.template loadPacket<Packet>(k, j2 + 4);
3191 pstoreu(blockB+count, cj.pconj(A));
3192 pstoreu(blockB+count+PacketSize, cj.pconj(B));
3193 count += 2*PacketSize;
3194 } else {
3195 const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
3196 blockB[count+0] = cj(dm0(0));
3197 blockB[count+1] = cj(dm0(1));
3198 blockB[count+2] = cj(dm0(2));
3199 blockB[count+3] = cj(dm0(3));
3200 blockB[count+4] = cj(dm0(4));
3201 blockB[count+5] = cj(dm0(5));
3202 blockB[count+6] = cj(dm0(6));
3203 blockB[count+7] = cj(dm0(7));
3204 count += 8;
3208 if(PanelMode) count += 8 * (stride-offset-depth);
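// Note (illustrative): for a RowMajor rhs the eight values b(k, j2..j2+7) are
// already contiguous in memory, so packing needs no transposition; depending
// on PacketSize the loop above copies one or two (possibly conjugated)
// packets per k, or falls back to eight scalar copies.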
3215 for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
3218 if(PanelMode) count += 4 * offset;
3219 for(Index k=0; k<depth; k++)
3221 if (PacketSize==4) {
3222 Packet A = rhs.template loadPacket<Packet>(k, j2);
3223 pstoreu(blockB+count, cj.pconj(A));
3224 count += PacketSize;
3225 } else if (HasHalf && HalfPacketSize==4) {
3226 HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
3227 pstoreu(blockB+count, cj.pconj(A));
3228 count += HalfPacketSize;
3229 } else if (HasQuarter && QuarterPacketSize==4) {
3230 QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
3231 pstoreu(blockB+count, cj.pconj(A));
3232 count += QuarterPacketSize;
3233 } else {
3234 const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
3235 blockB[count+0] = cj(dm0(0));
3236 blockB[count+1] = cj(dm0(1));
3237 blockB[count+2] = cj(dm0(2));
3238 blockB[count+3] = cj(dm0(3));
3239 count += 4;
3243 if(PanelMode) count += 4 * (stride-offset-depth);
3247 for(Index j2=packet_cols4; j2<cols; ++j2)
3249 if(PanelMode) count += offset;
3250 for(Index k=0; k<depth; k++)
3252 blockB[count] = cj(rhs(k, j2));
3253 count += 1;
3255 if(PanelMode) count += stride-offset-depth;
3266 std::ptrdiff_t l1, l2, l3;  // in l1CacheSize(): manage_caching_sizes(GetAction, &l1, &l2, &l3); return l1;
3275 std::ptrdiff_t l1, l2, l3;  // in l2CacheSize(): manage_caching_sizes(GetAction, &l1, &l2, &l3); return l2;
3285 std::ptrdiff_t l1, l2, l3;  // in l3CacheSize(): manage_caching_sizes(GetAction, &l1, &l2, &l3); return l3;