products/GeneralBlockPanelKernel.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11 #define EIGEN_GENERAL_BLOCK_PANEL_H
12 
13 
14 #include "../InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 namespace internal {
19 
20 enum GEBPPacketSizeType {
21   GEBPPacketFull = 0,
22   GEBPPacketHalf,
23   GEBPPacketQuarter
24 };
25 
26 template<typename LhsScalar_, typename RhsScalar_, bool ConjLhs_=false, bool ConjRhs_=false, int Arch=Architecture::Target, int PacketSize_=GEBPPacketFull>
27 class gebp_traits;
28 
29 
31 inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
32 {
33  return a<=0 ? b : a;
34 }
35 
36 #if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
37 #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
38 #else
39 #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
40 #endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
41 
42 #if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
43 #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
44 #else
45 #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
46 #endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
47 
48 #if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
49 #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
50 #else
51 #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
52 #endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
53 
54 #if EIGEN_ARCH_i386_OR_x86_64
55 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
56 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
57 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
58 #elif EIGEN_ARCH_PPC
59 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
60 #ifdef _ARCH_PWR10
61 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(2*1024*1024);
62 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8*1024*1024);
63 #else
64 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
65 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
66 #endif
67 #else
68 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
69 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
70 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
71 #endif
72 
73 #undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
74 #undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
75 #undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
76 
78 struct CacheSizes {
79  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
80  int l1CacheSize, l2CacheSize, l3CacheSize;
81  queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
82  m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
83  m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
84  m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
85  }
86 
87  std::ptrdiff_t m_l1;
88  std::ptrdiff_t m_l2;
89  std::ptrdiff_t m_l3;
90 };
91 
93 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
94 {
95  static CacheSizes m_cacheSizes;
96 
97  if(action==SetAction)
98  {
 99  // set the CPU cache sizes (in bytes) from which all blocking sizes are derived
100  eigen_internal_assert(l1!=0 && l2!=0);
101  m_cacheSizes.m_l1 = *l1;
102  m_cacheSizes.m_l2 = *l2;
103  m_cacheSizes.m_l3 = *l3;
104  }
105  else if(action==GetAction)
106  {
107  eigen_internal_assert(l1!=0 && l2!=0);
108  *l1 = m_cacheSizes.m_l1;
109  *l2 = m_cacheSizes.m_l2;
110  *l3 = m_cacheSizes.m_l3;
111  }
112  else
113  {
114  eigen_internal_assert(false);
115  }
116 }
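// (For reference, and only as an illustration of the calling convention: the user-facing
//  setCpuCacheSizes() mentioned in the \sa note below is expected to forward to this
//  function with SetAction, while the blocking heuristic further down retrieves the cached
//  values with GetAction, e.g.
//    std::ptrdiff_t l1, l2, l3;
//    manage_caching_sizes(GetAction, &l1, &l2, &l3);
//  )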
117 
118 /* Helper for computeProductBlockingSizes.
119  *
120  * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
121  * this function computes the blocking size parameters along the respective dimensions
122  * for matrix products and related algorithms. The blocking sizes depend on various
123  * parameters:
124  * - the L1 and L2 cache sizes,
125  * - the register level blocking sizes defined by gebp_traits,
126  * - the number of scalars that fit into a packet (when vectorization is enabled).
127  *
128  * \sa setCpuCacheSizes */
129 
130 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
131 void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
132 {
133  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
134 
135  // Explanations:
136  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
137  // kc x nc blocks B' on the rhs. B' has to fit into the L2/L3 cache. Moreover, A' is processed
138  // in small mr x kc horizontal panels, where mr is the blocking size along the m dimension
139  // at the register level. Each such small horizontal panel has to stay within the L1 cache.
140  std::ptrdiff_t l1, l2, l3;
141  manage_caching_sizes(GetAction, &l1, &l2, &l3);
142  #ifdef EIGEN_VECTORIZE_AVX512
143  // We still need to find a proper rationale for this factor, but without the
144  // adjustment, AVX512 performance is quite bad (roughly 20% slower).
145  // One reason is that with an increasing packet size, the blocking size k
146  // has to become quite small if we want one lhs panel to fit within L1.
147  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels is
148  // k*(3*64 + 4*8) bytes; with l1=32kB and k%8==0, we get k=144.
149  // This is quite small for a good reuse of the accumulation registers.
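  // (Checking the arithmetic: 32*1024 / (3*64 + 4*8) = 32768 / 224 ~= 146.3, and rounding
  //  down to a multiple of 8 indeed gives k = 144.)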
150  l1 *= 4;
151  #endif
152 
153  if (num_threads > 1) {
154  typedef typename Traits::ResScalar ResScalar;
155  enum {
156  kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
157  ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
158  kr = 8,
159  mr = Traits::mr,
160  nr = Traits::nr
161  };
162  // Increasing k gives us more time to prefetch the content of the "C"
163  // registers. However once the latency is hidden there is no point in
164  // increasing the value of k, so we'll cap it at 320 (value determined
165  // experimentally).
166  // To prevent k from vanishing, we make k_cache at least as big as kr
167  const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
168  if (k_cache < k) {
169  k = k_cache - (k_cache % kr);
170  eigen_internal_assert(k > 0);
171  }
172 
173  const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
174  const Index n_per_thread = numext::div_ceil(n, num_threads);
175  if (n_cache <= n_per_thread) {
176  // Don't exceed the capacity of the l2 cache.
177  eigen_internal_assert(n_cache >= static_cast<Index>(nr));
178  n = n_cache - (n_cache % nr);
179  eigen_internal_assert(n > 0);
180  } else {
181  n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
182  }
183 
184  if (l3 > l2) {
185  // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
186  const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
187  const Index m_per_thread = numext::div_ceil(m, num_threads);
188  if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
189  m = m_cache - (m_cache % mr);
190  eigen_internal_assert(m > 0);
191  } else {
192  m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
193  }
194  }
195  }
196  else {
197  // In unit tests we do not want to use extra large matrices,
198  // so we reduce the cache sizes to check that the blocking strategy is not flawed
199 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
200  l1 = 9*1024;
201  l2 = 32*1024;
202  l3 = 512*1024;
203 #endif
204 
205  // Early return for small problems because the computations below are time consuming for them.
206  // Perhaps it would make more sense to consider k*n*m??
207  // Note that for very tiny problems, this function should be bypassed anyway
208  // because we use the coefficient-based implementation for them.
209  if((numext::maxi)(k,(numext::maxi)(m,n))<48)
210  return;
211 
212  typedef typename Traits::ResScalar ResScalar;
213  enum {
214  k_peeling = 8,
215  k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
216  k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
217  };
218 
219  // ---- 1st level of blocking on L1, yields kc ----
220 
221  // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
222  // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within the L1 cache.
223  // We also include a register-level block of the result (mr x nr).
224  // (In an ideal world only the lhs panel would stay in L1.)
225  // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
226  const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
227  const Index old_k = k;
228  if(k>max_kc)
229  {
230  // We are really blocking on the third dimension:
231  // -> reduce blocking size to make sure the last block is as large as possible
232  // while keeping the same number of sweeps over the result.
233  k = (k%max_kc)==0 ? max_kc
234  : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
235 
236  eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
237  }
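  // (Illustrative example of the reduction above, not from the original sources: with
  //  max_kc==144 and k==160 we get k = 144 - 8*((143-16)/(8*2)) = 144 - 56 = 88, so the two
  //  sweeps process blocks of sizes 88 and 72 instead of 144 followed by a tiny remainder of 16.)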
238 
239  // ---- 2nd level of blocking on max(L2,L3), yields nc ----
240 
241  // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
242  // actual_l2 = max(l2, l3/nb_core_sharing_l3)
243  // The number below is quite conservative: it is better to underestimate the cache size than to overestimate it.
244  // For instance, it corresponds to 6MB of L3 shared among 4 cores.
245  #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
246  const Index actual_l2 = l3;
247  #else
248  const Index actual_l2 = 1572864; // == 1.5 MB
249  #endif
250 
251  // Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
252  // The second half is implicitly reserved for accessing the result and lhs coefficients.
253  // When k<max_kc, nc can grow arbitrarily. In practice, it seems to be fruitful
254  // to limit this growth: we bound nc to grow by a factor of at most 1.5.
255  // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
256  // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
257  Index max_nc;
258  const Index lhs_bytes = m * k * sizeof(LhsScalar);
259  const Index remaining_l1 = l1- k_sub - lhs_bytes;
260  if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
261  {
262  // L1 blocking
263  max_nc = remaining_l1 / (k*sizeof(RhsScalar));
264  }
265  else
266  {
267  // L2 blocking
268  max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
269  }
270  // WARNING Below, we assume that Traits::nr is a power of two.
271  Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
272  if(n>nc)
273  {
274  // We are really blocking over the columns:
275  // -> reduce blocking size to make sure the last block is as large as possible
276  // while keeping the same number of sweeps over the packed lhs.
277  // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
278  n = (n%nc)==0 ? nc
279  : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
280  }
281  else if(old_k==k)
282  {
283  // So far, no blocking at all, i.e., kc==k, and nc==n.
284  // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
285  // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
286  Index problem_size = k*n*sizeof(LhsScalar);
287  Index actual_lm = actual_l2;
288  Index max_mc = m;
289  if(problem_size<=1024)
290  {
291  // the problem is small enough to be kept in L1
292  // Let's choose m such that the lhs's block fits in 1/3 of L1
293  actual_lm = l1;
294  }
295  else if(l3!=0 && problem_size<=32768)
296  {
297  // we have both L2 and L3, and the problem is small enough to be kept in L2
298  // Let's choose m such that the lhs's block fits in 1/3 of L2
299  actual_lm = l2;
300  max_mc = (numext::mini<Index>)(576,max_mc);
301  }
302  Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
303  if (mc > Traits::mr) mc -= mc % Traits::mr;
304  else if (mc==0) return;
305  m = (m%mc)==0 ? mc
306  : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
307  }
308  }
309 }
310 
311 template <typename Index>
312 inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
313 {
314 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
315  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
316  k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
317  m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
318  n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
319  return true;
320  }
321 #else
322  EIGEN_UNUSED_VARIABLE(k);
323  EIGEN_UNUSED_VARIABLE(m);
324  EIGEN_UNUSED_VARIABLE(n);
325 #endif
326  return false;
327 }
328 
345 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
346 void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
347 {
348  if (!useSpecificBlockingSizes(k, m, n)) {
349  evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
350  }
351 }
352 
353 template<typename LhsScalar, typename RhsScalar, typename Index>
354 inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
355 {
356  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
357 }
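// Usage sketch (illustrative only, variable names are placeholders): a GEMM driver typically
// starts from the full problem sizes and lets this function shrink them to cache-friendly
// blocking sizes, e.g.
//   Index kc = depth, mc = rows, nc = cols;
//   computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, num_threads);
// after which mc*kc and kc*nc packing buffers are allocated for the lhs and rhs blocks.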
358 
359 template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
360 struct RhsPanelHelper {
361  private:
362  static constexpr int remaining_registers = (std::max)(int(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) - registers_taken, 0);
363  public:
364  typedef std::conditional_t<remaining_registers>=4, RhsPacketx4, RhsPacket> type;
365 };
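// Example of the selection rule above (illustrative): if 32 registers are available and the
// kernel already uses 27 of them for accumulators and lhs packets, 5 remain, so RhsPacketx4
// is selected; with only 16 available registers fewer than 4 remain and the plain RhsPacket
// fallback is used instead.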
366 
367 template <typename Packet>
368 struct QuadPacket
369 {
370  Packet B_0, B1, B2, B3;
371  const Packet& get(const FixedInt<0>&) const { return B_0; }
372  const Packet& get(const FixedInt<1>&) const { return B1; }
373  const Packet& get(const FixedInt<2>&) const { return B2; }
374  const Packet& get(const FixedInt<3>&) const { return B3; }
375 };
376 
377 template <int N, typename T1, typename T2, typename T3>
378 struct packet_conditional { typedef T3 type; };
379 
380 template <typename T1, typename T2, typename T3>
381 struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
382 
383 template <typename T1, typename T2, typename T3>
384 struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
385 
386 #define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \
387  typedef typename packet_conditional<packet_size, \
388  typename packet_traits<name ## Scalar>::type, \
389  typename packet_traits<name ## Scalar>::half, \
390  typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
391  name ## Packet ## postfix
392 
393 #define PACKET_DECL_COND(name, packet_size) \
394  typedef typename packet_conditional<packet_size, \
395  typename packet_traits<name ## Scalar>::type, \
396  typename packet_traits<name ## Scalar>::half, \
397  typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
398  name ## Packet
399 
400 #define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size) \
401  typedef typename packet_conditional<packet_size, \
402  typename packet_traits<Scalar>::type, \
403  typename packet_traits<Scalar>::half, \
404  typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
405  ScalarPacket ## postfix
406 
407 #define PACKET_DECL_COND_SCALAR(packet_size) \
408  typedef typename packet_conditional<packet_size, \
409  typename packet_traits<Scalar>::type, \
410  typename packet_traits<Scalar>::half, \
411  typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
412  ScalarPacket
413 
414 /* Vectorization logic
415  * real*real: unpack rhs to constant packets, ...
416  *
417  * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
418  * storing each res packet into two packets (2x2),
419  * at the end combine them: swap the second and addsub them
420  * cf*cf : same but with 2x4 blocks
421  * cplx*real : unpack rhs to constant packets, ...
422  * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
423  */
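/* For reference, the complex recombination above relies on
 *   (a_r + i a_i)(b_r + i b_i) = (a_r b_r - a_i b_i) + i (a_r b_i + a_i b_r):
 * the "first" accumulator holds a*b_r and the "second" holds a*b_i, and the acc() member of
 * the complex traits below flips and/or conjugates the second one before combining it with
 * the first, depending on ConjLhs/ConjRhs.
 */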
424 template<typename LhsScalar_, typename RhsScalar_, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
425 class gebp_traits
426 {
427 public:
428  typedef LhsScalar_ LhsScalar;
429  typedef RhsScalar_ RhsScalar;
430  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
431 
432  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
433  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
434  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
435 
436  enum {
437  ConjLhs = ConjLhs_,
438  ConjRhs = ConjRhs_,
439  Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
440  LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
441  RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
442  ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
443 
444  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
445 
446  // register block size along the N direction must be 1 or 4
447  nr = 4,
448 
449  // register block size along the M direction (currently, this one cannot be modified)
450  default_mr = (plain_enum_min(16, NumberOfRegisters)/2/nr)*LhsPacketSize,
451 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
452  && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
453  // we assume 16 registers or more
454  // See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
455  // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
456  // Bug 1515: MSVC prior to v19.14 leads to register spilling.
457  mr = Vectorizable ? 3*LhsPacketSize : default_mr,
458 #else
459  mr = default_mr,
460 #endif
461 
462  LhsProgress = LhsPacketSize,
463  RhsProgress = 1
464  };
465 
466 
467  typedef std::conditional_t<Vectorizable,LhsPacket_,LhsScalar> LhsPacket;
468  typedef std::conditional_t<Vectorizable,RhsPacket_,RhsScalar> RhsPacket;
469  typedef std::conditional_t<Vectorizable,ResPacket_,ResScalar> ResPacket;
470  typedef LhsPacket LhsPacket4Packing;
471 
472  typedef QuadPacket<RhsPacket> RhsPacketx4;
473  typedef ResPacket AccPacket;
474 
475  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
476  {
477  p = pset1<ResPacket>(ResScalar(0));
478  }
479 
480  template<typename RhsPacketType>
481  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
482  {
483  dest = pset1<RhsPacketType>(*b);
484  }
485 
486  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
487  {
488  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
489  }
490 
491  template<typename RhsPacketType>
492  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
493  {
494  loadRhs(b, dest);
495  }
496 
497  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
498  {
499  }
500 
501  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
502  {
503  dest = ploadquad<RhsPacket>(b);
504  }
505 
506  template<typename LhsPacketType>
507  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
508  {
509  dest = pload<LhsPacketType>(a);
510  }
511 
512  template<typename LhsPacketType>
513  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
514  {
515  dest = ploadu<LhsPacketType>(a);
516  }
517 
518  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
519  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
520  {
521  conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
522  // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
523  // let gcc allocate the register in which to store the result of the pmul
524  // (in the case where there is no FMA) gcc fails to figure out how to avoid
525  // spilling registers.
526 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
527  EIGEN_UNUSED_VARIABLE(tmp);
528  c = cj.pmadd(a,b,c);
529 #else
530  tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
531 #endif
532  }
533 
534  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
535  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
536  {
537  madd(a, b.get(lane), c, tmp, lane);
538  }
539 
540  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
541  {
542  r = pmadd(c,alpha,r);
543  }
544 
545  template<typename ResPacketHalf>
546  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
547  {
548  r = pmadd(c,alpha,r);
549  }
550 
551 };
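// Illustrative instantiation (assuming an 8-float packet and FMA support, i.e.
// EIGEN_HAS_SINGLE_INSTRUCTION_MADD): gebp_traits<float,float> then yields LhsPacketSize==8,
// nr==4 and mr==3*8==24, i.e. the micro kernel accumulates a 24 x 4 block of the result
// entirely in registers.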
552 
553 template<typename RealScalar, bool ConjLhs_, int Arch, int PacketSize_>
554 class gebp_traits<std::complex<RealScalar>, RealScalar, ConjLhs_, false, Arch, PacketSize_>
555 {
556 public:
557  typedef std::complex<RealScalar> LhsScalar;
558  typedef RealScalar RhsScalar;
559  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
560 
561  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
562  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
563  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
564 
565  enum {
566  ConjLhs = ConjLhs_,
567  ConjRhs = false,
568  Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
569  LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
570  RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
571  ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
572 
573  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
574  nr = 4,
575 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
576  // we assume 16 registers
577  mr = 3*LhsPacketSize,
578 #else
579  mr = (plain_enum_min(16, NumberOfRegisters)/2/nr)*LhsPacketSize,
580 #endif
581 
582  LhsProgress = LhsPacketSize,
583  RhsProgress = 1
584  };
585 
586  typedef std::conditional_t<Vectorizable,LhsPacket_,LhsScalar> LhsPacket;
587  typedef std::conditional_t<Vectorizable,RhsPacket_,RhsScalar> RhsPacket;
588  typedef std::conditional_t<Vectorizable,ResPacket_,ResScalar> ResPacket;
589  typedef LhsPacket LhsPacket4Packing;
590 
591  typedef QuadPacket<RhsPacket> RhsPacketx4;
592 
593  typedef ResPacket AccPacket;
594 
595  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
596  {
597  p = pset1<ResPacket>(ResScalar(0));
598  }
599 
600  template<typename RhsPacketType>
601  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
602  {
603  dest = pset1<RhsPacketType>(*b);
604  }
605 
606  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
607  {
608  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
609  }
610 
611  template<typename RhsPacketType>
612  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
613  {
614  loadRhs(b, dest);
615  }
616 
617  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
618  {}
619 
620  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
621  {
622  loadRhsQuad_impl(b,dest, std::conditional_t<RhsPacketSize==16,true_type,false_type>());
623  }
624 
625  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
626  {
627  // FIXME we can do better!
628  // what we want here is a ploadheight
629  RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
630  dest = ploadquad<RhsPacket>(tmp);
631  }
632 
633  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
634  {
635  eigen_internal_assert(RhsPacketSize<=8);
636  dest = pset1<RhsPacket>(*b);
637  }
638 
639  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
640  {
641  dest = pload<LhsPacket>(a);
642  }
643 
644  template<typename LhsPacketType>
645  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
646  {
647  dest = ploadu<LhsPacketType>(a);
648  }
649 
650  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
651  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
652  {
653  madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable,true_type,false_type>());
654  }
655 
656  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
657  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
658  {
659 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
660  EIGEN_UNUSED_VARIABLE(tmp);
661  c.v = pmadd(a.v,b,c.v);
662 #else
663  tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
664 #endif
665  }
666 
667  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
668  {
669  c += a * b;
670  }
671 
672  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
673  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
674  {
675  madd(a, b.get(lane), c, tmp, lane);
676  }
677 
678  template <typename ResPacketType, typename AccPacketType>
679  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
680  {
681  conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
682  r = cj.pmadd(c,alpha,r);
683  }
684 
685 protected:
686 };
687 
688 template<typename Packet>
689 struct DoublePacket
690 {
691  Packet first;
692  Packet second;
693 };
694 
695 template<typename Packet>
696 DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
697 {
698  DoublePacket<Packet> res;
699  res.first = padd(a.first, b.first);
700  res.second = padd(a.second,b.second);
701  return res;
702 }
703 
704 // note that for DoublePacket<RealPacket> the "4" in "downto4"
705 // corresponds to the number of complexes, so it means "8"
706 // in terms of real coefficients.
707 
708 template<typename Packet>
709 const DoublePacket<Packet>&
710 predux_half_dowto4(const DoublePacket<Packet> &a,
711  std::enable_if_t<unpacket_traits<Packet>::size<=8>* = 0)
712 {
713  return a;
714 }
715 
716 template<typename Packet>
717 DoublePacket<typename unpacket_traits<Packet>::half>
718 predux_half_dowto4(const DoublePacket<Packet> &a,
719  std::enable_if_t<unpacket_traits<Packet>::size==16>* = 0)
720 {
721  // yes, that's pretty hackish :(
722  DoublePacket<typename unpacket_traits<Packet>::half> res;
723  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
724  typedef typename packet_traits<Cplx>::type CplxPacket;
725  res.first = predux_half_dowto4(CplxPacket(a.first)).v;
726  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
727  return res;
728 }
729 
730 // same here, "quad" actually means "8" in terms of real coefficients
731 template<typename Scalar, typename RealPacket>
732 void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
733  std::enable_if_t<unpacket_traits<RealPacket>::size<=8>* = 0)
734 {
735  dest.first = pset1<RealPacket>(numext::real(*b));
736  dest.second = pset1<RealPacket>(numext::imag(*b));
737 }
738 
739 template<typename Scalar, typename RealPacket>
740 void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
741  std::enable_if_t<unpacket_traits<RealPacket>::size==16>* = 0)
742 {
743  // yes, that's pretty hackish too :(
744  typedef typename NumTraits<Scalar>::Real RealScalar;
745  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
746  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
747  dest.first = ploadquad<RealPacket>(r);
748  dest.second = ploadquad<RealPacket>(i);
749 }
750 
751 
752 template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
753  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
754  enum{
756  };
757 };
758 // template<typename Packet>
759 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
760 // {
761 // DoublePacket<Packet> res;
762 // res.first = padd(a.first, b.first);
763 // res.second = padd(a.second,b.second);
764 // return res;
765 // }
766 
767 template<typename RealScalar, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
768 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_, ConjRhs_, Arch, PacketSize_ >
769 {
770 public:
771  typedef std::complex<RealScalar> Scalar;
772  typedef std::complex<RealScalar> LhsScalar;
773  typedef std::complex<RealScalar> RhsScalar;
774  typedef std::complex<RealScalar> ResScalar;
775 
776  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
777  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
778  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
779  PACKET_DECL_COND(Real, PacketSize_);
780  PACKET_DECL_COND_SCALAR(PacketSize_);
781 
782  enum {
783  ConjLhs = ConjLhs_,
784  ConjRhs = ConjRhs_,
785  Vectorizable = unpacket_traits<RealPacket>::vectorizable
786  && unpacket_traits<ScalarPacket>::vectorizable,
787  ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
788  LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
789  RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
790  RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
791 
792  // FIXME: should depend on NumberOfRegisters
793  nr = 4,
794  mr = ResPacketSize,
795 
796  LhsProgress = ResPacketSize,
797  RhsProgress = 1
798  };
799 
800  typedef DoublePacket<RealPacket> DoublePacketType;
801 
802  typedef std::conditional_t<Vectorizable,ScalarPacket,Scalar> LhsPacket4Packing;
803  typedef std::conditional_t<Vectorizable,RealPacket, Scalar> LhsPacket;
804  typedef std::conditional_t<Vectorizable,DoublePacketType,Scalar> RhsPacket;
805  typedef std::conditional_t<Vectorizable,ScalarPacket,Scalar> ResPacket;
806  typedef std::conditional_t<Vectorizable,DoublePacketType,Scalar> AccPacket;
807 
808  // this actually holds 8 packets!
809  typedef QuadPacket<RhsPacket> RhsPacketx4;
810 
811  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
812 
813  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
814  {
815  p.first = pset1<RealPacket>(RealScalar(0));
816  p.second = pset1<RealPacket>(RealScalar(0));
817  }
818 
819  // Scalar path
820  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
821  {
822  dest = pset1<ScalarPacket>(*b);
823  }
824 
825  // Vectorized path
826  template<typename RealPacketType>
827  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
828  {
829  dest.first = pset1<RealPacketType>(numext::real(*b));
830  dest.second = pset1<RealPacketType>(numext::imag(*b));
831  }
832 
833  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
834  {
835  loadRhs(b, dest.B_0);
836  loadRhs(b + 1, dest.B1);
837  loadRhs(b + 2, dest.B2);
838  loadRhs(b + 3, dest.B3);
839  }
840 
841  // Scalar path
842  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
843  {
844  loadRhs(b, dest);
845  }
846 
847  // Vectorized path
848  template<typename RealPacketType>
849  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
850  {
851  loadRhs(b, dest);
852  }
853 
854  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
855 
856  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
857  {
858  loadRhs(b,dest);
859  }
860  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
861  {
862  loadQuadToDoublePacket(b,dest);
863  }
864 
865  // nothing special here
866  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
867  {
868  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
869  }
870 
871  template<typename LhsPacketType>
872  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
873  {
874  dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
875  }
876 
877  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
878  EIGEN_STRONG_INLINE
879  std::enable_if_t<!is_same<RhsPacketType,RhsPacketx4>::value>
880  madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
881  {
882  c.first = padd(pmul(a,b.first), c.first);
883  c.second = padd(pmul(a,b.second),c.second);
884  }
885 
886  template<typename LaneIdType>
887  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
888  {
889  c = cj.pmadd(a,b,c);
890  }
891 
892  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
893  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
894  {
895  madd(a, b.get(lane), c, tmp, lane);
896  }
897 
898  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
899 
900  template<typename RealPacketType, typename ResPacketType>
901  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
902  {
903  // assemble c
904  ResPacketType tmp;
905  if((!ConjLhs)&&(!ConjRhs))
906  {
907  tmp = pcplxflip(pconj(ResPacketType(c.second)));
908  tmp = padd(ResPacketType(c.first),tmp);
909  }
910  else if((!ConjLhs)&&(ConjRhs))
911  {
912  tmp = pconj(pcplxflip(ResPacketType(c.second)));
913  tmp = padd(ResPacketType(c.first),tmp);
914  }
915  else if((ConjLhs)&&(!ConjRhs))
916  {
917  tmp = pcplxflip(ResPacketType(c.second));
918  tmp = padd(pconj(ResPacketType(c.first)),tmp);
919  }
920  else if((ConjLhs)&&(ConjRhs))
921  {
922  tmp = pcplxflip(ResPacketType(c.second));
923  tmp = psub(pconj(ResPacketType(c.first)),tmp);
924  }
925 
926  r = pmadd(tmp,alpha,r);
927  }
928 
929 protected:
930  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
931 };
932 
933 template<typename RealScalar, bool ConjRhs_, int Arch, int PacketSize_>
934 class gebp_traits<RealScalar, std::complex<RealScalar>, false, ConjRhs_, Arch, PacketSize_ >
935 {
936 public:
937  typedef std::complex<RealScalar> Scalar;
938  typedef RealScalar LhsScalar;
939  typedef Scalar RhsScalar;
940  typedef Scalar ResScalar;
941 
942  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
943  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
944  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
945  PACKET_DECL_COND_POSTFIX(_, Real, PacketSize_);
946  PACKET_DECL_COND_SCALAR_POSTFIX(_, PacketSize_);
947 
948 #undef PACKET_DECL_COND_SCALAR_POSTFIX
949 #undef PACKET_DECL_COND_POSTFIX
950 #undef PACKET_DECL_COND_SCALAR
951 #undef PACKET_DECL_COND
952 
953  enum {
954  ConjLhs = false,
955  ConjRhs = ConjRhs_,
956  Vectorizable = unpacket_traits<RealPacket_>::vectorizable
957  && unpacket_traits<ScalarPacket_>::vectorizable,
958  LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
959  RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
960  ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
961 
962  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
963  // FIXME: should depend on NumberOfRegisters
964  nr = 4,
965  mr = (plain_enum_min(16, NumberOfRegisters)/2/nr)*ResPacketSize,
966 
967  LhsProgress = ResPacketSize,
968  RhsProgress = 1
969  };
970 
971  typedef std::conditional_t<Vectorizable,LhsPacket_,LhsScalar> LhsPacket;
972  typedef std::conditional_t<Vectorizable,RhsPacket_,RhsScalar> RhsPacket;
973  typedef std::conditional_t<Vectorizable,ResPacket_,ResScalar> ResPacket;
974  typedef LhsPacket LhsPacket4Packing;
975  typedef QuadPacket<RhsPacket> RhsPacketx4;
976  typedef ResPacket AccPacket;
977 
978  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
979  {
980  p = pset1<ResPacket>(ResScalar(0));
981  }
982 
983  template<typename RhsPacketType>
984  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
985  {
986  dest = pset1<RhsPacketType>(*b);
987  }
988 
989  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
990  {
991  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
992  }
993 
994  template<typename RhsPacketType>
995  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
996  {
997  loadRhs(b, dest);
998  }
999 
1000  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
1001  {}
1002 
1003  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
1004  {
1005  dest = ploaddup<LhsPacket>(a);
1006  }
1007 
1008  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
1009  {
1010  dest = ploadquad<RhsPacket>(b);
1011  }
1012 
1013  template<typename LhsPacketType>
1014  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
1015  {
1016  dest = ploaddup<LhsPacketType>(a);
1017  }
1018 
1019  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
1020  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
1021  {
1022  madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable,true_type,false_type>());
1023  }
1024 
1025  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
1026  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
1027  {
1028 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1029  EIGEN_UNUSED_VARIABLE(tmp);
1030  c.v = pmadd(a,b.v,c.v);
1031 #else
1032  tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
1033 #endif
1034 
1035  }
1036 
1037  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
1038  {
1039  c += a * b;
1040  }
1041 
1042  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
1043  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
1044  {
1045  madd(a, b.get(lane), c, tmp, lane);
1046  }
1047 
1048  template <typename ResPacketType, typename AccPacketType>
1049  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
1050  {
1051  conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
1052  r = cj.pmadd(alpha,c,r);
1053  }
1054 
1055 protected:
1056 
1057 };
1058 
1059 /* optimized General packed Block * packed Panel product kernel
1060  *
1061  * Mixing type logic: C += A * B
1062  * | A | B | comments
1063  * |real |cplx | no vectorization yet, would require to pack A with duplication
1064  * |cplx |real | easy vectorization
1065  */
1066 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1067 struct gebp_kernel
1068 {
1069  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1070  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
1071  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
1072 
1073  typedef typename Traits::ResScalar ResScalar;
1074  typedef typename Traits::LhsPacket LhsPacket;
1075  typedef typename Traits::RhsPacket RhsPacket;
1076  typedef typename Traits::ResPacket ResPacket;
1077  typedef typename Traits::AccPacket AccPacket;
1078  typedef typename Traits::RhsPacketx4 RhsPacketx4;
1079 
1080  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
1081  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 27>::type RhsPanel27;
1082 
1083  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1084 
1085  typedef typename SwappedTraits::ResScalar SResScalar;
1086  typedef typename SwappedTraits::LhsPacket SLhsPacket;
1087  typedef typename SwappedTraits::RhsPacket SRhsPacket;
1088  typedef typename SwappedTraits::ResPacket SResPacket;
1089  typedef typename SwappedTraits::AccPacket SAccPacket;
1090 
1091  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
1092  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
1093  typedef typename HalfTraits::ResPacket ResPacketHalf;
1094  typedef typename HalfTraits::AccPacket AccPacketHalf;
1095 
1096  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
1097  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
1098  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
1099  typedef typename QuarterTraits::AccPacket AccPacketQuarter;
1100 
1101  typedef typename DataMapper::LinearMapper LinearMapper;
1102 
1103  enum {
1104  Vectorizable = Traits::Vectorizable,
1105  LhsProgress = Traits::LhsProgress,
1106  LhsProgressHalf = HalfTraits::LhsProgress,
1107  LhsProgressQuarter = QuarterTraits::LhsProgress,
1108  RhsProgress = Traits::RhsProgress,
1109  RhsProgressHalf = HalfTraits::RhsProgress,
1110  RhsProgressQuarter = QuarterTraits::RhsProgress,
1111  ResPacketSize = Traits::ResPacketSize
1112  };
1113 
1114  EIGEN_DONT_INLINE
1115  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1116  Index rows, Index depth, Index cols, ResScalar alpha,
1117  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
1118 };
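// Rough usage sketch (illustrative and simplified; names like resMapper are placeholders):
// once A and B have been packed into blockA/blockB, a GEMM driver invokes the kernel once
// per block, e.g.
//   gebp_kernel<float,float,Index,DataMapper,Traits::mr,Traits::nr,false,false> gebp;
//   gebp(resMapper, blockA, blockB, mc, kc, nc, alpha);
// which performs res += alpha * A * B on an mc x kc times kc x nc sub-product.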
1119 
1120 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
1121 int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
1122 struct last_row_process_16_packets
1123 {
1124  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1125  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1126 
1127  typedef typename Traits::ResScalar ResScalar;
1128  typedef typename SwappedTraits::LhsPacket SLhsPacket;
1129  typedef typename SwappedTraits::RhsPacket SRhsPacket;
1130  typedef typename SwappedTraits::ResPacket SResPacket;
1131  typedef typename SwappedTraits::AccPacket SAccPacket;
1132 
1133  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1134  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1135  ResScalar alpha, SAccPacket &C0)
1136  {
1137  EIGEN_UNUSED_VARIABLE(res);
1138  EIGEN_UNUSED_VARIABLE(straits);
1139  EIGEN_UNUSED_VARIABLE(blA);
1140  EIGEN_UNUSED_VARIABLE(blB);
1141  EIGEN_UNUSED_VARIABLE(depth);
1142  EIGEN_UNUSED_VARIABLE(endk);
1143  EIGEN_UNUSED_VARIABLE(i);
1144  EIGEN_UNUSED_VARIABLE(j2);
1145  EIGEN_UNUSED_VARIABLE(alpha);
1146  EIGEN_UNUSED_VARIABLE(C0);
1147  }
1148 };
1149 
1150 
1151 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1152 struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
1153  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1154  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1155 
1156  typedef typename Traits::ResScalar ResScalar;
1157  typedef typename SwappedTraits::LhsPacket SLhsPacket;
1158  typedef typename SwappedTraits::RhsPacket SRhsPacket;
1159  typedef typename SwappedTraits::ResPacket SResPacket;
1160  typedef typename SwappedTraits::AccPacket SAccPacket;
1161 
1162  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1163  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1164  ResScalar alpha, SAccPacket &C0)
1165  {
1166  typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
1167  typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
1168  typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
1169  typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
1170 
1171  SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
1172  SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
1173 
1174  if (depth - endk > 0)
1175  {
1176  // We have to handle the last row(s) of the rhs, which
1177  // correspond to a half-packet
1178  SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
1179 
1180  for (Index kk = endk; kk < depth; kk++)
1181  {
1182  SLhsPacketQuarter a0;
1183  SRhsPacketQuarter b0;
1184  straits.loadLhsUnaligned(blB, a0);
1185  straits.loadRhs(blA, b0);
1186  straits.madd(a0,b0,c0,b0, fix<0>);
1187  blB += SwappedTraits::LhsProgress/4;
1188  blA += 1;
1189  }
1190  straits.acc(c0, alphav, R);
1191  }
1192  else
1193  {
1194  straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
1195  }
1196  res.scatterPacket(i, j2, R);
1197  }
1198 };
1199 
1200 template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1201 struct lhs_process_one_packet
1202 {
1203  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
1204 
1205  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1206  {
1207  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1208  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1209  traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
1210  traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
1211  traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
1212  traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
1213  traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
1214  traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
1215  #if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
1216  __asm__ ("" : "+x,m" (*A0));
1217  #endif
1218  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1219  }
1220 
1221  EIGEN_STRONG_INLINE void operator()(
1222  const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
1223  Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
1224  int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
1225  {
1226  GEBPTraits traits;
1227  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
1228  // loops on each largest micro horizontal panel of lhs
1229  // (LhsProgress x depth)
1230  for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
1231  {
1232 #if EIGEN_ARCH_ARM64
1233  EIGEN_IF_CONSTEXPR(nr>=8) {
1234  for(Index j2=0; j2<packet_cols8; j2+=8)
1235  {
1236  const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1237  prefetch(&blA[0]);
1238 
1239  // gets res block as register
1240  AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
1241  traits.initAcc(C0);
1242  traits.initAcc(C1);
1243  traits.initAcc(C2);
1244  traits.initAcc(C3);
1245  traits.initAcc(C4);
1246  traits.initAcc(C5);
1247  traits.initAcc(C6);
1248  traits.initAcc(C7);
1249 
1250  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1251  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1252  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1253  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1254  LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
1255  LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
1256  LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
1257  LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
1258  r0.prefetch(prefetch_res_offset);
1259  r1.prefetch(prefetch_res_offset);
1260  r2.prefetch(prefetch_res_offset);
1261  r3.prefetch(prefetch_res_offset);
1262  r4.prefetch(prefetch_res_offset);
1263  r5.prefetch(prefetch_res_offset);
1264  r6.prefetch(prefetch_res_offset);
1265  r7.prefetch(prefetch_res_offset);
1266  const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
1267  prefetch(&blB[0]);
1268 
1269  LhsPacket A0;
1270  for(Index k=0; k<peeled_kc; k+=pk)
1271  {
1272  RhsPacketx4 rhs_panel;
1273  RhsPacket T0;
1274 #define EIGEN_GEBGP_ONESTEP(K) \
1275  do { \
1276  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX8"); \
1277  traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], A0); \
1278  traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
1279  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1280  traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
1281  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1282  traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
1283  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1284  traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
1285  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1286  traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
1287  traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
1288  traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
1289  traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
1290  traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
1291  traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
1292  traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
1293  traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
1294  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8"); \
1295  } while (false)
1296 
1297  EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX8");
1298 
1298  EIGEN_GEBGP_ONESTEP(0);
1299  EIGEN_GEBGP_ONESTEP(1);
1300  EIGEN_GEBGP_ONESTEP(2);
1301  EIGEN_GEBGP_ONESTEP(3);
1302  EIGEN_GEBGP_ONESTEP(4);
1303  EIGEN_GEBGP_ONESTEP(5);
1304  EIGEN_GEBGP_ONESTEP(6);
1305  EIGEN_GEBGP_ONESTEP(7);
1306 
1307 
1308  blB += pk*8*RhsProgress;
1309  blA += pk*(1*LhsProgress);
1310 
1311  EIGEN_ASM_COMMENT("end gebp micro kernel 1pX8");
1312  }
1313  // process remaining peeled loop
1314  for(Index k=peeled_kc; k<depth; k++)
1315  {
1316  RhsPacketx4 rhs_panel;
1317  RhsPacket T0;
1318  EIGEN_GEBGP_ONESTEP(0);
1319  blB += 8*RhsProgress;
1320  blA += 1*LhsProgress;
1321  }
1322 
1323 #undef EIGEN_GEBGP_ONESTEP
1324 
1325  ResPacket R0, R1;
1326  ResPacket alphav = pset1<ResPacket>(alpha);
1327 
1328  R0 = r0.template loadPacket<ResPacket>(0);
1329  R1 = r1.template loadPacket<ResPacket>(0);
1330  traits.acc(C0, alphav, R0);
1331  traits.acc(C1, alphav, R1);
1332  r0.storePacket(0, R0);
1333  r1.storePacket(0, R1);
1334 
1335  R0 = r2.template loadPacket<ResPacket>(0);
1336  R1 = r3.template loadPacket<ResPacket>(0);
1337  traits.acc(C2, alphav, R0);
1338  traits.acc(C3, alphav, R1);
1339  r2.storePacket(0, R0);
1340  r3.storePacket(0, R1);
1341 
1342  R0 = r4.template loadPacket<ResPacket>(0);
1343  R1 = r5.template loadPacket<ResPacket>(0);
1344  traits.acc(C4, alphav, R0);
1345  traits.acc(C5, alphav, R1);
1346  r4.storePacket(0, R0);
1347  r5.storePacket(0, R1);
1348 
1349  R0 = r6.template loadPacket<ResPacket>(0);
1350  R1 = r7.template loadPacket<ResPacket>(0);
1351  traits.acc(C6, alphav, R0);
1352  traits.acc(C7, alphav, R1);
1353  r6.storePacket(0, R0);
1354  r7.storePacket(0, R1);
1355  }
1356  }
1357 #endif
1358 
1359  // loops on each largest micro vertical panel of rhs (depth * nr)
1360  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
1361  {
1362  // We select a LhsProgress x nr micro block of res
1363  // which is entirely stored into 1 x nr registers.
1364 
1365  const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1366  prefetch(&blA[0]);
1367 
1368  // gets res block as register
1369  AccPacket C0, C1, C2, C3;
1370  traits.initAcc(C0);
1371  traits.initAcc(C1);
1372  traits.initAcc(C2);
1373  traits.initAcc(C3);
1374  // To improve instruction pipelining, let's double the accumulation registers:
1375  // even k will accumulate in C*, while odd k will accumulate in D*.
1376  // This trick is crucial to get good performance with FMA, otherwise it is
1377  // actually faster to perform separate MUL+ADD operations because of their naturally
1378  // better instruction-level parallelism.
1379  AccPacket D0, D1, D2, D3;
1380  traits.initAcc(D0);
1381  traits.initAcc(D1);
1382  traits.initAcc(D2);
1383  traits.initAcc(D3);
1384 
1385  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1386  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1387  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1388  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1389 
1390  r0.prefetch(prefetch_res_offset);
1391  r1.prefetch(prefetch_res_offset);
1392  r2.prefetch(prefetch_res_offset);
1393  r3.prefetch(prefetch_res_offset);
1394 
1395  // performs "inner" products
1396  const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
1397  prefetch(&blB[0]);
1398  LhsPacket A0, A1;
1399 
1400  for(Index k=0; k<peeled_kc; k+=pk)
1401  {
1402  EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
1403  RhsPacketx4 rhs_panel;
1404  RhsPacket T0;
1405 
1406  internal::prefetch(blB+(48+0));
1407  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1408  peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1409  peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1410  peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1411  internal::prefetch(blB+(48+16));
1412  peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1413  peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1414  peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1415  peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1416 
1417  blB += pk*4*RhsProgress;
1418  blA += pk*LhsProgress;
1419 
1420  EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
1421  }
1422  C0 = padd(C0,D0);
1423  C1 = padd(C1,D1);
1424  C2 = padd(C2,D2);
1425  C3 = padd(C3,D3);
1426 
1427  // process remaining peeled loop
1428  for(Index k=peeled_kc; k<depth; k++)
1429  {
1430  RhsPacketx4 rhs_panel;
1431  RhsPacket T0;
1432  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1433  blB += 4*RhsProgress;
1434  blA += LhsProgress;
1435  }
1436 
1437  ResPacket R0, R1;
1438  ResPacket alphav = pset1<ResPacket>(alpha);
1439 
1440  R0 = r0.template loadPacket<ResPacket>(0);
1441  R1 = r1.template loadPacket<ResPacket>(0);
1442  traits.acc(C0, alphav, R0);
1443  traits.acc(C1, alphav, R1);
1444  r0.storePacket(0, R0);
1445  r1.storePacket(0, R1);
1446 
1447  R0 = r2.template loadPacket<ResPacket>(0);
1448  R1 = r3.template loadPacket<ResPacket>(0);
1449  traits.acc(C2, alphav, R0);
1450  traits.acc(C3, alphav, R1);
1451  r2.storePacket(0, R0);
1452  r3.storePacket(0, R1);
1453  }
1454 
1455  // Deal with remaining columns of the rhs
1456  for(Index j2=packet_cols4; j2<cols; j2++)
1457  {
1458  // One column at a time
1459  const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1460  prefetch(&blA[0]);
1461 
1462  // gets res block as register
1463  AccPacket C0;
1464  traits.initAcc(C0);
1465 
1466  LinearMapper r0 = res.getLinearMapper(i, j2);
1467 
1468  // performs "inner" products
1469  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1470  LhsPacket A0;
1471 
1472  for(Index k= 0; k<peeled_kc; k+=pk)
1473  {
1474  EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
1475  RhsPacket B_0;
1476 
1477 #define EIGEN_GEBGP_ONESTEP(K) \
1478  do { \
1479  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1480  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1481  /* FIXME: why unaligned???? */ \
1482  traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
1483  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1484  traits.madd(A0, B_0, C0, B_0, fix<0>); \
1485  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1486  } while(false);
1487 
1488  EIGEN_GEBGP_ONESTEP(0);
1489  EIGEN_GEBGP_ONESTEP(1);
1490  EIGEN_GEBGP_ONESTEP(2);
1491  EIGEN_GEBGP_ONESTEP(3);
1492  EIGEN_GEBGP_ONESTEP(4);
1493  EIGEN_GEBGP_ONESTEP(5);
1494  EIGEN_GEBGP_ONESTEP(6);
1495  EIGEN_GEBGP_ONESTEP(7);
1496 
1497  blB += pk*RhsProgress;
1498  blA += pk*LhsProgress;
1499 
1500  EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
1501  }
1502 
1503  // process remaining peeled loop
1504  for(Index k=peeled_kc; k<depth; k++)
1505  {
1506  RhsPacket B_0;
1507  EIGEN_GEBGP_ONESTEP(0);
1508  blB += RhsProgress;
1509  blA += LhsProgress;
1510  }
1511 #undef EIGEN_GEBGP_ONESTEP
1512  ResPacket R0;
1513  ResPacket alphav = pset1<ResPacket>(alpha);
1514  R0 = r0.template loadPacket<ResPacket>(0);
1515  traits.acc(C0, alphav, R0);
1516  r0.storePacket(0, R0);
1517  }
1518  }
1519  }
1520 };
1521 
1522 template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1523 struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
1524 {
1525 
1526 EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1527  {
1528  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1529  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1530  traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
1531  traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
1532  traits.madd(*A0, *B_0, *C0, *B_0);
1533  traits.madd(*A0, *B1, *C1, *B1);
1534  traits.madd(*A0, *B2, *C2, *B2);
1535  traits.madd(*A0, *B3, *C3, *B3);
1536  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1537  }
1538 };
1539 
1540 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1541 EIGEN_DONT_INLINE
1542 void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
1543  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1544  Index rows, Index depth, Index cols, ResScalar alpha,
1545  Index strideA, Index strideB, Index offsetA, Index offsetB)
1546  {
1547  Traits traits;
1548  SwappedTraits straits;
1549 
1550  if(strideA==-1) strideA = depth;
1551  if(strideB==-1) strideB = depth;
1552  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
1553  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
1554  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
1555  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
1556  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
1557  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
1558  const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
1559  const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
1560  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
1561  const Index peeled_kc = depth & ~(pk-1);
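// (Illustration only: with pk==8, a depth of 100 gives peeled_kc = 100 & ~7 = 96, so the
// unrolled-by-8 micro kernels below run 12 times and the last 4 values of k are handled by
// the "process remaining peeled loop" that follows each of them.)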
1562  const int prefetch_res_offset = 32/sizeof(ResScalar);
1563 // const Index depth2 = depth & ~1;
1564 
1565  //---------- Process 3 * LhsProgress rows at once ----------
1566  // This corresponds to 3*LhsProgress x nr register blocks.
1567  // Usually, this only makes sense with FMA
1568  if(mr>=3*Traits::LhsProgress)
1569  {
1570  // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
1571  // and on each largest micro vertical panel of the rhs (depth * nr).
1572  // Blocking sizes, i.e., 'depth' has been computed so that the micro horizontal panel of the lhs fits in L1.
1573  // However, if depth is too small, we can extend the number of rows of these horizontal panels.
1574  // The actual number of rows is then computed as follows:
1575  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1576  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1577  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1578  // or because we are testing specific blocking sizes.
1579  const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
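// (Worked example, under the assumption of single-precision AVX kernels where LhsProgress==8,
// mr==24, nr==4 and the 32KB default L1: for depth==100 the ratio above is
// (32768 - 4*24*4 - 100*4*4) / (100*4*24) = 30784 / 9600 = 3, so actual_panel_rows = (3*8)*3 = 72
// rows are swept per iteration of i1; once depth is large enough, max(1,...) clamps this back
// to a single 24-row panel.)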
1580  for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
1581  {
1582  const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
1583 #if EIGEN_ARCH_ARM64
1584  EIGEN_IF_CONSTEXPR(nr>=8) {
1585  for(Index j2=0; j2<packet_cols8; j2+=8)
1586  {
1587  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1588  {
1589  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
1590  prefetch(&blA[0]);
1591  // gets res block as register
1592  AccPacket C0, C1, C2, C3, C4, C5, C6, C7,
1593  C8, C9, C10, C11, C12, C13, C14, C15,
1594  C16, C17, C18, C19, C20, C21, C22, C23;
1595  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1596  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1597  traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
1598  traits.initAcc(C12); traits.initAcc(C13); traits.initAcc(C14); traits.initAcc(C15);
1599  traits.initAcc(C16); traits.initAcc(C17); traits.initAcc(C18); traits.initAcc(C19);
1600  traits.initAcc(C20); traits.initAcc(C21); traits.initAcc(C22); traits.initAcc(C23);
1601 
1602  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1603  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1604  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1605  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1606  LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
1607  LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
1608  LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
1609  LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
1610 
1611  r0.prefetch(0);
1612  r1.prefetch(0);
1613  r2.prefetch(0);
1614  r3.prefetch(0);
1615  r4.prefetch(0);
1616  r5.prefetch(0);
1617  r6.prefetch(0);
1618  r7.prefetch(0);
1619 
1620  // performs "inner" products
1621  const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
1622  prefetch(&blB[0]);
1623  LhsPacket A0, A1;
1624  for(Index k=0; k<peeled_kc; k+=pk)
1625  {
1626  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX8");
1627  // 27 registers are taken (24 for acc, 3 for lhs).
1628  RhsPanel27 rhs_panel;
1629  RhsPacket T0;
1630  LhsPacket A2;
1631  #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
1632  // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1633  // without this workaround A0, A1, and A2 are loaded in the same register,
1634  // which is not good for pipelining
1635  #define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1636  #else
1637  #define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND
1638  #endif
1639 
1640 #define EIGEN_GEBP_ONESTEP(K) \
1641  do { \
1642  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX8"); \
1643  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1644  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1645  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1646  EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND \
1647  traits.loadRhs(blB + (0 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1648  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1649  traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
1650  traits.madd(A2, rhs_panel, C16, T0, fix<0>); \
1651  traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1652  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1653  traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
1654  traits.madd(A2, rhs_panel, C17, T0, fix<1>); \
1655  traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1656  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1657  traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
1658  traits.madd(A2, rhs_panel, C18, T0, fix<2>); \
1659  traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1660  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1661  traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
1662  traits.madd(A2, rhs_panel, C19, T0, fix<3>); \
1663  traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1664  traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
1665  traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
1666  traits.madd(A2, rhs_panel, C20, T0, fix<0>); \
1667  traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1668  traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
1669  traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
1670  traits.madd(A2, rhs_panel, C21, T0, fix<1>); \
1671  traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1672  traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
1673  traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
1674  traits.madd(A2, rhs_panel, C22, T0, fix<2>); \
1675  traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1676  traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
1677  traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
1678  traits.madd(A2, rhs_panel, C23, T0, fix<3>); \
1679  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8"); \
1680  } while (false)
1681 
1682  EIGEN_GEBP_ONESTEP(0);
1683  EIGEN_GEBP_ONESTEP(1);
1684  EIGEN_GEBP_ONESTEP(2);
1685  EIGEN_GEBP_ONESTEP(3);
1686  EIGEN_GEBP_ONESTEP(4);
1687  EIGEN_GEBP_ONESTEP(5);
1688  EIGEN_GEBP_ONESTEP(6);
1689  EIGEN_GEBP_ONESTEP(7);
1690 
1691  blB += pk * 8 * RhsProgress;
1692  blA += pk * 3 * Traits::LhsProgress;
1693  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX8");
1694  }
1695 
1696  // process remaining peeled loop
1697  for (Index k = peeled_kc; k < depth; k++)
1698  {
1699 
1700  RhsPanel27 rhs_panel;
1701  RhsPacket T0;
1702  LhsPacket A2;
1703  EIGEN_GEBP_ONESTEP(0);
1704  blB += 8 * RhsProgress;
1705  blA += 3 * Traits::LhsProgress;
1706  }
1707 
1708  #undef EIGEN_GEBP_ONESTEP
1709 
1710  ResPacket R0, R1, R2;
1711  ResPacket alphav = pset1<ResPacket>(alpha);
1712 
1713  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1714  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1715  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1716  traits.acc(C0, alphav, R0);
1717  traits.acc(C8, alphav, R1);
1718  traits.acc(C16, alphav, R2);
1719  r0.storePacket(0 * Traits::ResPacketSize, R0);
1720  r0.storePacket(1 * Traits::ResPacketSize, R1);
1721  r0.storePacket(2 * Traits::ResPacketSize, R2);
1722 
1723  R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1724  R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1725  R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1726  traits.acc(C1, alphav, R0);
1727  traits.acc(C9, alphav, R1);
1728  traits.acc(C17, alphav, R2);
1729  r1.storePacket(0 * Traits::ResPacketSize, R0);
1730  r1.storePacket(1 * Traits::ResPacketSize, R1);
1731  r1.storePacket(2 * Traits::ResPacketSize, R2);
1732 
1733  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1734  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1735  R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1736  traits.acc(C2, alphav, R0);
1737  traits.acc(C10, alphav, R1);
1738  traits.acc(C18, alphav, R2);
1739  r2.storePacket(0 * Traits::ResPacketSize, R0);
1740  r2.storePacket(1 * Traits::ResPacketSize, R1);
1741  r2.storePacket(2 * Traits::ResPacketSize, R2);
1742 
1743  R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1744  R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1745  R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1746  traits.acc(C3, alphav, R0);
1747  traits.acc(C11, alphav, R1);
1748  traits.acc(C19, alphav, R2);
1749  r3.storePacket(0 * Traits::ResPacketSize, R0);
1750  r3.storePacket(1 * Traits::ResPacketSize, R1);
1751  r3.storePacket(2 * Traits::ResPacketSize, R2);
1752 
1753  R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1754  R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1755  R2 = r4.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1756  traits.acc(C4, alphav, R0);
1757  traits.acc(C12, alphav, R1);
1758  traits.acc(C20, alphav, R2);
1759  r4.storePacket(0 * Traits::ResPacketSize, R0);
1760  r4.storePacket(1 * Traits::ResPacketSize, R1);
1761  r4.storePacket(2 * Traits::ResPacketSize, R2);
1762 
1763  R0 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1764  R1 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1765  R2 = r5.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1766  traits.acc(C5, alphav, R0);
1767  traits.acc(C13, alphav, R1);
1768  traits.acc(C21, alphav, R2);
1769  r5.storePacket(0 * Traits::ResPacketSize, R0);
1770  r5.storePacket(1 * Traits::ResPacketSize, R1);
1771  r5.storePacket(2 * Traits::ResPacketSize, R2);
1772 
1773  R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1774  R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1775  R2 = r6.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1776  traits.acc(C6, alphav, R0);
1777  traits.acc(C14, alphav, R1);
1778  traits.acc(C22, alphav, R2);
1779  r6.storePacket(0 * Traits::ResPacketSize, R0);
1780  r6.storePacket(1 * Traits::ResPacketSize, R1);
1781  r6.storePacket(2 * Traits::ResPacketSize, R2);
1782 
1783  R0 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1784  R1 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1785  R2 = r7.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1786  traits.acc(C7, alphav, R0);
1787  traits.acc(C15, alphav, R1);
1788  traits.acc(C23, alphav, R2);
1789  r7.storePacket(0 * Traits::ResPacketSize, R0);
1790  r7.storePacket(1 * Traits::ResPacketSize, R1);
1791  r7.storePacket(2 * Traits::ResPacketSize, R2);
1792  }
1793  }
1794  }
1795 #endif
1796  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
1797  {
1798  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1799  {
1800 
1801  // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
1802  // stored into 3 x nr registers.
1803 
1804  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
1805  prefetch(&blA[0]);
1806 
1807  // gets res block as register
1808  AccPacket C0, C1, C2, C3,
1809  C4, C5, C6, C7,
1810  C8, C9, C10, C11;
1811  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1812  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1813  traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
1814 
1815  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1816  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1817  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1818  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1819 
1820  r0.prefetch(0);
1821  r1.prefetch(0);
1822  r2.prefetch(0);
1823  r3.prefetch(0);
1824 
1825  // performs "inner" products
1826  const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
1827  prefetch(&blB[0]);
1828  LhsPacket A0, A1;
1829 
1830  for(Index k=0; k<peeled_kc; k+=pk)
1831  {
1832  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
1833  // 15 registers are taken (12 for acc, 3 for lhs).
1834  RhsPanel15 rhs_panel;
1835  RhsPacket T0;
1836  LhsPacket A2;
1837  #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
1838  // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1839  // without this workaround A0, A1, and A2 are loaded in the same register,
1840  // which is not good for pipelining
1841  #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1842  #else
1843  #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1844  #endif
1845 #define EIGEN_GEBP_ONESTEP(K) \
1846  do { \
1847  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1848  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1849  internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1850  if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1851  internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1852  } /* Bug 953 */ \
1853  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1854  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1855  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1856  EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1857  traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
1858  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1859  traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1860  traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1861  traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
1862  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1863  traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1864  traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1865  traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
1866  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1867  traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1868  traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1869  traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
1870  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1871  traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1872  traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1873  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1874  } while (false)
1875 
1876  internal::prefetch(blB);
1877  EIGEN_GEBP_ONESTEP(0);
1878  EIGEN_GEBP_ONESTEP(1);
1879  EIGEN_GEBP_ONESTEP(2);
1880  EIGEN_GEBP_ONESTEP(3);
1881  EIGEN_GEBP_ONESTEP(4);
1882  EIGEN_GEBP_ONESTEP(5);
1883  EIGEN_GEBP_ONESTEP(6);
1884  EIGEN_GEBP_ONESTEP(7);
1885 
1886  blB += pk*4*RhsProgress;
1887  blA += pk*3*Traits::LhsProgress;
1888 
1889  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1890  }
1891  // process remaining peeled loop
1892  for(Index k=peeled_kc; k<depth; k++)
1893  {
1894  RhsPanel15 rhs_panel;
1895  RhsPacket T0;
1896  LhsPacket A2;
1897  EIGEN_GEBP_ONESTEP(0);
1898  blB += 4*RhsProgress;
1899  blA += 3*Traits::LhsProgress;
1900  }
1901 
1902 #undef EIGEN_GEBP_ONESTEP
1903 
1904  ResPacket R0, R1, R2;
1905  ResPacket alphav = pset1<ResPacket>(alpha);
1906 
1907  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1908  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1909  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1910  traits.acc(C0, alphav, R0);
1911  traits.acc(C4, alphav, R1);
1912  traits.acc(C8, alphav, R2);
1913  r0.storePacket(0 * Traits::ResPacketSize, R0);
1914  r0.storePacket(1 * Traits::ResPacketSize, R1);
1915  r0.storePacket(2 * Traits::ResPacketSize, R2);
1916 
1917  R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1918  R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1919  R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1920  traits.acc(C1, alphav, R0);
1921  traits.acc(C5, alphav, R1);
1922  traits.acc(C9, alphav, R2);
1923  r1.storePacket(0 * Traits::ResPacketSize, R0);
1924  r1.storePacket(1 * Traits::ResPacketSize, R1);
1925  r1.storePacket(2 * Traits::ResPacketSize, R2);
1926 
1927  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1928  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1929  R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1930  traits.acc(C2, alphav, R0);
1931  traits.acc(C6, alphav, R1);
1932  traits.acc(C10, alphav, R2);
1933  r2.storePacket(0 * Traits::ResPacketSize, R0);
1934  r2.storePacket(1 * Traits::ResPacketSize, R1);
1935  r2.storePacket(2 * Traits::ResPacketSize, R2);
1936 
1937  R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1938  R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1939  R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1940  traits.acc(C3, alphav, R0);
1941  traits.acc(C7, alphav, R1);
1942  traits.acc(C11, alphav, R2);
1943  r3.storePacket(0 * Traits::ResPacketSize, R0);
1944  r3.storePacket(1 * Traits::ResPacketSize, R1);
1945  r3.storePacket(2 * Traits::ResPacketSize, R2);
1946  }
1947  }
1948 
1949  // Deal with remaining columns of the rhs
1950  for(Index j2=packet_cols4; j2<cols; j2++)
1951  {
1952  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1953  {
1954  // One column at a time
1955  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1956  prefetch(&blA[0]);
1957 
1958  // gets res block as register
1959  AccPacket C0, C4, C8;
1960  traits.initAcc(C0);
1961  traits.initAcc(C4);
1962  traits.initAcc(C8);
1963 
1964  LinearMapper r0 = res.getLinearMapper(i, j2);
1965  r0.prefetch(0);
1966 
1967  // performs "inner" products
1968  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1969  LhsPacket A0, A1, A2;
1970 
1971  for(Index k=0; k<peeled_kc; k+=pk)
1972  {
1973  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1974  RhsPacket B_0;
1975 #define EIGEN_GEBGP_ONESTEP(K) \
1976  do { \
1977  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1978  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1979  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1980  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1981  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1982  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1983  traits.madd(A0, B_0, C0, B_0, fix<0>); \
1984  traits.madd(A1, B_0, C4, B_0, fix<0>); \
1985  traits.madd(A2, B_0, C8, B_0, fix<0>); \
1986  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1987  } while (false)
1988 
1989  EIGEN_GEBGP_ONESTEP(0);
1990  EIGEN_GEBGP_ONESTEP(1);
1991  EIGEN_GEBGP_ONESTEP(2);
1992  EIGEN_GEBGP_ONESTEP(3);
1993  EIGEN_GEBGP_ONESTEP(4);
1994  EIGEN_GEBGP_ONESTEP(5);
1995  EIGEN_GEBGP_ONESTEP(6);
1996  EIGEN_GEBGP_ONESTEP(7);
1997 
1998  blB += int(pk) * int(RhsProgress);
1999  blA += int(pk) * 3 * int(Traits::LhsProgress);
2000 
2001  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
2002  }
2003 
2004  // process remaining peeled loop
2005  for(Index k=peeled_kc; k<depth; k++)
2006  {
2007  RhsPacket B_0;
2008  EIGEN_GEBGP_ONESTEP(0);
2009  blB += RhsProgress;
2010  blA += 3*Traits::LhsProgress;
2011  }
2012 #undef EIGEN_GEBGP_ONESTEP
2013  ResPacket R0, R1, R2;
2014  ResPacket alphav = pset1<ResPacket>(alpha);
2015 
2016  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2017  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2018  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
2019  traits.acc(C0, alphav, R0);
2020  traits.acc(C4, alphav, R1);
2021  traits.acc(C8, alphav, R2);
2022  r0.storePacket(0 * Traits::ResPacketSize, R0);
2023  r0.storePacket(1 * Traits::ResPacketSize, R1);
2024  r0.storePacket(2 * Traits::ResPacketSize, R2);
2025  }
2026  }
2027  }
2028  }
2029 
2030  //---------- Process 2 * LhsProgress rows at once ----------
2031  if(mr>=2*Traits::LhsProgress)
2032  {
2033  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
2034  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
2035  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
2036  // or because we are testing specific blocking sizes.
2037  Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
2038 
2039  for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
2040  {
2041  Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
2042 #if EIGEN_ARCH_ARM64
2043  EIGEN_IF_CONSTEXPR(nr>=8) {
2044  for(Index j2=0; j2<packet_cols8; j2+=8)
2045  {
2046  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
2047  {
2048  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
2049  prefetch(&blA[0]);
2050 
2051  AccPacket C0, C1, C2, C3, C4, C5, C6, C7,
2052  C8, C9, C10, C11, C12, C13, C14, C15;
2053  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
2054  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
2055  traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
2056  traits.initAcc(C12); traits.initAcc(C13); traits.initAcc(C14); traits.initAcc(C15);
2057 
2058  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
2059  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
2060  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
2061  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
2062  LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
2063  LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
2064  LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
2065  LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
2066  r0.prefetch(prefetch_res_offset);
2067  r1.prefetch(prefetch_res_offset);
2068  r2.prefetch(prefetch_res_offset);
2069  r3.prefetch(prefetch_res_offset);
2070  r4.prefetch(prefetch_res_offset);
2071  r5.prefetch(prefetch_res_offset);
2072  r6.prefetch(prefetch_res_offset);
2073  r7.prefetch(prefetch_res_offset);
2074 
2075  const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
2076  prefetch(&blB[0]);
2077  LhsPacket A0, A1;
2078  for(Index k=0; k<peeled_kc; k+=pk)
2079  {
2080  RhsPacketx4 rhs_panel;
2081  RhsPacket T0;
2082  // NOTE: the begin/end asm comments below work around bug 935!
2083  // but they are not enough for gcc>=6 without FMA (bug 1637)
2084  #if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE)
2085  #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
2086  #else
2087  #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
2088  #endif
2089 #define EIGEN_GEBGP_ONESTEP(K) \
2090  do { \
2091  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8"); \
2092  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
2093  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
2094  traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
2095  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
2096  traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
2097  traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
2098  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
2099  traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
2100  traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
2101  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
2102  traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
2103  traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
2104  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
2105  traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
2106  traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
2107  traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
2108  traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
2109  traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
2110  traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
2111  traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
2112  traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
2113  traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
2114  traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
2115  traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
2116  traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
2117  traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
2118  EIGEN_GEBP_2Px8_SPILLING_WORKAROUND \
2119  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8"); \
2120  } while (false)
2121 
2122  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX8");
2123 
2124  EIGEN_GEBGP_ONESTEP(0);
2125  EIGEN_GEBGP_ONESTEP(1);
2126  EIGEN_GEBGP_ONESTEP(2);
2127  EIGEN_GEBGP_ONESTEP(3);
2128  EIGEN_GEBGP_ONESTEP(4);
2129  EIGEN_GEBGP_ONESTEP(5);
2130  EIGEN_GEBGP_ONESTEP(6);
2131  EIGEN_GEBGP_ONESTEP(7);
2132 
2133  blB += pk*8*RhsProgress;
2134  blA += pk*(2*Traits::LhsProgress);
2135 
2136  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX8");
2137  }
2138  // process remaining peeled loop
2139  for(Index k=peeled_kc; k<depth; k++)
2140  {
2141  RhsPacketx4 rhs_panel;
2142  RhsPacket T0;
2143  EIGEN_GEBGP_ONESTEP(0);
2144  blB += 8*RhsProgress;
2145  blA += 2*Traits::LhsProgress;
2146  }
2147 
2148 #undef EIGEN_GEBGP_ONESTEP
2149 
2150  ResPacket R0, R1, R2, R3;
2151  ResPacket alphav = pset1<ResPacket>(alpha);
2152 
2153  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2154  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2155  R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2156  R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2157  traits.acc(C0, alphav, R0);
2158  traits.acc(C8, alphav, R1);
2159  traits.acc(C1, alphav, R2);
2160  traits.acc(C9, alphav, R3);
2161  r0.storePacket(0 * Traits::ResPacketSize, R0);
2162  r0.storePacket(1 * Traits::ResPacketSize, R1);
2163  r1.storePacket(0 * Traits::ResPacketSize, R2);
2164  r1.storePacket(1 * Traits::ResPacketSize, R3);
2165 
2166  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2167  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2168  R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2169  R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2170  traits.acc(C2, alphav, R0);
2171  traits.acc(C10, alphav, R1);
2172  traits.acc(C3, alphav, R2);
2173  traits.acc(C11, alphav, R3);
2174  r2.storePacket(0 * Traits::ResPacketSize, R0);
2175  r2.storePacket(1 * Traits::ResPacketSize, R1);
2176  r3.storePacket(0 * Traits::ResPacketSize, R2);
2177  r3.storePacket(1 * Traits::ResPacketSize, R3);
2178 
2179  R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2180  R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2181  R2 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2182  R3 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2183  traits.acc(C4, alphav, R0);
2184  traits.acc(C12, alphav, R1);
2185  traits.acc(C5, alphav, R2);
2186  traits.acc(C13, alphav, R3);
2187  r4.storePacket(0 * Traits::ResPacketSize, R0);
2188  r4.storePacket(1 * Traits::ResPacketSize, R1);
2189  r5.storePacket(0 * Traits::ResPacketSize, R2);
2190  r5.storePacket(1 * Traits::ResPacketSize, R3);
2191 
2192  R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2193  R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2194  R2 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2195  R3 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2196  traits.acc(C6, alphav, R0);
2197  traits.acc(C14, alphav, R1);
2198  traits.acc(C7, alphav, R2);
2199  traits.acc(C15, alphav, R3);
2200  r6.storePacket(0 * Traits::ResPacketSize, R0);
2201  r6.storePacket(1 * Traits::ResPacketSize, R1);
2202  r7.storePacket(0 * Traits::ResPacketSize, R2);
2203  r7.storePacket(1 * Traits::ResPacketSize, R3);
2204  }
2205  }
2206  }
2207 #endif
2208  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2209  {
2210  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
2211  {
2212 
2213  // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
2214  // stored into 2 x nr registers.
2215 
2216  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
2217  prefetch(&blA[0]);
2218 
2219  // gets res block as register
2220  AccPacket C0, C1, C2, C3,
2221  C4, C5, C6, C7;
2222  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
2223  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
2224 
2225  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
2226  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
2227  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
2228  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
2229 
2230  r0.prefetch(prefetch_res_offset);
2231  r1.prefetch(prefetch_res_offset);
2232  r2.prefetch(prefetch_res_offset);
2233  r3.prefetch(prefetch_res_offset);
2234 
2235  // performs "inner" products
2236  const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
2237  prefetch(&blB[0]);
2238  LhsPacket A0, A1;
2239 
2240  for(Index k=0; k<peeled_kc; k+=pk)
2241  {
2242  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
2243  RhsPacketx4 rhs_panel;
2244  RhsPacket T0;
2245 
2246  // NOTE: the begin/end asm comments below work around bug 935!
2247  // but they are not enough for gcc>=6 without FMA (bug 1637)
2248  #if EIGEN_GNUC_STRICT_AT_LEAST(6,0,0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
2249  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
2250  #else
2251  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
2252  #endif
2253 #define EIGEN_GEBGP_ONESTEP(K) \
2254  do { \
2255  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
2256  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
2257  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
2258  traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
2259  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
2260  traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
2261  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
2262  traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
2263  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
2264  traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
2265  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
2266  traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
2267  EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
2268  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
2269  } while (false)
2270 
2271  internal::prefetch(blB+(48+0));
2272  EIGEN_GEBGP_ONESTEP(0);
2273  EIGEN_GEBGP_ONESTEP(1);
2274  EIGEN_GEBGP_ONESTEP(2);
2275  EIGEN_GEBGP_ONESTEP(3);
2276  internal::prefetch(blB+(48+16));
2277  EIGEN_GEBGP_ONESTEP(4);
2278  EIGEN_GEBGP_ONESTEP(5);
2279  EIGEN_GEBGP_ONESTEP(6);
2280  EIGEN_GEBGP_ONESTEP(7);
2281 
2282  blB += pk*4*RhsProgress;
2283  blA += pk*(2*Traits::LhsProgress);
2284 
2285  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
2286  }
2287  // process remaining peeled loop
2288  for(Index k=peeled_kc; k<depth; k++)
2289  {
2290  RhsPacketx4 rhs_panel;
2291  RhsPacket T0;
2292  EIGEN_GEBGP_ONESTEP(0);
2293  blB += 4*RhsProgress;
2294  blA += 2*Traits::LhsProgress;
2295  }
2296 #undef EIGEN_GEBGP_ONESTEP
2297 
2298  ResPacket R0, R1, R2, R3;
2299  ResPacket alphav = pset1<ResPacket>(alpha);
2300 
2301  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2302  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2303  R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2304  R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2305  traits.acc(C0, alphav, R0);
2306  traits.acc(C4, alphav, R1);
2307  traits.acc(C1, alphav, R2);
2308  traits.acc(C5, alphav, R3);
2309  r0.storePacket(0 * Traits::ResPacketSize, R0);
2310  r0.storePacket(1 * Traits::ResPacketSize, R1);
2311  r1.storePacket(0 * Traits::ResPacketSize, R2);
2312  r1.storePacket(1 * Traits::ResPacketSize, R3);
2313 
2314  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2315  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2316  R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2317  R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2318  traits.acc(C2, alphav, R0);
2319  traits.acc(C6, alphav, R1);
2320  traits.acc(C3, alphav, R2);
2321  traits.acc(C7, alphav, R3);
2322  r2.storePacket(0 * Traits::ResPacketSize, R0);
2323  r2.storePacket(1 * Traits::ResPacketSize, R1);
2324  r3.storePacket(0 * Traits::ResPacketSize, R2);
2325  r3.storePacket(1 * Traits::ResPacketSize, R3);
2326  }
2327  }
2328 
2329  // Deal with remaining columns of the rhs
2330  for(Index j2=packet_cols4; j2<cols; j2++)
2331  {
2332  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
2333  {
2334  // One column at a time
2335  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
2336  prefetch(&blA[0]);
2337 
2338  // gets res block as register
2339  AccPacket C0, C4;
2340  traits.initAcc(C0);
2341  traits.initAcc(C4);
2342 
2343  LinearMapper r0 = res.getLinearMapper(i, j2);
2344  r0.prefetch(prefetch_res_offset);
2345 
2346  // performs "inner" products
2347  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2348  LhsPacket A0, A1;
2349 
2350  for(Index k=0; k<peeled_kc; k+=pk)
2351  {
2352  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
2353  RhsPacket B_0, B1;
2354 
2355 #define EIGEN_GEBGP_ONESTEP(K) \
2356  do { \
2357  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
2358  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
2359  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
2360  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
2361  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
2362  traits.madd(A0, B_0, C0, B1, fix<0>); \
2363  traits.madd(A1, B_0, C4, B_0, fix<0>); \
2364  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
2365  } while(false)
2366 
2367  EIGEN_GEBGP_ONESTEP(0);
2368  EIGEN_GEBGP_ONESTEP(1);
2369  EIGEN_GEBGP_ONESTEP(2);
2370  EIGEN_GEBGP_ONESTEP(3);
2371  EIGEN_GEBGP_ONESTEP(4);
2372  EIGEN_GEBGP_ONESTEP(5);
2373  EIGEN_GEBGP_ONESTEP(6);
2374  EIGEN_GEBGP_ONESTEP(7);
2375 
2376  blB += int(pk) * int(RhsProgress);
2377  blA += int(pk) * 2 * int(Traits::LhsProgress);
2378 
2379  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
2380  }
2381 
2382  // process remaining peeled loop
2383  for(Index k=peeled_kc; k<depth; k++)
2384  {
2385  RhsPacket B_0, B1;
2386  EIGEN_GEBGP_ONESTEP(0);
2387  blB += RhsProgress;
2388  blA += 2*Traits::LhsProgress;
2389  }
2390 #undef EIGEN_GEBGP_ONESTEP
2391  ResPacket R0, R1;
2392  ResPacket alphav = pset1<ResPacket>(alpha);
2393 
2394  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2395  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2396  traits.acc(C0, alphav, R0);
2397  traits.acc(C4, alphav, R1);
2398  r0.storePacket(0 * Traits::ResPacketSize, R0);
2399  r0.storePacket(1 * Traits::ResPacketSize, R1);
2400  }
2401  }
2402  }
2403  }
2404  //---------- Process 1 * LhsProgress rows at once ----------
2405  if(mr>=1*Traits::LhsProgress)
2406  {
2407  lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
2408  p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2409  }
2410  //---------- Process LhsProgressHalf rows at once ----------
2411  if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
2412  {
2413  lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
2414  p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2415  }
2416  //---------- Process LhsProgressQuarter rows at once ----------
2417  if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
2418  {
2419  lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
2420  p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2421  }
2422  //---------- Process remaining rows, 1 at once ----------
2423  if(peeled_mc_quarter<rows)
2424  {
2425 #if EIGEN_ARCH_ARM64
2426  EIGEN_IF_CONSTEXPR(nr>=8) {
2427  // loop on each panel of the rhs
2428  for(Index j2=0; j2<packet_cols8; j2+=8)
2429  {
2430  // loop on each row of the lhs (1*LhsProgress x depth)
2431  for(Index i=peeled_mc_quarter; i<rows; i+=1)
2432  {
2433  const LhsScalar* blA = &blockA[i*strideA+offsetA];
2434  prefetch(&blA[0]);
2435  // gets a 1 x 1 res block as registers
2436  ResScalar C0(0),C1(0),C2(0),C3(0),C4(0),C5(0),C6(0),C7(0);
2437  const RhsScalar* blB = &blockB[j2*strideB+offsetB*8];
2438  for(Index k=0; k<depth; k++)
2439  {
2440  LhsScalar A0 = blA[k];
2441  RhsScalar B_0;
2442 
2443  B_0 = blB[0];
2444  C0 = cj.pmadd(A0, B_0, C0);
2445 
2446  B_0 = blB[1];
2447  C1 = cj.pmadd(A0, B_0, C1);
2448 
2449  B_0 = blB[2];
2450  C2 = cj.pmadd(A0, B_0, C2);
2451 
2452  B_0 = blB[3];
2453  C3 = cj.pmadd(A0, B_0, C3);
2454 
2455  B_0 = blB[4];
2456  C4 = cj.pmadd(A0, B_0, C4);
2457 
2458  B_0 = blB[5];
2459  C5 = cj.pmadd(A0, B_0, C5);
2460 
2461  B_0 = blB[6];
2462  C6 = cj.pmadd(A0, B_0, C6);
2463 
2464  B_0 = blB[7];
2465  C7 = cj.pmadd(A0, B_0, C7);
2466 
2467  blB += 8;
2468  }
2469  res(i, j2 + 0) += alpha * C0;
2470  res(i, j2 + 1) += alpha * C1;
2471  res(i, j2 + 2) += alpha * C2;
2472  res(i, j2 + 3) += alpha * C3;
2473  res(i, j2 + 4) += alpha * C4;
2474  res(i, j2 + 5) += alpha * C5;
2475  res(i, j2 + 6) += alpha * C6;
2476  res(i, j2 + 7) += alpha * C7;
2477  }
2478  }
2479  }
2480 #endif
2481 
2482  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2483  {
2484  // loop on each row of the lhs (1*LhsProgress x depth)
2485  for(Index i=peeled_mc_quarter; i<rows; i+=1)
2486  {
2487  const LhsScalar* blA = &blockA[i*strideA+offsetA];
2488  prefetch(&blA[0]);
2489  const RhsScalar* blB = &blockB[j2*strideB+offsetB*4];
2490 
2491  // If LhsProgress is 8 or 16, the code below assumes that there is a
2492  // half or quarter packet, respectively, of the same size as
2493  // nr (which is currently 4) for the return type.
2494  const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
2495  const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
2496  // The following code assumes we can load SRhsPacket in such a way that
2497  // it multiplies blocks of 4 elements in SLhsPacket. This is not the
2498  // case for some customized kernels (e.g., NEON fp16). If the assumption
2499  // fails, drop down to the scalar path.
2500  constexpr bool kCanLoadSRhsQuad = (unpacket_traits<SLhsPacket>::size < 4) || (unpacket_traits<SRhsPacket>::size % (unpacket_traits<SLhsPacket>::size / 4)) == 0;
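// (Illustration only, assuming the default float traits on AVX: the swapped LhsProgress is 8
// and the half of an 8-float result packet has size 4 == nr, so the vectorized branch below is
// taken; a customized kernel such as NEON fp16 can fail these checks and falls through to the
// scalar path.)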
2501  if (kCanLoadSRhsQuad &&
2502  (SwappedTraits::LhsProgress % 4) == 0 &&
2503  (SwappedTraits::LhsProgress<=16) &&
2504  (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
2505  (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
2506  {
2507  SAccPacket C0, C1, C2, C3;
2508  straits.initAcc(C0);
2509  straits.initAcc(C1);
2510  straits.initAcc(C2);
2511  straits.initAcc(C3);
2512 
2513  const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
2514  const Index endk = (depth/spk)*spk;
2515  const Index endk4 = (depth/(spk*4))*(spk*4);
2516 
2517  Index k=0;
2518  for(; k<endk4; k+=4*spk)
2519  {
2520  SLhsPacket A0,A1;
2521  SRhsPacket B_0,B_1;
2522 
2523  straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
2524  straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
2525 
2526  straits.loadRhsQuad(blA+0*spk, B_0);
2527  straits.loadRhsQuad(blA+1*spk, B_1);
2528  straits.madd(A0,B_0,C0,B_0, fix<0>);
2529  straits.madd(A1,B_1,C1,B_1, fix<0>);
2530 
2531  straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
2532  straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
2533  straits.loadRhsQuad(blA+2*spk, B_0);
2534  straits.loadRhsQuad(blA+3*spk, B_1);
2535  straits.madd(A0,B_0,C2,B_0, fix<0>);
2536  straits.madd(A1,B_1,C3,B_1, fix<0>);
2537 
2538  blB += 4*SwappedTraits::LhsProgress;
2539  blA += 4*spk;
2540  }
2541  C0 = padd(padd(C0,C1),padd(C2,C3));
2542  for(; k<endk; k+=spk)
2543  {
2544  SLhsPacket A0;
2545  SRhsPacket B_0;
2546 
2547  straits.loadLhsUnaligned(blB, A0);
2548  straits.loadRhsQuad(blA, B_0);
2549  straits.madd(A0,B_0,C0,B_0, fix<0>);
2550 
2551  blB += SwappedTraits::LhsProgress;
2552  blA += spk;
2553  }
2554  if(SwappedTraits::LhsProgress==8)
2555  {
2556  // Special case where we have to first reduce the accumulation register C0
2557  typedef std::conditional_t<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket> SResPacketHalf;
2558  typedef std::conditional_t<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket> SLhsPacketHalf;
2559  typedef std::conditional_t<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket> SRhsPacketHalf;
2560  typedef std::conditional_t<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket> SAccPacketHalf;
2561 
2562  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
2563  SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
2564 
2565  if(depth-endk>0)
2566  {
2567  // We have to handle the last row of the rhs which corresponds to a half-packet
2568  SLhsPacketHalf a0;
2569  SRhsPacketHalf b0;
2570  straits.loadLhsUnaligned(blB, a0);
2571  straits.loadRhs(blA, b0);
2572  SAccPacketHalf c0 = predux_half_dowto4(C0);
2573  straits.madd(a0,b0,c0,b0, fix<0>);
2574  straits.acc(c0, alphav, R);
2575  }
2576  else
2577  {
2578  straits.acc(predux_half_dowto4(C0), alphav, R);
2579  }
2580  res.scatterPacket(i, j2, R);
2581  }
2582  else if (SwappedTraits::LhsProgress==16)
2583  {
2584  // Special case where we have to first reduce the
2585  // accumulation register C0. We specialize the block in
2586  // template form, so that LhsProgress < 16 paths don't
2587  // fail to compile
2588  last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2589  p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
2590  }
2591  else
2592  {
2593  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2594  SResPacket alphav = pset1<SResPacket>(alpha);
2595  straits.acc(C0, alphav, R);
2596  res.scatterPacket(i, j2, R);
2597  }
2598  }
2599  else // scalar path
2600  {
2601  // get a 1 x 4 res block as registers
2602  ResScalar C0(0), C1(0), C2(0), C3(0);
2603 
2604  for(Index k=0; k<depth; k++)
2605  {
2606  LhsScalar A0;
2607  RhsScalar B_0, B_1;
2608 
2609  A0 = blA[k];
2610 
2611  B_0 = blB[0];
2612  B_1 = blB[1];
2613  C0 = cj.pmadd(A0,B_0,C0);
2614  C1 = cj.pmadd(A0,B_1,C1);
2615 
2616  B_0 = blB[2];
2617  B_1 = blB[3];
2618  C2 = cj.pmadd(A0,B_0,C2);
2619  C3 = cj.pmadd(A0,B_1,C3);
2620 
2621  blB += 4;
2622  }
2623  res(i, j2 + 0) += alpha * C0;
2624  res(i, j2 + 1) += alpha * C1;
2625  res(i, j2 + 2) += alpha * C2;
2626  res(i, j2 + 3) += alpha * C3;
2627  }
2628  }
2629  }
2630  // remaining columns
2631  for(Index j2=packet_cols4; j2<cols; j2++)
2632  {
2633  // loop on each row of the lhs (1*LhsProgress x depth)
2634  for(Index i=peeled_mc_quarter; i<rows; i+=1)
2635  {
2636  const LhsScalar* blA = &blockA[i*strideA+offsetA];
2637  prefetch(&blA[0]);
2638  // gets a 1 x 1 res block as registers
2639  ResScalar C0(0);
2640  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2641  for(Index k=0; k<depth; k++)
2642  {
2643  LhsScalar A0 = blA[k];
2644  RhsScalar B_0 = blB[k];
2645  C0 = cj.pmadd(A0, B_0, C0);
2646  }
2647  res(i, j2) += alpha * C0;
2648  }
2649  }
2650  }
2651  }
2652 
2653 
2654 // pack a block of the lhs
2655 // The traversal is as follows (mr==4):
2656 // 0 4 8 12 ...
2657 // 1 5 9 13 ...
2658 // 2 6 10 14 ...
2659 // 3 7 11 15 ...
2660 //
2661 // 16 20 24 28 ...
2662 // 17 21 25 29 ...
2663 // 18 22 26 30 ...
2664 // 19 23 27 31 ...
2665 //
2666 // 32 33 34 35 ...
2667 // 36 37 38 39 ...
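// A minimal scalar sketch of the numbering above (illustration only, assuming mr==4; the
// implementation below uses packet loads instead of scalar copies):
//
//   Index count = 0;
//   for(Index i=0; i+4<=rows; i+=4)      // one pack of 4 rows
//     for(Index k=0; k<depth; k++)       // sweep the depth
//       for(Index w=0; w<4; w++)
//         blockA[count++] = lhs(i+w, k); // 0,1,2,3 then 4,5,6,7, ...
//   // remaining rows (the 32 33 34 35 block above) are packed one full row at a time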
2668 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2669 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2670 {
2671  typedef typename DataMapper::LinearMapper LinearMapper;
2672  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2673 };
2674 
2675 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2676 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2677  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2678 {
2679  typedef typename unpacket_traits<Packet>::half HalfPacket;
2680  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2681  enum { PacketSize = unpacket_traits<Packet>::size,
2682  HalfPacketSize = unpacket_traits<HalfPacket>::size,
2683  QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2684  HasHalf = (int)HalfPacketSize < (int)PacketSize,
2685  HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2686 
2687  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
2688  EIGEN_UNUSED_VARIABLE(stride);
2689  EIGEN_UNUSED_VARIABLE(offset);
2690  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2691  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
2692  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2693  Index count = 0;
2694 
2695  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
2696  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
2697  const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
2698  const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
2699  const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
2700  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2701  const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
2702  : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
2703 
2704  Index i=0;
2705 
2706  // Pack 3 packets
2707  if(Pack1>=3*PacketSize)
2708  {
2709  for(; i<peeled_mc3; i+=3*PacketSize)
2710  {
2711  if(PanelMode) count += (3*PacketSize) * offset;
2712 
2713  for(Index k=0; k<depth; k++)
2714  {
2715  Packet A, B, C;
2716  A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2717  B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2718  C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
2719  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2720  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2721  pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
2722  }
2723  if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
2724  }
2725  }
2726  // Pack 2 packets
2727  if(Pack1>=2*PacketSize)
2728  {
2729  for(; i<peeled_mc2; i+=2*PacketSize)
2730  {
2731  if(PanelMode) count += (2*PacketSize) * offset;
2732 
2733  for(Index k=0; k<depth; k++)
2734  {
2735  Packet A, B;
2736  A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2737  B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2738  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2739  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2740  }
2741  if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
2742  }
2743  }
2744  // Pack 1 packets
2745  if(Pack1>=1*PacketSize)
2746  {
2747  for(; i<peeled_mc1; i+=1*PacketSize)
2748  {
2749  if(PanelMode) count += (1*PacketSize) * offset;
2750 
2751  for(Index k=0; k<depth; k++)
2752  {
2753  Packet A;
2754  A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2755  pstore(blockA+count, cj.pconj(A));
2756  count+=PacketSize;
2757  }
2758  if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
2759  }
2760  }
2761  // Pack half packets
2762  if(HasHalf && Pack1>=HalfPacketSize)
2763  {
2764  for(; i<peeled_mc_half; i+=HalfPacketSize)
2765  {
2766  if(PanelMode) count += (HalfPacketSize) * offset;
2767 
2768  for(Index k=0; k<depth; k++)
2769  {
2770  HalfPacket A;
2771  A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
2772  pstoreu(blockA+count, cj.pconj(A));
2773  count+=HalfPacketSize;
2774  }
2775  if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
2776  }
2777  }
2778  // Pack quarter packets
2779  if(HasQuarter && Pack1>=QuarterPacketSize)
2780  {
2781  for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
2782  {
2783  if(PanelMode) count += (QuarterPacketSize) * offset;
2784 
2785  for(Index k=0; k<depth; k++)
2786  {
2787  QuarterPacket A;
2788  A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
2789  pstoreu(blockA+count, cj.pconj(A));
2790  count+=QuarterPacketSize;
2791  }
2792  if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
2793  }
2794  }
2795  // Pack2 may be *smaller* than PacketSize—that happens for
2796  // products like real * complex, where we have to go half the
2797  // progress on the lhs in order to duplicate those operands to
2798  // address both real & imaginary parts on the rhs. This portion will
2799  // pack those half ones until they match the number expected on the
2800  // last peeling loop at this point (for the rhs).
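// (For instance, a real float lhs times a std::complex<float> rhs: each packed lhs value must
// later be combined with both the real and imaginary parts of an rhs coefficient, hence the
// smaller Pack2 granularity handled here.)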
2801  if(Pack2<PacketSize && Pack2>1)
2802  {
2803  for(; i<peeled_mc0; i+=last_lhs_progress)
2804  {
2805  if(PanelMode) count += last_lhs_progress * offset;
2806 
2807  for(Index k=0; k<depth; k++)
2808  for(Index w=0; w<last_lhs_progress; w++)
2809  blockA[count++] = cj(lhs(i+w, k));
2810 
2811  if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
2812  }
2813  }
2814  // Pack scalars
2815  for(; i<rows; i++)
2816  {
2817  if(PanelMode) count += offset;
2818  for(Index k=0; k<depth; k++)
2819  blockA[count++] = cj(lhs(i, k));
2820  if(PanelMode) count += (stride-offset-depth);
2821  }
2822 }
2823 
2824 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2825 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2826 {
2827  typedef typename DataMapper::LinearMapper LinearMapper;
2828  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2829 };
2830 
2831 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2832 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2833  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2834 {
2835  typedef typename unpacket_traits<Packet>::half HalfPacket;
2836  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2837  enum { PacketSize = unpacket_traits<Packet>::size,
2838  HalfPacketSize = unpacket_traits<HalfPacket>::size,
2839  QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2840  HasHalf = (int)HalfPacketSize < (int)PacketSize,
2841  HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2842 
2843  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
2844  EIGEN_UNUSED_VARIABLE(stride);
2845  EIGEN_UNUSED_VARIABLE(offset);
2846  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2847  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2848  Index count = 0;
2849  bool gone_half = false, gone_quarter = false, gone_last = false;
2850 
2851  Index i = 0;
2852  Index pack = Pack1;
2853  Index psize = PacketSize;
2854  while(pack>0)
2855  {
2856  Index remaining_rows = rows-i;
2857  Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
2858  Index starting_pos = i;
2859  for(; i<peeled_mc; i+=pack)
2860  {
2861  if(PanelMode) count += pack * offset;
2862 
2863  Index k=0;
2864  if(pack>=psize && psize >= QuarterPacketSize)
2865  {
2866  const Index peeled_k = (depth/psize)*psize;
2867  for(; k<peeled_k; k+=psize)
2868  {
2869  for (Index m = 0; m < pack; m += psize)
2870  {
2871  if (psize == PacketSize) {
2872  PacketBlock<Packet> kernel;
2873  for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
2874  ptranspose(kernel);
2875  for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2876  } else if (HasHalf && psize == HalfPacketSize) {
2877  gone_half = true;
2878  PacketBlock<HalfPacket> kernel_half;
2879  for (Index p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
2880  ptranspose(kernel_half);
2881  for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
2882  } else if (HasQuarter && psize == QuarterPacketSize) {
2883  gone_quarter = true;
2884  PacketBlock<QuarterPacket> kernel_quarter;
2885  for (Index p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
2886  ptranspose(kernel_quarter);
2887  for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
2888  }
2889  }
2890  count += psize*pack;
2891  }
2892  }
2893 
2894  for(; k<depth; k++)
2895  {
2896  Index w=0;
2897  for(; w<pack-3; w+=4)
2898  {
2899  Scalar a(cj(lhs(i+w+0, k))),
2900  b(cj(lhs(i+w+1, k))),
2901  c(cj(lhs(i+w+2, k))),
2902  d(cj(lhs(i+w+3, k)));
2903  blockA[count++] = a;
2904  blockA[count++] = b;
2905  blockA[count++] = c;
2906  blockA[count++] = d;
2907  }
2908  if(pack%4)
2909  for(;w<pack;++w)
2910  blockA[count++] = cj(lhs(i+w, k));
2911  }
2912 
2913  if(PanelMode) count += pack * (stride-offset-depth);
2914  }
2915 
2916  pack -= psize;
2917  Index left = rows - i;
2918  if (pack <= 0) {
2919  if (!gone_last &&
2920  (starting_pos == i || left >= psize/2 || left >= psize/4) &&
2921  ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
2922  (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2923  psize /= 2;
2924  pack = psize;
2925  continue;
2926  }
2927  // Pack2 may be *smaller* than PacketSize—that happens for
2928  // products like real * complex, where we have to go half the
2929  // progress on the lhs in order to duplicate those operands to
2930  // address both real & imaginary parts on the rhs. This portion will
2931  // pack those half ones until they match the number expected on the
2932  // last peeling loop at this point (for the rhs).
2933  if (Pack2 < PacketSize && !gone_last) {
2934  gone_last = true;
2935  psize = pack = left & ~1;
2936  }
2937  }
2938  }
2939 
2940  for(; i<rows; i++)
2941  {
2942  if(PanelMode) count += offset;
2943  for(Index k=0; k<depth; k++)
2944  blockA[count++] = cj(lhs(i, k));
2945  if(PanelMode) count += (stride-offset-depth);
2946  }
2947 }
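
For the full-packet step above (load psize packets along k, ptranspose, store), the net effect is that a pack x psize tile of the lhs is written out so that the pack entries sharing the same k are contiguous in blockA. A scalar equivalent of one such step, written as a hypothetical stand-alone helper (not Eigen code):

// Scalar equivalent of one full-packet packing step of the row-major lhs
// packer above: blockA[count + p*pack + m] receives lhs(i + m, k + p), i.e.
// inside the micro-panel all `pack` rows of a given depth index are adjacent.
#include <cstddef>

template <typename Scalar, typename LhsAccessor>
void pack_lhs_tile(Scalar* blockA, const LhsAccessor& lhs,
                   std::ptrdiff_t i, std::ptrdiff_t k,
                   std::ptrdiff_t pack, std::ptrdiff_t psize,
                   std::ptrdiff_t& count)
{
  for (std::ptrdiff_t p = 0; p < psize; ++p)     // depth index within the tile
    for (std::ptrdiff_t m = 0; m < pack; ++m)    // row index within the panel
      blockA[count + p * pack + m] = lhs(i + m, k + p);
  count += psize * pack;
}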
2948 
2949 // copy a complete panel of the rhs
2950 // this version is optimized for column major matrices
2951 // The traversal order is as follows (nr==4):
2952 // 0 1 2 3 12 13 14 15 24 27
2953 // 4 5 6 7 16 17 18 19 25 28
2954 // 8 9 10 11 20 21 22 23 26 29
2955 // . . . . . . . . . .
2956 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2957 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2958 {
2959  typedef typename packet_traits<Scalar>::type Packet;
2960  typedef typename DataMapper::LinearMapper LinearMapper;
2961  enum { PacketSize = packet_traits<Scalar>::size };
2962  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2963 };
2964 
2965 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2966 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2967  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2968 {
2969  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
2970  EIGEN_UNUSED_VARIABLE(stride);
2971  EIGEN_UNUSED_VARIABLE(offset);
2972  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2973  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2974  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2975  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2976  Index count = 0;
2977  const Index peeled_k = (depth/PacketSize)*PacketSize;
2978 
2979 #if EIGEN_ARCH_ARM64
2980  EIGEN_IF_CONSTEXPR(nr>=8)
2981  {
2982  for(Index j2=0; j2<packet_cols8; j2+=8)
2983  {
2984  // skip what we have before
2985  if(PanelMode) count += 8 * offset;
2986  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2987  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2988  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2989  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2990  const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4);
2991  const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5);
2992  const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
2993  const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);
2994  Index k = 0;
2995  if (PacketSize % 2 == 0 && PacketSize <= 8) // 2 4 8
2996  {
2997  for (; k < peeled_k; k += PacketSize)
2998  {
2999  if (PacketSize == 2)
3000  {
3001  PacketBlock<Packet, PacketSize==2 ?2:PacketSize> kernel0, kernel1, kernel2, kernel3;
3002  kernel0.packet[0%PacketSize] = dm0.template loadPacket<Packet>(k);
3003  kernel0.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
3004  kernel1.packet[0%PacketSize] = dm2.template loadPacket<Packet>(k);
3005  kernel1.packet[1%PacketSize] = dm3.template loadPacket<Packet>(k);
3006  kernel2.packet[0%PacketSize] = dm4.template loadPacket<Packet>(k);
3007  kernel2.packet[1%PacketSize] = dm5.template loadPacket<Packet>(k);
3008  kernel3.packet[0%PacketSize] = dm6.template loadPacket<Packet>(k);
3009  kernel3.packet[1%PacketSize] = dm7.template loadPacket<Packet>(k);
3010  ptranspose(kernel0);
3011  ptranspose(kernel1);
3012  ptranspose(kernel2);
3013  ptranspose(kernel3);
3014 
3015  pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
3016  pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
3017  pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel2.packet[0 % PacketSize]));
3018  pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel3.packet[0 % PacketSize]));
3019 
3020  pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
3021  pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
3022  pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel2.packet[1 % PacketSize]));
3023  pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel3.packet[1 % PacketSize]));
3024  count+=8*PacketSize;
3025  }
3026  else if (PacketSize == 4)
3027  {
3028  PacketBlock<Packet, PacketSize == 4?4:PacketSize> kernel0, kernel1;
3029 
3030  kernel0.packet[0%PacketSize] = dm0.template loadPacket<Packet>(k);
3031  kernel0.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
3032  kernel0.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
3033  kernel0.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
3034  kernel1.packet[0%PacketSize] = dm4.template loadPacket<Packet>(k);
3035  kernel1.packet[1%PacketSize] = dm5.template loadPacket<Packet>(k);
3036  kernel1.packet[2%PacketSize] = dm6.template loadPacket<Packet>(k);
3037  kernel1.packet[3%PacketSize] = dm7.template loadPacket<Packet>(k);
3038  ptranspose(kernel0);
3039  ptranspose(kernel1);
3040 
3041  pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel0.packet[0%PacketSize]));
3042  pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel1.packet[0%PacketSize]));
3043  pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel0.packet[1%PacketSize]));
3044  pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel1.packet[1%PacketSize]));
3045  pstoreu(blockB+count+4*PacketSize, cj.pconj(kernel0.packet[2%PacketSize]));
3046  pstoreu(blockB+count+5*PacketSize, cj.pconj(kernel1.packet[2%PacketSize]));
3047  pstoreu(blockB+count+6*PacketSize, cj.pconj(kernel0.packet[3%PacketSize]));
3048  pstoreu(blockB+count+7*PacketSize, cj.pconj(kernel1.packet[3%PacketSize]));
3049  count+=8*PacketSize;
3050  }
3051  else if (PacketSize == 8)
3052  {
3053  PacketBlock<Packet, PacketSize==8?8:PacketSize> kernel0;
3054 
3055  kernel0.packet[0%PacketSize] = dm0.template loadPacket<Packet>(k);
3056  kernel0.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
3057  kernel0.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
3058  kernel0.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
3059  kernel0.packet[4%PacketSize] = dm4.template loadPacket<Packet>(k);
3060  kernel0.packet[5%PacketSize] = dm5.template loadPacket<Packet>(k);
3061  kernel0.packet[6%PacketSize] = dm6.template loadPacket<Packet>(k);
3062  kernel0.packet[7%PacketSize] = dm7.template loadPacket<Packet>(k);
3063  ptranspose(kernel0);
3064 
3065  pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel0.packet[0%PacketSize]));
3066  pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel0.packet[1%PacketSize]));
3067  pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel0.packet[2%PacketSize]));
3068  pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel0.packet[3%PacketSize]));
3069  pstoreu(blockB+count+4*PacketSize, cj.pconj(kernel0.packet[4%PacketSize]));
3070  pstoreu(blockB+count+5*PacketSize, cj.pconj(kernel0.packet[5%PacketSize]));
3071  pstoreu(blockB+count+6*PacketSize, cj.pconj(kernel0.packet[6%PacketSize]));
3072  pstoreu(blockB+count+7*PacketSize, cj.pconj(kernel0.packet[7%PacketSize]));
3073  count+=8*PacketSize;
3074  }
3075  }
3076  }
3077 
3078  for(; k<depth; k++)
3079  {
3080  blockB[count+0] = cj(dm0(k));
3081  blockB[count+1] = cj(dm1(k));
3082  blockB[count+2] = cj(dm2(k));
3083  blockB[count+3] = cj(dm3(k));
3084  blockB[count+4] = cj(dm4(k));
3085  blockB[count+5] = cj(dm5(k));
3086  blockB[count+6] = cj(dm6(k));
3087  blockB[count+7] = cj(dm7(k));
3088  count += 8;
3089  }
3090  // skip what we have after
3091  if(PanelMode) count += 8 * (stride-offset-depth);
3092  }
3093  }
3094 #endif
3095 
3096  EIGEN_IF_CONSTEXPR(nr>=4)
3097  {
3098  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
3099  {
3100  // skip what we have before
3101  if(PanelMode) count += 4 * offset;
3102  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
3103  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
3104  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
3105  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
3106 
3107  Index k=0;
3108  if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
3109  {
3110  for(; k<peeled_k; k+=PacketSize) {
3111  PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
3112  kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
3113  kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
3114  kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
3115  kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
3116  ptranspose(kernel);
3117  pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
3118  pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
3119  pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
3120  pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
3121  count+=4*PacketSize;
3122  }
3123  }
3124  for(; k<depth; k++)
3125  {
3126  blockB[count+0] = cj(dm0(k));
3127  blockB[count+1] = cj(dm1(k));
3128  blockB[count+2] = cj(dm2(k));
3129  blockB[count+3] = cj(dm3(k));
3130  count += 4;
3131  }
3132  // skip what we have after
3133  if(PanelMode) count += 4 * (stride-offset-depth);
3134  }
3135  }
3136 
3137  // copy the remaining columns one at a time (nr==1)
3138  for(Index j2=packet_cols4; j2<cols; ++j2)
3139  {
3140  if(PanelMode) count += offset;
3141  const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
3142  for(Index k=0; k<depth; k++)
3143  {
3144  blockB[count] = cj(dm0(k));
3145  count += 1;
3146  }
3147  if(PanelMode) count += (stride-offset-depth);
3148  }
3149 }
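
To make the column-major traversal order from the comment above concrete, here is a hypothetical scalar reference of the nr==4 case, with PanelMode and conjugation left out: columns are taken four at a time and interleaved along k, which reproduces the 0 1 2 3 / 4 5 6 7 / ... layout, and any leftover columns are appended one at a time.

// Hypothetical scalar reference (not Eigen code) of the nr==4 column-major
// rhs packing: for each k, the four values rhs(k, j2..j2+3) land adjacently
// in blockB, matching the traversal-order diagram above.
#include <cstddef>

template <typename Scalar, typename RhsAccessor>
void pack_rhs_colmajor_nr4(Scalar* blockB, const RhsAccessor& rhs,
                           std::ptrdiff_t depth, std::ptrdiff_t cols)
{
  std::ptrdiff_t count = 0;
  std::ptrdiff_t packet_cols4 = (cols / 4) * 4;
  for (std::ptrdiff_t j2 = 0; j2 < packet_cols4; j2 += 4)   // panel of 4 columns
    for (std::ptrdiff_t k = 0; k < depth; ++k)              // interleave along k
      for (std::ptrdiff_t j = 0; j < 4; ++j)
        blockB[count++] = rhs(k, j2 + j);
  for (std::ptrdiff_t j2 = packet_cols4; j2 < cols; ++j2)   // leftover columns one by one
    for (std::ptrdiff_t k = 0; k < depth; ++k)
      blockB[count++] = rhs(k, j2);
}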
3150 
3151 // this version is optimized for row major matrices
3152 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3153 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3154 {
3155  typedef typename packet_traits<Scalar>::type Packet;
3156  typedef typename unpacket_traits<Packet>::half HalfPacket;
3157  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
3158  typedef typename DataMapper::LinearMapper LinearMapper;
3159  enum { PacketSize = packet_traits<Scalar>::size,
3160  HalfPacketSize = unpacket_traits<HalfPacket>::size,
3161  QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
3162  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
3163  {
3164  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
3165  EIGEN_UNUSED_VARIABLE(stride);
3166  EIGEN_UNUSED_VARIABLE(offset);
3167  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
3168  const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
3169  const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
3170  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
3171  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
3172  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
3173  Index count = 0;
3174 
3175 #if EIGEN_ARCH_ARM64
3176  EIGEN_IF_CONSTEXPR(nr>=8)
3177  {
3178  for(Index j2=0; j2<packet_cols8; j2+=8)
3179  {
3180  // skip what we have before
3181  if(PanelMode) count += 8 * offset;
3182  for(Index k=0; k<depth; k++)
3183  {
3184  if (PacketSize==8) {
3185  Packet A = rhs.template loadPacket<Packet>(k, j2);
3186  pstoreu(blockB+count, cj.pconj(A));
3187  count += PacketSize;
3188  } else if (PacketSize==4) {
3189  Packet A = rhs.template loadPacket<Packet>(k, j2);
3190  Packet B = rhs.template loadPacket<Packet>(k, j2 + 4);
3191  pstoreu(blockB+count, cj.pconj(A));
3192  pstoreu(blockB+count+PacketSize, cj.pconj(B));
3193  count += 2*PacketSize;
3194  } else {
3195  const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
3196  blockB[count+0] = cj(dm0(0));
3197  blockB[count+1] = cj(dm0(1));
3198  blockB[count+2] = cj(dm0(2));
3199  blockB[count+3] = cj(dm0(3));
3200  blockB[count+4] = cj(dm0(4));
3201  blockB[count+5] = cj(dm0(5));
3202  blockB[count+6] = cj(dm0(6));
3203  blockB[count+7] = cj(dm0(7));
3204  count += 8;
3205  }
3206  }
3207  // skip what we have after
3208  if(PanelMode) count += 8 * (stride-offset-depth);
3209  }
3210  }
3211 #endif
3212 
3213  if(nr>=4)
3214  {
3215  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
3216  {
3217  // skip what we have before
3218  if(PanelMode) count += 4 * offset;
3219  for(Index k=0; k<depth; k++)
3220  {
3221  if (PacketSize==4) {
3222  Packet A = rhs.template loadPacket<Packet>(k, j2);
3223  pstoreu(blockB+count, cj.pconj(A));
3224  count += PacketSize;
3225  } else if (HasHalf && HalfPacketSize==4) {
3226  HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
3227  pstoreu(blockB+count, cj.pconj(A));
3228  count += HalfPacketSize;
3229  } else if (HasQuarter && QuarterPacketSize==4) {
3230  QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
3231  pstoreu(blockB+count, cj.pconj(A));
3232  count += QuarterPacketSize;
3233  } else {
3234  const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
3235  blockB[count+0] = cj(dm0(0));
3236  blockB[count+1] = cj(dm0(1));
3237  blockB[count+2] = cj(dm0(2));
3238  blockB[count+3] = cj(dm0(3));
3239  count += 4;
3240  }
3241  }
3242  // skip what we have after
3243  if(PanelMode) count += 4 * (stride-offset-depth);
3244  }
3245  }
3246  // copy the remaining columns one at a time (nr==1)
3247  for(Index j2=packet_cols4; j2<cols; ++j2)
3248  {
3249  if(PanelMode) count += offset;
3250  for(Index k=0; k<depth; k++)
3251  {
3252  blockB[count] = cj(rhs(k, j2));
3253  count += 1;
3254  }
3255  if(PanelMode) count += stride-offset-depth;
3256  }
3257  }
3258 };
3259 
3260 } // end namespace internal
3261 
3264 inline std::ptrdiff_t l1CacheSize()
3265 {
3266  std::ptrdiff_t l1, l2, l3;
3267  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
3268  return l1;
3269 }
3270 
3273 inline std::ptrdiff_t l2CacheSize()
3274 {
3275  std::ptrdiff_t l1, l2, l3;
3276  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
3277  return l2;
3278 }
3279 
3283 inline std::ptrdiff_t l3CacheSize()
3284 {
3285  std::ptrdiff_t l1, l2, l3;
3286  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
3287  return l3;
3288 }
3289 
3295 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
3296 {
3297  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
3298 }
3299 
3300 } // end namespace Eigen
3301 
3302 #endif // EIGEN_GENERAL_BLOCK_PANEL_H
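
The four functions above (l1CacheSize, l2CacheSize, l3CacheSize, setCpuCacheSizes) are the public knobs feeding the blocking heuristics. A minimal usage sketch; the 48KB/1MB/32MB figures are purely illustrative:

#include <Eigen/Core>
#include <iostream>

int main()
{
  // Cache sizes (in bytes) Eigen currently assumes when computing blocking sizes.
  std::cout << "L1: " << Eigen::l1CacheSize()
            << "  L2: " << Eigen::l2CacheSize()
            << "  L3: " << Eigen::l3CacheSize() << '\n';

  // Override them, e.g. when the target CPU's cache topology is known at run time;
  // subsequent matrix products derive their blocking parameters from these values.
  Eigen::setCpuCacheSizes(48 * 1024, 1024 * 1024, 32 * 1024 * 1024);
  return 0;
}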