MatrixProduct.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
5 // Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 #ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
12 #define EIGEN_MATRIX_PRODUCT_ALTIVEC_H
13 
14 #ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK
15 #define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1
16 #endif
17 
18 #include "MatrixProductCommon.h"
19 
20 #if !defined(EIGEN_ALTIVEC_DISABLE_MMA)
21 #define EIGEN_ALTIVEC_DISABLE_MMA 0
22 #endif
23 
24 // Check for MMA builtin support.
25 #if !EIGEN_ALTIVEC_DISABLE_MMA && defined(__has_builtin)
26 #if __has_builtin(__builtin_mma_assemble_acc)
27  #define EIGEN_ALTIVEC_MMA_SUPPORT
28 #endif
29 #endif
30 
31 // Check if and how we should actually use MMA if supported.
32 #if defined(EIGEN_ALTIVEC_MMA_SUPPORT)
33 
34 #if !defined(EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH)
35 #define EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH 0
36 #endif
37 
38 // Check if we want to enable dynamic dispatch. Not supported by LLVM.
39 #if EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH && !EIGEN_COMP_LLVM
40 #define EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH 1
41 // Otherwise, use MMA by default if available.
42 #elif defined(__MMA__)
43 #define EIGEN_ALTIVEC_MMA_ONLY 1
44 #endif
45 
46 #endif // EIGEN_ALTIVEC_MMA_SUPPORT
47 
48 #if defined(EIGEN_ALTIVEC_MMA_ONLY) || defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
49  #include "MatrixProductMMA.h"
50 #endif
51 
52 
57 #include "../../InternalHeaderCheck.h"
58 
59 namespace Eigen {
60 
61 namespace internal {
62 
63 
66 template<typename Scalar>
67 struct quad_traits
68 {
69  typedef typename packet_traits<Scalar>::type vectortype;
70  typedef PacketBlock<vectortype,4> type;
71  typedef vectortype rhstype;
72  enum
73  {
74  vectorsize = packet_traits<Scalar>::size,
75  size = 4,
76  rows = 4
77  };
78 };
79 
80 template<>
81 struct quad_traits<double>
82 {
83  typedef Packet2d vectortype;
84  typedef PacketBlock<vectortype,4> type;
85  typedef PacketBlock<Packet2d,2> rhstype;
86  enum
87  {
88  vectorsize = packet_traits<double>::size,
89  size = 2,
90  rows = 4
91  };
92 };
93 
94 template<>
95 struct quad_traits<bfloat16>
96 {
97  typedef Packet8bf vectortype;
98  typedef PacketBlock<vectortype,4> type;
99  typedef vectortype rhstype;
100  enum
101  {
102  vectorsize = packet_traits<bfloat16>::size,
103  size = 8,
104  rows = 4
105  };
106 };
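// Illustrative check (an added sketch, not from the original source): on 128-bit
// Altivec/VSX vectors these traits resolve to 4 floats, 2 doubles or 8 bfloat16
// values per packet, which is what the packing and kernel code below relies on.
static_assert(quad_traits<float>::vectorsize == 4, "4 floats per 128-bit vector");
static_assert(quad_traits<double>::vectorsize == 2, "2 doubles per 128-bit vector");
static_assert(quad_traits<bfloat16>::vectorsize == 8, "8 bfloat16 values per 128-bit vector");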
107 
108 // MatrixProduct decomposes complex data into a separate real vector and an imaginary vector; this turned out
109 // to be faster than Eigen's usual approach of keeping real/imaginary pairs interleaved in a single vector. The
110 // constants below are used to extract from, and convert between, Eigen's layout and the MatrixProduct layout.
111 
112 const static Packet16uc p16uc_GETREAL32 = { 0, 1, 2, 3,
113  8, 9, 10, 11,
114  16, 17, 18, 19,
115  24, 25, 26, 27};
116 
117 const static Packet16uc p16uc_GETIMAG32 = { 4, 5, 6, 7,
118  12, 13, 14, 15,
119  20, 21, 22, 23,
120  28, 29, 30, 31};
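// Illustrative sketch of how these masks are used (splitComplexF is a hypothetical
// helper added only for illustration): given two Packet4f vectors that each hold two
// interleaved std::complex<float> values, a single vec_perm per mask gathers the four
// real parts into one packet and the four imaginary parts into the other.
EIGEN_ALWAYS_INLINE void splitComplexF(Packet4f v0, Packet4f v1, Packet4f& re, Packet4f& im)
{
  re = vec_perm(v0, v1, p16uc_GETREAL32);  // real parts of the four complex values
  im = vec_perm(v0, v1, p16uc_GETIMAG32);  // imaginary parts of the four complex values
}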
121 
122 
140 template<typename Scalar, int StorageOrder>
141 EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
142 {
143  std::complex<Scalar> v;
144  if(i < j)
145  {
146  v.real( dt(j,i).real());
147  v.imag(-dt(j,i).imag());
148  } else if(i > j)
149  {
150  v.real( dt(i,j).real());
151  v.imag( dt(i,j).imag());
152  } else {
153  v.real( dt(i,j).real());
154  v.imag((Scalar)0.0);
155  }
156  return v;
157 }
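// Worked example (illustrative): with only one triangle of a Hermitian matrix stored,
// getAdjointVal(0, 1, dt) returns conj(dt(1, 0)), getAdjointVal(1, 0, dt) returns
// dt(1, 0) unchanged, and getAdjointVal(1, 1, dt) returns the diagonal entry with its
// imaginary part forced to zero, so the packing routines below can materialize the
// full self-adjoint matrix from the stored triangle.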
158 
159 template<typename Scalar, int StorageOrder, int N>
160 EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
161 {
162  const Index depth = k2 + rows;
163  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> rhs(_rhs, rhsStride);
164  const Index vectorSize = N*quad_traits<Scalar>::vectorsize;
165  const Index vectorDelta = vectorSize * rows;
166  Scalar* blockBf = reinterpret_cast<Scalar *>(blockB);
167 
168  Index rir = 0, rii, j = 0;
169  for(; j + vectorSize <= cols; j+=vectorSize)
170  {
171  rii = rir + vectorDelta;
172 
173  for(Index i = k2; i < depth; i++)
174  {
175  for(Index k = 0; k < vectorSize; k++)
176  {
177  std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(i, j + k, rhs);
178 
179  blockBf[rir + k] = v.real();
180  blockBf[rii + k] = v.imag();
181  }
182  rir += vectorSize;
183  rii += vectorSize;
184  }
185 
186  rir += vectorDelta;
187  }
188 
189  for(; j < cols; j++)
190  {
191  rii = rir + rows;
192 
193  for(Index i = k2; i < depth; i++)
194  {
195  std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(i, j, rhs);
196 
197  blockBf[rir] = v.real();
198  blockBf[rii] = v.imag();
199 
200  rir += 1;
201  rii += 1;
202  }
203 
204  rir += rows;
205  }
206 }
207 
208 template<typename Scalar, int StorageOrder>
209 EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
210 {
211  const Index depth = cols;
212  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> lhs(_lhs, lhsStride);
213  const Index vectorSize = quad_traits<Scalar>::vectorsize;
214  const Index vectorDelta = vectorSize * depth;
215  Scalar* blockAf = reinterpret_cast<Scalar *>(blockA);
216 
217  Index rir = 0, rii, j = 0;
218  for(; j + vectorSize <= rows; j+=vectorSize)
219  {
220  rii = rir + vectorDelta;
221 
222  for(Index i = 0; i < depth; i++)
223  {
224  for(Index k = 0; k < vectorSize; k++)
225  {
226  std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(j+k, i, lhs);
227 
228  blockAf[rir + k] = v.real();
229  blockAf[rii + k] = v.imag();
230  }
231  rir += vectorSize;
232  rii += vectorSize;
233  }
234 
235  rir += vectorDelta;
236  }
237 
238  if (j < rows)
239  {
240  rii = rir + ((rows - j) * depth);
241 
242  for(Index i = 0; i < depth; i++)
243  {
244  Index k = j;
245  for(; k < rows; k++)
246  {
247  std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(k, i, lhs);
248 
249  blockAf[rir] = v.real();
250  blockAf[rii] = v.imag();
251 
252  rir += 1;
253  rii += 1;
254  }
255  }
256  }
257 }
258 
259 template<typename Scalar, int StorageOrder, int N>
260 EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
261 {
262  const Index depth = k2 + rows;
263  const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(_rhs, rhsStride);
264  const Index vectorSize = quad_traits<Scalar>::vectorsize;
265 
266  Index ri = 0, j = 0;
267  for(; j + N*vectorSize <= cols; j+=N*vectorSize)
268  {
269  Index i = k2;
270  for(; i < depth; i++)
271  {
272  for(Index k = 0; k < N*vectorSize; k++)
273  {
274  if(i <= j+k)
275  blockB[ri + k] = rhs(j+k, i);
276  else
277  blockB[ri + k] = rhs(i, j+k);
278  }
279  ri += N*vectorSize;
280  }
281  }
282 
283  for(; j < cols; j++)
284  {
285  for(Index i = k2; i < depth; i++)
286  {
287  if(j <= i)
288  blockB[ri] = rhs(i, j);
289  else
290  blockB[ri] = rhs(j, i);
291  ri += 1;
292  }
293  }
294 }
295 
296 template<typename Scalar, int StorageOrder>
297 EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
298 {
299  const Index depth = cols;
300  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs, lhsStride);
301  const Index vectorSize = quad_traits<Scalar>::vectorsize;
302 
303  Index ri = 0, j = 0;
304  for(; j + vectorSize <= rows; j+=vectorSize)
305  {
306  Index i = 0;
307 
308  for(; i < depth; i++)
309  {
310  for(Index k = 0; k < vectorSize; k++)
311  {
312  if(i <= j+k)
313  blockA[ri + k] = lhs(j+k, i);
314  else
315  blockA[ri + k] = lhs(i, j+k);
316  }
317  ri += vectorSize;
318  }
319  }
320 
321  if (j < rows)
322  {
323  for(Index i = 0; i < depth; i++)
324  {
325  Index k = j;
326  for(; k < rows; k++)
327  {
328  if(i <= k)
329  blockA[ri] = lhs(k, i);
330  else
331  blockA[ri] = lhs(i, k);
332  ri += 1;
333  }
334  }
335  }
336 }
337 
338 template<typename Index, int nr, int StorageOrder>
339 struct symm_pack_rhs<std::complex<float>, Index, nr, StorageOrder>
340 {
341  void operator()(std::complex<float>* blockB, const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
342  {
343  symm_pack_complex_rhs_helper<float, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
344  }
345 };
346 
347 template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
348 struct symm_pack_lhs<std::complex<float>, Index, Pack1, Pack2_dummy, StorageOrder>
349 {
350  void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols, Index rows)
351  {
352  symm_pack_complex_lhs_helper<float, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
353  }
354 };
355 
356 // *********** symm_pack std::complex<float64> ***********
357 
358 template<typename Index, int nr, int StorageOrder>
359 struct symm_pack_rhs<std::complex<double>, Index, nr, StorageOrder>
360 {
361  void operator()(std::complex<double>* blockB, const std::complex<double>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
362  {
363  symm_pack_complex_rhs_helper<double, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
364  }
365 };
366 
367 template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
368 struct symm_pack_lhs<std::complex<double>, Index, Pack1, Pack2_dummy, StorageOrder>
369 {
370  void operator()(std::complex<double>* blockA, const std::complex<double>* _lhs, Index lhsStride, Index cols, Index rows)
371  {
372  symm_pack_complex_lhs_helper<double, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
373  }
374 };
375 
376 // *********** symm_pack float32 ***********
377 template<typename Index, int nr, int StorageOrder>
378 struct symm_pack_rhs<float, Index, nr, StorageOrder>
379 {
380  void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
381  {
382  symm_pack_rhs_helper<float, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
383  }
384 };
385 
386 template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
387 struct symm_pack_lhs<float, Index, Pack1, Pack2_dummy, StorageOrder>
388 {
389  void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows)
390  {
391  symm_pack_lhs_helper<float, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
392  }
393 };
394 
395 // *********** symm_pack float64 ***********
396 template<typename Index, int nr, int StorageOrder>
397 struct symm_pack_rhs<double, Index, nr, StorageOrder>
398 {
399  void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
400  {
401  symm_pack_rhs_helper<double, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
402  }
403 };
404 
405 template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
406 struct symm_pack_lhs<double, Index, Pack1, Pack2_dummy, StorageOrder>
407 {
408  void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows)
409  {
410  symm_pack_lhs_helper<double, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
411  }
412 };
413 
425 template<typename Scalar, typename Packet, int N>
426 EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,N>& block)
427 {
428  const Index size = 16 / sizeof(Scalar);
429  pstore<Scalar>(to + (0 * size), block.packet[0]);
430  pstore<Scalar>(to + (1 * size), block.packet[1]);
431  if (N > 2) {
432  pstore<Scalar>(to + (2 * size), block.packet[2]);
433  }
434  if (N > 3) {
435  pstore<Scalar>(to + (3 * size), block.packet[3]);
436  }
437 }
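// Usage sketch (illustrative only; storeBlockExample is a hypothetical helper): write a
// PacketBlock of four float packets to a packed buffer. storeBlock uses aligned stores
// (pstore), so `to` must point to suitably aligned storage.
EIGEN_ALWAYS_INLINE void storeBlockExample(float* to, PacketBlock<Packet4f,4>& block)
{
  storeBlock<float, Packet4f, 4>(to, block);  // fills to[0..15] with the four packets back to back
}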
438 
439 // General template for lhs & rhs complex packing.
440 template<typename Scalar, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
441 struct dhs_cpack {
442  EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
443  {
444  const Index vectorSize = quad_traits<Scalar>::vectorsize;
445  const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
446  Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
447  Scalar* blockAt = reinterpret_cast<Scalar *>(blockA);
448  Index j = 0;
449 
450  for(; j + vectorSize <= rows; j+=vectorSize)
451  {
452  const DataMapper lhs2 = UseLhs ? lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j);
453  Index i = 0;
454 
455  rii = rir + vectorDelta;
456 
457  for(; i + vectorSize <= depth; i+=vectorSize)
458  {
459  PacketBlock<Packet,4> blockr, blocki;
460  PacketBlock<PacketC,8> cblock;
461 
462  if (UseLhs) {
463  bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, 0, i);
464  } else {
465  bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, i, 0);
466  }
467 
468  blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
469  blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
470  blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
471  blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
472 
473  blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
474  blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
475  blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
476  blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
477 
478  if(Conjugate)
479  {
480  blocki.packet[0] = -blocki.packet[0];
481  blocki.packet[1] = -blocki.packet[1];
482  blocki.packet[2] = -blocki.packet[2];
483  blocki.packet[3] = -blocki.packet[3];
484  }
485 
486  if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs)))
487  {
488  ptranspose(blockr);
489  ptranspose(blocki);
490  }
491 
492  storeBlock<Scalar, Packet, 4>(blockAt + rir, blockr);
493  storeBlock<Scalar, Packet, 4>(blockAt + rii, blocki);
494 
495  rir += 4*vectorSize;
496  rii += 4*vectorSize;
497  }
498  for(; i < depth; i++)
499  {
500  PacketBlock<Packet,1> blockr, blocki;
501  PacketBlock<PacketC,2> cblock;
502 
503  if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs)))
504  {
505  if (UseLhs) {
506  cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);
507  cblock.packet[1] = lhs2.template loadPacket<PacketC>(2, i);
508  } else {
509  cblock.packet[0] = lhs2.template loadPacket<PacketC>(i, 0);
510  cblock.packet[1] = lhs2.template loadPacket<PacketC>(i, 2);
511  }
512  } else {
513  if (UseLhs) {
514  cblock.packet[0] = pload2(lhs2(0, i), lhs2(1, i));
515  cblock.packet[1] = pload2(lhs2(2, i), lhs2(3, i));
516  } else {
517  cblock.packet[0] = pload2(lhs2(i, 0), lhs2(i, 1));
518  cblock.packet[1] = pload2(lhs2(i, 2), lhs2(i, 3));
519  }
520  }
521 
522  blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32);
523  blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32);
524 
525  if(Conjugate)
526  {
527  blocki.packet[0] = -blocki.packet[0];
528  }
529 
530  pstore<Scalar>(blockAt + rir, blockr.packet[0]);
531  pstore<Scalar>(blockAt + rii, blocki.packet[0]);
532 
533  rir += vectorSize;
534  rii += vectorSize;
535  }
536 
537  rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
538  }
539 
540  if (!UseLhs)
541  {
542  if(PanelMode) rir -= (offset*(vectorSize - 1));
543 
544  for(; j < rows; j++)
545  {
546  const DataMapper lhs2 = lhs.getSubMapper(0, j);
547  rii = rir + ((PanelMode) ? stride : depth);
548 
549  for(Index i = 0; i < depth; i++)
550  {
551  blockAt[rir] = lhs2(i, 0).real();
552 
553  if(Conjugate)
554  blockAt[rii] = -lhs2(i, 0).imag();
555  else
556  blockAt[rii] = lhs2(i, 0).imag();
557 
558  rir += 1;
559  rii += 1;
560  }
561 
562  rir += ((PanelMode) ? (2*stride - depth) : depth);
563  }
564  } else {
565  if (j < rows)
566  {
567  if(PanelMode) rir += (offset*(rows - j - vectorSize));
568  rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
569 
570  for(Index i = 0; i < depth; i++)
571  {
572  Index k = j;
573  for(; k < rows; k++)
574  {
575  blockAt[rir] = lhs(k, i).real();
576 
577  if(Conjugate)
578  blockAt[rii] = -lhs(k, i).imag();
579  else
580  blockAt[rii] = lhs(k, i).imag();
581 
582  rir += 1;
583  rii += 1;
584  }
585  }
586  }
587  }
588  }
589 };
590 
591 // General template for lhs & rhs packing.
592 template<typename Scalar, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
593 struct dhs_pack{
594  EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
595  {
596  const Index vectorSize = quad_traits<Scalar>::vectorsize;
597  Index ri = 0, j = 0;
598 
599  for(; j + vectorSize <= rows; j+=vectorSize)
600  {
601  const DataMapper lhs2 = UseLhs ? lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j);
602  Index i = 0;
603 
604  if(PanelMode) ri += vectorSize*offset;
605 
606  for(; i + vectorSize <= depth; i+=vectorSize)
607  {
608  PacketBlock<Packet,4> block;
609 
610  if (UseLhs) {
611  bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block, lhs2, 0, i);
612  } else {
613  bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block, lhs2, i, 0);
614  }
615  if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
616  {
617  ptranspose(block);
618  }
619 
620  storeBlock<Scalar, Packet, 4>(blockA + ri, block);
621 
622  ri += 4*vectorSize;
623  }
624  for(; i < depth; i++)
625  {
626  if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
627  {
628  if (UseLhs) {
629  blockA[ri+0] = lhs2(0, i);
630  blockA[ri+1] = lhs2(1, i);
631  blockA[ri+2] = lhs2(2, i);
632  blockA[ri+3] = lhs2(3, i);
633  } else {
634  blockA[ri+0] = lhs2(i, 0);
635  blockA[ri+1] = lhs2(i, 1);
636  blockA[ri+2] = lhs2(i, 2);
637  blockA[ri+3] = lhs2(i, 3);
638  }
639  } else {
640  Packet lhsV;
641  if (UseLhs) {
642  lhsV = lhs2.template loadPacket<Packet>(0, i);
643  } else {
644  lhsV = lhs2.template loadPacket<Packet>(i, 0);
645  }
646  pstore<Scalar>(blockA + ri, lhsV);
647  }
648 
649  ri += vectorSize;
650  }
651 
652  if(PanelMode) ri += vectorSize*(stride - offset - depth);
653  }
654 
655  if (!UseLhs)
656  {
657  if(PanelMode) ri += offset;
658 
659  for(; j < rows; j++)
660  {
661  const DataMapper lhs2 = lhs.getSubMapper(0, j);
662  for(Index i = 0; i < depth; i++)
663  {
664  blockA[ri] = lhs2(i, 0);
665  ri += 1;
666  }
667 
668  if(PanelMode) ri += stride - depth;
669  }
670  } else {
671  if (j < rows)
672  {
673  if(PanelMode) ri += offset*(rows - j);
674 
675  for(Index i = 0; i < depth; i++)
676  {
677  Index k = j;
678  for(; k < rows; k++)
679  {
680  blockA[ri] = lhs(k, i);
681  ri += 1;
682  }
683  }
684  }
685  }
686  }
687 };
688 
689 // General template for lhs packing, float64 specialization.
690 template<typename DataMapper, int StorageOrder, bool PanelMode>
691 struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, true>
692 {
693  EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
694  {
695  const Index vectorSize = quad_traits<double>::vectorsize;
696  Index ri = 0, j = 0;
697 
698  for(; j + vectorSize <= rows; j+=vectorSize)
699  {
700  const DataMapper lhs2 = lhs.getSubMapper(j, 0);
701  Index i = 0;
702 
703  if(PanelMode) ri += vectorSize*offset;
704 
705  for(; i + vectorSize <= depth; i+=vectorSize)
706  {
707  PacketBlock<Packet2d,2> block;
708  if(StorageOrder == RowMajor)
709  {
710  block.packet[0] = lhs2.template loadPacket<Packet2d>(0, i);
711  block.packet[1] = lhs2.template loadPacket<Packet2d>(1, i);
712 
713  ptranspose(block);
714  } else {
715  block.packet[0] = lhs2.template loadPacket<Packet2d>(0, i + 0);
716  block.packet[1] = lhs2.template loadPacket<Packet2d>(0, i + 1);
717  }
718 
719  storeBlock<double, Packet2d, 2>(blockA + ri, block);
720 
721  ri += 2*vectorSize;
722  }
723  for(; i < depth; i++)
724  {
725  if(StorageOrder == RowMajor)
726  {
727  blockA[ri+0] = lhs2(0, i);
728  blockA[ri+1] = lhs2(1, i);
729  } else {
730  Packet2d lhsV = lhs2.template loadPacket<Packet2d>(0, i);
731  pstore<double>(blockA + ri, lhsV);
732  }
733 
734  ri += vectorSize;
735  }
736 
737  if(PanelMode) ri += vectorSize*(stride - offset - depth);
738  }
739 
740  if (j < rows)
741  {
742  if(PanelMode) ri += offset*(rows - j);
743 
744  for(Index i = 0; i < depth; i++)
745  {
746  Index k = j;
747  for(; k < rows; k++)
748  {
749  blockA[ri] = lhs(k, i);
750  ri += 1;
751  }
752  }
753  }
754  }
755 };
756 
757 // General template for rhs packing, float64 specialization.
758 template<typename DataMapper, int StorageOrder, bool PanelMode>
759 struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, false>
760 {
761  EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
762  {
763  const Index vectorSize = quad_traits<double>::vectorsize;
764  Index ri = 0, j = 0;
765 
766  for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
767  {
768  const DataMapper rhs2 = rhs.getSubMapper(0, j);
769  Index i = 0;
770 
771  if(PanelMode) ri += offset*(2*vectorSize);
772 
773  for(; i + vectorSize <= depth; i+=vectorSize)
774  {
775  PacketBlock<Packet2d,4> block;
776  if(StorageOrder == ColMajor)
777  {
778  PacketBlock<Packet2d,2> block1, block2;
779  block1.packet[0] = rhs2.template loadPacket<Packet2d>(i, 0);
780  block1.packet[1] = rhs2.template loadPacket<Packet2d>(i, 1);
781  block2.packet[0] = rhs2.template loadPacket<Packet2d>(i, 2);
782  block2.packet[1] = rhs2.template loadPacket<Packet2d>(i, 3);
783 
784  ptranspose(block1);
785  ptranspose(block2);
786 
787  pstore<double>(blockB + ri , block1.packet[0]);
788  pstore<double>(blockB + ri + 2, block2.packet[0]);
789  pstore<double>(blockB + ri + 4, block1.packet[1]);
790  pstore<double>(blockB + ri + 6, block2.packet[1]);
791  } else {
792  block.packet[0] = rhs2.template loadPacket<Packet2d>(i + 0, 0); //[a1 a2]
793  block.packet[1] = rhs2.template loadPacket<Packet2d>(i + 0, 2); //[a3 a4]
794  block.packet[2] = rhs2.template loadPacket<Packet2d>(i + 1, 0); //[b1 b2]
795  block.packet[3] = rhs2.template loadPacket<Packet2d>(i + 1, 2); //[b3 b4]
796 
797  storeBlock<double, Packet2d, 4>(blockB + ri, block);
798  }
799 
800  ri += 4*vectorSize;
801  }
802  for(; i < depth; i++)
803  {
804  if(StorageOrder == ColMajor)
805  {
806  blockB[ri+0] = rhs2(i, 0);
807  blockB[ri+1] = rhs2(i, 1);
808 
809  ri += vectorSize;
810 
811  blockB[ri+0] = rhs2(i, 2);
812  blockB[ri+1] = rhs2(i, 3);
813  } else {
814  Packet2d rhsV = rhs2.template loadPacket<Packet2d>(i, 0);
815  pstore<double>(blockB + ri, rhsV);
816 
817  ri += vectorSize;
818 
819  rhsV = rhs2.template loadPacket<Packet2d>(i, 2);
820  pstore<double>(blockB + ri, rhsV);
821  }
822  ri += vectorSize;
823  }
824 
825  if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
826  }
827 
828  if(PanelMode) ri += offset;
829 
830  for(; j < cols; j++)
831  {
832  const DataMapper rhs2 = rhs.getSubMapper(0, j);
833  for(Index i = 0; i < depth; i++)
834  {
835  blockB[ri] = rhs2(i, 0);
836  ri += 1;
837  }
838 
839  if(PanelMode) ri += stride - depth;
840  }
841  }
842 };
843 
844 // General template for lhs packing, bfloat16 specialization.
845 template<typename DataMapper, int StorageOrder, bool PanelMode>
846 struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, true>
847 {
848  EIGEN_STRONG_INLINE void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
849  {
850  const Index vectorSize = quad_traits<bfloat16>::vectorsize;
851  Index ri = 0, j = 0;
852 
853  for(; j + 2*vectorSize <= rows; j+=2*vectorSize)
854  {
855  const DataMapper lhs2 = lhs.getSubMapper(j, 0);
856  Index i = 0;
857 
858  if(PanelMode) ri += 2*vectorSize*offset;
859 
860  if(StorageOrder == ColMajor)
861  {
862  for(; i + 2 <= depth; i+=2)
863  {
864  PacketBlock<Packet8bf,4> block;
865 
866  block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
867  block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
868  block.packet[2] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 1);
869  block.packet[3] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 1);
870 
871  Packet8bf t0, t1;
872  t0 = vec_mergeh(block.packet[0].m_val, block.packet[2].m_val);
873  t1 = vec_mergel(block.packet[0].m_val, block.packet[2].m_val);
874  block.packet[2] = vec_mergeh(block.packet[1].m_val, block.packet[3].m_val);
875  block.packet[3] = vec_mergel(block.packet[1].m_val, block.packet[3].m_val);
876  block.packet[0] = t0;
877  block.packet[1] = t1;
878 
879  storeBlock<bfloat16, Packet8bf, 4>(blockA + ri, block);
880 
881  ri += 2*2*vectorSize;
882  }
883  if (depth & 1)
884  {
885  PacketBlock<Packet8bf,2> block;
886 
887  block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
888  block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
889 
890  storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
891 
892  ri += 2*vectorSize;
893  }
894  } else {
895  for(; i + vectorSize <= depth; i+=vectorSize)
896  {
897  PacketBlock<Packet8bf,8> block1, block2;
898 
899  bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
900  bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block2, lhs2, 1 * vectorSize, i);
901 
902  Packet4ui v1[8], v2[8];
903 
904  v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
905  v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
906  v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
907  v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
908  v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
909  v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
910  v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
911  v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
912  v2[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[0].m_val), reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
913  v2[1] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[0].m_val), reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
914  v2[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[2].m_val), reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
915  v2[3] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[2].m_val), reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
916  v2[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[4].m_val), reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
917  v2[5] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[4].m_val), reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
918  v2[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[6].m_val), reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
919  v2[7] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[6].m_val), reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
920 
921 #ifdef EIGEN_VECTORIZE_VSX
922  block1.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
923  block1.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
924  block1.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
925  block1.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
926  block1.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
927  block1.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
928  block1.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
929  block1.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
930  block2.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[0]),reinterpret_cast<Packet2ul>(v2[2])));
931  block2.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[0]),reinterpret_cast<Packet2ul>(v2[2])));
932  block2.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[1]),reinterpret_cast<Packet2ul>(v2[3])));
933  block2.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[1]),reinterpret_cast<Packet2ul>(v2[3])));
934  block2.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[4]),reinterpret_cast<Packet2ul>(v2[6])));
935  block2.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[4]),reinterpret_cast<Packet2ul>(v2[6])));
936  block2.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[5]),reinterpret_cast<Packet2ul>(v2[7])));
937  block2.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[5]),reinterpret_cast<Packet2ul>(v2[7])));
938 #else
939  block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_HI));
940  block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_LO));
941  block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_HI));
942  block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_LO));
943  block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_HI));
944  block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_LO));
945  block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_HI));
946  block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_LO));
947  block2.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v2[0],v2[2],p16uc_TRANSPOSE64_HI));
948  block2.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v2[0],v2[2],p16uc_TRANSPOSE64_LO));
949  block2.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v2[1],v2[3],p16uc_TRANSPOSE64_HI));
950  block2.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v2[1],v2[3],p16uc_TRANSPOSE64_LO));
951  block2.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v2[4],v2[6],p16uc_TRANSPOSE64_HI));
952  block2.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v2[4],v2[6],p16uc_TRANSPOSE64_LO));
953  block2.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v2[5],v2[7],p16uc_TRANSPOSE64_HI));
954  block2.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v2[5],v2[7],p16uc_TRANSPOSE64_LO));
955 #endif
956 
957  for(Index M = 0; M < 8; M+=2) {
958  pstore<bfloat16>(blockA + ri + (0 * vectorSize) + (2*vectorSize * M), block1.packet[M+0]);
959  pstore<bfloat16>(blockA + ri + (1 * vectorSize) + (2*vectorSize * M), block1.packet[M+1]);
960  pstore<bfloat16>(blockA + ri + (2 * vectorSize) + (2*vectorSize * M), block2.packet[M+0]);
961  pstore<bfloat16>(blockA + ri + (3 * vectorSize) + (2*vectorSize * M), block2.packet[M+1]);
962  }
963 
964  ri += 2*vectorSize*vectorSize;
965  }
966  for(; i + 2 <= depth; i+=2)
967  {
968  for(Index M = 0; M < 2*vectorSize; M++) {
969  blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
970  blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
971  }
972 
973  ri += 2*2*vectorSize;
974  }
975  if (depth & 1)
976  {
977  for(Index M = 0; M < 2*vectorSize; M++) {
978  blockA[ri + M] = lhs2(M, i);
979  }
980  ri += 2*vectorSize;
981  }
982  }
983 
984  if(PanelMode) ri += 2*vectorSize*(stride - offset - depth);
985  }
986  for(; j + vectorSize <= rows; j+=vectorSize)
987  {
988  const DataMapper lhs2 = lhs.getSubMapper(j, 0);
989  Index i = 0;
990 
991  if(PanelMode) ri += vectorSize*offset;
992 
993  if(StorageOrder == ColMajor)
994  {
995  for(; i + 2 <= depth; i+=2)
996  {
997  PacketBlock<Packet8bf,2> block;
998 
999  block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
1000  block.packet[1] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 1);
1001 
1002  Packet8bf t0;
1003  t0 = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
1004  block.packet[1] = vec_mergel(block.packet[0].m_val, block.packet[1].m_val);
1005  block.packet[0] = t0;
1006 
1007  storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
1008 
1009  ri += 2*vectorSize;
1010  }
1011  if (depth & 1)
1012  {
1013  Packet8bf lhsV = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
1014  pstore<bfloat16>(blockA + ri, lhsV);
1015 
1016  ri += vectorSize;
1017  }
1018  } else {
1019  for(; i + vectorSize <= depth; i+=vectorSize)
1020  {
1021  PacketBlock<Packet8bf,8> block1;
1022 
1023  bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
1024 
1025  Packet4ui v1[8];
1026 
1027  // This is transposing and interleaving data
1028  v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
1029  v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
1030  v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
1031  v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
1032  v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
1033  v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
1034  v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
1035  v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
1036 
1037 #ifdef EIGEN_VECTORIZE_VSX
1038  block1.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
1039  block1.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
1040  block1.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
1041  block1.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
1042  block1.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
1043  block1.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
1044  block1.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
1045  block1.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
1046 #else
1047  block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_HI));
1048  block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_LO));
1049  block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_HI));
1050  block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_LO));
1051  block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_HI));
1052  block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_LO));
1053  block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_HI));
1054  block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_LO));
1055 #endif
1056 
1057  for(Index M = 0; M < 8; M++) {
1058  pstore<bfloat16>(blockA + ri + (vectorSize * M), block1.packet[M]);
1059  }
1060 
1061  ri += vectorSize*vectorSize;
1062  }
1063  for(; i + 2 <= depth; i+=2)
1064  {
1065  for(Index M = 0; M < vectorSize; M++) {
1066  blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
1067  blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
1068  }
1069 
1070  ri += 2*vectorSize;
1071  }
1072  if (depth & 1)
1073  {
1074  for(Index M = 0; M < vectorSize; M++) {
1075  blockA[ri + M] = lhs2(M, i);
1076  }
1077 
1078  ri += vectorSize;
1079  }
1080  }
1081 
1082  if(PanelMode) ri += vectorSize*(stride - offset - depth);
1083  }
1084  if(j + 4 <= rows)
1085  {
1086  const DataMapper lhs2 = lhs.getSubMapper(j, 0);
1087  Index i = 0;
1088 
1089  if(PanelMode) ri += 4*offset;
1090 
1091  for(; i + 2 <= depth; i+=2)
1092  {
1093  if(StorageOrder == ColMajor)
1094  {
1095  PacketBlock<Packet8bf,2> block;
1096 
1097  block.packet[0] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
1098  block.packet[1] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 1, 4);
1099 
1100  block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
1101 
1102  pstore<bfloat16>(blockA + ri, block.packet[0]);
1103  } else {
1104  blockA[ri+0] = lhs2(0, i + 0);
1105  blockA[ri+1] = lhs2(0, i + 1);
1106  blockA[ri+2] = lhs2(1, i + 0);
1107  blockA[ri+3] = lhs2(1, i + 1);
1108  blockA[ri+4] = lhs2(2, i + 0);
1109  blockA[ri+5] = lhs2(2, i + 1);
1110  blockA[ri+6] = lhs2(3, i + 0);
1111  blockA[ri+7] = lhs2(3, i + 1);
1112  }
1113 
1114  ri += 2*4;
1115  }
1116  if (depth & 1)
1117  {
1118  if(StorageOrder == ColMajor)
1119  {
1120  Packet8bf lhsV = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
1121 
1122  pstore_partial<bfloat16>(blockA + ri, lhsV, 4);
1123  } else {
1124  blockA[ri+0] = lhs2(0, i);
1125  blockA[ri+1] = lhs2(1, i);
1126  blockA[ri+2] = lhs2(2, i);
1127  blockA[ri+3] = lhs2(3, i);
1128  }
1129 
1130  ri += 4;
1131  }
1132 
1133  if(PanelMode) ri += 4*(stride - offset - depth);
1134  j += 4;
1135  }
1136 
1137  if (j < rows)
1138  {
1139  if(PanelMode) ri += offset*(rows - j);
1140 
1141  Index i = 0;
1142  for(; i + 2 <= depth; i+=2)
1143  {
1144  Index k = j;
1145  for(; k < rows; k++)
1146  {
1147  blockA[ri+0] = lhs(k, i + 0);
1148  blockA[ri+1] = lhs(k, i + 1);
1149  ri += 2;
1150  }
1151  }
1152  if (depth & 1)
1153  {
1154  for(; j < rows; j++)
1155  {
1156  blockA[ri] = lhs(j, i);
1157  ri += 1;
1158  }
1159  }
1160  }
1161  }
1162 };
1163 
1164 // General template for rhs packing, bfloat16 specialization.
1165 template<typename DataMapper, int StorageOrder, bool PanelMode>
1166 struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, false>
1167 {
1168  EIGEN_STRONG_INLINE void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
1169  {
1170  const Index vectorSize = quad_traits<bfloat16>::vectorsize;
1171  Index ri = 0, j = 0;
1172 
1173  for(; j + 4 <= cols; j+=4)
1174  {
1175  const DataMapper rhs2 = rhs.getSubMapper(0, j);
1176  Index i = 0;
1177 
1178  if(PanelMode) ri += 4*offset;
1179 
1180  for(; i + vectorSize <= depth; i+=vectorSize)
1181  {
1182  if(StorageOrder == ColMajor)
1183  {
1184  PacketBlock<Packet8bf,4> block;
1185 
1186  bload<DataMapper, Packet8bf, 4, StorageOrder, false, 4>(block, rhs2, i, 0);
1187 
1188  Packet4ui t0, t1, t2, t3;
1189 
1190  t0 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[0].m_val), reinterpret_cast<Packet4ui>(block.packet[1].m_val));
1191  t1 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[0].m_val), reinterpret_cast<Packet4ui>(block.packet[1].m_val));
1192  t2 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[2].m_val), reinterpret_cast<Packet4ui>(block.packet[3].m_val));
1193  t3 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[2].m_val), reinterpret_cast<Packet4ui>(block.packet[3].m_val));
1194 
1195 #ifdef EIGEN_VECTORIZE_VSX
1196  block.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t0),reinterpret_cast<Packet2ul>(t2)));
1197  block.packet[1] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t0),reinterpret_cast<Packet2ul>(t2)));
1198  block.packet[2] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t1),reinterpret_cast<Packet2ul>(t3)));
1199  block.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t1),reinterpret_cast<Packet2ul>(t3)));
1200 #else
1201  block.packet[0] = reinterpret_cast<Packet8us>(vec_perm(t0,t2,p16uc_TRANSPOSE64_HI));
1202  block.packet[1] = reinterpret_cast<Packet8us>(vec_perm(t0,t2,p16uc_TRANSPOSE64_LO));
1203  block.packet[2] = reinterpret_cast<Packet8us>(vec_perm(t1,t3,p16uc_TRANSPOSE64_HI));
1204  block.packet[3] = reinterpret_cast<Packet8us>(vec_perm(t1,t3,p16uc_TRANSPOSE64_LO));
1205 #endif
1206 
1207  storeBlock<bfloat16, Packet8bf, 4>(blockB + ri, block);
1208  } else {
1209  PacketBlock<Packet8bf,8> block;
1210 
1211  for (int M = 0; M < 8; M++) {
1212  block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
1213  }
1214 
1215  block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
1216  block.packet[1] = vec_mergeh(block.packet[2].m_val, block.packet[3].m_val);
1217  block.packet[2] = vec_mergeh(block.packet[4].m_val, block.packet[5].m_val);
1218  block.packet[3] = vec_mergeh(block.packet[6].m_val, block.packet[7].m_val);
1219 
1220  const Index size = 16 / sizeof(bfloat16);
1221 
1222  for (int M = 0; M < 4; M++) {
1223  pstore<bfloat16>(blockB + ri + (M * size), block.packet[M]);
1224  }
1225  }
1226 
1227  ri += 4*vectorSize;
1228  }
1229  for (; i + 2 <= depth; i += 2) {
1230  if(StorageOrder == ColMajor)
1231  {
1232  blockB[ri+0] = rhs2(i + 0, 0);
1233  blockB[ri+1] = rhs2(i + 1, 0);
1234  blockB[ri+2] = rhs2(i + 0, 1);
1235  blockB[ri+3] = rhs2(i + 1, 1);
1236  blockB[ri+4] = rhs2(i + 0, 2);
1237  blockB[ri+5] = rhs2(i + 1, 2);
1238  blockB[ri+6] = rhs2(i + 0, 3);
1239  blockB[ri+7] = rhs2(i + 1, 3);
1240  } else {
1241  PacketBlock<Packet8bf,2> block;
1242 
1243  for (int M = 0; M < 2; M++) {
1244  block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
1245  }
1246 
1247  block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
1248 
1249  pstore<bfloat16>(blockB + ri, block.packet[0]);
1250  }
1251 
1252  ri += 4*2;
1253  }
1254  if (depth & 1)
1255  {
1256  blockB[ri+0] = rhs2(i, 0);
1257  blockB[ri+1] = rhs2(i, 1);
1258  blockB[ri+2] = rhs2(i, 2);
1259  blockB[ri+3] = rhs2(i, 3);
1260 
1261  ri += 4;
1262  }
1263 
1264  if(PanelMode) ri += 4*(stride - offset - depth);
1265  }
1266 
1267  if (j < cols)
1268  {
1269  if(PanelMode) ri += offset*(cols - j);
1270 
1271  Index i = 0;
1272  for(; i + 2 <= depth; i+=2)
1273  {
1274  Index k = j;
1275  for(; k < cols; k++)
1276  {
1277  blockB[ri+0] = rhs(i + 0, k);
1278  blockB[ri+1] = rhs(i + 1, k);
1279  ri += 2;
1280  }
1281  }
1282  if (depth & 1)
1283  {
1284  for(; j < cols; j++)
1285  {
1286  blockB[ri] = rhs(i, j);
1287  ri += 1;
1288  }
1289  }
1290  }
1291  }
1292 };
1293 
1294 // General template for lhs complex packing, float64 specialization.
1295 template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
1296 struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
1297 {
1298  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
1299  {
1300  const Index vectorSize = quad_traits<double>::vectorsize;
1301  const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
1302  Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
1303  double* blockAt = reinterpret_cast<double *>(blockA);
1304  Index j = 0;
1305 
1306  for(; j + vectorSize <= rows; j+=vectorSize)
1307  {
1308  const DataMapper lhs2 = lhs.getSubMapper(j, 0);
1309  Index i = 0;
1310 
1311  rii = rir + vectorDelta;
1312 
1313  for(; i + vectorSize <= depth; i+=vectorSize)
1314  {
1315  PacketBlock<Packet,2> blockr, blocki;
1316  PacketBlock<PacketC,4> cblock;
1317 
1318  if(StorageOrder == ColMajor)
1319  {
1320  cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i + 0); //[a1 a1i]
1321  cblock.packet[1] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
1322 
1323  cblock.packet[2] = lhs2.template loadPacket<PacketC>(1, i + 0); //[a2 a2i]
1324  cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
1325 
1326  blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v); //[a1 a2]
1327  blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v); //[b1 b2]
1328 
1329  blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[2].v);
1330  blocki.packet[1] = vec_mergel(cblock.packet[1].v, cblock.packet[3].v);
1331  } else {
1332  cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i); //[a1 a1i]
1333  cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i); //[a2 a2i]
1334 
1335  cblock.packet[2] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
1336  cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
1337 
1338  blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); //[a1 a2]
1339  blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v); //[b1 b2]
1340 
1341  blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
1342  blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
1343  }
1344 
1345  if(Conjugate)
1346  {
1347  blocki.packet[0] = -blocki.packet[0];
1348  blocki.packet[1] = -blocki.packet[1];
1349  }
1350 
1351  storeBlock<double, Packet, 2>(blockAt + rir, blockr);
1352  storeBlock<double, Packet, 2>(blockAt + rii, blocki);
1353 
1354  rir += 2*vectorSize;
1355  rii += 2*vectorSize;
1356  }
1357  for(; i < depth; i++)
1358  {
1359  PacketBlock<Packet,1> blockr, blocki;
1360  PacketBlock<PacketC,2> cblock;
1361 
1362  cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);
1363  cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i);
1364 
1365  blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
1366  blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
1367 
1368  if(Conjugate)
1369  {
1370  blocki.packet[0] = -blocki.packet[0];
1371  }
1372 
1373  pstore<double>(blockAt + rir, blockr.packet[0]);
1374  pstore<double>(blockAt + rii, blocki.packet[0]);
1375 
1376  rir += vectorSize;
1377  rii += vectorSize;
1378  }
1379 
1380  rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
1381  }
1382 
1383  if (j < rows)
1384  {
1385  if(PanelMode) rir += (offset*(rows - j - vectorSize));
1386  rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
1387 
1388  for(Index i = 0; i < depth; i++)
1389  {
1390  Index k = j;
1391  for(; k < rows; k++)
1392  {
1393  blockAt[rir] = lhs(k, i).real();
1394 
1395  if(Conjugate)
1396  blockAt[rii] = -lhs(k, i).imag();
1397  else
1398  blockAt[rii] = lhs(k, i).imag();
1399 
1400  rir += 1;
1401  rii += 1;
1402  }
1403  }
1404  }
1405  }
1406 };
1407 
1408 // General template for rhs complex packing, float64 specialization.
1409 template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
1410 struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
1411 {
1412  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
1413  {
1414  const Index vectorSize = quad_traits<double>::vectorsize;
1415  const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth);
1416  Index rir = ((PanelMode) ? (2*vectorSize*offset) : 0), rii;
1417  double* blockBt = reinterpret_cast<double *>(blockB);
1418  Index j = 0;
1419 
1420  for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
1421  {
1422  const DataMapper rhs2 = rhs.getSubMapper(0, j);
1423  Index i = 0;
1424 
1425  rii = rir + vectorDelta;
1426 
1427  for(; i < depth; i++)
1428  {
1429  PacketBlock<PacketC,4> cblock;
1430  PacketBlock<Packet,2> blockr, blocki;
1431 
1432  bload<DataMapper, PacketC, 2, ColMajor, false, 4>(cblock, rhs2, i, 0);
1433 
1434  blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
1435  blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v);
1436 
1437  blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
1438  blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
1439 
1440  if(Conjugate)
1441  {
1442  blocki.packet[0] = -blocki.packet[0];
1443  blocki.packet[1] = -blocki.packet[1];
1444  }
1445 
1446  storeBlock<double, Packet, 2>(blockBt + rir, blockr);
1447  storeBlock<double, Packet, 2>(blockBt + rii, blocki);
1448 
1449  rir += 2*vectorSize;
1450  rii += 2*vectorSize;
1451  }
1452 
1453  rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
1454  }
1455 
1456  if(PanelMode) rir -= (offset*(2*vectorSize - 1));
1457 
1458  for(; j < cols; j++)
1459  {
1460  const DataMapper rhs2 = rhs.getSubMapper(0, j);
1461  rii = rir + ((PanelMode) ? stride : depth);
1462 
1463  for(Index i = 0; i < depth; i++)
1464  {
1465  blockBt[rir] = rhs2(i, 0).real();
1466 
1467  if(Conjugate)
1468  blockBt[rii] = -rhs2(i, 0).imag();
1469  else
1470  blockBt[rii] = rhs2(i, 0).imag();
1471 
1472  rir += 1;
1473  rii += 1;
1474  }
1475 
1476  rir += ((PanelMode) ? (2*stride - depth) : depth);
1477  }
1478  }
1479 };
1480 
1481 
1485 // 512-bit rank-1 update of acc. It can accumulate either positively or negatively (useful for complex gemm).
1486 template<typename Packet, bool NegativeAccumulate, int N>
1487 EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,N>* acc, const Packet& lhsV, const Packet* rhsV)
1488 {
1489  if(NegativeAccumulate)
1490  {
1491  for (int M = 0; M < N; M++) {
1492  acc->packet[M] = vec_nmsub(lhsV, rhsV[M], acc->packet[M]);
1493  }
1494  } else {
1495  for (int M = 0; M < N; M++) {
1496  acc->packet[M] = vec_madd(lhsV, rhsV[M], acc->packet[M]);
1497  }
1498  }
1499 }
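// In scalar terms each call performs, for every packet M of the accumulator,
//   acc.packet[M] = lhsV * rhsV[M] + acc.packet[M]   (vec_madd), or
//   acc.packet[M] = acc.packet[M] - lhsV * rhsV[M]   (vec_nmsub, when NegativeAccumulate)
// i.e. one slice of a rank-1 update acc += lhs * rhs^T, with an optional sign flip.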
1500 
1501 template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
1502 EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
1503 {
1504  Packet lhsV = pload<Packet>(lhs);
1505 
1506  pger_common<Packet, NegativeAccumulate, N>(acc, lhsV, rhsV);
1507 }
1508 
1509 // 512-bit rank-1 update of a complex acc. It takes decoupled accumulators as inputs. It also takes care of the mixed types real * complex and complex * real.
1510 template<int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1511 EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
1512 {
1513  pger_common<Packet, false, N>(accReal, lhsV, rhsV);
1514  if(LhsIsReal)
1515  {
1516  pger_common<Packet, ConjugateRhs, N>(accImag, lhsV, rhsVi);
1517  EIGEN_UNUSED_VARIABLE(lhsVi);
1518  } else {
1519  if (!RhsIsReal) {
1520  pger_common<Packet, ConjugateLhs == ConjugateRhs, N>(accReal, lhsVi, rhsVi);
1521  pger_common<Packet, ConjugateRhs, N>(accImag, lhsV, rhsVi);
1522  } else {
1523  EIGEN_UNUSED_VARIABLE(rhsVi);
1524  }
1525  pger_common<Packet, ConjugateLhs, N>(accImag, lhsVi, rhsV);
1526  }
1527 }
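// Worked expansion (illustrative): with lhs = lhsV + i*lhsVi and rhs = rhsV + i*rhsVi,
// where each imaginary part is implicitly negated by its Conjugate* flag, the calls above
// accumulate
//   accReal += lhsV*rhsV -/+ lhsVi*rhsVi   (minus when ConjugateLhs == ConjugateRhs)
//   accImag += lhsV*rhsVi + lhsVi*rhsV     (each term sign-flipped by ConjugateRhs / ConjugateLhs)
// which is the usual (ar*br - ai*bi) + i*(ar*bi + ai*br), with the conjugation signs folded
// into the NegativeAccumulate parameter of pger_common.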
1528 
1529 template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
1530 EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
1531 {
1532  Packet lhsV = ploadLhs<Packet>(lhs_ptr);
1533  Packet lhsVi;
1534  if(!LhsIsReal) lhsVi = ploadLhs<Packet>(lhs_ptr_imag);
1535  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1536 
1537  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
1538 }
1539 
1540 template<typename Packet>
1541 EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs)
1542 {
1543  return ploadu<Packet>(lhs);
1544 }
1545 
1546 // Zero the accumulators of a PacketBlock.
1547 template<typename Packet, int N>
1548 EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,N>& acc)
1549 {
1550  for (int M = 0; M < N; M++) {
1551  acc.packet[M] = pset1<Packet>((__UNPACK_TYPE__(Packet))0);
1552  }
1553 }
1554 
1555 template<typename Packet, int N>
1556 EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha)
1557 {
1558  for (int M = 0; M < N; M++) {
1559  acc.packet[M] = vec_mul(accZ.packet[M], pAlpha);
1560  }
1561 }
1562 
1563 template<typename Packet, int N>
1564 EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet,N>& acc, const Packet& pMask)
1565 {
1566  for (int M = 0; M < N; M++) {
1567  acc.packet[M] = pand<Packet>(acc.packet[M], pMask);
1568  }
1569 }
1570 
1571 // Complex version of PacketBlock scaling.
1572 template<typename Packet, int N, bool mask>
1573 EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag, const Packet& pMask)
1574 {
1575  if (mask && (sizeof(__UNPACK_TYPE__(Packet)) == sizeof(float))) {
1576  band<Packet, N>(aReal, pMask);
1577  band<Packet, N>(aImag, pMask);
1578  } else {
1579  EIGEN_UNUSED_VARIABLE(pMask);
1580  }
1581 
1582  bscalec_common<Packet, N>(cReal, aReal, bReal);
1583 
1584  bscalec_common<Packet, N>(cImag, aImag, bReal);
1585 
1586  pger_common<Packet, true, N>(&cReal, bImag, aImag.packet);
1587 
1588  pger_common<Packet, false, N>(&cImag, bImag, aReal.packet);
1589 }
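// In scalar terms bscalec computes c = a * alpha for the complex scale factor
// alpha = bReal + i*bImag:
//   cReal = aReal*bReal - aImag*bImag
//   cImag = aImag*bReal + aReal*bImag
// with aReal/aImag optionally masked first (float path only) so lanes past the
// remaining rows do not contribute.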
1590 
1591 // Load a PacketBlock. The N parameter makes tuning the gemm easier, so more accumulators can be added as needed.
1592 //
1593 // full = operate (load) on the entire PacketBlock, or only on half of it
1594 template<typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full>
1595 EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index col)
1596 {
1597  if (StorageOrder == RowMajor) {
1598  for (int M = 0; M < N; M++) {
1599  acc.packet[M] = res.template loadPacket<Packet>(row + M, col);
1600  }
1601  if (Complex) {
1602  for (int M = 0; M < N; M++) {
1603  acc.packet[M+N] = res.template loadPacket<Packet>(row + M, col + accCols);
1604  }
1605  }
1606  } else {
1607  for (int M = 0; M < N; M++) {
1608  acc.packet[M] = res.template loadPacket<Packet>(row, col + M);
1609  }
1610  if (Complex && full) {
1611  for (int M = 0; M < N; M++) {
1612  acc.packet[M+N] = res.template loadPacket<Packet>(row + accCols, col + M);
1613  }
1614  }
1615  }
1616 }
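// Usage sketch (illustrative; res, row and col are assumed to come from the caller):
// load a 4-packet column-major tile of a real-valued result before it is scaled and
// accumulated into, mirroring the calls made elsewhere in this file:
//   PacketBlock<Packet4f,4> tile;
//   bload<DataMapper, Packet4f, 4, ColMajor, false, 4>(tile, res, row, col);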
1617 
1618 template<typename DataMapper, typename Packet, int N>
1619 EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row)
1620 {
1621  for (int M = 0; M < N; M++) {
1622  res.template storePacket<Packet>(row, M, acc.packet[M]);
1623  }
1624 }
1625 
1626 #ifdef USE_PARTIAL_PACKETS
1627 template<typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full>
1628 EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index elements)
1629 {
1630  for (Index M = 0; M < N; M++) {
1631  acc.packet[M] = res.template loadPacketPartial<Packet>(row, M, elements);
1632  }
1633  if (Complex && full) {
1634  for (Index M = 0; M < N; M++) {
1635  acc.packet[M+N] = res.template loadPacketPartial<Packet>(row + accCols, M, elements);
1636  }
1637  }
1638 }
1639 
1640 template<typename DataMapper, typename Packet, Index N>
1641 EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index elements)
1642 {
1643  for (Index M = 0; M < N; M++) {
1644  res.template storePacketPartial<Packet>(row, M, acc.packet[M], elements);
1645  }
1646 }
1647 #endif
1648 
1649 #ifdef _ARCH_PWR10
1650 #define USE_P10_AND_PVIPR2_0 (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
1651 #else
1652 #define USE_P10_AND_PVIPR2_0 0
1653 #endif
1654 
1655 #if !USE_P10_AND_PVIPR2_0
1656 const static Packet4i mask4[4] = { { 0, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, -1, 0, 0 }, { -1, -1, -1, 0 } };
1657 #endif
1658 
1659 template<typename Packet>
1660 EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows)
1661 {
1662 #if USE_P10_AND_PVIPR2_0
1663 #ifdef _BIG_ENDIAN
1664  return Packet(vec_reve(vec_genwm((1 << remaining_rows) - 1)));
1665 #else
1666  return Packet(vec_genwm((1 << remaining_rows) - 1));
1667 #endif
1668 #else
1669  return Packet(mask4[remaining_rows]);
1670 #endif
1671 }
1672 
1673 template<>
1674 EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const Index remaining_rows)
1675 {
1676 #if USE_P10_AND_PVIPR2_0
1677  Packet2d mask2 = Packet2d(vec_gendm(remaining_rows));
1678 #ifdef _BIG_ENDIAN
1679  return preverse(mask2);
1680 #else
1681  return mask2;
1682 #endif
1683 #else
1684  Packet2l ret = { -remaining_rows, 0 };
1685  return Packet2d(ret);
1686 #endif
1687 }
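// Illustrative example: for Packet4f and remaining_rows == 2, bmask returns { -1, -1, 0, 0 }
// (all-ones bits in the first two lanes), so band()/bscale() below keep only the first two result
// lanes; the Packet2d specialization produces the analogous one-lane mask for doubles.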
1688 
1689 template<typename Packet, int N>
1690 EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha)
1691 {
1692  for (int M = 0; M < N; M++) {
1693  acc.packet[M] = pmadd<Packet>(pAlpha, accZ.packet[M], acc.packet[M]);
1694  }
1695 }
1696 
1697 // Scale the PacketBlock vectors by alpha.
1698 template<typename Packet, int N, bool mask>
1699 EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha, const Packet& pMask)
1700 {
1701  if (mask) {
1702  band<Packet, N>(accZ, pMask);
1703  } else {
1704  EIGEN_UNUSED_VARIABLE(pMask);
1705  }
1706 
1707  bscale<Packet, N>(acc, accZ, pAlpha);
1708 }
1709 
1710 template<typename Packet, int N, bool real>
1711 EIGEN_ALWAYS_INLINE void pbroadcastN(const __UNPACK_TYPE__(Packet) *ap0,
1712  const __UNPACK_TYPE__(Packet) *ap1, const __UNPACK_TYPE__(Packet) *ap2,
1713  Packet& a0, Packet& a1, Packet& a2, Packet& a3)
1714 {
1715  a0 = pset1<Packet>(ap0[0]);
1716  if (N == 4) {
1717  a1 = pset1<Packet>(ap0[1]);
1718  a2 = pset1<Packet>(ap0[2]);
1719  a3 = pset1<Packet>(ap0[3]);
1720  EIGEN_UNUSED_VARIABLE(ap1);
1721  EIGEN_UNUSED_VARIABLE(ap2);
1722  } else {
1723  if (N > 1) {
1724  a1 = pset1<Packet>(ap1[0]);
1725  } else {
1726  EIGEN_UNUSED_VARIABLE(a1);
1727  EIGEN_UNUSED_VARIABLE(ap1);
1728  }
1729  if (N > 2) {
1730  a2 = pset1<Packet>(ap2[0]);
1731  } else {
1732  EIGEN_UNUSED_VARIABLE(a2);
1733  EIGEN_UNUSED_VARIABLE(ap2);
1734  }
1735  }
1736 }
1737 
1738 template<> EIGEN_ALWAYS_INLINE void
1739 pbroadcastN<Packet4f,4,true>(const float *ap0, const float *, const float *,
1740  Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
1741 {
1742  pbroadcast4<Packet4f>(ap0, a0, a1, a2, a3);
1743 }
1744 
1745 template<> EIGEN_ALWAYS_INLINE void
1746 pbroadcastN<Packet4f,4,false>(const float *ap0, const float *ap1, const float *ap2,
1747  Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
1748 {
1749  pbroadcastN<Packet4f,4,true>(ap0, ap1, ap2, a0, a1, a2, a3);
1750 }
1751 
1752 template<>
1753 EIGEN_ALWAYS_INLINE void pbroadcastN<Packet2d,4,false>(const double* ap0, const double *,
1754  const double *, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
1755 {
1756  a1 = pload<Packet2d>(ap0);
1757  a3 = pload<Packet2d>(ap0 + 2);
1758  a0 = vec_splat(a1, 0);
1759  a1 = vec_splat(a1, 1);
1760  a2 = vec_splat(a3, 0);
1761  a3 = vec_splat(a3, 1);
1762 }
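// The double specialization above loads the four RHS values with two Packet2d loads and splats each
// lane with vec_splat instead of issuing four scalar broadcasts.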
1763 
1764 // Take two decoupled real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
1765 template<typename Packet, typename Packetc, int N, bool full>
1766 EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
1767 {
1768  for (int M = 0; M < N; M++) {
1769  acc1.packet[M].v = vec_mergeh(taccReal.packet[M], taccImag.packet[M]);
1770  }
1771 
1772  if (full) {
1773  for (int M = 0; M < N; M++) {
1774  acc2.packet[M].v = vec_mergel(taccReal.packet[M], taccImag.packet[M]);
1775  }
1776  }
1777 }
1778 
1779 template<typename Packet, typename Packetc, int N, bool full>
1780 EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
1781 {
1782  bcouple_common<Packet, Packetc, N, full>(taccReal, taccImag, acc1, acc2);
1783 
1784  for (int M = 0; M < N; M++) {
1785  acc1.packet[M] = padd<Packetc>(tRes.packet[M], acc1.packet[M]);
1786  }
1787 
1788  if (full) {
1789  for (int M = 0; M < N; M++) {
1790  acc2.packet[M] = padd<Packetc>(tRes.packet[M+N], acc2.packet[M]);
1791  }
1792  }
1793 }
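// Note: bcouple_common re-interleaves the separate real/imaginary accumulators into Eigen's native
// (real, imag) pair layout with vec_mergeh/vec_mergel; bcouple then adds the previously loaded
// destination values (tRes) so the result is accumulated rather than overwritten.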
1794 
1795 // PEEL loop factor.
1796 #define PEEL 7
1797 #define PEEL_ROW 7
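// PEEL/PEEL_ROW set how many depth (k) iterations the main and remaining-row loops unroll per pass;
// MICRO_UNROLL below expands its argument for indices 0..7 to match.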
1798 
1799 #define MICRO_UNROLL(func) \
1800  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
1801 
1802 #define MICRO_NORMAL_ROWS \
1803  accRows == quad_traits<Scalar>::rows || accRows == 1
1804 
1805 #define MICRO_NEW_ROWS ((MICRO_NORMAL_ROWS) ? accRows : 1)
1806 
1807 #define MICRO_RHS(ptr, N) rhs_##ptr##N
1808 
1809 #define MICRO_ZERO_PEEL(peel) \
1810  if ((PEEL_ROW > peel) && (peel != 0)) { \
1811  bsetzero<Packet, accRows>(accZero##peel); \
1812  } else { \
1813  EIGEN_UNUSED_VARIABLE(accZero##peel); \
1814  }
1815 
1816 #define MICRO_ADD(ptr, N) \
1817  if (MICRO_NORMAL_ROWS) { \
1818  MICRO_RHS(ptr,0) += (accRows * N); \
1819  } else { \
1820  MICRO_RHS(ptr,0) += N; \
1821  MICRO_RHS(ptr,1) += N; \
1822  if (accRows == 3) { \
1823  MICRO_RHS(ptr,2) += N; \
1824  } \
1825  }
1826 
1827 #define MICRO_ADD_ROWS(N) MICRO_ADD(ptr, N)
1828 
1829 #define MICRO_BROADCAST1(peel, ptr, rhsV, real) \
1830  if (MICRO_NORMAL_ROWS) { \
1831  pbroadcastN<Packet,accRows,real>(MICRO_RHS(ptr,0) + (accRows * peel), MICRO_RHS(ptr,0), MICRO_RHS(ptr,0), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
1832  } else { \
1833  pbroadcastN<Packet,accRows,real>(MICRO_RHS(ptr,0) + peel, MICRO_RHS(ptr,1) + peel, MICRO_RHS(ptr,2) + peel, rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
1834  }
1835 
1836 #define MICRO_BROADCAST(peel) MICRO_BROADCAST1(peel, ptr, rhsV, true)
1837 
1838 #define MICRO_BROADCAST_EXTRA1(ptr, rhsV, real) \
1839  pbroadcastN<Packet,accRows,real>(MICRO_RHS(ptr,0), MICRO_RHS(ptr,1), MICRO_RHS(ptr,2), rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1840 
1841 #define MICRO_BROADCAST_EXTRA \
1842  Packet rhsV[4]; \
1843  MICRO_BROADCAST_EXTRA1(ptr, rhsV, true) \
1844  MICRO_ADD_ROWS(1)
1845 
1846 #define MICRO_SRC2(ptr, N, M) \
1847  if (MICRO_NORMAL_ROWS) { \
1848  EIGEN_UNUSED_VARIABLE(strideB); \
1849  EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,1)); \
1850  EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,2)); \
1851  } else { \
1852  MICRO_RHS(ptr,1) = rhs_base + N + M; \
1853  if (accRows == 3) { \
1854  MICRO_RHS(ptr,2) = rhs_base + N*2 + M; \
1855  } else { \
1856  EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,2)); \
1857  } \
1858  }
1859 
1860 #define MICRO_SRC2_PTR MICRO_SRC2(ptr, strideB, 0)
1861 
1862 #define MICRO_ZERO_PEEL_ROW MICRO_UNROLL(MICRO_ZERO_PEEL)
1863 
1864 #define MICRO_WORK_PEEL(peel) \
1865  if (PEEL_ROW > peel) { \
1866  MICRO_BROADCAST(peel) \
1867  pger<accRows, Scalar, Packet, false>(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \
1868  } else { \
1869  EIGEN_UNUSED_VARIABLE(rhsV##peel); \
1870  }
1871 
1872 #define MICRO_WORK_PEEL_ROW \
1873  Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \
1874  MICRO_UNROLL(MICRO_WORK_PEEL) \
1875  lhs_ptr += (remaining_rows * PEEL_ROW); \
1876  MICRO_ADD_ROWS(PEEL_ROW)
1877 
1878 #define MICRO_ADD_PEEL(peel, sum) \
1879  if (PEEL_ROW > peel) { \
1880  for (Index i = 0; i < accRows; i++) { \
1881  accZero##sum.packet[i] += accZero##peel.packet[i]; \
1882  } \
1883  }
1884 
1885 #define MICRO_ADD_PEEL_ROW \
1886  MICRO_ADD_PEEL(4, 0) MICRO_ADD_PEEL(5, 1) MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) \
1887  MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0)
1888 
1889 #define MICRO_PREFETCHN1(ptr, N) \
1890  EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,0)); \
1891  if (N == 2 || N == 3) { \
1892  EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,1)); \
1893  if (N == 3) { \
1894  EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,2)); \
1895  } \
1896  }
1897 
1898 #define MICRO_PREFETCHN(N) MICRO_PREFETCHN1(ptr, N)
1899 
1900 #define MICRO_COMPLEX_PREFETCHN(N) \
1901  MICRO_PREFETCHN1(ptr_real, N); \
1902  if(!RhsIsReal) { \
1903  MICRO_PREFETCHN1(ptr_imag, N); \
1904  }
1905 
1906 template<typename Scalar, typename Packet, const Index accRows, const Index remaining_rows>
1907 EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(
1908  const Scalar* &lhs_ptr,
1909  const Scalar* &rhs_ptr0,
1910  const Scalar* &rhs_ptr1,
1911  const Scalar* &rhs_ptr2,
1912  PacketBlock<Packet,accRows> &accZero)
1913 {
1914  MICRO_BROADCAST_EXTRA
1915  pger<accRows, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
1916  lhs_ptr += remaining_rows;
1917 }
1918 
1919 template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, const Index remaining_rows>
1920 EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(
1921  const DataMapper& res,
1922  const Scalar* lhs_base,
1923  const Scalar* rhs_base,
1924  Index depth,
1925  Index strideA,
1926  Index offsetA,
1927  Index strideB,
1928  Index row,
1929  Index rows,
1930  const Packet& pAlpha,
1931  const Packet& pMask)
1932 {
1933  const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL;
1934  const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
1935  PacketBlock<Packet,accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc;
1936 
1937  MICRO_SRC2_PTR
1938  bsetzero<Packet, accRows>(accZero0);
1939 
1940  Index remaining_depth = depth & -quad_traits<Scalar>::rows;
1941  Index k = 0;
1942  if (remaining_depth >= PEEL_ROW) {
1943  MICRO_ZERO_PEEL_ROW
1944  do
1945  {
1946  MICRO_PREFETCHN(accRows)
1947  EIGEN_POWER_PREFETCH(lhs_ptr);
1948  MICRO_WORK_PEEL_ROW
1949  } while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth);
1950  MICRO_ADD_PEEL_ROW
1951  }
1952  for(; k < depth; k++)
1953  {
1954  MICRO_EXTRA_ROW<Scalar, Packet, accRows, remaining_rows>(lhs_ptr, rhs_ptr0, rhs_ptr1, rhs_ptr2, accZero0);
1955  }
1956 
1957 #ifdef USE_PARTIAL_PACKETS
1958  EIGEN_UNUSED_VARIABLE(rows);
1959  EIGEN_UNUSED_VARIABLE(pMask);
1960  bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row, remaining_rows);
1961  bscale<Packet,accRows>(acc, accZero0, pAlpha);
1962  bstore_partial<DataMapper, Packet, accRows>(acc, res, row, remaining_rows);
1963 #else
1964  bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row, 0);
1965  if ((accRows == 1) || (rows >= accCols))
1966  {
1967  bscale<Packet,accRows,true>(acc, accZero0, pAlpha, pMask);
1968  bstore<DataMapper, Packet, accRows>(acc, res, row);
1969  } else {
1970  bscale<Packet,accRows,false>(acc, accZero0, pAlpha, pMask);
1971  for(Index j = 0; j < accRows; j++) {
1972  for(Index i = 0; i < remaining_rows; i++) {
1973  res(row + i, j) = acc.packet[j][i];
1974  }
1975  }
1976  }
1977 #endif
1978 }
1979 
1980 #define MICRO_EXTRA(MICRO_EXTRA_UNROLL, value, is_col) \
1981  switch(value) { \
1982  default: \
1983  MICRO_EXTRA_UNROLL(1) \
1984  break; \
1985  case 2: \
1986  if (is_col || (sizeof(Scalar) == sizeof(float))) { \
1987  MICRO_EXTRA_UNROLL(2) \
1988  } \
1989  break; \
1990  case 3: \
1991  if (is_col || (sizeof(Scalar) == sizeof(float))) { \
1992  MICRO_EXTRA_UNROLL(3) \
1993  } \
1994  break; \
1995  }
1996 
1997 #define MICRO_EXTRA_ROWS(N) \
1998  gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, accRows, accCols, N>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlpha, pMask);
1999 
2000 template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
2001 EIGEN_ALWAYS_INLINE void gemm_extra_row(
2002  const DataMapper& res,
2003  const Scalar* lhs_base,
2004  const Scalar* rhs_base,
2005  Index depth,
2006  Index strideA,
2007  Index offsetA,
2008  Index strideB,
2009  Index row,
2010  Index rows,
2011  Index remaining_rows,
2012  const Packet& pAlpha,
2013  const Packet& pMask)
2014 {
2015  MICRO_EXTRA(MICRO_EXTRA_ROWS, remaining_rows, false)
2016 }
2017 
2018 #define MICRO_UNROLL_WORK(func, func2, peel) \
2019  MICRO_UNROLL(func2); \
2020  func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
2021  func(4,peel) func(5,peel) func(6,peel) func(7,peel)
2022 
2023 #define MICRO_WORK_ONE(iter, peel) \
2024  if (unroll_factor > iter) { \
2025  pger_common<Packet, false, accRows>(&accZero##iter, lhsV##iter, rhsV##peel); \
2026  }
2027 
2028 #define MICRO_TYPE_PEEL4(func, func2, peel) \
2029  if (PEEL > peel) { \
2030  Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
2031  MICRO_BROADCAST(peel) \
2032  MICRO_UNROLL_WORK(func, func2, peel) \
2033  } else { \
2034  EIGEN_UNUSED_VARIABLE(rhsV##peel); \
2035  }
2036 
2037 #define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
2038  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \
2039  func(func1,func2,0) func(func1,func2,1) \
2040  func(func1,func2,2) func(func1,func2,3) \
2041  func(func1,func2,4) func(func1,func2,5) \
2042  func(func1,func2,6) func(func1,func2,7)
2043 
2044 #define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
2045  Packet rhsV0[M]; \
2046  func(func1,func2,0)
2047 
2048 #define MICRO_UNROLL_TYPE(MICRO_TYPE, size) \
2049  MICRO_TYPE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE) \
2050  MICRO_ADD_ROWS(size)
2051 
2052 #define MICRO_ONE_PEEL4 MICRO_UNROLL_TYPE(MICRO_UNROLL_TYPE_PEEL, PEEL)
2053 
2054 #define MICRO_ONE4 MICRO_UNROLL_TYPE(MICRO_UNROLL_TYPE_ONE, 1)
2055 
2056 #define MICRO_DST_PTR_ONE(iter) \
2057  if (unroll_factor > iter) { \
2058  bsetzero<Packet, accRows>(accZero##iter); \
2059  } else { \
2060  EIGEN_UNUSED_VARIABLE(accZero##iter); \
2061  }
2062 
2063 #define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)
2064 
2065 #define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)
2066 
2067 #define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)
2068 
2069 #ifdef USE_PARTIAL_PACKETS
2070 #define MICRO_STORE_ONE(iter) \
2071  if (unroll_factor > iter) { \
2072  if (MICRO_NORMAL_PARTIAL(iter)) { \
2073  bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
2074  bscale<Packet,accRows>(acc, accZero##iter, pAlpha); \
2075  bstore<DataMapper, Packet, accRows>(acc, res, row + iter*accCols); \
2076  } else { \
2077  bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row + iter*accCols, accCols2); \
2078  bscale<Packet,accRows>(acc, accZero##iter, pAlpha); \
2079  bstore_partial<DataMapper, Packet, accRows>(acc, res, row + iter*accCols, accCols2); \
2080  } \
2081  }
2082 #else
2083 #define MICRO_STORE_ONE(iter) \
2084  if (unroll_factor > iter) { \
2085  bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
2086  bscale<Packet,accRows,!(MICRO_NORMAL(iter))>(acc, accZero##iter, pAlpha, pMask); \
2087  bstore<DataMapper, Packet, accRows>(acc, res, row + iter*accCols); \
2088  }
2089 #endif
2090 
2091 #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
2092 
2093 #ifdef USE_PARTIAL_PACKETS
2094 template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, bool full>
2095 #else
2096 template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2>
2097 #endif
2098 EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(
2099  const DataMapper& res,
2100  const Scalar* lhs_base,
2101  const Scalar* rhs_base,
2102  Index depth,
2103  Index strideA,
2104  Index offsetA,
2105  Index strideB,
2106  Index& row,
2107  const Packet& pAlpha,
2108 #ifdef USE_PARTIAL_PACKETS
2109  Index accCols2
2110 #else
2111  const Packet& pMask
2112 #endif
2113  )
2114 {
2115  const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL;
2116  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
2117  PacketBlock<Packet,accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
2118  PacketBlock<Packet,accRows> acc;
2119 
2120  MICRO_SRC2_PTR
2121  MICRO_DST_PTR
2122  MICRO_SRC_PTR
2123 
2124  Index k = 0;
2125  for(; k + PEEL <= depth; k+= PEEL)
2126  {
2127  MICRO_PREFETCHN(accRows)
2128  MICRO_PREFETCH
2129  MICRO_ONE_PEEL4
2130  }
2131  for(; k < depth; k++)
2132  {
2133  MICRO_ONE4
2134  }
2135  MICRO_STORE
2136 
2137  MICRO_UPDATE
2138 }
2139 
2140 #ifdef USE_PARTIAL_PACKETS
2141 #define MICRO_UNROLL_ITER2(N, M) \
2142  gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, !M>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, M ? remaining_rows : accCols); \
2143  if (M) return;
2144 #else
2145 #define MICRO_UNROLL_ITER2(N, M) \
2146  gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask); \
2147  if (M) return;
2148 #endif
2149 
2150 template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
2151 EIGEN_ALWAYS_INLINE void gemm_cols(
2152  const DataMapper& res,
2153  const Scalar* blockA,
2154  const Scalar* blockB,
2155  Index depth,
2156  Index strideA,
2157  Index offsetA,
2158  Index strideB,
2159  Index offsetB,
2160  Index col,
2161  Index rows,
2162  Index remaining_rows,
2163  const Packet& pAlpha,
2164  const Packet& pMask)
2165 {
2166  const DataMapper res3 = res.getSubMapper(0, col);
2167 
2168  const Scalar* rhs_base = blockB + col*strideB + MICRO_NEW_ROWS*offsetB;
2169  const Scalar* lhs_base = blockA + accCols*offsetA;
2170  Index row = 0;
2171 
2172 #define MAX_UNROLL 7
2173  while(row + MAX_UNROLL*accCols <= rows) {
2175  }
2176  switch( (rows-row)/accCols ) {
2177 #if MAX_UNROLL > 7
2178  case 7:
2180  break;
2181 #endif
2182 #if MAX_UNROLL > 6
2183  case 6:
2185  break;
2186 #endif
2187 #if MAX_UNROLL > 5
2188  case 5:
2190  break;
2191 #endif
2192 #if MAX_UNROLL > 4
2193  case 4:
2195  break;
2196 #endif
2197 #if MAX_UNROLL > 3
2198  case 3:
2200  break;
2201 #endif
2202 #if MAX_UNROLL > 2
2203  case 2:
2205  break;
2206 #endif
2207 #if MAX_UNROLL > 1
2208  case 1:
2210  break;
2211 #endif
2212  default:
2213  break;
2214  }
2215 #undef MAX_UNROLL
2216 
2217  if(remaining_rows > 0)
2218  {
2219  gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
2220  }
2221 }
2222 
2223 #define MICRO_EXTRA_COLS(N) \
2224  gemm_cols<Scalar, Packet, DataMapper, N, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
2225 
2226 template<typename Scalar, typename Packet, typename DataMapper, const Index accCols>
2227 EIGEN_ALWAYS_INLINE void gemm_extra_cols(
2228  const DataMapper& res,
2229  const Scalar* blockA,
2230  const Scalar* blockB,
2231  Index depth,
2232  Index strideA,
2233  Index offsetA,
2234  Index strideB,
2235  Index offsetB,
2236  Index col,
2237  Index rows,
2238  Index cols,
2239  Index remaining_rows,
2240  const Packet& pAlpha,
2241  const Packet& pMask)
2242 {
2243  MICRO_EXTRA(MICRO_EXTRA_COLS, cols-col, true)
2244 }
2245 
2246 
2249 template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
2250 EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
2251 {
2252  const Index remaining_rows = rows % accCols;
2253 
2254  if( strideA == -1 ) strideA = depth;
2255  if( strideB == -1 ) strideB = depth;
2256 
2257  const Packet pAlpha = pset1<Packet>(alpha);
2258  const Packet pMask = bmask<Packet>(remaining_rows);
2259 
2260  Index col = 0;
2261  for(; col + accRows <= cols; col += accRows)
2262  {
2263  gemm_cols<Scalar, Packet, DataMapper, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
2264  }
2265 
2266  if (col != cols)
2267  {
2268  gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
2269  }
2270 }
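// Usage sketch (illustrative only): the gebp_kernel specializations later in this file dispatch to
// this function for real scalars when MMA is not used, roughly as
//   gemm<float, Packet4f, Packet4f, DataMapper, 4, 4>(res, blockA, blockB, rows, depth, cols,
//                                                     alpha, strideA, strideB, offsetA, offsetB);
// with accRows = quad_traits<Scalar>::rows and accCols = quad_traits<Scalar>::size
// (4 and 4 for float, 4 and 2 for double).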
2271 
2272 #define accColsC (accCols / 2)
2273 #define advanceRows ((LhsIsReal) ? 1 : 2)
2274 #define advanceCols ((RhsIsReal) ? 1 : 2)
2275 
2276 // PEEL_COMPLEX loop factor.
2277 #define PEEL_COMPLEX 3
2278 #define PEEL_COMPLEX_ROW 3
2279 
2280 #define MICRO_COMPLEX_UNROLL(func) \
2281  func(0) func(1) func(2) func(3)
2282 
2283 #define MICRO_COMPLEX_ZERO_PEEL(peel) \
2284  if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \
2285  bsetzero<Packet, accRows>(accReal##peel); \
2286  bsetzero<Packet, accRows>(accImag##peel); \
2287  } else { \
2288  EIGEN_UNUSED_VARIABLE(accReal##peel); \
2289  EIGEN_UNUSED_VARIABLE(accImag##peel); \
2290  }
2291 
2292 #define MICRO_COMPLEX_ADD_ROWS(N, used) \
2293  MICRO_ADD(ptr_real, N) \
2294  if (!RhsIsReal) { \
2295  MICRO_ADD(ptr_imag, N) \
2296  } else if (used) { \
2297  EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,0)); \
2298  EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,1)); \
2299  EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,2)); \
2300  }
2301 
2302 #define MICRO_COMPLEX_BROADCAST(peel) \
2303  MICRO_BROADCAST1(peel, ptr_real, rhsV, false) \
2304  if (!RhsIsReal) { \
2305  MICRO_BROADCAST1(peel, ptr_imag, rhsVi, false) \
2306  } else { \
2307  EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
2308  }
2309 
2310 #define MICRO_COMPLEX_BROADCAST_EXTRA \
2311  Packet rhsV[4], rhsVi[4]; \
2312  MICRO_BROADCAST_EXTRA1(ptr_real, rhsV, false) \
2313  if(!RhsIsReal) { \
2314  MICRO_BROADCAST_EXTRA1(ptr_imag, rhsVi, false) \
2315  } else { \
2316  EIGEN_UNUSED_VARIABLE(rhsVi); \
2317  } \
2318  MICRO_COMPLEX_ADD_ROWS(1, true)
2319 
2320 #define MICRO_COMPLEX_SRC2_PTR \
2321  MICRO_SRC2(ptr_real, strideB*advanceCols, 0) \
2322  if (!RhsIsReal) { \
2323  MICRO_RHS(ptr_imag,0) = rhs_base + MICRO_NEW_ROWS*strideB; \
2324  MICRO_SRC2(ptr_imag, strideB*advanceCols, strideB) \
2325  } else { \
2326  EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,0)); \
2327  EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,1)); \
2328  EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,2)); \
2329  }
2330 
2331 #define MICRO_COMPLEX_ZERO_PEEL_ROW MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_ZERO_PEEL)
2332 
2333 #define MICRO_COMPLEX_WORK_PEEL(peel) \
2334  if (PEEL_COMPLEX_ROW > peel) { \
2335  MICRO_COMPLEX_BROADCAST(peel) \
2336  pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \
2337  } else { \
2338  EIGEN_UNUSED_VARIABLE(rhsV##peel); \
2339  EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
2340  }
2341 
2342 #define MICRO_COMPLEX_ADD_COLS(size) \
2343  lhs_ptr_real += (remaining_rows * size); \
2344  if(!LhsIsReal) lhs_ptr_imag += (remaining_rows * size); \
2345  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
2346 
2347 #define MICRO_COMPLEX_WORK_PEEL_ROW \
2348  Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \
2349  Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \
2350  MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_WORK_PEEL) \
2351  MICRO_COMPLEX_ADD_COLS(PEEL_COMPLEX_ROW) \
2352  MICRO_COMPLEX_ADD_ROWS(PEEL_COMPLEX_ROW, false)
2353 
2354 #define MICRO_COMPLEX_ADD_PEEL(peel, sum) \
2355  if (PEEL_COMPLEX_ROW > peel) { \
2356  for (Index i = 0; i < accRows; i++) { \
2357  accReal##sum.packet[i] += accReal##peel.packet[i]; \
2358  accImag##sum.packet[i] += accImag##peel.packet[i]; \
2359  } \
2360  }
2361 
2362 #define MICRO_COMPLEX_ADD_PEEL_ROW \
2363  MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \
2364  MICRO_COMPLEX_ADD_PEEL(1, 0)
2365 
2366 template<typename Scalar, typename Packet, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
2367 EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(
2368  const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
2369  const Scalar* &rhs_ptr_real0, const Scalar* &rhs_ptr_real1, const Scalar* &rhs_ptr_real2,
2370  const Scalar* &rhs_ptr_imag0, const Scalar* &rhs_ptr_imag1, const Scalar* &rhs_ptr_imag2,
2371  PacketBlock<Packet,accRows> &accReal, PacketBlock<Packet,accRows> &accImag)
2372 {
2373  MICRO_COMPLEX_BROADCAST_EXTRA
2374  pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
2375  MICRO_COMPLEX_ADD_COLS(1)
2376 }
2377 
2378 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
2379 EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration(
2380  const DataMapper& res,
2381  const Scalar* lhs_base,
2382  const Scalar* rhs_base,
2383  Index depth,
2384  Index strideA,
2385  Index offsetA,
2386  Index strideB,
2387  Index row,
2388  Index rows,
2389  const Packet& pAlphaReal,
2390  const Packet& pAlphaImag,
2391  const Packet& pMask)
2392 {
2393  const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL;
2394  const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL;
2395  const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
2396  const Scalar* lhs_ptr_imag = NULL;
2397  if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
2398  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
2399  PacketBlock<Packet,accRows> accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
2400  PacketBlock<Packet,accRows> taccReal, taccImag;
2401  PacketBlock<Packetc,accRows> acc0, acc1;
2402  PacketBlock<Packetc,accRows*2> tRes;
2403 
2404  MICRO_COMPLEX_SRC2_PTR
2405 
2406  bsetzero<Packet, accRows>(accReal0);
2407  bsetzero<Packet, accRows>(accImag0);
2408 
2409  Index remaining_depth = depth & -quad_traits<Scalar>::rows;
2410  Index k = 0;
2411  if (remaining_depth >= PEEL_COMPLEX_ROW) {
2412  MICRO_COMPLEX_ZERO_PEEL_ROW
2413  do
2414  {
2415  MICRO_COMPLEX_PREFETCHN(accRows)
2416  EIGEN_POWER_PREFETCH(lhs_ptr_real);
2417  if(!LhsIsReal) {
2418  EIGEN_POWER_PREFETCH(lhs_ptr_imag);
2419  }
2420  MICRO_COMPLEX_WORK_PEEL_ROW
2421  } while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth);
2422  MICRO_COMPLEX_ADD_PEEL_ROW
2423  }
2424  for(; k < depth; k++)
2425  {
2426  MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real0, rhs_ptr_real1, rhs_ptr_real2, rhs_ptr_imag0, rhs_ptr_imag1, rhs_ptr_imag2, accReal0, accImag0);
2427  }
2428 
2429  constexpr bool full = (remaining_rows > accColsC);
2430  bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row, 0);
2431  if ((accRows == 1) || (rows >= accCols))
2432  {
2433  bscalec<Packet,accRows,true>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
2434  bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1);
2435  bstore<DataMapper, Packetc, accRows>(acc0, res, row + 0);
2436  if (full) {
2437  bstore<DataMapper, Packetc, accRows>(acc1, res, row + accColsC);
2438  }
2439  } else {
2440  bscalec<Packet,accRows,false>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
2441  bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1);
2442 
2443  if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
2444  {
2445  for(Index j = 0; j < accRows; j++) {
2446  res(row + 0, j) = pfirst<Packetc>(acc0.packet[j]);
2447  }
2448  } else {
2449  bstore<DataMapper, Packetc, accRows>(acc0, res, row + 0);
2450  if (full) {
2451  for(Index j = 0; j < accRows; j++) {
2452  res(row + accColsC, j) = pfirst<Packetc>(acc1.packet[j]);
2453  }
2454  }
2455  }
2456  }
2457 }
2458 
2459 #define MICRO_COMPLEX_EXTRA_ROWS(N) \
2460  gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, N>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlphaReal, pAlphaImag, pMask);
2461 
2462 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2463 EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
2464  const DataMapper& res,
2465  const Scalar* lhs_base,
2466  const Scalar* rhs_base,
2467  Index depth,
2468  Index strideA,
2469  Index offsetA,
2470  Index strideB,
2471  Index row,
2472  Index rows,
2473  Index remaining_rows,
2474  const Packet& pAlphaReal,
2475  const Packet& pAlphaImag,
2476  const Packet& pMask)
2477 {
2478  MICRO_EXTRA(MICRO_COMPLEX_EXTRA_ROWS, remaining_rows, false)
2479 }
2480 
2481 #define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
2482  MICRO_COMPLEX_UNROLL(func2); \
2483  func(0,peel) func(1,peel) func(2,peel) func(3,peel)
2484 
2485 #define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
2486  if (unroll_factor > iter) { \
2487  pgerc_common<accRows, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
2488  }
2489 
2490 #define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
2491  if (PEEL_COMPLEX > peel) { \
2492  Packet lhsV0, lhsV1, lhsV2, lhsV3; \
2493  Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
2494  MICRO_COMPLEX_BROADCAST(peel) \
2495  MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
2496  } else { \
2497  EIGEN_UNUSED_VARIABLE(rhsV##peel); \
2498  EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
2499  }
2500 
2501 #define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
2502  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \
2503  Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \
2504  func(func1,func2,0) func(func1,func2,1) \
2505  func(func1,func2,2) func(func1,func2,3)
2506 
2507 #define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
2508  Packet rhsV0[M], rhsVi0[M];\
2509  func(func1,func2,0)
2510 
2511 #define MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_TYPE, size) \
2512  MICRO_COMPLEX_TYPE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE) \
2513  MICRO_COMPLEX_ADD_ROWS(size, false)
2514 
2515 #define MICRO_COMPLEX_ONE_PEEL4 MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_UNROLL_TYPE_PEEL, PEEL_COMPLEX)
2516 
2517 #define MICRO_COMPLEX_ONE4 MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_UNROLL_TYPE_ONE, 1)
2518 
2519 #define MICRO_COMPLEX_DST_PTR_ONE(iter) \
2520  if (unroll_factor > iter) { \
2521  bsetzero<Packet, accRows>(accReal##iter); \
2522  bsetzero<Packet, accRows>(accImag##iter); \
2523  } else { \
2524  EIGEN_UNUSED_VARIABLE(accReal##iter); \
2525  EIGEN_UNUSED_VARIABLE(accImag##iter); \
2526  }
2527 
2528 #define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)
2529 
2530 #define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)
2531 
2532 #define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
2533 
2534 #define MICRO_COMPLEX_STORE_ONE(iter) \
2535  if (unroll_factor > iter) { \
2536  constexpr bool full = ((MICRO_NORMAL(iter)) || (accCols2 > accColsC)); \
2537  bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row + iter*accCols, 0); \
2538  bscalec<Packet,accRows,!(MICRO_NORMAL(iter))>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); \
2539  bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1); \
2540  bstore<DataMapper, Packetc, accRows>(acc0, res, row + iter*accCols + 0); \
2541  if (full) { \
2542  bstore<DataMapper, Packetc, accRows>(acc1, res, row + iter*accCols + accColsC); \
2543  } \
2544  }
2545 
2546 #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)
2547 
2548 template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2549 EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_iteration(
2550  const DataMapper& res,
2551  const Scalar* lhs_base,
2552  const Scalar* rhs_base,
2553  Index depth,
2554  Index strideA,
2555  Index offsetA,
2556  Index strideB,
2557  Index& row,
2558  const Packet& pAlphaReal,
2559  const Packet& pAlphaImag,
2560  const Packet& pMask)
2561 {
2562  const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL;
2563  const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL;
2564  const Index imag_delta = accCols*strideA;
2565  const Index imag_delta2 = accCols2*strideA;
2566  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
2567  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
2568  PacketBlock<Packet,accRows> accReal0, accImag0, accReal1, accImag1;
2569  PacketBlock<Packet,accRows> accReal2, accImag2, accReal3, accImag3;
2570  PacketBlock<Packet,accRows> taccReal, taccImag;
2571  PacketBlock<Packetc,accRows> acc0, acc1;
2572  PacketBlock<Packetc,accRows*2> tRes;
2573 
2574  MICRO_COMPLEX_SRC2_PTR
2575  MICRO_COMPLEX_DST_PTR
2576  MICRO_COMPLEX_SRC_PTR
2577 
2578  Index k = 0;
2579  for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
2580  {
2581  MICRO_COMPLEX_PREFETCHN(accRows)
2582  MICRO_COMPLEX_PREFETCH
2583  MICRO_COMPLEX_ONE_PEEL4
2584  }
2585  for(; k < depth; k++)
2586  {
2587  MICRO_COMPLEX_ONE4
2588  }
2589  MICRO_COMPLEX_STORE
2590 
2591  MICRO_COMPLEX_UPDATE
2592 }
2593 
2594 #define MICRO_COMPLEX_UNROLL_ITER2(N, M) \
2595  gemm_complex_unrolled_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
2596  if (M) return;
2597 
2598 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2599 EIGEN_ALWAYS_INLINE void gemm_complex_cols(
2600  const DataMapper& res,
2601  const Scalar* blockA,
2602  const Scalar* blockB,
2603  Index depth,
2604  Index strideA,
2605  Index offsetA,
2606  Index strideB,
2607  Index offsetB,
2608  Index col,
2609  Index rows,
2610  Index remaining_rows,
2611  const Packet& pAlphaReal,
2612  const Packet& pAlphaImag,
2613  const Packet& pMask)
2614 {
2615  const DataMapper res3 = res.getSubMapper(0, col);
2616 
2617  const Scalar* rhs_base = blockB + advanceCols*col*strideB + MICRO_NEW_ROWS*offsetB;
2618  const Scalar* lhs_base = blockA + accCols*offsetA;
2619  Index row = 0;
2620 
2621 #define MAX_COMPLEX_UNROLL 4
2622  while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
2624  }
2625  switch( (rows-row)/accCols ) {
2626 #if MAX_COMPLEX_UNROLL > 4
2627  case 4:
2629  break;
2630 #endif
2631 #if MAX_COMPLEX_UNROLL > 3
2632  case 3:
2634  break;
2635 #endif
2636 #if MAX_COMPLEX_UNROLL > 2
2637  case 2:
2639  break;
2640 #endif
2641 #if MAX_COMPLEX_UNROLL > 1
2642  case 1:
2644  break;
2645 #endif
2646  default:
2647  break;
2648  }
2649 #undef MAX_COMPLEX_UNROLL
2650 
2651  if(remaining_rows > 0)
2652  {
2653  gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
2654  }
2655 }
2656 
2657 #define MICRO_COMPLEX_EXTRA_COLS(N) \
2658  gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, N, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
2659 
2660 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2661 EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(
2662  const DataMapper& res,
2663  const Scalar* blockA,
2664  const Scalar* blockB,
2665  Index depth,
2666  Index strideA,
2667  Index offsetA,
2668  Index strideB,
2669  Index offsetB,
2670  Index col,
2671  Index rows,
2672  Index cols,
2673  Index remaining_rows,
2674  const Packet& pAlphaReal,
2675  const Packet& pAlphaImag,
2676  const Packet& pMask)
2677 {
2678  MICRO_EXTRA(MICRO_COMPLEX_EXTRA_COLS, cols-col, true)
2679 }
2680 
2681 template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
2682 EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
2683 {
2684  const Index remaining_rows = rows % accCols;
2685 
2686  if( strideA == -1 ) strideA = depth;
2687  if( strideB == -1 ) strideB = depth;
2688 
2689  const Packet pAlphaReal = pset1<Packet>(alpha.real());
2690  const Packet pAlphaImag = pset1<Packet>(alpha.imag());
2691  const Packet pMask = bmask<Packet>(remaining_rows);
2692 
2693  const Scalar* blockA = (Scalar *) blockAc;
2694  const Scalar* blockB = (Scalar *) blockBc;
2695 
2696  Index col = 0;
2697  for(; col + accRows <= cols; col += accRows)
2698  {
2699  gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
2700  }
2701 
2702  if (col != cols)
2703  {
2704  gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
2705  }
2706 }
2707 
2708 #undef accColsC
2709 #undef advanceCols
2710 #undef advanceRows
2711 
2712 EIGEN_ALWAYS_INLINE bool supportsMMA()
2713 {
2714 #if defined(EIGEN_ALTIVEC_MMA_ONLY)
2715  return true;
2716 #else
2717 #if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) && __has_builtin(__builtin_cpu_supports)
2718  return __builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma");
2719 #else
2720  return false; // No dynamic dispatch for LLVM
2721 #endif
2722 #endif
2723 }
2724 
2725 EIGEN_ALWAYS_INLINE Packet4f loadAndMultiplyF32(Packet4f acc, const Packet4f pAlpha, float* result)
2726 {
2727  Packet4f result_block = ploadu<Packet4f>(result);
2728  return pmadd(acc, pAlpha, result_block);
2729 }
2730 
2731 template<bool lhsExtraRows>
2732 EIGEN_ALWAYS_INLINE void storeF32(float*& result, Packet4f result_block, Index rows, Index extra_rows)
2733 {
2734  if (lhsExtraRows) {
2735  pstoreu_partial(result, result_block, extra_rows);
2736  } else {
2737  pstoreu(result, result_block);
2738  }
2739  result += rows;
2740 }
2741 
2742 template<bool rhsExtraCols, bool lhsExtraRows>
2743 EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result, Index extra_cols, Index extra_rows)
2744 {
2745  Index x = 0;
2746  if (rhsExtraCols) {
2747  do{
2748  Packet4f result_block = loadAndMultiplyF32(acc[x], pAlpha, result);
2749  storeF32<lhsExtraRows>(result, result_block, rows, extra_rows);
2750  } while (++x < extra_cols);
2751  } else {
2752  Packet4f result_block[4];
2753  float *result2 = result;
2754  do{
2755  result_block[x] = loadAndMultiplyF32(acc[x], pAlpha, result);
2756  result += rows;
2757  } while (++x < 4);
2758  x = 0;
2759  do{
2760  storeF32<lhsExtraRows>(result2, result_block[x], rows, extra_rows);
2761  } while (++x < 4);
2762  }
2763 }
2764 
2765 EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Hi(Packet8us data)
2766 {
2767  Packet8us z = pset1<Packet8us>(0);
2768 #ifdef _BIG_ENDIAN
2769  return reinterpret_cast<Packet4f>(vec_mergeh(data, z));
2770 #else
2771  return reinterpret_cast<Packet4f>(vec_mergeh(z, data));
2772 #endif
2773 }
2774 
2775 EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Lo(Packet8us data)
2776 {
2777  Packet8us z = pset1<Packet8us>(0);
2778 #ifdef _BIG_ENDIAN
2779  return reinterpret_cast<Packet4f>(vec_mergel(data, z));
2780 #else
2781  return reinterpret_cast<Packet4f>(vec_mergel(z, data));
2782 #endif
2783 }
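// Note: bfloat16 is the upper half of the corresponding float32 bit pattern, so widening bf16 to f32
// only needs each 16-bit element paired with a zero low half; the vec_mergeh/vec_mergel calls above
// do that for the first and second four elements of a Packet8bf, respectively.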
2784 
2785 template<Index N, Index M>
2786 EIGEN_ALWAYS_INLINE void storeConvertTwoBF16(float* to, PacketBlock<Packet8bf,(N+7)/8>& block, Index extra = 0)
2787 {
2788  if (N < 4) {
2789  pstoreu_partial(to + 0, oneConvertBF16Hi(block.packet[0].m_val), extra);
2790  } else if (N >= (M*8+4)) {
2791  pstoreu(to + 0, oneConvertBF16Hi(block.packet[M].m_val));
2792  if (N >= 8) {
2793  pstoreu(to + 4, oneConvertBF16Lo(block.packet[M].m_val));
2794  }
2795  }
2796 }
2797 
2798 template<Index N>
2799 EIGEN_ALWAYS_INLINE void storeConvertBlockBF16(float* to, PacketBlock<Packet8bf,(N+7)/8>& block, Index extra)
2800 {
2801  storeConvertTwoBF16<N, 0>(to + 0, block, extra);
2802  if (N >= 16) {
2803  storeConvertTwoBF16<N, 1>(to + 8, block);
2804  }
2805  if (N >= 32) {
2806  storeConvertTwoBF16<N, 2>(to + 16, block);
2807  storeConvertTwoBF16<N, 3>(to + 24, block);
2808  }
2809 }
2810 
2811 template<bool non_unit_stride, Index delta>
2812 EIGEN_ALWAYS_INLINE Packet8bf loadBF16fromResult(bfloat16* src, Index resInc)
2813 {
2814  if (non_unit_stride) {
2815  return pgather<bfloat16, Packet8bf>(src + delta*resInc, resInc);
2816  } else {
2817  return ploadu<Packet8bf>(src + delta);
2818  }
2819 }
2820 
2821 static Packet16uc p16uc_MERGE16_32_1 = { 0, 1, 16,17, 2, 3, 18,19, 0, 1, 16,17, 2, 3, 18,19 };
2822 static Packet16uc p16uc_MERGE16_32_2 = { 4, 5, 20,21, 6, 7, 22,23, 4, 5, 20,21, 6, 7, 22,23 };
2823 static Packet16uc p16uc_MERGE16_32_3 = { 8, 9, 24,25, 10,11, 26,27, 8, 9, 24,25, 10,11, 26,27 };
2824 static Packet16uc p16uc_MERGE16_32_4 = { 12,13, 28,29, 14,15, 30,31, 12,13, 28,29, 14,15, 30,31 };
2825 
2826 static Packet16uc p16uc_MERGE16_32_5 = { 0,1, 16,17, 16,17, 16,17, 0,1, 16,17, 16,17, 16,17 };
2827 static Packet16uc p16uc_MERGE16_32_6 = { 2,3, 18,19, 18,19, 18,19, 2,3, 18,19, 18,19, 18,19 };
2828 static Packet16uc p16uc_MERGE16_32_7 = { 4,5, 20,21, 20,21, 20,21, 4,5, 20,21, 20,21, 20,21 };
2829 static Packet16uc p16uc_MERGE16_32_8 = { 6,7, 22,23, 22,23, 22,23, 6,7, 22,23, 22,23, 22,23 };
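// These permute masks expand pairs of bfloat16 elements into float32 lane patterns for the VSX bf16
// kernel below: the first four interleave two adjacent bf16 values (duplicated across the vector),
// while the last four handle a single trailing (odd) value paired with zeros. They are consumed by
// oneConvertBF16Perm/convertArrayPointerBF16toF32DupOne.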
2830 
2831 EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Perm(Packet8us data, Packet16uc mask)
2832 {
2833  Packet8us z = pset1<Packet8us>(0);
2834 #ifdef _BIG_ENDIAN
2835  return reinterpret_cast<Packet4f>(vec_perm(data, z, mask));
2836 #else
2837  return reinterpret_cast<Packet4f>(vec_perm(z, data, mask));
2838 #endif
2839 }
2840 
2841 template<bool lhsExtraRows, bool odd, Index size>
2842 EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32DupOne(float *result, Index rows, const bfloat16* src, Index extra_rows)
2843 {
2844  Packet4f dup[4*4];
2845  Packet8bf data[4];
2846 
2847  for (Index i = 0; i < size; i++) {
2848  data[i] = ploadu<Packet8bf>(src + rows*i);
2849  }
2850 
2851  for (Index i = 0, j = 0; i < size; i++, j += 4) {
2852  dup[j+0] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_5 : p16uc_MERGE16_32_1);
2853  dup[j+1] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_6 : p16uc_MERGE16_32_2);
2854  dup[j+2] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_7 : p16uc_MERGE16_32_3);
2855  dup[j+3] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_8 : p16uc_MERGE16_32_4);
2856  }
2857 
2858  for (Index j = 0; j < 4*size; j += 4) {
2859  if (lhsExtraRows) {
2860  Packet4f z = pset1<Packet4f>(float(0));
2861  Index i = 0;
2862  do {
2863  pstoreu(result + (j+i)*4, dup[j+i]);
2864  } while (++i < extra_rows);
2865  do {
2866  pstoreu(result + (j+i)*4, z);
2867  } while (++i < 4);
2868  } else {
2869  for (Index i = 0; i < 4; i++) {
2870  pstoreu(result + (j+i)*4, dup[j+i]);
2871  }
2872  }
2873  }
2874 }
2875 
2876 template<bool lhsExtraRows>
2877 EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32Dup(float *result, Index cols, Index rows, const bfloat16* src, Index delta, Index extra_rows)
2878 {
2879  Index col = 0;
2880  src += delta*2;
2881  for(; col + 4*2 <= cols; col += 4*2, result += 4*4*4, src += 4*rows) {
2882  convertArrayPointerBF16toF32DupOne<lhsExtraRows,false,4>(result, rows, src, extra_rows);
2883  }
2884  for(; col + 2 <= cols; col += 2, result += 4*4, src += rows) {
2885  convertArrayPointerBF16toF32DupOne<lhsExtraRows,false,1>(result, rows, src, extra_rows);
2886  }
2887  if (cols & 1) {
2888  convertArrayPointerBF16toF32DupOne<lhsExtraRows,true,1>(result, rows, src - delta, extra_rows);
2889  }
2890 }
2891 
2892 template<const Index size, bool non_unit_stride>
2893 EIGEN_ALWAYS_INLINE void convertPointerBF16toF32(Index& i, float *result, Index rows, bfloat16*& src, Index resInc)
2894 {
2895  constexpr Index extra = ((size < 4) ? 4 : size);
2896  while (i + size <= rows) {
2897  PacketBlock<Packet8bf,(size+7)/8> r32;
2898  r32.packet[0] = loadBF16fromResult<non_unit_stride, 0>(src, resInc);
2899  if (size >= 16) {
2900  r32.packet[1] = loadBF16fromResult<non_unit_stride, 8>(src, resInc);
2901  }
2902  if (size >= 32) {
2903  r32.packet[2] = loadBF16fromResult<non_unit_stride, 16>(src, resInc);
2904  r32.packet[3] = loadBF16fromResult<non_unit_stride, 24>(src, resInc);
2905  }
2906  storeConvertBlockBF16<size>(result + i, r32, rows & 3);
2907  i += extra; src += extra*resInc;
2908  if (size != 32) break;
2909  }
2910 }
2911 
2912 template<bool non_unit_stride>
2913 EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float *result, Index cols, Index rows, bfloat16* src, Index resInc = 1)
2914 {
2915  for(Index col = 0; col < cols; col++, src += (rows*resInc), result += rows) {
2916  Index i = 0;
2917  bfloat16* src2 = src;
2918  convertPointerBF16toF32<32, non_unit_stride>(i, result, rows, src2, resInc);
2919  convertPointerBF16toF32<16, non_unit_stride>(i, result, rows, src2, resInc);
2920  convertPointerBF16toF32<8, non_unit_stride>(i, result, rows, src2, resInc);
2921  convertPointerBF16toF32<4, non_unit_stride>(i, result, rows, src2, resInc);
2922  convertPointerBF16toF32<1, non_unit_stride>(i, result, rows, src2, resInc);
2923  }
2924 }
2925 
2926 template<Index num_acc, Index size = 4>
2927 EIGEN_ALWAYS_INLINE void zeroAccumulators(Packet4f (&acc)[num_acc][size])
2928 {
2929  Packet4f z = pset1<Packet4f>(float(0));
2930 
2931  for(Index k = 0; k < num_acc; k++) {
2932  for(Index j = 0; j < size; j++) {
2933  acc[k][j] = z;
2934  }
2935  }
2936 }
2937 
2938 template<Index num_acc>
2939 EIGEN_ALWAYS_INLINE void tranposeResults(Packet4f (&acc)[num_acc][4])
2940 {
2941  for(Index i = 0; i < num_acc; i++) {
2942  Packet4ui t0, t1, t2, t3;
2943  t0 = vec_mergeh(reinterpret_cast<Packet4ui>(acc[i][0]), reinterpret_cast<Packet4ui>(acc[i][2]));
2944  t1 = vec_mergel(reinterpret_cast<Packet4ui>(acc[i][0]), reinterpret_cast<Packet4ui>(acc[i][2]));
2945  t2 = vec_mergeh(reinterpret_cast<Packet4ui>(acc[i][1]), reinterpret_cast<Packet4ui>(acc[i][3]));
2946  t3 = vec_mergel(reinterpret_cast<Packet4ui>(acc[i][1]), reinterpret_cast<Packet4ui>(acc[i][3]));
2947  acc[i][0] = reinterpret_cast<Packet4f>(vec_mergeh(t0, t2));
2948  acc[i][1] = reinterpret_cast<Packet4f>(vec_mergel(t0, t2));
2949  acc[i][2] = reinterpret_cast<Packet4f>(vec_mergeh(t1, t3));
2950  acc[i][3] = reinterpret_cast<Packet4f>(vec_mergel(t1, t3));
2951  }
2952 }
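// Note: this is the standard 4x4 in-register transpose built from vec_mergeh/vec_mergel pairs;
// addResults below then combines the transposed partial sums.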
2953 
2954 template<Index num_acc>
2955 EIGEN_ALWAYS_INLINE void addResults(Packet4f (&acc)[num_acc][4])
2956 {
2957  for(Index i = 0, j = 0; j < num_acc; i++, j += 2) {
2958  for(Index x = 0, y = 0; x < 2; x++, y += 2) {
2959  for(Index w = 0, z = 0; w < 2; w++, z += 2) {
2960  acc[i][y+w] = acc[j+x][z+0] + acc[j+x][z+1];
2961  }
2962  }
2963  }
2964 }
2965 
2966 template<Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs>
2967 EIGEN_ALWAYS_INLINE void outputResultsVSX(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result, const Index extra_cols, Index extra_rows)
2968 {
2969  tranposeResults<num_acc>(acc);
2970  addResults<num_acc>(acc);
2971 
2972  constexpr Index real_rhs = ((num_rhs / 2) - (rhsExtraCols ? 1 : 0));
2973  Index k = 0;
2974  for(Index i = 0; i < real_rhs; i++, result += 4*rows, k++){
2975  storeResults<false, lhsExtraRows>(acc[k], rows, pAlpha, result, extra_cols, extra_rows);
2976  }
2977  if(rhsExtraCols) {
2978  storeResults<rhsExtraCols, lhsExtraRows>(acc[k], rows, pAlpha, result, extra_cols, extra_rows);
2979  }
2980 }
2981 
2982 template<bool zero>
2983 EIGEN_ALWAYS_INLINE void loadTwoRhsFloat32(const float* block, Index strideB, Index i, Packet4f& dhs0, Packet4f &dhs1)
2984 {
2985  dhs0 = ploadu<Packet4f>(block + strideB*i + 0);
2986  if (zero) {
2987  Packet4f dhs2 = pset1<Packet4f>(float(0));
2988  dhs1 = vec_mergel(dhs0, dhs2);
2989  dhs0 = vec_mergeh(dhs0, dhs2);
2990  } else {
2991  dhs1 = ploadu<Packet4f>(block + strideB*i + 4);
2992  }
2993 }
2994 
2995 template<Index num_acc, bool zero, bool rhsExtraCols, Index num_rhs>
2996 EIGEN_ALWAYS_INLINE void KLoop
2997 (
2998  const float* indexA,
2999  const float* indexB,
3000  Packet4f (&acc)[num_acc][4],
3001  Index strideB,
3002  Index k,
3003  Index offsetB,
3004  Index extra_cols
3005 )
3006 {
3007  constexpr Index num_lhs = 4;
3008  Packet4f lhs[num_lhs], rhs[num_rhs];
3009 
3010  constexpr Index real_rhs = (num_rhs - (rhsExtraCols ? 2 : 0));
3011  for(Index i = 0; i < real_rhs; i += 2){
3012  loadTwoRhsFloat32<zero>(indexB + k*4, strideB, i, rhs[i + 0], rhs[i + 1]);
3013  }
3014  if(rhsExtraCols) {
3015  loadTwoRhsFloat32<zero>(indexB + k*extra_cols - offsetB, strideB, real_rhs, rhs[real_rhs + 0], rhs[real_rhs + 1]);
3016  }
3017 
3018  indexA += 2*k*4;
3019  for(Index j = 0; j < num_lhs; j++) {
3020  lhs[j] = ploadu<Packet4f>(indexA + j*4);
3021  }
3022 
3023  for(Index j = 0; j < num_rhs; j++) {
3024  for(Index i = 0; i < num_lhs; i++) {
3025  acc[j][i] = pmadd(rhs[j], lhs[i], acc[j][i]);
3026  }
3027  }
3028 }
3029 
3030 template<const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
3031 EIGEN_ALWAYS_INLINE void colVSXLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const float* indexA, const float* indexB, Index strideB, Index offsetB, float* result, const Index extra_cols, const Index extra_rows)
3032 {
3033  constexpr Index num_rhs = num_acc;
3034 
3035  Packet4f acc[num_acc][4];
3036 
3037  zeroAccumulators<num_acc>(acc);
3038 
3039  Index k;
3040  for(k = 0; k + 2 <= depth; k += 2){
3041  KLoop<num_acc, false, rhsExtraCols, num_rhs>(indexA, indexB, acc, strideB, k, offsetB, extra_cols);
3042  }
3043  if(depth&1){
3044  KLoop<num_acc, true, rhsExtraCols, num_rhs>(indexA, indexB, acc, strideB, k, offsetB, extra_cols);
3045  }
3046 
3047  outputResultsVSX<num_acc, rhsExtraCols, lhsExtraRows, num_rhs>(acc, rows, pAlpha, result, extra_cols, extra_rows);
3048 }
3049 
3050 // No more than 4 (the loop body uses 2X the accumulators, i.e. 8X the number of VSX registers)
3051 #define MAX_BFLOAT16_ACC_VSX 4
3052 
3053 template<const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
3054 void colVSXLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA, const float* indexB, Index strideB, Index offsetB, float* result)
3055 {
3056  constexpr Index step = (num_acc * 4); // each accumulator has 4 elements
3057  const Index extra_cols = (rhsExtraCols) ? (cols & 3) : 0;
3058  const Index extra_rows = (lhsExtraRows) ? (rows & 3) : 0;
3059  constexpr bool multiIters = !rhsExtraCols && (num_acc == MAX_BFLOAT16_ACC_VSX);
3060 
3061  do{
3062  colVSXLoopBodyIter<num_acc*2, rhsExtraCols, lhsExtraRows>(depth, rows, pAlpha, indexA, indexB, strideB, offsetB, result, extra_cols, extra_rows);
3063 
3064  indexB += strideB*(num_acc * 2);
3065  result += rows*step;
3066  } while(multiIters && (step <= cols - (col += step)));
3067 }
3068 
3069 template<const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
3070 EIGEN_ALWAYS_INLINE void colVSXLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA, const float* blockB, Index strideB, Index offsetB, float* result)
3071 {
3072  if (MAX_BFLOAT16_ACC_VSX > num_acc) {
3073  colVSXLoopBody<num_acc + (rhsExtraCols ? 1 : 0), rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
3074  }
3075 }
3076 
3077 template<bool rhsExtraCols, bool lhsExtraRows>
3078 void colVSXLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA, const float* blockB, Index strideB, Index offsetB, float* result)
3079 {
3080  switch ((cols - col) >> 2) {
3081  case 3:
3082  colVSXLoopBodyExtraN<3, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
3083  break;
3084  case 2:
3085  colVSXLoopBodyExtraN<2, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
3086  break;
3087  case 1:
3088  colVSXLoopBodyExtraN<1, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
3089  break;
3090  default:
3091  if (rhsExtraCols) {
3092  colVSXLoopBody<1, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
3093  }
3094  break;
3095  }
3096 }
3097 
3098 template<Index size, bool lhsExtraRows = false>
3099 EIGEN_ALWAYS_INLINE void colVSXLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const float* indexA2, const float* blockB2, Index strideA, Index strideB, Index offsetB, float* result2)
3100 {
3101  Index delta_rows = 2*(lhsExtraRows ? (rows & 3) : size);
3102  for (Index row = 0; row < size; row += 4) {
3103  convertArrayPointerBF16toF32Dup<lhsExtraRows>(const_cast<float *>(indexA2), strideA, delta_rows, indexA, row, rows & 3);
3104 
3105  const float *blockB = blockB2;
3106  float *result = result2 + row;
3107 
3108  Index col = 0;
3109  if (cols >= (MAX_BFLOAT16_ACC_VSX * 4)) {
3110  colVSXLoopBody<MAX_BFLOAT16_ACC_VSX, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, 0, result);
3111  blockB += (strideB >> 1)*col;
3112  result += rows*col;
3113  }
3114  if (cols & 3) {
3115  colVSXLoopBodyExtra<true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, offsetB, result);
3116  } else {
3117  colVSXLoopBodyExtra<false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, 0, result);
3118  }
3119  }
3120 }
3121 
3122 template<Index size>
3123 EIGEN_ALWAYS_INLINE void calcVSXColLoops(const bfloat16*& indexA, const float* indexA2, Index& row, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexB, Index strideA, Index strideB, Index offsetA, Index offsetB, Index bigSuffix, float *result)
3124 {
3125  if ((size == 16) || (rows & size)) {
3126  indexA += size*offsetA;
3127  colVSXLoops<size>(depth, cols, rows, pAlpha, indexA, indexA2, indexB, strideA, strideB, offsetB, result + row);
3128  row += size;
3129  indexA += bigSuffix*size/16;
3130  }
3131 }
3132 
3133 template<const Index size, typename DataMapper>
3134 EIGEN_ALWAYS_INLINE void convertBF16toF32(Index& i, float *result, Index rows, const DataMapper& src)
3135 {
3136  constexpr Index extra = ((size < 4) ? 4 : size);
3137  while (i + size <= rows) {
3138  PacketBlock<Packet8bf,(size+7)/8> r32;
3139  r32.packet[0] = src.template loadPacket<Packet8bf>(i + 0);
3140  if (size >= 16) {
3141  r32.packet[1] = src.template loadPacket<Packet8bf>(i + 8);
3142  }
3143  if (size >= 32) {
3144  r32.packet[2] = src.template loadPacket<Packet8bf>(i + 16);
3145  r32.packet[3] = src.template loadPacket<Packet8bf>(i + 24);
3146  }
3147  storeConvertBlockBF16<size>(result + i, r32, rows & 3);
3148  i += extra;
3149  if (size != 32) break;
3150  }
3151 }
3152 
3153 template<typename DataMapper>
3154 EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float *result, Index cols, Index rows, const DataMapper& src)
3155 {
3156  typedef typename DataMapper::LinearMapper LinearMapper;
3157  for(Index j = 0; j < cols; j++, result += rows){
3158  const LinearMapper src2 = src.getLinearMapper(0, j);
3159  Index i = 0;
3160  convertBF16toF32<32, LinearMapper>(i, result, rows, src2);
3161  convertBF16toF32<16, LinearMapper>(i, result, rows, src2);
3162  convertBF16toF32<8, LinearMapper>(i, result, rows, src2);
3163  convertBF16toF32<4, LinearMapper>(i, result, rows, src2);
3164  convertBF16toF32<1, LinearMapper>(i, result, rows, src2);
3165  }
3166 }
3167 
3168 EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16VSX(const float *res)
3169 {
3171 }
3172 
3173 template<typename DataMapper, const Index size>
3174 EIGEN_ALWAYS_INLINE void convertArrayF32toBF16ColVSX(float *result, Index col, Index rows, const DataMapper& res)
3175 {
3176  const DataMapper res2 = res.getSubMapper(0, col);
3177  Index row;
3178  float *result2 = result + col*rows;
3179  for(row = 0; row + 8 <= rows; row += 8, result2 += 8){
3180  // get and save block
3181  PacketBlock<Packet8bf,size> block;
3182  for(Index j = 0; j < size; j++){
3183  block.packet[j] = convertF32toBF16VSX(result2 + j*rows);
3184  }
3185  res2.template storePacketBlock<Packet8bf,size>(row, 0, block);
3186  }
3187  // extra rows
3188  if(row < rows){
3189  for(Index j = 0; j < size; j++){
3190  Packet8bf fp16 = convertF32toBF16VSX(result2 + j*rows);
3191  res2.template storePacketPartial<Packet8bf>(row, j, fp16, rows & 7);
3192  }
3193  }
3194 }
3195 
3196 template<typename DataMapper>
3197 EIGEN_ALWAYS_INLINE void convertArrayF32toBF16VSX(float *result, Index cols, Index rows, const DataMapper& res)
3198 {
3199  Index col;
3200  for(col = 0; col + 4 <= cols; col += 4){
3201  convertArrayF32toBF16ColVSX<DataMapper,4>(result, col, rows, res);
3202  }
3203  // extra cols
3204  switch (cols - col) {
3205  case 1:
3206  convertArrayF32toBF16ColVSX<DataMapper,1>(result, col, rows, res);
3207  break;
3208  case 2:
3209  convertArrayF32toBF16ColVSX<DataMapper,2>(result, col, rows, res);
3210  break;
3211  case 3:
3212  convertArrayF32toBF16ColVSX<DataMapper,3>(result, col, rows, res);
3213  break;
3214  }
3215 }
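// Full four-column blocks are converted in the main loop; the switch dispatches the one
// to three leftover columns to the matching instantiation.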
3216 
3217 template<typename DataMapper>
3218 void gemmbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth, Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
3219 {
3220  float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
3221  const Packet4f pAlpha = pset1<Packet4f>(falpha);
3222 
3223  if( strideA == -1 ) strideA = depth;
3224  if( strideB == -1 ) strideB = depth;
3225 
3226  ei_declare_aligned_stack_constructed_variable(float, result, cols*rows, 0);
3227  ei_declare_aligned_stack_constructed_variable(float, indexB2, strideB*cols, 0);
3228  ei_declare_aligned_stack_constructed_variable(float, indexA2, ((strideA + 1) & -2)*4*2, 0);
3229 
3230  convertArrayBF16toF32<DataMapper>(result, cols, rows, res);
3231  convertArrayPointerBF16toF32(indexB2, cols, strideB, const_cast<bfloat16 *>(indexB));
3232 
3233  Index bigSuffix = 2*8*(strideA-offsetA);
3234  float* indexBF32 = indexB2 + 4*offsetB;
3235  offsetB *= 3;
3236  strideB *= 2;
3237 
3238  Index row = 0;
3239  // LHS (8x16) block
3240  while(row + 16 <= rows){
3241  calcVSXColLoops<16>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB, bigSuffix, result);
3242  }
3243  // LHS (8x8) block
3244  calcVSXColLoops<8>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB, bigSuffix, result);
3245  // LHS (8x4) block
3246  calcVSXColLoops<4>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB, bigSuffix, result);
3247  // extra rows
3248  if(rows & 3){
3249  // This index is the beginning of the remaining block.
3250  colVSXLoops<4, true>(depth, cols, rows, pAlpha, indexA, indexA2, indexBF32, strideA, strideB, offsetB, result + row);
3251  }
3252 
3253  // Convert back to bfloat16
3254  convertArrayF32toBF16VSX<DataMapper>(result, cols, rows, res);
3255 }
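// Illustrative only: this VSX bfloat16 GEMM is not called directly by users; it is
// reached through Eigen's product evaluators via the gebp_kernel specialization below,
// e.g. (hypothetical user code)
//   Eigen::Matrix<Eigen::bfloat16, Eigen::Dynamic, Eigen::Dynamic> A(m, k), B(k, n);
//   Eigen::Matrix<Eigen::bfloat16, Eigen::Dynamic, Eigen::Dynamic> C = A * B;
// Accumulation happens in the float scratch buffer and the result is converted back to
// bfloat16 by convertArrayF32toBF16VSX above.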
3256 
3257 #undef MAX_BFLOAT16_ACC_VSX
3258 
3259 #include "MatrixVectorProduct.h"
3260 
3261 
3264 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3265 struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
3266 {
3267  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
3268 };
3269 
3270 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3271 void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
3272  ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
3273 {
3274  dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
3275  pack(blockA, lhs, depth, rows, stride, offset);
3276 }
3277 
3278 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3279 struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
3280 {
3281  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
3282 };
3283 
3284 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3285 void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
3286  ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
3287 {
3288  dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
3289  pack(blockA, lhs, depth, rows, stride, offset);
3290 }
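// The gemm_pack_lhs / gemm_pack_rhs specializations in this section are thin wrappers:
// each one instantiates dhs_pack (or dhs_cpack for complex scalars) with the scalar type,
// packet type, storage order, PanelMode and an lhs/rhs flag, so the actual packing loops
// live in a single template.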
3291 
3292 #if EIGEN_ALTIVEC_USE_CUSTOM_PACK
3293 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3294 struct gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
3295 {
3296  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
3297 };
3298 
3299 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3300 void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
3301  ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
3302 {
3303  dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
3304  pack(blockB, rhs, depth, cols, stride, offset);
3305 }
3306 
3307 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3308 struct gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3309 {
3310  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
3311 };
3312 
3313 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3314 void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3315  ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
3316 {
3317  dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
3318  pack(blockB, rhs, depth, cols, stride, offset);
3319 }
3320 
3321 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3322 struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
3323 {
3324  void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
3325 };
3326 
3327 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3328 void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
3329  ::operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
3330 {
3331  dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, false> pack;
3332  pack(blockB, rhs, depth, cols, stride, offset);
3333 }
3334 
3335 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3336 struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3337 {
3338  void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
3339 };
3340 
3341 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3342 void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3343  ::operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
3344 {
3345  dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, false> pack;
3346  pack(blockB, rhs, depth, cols, stride, offset);
3347 }
3348 #endif
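// End of the EIGEN_ALTIVEC_USE_CUSTOM_PACK block: when the macro is defined to 0, these
// RHS packers are skipped and Eigen's generic gemm_pack_rhs implementation is used
// instead.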
3349 
3350 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3351 struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
3352 {
3353  void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
3354 };
3355 
3356 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3357 void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
3358  ::operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
3359 {
3360  dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, true> pack;
3361  pack(blockA, lhs, depth, rows, stride, offset);
3362 }
3363 
3364 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3365 struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
3366 {
3367  void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
3368 };
3369 
3370 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3371 void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
3372  ::operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
3373 {
3374  dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, true> pack;
3375  pack(blockA, lhs, depth, rows, stride, offset);
3376 }
3377 
3378 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3379 struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
3380 {
3381  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
3382 };
3383 
3384 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3385 void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
3386  ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
3387 {
3388  dhs_pack<float, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
3389  pack(blockA, lhs, depth, rows, stride, offset);
3390 }
3391 
3392 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3393 struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
3394 {
3395  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
3396 };
3397 
3398 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3399 void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
3400  ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
3401 {
3402  dhs_pack<float, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
3403  pack(blockA, lhs, depth, rows, stride, offset);
3404 }
3405 
3406 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3407 struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
3408 {
3409  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
3410 };
3411 
3412 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3413 void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
3414  ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
3415 {
3416  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
3417  pack(blockA, lhs, depth, rows, stride, offset);
3418 }
3419 
3420 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3421 struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
3422 {
3423  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
3424 };
3425 
3426 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3427 void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
3428  ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
3429 {
3430  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
3431  pack(blockA, lhs, depth, rows, stride, offset);
3432 }
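// dhs_cpack is the complex counterpart of dhs_pack; the Conjugate template flag is
// forwarded so conjugation is applied while the operand is being packed.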
3433 
3434 #if EIGEN_ALTIVEC_USE_CUSTOM_PACK
3435 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3436 struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
3437 {
3438  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
3439 };
3440 
3441 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3442 void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
3443  ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
3444 {
3445  dhs_pack<float, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
3446  pack(blockB, rhs, depth, cols, stride, offset);
3447 }
3448 
3449 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3450 struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3451 {
3452  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
3453 };
3454 
3455 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3456 void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3457  ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
3458 {
3459  dhs_pack<float, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
3460  pack(blockB, rhs, depth, cols, stride, offset);
3461 }
3462 #endif
3463 
3464 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3465 struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
3466 {
3467  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
3468 };
3469 
3470 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3471 void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
3472  ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
3473 {
3474  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
3475  pack(blockB, rhs, depth, cols, stride, offset);
3476 }
3477 
3478 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3479 struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3480 {
3481  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
3482 };
3483 
3484 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3485 void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3486  ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
3487 {
3488  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
3489  pack(blockB, rhs, depth, cols, stride, offset);
3490 }
3491 
3492 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3493 struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
3494 {
3495  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
3496 };
3497 
3498 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3499 void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
3500  ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
3501 {
3502  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
3503  pack(blockA, lhs, depth, rows, stride, offset);
3504 }
3505 
3506 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3507 struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
3508 {
3509  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
3510 };
3511 
3512 template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
3513 void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
3514  ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
3515 {
3516  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
3517  pack(blockA, lhs, depth, rows, stride, offset);
3518 }
3519 
3520 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3521 struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
3522 {
3523  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
3524 };
3525 
3526 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3527 void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
3528  ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
3529 {
3530  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
3531  pack(blockB, rhs, depth, cols, stride, offset);
3532 }
3533 
3534 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3535 struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3536 {
3537  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
3538 };
3539 
3540 template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3541 void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
3542  ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
3543 {
3544  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
3545  pack(blockB, rhs, depth, cols, stride, offset);
3546 }
3547 
3548 // ********* gebp specializations *********
3549 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3550 struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3551 {
3552  typedef typename quad_traits<float>::vectortype Packet;
3553  typedef typename quad_traits<float>::rhstype RhsPacket;
3554 
3555  void operator()(const DataMapper& res, const float* blockA, const float* blockB,
3556  Index rows, Index depth, Index cols, float alpha,
3557  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
3558 };
3559 
3560 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3561 void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3562  ::operator()(const DataMapper& res, const float* blockA, const float* blockB,
3563  Index rows, Index depth, Index cols, float alpha,
3564  Index strideA, Index strideB, Index offsetA, Index offsetB)
3565  {
3566  const Index accRows = quad_traits<float>::rows;
3567  const Index accCols = quad_traits<float>::size;
3568  static void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index) =
3569  #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
3570  (supportsMMA()) ?
3571  &Eigen::internal::gemmMMA<float, Packet, RhsPacket, DataMapper, accRows, accCols> :
3572  #endif
3573  &Eigen::internal::gemm<float, Packet, RhsPacket, DataMapper, accRows, accCols>;
3574  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
3575  }
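// The kernel is selected through a function-local static pointer, so supportsMMA() is
// evaluated only once per template instantiation (on first use); when the MMA header was
// not compiled in, the VSX gemm path is taken unconditionally.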
3576 
3577 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3578 struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3579 {
3580  typedef Packet4f Packet;
3581  typedef Packet2cf Packetc;
3582  typedef Packet4f RhsPacket;
3583 
3584  void operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
3585  Index rows, Index depth, Index cols, std::complex<float> alpha,
3586  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
3587 };
3588 
3589 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3590 void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3591  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
3592  Index rows, Index depth, Index cols, std::complex<float> alpha,
3593  Index strideA, Index strideB, Index offsetA, Index offsetB)
3594  {
3595  const Index accRows = quad_traits<float>::rows;
3596  const Index accCols = quad_traits<float>::size;
3597  static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
3598  Index, Index, Index, std::complex<float>, Index, Index, Index, Index) =
3599  #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
3600  (supportsMMA()) ?
3601  &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false> :
3602  #endif
3603  &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
3604  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
3605  }
3606 
3607 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3608 struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3609 {
3610  typedef Packet4f Packet;
3611  typedef Packet2cf Packetc;
3612  typedef Packet4f RhsPacket;
3613 
3614  void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
3615  Index rows, Index depth, Index cols, std::complex<float> alpha,
3616  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
3617 };
3618 
3619 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3620 void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3621  ::operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
3622  Index rows, Index depth, Index cols, std::complex<float> alpha,
3623  Index strideA, Index strideB, Index offsetA, Index offsetB)
3624  {
3625  const Index accRows = quad_traits<float>::rows;
3626  const Index accCols = quad_traits<float>::size;
3627  static void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
3628  Index, Index, Index, std::complex<float>, Index, Index, Index, Index) =
3629  #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
3630  (supportsMMA()) ?
3631  &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false> :
3632  #endif
3633  &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
3634  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
3635  }
3636 
3637 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3638 struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3639 {
3640  typedef Packet4f Packet;
3641  typedef Packet2cf Packetc;
3642  typedef Packet4f RhsPacket;
3643 
3644  void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
3645  Index rows, Index depth, Index cols, std::complex<float> alpha,
3646  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
3647 };
3648 
3649 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3650 void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3651  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
3652  Index rows, Index depth, Index cols, std::complex<float> alpha,
3653  Index strideA, Index strideB, Index offsetA, Index offsetB)
3654  {
3655  const Index accRows = quad_traits<float>::rows;
3656  const Index accCols = quad_traits<float>::size;
3657  static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
3658  Index, Index, Index, std::complex<float>, Index, Index, Index, Index) =
3659  #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
3660  (supportsMMA()) ?
3661  &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true> :
3662  #endif
3663  &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
3664  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
3665  }
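// In the three complex<float> kernels above, the trailing boolean template arguments of
// gemm_complex / gemm_complexMMA mark which operand is purely real: (false,false) for
// complex*complex, (true,false) when the LHS is real, and (false,true) when the RHS is
// real.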
3666 
3667 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3668 struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3669 {
3670  typedef typename quad_traits<double>::vectortype Packet;
3671  typedef typename quad_traits<double>::rhstype RhsPacket;
3672 
3673  void operator()(const DataMapper& res, const double* blockA, const double* blockB,
3674  Index rows, Index depth, Index cols, double alpha,
3675  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
3676 };
3677 
3678 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3679 void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3680  ::operator()(const DataMapper& res, const double* blockA, const double* blockB,
3681  Index rows, Index depth, Index cols, double alpha,
3682  Index strideA, Index strideB, Index offsetA, Index offsetB)
3683  {
3684  const Index accRows = quad_traits<double>::rows;
3685  const Index accCols = quad_traits<double>::size;
3686  static void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index) =
3687  #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
3688  (supportsMMA()) ?
3689  &Eigen::internal::gemmMMA<double, Packet, RhsPacket, DataMapper, accRows, accCols> :
3690  #endif
3691  &Eigen::internal::gemm<double, Packet, RhsPacket, DataMapper, accRows, accCols>;
3692  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
3693  }
3694 
3695 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3696 struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3697 {
3698  typedef quad_traits<double>::vectortype Packet;
3699  typedef Packet1cd Packetc;
3700  typedef quad_traits<double>::rhstype RhsPacket;
3701 
3702  void operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
3703  Index rows, Index depth, Index cols, std::complex<double> alpha,
3704  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
3705 };
3706 
3707 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3708 void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3709  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
3710  Index rows, Index depth, Index cols, std::complex<double> alpha,
3711  Index strideA, Index strideB, Index offsetA, Index offsetB)
3712  {
3713  const Index accRows = quad_traits<double>::rows;
3714  const Index accCols = quad_traits<double>::size;
3715  static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
3716  Index, Index, Index, std::complex<double>, Index, Index, Index, Index) =
3717  #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
3718  (supportsMMA()) ?
3719  &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false> :
3720  #endif
3721  &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
3722  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
3723  }
3724 
3725 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3726 struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3727 {
3728  typedef quad_traits<double>::vectortype Packet;
3729  typedef Packet1cd Packetc;
3730  typedef quad_traits<double>::rhstype RhsPacket;
3731 
3732  void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
3733  Index rows, Index depth, Index cols, std::complex<double> alpha,
3734  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
3735 };
3736 
3737 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3738 void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3739  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
3740  Index rows, Index depth, Index cols, std::complex<double> alpha,
3741  Index strideA, Index strideB, Index offsetA, Index offsetB)
3742  {
3743  const Index accRows = quad_traits<double>::rows;
3744  const Index accCols = quad_traits<double>::size;
3745  static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
3746  Index, Index, Index, std::complex<double>, Index, Index, Index, Index) =
3747  #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
3748  (supportsMMA()) ?
3749  &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true> :
3750  #endif
3751  &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
3752  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
3753  }
3754 
3755 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3756 struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3757 {
3758  typedef quad_traits<double>::vectortype Packet;
3759  typedef Packet1cd Packetc;
3760  typedef quad_traits<double>::rhstype RhsPacket;
3761 
3762  void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
3763  Index rows, Index depth, Index cols, std::complex<double> alpha,
3764  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
3765 };
3766 
3767 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3768 void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3769  ::operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
3770  Index rows, Index depth, Index cols, std::complex<double> alpha,
3771  Index strideA, Index strideB, Index offsetA, Index offsetB)
3772  {
3773  const Index accRows = quad_traits<double>::rows;
3774  const Index accCols = quad_traits<double>::size;
3775  static void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
3776  Index, Index, Index, std::complex<double>, Index, Index, Index, Index) =
3777  #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
3778  (supportsMMA()) ?
3779  &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false> :
3780  #endif
3781  &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
3782  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
3783  }
3784 
3785 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3786 struct gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3787 {
3788  typedef typename quad_traits<bfloat16>::vectortype Packet;
3789  typedef typename quad_traits<bfloat16>::rhstype RhsPacket;
3790 
3791  void operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB,
3792  Index rows, Index depth, Index cols, bfloat16 alpha,
3793  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
3794 };
3795 
3796 template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
3797 void gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
3798  ::operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB,
3799  Index rows, Index depth, Index cols, bfloat16 alpha,
3800  Index strideA, Index strideB, Index offsetA, Index offsetB)
3801  {
3802  static void (*gemm_function)(const DataMapper&, const bfloat16*, const bfloat16*, Index, Index, Index, bfloat16, Index, Index, Index, Index) =
3803  #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
3804  (supportsMMA()) ?
3805  &Eigen::internal::gemmMMAbfloat16<DataMapper> :
3806  #endif
3807  &Eigen::internal::gemmbfloat16<DataMapper>;
3808  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
3809  }
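// bfloat16 products dispatch to gemmMMAbfloat16 (from MatrixProductMMA.h) when MMA is
// usable at runtime, and to the VSX gemmbfloat16 implementation defined earlier in this
// file otherwise.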
3810 } // end namespace internal
3811 
3812 } // end namespace Eigen
3813 
3814 #endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H