MatrixVectorProduct.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
11 #define EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
12 
13 #include "../../InternalHeaderCheck.h"
14 
15 #if defined(__MMA__) && !EIGEN_ALTIVEC_DISABLE_MMA
16 #if EIGEN_COMP_LLVM || (__GNUC__ > 10 || __GNUC_MINOR__ >= 3)
17 #define USE_GEMV_MMA
18 #endif
19 
20 #if !EIGEN_COMP_LLVM && (__GNUC__ < 11)
21 // Only allow one vector_pair in buggy gcc - gcc 10.x has a bug
22 #define GCC_ONE_VECTORPAIR_BUG
23 #endif
24 #endif
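// USE_GEMV_MMA selects the Power10 MMA (matrix-multiply-assist accumulator) kernels;
// it is only enabled for clang or for gcc releases new enough to generate them
// correctly, and gcc 10.x is further restricted to a single __vector_pair per kernel
// because of the code-generation bug noted above.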
25 
26 //#define USE_SLOWER_GEMV_MMA // MMA is currently not as fast as VSX in complex double GEMV (revisit when gcc is improved)
27 
28 //#define EIGEN_POWER_USE_GEMV_PREFETCH
29 #ifdef EIGEN_POWER_USE_GEMV_PREFETCH
30 #define EIGEN_POWER_GEMV_PREFETCH(p) prefetch(p)
31 #else
32 #define EIGEN_POWER_GEMV_PREFETCH(p)
33 #endif
34 
35 #ifdef __has_builtin
36 #if !__has_builtin(__builtin_vsx_assemble_pair)
37 #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
38 #endif
39 #if !__has_builtin(__builtin_vsx_disassemble_pair)
40 #define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
41 #endif
42 #endif
43 
44 #if EIGEN_COMP_LLVM
45 #define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
46  __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1)
47 #else
48 #if (__GNUC__ <= 10)
49 #if (__GNUC_MINOR__ > 3)
50 #define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
51  __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1)
52 #else
53 #define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
54  __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2)
55 #endif
56 #else
57 #define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
58  __builtin_vsx_build_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2)
59 #endif
60 #endif
61 
62 #define GEMV_IS_COMPLEX_COMPLEX ((sizeof(LhsPacket) == 16) && (sizeof(RhsPacket) == 16))
63 #define GEMV_IS_FLOAT (ResPacketSize == (16 / sizeof(float)))
64 #define GEMV_IS_SCALAR (sizeof(ResPacket) != 16)
65 #define GEMV_IS_COMPLEX_FLOAT (ResPacketSize == (16 / sizeof(std::complex<float>)))
66 
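// Scale a partial sum by alpha and accumulate it into the result in memory; the
// packet overload uses unaligned loads/stores, the scalar overload a plain multiply-add.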
68 template<typename ResPacket, typename ResScalar>
69 EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResPacket& palpha, ResPacket& data)
70 {
71  pstoreu(res, pmadd(data, palpha, ploadu<ResPacket>(res)));
72 }
73 
74 template<typename ResScalar>
75 EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResScalar& alpha, ResScalar& data)
76 {
77  *res += (alpha * data);
78 }
79 
80 #define GEMV_UNROLL(func, N) \
81  func(0, N) func(1, N) func(2, N) func(3, N) \
82  func(4, N) func(5, N) func(6, N) func(7, N)
83 
84 #define GEMV_UNROLL_HALF(func, N) \
85  func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
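// The GEMV_UNROLL* helpers expand a statement macro for iterations 0..7 (or for
// pairs of iterations); e.g. GEMV_UNROLL(GEMV_INIT, 2) expands to
//   if (2 > 0) { c0 = pset1<ResPacket>(ResScalar(0)); } else { EIGEN_UNUSED_VARIABLE(c0); }
//   ... and so on up to c7,
// so only the iterations with a statically true guard survive optimization.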
86 
87 #define GEMV_GETN(N) (((N) * ResPacketSize) >> 2)
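// GEMV_GETN(N) rescales the unroll count by the result packet width, i.e. it
// evaluates to (N * ResPacketSize) / 4 (N for float packets, N / 2 for double), and
// is used below to decide how many of the unrolled accumulators and loads are live.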
88 
89 #define GEMV_LOADPACKET_COL(iter) \
90  lhs.template load<LhsPacket, LhsAlignment>(i + ((iter) * LhsPacketSize), j)
91 
92 #ifdef USE_GEMV_MMA
93 #define GEMV_UNROLL3(func, N, which) \
94  func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \
95  func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which)
96 
97 #define GEMV_UNUSED_VAR(iter, N, which) \
98  if (GEMV_GETN(N) <= iter) { \
99  EIGEN_UNUSED_VARIABLE(which##iter); \
100  }
101 
102 #define GEMV_UNUSED_EXTRA_VAR(iter, N, which) \
103  if (N <= iter) { \
104  EIGEN_UNUSED_VARIABLE(which##iter); \
105  }
106 
107 #define GEMV_UNUSED_EXTRA(N, which) \
108  GEMV_UNROLL3(GEMV_UNUSED_EXTRA_VAR, N, which)
109 
110 #define GEMV_UNUSED(N, which) \
111  GEMV_UNROLL3(GEMV_UNUSED_VAR, N, which)
112 
113 #define GEMV_INIT_MMA(iter, N) \
114  if (GEMV_GETN(N) > iter) { \
115  __builtin_mma_xxsetaccz(&e##iter); \
116  }
117 
118 #if EIGEN_COMP_LLVM
119 #define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \
120  GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_COL(iter2), GEMV_LOADPACKET_COL((iter2) + 1));
121 #else
122 #define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \
123  const LhsScalar& src##iter1 = lhs(i + ((iter1 * 32) / sizeof(LhsScalar)), j); \
124  b##iter1 = *reinterpret_cast<__vector_pair *>(const_cast<LhsScalar *>(&src##iter1));
125 #endif
126 
127 #define GEMV_LOAD1A_COL_MMA(iter, N) \
128  if (GEMV_GETN(N) > iter) { \
129  if (GEMV_IS_FLOAT) { \
130  g##iter = GEMV_LOADPACKET_COL(iter); \
131  EIGEN_UNUSED_VARIABLE(b##iter); \
132  } else { \
133  GEMV_LOADPAIR_COL_MMA(iter, iter << 1) \
134  EIGEN_UNUSED_VARIABLE(g##iter); \
135  } \
136  } else { \
137  EIGEN_UNUSED_VARIABLE(b##iter); \
138  EIGEN_UNUSED_VARIABLE(g##iter); \
139  }
140 
141 #define GEMV_WORK1A_COL_MMA(iter, N) \
142  if (GEMV_GETN(N) > iter) { \
143  if (GEMV_IS_FLOAT) { \
144  pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter, a0, g##iter); \
145  } else { \
146  pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter, b##iter, a0); \
147  } \
148  }
149 
150 #define GEMV_LOAD1B_COL_MMA(iter1, iter2, iter3, N) \
151  if (GEMV_GETN(N) > iter1) { \
152  if (GEMV_IS_FLOAT) { \
153  GEMV_LOADPAIR_COL_MMA(iter2, iter2) \
154  EIGEN_UNUSED_VARIABLE(b##iter3); \
155  } else { \
156  GEMV_LOADPAIR_COL_MMA(iter2, iter2 << 1) \
157  GEMV_LOADPAIR_COL_MMA(iter3, iter3 << 1) \
158  } \
159  } else { \
160  EIGEN_UNUSED_VARIABLE(b##iter2); \
161  EIGEN_UNUSED_VARIABLE(b##iter3); \
162  } \
163  EIGEN_UNUSED_VARIABLE(g##iter2); \
164  EIGEN_UNUSED_VARIABLE(g##iter3);
165 
166 #define GEMV_WORK1B_COL_MMA(iter1, iter2, iter3, N) \
167  if (GEMV_GETN(N) > iter1) { \
168  if (GEMV_IS_FLOAT) { \
169  LhsPacket h[2]; \
170  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(h), &b##iter2); \
171  pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, a0, h[0]); \
172  pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, a0, h[1]); \
173  } else { \
174  pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, b##iter2, a0); \
175  pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, b##iter3, a0); \
176  } \
177  }
178 
179 #if EIGEN_COMP_LLVM
180 #define GEMV_LOAD_COL_MMA(N) \
181  if (GEMV_GETN(N) > 1) { \
182  GEMV_UNROLL_HALF(GEMV_LOAD1B_COL_MMA, (N >> 1)) \
183  } else { \
184  GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N) \
185  }
186 
187 #define GEMV_WORK_COL_MMA(N) \
188  if (GEMV_GETN(N) > 1) { \
189  GEMV_UNROLL_HALF(GEMV_WORK1B_COL_MMA, (N >> 1)) \
190  } else { \
191  GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N) \
192  }
193 #else
194 #define GEMV_LOAD_COL_MMA(N) \
195  GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N)
196 
197 #define GEMV_WORK_COL_MMA(N) \
198  GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N)
199 #endif
200 
201 #define GEMV_DISASSEMBLE_MMA(iter, N) \
202  if (GEMV_GETN(N) > iter) { \
203  __builtin_mma_disassemble_acc(&result##iter.packet, &e##iter); \
204  if (!GEMV_IS_FLOAT) { \
205  result##iter.packet[0][1] = result##iter.packet[1][0]; \
206  result##iter.packet[2][1] = result##iter.packet[3][0]; \
207  } \
208  }
209 
210 #define GEMV_LOADPAIR2_COL_MMA(iter1, iter2) \
211  b##iter1 = *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize));
212 
213 #define GEMV_LOAD2_COL_MMA(iter1, iter2, iter3, N) \
214  if (GEMV_GETN(N) > iter1) { \
215  if (GEMV_IS_FLOAT) { \
216  GEMV_LOADPAIR2_COL_MMA(iter2, iter2); \
217  EIGEN_UNUSED_VARIABLE(b##iter3); \
218  } else { \
219  GEMV_LOADPAIR2_COL_MMA(iter2, iter2 << 1); \
220  GEMV_LOADPAIR2_COL_MMA(iter3, iter3 << 1); \
221  } \
222  } else { \
223  EIGEN_UNUSED_VARIABLE(b##iter2); \
224  EIGEN_UNUSED_VARIABLE(b##iter3); \
225  }
226 
227 #if EIGEN_COMP_LLVM
228 #define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
229  ResPacket f##iter2[2]; \
230  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(f##iter2), &b##iter2); \
231  f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]); \
232  f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 2 : 0], palpha, f##iter2[1]); \
233  GEMV_BUILDPAIR_MMA(b##iter2, f##iter2[0], f##iter2[1]);
234 #else
235 #define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
236  if (GEMV_IS_FLOAT) { \
237  __asm__ ("xvmaddasp %0,%x1,%x3\n\txvmaddasp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter3.packet[0]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \
238  } else { \
239  __asm__ ("xvmaddadp %0,%x1,%x3\n\txvmaddadp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter2.packet[2]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \
240  }
241 #endif
242 
243 #define GEMV_WORK2_COL_MMA(iter1, iter2, iter3, N) \
244  if (GEMV_GETN(N) > iter1) { \
245  if (GEMV_IS_FLOAT) { \
246  GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter2); \
247  } else { \
248  GEMV_WORKPAIR2_COL_MMA(iter2, iter2, iter2 << 1); \
249  GEMV_WORKPAIR2_COL_MMA(iter3, iter3, iter3 << 1); \
250  } \
251  }
252 
253 #define GEMV_STOREPAIR2_COL_MMA(iter1, iter2) \
254  *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize)) = b##iter1;
255 
256 #define GEMV_STORE_COL_MMA(iter, N) \
257  if (GEMV_GETN(N) > iter) { \
258  if (GEMV_IS_FLOAT) { \
259  storeMaddData<ResPacket, ResScalar>(res + i + (iter * ResPacketSize), palpha, result##iter.packet[0]); \
260  } else { \
261  GEMV_LOADPAIR2_COL_MMA(iter, iter << 1) \
262  GEMV_WORKPAIR2_COL_MMA(iter, iter, iter << 1) \
263  GEMV_STOREPAIR2_COL_MMA(iter, iter << 1) \
264  } \
265  }
266 
267 #define GEMV_STORE2_COL_MMA(iter1, iter2, iter3, N) \
268  if (GEMV_GETN(N) > iter1) { \
269  if (GEMV_IS_FLOAT) { \
270  GEMV_STOREPAIR2_COL_MMA(iter2, iter2); \
271  } else { \
272  GEMV_STOREPAIR2_COL_MMA(iter2, iter2 << 1) \
273  GEMV_STOREPAIR2_COL_MMA(iter3, iter3 << 1) \
274  } \
275  }
276 
277 #define GEMV_PROCESS_COL_ONE_MMA(N) \
278  GEMV_UNROLL(GEMV_INIT_MMA, N) \
279  Index j = j2; \
280  __vector_pair b0, b1, b2, b3, b4, b5, b6, b7; \
281  do { \
282  LhsPacket g0, g1, g2, g3, g4, g5, g6, g7; \
283  RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0)); \
284  GEMV_UNROLL(GEMV_PREFETCH, N) \
285  GEMV_LOAD_COL_MMA(N) \
286  GEMV_WORK_COL_MMA(N) \
287  } while (++j < jend); \
288  GEMV_UNROLL(GEMV_DISASSEMBLE_MMA, N) \
289  if (GEMV_GETN(N) <= 1) { \
290  GEMV_UNROLL(GEMV_STORE_COL_MMA, N) \
291  } else { \
292  GEMV_UNROLL_HALF(GEMV_LOAD2_COL_MMA, (N >> 1)) \
293  GEMV_UNROLL_HALF(GEMV_WORK2_COL_MMA, (N >> 1)) \
294  GEMV_UNROLL_HALF(GEMV_STORE2_COL_MMA, (N >> 1)) \
295  } \
296  i += (ResPacketSize * N);
297 #endif
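// Summary of the MMA column kernel above (GEMV_PROCESS_COL_ONE_MMA): zero up to
// eight __vector_quad accumulators, then for every column j of the block broadcast
// rhs(j, 0) and feed N result-packets worth of lhs into the accumulators (plain
// vectors for float, __vector_pair halves for double); finally disassemble the
// accumulators, scale by alpha and write the results back through vector pairs
// where possible.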
298 
299 #define GEMV_INIT(iter, N) \
300  if (N > iter) { \
301  c##iter = pset1<ResPacket>(ResScalar(0)); \
302  } else { \
303  EIGEN_UNUSED_VARIABLE(c##iter); \
304  }
305 
306 #ifdef EIGEN_POWER_USE_GEMV_PREFETCH
307 #define GEMV_PREFETCH(iter, N) \
308  if (GEMV_GETN(N) > ((iter >> 1) + ((N >> 1) * (iter & 1)))) { \
309  lhs.prefetch(i + (iter * LhsPacketSize) + prefetch_dist, j); \
310  }
311 #else
312 #define GEMV_PREFETCH(iter, N)
313 #endif
314 
315 #define GEMV_WORK_COL(iter, N) \
316  if (N > iter) { \
317  c##iter = pcj.pmadd(GEMV_LOADPACKET_COL(iter), a0, c##iter); \
318  }
319 
320 #define GEMV_STORE_COL(iter, N) \
321  if (N > iter) { \
322  pstoreu(res + i + (iter * ResPacketSize), pmadd(c##iter, palpha, ploadu<ResPacket>(res + i + (iter * ResPacketSize)))); \
323  }
324 
326 #define GEMV_PROCESS_COL_ONE(N) \
327  GEMV_UNROLL(GEMV_INIT, N) \
328  Index j = j2; \
329  do { \
330  RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0)); \
331  GEMV_UNROLL(GEMV_PREFETCH, N) \
332  GEMV_UNROLL(GEMV_WORK_COL, N) \
333  } while (++j < jend); \
334  GEMV_UNROLL(GEMV_STORE_COL, N) \
335  i += (ResPacketSize * N);
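// GEMV_PROCESS_COL_ONE(N) is the plain VSX version of the same kernel: N packet
// accumulators are zeroed, every column adds pcj.pmadd(lhs packet, broadcast rhs,
// accumulator), and the accumulators are scaled by alpha and added into res at the end.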
336 
337 #ifdef USE_GEMV_MMA
338 #define GEMV_PROCESS_COL(N) \
339  GEMV_PROCESS_COL_ONE_MMA(N)
340 #else
341 #define GEMV_PROCESS_COL(N) \
342  GEMV_PROCESS_COL_ONE(N)
343 #endif
344 
346 #ifdef USE_GEMV_MMA
347 template<typename LhsPacket, typename RhsPacket, bool accumulate>
348 EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b)
349 {
350  if (accumulate)
351  {
352  __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
353  }
354  else
355  {
356  __builtin_mma_xvf32ger(acc, (__vector unsigned char)a, (__vector unsigned char)b);
357  }
358 }
359 
361 template<typename LhsPacket, typename RhsPacket, bool accumulate>
362 EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, __vector_pair& a, const LhsPacket& b)
363 {
364  if (accumulate)
365  {
366  __builtin_mma_xvf64gerpp(acc, a, (__vector unsigned char)b);
367  }
368  else
369  {
370  __builtin_mma_xvf64ger(acc, a, (__vector unsigned char)b);
371  }
372 }
373 #endif
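// Note on the builtins used above: __builtin_mma_xvf32ger/xvf32gerpp compute a rank-1
// (outer-product) update of a 512-bit accumulator from two 4 x float32 vectors, while
// the xvf64ger/xvf64gerpp forms take a __vector_pair (four doubles) and a 2 x float64
// vector; the `accumulate` flag picks between the initializing and accumulating forms.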
374 
375 template<typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
376 EIGEN_STRONG_INLINE void gemv_col(
377  Index rows, Index cols,
378  const LhsMapper& alhs,
379  const RhsMapper& rhs,
380  ResScalar* res, Index resIncr,
381  ResScalar alpha)
382 {
383  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
384 
385  typedef typename Traits::LhsPacket LhsPacket;
386  typedef typename Traits::RhsPacket RhsPacket;
387  typedef typename Traits::ResPacket ResPacket;
388 
389  EIGEN_UNUSED_VARIABLE(resIncr);
390  eigen_internal_assert(resIncr == 1);
391 
392  // The following copy tells the compiler that lhs's attributes are not modified outside this function
393  // This helps GCC to generate proper code.
394  LhsMapper lhs(alhs);
395  RhsMapper rhs2(rhs);
396 
397  conj_helper<LhsScalar, RhsScalar, false, false> cj;
398  conj_helper<LhsPacket, RhsPacket, false, false> pcj;
399 
400  const Index lhsStride = lhs.stride();
401  // TODO: for padded aligned inputs, we could enable aligned reads
402  enum {
403  LhsAlignment = Unaligned,
404  ResPacketSize = Traits::ResPacketSize,
405  LhsPacketSize = Traits::LhsPacketSize,
406  RhsPacketSize = Traits::RhsPacketSize,
407  };
408 
409 #ifndef GCC_ONE_VECTORPAIR_BUG
410  const Index n8 = rows - 8 * ResPacketSize + 1;
411  const Index n4 = rows - 4 * ResPacketSize + 1;
412  const Index n2 = rows - 2 * ResPacketSize + 1;
413 #endif
414  const Index n1 = rows - 1 * ResPacketSize + 1;
415 #ifdef EIGEN_POWER_USE_GEMV_PREFETCH
416  const Index prefetch_dist = 64 * LhsPacketSize;
417 #endif
418 
419  // TODO: improve the following heuristic:
420  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
421  ResPacket palpha = pset1<ResPacket>(alpha);
422 
423  for (Index j2 = 0; j2 < cols; j2 += block_cols)
424  {
425  Index jend = numext::mini(j2 + block_cols, cols);
426  Index i = 0;
427  ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
428 #ifdef USE_GEMV_MMA
429  __vector_quad e0, e1, e2, e3, e4, e5, e6, e7;
430  PacketBlock<ResPacket, 4> result0, result1, result2, result3, result4, result5, result6, result7;
431  GEMV_UNUSED(8, e)
432  GEMV_UNUSED(8, result)
433  GEMV_UNUSED_EXTRA(1, c)
434 #endif
#ifndef GCC_ONE_VECTORPAIR_BUG
  while (i < n8)
  {
    GEMV_PROCESS_COL(8)
  }
  if (i < n4)
  {
    GEMV_PROCESS_COL(4)
  }
  if (i < n2)
  {
    GEMV_PROCESS_COL(2)
  }
  if (i < n1)
#else
  while (i < n1)
#endif
  {
    GEMV_PROCESS_COL(1)
  }
455  for (;i < rows;++i)
456  {
457  ResScalar d0(0);
458  Index j = j2;
459  do {
460  d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
461  } while (++j < jend);
462  res[i] += alpha * d0;
463  }
464  }
465 }
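// gemv_col walks the matrix in blocks of block_cols columns (a rough cache heuristic
// based on the lhs stride), handles 8/4/2/1 result packets per row strip, and finishes
// the remaining rows with a scalar loop, computing res += alpha * A * rhs.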
466 
467 template<bool extraRows>
468 EIGEN_ALWAYS_INLINE void outputVecCol(Packet4f acc, float *result, Packet4f pAlpha, Index extra_rows)
469 {
470  Packet4f d0 = ploadu<Packet4f>(result);
471  d0 = pmadd(acc, pAlpha, d0);
472  if (extraRows) {
473  pstoreu_partial(result, d0, extra_rows);
474  } else {
475  pstoreu(result, d0);
476  }
477 }
478 
479 template<Index num_acc, bool extraRows, Index size>
480 EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha, Index extra_rows)
481 {
482  constexpr Index real_acc = (num_acc - (extraRows ? 1 : 0));
483  for(Index k = 0; k < real_acc; k++) {
484  outputVecCol<false>(acc[k][0], result + k*4, pAlpha, extra_rows);
485  }
486  if (extraRows) {
487  outputVecCol<true>(acc[real_acc][0], result + real_acc*4, pAlpha, extra_rows);
488  }
489 }
490 
491 static Packet16uc p16uc_MERGE16_32_V1 = { 0, 1, 16,17, 0, 1, 16,17, 0, 1, 16,17, 0, 1, 16,17 };
492 static Packet16uc p16uc_MERGE16_32_V2 = { 2, 3, 18,19, 2, 3, 18,19, 2, 3, 18,19, 2, 3, 18,19 };
493 
494 template<Index num_acc, typename LhsMapper, bool zero>
495 EIGEN_ALWAYS_INLINE void loadVecLoopVSX(Index k, LhsMapper& lhs, Packet4f (&a0)[num_acc][2])
496 {
497  Packet8bf c0 = lhs.template loadPacket<Packet8bf>(k*4, 0);
498  Packet8bf b1;
499  if (!zero) {
500  b1 = lhs.template loadPacket<Packet8bf>(k*4, 1);
501 
502  a0[k + 0][1] = oneConvertBF16Hi(b1.m_val);
503  }
504  a0[k + 0][0] = oneConvertBF16Hi(c0.m_val);
505 
506  if (num_acc > (k + 1)) {
507  a0[k + 1][0] = oneConvertBF16Lo(c0.m_val);
508  if (!zero) {
509  a0[k + 1][1] = oneConvertBF16Lo(b1.m_val);
510  }
511  }
512 }
513 
514 template<Index num_acc, bool zero>
515 EIGEN_ALWAYS_INLINE void multVecVSX(Packet4f (&acc)[num_acc][2], Packet4f (&a0)[num_acc][2], Packet4f (&b0)[2])
516 {
517  for(Index k = 0; k < num_acc; k++) {
518  for(Index i = 0; i < (zero ? 1 : 2); i++) {
519  acc[k][i] = pmadd(b0[i], a0[k][i], acc[k][i]);
520  }
521  }
522 }
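// bfloat16 handling: the bf16 operands are widened to float on the fly
// (oneConvertBF16Hi/Lo expand the two halves of a Packet8bf into Packet4f), all
// accumulation is done in float, and the result is converted back to bfloat16 only
// once at the end via convertArrayPointerF32toBF16VSX.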
523 
template<typename RhsMapper, bool linear>
struct loadColData_impl
{
527  // linear == false
528  static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j)
529  {
530  return pgather<bfloat16, Packet8bf>(&rhs(j + 0, 0), rhs.stride());
531  }
532 };
533 
534 template<typename RhsMapper>
535 struct loadColData_impl<RhsMapper, true>
536 {
537  // linear == true
538  static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j)
539  {
540  return rhs.template loadPacket<Packet8bf>(j + 0);
541  }
542 };
543 
template<typename RhsMapper, bool linear>
EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper& rhs, Index j)
{
  return loadColData_impl<RhsMapper, linear>::run(rhs, j);
}
549 
550 template<Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
551 EIGEN_ALWAYS_INLINE void vecColLoopVSX(Index j, LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2])
552 {
553  Packet4f a0[num_acc][2], b0[2];
554  Packet8bf b2 = loadColData<RhsMapper, linear>(rhs, j);
555 
556  b0[0] = oneConvertBF16Perm(b2.m_val, p16uc_MERGE16_32_V1);
557  if (!zero) {
558  b0[1] = oneConvertBF16Perm(b2.m_val, p16uc_MERGE16_32_V2);
559  }
560 
561  LhsMapper lhs2 = lhs.getSubMapper(0, j);
562  for(Index k = 0; k < num_acc; k += 2) {
563  loadVecLoopVSX<num_acc, LhsMapper, zero>(k, lhs2, a0);
564  }
565 
566  multVecVSX<num_acc, zero>(acc, a0, b0);
567 }
568 
template<Index num_acc>
EIGEN_ALWAYS_INLINE void addResultsVSX(Packet4f (&acc)[num_acc][2])
{
572  for(Index i = 0; i < num_acc; i++) {
573  acc[i][0] = acc[i][0] + acc[i][1];
574  }
575 }
576 
577 // Uses 2X the accumulators or 4X the number of VSX registers
578 #define MAX_BFLOAT16_VEC_ACC_VSX 8
579 
580 template<const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
581 void colVSXVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
582 {
583  constexpr Index step = (num_acc * 4);
584  const Index extra_rows = (extraRows) ? (rows & 3) : 0;
585  constexpr bool multiIters = !extraRows && (num_acc == MAX_BFLOAT16_VEC_ACC_VSX);
586 
587  do{
588  Packet4f acc[num_acc][2];
589 
590  zeroAccumulators<num_acc, 2>(acc);
591 
592  LhsMapper lhs2 = lhs.getSubMapper(row, 0);
593  for(Index j = 0; j + 2 <= cend; j += 2) {
594  vecColLoopVSX<num_acc, LhsMapper, RhsMapper, false, linear>(j, lhs2, rhs, acc);
595  }
596  if (cend & 1) {
597  vecColLoopVSX<num_acc, LhsMapper, RhsMapper, true, linear>(cend - 1, lhs2, rhs, acc);
598  }
599 
600  addResultsVSX<num_acc>(acc);
601 
602  outputVecColResults<num_acc, extraRows, 2>(acc, result, pAlpha, extra_rows);
603 
604  result += step;
605  } while(multiIters && (step <= rows - (row += step)));
606 }
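// Each instantiation above keeps num_acc accumulators (4 rows each) live across the
// column loop; columns are consumed two at a time, with a single-column (zero) pass
// for an odd trailing column, and extraRows switches the last accumulator to partial
// stores when the row count is not a multiple of 4.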
607 
608 template<const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
609 EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
610 {
611  if (MAX_BFLOAT16_VEC_ACC_VSX > num_acc) {
612  colVSXVecColLoopBody<num_acc + (extraRows ? 1 : 0), LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
613  }
614 }
615 
616 template<typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
617 EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
618 {
619  switch ((rows - row) >> 2) {
620  case 7:
621  colVSXVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
622  break;
623  case 6:
624  colVSXVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
625  break;
626  case 5:
627  colVSXVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
628  break;
629  case 4:
630  colVSXVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
631  break;
632  case 3:
633  colVSXVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
634  break;
635  case 2:
636  colVSXVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
637  break;
638  case 1:
639  colVSXVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
640  break;
641  default:
642  if (extraRows) {
643  colVSXVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
644  }
645  break;
646  }
647 }
648 
649 template<typename LhsMapper, typename RhsMapper, bool linear>
650 EIGEN_ALWAYS_INLINE void calcVSXVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
651 {
652  Index row = 0;
653  if (rows >= (MAX_BFLOAT16_VEC_ACC_VSX * 4)) {
654  colVSXVecColLoopBody<MAX_BFLOAT16_VEC_ACC_VSX, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
655  result += row;
656  }
657  if (rows & 3) {
658  colVSXVecColLoopBodyExtra<LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
659  } else {
660  colVSXVecColLoopBodyExtra<LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
661  }
662 }
663 
template<const Index size, bool inc, Index delta>
EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16* dst, Packet8bf data, Index resInc, Index extra = 0)
{
667  if (inc) {
668  if (size < 8) {
669  pscatter_partial(dst + delta*resInc, data, resInc, extra);
670  } else {
671  pscatter(dst + delta*resInc, data, resInc);
672  }
673  } else {
674  if (size < 8) {
675  pstoreu_partial(dst + delta, data, extra);
676  } else {
677  pstoreu(dst + delta, data);
678  }
679  }
680 }
681 
682 template<const Index size, bool inc = false>
683 EIGEN_ALWAYS_INLINE void convertPointerF32toBF16VSX(Index& i, float* result, Index rows, bfloat16*& dst, Index resInc = 1)
684 {
685  constexpr Index extra = ((size < 8) ? 8 : size);
686  while (i + size <= rows) {
687  PacketBlock<Packet8bf,(size+7)/8> r32;
688  r32.packet[0] = convertF32toBF16VSX(result + i + 0);
689  if (size >= 16) {
690  r32.packet[1] = convertF32toBF16VSX(result + i + 8);
691  }
692  if (size >= 32) {
693  r32.packet[2] = convertF32toBF16VSX(result + i + 16);
694  r32.packet[3] = convertF32toBF16VSX(result + i + 24);
695  }
696  storeBF16fromResult<size, inc, 0>(dst, r32.packet[0], resInc, rows & 7);
697  if (size >= 16) {
698  storeBF16fromResult<size, inc, 8>(dst, r32.packet[1], resInc);
699  }
700  if (size >= 32) {
701  storeBF16fromResult<size, inc, 16>(dst, r32.packet[2], resInc);
702  storeBF16fromResult<size, inc, 24>(dst, r32.packet[3], resInc);
703  }
704  i += extra; dst += extra*resInc;
705  if (size != 32) break;
706  }
707 }
708 
709 template<bool inc = false>
710 EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16VSX(float *result, Index rows, bfloat16* dst, Index resInc = 1)
711 {
712  Index i = 0;
713  convertPointerF32toBF16VSX<32,inc>(i, result, rows, dst, resInc);
714  convertPointerF32toBF16VSX<16,inc>(i, result, rows, dst, resInc);
715  convertPointerF32toBF16VSX<8,inc>(i, result, rows, dst, resInc);
716  convertPointerF32toBF16VSX<1,inc>(i, result, rows, dst, resInc);
717 }
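// The float -> bfloat16 write-back is staged by width: full 32/16/8-element chunks
// use whole-packet stores (or scatters for strided output), and the remainder goes
// through a masked partial store.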
718 
template<typename LhsMapper, typename RhsMapper>
EIGEN_STRONG_INLINE void gemv_bfloat16_col(
  Index rows, Index cols,
722  const LhsMapper& alhs,
723  const RhsMapper& rhs,
724  bfloat16* res, Index resIncr,
725  bfloat16 alpha)
726 {
727  typedef typename RhsMapper::LinearMapper LinearMapper;
728 
729  EIGEN_UNUSED_VARIABLE(resIncr);
730  eigen_internal_assert(resIncr == 1);
731 
732  // The following copy tells the compiler that lhs's attributes are not modified outside this function
733  // This helps GCC to generate proper code.
734  LhsMapper lhs(alhs);
735  RhsMapper rhs2(rhs);
736 
737  const Index lhsStride = lhs.stride();
738 
739  // TODO: improve the following heuristic:
740  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(bfloat16) < 16000 ? 16 : 8);
741  float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
742  Packet4f pAlpha = pset1<Packet4f>(falpha);

  ei_declare_aligned_stack_constructed_variable(float, result, rows, 0);

  convertArrayPointerBF16toF32(result, 1, rows, res);

748  for (Index j2 = 0; j2 < cols; j2 += block_cols)
749  {
750  Index jend = numext::mini(j2 + block_cols, cols);
751 
752  LhsMapper lhs2 = lhs.getSubMapper(0, j2);
753  if (rhs.stride() == 1) {
754  LinearMapper rhs3 = rhs2.getLinearMapper(j2, 0);
755  calcVSXVecColLoops<LhsMapper, LinearMapper, true>(jend - j2, rows, lhs2, rhs3, pAlpha, result);
756  } else {
757  RhsMapper rhs3 = rhs2.getSubMapper(j2, 0);
758  calcVSXVecColLoops<LhsMapper, RhsMapper, false>(jend - j2, rows, lhs2, rhs3, pAlpha, result);
759  }
760  }

  convertArrayPointerF32toBF16VSX(result, rows, res);
}
764 
765 template<Index num_acc, Index size>
766 EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha)
767 {
768  constexpr Index extra = num_acc & 3;
769 
770  for(Index k = 0; k < num_acc; k += 4) {
771  Packet4f d0 = ploadu<Packet4f>(result + k);
772  d0 = pmadd(acc[k + 0][0], pAlpha, d0);
773 
774  if (num_acc > (k + 3)) {
775  pstoreu(result + k, d0);
776  } else {
777  if (extra == 3) {
778  pstoreu_partial(result + k, d0, extra);
779  } else {
780  memcpy((void *)(result + k), (void *)(&d0), sizeof(float) * extra);
781  }
782  }
783  }
784 }
785 
template<Index num_acc>
EIGEN_ALWAYS_INLINE void preduxVecResults2VSX(Packet4f (&acc)[num_acc][2], Index k)
{
789  if (num_acc > (k + 1)) {
790  acc[k][1] = vec_mergel(acc[k + 0][0], acc[k + 1][0]);
791  acc[k][0] = vec_mergeh(acc[k + 0][0], acc[k + 1][0]);
792  acc[k][0] = acc[k][0] + acc[k][1];
793  acc[k][0] += vec_sld(acc[k][0], acc[k][0], 8);
794  } else {
795  acc[k][0] += vec_sld(acc[k][0], acc[k][0], 8);
796 #ifdef _BIG_ENDIAN
797  acc[k][0] += vec_sld(acc[k][0], acc[k][0], 12);
798 #else
799  acc[k][0] += vec_sld(acc[k][0], acc[k][0], 4);
800 #endif
801  }
802 }
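// preduxVecResults2VSX horizontally reduces the 4-wide accumulators of two adjacent
// rows (merge, add, then shifted adds), leaving each row's dot product replicated in
// its lanes; preduxVecResultsVSX below then packs four consecutive row results into
// a single Packet4f for the final alpha-scaled store.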
803 
template<Index num_acc>
EIGEN_ALWAYS_INLINE void preduxVecResultsVSX(Packet4f (&acc)[num_acc][2])
{
807  for(Index k = 0; k < num_acc; k += 4) {
808  preduxVecResults2VSX<num_acc>(acc, k + 0);
809  if (num_acc > (k + 2)) {
810  preduxVecResults2VSX<num_acc>(acc, k + 2);
811 #ifdef EIGEN_VECTORIZE_VSX
812  acc[k + 0][0] = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
813 #else
814  acc[k + 0][0] = reinterpret_cast<Packet4f>(vec_perm(acc[k + 0][0],acc[k + 2][0],p16uc_TRANSPOSE64_HI));
815 #endif
816  }
817  }
818 }
819 
#ifndef _ARCH_PWR9
EIGEN_ALWAYS_INLINE Packet8us loadPacketPartialZero(Packet8us data, Index extra_cols)
{
823  Packet16uc shift = pset1<Packet16uc>(8 * 2 * (8 - extra_cols));
824 #ifdef _BIG_ENDIAN
825  return reinterpret_cast<Packet8us>(vec_slo(vec_sro(reinterpret_cast<Packet16uc>(data), shift), shift));
826 #else
827  return reinterpret_cast<Packet8us>(vec_sro(vec_slo(reinterpret_cast<Packet16uc>(data), shift), shift));
828 #endif
829 }
830 #endif
831 
832 template<Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
833 EIGEN_ALWAYS_INLINE void multVSXVecLoop(Packet4f (&acc)[num_acc][2], const LhsMapper& lhs, RhsMapper& rhs, Index j, Index extra_cols)
834 {
835  Packet4f a0[num_acc][2], b0[2];
836  Packet8bf a1, b1;
837 
838  if (extra) {
839  b1 = rhs.template loadPacketPartial<Packet8bf>(j, extra_cols);
840 #ifndef _ARCH_PWR9
841  b1 = loadPacketPartialZero(b1.m_val, extra_cols);
842 #endif
843  } else {
844  b1 = rhs.template loadPacket<Packet8bf>(j);
845  }
846  b0[0] = oneConvertBF16Hi(b1.m_val);
847  b0[1] = oneConvertBF16Lo(b1.m_val);
848 
849  const LhsMapper lhs2 = lhs.getSubMapper(0, j);
850  for(Index k = 0; k < num_acc; k++) {
851  if (extra) {
852  a1 = lhs2.template loadPacketPartial<Packet8bf>(k, 0, extra_cols);
853 #ifndef _ARCH_PWR9
854  a1 = loadPacketPartialZero(a1.m_val, extra_cols);
855 #endif
856  } else {
857  a1 = lhs2.template loadPacket<Packet8bf>(k, 0);
858  }
859  a0[k][0] = oneConvertBF16Hi(a1.m_val);
860  a0[k][1] = oneConvertBF16Lo(a1.m_val);
861  }
862 
863  multVecVSX<num_acc, false>(acc, a0, b0);
864 }
865 
866 template<Index num_acc, typename LhsMapper, typename RhsMapper>
867 EIGEN_ALWAYS_INLINE void vecVSXLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2], Index extra_cols)
868 {
869  Index j = 0;
870  for(; j + 8 <= cols; j += 8){
871  multVSXVecLoop<num_acc, LhsMapper, RhsMapper, false>(acc, lhs, rhs, j, extra_cols);
872  }
873 
874  if (extra_cols) {
875  multVSXVecLoop<num_acc, LhsMapper, RhsMapper, true>(acc, lhs, rhs, j, extra_cols);
876  }
877 }
878 
879 template<const Index num_acc, typename LhsMapper, typename RhsMapper>
880 void colVSXVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
881 {
882  constexpr bool multiIters = (num_acc == MAX_BFLOAT16_VEC_ACC_VSX);
883  const Index extra_cols = (cols & 7);
884 
885  do{
886  Packet4f acc[num_acc][2];
887 
888  zeroAccumulators<num_acc, 2>(acc);
889 
890  const LhsMapper lhs2 = lhs.getSubMapper(row, 0);
891  vecVSXLoop<num_acc, LhsMapper, RhsMapper>(cols, lhs2, rhs, acc, extra_cols);
892 
893  addResultsVSX<num_acc>(acc);
894 
895  preduxVecResultsVSX<num_acc>(acc);
896 
897  outputVecResults<num_acc, 2>(acc, result, pAlpha);
898 
899  result += num_acc;
900  } while(multiIters && (num_acc <= rows - (row += num_acc)));
901 }
902 
903 template<const Index num_acc, typename LhsMapper, typename RhsMapper>
904 EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
905 {
906  if (MAX_BFLOAT16_VEC_ACC_VSX > num_acc) {
907  colVSXVecLoopBody<num_acc, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
908  }
909 }
910 
911 template<typename LhsMapper, typename RhsMapper>
912 EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
913 {
914  switch (rows - row) {
915  case 7:
916  colVSXVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
917  break;
918  case 6:
919  colVSXVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
920  break;
921  case 5:
922  colVSXVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
923  break;
924  case 4:
925  colVSXVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
926  break;
927  case 3:
928  colVSXVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
929  break;
930  case 2:
931  colVSXVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
932  break;
933  case 1:
934  colVSXVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
935  break;
936  }
937 }
938 
939 template<typename LhsMapper, typename RhsMapper>
940 EIGEN_ALWAYS_INLINE void calcVSXVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
941 {
  Index row = 0;
  if (rows >= MAX_BFLOAT16_VEC_ACC_VSX) {
    colVSXVecLoopBody<MAX_BFLOAT16_VEC_ACC_VSX, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
    result += row;
  }
947  colVSXVecLoopBodyExtra<LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
948 }
949 
950 template<typename LhsMapper, typename RhsMapper>
951 EIGEN_STRONG_INLINE void gemv_bfloat16_row(
952  Index rows, Index cols,
953  const LhsMapper& alhs,
954  const RhsMapper& rhs,
955  bfloat16* res, Index resIncr,
956  bfloat16 alpha)
957 {
958  typedef typename RhsMapper::LinearMapper LinearMapper;
959 
960  // The following copy tells the compiler that lhs's attributes are not modified outside this function
961  // This helps GCC to generate proper code.
962  LhsMapper lhs(alhs);
963  LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
964 
965  eigen_internal_assert(rhs.stride() == 1);
966 
967  float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
968  const Packet4f pAlpha = pset1<Packet4f>(falpha);

  ei_declare_aligned_stack_constructed_variable(float, result, rows, 0);
  if (resIncr == 1) {
    convertArrayPointerBF16toF32(result, 1, rows, res);
  } else {
    convertArrayPointerBF16toF32<true>(result, 1, rows, res, resIncr);
  }
  calcVSXVecLoops<LhsMapper, LinearMapper>(cols, rows, lhs, rhs2, pAlpha, result);
  if (resIncr == 1) {
    convertArrayPointerF32toBF16VSX(result, rows, res);
  } else {
    convertArrayPointerF32toBF16VSX<true>(result, rows, res, resIncr);
  }
982 }
983 
984 #undef MAX_BFLOAT16_VEC_ACC_VSX
985 
986 const Packet16uc p16uc_COMPLEX32_XORFLIP = { 0x44,0x55,0x66,0x77, 0x00,0x11,0x22,0x33, 0xcc,0xdd,0xee,0xff, 0x88,0x99,0xaa,0xbb };
987 const Packet16uc p16uc_COMPLEX64_XORFLIP = { 0x88,0x99,0xaa,0xbb, 0xcc,0xdd,0xee,0xff, 0x00,0x11,0x22,0x33, 0x44,0x55,0x66,0x77 };
988 
989 #ifdef _BIG_ENDIAN
990 const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 };
991 const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
992 const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
993 const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
994 const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 };
995 const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
996 #else
997 const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
998 const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
999 const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00 };
1000 const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
1001 const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80 };
1002 const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
1003 #endif
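// These constants drive the complex-number helpers below: the CONJ_XOR / NEGATE
// masks flip the IEEE sign bit of the selected real and/or imaginary lanes, and the
// XORFLIP patterns are permute controls that swap the real and imaginary halves of
// each complex element.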
1004 
1005 #ifdef _BIG_ENDIAN
1006 #define COMPLEX_DELTA 0
1007 #else
1008 #define COMPLEX_DELTA 2
1009 #endif
1010 
1012 EIGEN_ALWAYS_INLINE Packet2cf pconj2(const Packet2cf& a) {
1013  return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR)));
1014 }
1015 
1016 EIGEN_ALWAYS_INLINE Packet1cd pconj2(const Packet1cd& a) {
1017  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR)));
1018 }
1019 
1021 EIGEN_ALWAYS_INLINE Packet2cf pconjinv(const Packet2cf& a) {
1022 #ifdef __POWER8_VECTOR__
1023  return Packet2cf(Packet4f(vec_neg(Packet2d(a.v))));
1024 #else
1025  return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR2)));
1026 #endif
1027 }
1028 
1029 EIGEN_ALWAYS_INLINE Packet1cd pconjinv(const Packet1cd& a) {
1030  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR2)));
1031 }
1032 
1033 #if defined(_ARCH_PWR8) && (!EIGEN_COMP_LLVM || __clang_major__ >= 12)
1034 #define PERMXOR_GOOD // Clang had a bug with vec_permxor and endianness prior to version 12
1035 #endif
1036 
EIGEN_ALWAYS_INLINE Packet2cf pcplxflipconj(Packet2cf a)
{
1040 #ifdef PERMXOR_GOOD
1041  return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR, p16uc_COMPLEX32_XORFLIP)));
1042 #else
1043  return pcplxflip(pconj2(a));
1044 #endif
1045 }
1046 
EIGEN_ALWAYS_INLINE Packet1cd pcplxflipconj(Packet1cd a)
{
1049 #ifdef PERMXOR_GOOD
1050  return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR, p16uc_COMPLEX64_XORFLIP)));
1051 #else
1052  return pcplxflip(pconj2(a));
1053 #endif
1054 }
1055 
EIGEN_ALWAYS_INLINE Packet2cf pcplxconjflip(Packet2cf a)
{
1059 #ifdef PERMXOR_GOOD
1060  return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR2, p16uc_COMPLEX32_XORFLIP)));
1061 #else
1062  return pconj2(pcplxflip(a));
1063 #endif
1064 }
1065 
EIGEN_ALWAYS_INLINE Packet1cd pcplxconjflip(Packet1cd a)
{
1068 #ifdef PERMXOR_GOOD
1069  return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR2, p16uc_COMPLEX64_XORFLIP)));
1070 #else
1071  return pconj2(pcplxflip(a));
1072 #endif
1073 }
1074 
1076 EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a)
1077 {
1078 #ifdef __POWER8_VECTOR__
1079  return Packet2cf(vec_neg(a.v));
1080 #else
1081  return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_NEGATE)));
1082 #endif
1083 }
1084 
1085 EIGEN_ALWAYS_INLINE Packet1cd pnegate2(Packet1cd a)
1086 {
1087 #ifdef __POWER8_VECTOR__
1088  return Packet1cd(vec_neg(a.v));
1089 #else
1090  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_NEGATE)));
1091 #endif
1092 }
1093 
EIGEN_ALWAYS_INLINE Packet2cf pcplxflipnegate(Packet2cf a)
{
1097 #ifdef PERMXOR_GOOD
1098  return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_NEGATE, p16uc_COMPLEX32_XORFLIP)));
1099 #else
1100  return pcplxflip(pnegate2(a));
1101 #endif
1102 }
1103 
EIGEN_ALWAYS_INLINE Packet1cd pcplxflipnegate(Packet1cd a)
{
1106 #ifdef PERMXOR_GOOD
1107  return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_NEGATE, p16uc_COMPLEX64_XORFLIP)));
1108 #else
1109  return pcplxflip(pnegate2(a));
1110 #endif
1111 }
1112 
1114 EIGEN_ALWAYS_INLINE Packet2cf pcplxflip2(Packet2cf a)
1115 {
1116  return Packet2cf(Packet4f(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX32_XORFLIP)));
1117 }
1118 
1119 EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a)
1120 {
1121 #ifdef EIGEN_VECTORIZE_VSX
1122  return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2));
1123 #else
1124  return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP)));
1125 #endif
1126 }
1127 
EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex<float>* src)
{
1131  Packet4f t;
1132 #ifdef EIGEN_VECTORIZE_VSX
1133  // Load float64/two float32 (doubleword alignment)
1134  __asm__("lxsdx %x0,%y1" : "=wa" (t) : "Z" (*src));
1135 #else
1136  *reinterpret_cast<std::complex<float>*>(reinterpret_cast<float*>(&t) + COMPLEX_DELTA) = *src;
1137 #endif
1138  return t;
1139 }
1140 
template<typename RhsScalar>
EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet4f& r, Packet4f& i)
{
1145 #ifdef _ARCH_PWR9
1146  __asm__("lxvwsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast<float*>(src) + 0)));
1147  __asm__("lxvwsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast<float*>(src) + 1)));
1148 #else
1149  Packet4f t = pload_complex_half(src);
1150  r = vec_splat(t, COMPLEX_DELTA + 0);
1151  i = vec_splat(t, COMPLEX_DELTA + 1);
1152 #endif
1153 }
1154 
template<typename RhsScalar>
EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i)
{
1158 #ifdef EIGEN_VECTORIZE_VSX
1159  __asm__("lxvdsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast<double*>(src) + 0)));
1160  __asm__("lxvdsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast<double*>(src) + 1)));
1161 #else
1162  Packet2d t = ploadu<Packet2d>(reinterpret_cast<double*>(src));
1163  r = vec_splat(t, 0);
1164  i = vec_splat(t, 1);
1165 #endif
1166 }
1167 
1168 #ifndef __POWER8_VECTOR__
1169 const Packet16uc p16uc_MERGEE = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B };
1170 
1171 const Packet16uc p16uc_MERGEO = { 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
1172 #endif
1173 
template<typename RhsScalar>
EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet4f& r, Packet4f& i)
{
1178  Packet4f t = ploadu<Packet4f>(reinterpret_cast<float*>(src));
1179 #ifdef __POWER8_VECTOR__
1180  r = vec_mergee(t, t);
1181  i = vec_mergeo(t, t);
1182 #else
1183  r = vec_perm(t, t, p16uc_MERGEE);
1184  i = vec_perm(t, t, p16uc_MERGEO);
1185 #endif
1186 }
1187 
template<typename RhsScalar>
EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet2d& r, Packet2d& i)
{
1191  return pload_realimag(src, r, i);
1192 }
1193 
EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex<float>* src)
{
1197 #ifdef EIGEN_VECTORIZE_VSX
1198  Packet4f ret;
1199  __asm__("lxvdsx %x0,%y1" : "=wa" (ret) : "Z" (*(reinterpret_cast<double*>(src) + 0)));
1200  return ret;
1201 #else
1202  return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double *>(src)));
1203 #endif
1204 }
1205 
EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine(std::complex<double>* src)
{
1208  return ploadu<Packet1cd>(src).v;
1209 }
1210 
EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine_row(std::complex<float>* src)
{
1214  return ploadu<Packet2cf>(src).v;
1215 }
1216 
EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine_row(std::complex<double>* src)
{
1219  return ploadu<Packet1cd>(src).v;
1220 }
1221 
1223 template<typename ResPacket>
1224 EIGEN_ALWAYS_INLINE Packet4f pload_complex(std::complex<float>* src)
1225 {
1226  if (GEMV_IS_SCALAR) {
1227  return pload_complex_half(src);
1228  }
1229  else
1230  {
1231  return ploadu<Packet4f>(reinterpret_cast<float*>(src));
1232  }
1233 }
1234 
1235 template<typename ResPacket>
1236 EIGEN_ALWAYS_INLINE Packet2d pload_complex(std::complex<double>* src)
1237 {
1238  return ploadu<Packet2d>(reinterpret_cast<double*>(src));
1239 }
1240 
template<typename ResPacket>
EIGEN_ALWAYS_INLINE Packet4f pload_complex(Packet2cf* src)
{
1245  return src->v;
1246 }
1247 
template<typename ResPacket>
EIGEN_ALWAYS_INLINE Packet2d pload_complex(Packet1cd* src)
{
1251  return src->v;
1252 }
1253 
EIGEN_ALWAYS_INLINE Packet4f pload_complex_full(std::complex<float>* src)
{
1257  return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double *>(src)));
1258 }
1259 
EIGEN_ALWAYS_INLINE Packet2d pload_complex_full(std::complex<double>* src)
{
1262  return ploadu<Packet1cd>(src).v;
1263 }
1264 
EIGEN_ALWAYS_INLINE Packet4f pload_complex_full_row(std::complex<float>* src)
{
1268  return ploadu<Packet2cf>(src).v;
1269 }
1270 
EIGEN_ALWAYS_INLINE Packet2d pload_complex_full_row(std::complex<double>* src)
{
1273  return pload_complex_full(src);
1274 }
1275 
EIGEN_ALWAYS_INLINE Packet4f pload_real(float* src)
{
1279  return pset1<Packet4f>(*src);
1280 }
1281 
EIGEN_ALWAYS_INLINE Packet2d pload_real(double* src)
{
1284  return pset1<Packet2d>(*src);
1285 }
1286 
EIGEN_ALWAYS_INLINE Packet4f pload_real(Packet4f& src)
{
1289  return src;
1290 }
1291 
EIGEN_ALWAYS_INLINE Packet2d pload_real(Packet2d& src)
{
1294  return src;
1295 }
1296 
EIGEN_ALWAYS_INLINE Packet4f pload_real_full(float* src)
{
1300  Packet4f ret = ploadu<Packet4f>(src);
1301  return vec_mergeh(ret, ret);
1302 }
1303 
EIGEN_ALWAYS_INLINE Packet2d pload_real_full(double* src)
{
1306  return pload_real(src);
1307 }
1308 
EIGEN_ALWAYS_INLINE Packet4f pload_real_full(std::complex<float>* src)
{
1311  return pload_complex_full(src); // Just for compilation
1312 }
1313 
EIGEN_ALWAYS_INLINE Packet2d pload_real_full(std::complex<double>* src)
{
1316  return pload_complex_full(src); // Just for compilation
1317 }
1318 
template<typename ResPacket>
EIGEN_ALWAYS_INLINE Packet4f pload_real_row(float* src)
{
1323  if (GEMV_IS_SCALAR) {
1324  return pload_real_full(src);
1325  }
1326  else {
1327  return ploadu<Packet4f>(src);
1328  }
1329 }
1330 
template<typename ResPacket>
EIGEN_ALWAYS_INLINE Packet2d pload_real_row(double* src)
{
1334  return pload_real(src);
1335 }
1336 
EIGEN_ALWAYS_INLINE Packet2cf padd(Packet2cf& a, std::complex<float>& b)
{
  EIGEN_UNUSED_VARIABLE(b);
  return a;  // Just for compilation
}

EIGEN_ALWAYS_INLINE Packet1cd padd(Packet1cd& a, std::complex<double>& b)
{
  EIGEN_UNUSED_VARIABLE(b);
  return a;  // Just for compilation
}
1348 
1350 template<typename Scalar, typename ResScalar>
1351 EIGEN_ALWAYS_INLINE Scalar pset1_realimag(ResScalar& alpha, int which, int conj)
1352 {
1353  return (which) ? ((conj) ? -alpha.real() : alpha.real()) : ((conj) ? -alpha.imag() : alpha.imag());
1354 }
1355 
1357 template<typename Scalar, typename ResScalar, typename ResPacket, int which>
1358 EIGEN_ALWAYS_INLINE Packet2cf pset1_complex(std::complex<float>& alpha)
1359 {
1360  Packet2cf ret;
1361  ret.v[COMPLEX_DELTA + 0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
1362  ret.v[COMPLEX_DELTA + 1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
1363  ret.v[2 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 0];
1364  ret.v[3 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 1];
1365  return ret;
1366 }
1367 
1368 template<typename Scalar, typename ResScalar, typename ResPacket, int which>
1369 EIGEN_ALWAYS_INLINE Packet1cd pset1_complex(std::complex<double>& alpha)
1370 {
1371  Packet1cd ret;
1372  ret.v[0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
1373  ret.v[1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
1374  return ret;
1375 }
1376 
template<typename Packet>
EIGEN_ALWAYS_INLINE Packet pset_zero()
{
1381  return pset1<Packet>(__UNPACK_TYPE__(Packet)(0));
1382 }
1383 
template<>
EIGEN_ALWAYS_INLINE Packet2cf pset_zero<Packet2cf>()
{
1387  return Packet2cf(pset1<Packet4f>(float(0)));
1388 }
1389 
template<>
EIGEN_ALWAYS_INLINE Packet1cd pset_zero<Packet1cd>()
{
1393  return Packet1cd(pset1<Packet2d>(double(0)));
1394 }
1395 
template<typename Packet, typename LhsPacket, typename RhsPacket>
EIGEN_ALWAYS_INLINE Packet pset_init(Packet& c1)
{
  if (GEMV_IS_COMPLEX_COMPLEX)
  {
    return pset_zero<Packet>();
  }
1404  else
1405  {
1406  return c1; // Intentionally left uninitialized
1407  }
1408 }
1409 
template<typename PResPacket, typename ResPacket, typename ResScalar, typename Scalar>
struct alpha_store
{
  alpha_store(ResScalar& alpha) {
    separate.r = pset1_complex<Scalar, ResScalar, ResPacket, 0x3>(alpha);
    separate.i = pset1_complex<Scalar, ResScalar, ResPacket, 0x0>(alpha);
  }
  struct ri {
    PResPacket r;
    PResPacket i;
  } separate;
};
1422 
1424 template<typename ScalarPacket, typename AlphaData>
1425 EIGEN_ALWAYS_INLINE ScalarPacket pmadd_complex(ScalarPacket& c0, ScalarPacket& c2, ScalarPacket& c4, AlphaData& b0)
1426 {
1427  return pmadd(c2, b0.separate.i.v, pmadd(c0, b0.separate.r.v, c4));
1428 }
1429 
1431 template<typename Scalar, typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar, typename AlphaData>
1432 EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, AlphaData& b0, ResScalar* res)
1433 {
1434  PResPacket c2 = pcplxflipconj(c0);
1435  if (GEMV_IS_SCALAR) {
1436  ScalarPacket c4 = ploadu<ScalarPacket>(reinterpret_cast<Scalar*>(res));
1437  ScalarPacket c3 = pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0);
1438  pstoreu(reinterpret_cast<Scalar*>(res), c3);
1439  } else {
1440  ScalarPacket c4 = pload_complex<ResPacket>(res);
1441  PResPacket c3 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
1442  pstoreu(res, c3);
1443  }
1444 }
1445 
1446 template<typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar, typename AlphaData, Index ResPacketSize, Index iter2>
1447 EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res)
1448 {
1449  PResPacket c2 = pcplxflipconj(c0);
1450  PResPacket c3 = pcplxflipconj(c1);
1451 #if !defined(_ARCH_PWR10)
1452  ScalarPacket c4 = pload_complex<ResPacket>(res + (iter2 * ResPacketSize));
1453  ScalarPacket c5 = pload_complex<ResPacket>(res + ((iter2 + 1) * ResPacketSize));
1454  PResPacket c6 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
1455  PResPacket c7 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c5, b0));
1456  pstoreu(res + (iter2 * ResPacketSize), c6);
1457  pstoreu(res + ((iter2 + 1) * ResPacketSize), c7);
1458 #else
1459  __vector_pair a = *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize));
1460 #if EIGEN_COMP_LLVM
1461  PResPacket c6[2];
1462  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(c6), &a);
1463  c6[0] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c6[0].v, b0));
1464  c6[1] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c6[1].v, b0));
1465  GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v);
1466 #else
1467  if (GEMV_IS_COMPLEX_FLOAT) {
1468  __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v));
1469  __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v));
1470  } else {
1471  __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v));
1472  __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v));
1473  }
1474 #endif
1475  *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize)) = a;
1476 #endif
1477 }
1478 
1480 template<typename Scalar, typename LhsScalar, typename LhsMapper, typename LhsPacket>
1481 EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j)
1482 {
1483  if (sizeof(Scalar) == sizeof(LhsScalar)) {
1484  const LhsScalar& src = lhs(i + 0, j);
1485  return LhsPacket(pload_real_full(const_cast<LhsScalar*>(&src)));
1486  }
1487  return lhs.template load<LhsPacket, Unaligned>(i + 0, j);
1488 }
1489 
1491 template<typename ComplexPacket, typename RealPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
1492 EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_complex(RealPacket& a, RealPacket& b, RealPacket& c)
1493 {
1494  if (ConjugateLhs && ConjugateRhs) {
1495  return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
1496  }
1497  else if (Negate && !ConjugateLhs && ConjugateRhs) {
1498  return vec_nmsub(a, b, c);
1499  }
1500  else {
1501  return vec_madd(a, b, c);
1502  }
1503 }
1504 
1506 template<typename ComplexPacket, typename RealPacket, bool Conjugate>
1507 EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_real(RealPacket& a, RealPacket& b, RealPacket& c)
1508 {
1509  if (Conjugate) {
1510  return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
1511  }
1512  else {
1513  return vec_madd(a, b, c);
1514  }
1515 }
1516 
1517 template<typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
1518 EIGEN_ALWAYS_INLINE void gemv_mult_generic(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
1519 {
1520  conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
1521  RhsPacket b0;
1522  if (StorageOrder == ColMajor) {
1523  b0 = pset1<RhsPacket>(*b);
1524  }
1525  else {
1526  b0 = ploadu<RhsPacket>(b);
1527  }
1528  c0 = pcj.pmadd(a0, b0, c0);
1529 }
1530 
1532 template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
1533 EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0, ResPacket& c1)
1534 {
1535  ScalarPacket br, bi;
1536  if (StorageOrder == ColMajor) {
1537  pload_realimag<RhsScalar>(b, br, bi);
1538  }
1539  else {
1540  pload_realimag_row<RhsScalar>(b, br, bi);
1541  }
1542  if (ConjugateLhs && !ConjugateRhs) a0 = pconj2(a0);
1543  LhsPacket a1 = pcplxflipconj(a0);
1544  ScalarPacket cr = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, false>(a0.v, br, c0.v);
1545  ScalarPacket ci = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, true>(a1.v, bi, c1.v);
1546  c1 = ResPacket(ci);
1547  c0 = PResPacket(cr);
1548 }
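// For complex * complex the product is accumulated in two halves: c0 collects
// lhs * splat(real(rhs)) and c1 collects the flipped/conjugated lhs * splat(imag(rhs));
// the calling kernel combines the two accumulators before the alpha scaling done in
// pstoreu_pmadd_complex.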
1549 
1551 template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
1552 EIGEN_ALWAYS_INLINE void gemv_mult_real_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
1553 {
1554  ScalarPacket b0;
1555  if (StorageOrder == ColMajor) {
1556  b0 = pload_complex_full(b);
1557  }
1558  else {
1559  b0 = pload_complex_full_row(b);
1560  }
1561  ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateRhs>(a0, b0, c0.v);
1562  c0 = PResPacket(cri);
1563 }
1564 
1566 template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
1567 EIGEN_ALWAYS_INLINE void gemv_mult_complex_real(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
1568 {
1569  ScalarPacket a1 = pload_complex<ResPacket>(&a0);
1570  ScalarPacket b0;
1571  if (StorageOrder == ColMajor) {
1572  b0 = pload_real(b);
1573  }
1574  else {
1575  b0 = pload_real_row<ResPacket>(b);
1576  }
1577  ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateLhs>(a1, b0, c0.v);
1578  c0 = PResPacket(cri);
1579 }
1580 
1581 #define GEMV_MULT_COMPLEX_COMPLEX(LhsType, RhsType, ResType) \
1582 template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1583 EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, ResType& c1) \
1584 { \
1585  gemv_mult_complex_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0, c1); \
1586 }
1587 
1588 GEMV_MULT_COMPLEX_COMPLEX(Packet2cf, std::complex<float>, Packet2cf)
1589 GEMV_MULT_COMPLEX_COMPLEX(Packet1cd, std::complex<double>, Packet1cd)
1590 
1591 #define GEMV_MULT_REAL_COMPLEX(LhsType, RhsType, ResType) \
1592 template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1593 EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, RhsType&) \
1594 { \
1595  gemv_mult_real_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
1596 }
1597 
1598 GEMV_MULT_REAL_COMPLEX(float, std::complex<float>, Packet2cf)
1599 GEMV_MULT_REAL_COMPLEX(double, std::complex<double>, Packet1cd)
1600 GEMV_MULT_REAL_COMPLEX(Packet4f, std::complex<float>, Packet2cf)
1601 GEMV_MULT_REAL_COMPLEX(Packet2d, std::complex<double>, Packet1cd)
1602 
1603 #define GEMV_MULT_COMPLEX_REAL(LhsType, RhsType, ResType1, ResType2) \
1604 template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1605 EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType1& c0, ResType2&) \
1606 { \
1607  gemv_mult_complex_real<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
1608 }
1609 
1610 GEMV_MULT_COMPLEX_REAL(Packet2cf, float, Packet2cf, std::complex<float>)
1611 GEMV_MULT_COMPLEX_REAL(Packet1cd, double, Packet1cd, std::complex<double>)
1612 GEMV_MULT_COMPLEX_REAL(std::complex<float>, float, Packet2cf, std::complex<float>)
1613 GEMV_MULT_COMPLEX_REAL(std::complex<double>, double, Packet1cd, std::complex<double>)
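// The GEMV_MULT_* macros above stamp out gemv_mult_complex overloads so that a single
// call site can dispatch on whether the lhs packet and rhs scalar are real or complex.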
1614 
1615 #ifdef USE_GEMV_MMA
1617 template<typename T>
1618 EIGEN_ALWAYS_INLINE T convertReal(T a)
1619 {
1620  return a;
1621 }
1622 
1623 EIGEN_ALWAYS_INLINE Packet4f convertReal(Packet2cf a)
1624 {
1625  return a.v;
1626 }
1627 
1628 EIGEN_ALWAYS_INLINE Packet2d convertReal(Packet1cd a)
1629 {
1630  return a.v;
1631 }
1632 
1634 template<typename T>
1635 EIGEN_ALWAYS_INLINE T convertComplex(T a)
1636 {
1637  return a;
1638 }
1639 
1640 EIGEN_ALWAYS_INLINE Packet2cf convertComplex(Packet4f a)
1641 {
1642  return Packet2cf(a);
1643 }
1644 
1645 EIGEN_ALWAYS_INLINE Packet1cd convertComplex(Packet2d a)
1646 {
1647  return Packet1cd(a);
1648 }
1649 
1651 template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
1652 EIGEN_ALWAYS_INLINE void pload_complex_MMA(SLhsPacket& a)
1653 {
1654  a = SLhsPacket(pload_complex<ResPacket>(&a));
1655 }
1656 
1657 template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
1658 EIGEN_ALWAYS_INLINE void pload_complex_MMA(__vector_pair&)
1659 {
1660  // Pass thru
1661 }
1662 
1664 template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
1665 EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, RhsPacket& a, LhsPacket& b)
1666 {
1667  if (NegativeAccumulate)
1668  {
1669  __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
1670  }
1671  else {
1672  __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
1673  }
1674 }
1675 
1677 template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
1678 EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, __vector_pair& a, Packet2d& b)
1679 {
1680  if (NegativeAccumulate)
1681  {
1682  __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
1683  }
1684  else {
1685  __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
1686  }
1687 }
1688 
1689 template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
1690 EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad*, __vector_pair&, Packet4f&)
1691 {
1692  // Just for compilation
1693 }
1694 
1696 template<typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
1697 EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c)
1698 {
1699  if (ConjugateLhs && ConjugateRhs) {
1700  RealPacket b2 = pconj2(convertComplex(b)).v;
1701  return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a.v);
1702  }
1703  else if (Negate && !ConjugateLhs && ConjugateRhs) {
1704  return pger_vecMMA<RealPacket, RealPacket, true>(c, b, a.v);
1705  }
1706  else {
1707  return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a.v);
1708  }
1709 }
1710 
1711 template<typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
1712 EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c)
1713 {
1714  if (ConjugateLhs && ConjugateRhs) {
1715  RealPacket b2 = pconj2(convertComplex(b)).v;
1716  return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
1717  }
1718  else if (Negate && !ConjugateLhs && ConjugateRhs) {
1719  return pger_vecMMA<RealPacket, __vector_pair, true>(c, a, b);
1720  }
1721  else {
1722  return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
1723  }
1724 }
1725 
1727 template<typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
1728 EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c)
1729 {
1730  RealPacket a2 = convertReal(a);
1731  if (Conjugate) {
1732  RealPacket b2 = pconj2(convertComplex(b)).v;
1733  if (StorageOrder == ColMajor) {
1734  return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a2);
1735  } else {
1736  return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b2);
1737  }
1738  }
1739  else {
1740  if (StorageOrder == ColMajor) {
1741  return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a2);
1742  } else {
1743  return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b);
1744  }
1745  }
1746 }
1747 
1749 template<typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
1750 EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c)
1751 {
1752  if (Conjugate) {
1753  RealPacket b2 = pconj2(convertComplex(b)).v;
1754  return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
1755  }
1756  else {
1757  return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
1758  }
1759 }
1760 
1762 template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
1763 EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
1764 {
1765  ScalarPacket b0;
1766  if (StorageOrder == ColMajor) {
1767  b0 = pload_realimag_combine(b);
1768  } else {
1769  b0 = pload_realimag_combine_row(b);
1770  }
1771  pmadd_complex_complex_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ConjugateRhs, false>(a0, b0, c0);
1772 }
1773 
1775 template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
1776 EIGEN_ALWAYS_INLINE void gemv_mult_complex_real_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
1777 {
1778  pload_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, ResPacket>(a0);
1779  ScalarPacket b0;
1780  if (StorageOrder == ColMajor) {
1781  b0 = pload_real(b);
1782  }
1783  else {
1784  b0 = pload_real_row<ResPacket>(b);
1785  }
1786  pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ColMajor>(a0, b0, c0);
1787 }
1788 
1790 template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
1791 EIGEN_ALWAYS_INLINE void gemv_mult_real_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
1792 {
1793  ScalarPacket b0;
1794  if (StorageOrder == ColMajor) {
1795  b0 = pload_complex_full(b);
1796  }
1797  else {
1798  b0 = pload_complex_full_row(b);
1799  }
1800  pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateRhs, (sizeof(RhsScalar) == sizeof(std::complex<float>)) ? StorageOrder : ColMajor>(a0, b0, c0);
1801 }
1802 
1803 #define GEMV_MULT_COMPLEX_COMPLEX_MMA(LhsType, RhsType) \
1804 template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1805 EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
1806 { \
1807  gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
1808 }
1809 
1810 GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet2cf, std::complex<float>)
1811 GEMV_MULT_COMPLEX_COMPLEX_MMA(__vector_pair, std::complex<float>)
1812 GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet1cd, std::complex<double>)
1813 
1814 
1815 template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
1816 EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(__vector_pair& a0, std::complex<double>* b, __vector_quad* c0)
1817 {
1818  if (sizeof(LhsScalar) == 16) {
1819  gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0);
1820  }
1821  else {
1822  gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0);
1823  }
1824 }
1825 
1826 #define GEMV_MULT_REAL_COMPLEX_MMA(LhsType, RhsType) \
1827 template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1828 EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
1829 { \
1830  gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
1831 }
1832 
1833 GEMV_MULT_REAL_COMPLEX_MMA(Packet4f, std::complex<float>)
1834 GEMV_MULT_REAL_COMPLEX_MMA(Packet2d, std::complex<double>)
1835 
1836 #define GEMV_MULT_COMPLEX_REAL_MMA(LhsType, RhsType) \
1837 template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
1838 EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
1839 { \
1840  gemv_mult_complex_real_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
1841 }
1842 
1843 GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf, float)
1844 GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd, double)
1845 GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, float)
1846 GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, double)
1847 
1848 
1849 template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
1850 EIGEN_ALWAYS_INLINE void disassembleResults2(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
1851 {
1852  __builtin_mma_disassemble_acc(&result0.packet, c0);
1853  if (sizeof(LhsPacket) == 16) {
1854  if (sizeof(RhsPacket) == 16) {
1855  ScalarPacket tmp0, tmp2;
1856  tmp2 = vec_mergeh(result0.packet[2], result0.packet[3]);
1857  tmp0 = vec_mergeh(result0.packet[0], result0.packet[1]);
1858  result0.packet[3] = vec_mergel(result0.packet[3], result0.packet[2]);
1859  result0.packet[1] = vec_mergel(result0.packet[1], result0.packet[0]);
1860  result0.packet[2] = tmp2;
1861  result0.packet[0] = tmp0;
1862 
1863  if (ConjugateLhs) {
1864  result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
1865  result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
1866  } else if (ConjugateRhs) {
1867  result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
1868  result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
1869  } else {
1870  result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
1871  result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
1872  }
1873  result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
1874  result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]);
1875  } else {
1876  result0.packet[0][1] = result0.packet[1][1];
1877  result0.packet[2][1] = result0.packet[3][1];
1878  }
1879  }
1880 }
1881 
1882 template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
1883 EIGEN_ALWAYS_INLINE void disassembleResults4(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
1884 {
1885  __builtin_mma_disassemble_acc(&result0.packet, c0);
1886  if (GEMV_IS_COMPLEX_COMPLEX) {
1887  if (ConjugateLhs) {
1888  result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
1889  result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
1890  } else {
1891  if (ConjugateRhs) {
1892  result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
1893  } else {
1894  result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
1895  }
1896  }
1897  result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
1898  } else if (sizeof(LhsPacket) == sizeof(std::complex<float>)) {
1899  if (ConjugateLhs) {
1900  result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
1901  }
1902  } else {
1903  result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]);
1904  }
1905 }
1906 
1907 template <typename Scalar, typename ScalarPacket, int ResPacketSize, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
1908 EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
1909 {
1910  if (!GEMV_IS_COMPLEX_FLOAT) {
1911  disassembleResults2<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
1912  } else {
1913  disassembleResults4<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
1914  }
1915 }
1916 #endif
1917 
1918 #define GEMV_GETN_COMPLEX(N) (((N) * ResPacketSize) >> 1)
1919 
1920 #define GEMV_LOADPACKET_COL_COMPLEX(iter) \
1921  loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + ((iter) * ResPacketSize), j)
1922 
1923 #define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) \
1924  convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter))
1925 
1926 #ifdef USE_GEMV_MMA
1927 #define GEMV_INIT_COL_COMPLEX_MMA(iter, N) \
1928  if (GEMV_GETN_COMPLEX(N) > iter) { \
1929  __builtin_mma_xxsetaccz(&e0##iter); \
1930  }
1931 
1932 #if EIGEN_COMP_LLVM
1933 #define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \
1934  GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); \
1935  EIGEN_UNUSED_VARIABLE(f##iter1);
1936 #else
1937 #define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \
1938  if (sizeof(LhsPacket) == 16) { \
1939  const LhsScalar& src = lhs(i + ((32 * iter1) / sizeof(LhsScalar)), j); \
1940  a##iter1 = *reinterpret_cast<__vector_pair *>(const_cast<LhsScalar *>(&src)); \
1941  EIGEN_UNUSED_VARIABLE(f##iter1); \
1942  } else { \
1943  f##iter1 = lhs.template load<PLhsPacket, Unaligned>(i + ((iter2) * ResPacketSize), j); \
1944  GEMV_BUILDPAIR_MMA(a##iter1, vec_splat(convertReal(f##iter1), 0), vec_splat(convertReal(f##iter1), 1)); \
1945  }
1946 #endif
1947 
1948 #define GEMV_LOAD1_COL_COMPLEX_MMA(iter, N) \
1949  if (GEMV_GETN_COMPLEX(N) > iter) { \
1950  if (GEMV_IS_COMPLEX_FLOAT) { \
1951  f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \
1952  EIGEN_UNUSED_VARIABLE(a##iter); \
1953  } else { \
1954  GEMV_LOADPAIR_COL_COMPLEX_MMA(iter, iter << 1) \
1955  } \
1956  } else { \
1957  EIGEN_UNUSED_VARIABLE(a##iter); \
1958  EIGEN_UNUSED_VARIABLE(f##iter); \
1959  }
1960 
1961 #define GEMV_WORK1_COL_COMPLEX_MMA(iter, N) \
1962  if (GEMV_GETN_COMPLEX(N) > iter) { \
1963  if (GEMV_IS_COMPLEX_FLOAT) { \
1964  gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(f##iter, b, &e0##iter); \
1965  } else { \
1966  gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter, b, &e0##iter); \
1967  } \
1968  }
1969 
1970 #define GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter1, iter2) \
1971  GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1));
1972 
1973 #define GEMV_LOAD2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
1974  if (GEMV_GETN_COMPLEX(N) > iter1) { \
1975  if (GEMV_IS_COMPLEX_FLOAT) { \
1976  GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2); \
1977  EIGEN_UNUSED_VARIABLE(a##iter3) \
1978  } else { \
1979  GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2 << 1); \
1980  GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter3, iter3 << 1); \
1981  } \
1982  } else { \
1983  EIGEN_UNUSED_VARIABLE(a##iter2); \
1984  EIGEN_UNUSED_VARIABLE(a##iter3); \
1985  } \
1986  EIGEN_UNUSED_VARIABLE(f##iter2); \
1987  EIGEN_UNUSED_VARIABLE(f##iter3);
1988 
1989 #define GEMV_WORK2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
1990  if (GEMV_GETN_COMPLEX(N) > iter1) { \
1991  if (GEMV_IS_COMPLEX_FLOAT) { \
1992  PLhsPacket g[2]; \
1993  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(g), &a##iter2); \
1994  gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(g[0], b, &e0##iter2); \
1995  gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(g[1], b, &e0##iter3); \
1996  } else { \
1997  gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter2, b, &e0##iter2); \
1998  gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter3, b, &e0##iter3); \
1999  } \
2000  }
2001 
2002 #if EIGEN_COMP_LLVM
2003 #define GEMV_LOAD_COL_COMPLEX_MMA(N) \
2004  if (GEMV_GETN_COMPLEX(N) > 1) { \
2005  GEMV_UNROLL_HALF(GEMV_LOAD2_COL_COMPLEX_MMA, (N >> 1)) \
2006  } else { \
2007  GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N) \
2008  }
2009 
2010 #define GEMV_WORK_COL_COMPLEX_MMA(N) \
2011  if (GEMV_GETN_COMPLEX(N) > 1) { \
2012  GEMV_UNROLL_HALF(GEMV_WORK2_COL_COMPLEX_MMA, (N >> 1)) \
2013  } else { \
2014  GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N) \
2015  }
2016 #else
2017 #define GEMV_LOAD_COL_COMPLEX_MMA(N) \
2018  GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N)
2019 
2020 #define GEMV_WORK_COL_COMPLEX_MMA(N) \
2021  GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N)
2022 #endif
2023 
2024 #define GEMV_DISASSEMBLE_COMPLEX_MMA(iter) \
2025  disassembleResults<Scalar, ScalarPacket, ResPacketSize, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter, result0##iter);
2026 
2027 #define GEMV_STORE_COL_COMPLEX_MMA(iter, N) \
2028  if (GEMV_GETN_COMPLEX(N) > iter) { \
2029  GEMV_DISASSEMBLE_COMPLEX_MMA(iter); \
2030  c0##iter = PResPacket(result0##iter.packet[0]); \
2031  if (GEMV_IS_COMPLEX_FLOAT) { \
2032  pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \
2033  } else { \
2034  pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + ((iter << 1) * ResPacketSize)); \
2035  c0##iter = PResPacket(result0##iter.packet[2]); \
2036  pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (((iter << 1) + 1) * ResPacketSize)); \
2037  } \
2038  }
2039 
2040 #define GEMV_STORE2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
2041  if (GEMV_GETN_COMPLEX(N) > iter1) { \
2042  GEMV_DISASSEMBLE_COMPLEX_MMA(iter2); \
2043  GEMV_DISASSEMBLE_COMPLEX_MMA(iter3); \
2044  c0##iter2 = PResPacket(result0##iter2.packet[0]); \
2045  if (GEMV_IS_COMPLEX_FLOAT) { \
2046  c0##iter3 = PResPacket(result0##iter3.packet[0]); \
2047  pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2>(c0##iter2, c0##iter3, alpha_data, res + i); \
2048  } else { \
2049  c0##iter3 = PResPacket(result0##iter2.packet[2]); \
2050  pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
2051  c0##iter2 = PResPacket(result0##iter3.packet[0]); \
2052  c0##iter3 = PResPacket(result0##iter3.packet[2]); \
2053  pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter3 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
2054  } \
2055  }
2056 
2057 #define GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \
2058  GEMV_UNROLL(GEMV_INIT_COL_COMPLEX_MMA, N) \
2059  Index j = j2; \
2060  do { \
2061  const RhsScalar& b1 = rhs2(j, 0); \
2062  RhsScalar* b = const_cast<RhsScalar *>(&b1); \
2063  GEMV_UNROLL(GEMV_PREFETCH, N) \
2064  GEMV_LOAD_COL_COMPLEX_MMA(N) \
2065  GEMV_WORK_COL_COMPLEX_MMA(N) \
2066  } while (++j < jend); \
2067  if (GEMV_GETN(N) <= 2) { \
2068  GEMV_UNROLL(GEMV_STORE_COL_COMPLEX_MMA, N) \
2069  } else { \
2070  GEMV_UNROLL_HALF(GEMV_STORE2_COL_COMPLEX_MMA, (N >> 1)) \
2071  } \
2072  i += (ResPacketSize * N);
2073 #endif
2074 
2075 #define GEMV_INIT_COMPLEX(iter, N) \
2076  if (N > iter) { \
2077  c0##iter = pset_zero<PResPacket>(); \
2078  c1##iter = pset_init<ResPacket, LhsPacket, RhsPacket>(c1##iter); \
2079  } else { \
2080  EIGEN_UNUSED_VARIABLE(c0##iter); \
2081  EIGEN_UNUSED_VARIABLE(c1##iter); \
2082  }
2083 
2084 #define GEMV_WORK_COL_COMPLEX(iter, N) \
2085  if (N > iter) { \
2086  f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \
2087  gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(f##iter, b, c0##iter, c1##iter); \
2088  } else { \
2089  EIGEN_UNUSED_VARIABLE(f##iter); \
2090  }
2091 
2092 #define GEMV_STORE_COL_COMPLEX(iter, N) \
2093  if (N > iter) { \
2094  if (GEMV_IS_COMPLEX_COMPLEX) { \
2095  c0##iter = padd(c0##iter, c1##iter); \
2096  } \
2097  pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \
2098  }
2099 
2101 #define GEMV_PROCESS_COL_COMPLEX_ONE(N) \
2102  GEMV_UNROLL(GEMV_INIT_COMPLEX, N) \
2103  Index j = j2; \
2104  do { \
2105  const RhsScalar& b1 = rhs2(j, 0); \
2106  RhsScalar* b = const_cast<RhsScalar *>(&b1); \
2107  GEMV_UNROLL(GEMV_PREFETCH, N) \
2108  GEMV_UNROLL(GEMV_WORK_COL_COMPLEX, N) \
2109  } while (++j < jend); \
2110  GEMV_UNROLL(GEMV_STORE_COL_COMPLEX, N) \
2111  i += (ResPacketSize * N);
2112 
2113 #if defined(USE_GEMV_MMA) && (EIGEN_COMP_LLVM || defined(USE_SLOWER_GEMV_MMA))
2114 #define USE_GEMV_COL_COMPLEX_MMA
2115 #endif
2116 
2117 #ifdef USE_GEMV_COL_COMPLEX_MMA
2118 #define GEMV_PROCESS_COL_COMPLEX(N) \
2119  GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N)
2120 #else
2121 #if defined(USE_GEMV_MMA) && (__GNUC__ > 10)
2122 #define GEMV_PROCESS_COL_COMPLEX(N) \
2123  if (sizeof(Scalar) != sizeof(LhsPacket)) { \
2124  GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \
2125  } else { \
2126  GEMV_PROCESS_COL_COMPLEX_ONE(N) \
2127  }
2128 #else
2129 #define GEMV_PROCESS_COL_COMPLEX(N) \
2130  GEMV_PROCESS_COL_COMPLEX_ONE(N)
2131 #endif
2132 #endif
2133 
2134 template<typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
2135 EIGEN_STRONG_INLINE void gemv_complex_col(
2136  Index rows, Index cols,
2137  const LhsMapper& alhs,
2138  const RhsMapper& rhs,
2139  ResScalar* res, Index resIncr,
2140  ResScalar alpha)
2141 {
2142  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
2143 
2144  typedef typename Traits::LhsPacket LhsPacket;
2145  typedef typename Traits::RhsPacket RhsPacket;
2146  typedef typename Traits::ResPacket ResPacket;
2147 
2148  typedef typename packet_traits<Scalar>::type ScalarPacket;
2149  typedef typename packet_traits<LhsScalar>::type PLhsPacket;
2150  typedef typename packet_traits<ResScalar>::type PResPacket;
2151  typedef gemv_traits<ResPacket, ResPacket> PTraits;
2152 
2153  EIGEN_UNUSED_VARIABLE(resIncr);
2154  eigen_internal_assert(resIncr == 1);
2155 
2156  // The following copy tells the compiler that lhs's attributes are not modified outside this function
2157  // This helps GCC to generate proper code.
2158  LhsMapper lhs(alhs);
2159  RhsMapper rhs2(rhs);
2160 
2161  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
2162 
2163  const Index lhsStride = lhs.stride();
2164  // TODO: for padded aligned inputs, we could enable aligned reads
2165  enum {
2166  LhsAlignment = Unaligned,
2167  ResPacketSize = PTraits::ResPacketSize,
2168  LhsPacketSize = PTraits::LhsPacketSize,
2169  RhsPacketSize = PTraits::RhsPacketSize,
2170  };
2171 #ifdef EIGEN_POWER_USE_GEMV_PREFETCH
2172  const Index prefetch_dist = 64 * LhsPacketSize;
2173 #endif
2174 
2175 #ifndef GCC_ONE_VECTORPAIR_BUG
2176  const Index n8 = rows - 8 * ResPacketSize + 1;
2177  const Index n4 = rows - 4 * ResPacketSize + 1;
2178  const Index n2 = rows - 2 * ResPacketSize + 1;
2179 #endif
2180  const Index n1 = rows - 1 * ResPacketSize + 1;
2181 
2182  // TODO: improve the following heuristic:
2183  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
2184 
2186  AlphaData alpha_data(alpha);
2187 
2188  for (Index j2 = 0; j2 < cols; j2 += block_cols)
2189  {
2190  Index jend = numext::mini(j2 + block_cols, cols);
2191  Index i = 0;
2192  PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
2193  ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
2194  PLhsPacket f0, f1, f2, f3, f4, f5, f6, f7;
2195 #ifdef USE_GEMV_MMA
2196  __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
2197  __vector_pair a0, a1, a2, a3, a4, a5, a6, a7;
2198  PacketBlock<ScalarPacket, 4> result00, result01, result02, result03, result04, result05, result06, result07;
2199  GEMV_UNUSED(8, e0)
2200  GEMV_UNUSED(8, result0)
2201  GEMV_UNUSED(8, a)
2202  GEMV_UNUSED(8, f)
2203 #if !defined(GCC_ONE_VECTORPAIR_BUG) && defined(USE_GEMV_COL_COMPLEX_MMA)
2205 #endif
2206 #endif
2207 #ifndef GCC_ONE_VECTORPAIR_BUG
2208  {
2209  while (i < n8)
2210  {
2211  GEMV_PROCESS_COL_COMPLEX(8)
2212  }
2213  }
2214  while (i < n4)
2215  {
2216  GEMV_PROCESS_COL_COMPLEX(4)
2217  }
2218  if (i < n2)
2219  {
2220  GEMV_PROCESS_COL_COMPLEX(2)
2221  }
2222  if (i < n1)
2223 #else
2224  while (i < n1)
2225 #endif
2226  {
2227  GEMV_PROCESS_COL_COMPLEX_ONE(1)
2228  }
2229  for (;i < rows;++i)
2230  {
2231  ResScalar d0(0);
2232  Index j = j2;
2233  do {
2234  d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
2235  } while (++j < jend);
2236  res[i] += alpha * d0;
2237  }
2238  }
2239 }
2240 
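// --- Editor's sketch (illustrative, not part of the header) -------------------
// The vectorized column kernel above computes res[i] += alpha * sum_j lhs(i,j) * rhs(j),
// with optional conjugation applied through cj.pmul. A plain scalar reference for the
// complex<float> case without conjugation, matching the remainder loop at the end of
// gemv_complex_col (lhsStride is the column-major leading dimension):
#include <complex>

inline void sketch_gemv_col(int rows, int cols,
                            const std::complex<float>* lhs, int lhsStride,
                            const std::complex<float>* rhs,
                            std::complex<float>* res, std::complex<float> alpha)
{
  for (int i = 0; i < rows; ++i) {
    std::complex<float> d0(0.0f, 0.0f);
    for (int j = 0; j < cols; ++j)
      d0 += lhs[i + j * lhsStride] * rhs[j];  // column-major access
    res[i] += alpha * d0;
  }
}
// ------------------------------------------------------------------------------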
2241 template <typename Scalar, int N> struct ScalarBlock {
2242  Scalar scalar[N];
2243 };
2244 
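// --- Editor's sketch (illustrative, not part of the header) -------------------
// ScalarBlock<Scalar, 2> is the small return type the row kernels use to hand back
// two horizontally reduced dot products at once (one per row in a pair); the
// GEMV_STORE_ROW macros then scale both by alpha and add them into res. A minimal
// usage model of that final store, using the struct defined above:
inline void sketch_store_pair(double* res, Index resIncr, double alpha,
                              const ScalarBlock<double, 2>& cc, Index i)
{
  res[(i + 0) * resIncr] += alpha * cc.scalar[0];  // result for row i
  res[(i + 1) * resIncr] += alpha * cc.scalar[1];  // result for row i + 1
}
// ------------------------------------------------------------------------------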
2245 #ifdef USE_GEMV_MMA
2246 static Packet16uc p16uc_ELEMENT_3 = { 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f, 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f };
2247 
2249 template<typename ResScalar, typename ResPacket>
2250 EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0, __vector_quad* acc1)
2251 {
2252  PacketBlock<ResPacket, 4> result0, result1;
2253  __builtin_mma_disassemble_acc(&result0.packet, acc0);
2254  __builtin_mma_disassemble_acc(&result1.packet, acc1);
2255  result0.packet[0] = vec_mergeh(result0.packet[0], result1.packet[0]);
2256  result0.packet[1] = vec_mergeo(result0.packet[1], result1.packet[1]);
2257  result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]);
2258  result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3);
2259  result0.packet[0] = vec_add(vec_add(result0.packet[0], result0.packet[2]), vec_add(result0.packet[1], result0.packet[3]));
2260  return *reinterpret_cast<ScalarBlock<ResScalar, 2> *>(&result0.packet[0]);
2261 }
2262 
2263 template<>
2264 EIGEN_ALWAYS_INLINE ScalarBlock<double, 2> predux_real<double, Packet2d>(__vector_quad* acc0, __vector_quad* acc1)
2265 {
2266  PacketBlock<Packet2d, 4> result0, result1;
2267  __builtin_mma_disassemble_acc(&result0.packet, acc0);
2268  __builtin_mma_disassemble_acc(&result1.packet, acc1);
2269  result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1]));
2270  return *reinterpret_cast<ScalarBlock<double, 2> *>(&result0.packet[0]);
2271 }
2272 
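// --- Editor's sketch (illustrative, not part of the header) -------------------
// Both predux_real() overloads above perform a horizontal reduction: each MMA
// accumulator holds partial sums for one output row spread across its lanes, and the
// merge/add sequences collapse them into one scalar per accumulator. A loose scalar
// model of the two-accumulator case (the real lane layout is more involved):
inline void sketch_predux_pair(const float acc0[4], const float acc1[4], float out[2])
{
  out[0] = acc0[0] + acc0[1] + acc0[2] + acc0[3];  // reduce first accumulator
  out[1] = acc1[0] + acc1[1] + acc1[2] + acc1[3];  // reduce second accumulator
}
// ------------------------------------------------------------------------------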
2274 template<typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
2275 EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<float>, 2> addComplexResults(PacketBlock<Packet4f, 4>& result0, PacketBlock<Packet4f, 4>& result1)
2276 {
2277  ScalarBlock<std::complex<float>, 2> cc0;
2278  result0.packet[0] = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[0]), reinterpret_cast<Packet2d>(result1.packet[0])));
2279  result0.packet[2] = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(result0.packet[2]), reinterpret_cast<Packet2d>(result1.packet[2])));
2280  result0.packet[0] = vec_add(result0.packet[0], result0.packet[2]);
2281  if (GEMV_IS_COMPLEX_COMPLEX) {
2282  result0.packet[1] = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[1]), reinterpret_cast<Packet2d>(result1.packet[1])));
2283  result0.packet[3] = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(result0.packet[3]), reinterpret_cast<Packet2d>(result1.packet[3])));
2284  result0.packet[1] = vec_add(result0.packet[1], result0.packet[3]);
2285  if (ConjugateLhs) {
2286  result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
2287  result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
2288  } else if (ConjugateRhs) {
2289  result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
2290  } else {
2291  result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
2292  }
2293  result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
2294  } else {
2295  if (ConjugateLhs && (sizeof(LhsPacket) == sizeof(std::complex<float>))) {
2296  result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
2297  }
2298  }
2299  cc0.scalar[0].real(result0.packet[0][0]);
2300  cc0.scalar[0].imag(result0.packet[0][1]);
2301  cc0.scalar[1].real(result0.packet[0][2]);
2302  cc0.scalar[1].imag(result0.packet[0][3]);
2303  return cc0;
2304 }
2305 
2306 template<typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
2307 EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<double>, 2> addComplexResults(PacketBlock<Packet2d, 4>&, PacketBlock<Packet2d, 4>&)
2308 {
2309  ScalarBlock<std::complex<double>, 2> cc0;
2310  EIGEN_UNUSED_VARIABLE(cc0);
2311  return cc0; // Just for compilation
2312 }
2313 
2315 template<typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
2316 EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0, __vector_quad* acc1)
2317 {
2318  PacketBlock<ResPacket, 4> result0, result1;
2319  __builtin_mma_disassemble_acc(&result0.packet, acc0);
2320  __builtin_mma_disassemble_acc(&result1.packet, acc1);
2321  return addComplexResults<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(result0, result1);
2322 }
2323 
2324 template<typename ResScalar, typename ResPacket>
2325 EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0)
2326 {
2327  PacketBlock<ResPacket, 4> result0;
2328  __builtin_mma_disassemble_acc(&result0.packet, acc0);
2329  result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3]));
2330  return *reinterpret_cast<ScalarBlock<ResScalar, 2> *>(&result0.packet[0]);
2331 }
2332 
2333 template<typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
2334 EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0)
2335 {
2336  ScalarBlock<ResScalar, 2> cc0;
2337  PacketBlock<ResPacket, 4> result0;
2338  __builtin_mma_disassemble_acc(&result0.packet, acc0);
2339  if (GEMV_IS_COMPLEX_COMPLEX) {
2340  if (ConjugateLhs) {
2341  result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
2342  result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
2343  } else if (ConjugateRhs) {
2344  result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
2345  result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
2346  } else {
2347  result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
2348  result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
2349  }
2350  result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2));
2351  result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2));
2352  } else {
2353  result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1);
2354  result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1);
2355  }
2356  cc0.scalar[0].real(result0.packet[0][0]);
2357  cc0.scalar[0].imag(result0.packet[0][1]);
2358  cc0.scalar[1].real(result0.packet[2][0]);
2359  cc0.scalar[1].imag(result0.packet[2][1]);
2360  return cc0;
2361 }
2362 #endif
2363 
2364 template<typename ResScalar, typename ResPacket>
2365 EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(ResPacket& a, ResPacket& b)
2366 {
2367  ScalarBlock<ResScalar, 2> cc0;
2368  cc0.scalar[0] = predux(a);
2369  cc0.scalar[1] = predux(b);
2370  return cc0;
2371 }
2372 
2373 template<typename ResScalar, typename ResPacket>
2374 EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(ResPacket& a, ResPacket& b)
2375 {
2376  return predux_real<ResScalar, ResPacket>(a, b);
2377 }
2378 
2379 #define GEMV_UNROLL_ROW(func, N) \
2380  func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N)
2381 
2382 #define GEMV_UNROLL_ROW_HALF(func, N) \
2383  func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
2384 
2385 #define GEMV_LOADPACKET_ROW(iter) \
2386  lhs.template load<LhsPacket, Unaligned>(i + (iter), j)
2387 
2388 #ifdef USE_GEMV_MMA
2389 #define GEMV_UNROLL3_ROW(func, N, which) \
2390  func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \
2391  func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which)
2392 
2393 #define GEMV_UNUSED_ROW(N, which) \
2394  GEMV_UNROLL3_ROW(GEMV_UNUSED_VAR, N, which)
2395 
2396 #define GEMV_INIT_ROW(iter, N) \
2397  if (GEMV_GETN(N) > iter) { \
2398  __builtin_mma_xxsetaccz(&c##iter); \
2399  }
2400 
2401 #define GEMV_LOADPAIR_ROW(iter1, iter2) \
2402  GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_ROW(iter2), GEMV_LOADPACKET_ROW((iter2) + 1));
2403 
2404 #define GEMV_WORK_ROW(iter, N) \
2405  if (GEMV_GETN(N) > iter) { \
2406  if (GEMV_IS_FLOAT) { \
2407  pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, a0, GEMV_LOADPACKET_ROW(iter)); \
2408  } else { \
2409  __vector_pair b##iter; \
2410  GEMV_LOADPAIR_ROW(iter, iter << 1) \
2411  pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, b##iter, a0); \
2412  } \
2413  }
2414 
2415 #define GEMV_PREDUX2(iter1, iter2, iter3, N) \
2416  if (N > iter1) { \
2417  if (GEMV_IS_FLOAT) { \
2418  cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter2, &c##iter3); \
2419  } else { \
2420  cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter1); \
2421  } \
2422  } else { \
2423  EIGEN_UNUSED_VARIABLE(cc##iter1); \
2424  }
2425 #else
2426 #define GEMV_INIT_ROW(iter, N) \
2427  if (N > iter) { \
2428  c##iter = pset1<ResPacket>(ResScalar(0)); \
2429  } else { \
2430  EIGEN_UNUSED_VARIABLE(c##iter); \
2431  }
2432 
2433 #define GEMV_WORK_ROW(iter, N) \
2434  if (N > iter) { \
2435  c##iter = pcj.pmadd(GEMV_LOADPACKET_ROW(iter), a0, c##iter); \
2436  }
2437 
2438 #define GEMV_PREDUX2(iter1, iter2, iter3, N) \
2439  if (N > iter1) { \
2440  cc##iter1 = predux_real<ResScalar, ResPacket>(c##iter2, c##iter3); \
2441  } else { \
2442  EIGEN_UNUSED_VARIABLE(cc##iter1); \
2443  }
2444 #endif
2445 
2446 #define GEMV_MULT(iter1, iter2, iter3, N) \
2447  if (N > iter1) { \
2448  cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), a0); \
2449  cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), a0); \
2450  }
2451 
2452 #define GEMV_STORE_ROW(iter1, iter2, iter3, N) \
2453  if (N > iter1) { \
2454  storeMaddData<ResScalar>(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \
2455  storeMaddData<ResScalar>(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \
2456  }
2457 
2459 #define GEMV_PROCESS_ROW(N) \
2460  for (; i < n##N; i += N) { \
2461  GEMV_UNROLL_ROW(GEMV_INIT_ROW, N) \
2462  Index j = 0; \
2463  for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
2464  RhsPacket a0 = rhs2.template load<RhsPacket, Unaligned>(j); \
2465  GEMV_UNROLL_ROW(GEMV_WORK_ROW, N) \
2466  } \
2467  GEMV_UNROLL_ROW_HALF(GEMV_PREDUX2, (N >> 1)) \
2468  for (; j < cols; ++j) { \
2469  RhsScalar a0 = rhs2(j); \
2470  GEMV_UNROLL_ROW_HALF(GEMV_MULT, (N >> 1)) \
2471  } \
2472  GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1)) \
2473  }
2474 
2475 template<typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
2476 EIGEN_STRONG_INLINE void gemv_row(
2477  Index rows, Index cols,
2478  const LhsMapper& alhs,
2479  const RhsMapper& rhs,
2480  ResScalar* res, Index resIncr,
2481  ResScalar alpha)
2482 {
2483  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
2484 
2485  typedef typename Traits::LhsPacket LhsPacket;
2486  typedef typename Traits::RhsPacket RhsPacket;
2487  typedef typename Traits::ResPacket ResPacket;
2488 
2489  // The following copy tells the compiler that lhs's attributes are not modified outside this function
2490  // This helps GCC to generate proper code.
2491  LhsMapper lhs(alhs);
2492  typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
2493 
2494  eigen_internal_assert(rhs.stride() == 1);
2495  conj_helper<LhsScalar, RhsScalar, false, false> cj;
2496  conj_helper<LhsPacket, RhsPacket, false, false> pcj;
2497 
2498  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
2499  // processing 8 rows at once might be counter productive wrt cache.
2500 #ifndef GCC_ONE_VECTORPAIR_BUG
2501  const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
2502  const Index n4 = rows - 3;
2503  const Index n2 = rows - 1;
2504 #endif
2505 
2506  // TODO: for padded aligned inputs, we could enable aligned reads
2507  enum {
2508  LhsAlignment = Unaligned,
2509  ResPacketSize = Traits::ResPacketSize,
2510  LhsPacketSize = Traits::LhsPacketSize,
2511  RhsPacketSize = Traits::RhsPacketSize,
2512  };
2513 
2514  Index i = 0;
2515 #ifdef USE_GEMV_MMA
2516  __vector_quad c0, c1, c2, c3, c4, c5, c6, c7;
2517  GEMV_UNUSED_ROW(8, c)
2518 #else
2519  ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
2520 #endif
2521 #ifndef GCC_ONE_VECTORPAIR_BUG
2522  ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
2523  GEMV_PROCESS_ROW(8)
2524  GEMV_PROCESS_ROW(4)
2525  GEMV_PROCESS_ROW(2)
2526 #endif
2527  for (; i < rows; ++i)
2528  {
2529  ResPacket d0 = pset1<ResPacket>(ResScalar(0));
2530  Index j = 0;
2531  for (; j + LhsPacketSize <= cols; j += LhsPacketSize)
2532  {
2533  RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j);
2534 
2535  d0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, d0);
2536  }
2537  ResScalar dd0 = predux(d0);
2538  for (; j < cols; ++j)
2539  {
2540  dd0 += cj.pmul(lhs(i, j), rhs2(j));
2541  }
2542  res[i * resIncr] += alpha * dd0;
2543  }
2544 }
2545 
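// --- Editor's sketch (illustrative, not part of the header) -------------------
// gemv_row() above computes res[i * resIncr] += alpha * dot(lhs row i, rhs), using
// packets plus MMA where available and falling back to the scalar loop for the tail.
// A plain scalar reference of the same computation for float:
inline void sketch_gemv_row(int rows, int cols,
                            const float* lhs, int lhsStride,
                            const float* rhs,
                            float* res, int resIncr, float alpha)
{
  for (int i = 0; i < rows; ++i) {
    float d0 = 0.0f;
    for (int j = 0; j < cols; ++j)
      d0 += lhs[i * lhsStride + j] * rhs[j];  // row-major access, unit-stride rhs
    res[i * resIncr] += alpha * d0;
  }
}
// ------------------------------------------------------------------------------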
2546 #define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(Scalar) \
2547 template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
2548 struct general_matrix_vector_product<Index, Scalar, LhsMapper, ColMajor, ConjugateLhs, Scalar, RhsMapper, ConjugateRhs, Version> \
2549 { \
2550  typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar; \
2551 \
2552  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
2553  Index rows, Index cols, \
2554  const LhsMapper& lhs, \
2555  const RhsMapper& rhs, \
2556  ResScalar* res, Index resIncr, \
2557  ResScalar alpha) { \
2558  gemv_col<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
2559  } \
2560 };
2561 
2562 #define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(Scalar) \
2563 template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
2564 struct general_matrix_vector_product<Index, Scalar, LhsMapper, RowMajor, ConjugateLhs, Scalar, RhsMapper, ConjugateRhs, Version> \
2565 { \
2566  typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar; \
2567 \
2568  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
2569  Index rows, Index cols, \
2570  const LhsMapper& lhs, \
2571  const RhsMapper& rhs, \
2572  ResScalar* res, Index resIncr, \
2573  ResScalar alpha) { \
2574  gemv_row<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
2575  } \
2576 };
2577 
2578 EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(float)
2579 EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(double)
2580 EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(float)
2581 EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(double)
2582 
2583 #ifdef USE_GEMV_MMA
2584 #define gemv_bf16_col gemvMMA_bfloat16_col
2585 #define gemv_bf16_row gemvMMA_bfloat16_row
2586 #else
2587 #define gemv_bf16_col gemv_bfloat16_col
2588 #define gemv_bf16_row gemv_bfloat16_row
2589 #endif
2590 
2591 #define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL_BFLOAT16() \
2592 template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
2593 struct general_matrix_vector_product<Index, bfloat16, LhsMapper, ColMajor, ConjugateLhs, bfloat16, RhsMapper, ConjugateRhs, Version> \
2594 { \
2595  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
2596  Index rows, Index cols, \
2597  const LhsMapper& lhs, \
2598  const RhsMapper& rhs, \
2599  bfloat16* res, Index resIncr, \
2600  bfloat16 alpha) { \
2601  gemv_bf16_col<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha); \
2602  } \
2603 };
2604 
2605 #define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW_BFLOAT16() \
2606 template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
2607 struct general_matrix_vector_product<Index, bfloat16, LhsMapper, RowMajor, ConjugateLhs, bfloat16, RhsMapper, ConjugateRhs, Version> \
2608 { \
2609  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
2610  Index rows, Index cols, \
2611  const LhsMapper& lhs, \
2612  const RhsMapper& rhs, \
2613  bfloat16* res, Index resIncr, \
2614  bfloat16 alpha) { \
2615  gemv_bf16_row<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha); \
2616  } \
2617 };
2618 
2619 EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL_BFLOAT16()
2620 EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW_BFLOAT16()
2621 
2622 template<typename ResScalar, typename PResPacket, typename ResPacket, typename LhsPacket, typename RhsPacket>
2623 EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(PResPacket& a0, PResPacket& b0, ResPacket& a1, ResPacket& b1)
2624 {
2625  if (GEMV_IS_COMPLEX_COMPLEX) {
2626  a0 = padd(a0, a1);
2627  b0 = padd(b0, b1);
2628  }
2629  return predux_complex<ResScalar, PResPacket>(a0, b0);
2630 }
2631 
2632 #define GEMV_LOADPACKET_ROW_COMPLEX(iter) \
2633  loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + (iter), j)
2634 
2635 #define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) \
2636  convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter))
2637 
2638 #define GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(which, N) \
2639  j = 0; \
2640  for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
2641  const RhsScalar& b1 = rhs2(j); \
2642  RhsScalar* b = const_cast<RhsScalar *>(&b1); \
2643  GEMV_UNROLL_ROW(which, N) \
2644  }
2645 
2646 #define GEMV_PROCESS_END_ROW_COMPLEX(N) \
2647  for (; j < cols; ++j) { \
2648  RhsScalar b0 = rhs2(j); \
2649  GEMV_UNROLL_ROW_HALF(GEMV_MULT_COMPLEX, (N >> 1)) \
2650  } \
2651  GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW_COMPLEX, (N >> 1))
2652 
2653 #ifdef USE_GEMV_MMA
2654 #define GEMV_INIT_ROW_COMPLEX_MMA(iter, N) \
2655  if (GEMV_GETN_COMPLEX(N) > iter) { \
2656  __builtin_mma_xxsetaccz(&e0##iter); \
2657  }
2658 
2659 #define GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter1, iter2) \
2660  GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter2), GEMV_LOADPACKET_ROW_COMPLEX_DATA((iter2) + 1));
2661 
2662 #define GEMV_WORK_ROW_COMPLEX_MMA(iter, N) \
2663  if (GEMV_GETN_COMPLEX(N) > iter) { \
2664  if (GEMV_IS_COMPLEX_FLOAT) { \
2665  PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \
2666  gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter); \
2667  } else { \
2668  __vector_pair a##iter; \
2669  GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter, iter << 1) \
2670  gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter); \
2671  } \
2672  }
2673 
2674 #define GEMV_PREDUX4_COMPLEX_MMA(iter1, iter2, iter3, N) \
2675  if (N > iter1) { \
2676  if (GEMV_IS_COMPLEX_FLOAT) { \
2677  cc##iter1 = predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter2, &e0##iter3); \
2678  } else { \
2679  cc##iter1 = predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter1); \
2680  } \
2681  } else { \
2682  EIGEN_UNUSED_VARIABLE(cc##iter1); \
2683  }
2684 
2685 #define GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \
2686  GEMV_UNROLL_ROW(GEMV_INIT_ROW_COMPLEX_MMA, N) \
2687  GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX_MMA, N)
2688 
2689 #define GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N) \
2690  for (; i < n##N; i += N) { \
2691  GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \
2692  GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_MMA, (N >> 1)) \
2693  GEMV_PROCESS_END_ROW_COMPLEX(N); \
2694  }
2695 #endif
2696 
2697 #define GEMV_WORK_ROW_COMPLEX(iter, N) \
2698  if (N > iter) { \
2699  PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \
2700  gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, c0##iter, c1##iter); \
2701  }
2702 
2703 #define GEMV_PREDUX4_COMPLEX(iter1, iter2, iter3, N) \
2704  if (N > iter1) { \
2705  cc##iter1 = predux_complex<ResScalar, PResPacket, ResPacket, LhsPacket, RhsPacket>(c0##iter2, c0##iter3, c1##iter2, c1##iter3); \
2706  } else { \
2707  EIGEN_UNUSED_VARIABLE(cc##iter1); \
2708  }
2709 
2710 #define GEMV_MULT_COMPLEX(iter1, iter2, iter3, N) \
2711  if (N > iter1) { \
2712  cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), b0); \
2713  cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), b0); \
2714  }
2715 
2716 #define GEMV_STORE_ROW_COMPLEX(iter1, iter2, iter3, N) \
2717  if (N > iter1) { \
2718  storeMaddData<ResScalar>(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \
2719  storeMaddData<ResScalar>(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \
2720  }
2721 
2722 #define GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
2723  GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX, N) \
2724  GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX, N)
2725 
2727 #define GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \
2728  for (; i < n##N; i += N) { \
2729  GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
2730  GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX, (N >> 1)) \
2731  GEMV_PROCESS_END_ROW_COMPLEX(N); \
2732  }
2733 
2734 #define GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \
2735  if (GEMV_IS_COMPLEX_COMPLEX) { \
2736  c0##iter = padd(c0##iter, c1##iter); \
2737  } \
2738  dd0 = predux(c0##iter);
2739 
2740 #if EIGEN_COMP_LLVM
2741 #define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \
2742  GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N)
2743 
2744 #define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \
2745  GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)
2746 
2747 #define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \
2748  GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter)
2749 #else
2750 // gcc seems to be reading and writing registers unnecessarily to memory.
2751 // Use the old way for complex double until it is fixed.
2752 
2753 #define GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter) \
2754  lhs.template load<LhsPacket, LhsAlignment>(i + (iter), j)
2755 
2756 #define GEMV_INIT_COMPLEX_OLD(iter, N) \
2757  EIGEN_UNUSED_VARIABLE(c0##iter); \
2758  if (N > iter) { \
2759  c1##iter = pset_zero<ResPacket>(); \
2760  } else { \
2761  EIGEN_UNUSED_VARIABLE(c1##iter); \
2762  }
2763 
2764 #define GEMV_WORK_ROW_COMPLEX_OLD(iter, N) \
2765  if (N > iter) { \
2766  LhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter); \
2767  c1##iter = pcj.pmadd(a##iter, b0, c1##iter); \
2768  }
2769 
2770 #define GEMV_PREDUX4_COMPLEX_OLD(iter1, iter2, iter3, N) \
2771  if (N > iter1) { \
2772  cc##iter1.scalar[0] = predux(c1##iter2); \
2773  cc##iter1.scalar[1] = predux(c1##iter3); \
2774  } else { \
2775  EIGEN_UNUSED_VARIABLE(cc##iter1); \
2776  }
2777 
2778 #define GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
2779  GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX_OLD, N) \
2780  j = 0; \
2781  for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
2782  RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j); \
2783  GEMV_UNROLL_ROW(GEMV_WORK_ROW_COMPLEX_OLD, N) \
2784  }
2785 
2786 #define GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \
2787  for (; i < n##N; i += N) { \
2788  GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
2789  GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_OLD, (N >> 1)) \
2790  GEMV_PROCESS_END_ROW_COMPLEX(N) \
2791  }
2792 
2793 #define GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \
2794  dd0 = predux(c1##iter);
2795 
2796 #if (__GNUC__ > 10)
2797 #define GEMV_PROCESS_ROW_COMPLEX_IS_NEW 1
2798 #else
2799 #define GEMV_PROCESS_ROW_COMPLEX_IS_NEW \
2800  (sizeof(Scalar) == sizeof(float)) || GEMV_IS_COMPLEX_COMPLEX
2801 #endif
2802 
2803 #define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \
2804  if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
2805  GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
2806  } else { \
2807  GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
2808  }
2809 
2810 #define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \
2811  if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
2812  GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \
2813  } else { \
2814  GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \
2815  }
2816 
2817 #define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \
2818  if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
2819  GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \
2820  } else { \
2821  GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \
2822  }
2823 #endif
2824 
2825 #ifdef USE_GEMV_MMA
2826 #define GEMV_PROCESS_ROW_COMPLEX(N) \
2827  GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N)
2828 #else
2829 #define GEMV_PROCESS_ROW_COMPLEX(N) \
2830  GEMV_PROCESS_ROW_COMPLEX_ONE(N)
2831 #endif
2832 
2833 template<typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
2834 EIGEN_STRONG_INLINE void gemv_complex_row(
2835  Index rows, Index cols,
2836  const LhsMapper& alhs,
2837  const RhsMapper& rhs,
2838  ResScalar* res, Index resIncr,
2839  ResScalar alpha)
2840 {
2841  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
2842 
2843  typedef typename Traits::LhsPacket LhsPacket;
2844  typedef typename Traits::RhsPacket RhsPacket;
2845  typedef typename Traits::ResPacket ResPacket;
2846 
2847  typedef typename packet_traits<Scalar>::type ScalarPacket;
2848  typedef typename packet_traits<LhsScalar>::type PLhsPacket;
2849  typedef typename packet_traits<ResScalar>::type PResPacket;
2850  typedef gemv_traits<ResPacket, ResPacket> PTraits;
2851 
2852  // The following copy tells the compiler that lhs's attributes are not modified outside this function
2853  // This helps GCC to generate proper code.
2854  LhsMapper lhs(alhs);
2855  typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
2856 
2857  eigen_internal_assert(rhs.stride() == 1);
2858  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
2859 #if !EIGEN_COMP_LLVM
2860  conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
2861 #endif
2862 
2863  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
2864  // processing 8 rows at once might be counter productive wrt cache.
2865 #ifndef GCC_ONE_VECTORPAIR_BUG
2866  const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
2867  const Index n4 = rows - 3;
2868  const Index n2 = rows - 1;
2869 #endif
2870 
2871  // TODO: for padded aligned inputs, we could enable aligned reads
2872  enum {
2873  LhsAlignment = Unaligned,
2874  ResPacketSize = PTraits::ResPacketSize,
2875  LhsPacketSize = PTraits::LhsPacketSize,
2876  RhsPacketSize = PTraits::RhsPacketSize,
2877  };
2878 
2879  Index i = 0, j;
2880  PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
2881  ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
2882 #ifdef USE_GEMV_MMA
2883  __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
2884  GEMV_UNUSED_ROW(8, e0)
2885  GEMV_UNUSED_EXTRA(1, c0)
2886  GEMV_UNUSED_EXTRA(1, c1)
2887 #endif
2888  ResScalar dd0;
2889 #ifndef GCC_ONE_VECTORPAIR_BUG
2890  ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
2891 #ifdef USE_GEMV_MMA
2893 #endif
2894  {
2896  }
2899 #endif
2900  for (; i < rows; ++i)
2901  {
2902  GEMV_PROCESS_ROW_COMPLEX_SINGLE(1)
2903  GEMV_PROCESS_ROW_COMPLEX_PREDUX(0)
2904  for (; j < cols; ++j)
2905  {
2906  dd0 += cj.pmul(lhs(i, j), rhs2(j));
2907  }
2908  res[i * resIncr] += alpha * dd0;
2909  }
2910 }
2911 
2912 #define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(Scalar, LhsScalar, RhsScalar) \
2913 template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
2914 struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version> \
2915 { \
2916  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; \
2917 \
2918  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
2919  Index rows, Index cols, \
2920  const LhsMapper& lhs, \
2921  const RhsMapper& rhs, \
2922  ResScalar* res, Index resIncr, \
2923  ResScalar alpha) { \
2924  gemv_complex_col<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
2925  } \
2926 };
2927 
2928 #define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(Scalar, LhsScalar, RhsScalar) \
2929 template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
2930 struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version> \
2931 { \
2932  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; \
2933 \
2934  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
2935  Index rows, Index cols, \
2936  const LhsMapper& lhs, \
2937  const RhsMapper& rhs, \
2938  ResScalar* res, Index resIncr, \
2939  ResScalar alpha) { \
2940  gemv_complex_row<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
2941  } \
2942 };
2943 
2944 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, float, std::complex<float>)
2945 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, float)
2946 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, std::complex<float>)
2947 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, double, std::complex<double>)
2948 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex<double>, double)
2949 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex<double>, std::complex<double>)
2950 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, float, std::complex<float>)
2951 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, float)
2952 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, std::complex<float>)
2953 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, double, std::complex<double>)
2954 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex<double>, double)
2955 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex<double>, std::complex<double>)
2956 
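// --- Editor's sketch (illustrative, not part of the header) -------------------
// The specializations above are picked up automatically by Eigen's product dispatch.
// For example, a complex matrix-vector product on a column-major matrix would route
// through general_matrix_vector_product<..., ColMajor, ...> and reach gemv_complex_col
// (assuming the AltiVec/VSX backend is compiled in):
//
//   #include <Eigen/Dense>
//   void sketch_use() {
//     Eigen::MatrixXcf A = Eigen::MatrixXcf::Random(64, 64);
//     Eigen::VectorXcf x = Eigen::VectorXcf::Random(64);
//     Eigen::VectorXcf y = Eigen::VectorXcf::Zero(64);
//     std::complex<float> alpha(2.0f, 0.0f);
//     y.noalias() += alpha * (A * x);
//   }
// ------------------------------------------------------------------------------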
2957 #endif // EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
2958 
#define __UNPACK_TYPE__(PACKETNAME)
Array< int, Dynamic, 1 > v
Array< int, 3, 1 > b
RowXpr row(Index i)
This is the const version of row(). *‍/.
Array< double, 1, 3 > e(1./3., 0.5, 2.)
Array33i c
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:836
#define eigen_internal_assert(x)
Definition: Macros.h:908
#define EIGEN_UNUSED_VARIABLE(var)
Definition: Macros.h:957
int data[]
static Packet16uc p16uc_MERGE16_32_V1
EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_real(RealPacket &a, RealPacket &b, RealPacket &c)
EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper &lhs, Index i, Index j)
EIGEN_ALWAYS_INLINE Packet2cf pcplxflipnegate(Packet2cf a)
const Packet16uc p16uc_COMPLEX64_XORFLIP
EIGEN_ALWAYS_INLINE Packet4f pload_real(float *src)
EIGEN_ALWAYS_INLINE Packet2cf pcplxconjflip(Packet2cf a)
#define MAX_BFLOAT16_VEC_ACC_VSX
void gemv_col(Index rows, Index cols, const LhsMapper &alhs, const RhsMapper &rhs, ResScalar *res, Index resIncr, ResScalar alpha)
EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtra(Index &row, Index cols, Index rows, LhsMapper &lhs, RhsMapper &rhs, const Packet4f pAlpha, float *result)
const Packet16uc p16uc_COMPLEX32_CONJ_XOR
EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a)
Symbols defined in this file:

Macros
  #define GEMV_IS_COMPLEX_COMPLEX
  #define GEMV_IS_COMPLEX_FLOAT
  #define GEMV_IS_SCALAR
  #define COMPLEX_DELTA
  #define GEMV_BUILDPAIR_MMA(dst, src1, src2)
  #define GEMV_PROCESS_COL(N)
  #define GEMV_PROCESS_COL_ONE(N)
  #define GEMV_PROCESS_COL_COMPLEX(N)
  #define GEMV_PROCESS_COL_COMPLEX_ONE(N)
  #define GEMV_PROCESS_ROW(N)
  #define GEMV_PROCESS_ROW_COMPLEX(N)
  #define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N)
  #define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter)
  #define GEMV_MULT_COMPLEX_COMPLEX(LhsType, RhsType, ResType)
  #define GEMV_MULT_COMPLEX_REAL(LhsType, RhsType, ResType1, ResType2)
  #define GEMV_MULT_REAL_COMPLEX(LhsType, RhsType, ResType)
  #define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(Scalar)
  #define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(Scalar)
  #define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL_BFLOAT16()
  #define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW_BFLOAT16()
  #define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(Scalar, LhsScalar, RhsScalar)
  #define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(Scalar, LhsScalar, RhsScalar)

Permute masks and conjugation constants
  const Packet16uc p16uc_MERGEE
  const Packet16uc p16uc_MERGEO
  static Packet16uc p16uc_MERGE16_32_V2
  const Packet16uc p16uc_COMPLEX32_CONJ_XOR2
  const Packet16uc p16uc_COMPLEX32_NEGATE
  const Packet16uc p16uc_COMPLEX32_XORFLIP
  const Packet16uc p16uc_COMPLEX64_CONJ_XOR
  const Packet16uc p16uc_COMPLEX64_CONJ_XOR2
  const Packet16uc p16uc_COMPLEX64_NEGATE

Packet set and load helpers
  EIGEN_ALWAYS_INLINE Packet pset_zero()
  EIGEN_ALWAYS_INLINE Packet1cd pset_zero< Packet1cd >()
  EIGEN_ALWAYS_INLINE Packet2cf pset_zero< Packet2cf >()
  EIGEN_ALWAYS_INLINE Packet pset_init(Packet &c1)
  EIGEN_ALWAYS_INLINE Scalar pset1_realimag(ResScalar &alpha, int which, int conj)
  EIGEN_ALWAYS_INLINE Packet2cf pset1_complex(std::complex< float > &alpha)
  EIGEN_ALWAYS_INLINE Packet2cf padd(Packet2cf &a, std::complex< float > &b)
  EIGEN_ALWAYS_INLINE Packet2cf pconj2(const Packet2cf &a)
  EIGEN_ALWAYS_INLINE Packet2cf pconjinv(const Packet2cf &a)
  EIGEN_ALWAYS_INLINE Packet2cf pcplxflip2(Packet2cf a)
  EIGEN_ALWAYS_INLINE Packet2cf pcplxflipconj(Packet2cf a)
  EIGEN_ALWAYS_INLINE Packet4f pload_real_row(float *src)
  EIGEN_ALWAYS_INLINE Packet4f pload_real_full(float *src)
  EIGEN_ALWAYS_INLINE Packet4f pload_complex(std::complex< float > *src)
  EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex< float > *src)
  EIGEN_ALWAYS_INLINE Packet4f pload_complex_full(std::complex< float > *src)
  EIGEN_ALWAYS_INLINE Packet4f pload_complex_full_row(std::complex< float > *src)
  EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar *src, Packet4f &r, Packet4f &i)
  EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar *src, Packet4f &r, Packet4f &i)
  EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex< float > *src)
  EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine_row(std::complex< float > *src)

Multiply-accumulate and reduction helpers
  EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar *res, ResPacket &palpha, ResPacket &data)
  EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_complex(RealPacket &a, RealPacket &b, RealPacket &c)
  EIGEN_ALWAYS_INLINE ScalarPacket pmadd_complex(ScalarPacket &c0, ScalarPacket &c2, ScalarPacket &c4, AlphaData &b0)
  EIGEN_ALWAYS_INLINE void gemv_mult_generic(LhsPacket &a0, RhsScalar *b, PResPacket &c0)
  EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex(LhsPacket &a0, RhsScalar *b, PResPacket &c0, ResPacket &c1)
  EIGEN_ALWAYS_INLINE void gemv_mult_complex_real(LhsPacket &a0, RhsScalar *b, PResPacket &c0)
  EIGEN_ALWAYS_INLINE void gemv_mult_real_complex(LhsPacket &a0, RhsScalar *b, PResPacket &c0)
  EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket &c0, AlphaData &b0, ResScalar *res)
  EIGEN_ALWAYS_INLINE ScalarBlock< ResScalar, 2 > predux_real(ResPacket &a, ResPacket &b)
  EIGEN_ALWAYS_INLINE ScalarBlock< ResScalar, 2 > predux_complex(ResPacket &a, ResPacket &b)

VSX / bfloat16 loop helpers
  EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper &rhs, Index j)
  EIGEN_ALWAYS_INLINE Packet8us loadPacketPartialZero(Packet8us data, Index extra_cols)
  EIGEN_ALWAYS_INLINE void loadVecLoopVSX(Index k, LhsMapper &lhs, Packet4f(&a0)[num_acc][2])
  EIGEN_ALWAYS_INLINE void multVecVSX(Packet4f(&acc)[num_acc][2], Packet4f(&a0)[num_acc][2], Packet4f(&b0)[2])
  EIGEN_ALWAYS_INLINE void multVSXVecLoop(Packet4f(&acc)[num_acc][2], const LhsMapper &lhs, RhsMapper &rhs, Index j, Index extra_cols)
  EIGEN_ALWAYS_INLINE void vecColLoopVSX(Index j, LhsMapper &lhs, RhsMapper &rhs, Packet4f(&acc)[num_acc][2])
  EIGEN_ALWAYS_INLINE void vecVSXLoop(Index cols, const LhsMapper &lhs, RhsMapper &rhs, Packet4f(&acc)[num_acc][2], Index extra_cols)
  EIGEN_ALWAYS_INLINE void addResultsVSX(Packet4f(&acc)[num_acc][2])
  EIGEN_ALWAYS_INLINE void preduxVecResultsVSX(Packet4f(&acc)[num_acc][2])
  EIGEN_ALWAYS_INLINE void preduxVecResults2VSX(Packet4f(&acc)[num_acc][2], Index k)
  EIGEN_ALWAYS_INLINE void outputVecCol(Packet4f acc, float *result, Packet4f pAlpha, Index extra_rows)
  EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f(&acc)[num_acc][size], float *result, Packet4f pAlpha, Index extra_rows)
  EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f(&acc)[num_acc][size], float *result, Packet4f pAlpha)
  void colVSXVecColLoopBody(Index &row, Index cend, Index rows, LhsMapper &lhs, RhsMapper &rhs, const Packet4f pAlpha, float *result)
  EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtra(Index &row, Index cend, Index rows, LhsMapper &lhs, RhsMapper &rhs, const Packet4f pAlpha, float *result)
  EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtraN(Index &row, Index cend, Index rows, LhsMapper &lhs, RhsMapper &rhs, const Packet4f pAlpha, float *result)
  void colVSXVecLoopBody(Index &row, Index cols, Index rows, LhsMapper &lhs, RhsMapper &rhs, const Packet4f pAlpha, float *result)
  EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtraN(Index &row, Index cols, Index rows, LhsMapper &lhs, RhsMapper &rhs, const Packet4f pAlpha, float *result)
  EIGEN_ALWAYS_INLINE void calcVSXVecColLoops(Index cend, Index rows, LhsMapper &lhs, RhsMapper &rhs, const Packet4f pAlpha, float *result)
  EIGEN_ALWAYS_INLINE void calcVSXVecLoops(Index cols, Index rows, LhsMapper &lhs, RhsMapper &rhs, const Packet4f pAlpha, float *result)
  EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16 *dst, Packet8bf data, Index resInc, Index extra)
  EIGEN_ALWAYS_INLINE void convertPointerF32toBF16VSX(Index &i, float *result, Index rows, bfloat16 *&dst, Index resInc=1)
  EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16VSX(float *result, Index rows, bfloat16 *dst, Index resInc=1)

GEMV kernel entry points
  void gemv_row(Index rows, Index cols, const LhsMapper &alhs, const RhsMapper &rhs, ResScalar *res, Index resIncr, ResScalar alpha)
  void gemv_complex_col(Index rows, Index cols, const LhsMapper &alhs, const RhsMapper &rhs, ResScalar *res, Index resIncr, ResScalar alpha)
  void gemv_complex_row(Index rows, Index cols, const LhsMapper &alhs, const RhsMapper &rhs, ResScalar *res, Index resIncr, ResScalar alpha)
  void gemv_bfloat16_col(Index rows, Index cols, const LhsMapper &alhs, const RhsMapper &rhs, bfloat16 *res, Index resIncr, bfloat16 alpha)
  void gemv_bfloat16_row(Index rows, Index cols, const LhsMapper &alhs, const RhsMapper &rhs, bfloat16 *res, Index resIncr, bfloat16 alpha)
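The complex-path helpers above (pmadd_complex_complex, pconj2, pcplxflipconj, the CONJ_XOR permute masks) all revolve around performing a complex multiply-accumulate with the real and imaginary parts kept in separate lanes. A minimal scalar sketch of that split, for orientation only; it is not Eigen's vectorized kernel, and the helper name cmadd_split is made up:

// Minimal scalar sketch (not Eigen's kernel): one complex multiply-accumulate
// written out in real/imaginary lanes, the pattern the packet helpers vectorize.
#include <complex>

inline void cmadd_split(float ar, float ai, float br, float bi,
                        float& acc_r, float& acc_i)
{
  acc_r += ar * br - ai * bi;  // real lane: multiply-add, then negated multiply-add
  acc_i += ar * bi + ai * br;  // imaginary lane: two multiply-adds on swapped operands
}

int main()
{
  std::complex<float> a(1.0f, 2.0f), b(3.0f, -1.0f), acc(0.0f, 0.0f);
  float r = acc.real(), i = acc.imag();
  cmadd_split(a.real(), a.imag(), b.real(), b.imag(), r, i);
  return (std::complex<float>(r, i) == acc + a * b) ? 0 : 1;  // exact for these small values
}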
Referenced from other Eigen headers
  #define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER)   (Memory.h:847)
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
  Unaligned (enum value, Constants.h:235)
  ColMajor (enum value, Constants.h:321)
  float bfloat16_to_float(__bfloat16_raw h)   (BFloat16.h:571)
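bfloat16_to_float above, together with the convertF32toBF16VSX / convertArrayPointerF32toBF16VSX helpers earlier in the index, moves data between float and bfloat16. Conceptually a bfloat16 is the upper 16 bits of an IEEE-754 binary32; the scalar sketch below shows that relationship with truncation for brevity, whereas Eigen's conversion rounds to nearest-even and the VSX helpers operate on whole packets:

// Conceptual scalar sketch of the bfloat16 <-> float relationship.
#include <cstdint>
#include <cstring>

inline std::uint16_t float_to_bf16_truncate(float f)
{
  std::uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<std::uint16_t>(bits >> 16);  // keep sign, exponent, top 7 mantissa bits
}

inline float bf16_to_float(std::uint16_t h)
{
  std::uint32_t bits = static_cast<std::uint32_t>(h) << 16;  // re-widen; low mantissa bits become zero
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}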
Referenced packet primitives and types
  __vector float Packet4f
  __vector unsigned short int Packet8us
  __vector unsigned char Packet16uc
  eigen_packet_wrapper< __vector unsigned short int, 0 > Packet8bf
  static Packet16uc p16uc_TRANSPOSE64_HI
  Packet4f pset1< Packet4f >(const float &from)
  Packet2d pset1< Packet2d >(const double &from)
  Packet16uc pset1< Packet16uc >(const unsigned char &from)
  Packet4f ploadu< Packet4f >(const float *from)
  Packet2d ploadu< Packet2d >(const double *from)
  Packet2cf ploadu< Packet2cf >(const std::complex< float > *from)
  Packet1cd ploadu< Packet1cd >(const std::complex< double > *from)   (MSA/Complex.h:453)
  Packet2d ploaddup< Packet2d >(const double *from)
  Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
  Packet8h pxor(const Packet8h &a, const Packet8h &b)
  Packet1cd pcplxflip(const Packet1cd &x)   (MSA/Complex.h:617)
  unpacket_traits< Packet >::type predux(const Packet &a)
  void pstoreu(Scalar *to, const Packet &from)
  void pstoreu_partial(Scalar *to, const Packet &from, const Index n, const Index offset=0)
  void pscatter(Scalar *to, const Packet &from, Index stride, typename unpacket_traits< Packet >::mask_t umask)
  void pscatter_partial(Scalar *to, const Packet &from, Index stride, const Index n)
  EIGEN_ALWAYS_INLINE Packet8bf pgather< bfloat16, Packet8bf >(const bfloat16 *from, Index stride)
  EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Hi(Packet8us data)
  EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Lo(Packet8us data)
  EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Perm(Packet8us data, Packet16uc mask)
  EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16VSX(const float *res)
  EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float *result, Index cols, Index rows, bfloat16 *src, Index resInc)
  EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
  EIGEN_DEFAULT_DENSE_INDEX_TYPE Index   — the Index type as used for the API (Meta.h:82)
  const Eigen::CwiseUnaryOp< Eigen::internal::scalar_conjugate_op< typename Derived::Scalar >, const Derived > conj(const Eigen::ArrayBase< Derived > &x)
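The packet primitives above (pset1, ploadu, pmadd, pstoreu, predux) are the building blocks the GEMV loops are expressed in. The sketch below mimics their roles with a plain 4-lane struct standing in for Packet4f; it is illustrative only, and the trailing-underscore names are made up:

// Stand-in sketch of the packet idioms, with a plain 4-lane array playing the
// role of Packet4f (the real code uses VSX vector registers).
struct Packet4 { float v[4]; };

inline Packet4 pset1_(float x)               { return {{x, x, x, x}}; }
inline Packet4 ploadu_(const float* p)       { return {{p[0], p[1], p[2], p[3]}}; }
inline void    pstoreu_(float* p, Packet4 a) { for (int k = 0; k < 4; ++k) p[k] = a.v[k]; }
inline Packet4 pmadd_(Packet4 a, Packet4 b, Packet4 c)
{
  Packet4 r;
  for (int k = 0; k < 4; ++k) r.v[k] = a.v[k] * b.v[k] + c.v[k];
  return r;
}
inline float predux_(Packet4 a)              // horizontal add of all lanes
{
  return a.v[0] + a.v[1] + a.v[2] + a.v[3];
}

// Column-major style update: res[i..i+3] += lhs_col[i..i+3] * (alpha * x[j]).
inline void axpy_step(float* res, const float* lhs_col, float alpha_times_xj)
{
  Packet4 acc = pmadd_(ploadu_(lhs_col), pset1_(alpha_times_xj), ploadu_(res));
  pstoreu_(res, acc);
}

// Row-major style reduction: lane-wise products, then a horizontal add
// collapses the four lanes into one dot-product result.
inline float dot4(const float* row, const float* x)
{
  Packet4 acc = pmadd_(ploadu_(row), ploadu_(x), pset1_(0.0f));
  return predux_(acc);
}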
Member cross-references
  alpha_store(ResScalar &alpha)
  struct alpha_store::ri separate
  static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper &rhs, Index j)
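For orientation, every kernel entry point in the index (gemv_row, gemv_complex_col, gemv_bfloat16_row, ...) computes the same operation: res += alpha * A * x, accessed through a row- or column-major mapper. A plain scalar reference of the column-major case follows as a sketch only; the mapper-based, blocked, and vectorized kernels above are far more involved:

// Scalar reference for the GEMV operation the kernels specialize:
// res[0..rows) += alpha * A * x, with A stored column-major (rows x cols).
#include <cstddef>
#include <vector>

template <typename Scalar>
void gemv_reference(std::size_t rows, std::size_t cols,
                    const std::vector<Scalar>& A,
                    const std::vector<Scalar>& x,
                    std::vector<Scalar>& res, Scalar alpha)
{
  for (std::size_t j = 0; j < cols; ++j) {
    const Scalar xj = alpha * x[j];
    for (std::size_t i = 0; i < rows; ++i)
      res[i] += A[j * rows + i] * xj;   // column j is contiguous in memory
  }
}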