MatrixProductMMA.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
5 // Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
12 #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
13 
14 // If using dynamic dispatch, set the CPU target.
15 #if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
16 #pragma GCC push_options
17 #pragma GCC target("cpu=power10,htm")
18 #endif
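// When EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH is defined (for example by compiling with
// -DEIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH), only the kernels in this file are built for a Power10
// target via the pragmas above, so the rest of the build can keep an older baseline CPU and
// select the MMA path at runtime.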
19 
20 #ifdef __has_builtin
21 #if !__has_builtin(__builtin_vsx_assemble_pair)
22 #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
23 #endif
24 #if !__has_builtin(__builtin_vsx_disassemble_pair)
25 #define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
26 #endif
27 #endif
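// Older compilers expose the pair assemble/disassemble builtins only under the __builtin_mma_*
// names; the defines above map the __builtin_vsx_* spellings used below onto those when the
// vsx-prefixed forms are unavailable.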
28 
29 #include "../../InternalHeaderCheck.h"
30 
32 
33 namespace Eigen {
34 
35 namespace internal {
36 
37 #define accColsC (accCols / 2)
38 
39 EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc)
40 {
41  __builtin_mma_xxsetaccz(acc);
42 }
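// __builtin_mma_xxsetaccz zeroes one 512-bit __vector_quad accumulator, i.e. the running
// 4x4 (float) or 4x2 (double) tile that the ger instructions below accumulate into.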
43 
44 #ifdef USE_PARTIAL_PACKETS
45 template<typename DataMapper, typename Packet, bool full>
46 EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements, __vector_quad* acc)
47 #else
48 template<typename DataMapper, typename Packet, const Index accCols, const Index accCols2>
49 EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Packet& pMask, __vector_quad* acc)
50 #endif
51 {
52  PacketBlock<Packet, 4> result;
53  __builtin_mma_disassemble_acc(&result.packet, acc);
54 
55  PacketBlock<Packet, 4> tRes;
56 #ifdef USE_PARTIAL_PACKETS
57  if (full) {
58  EIGEN_UNUSED_VARIABLE(elements);
59  bload<DataMapper, Packet, 0, ColMajor, false, 4>(tRes, data, i, 0);
60  bscale<Packet, 4>(tRes, result, alpha);
61  bstore<DataMapper, Packet, 4>(tRes, data, i);
62  } else {
63  bload_partial<DataMapper, Packet, 0, false, 4>(tRes, data, i, elements);
64  bscale<Packet, 4>(tRes, result, alpha);
65  bstore_partial<DataMapper, Packet, 4>(tRes, data, i, elements);
66  }
67 #else
68  bload<DataMapper, Packet, 0, ColMajor, false, 4>(tRes, data, i, 0);
69  bscale<Packet, 4, (accCols != accCols2)>(tRes, result, alpha, pMask);
70  bstore<DataMapper, Packet, 4>(tRes, data, i);
71 #endif
72 }
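// storeAccumulator drains one accumulator: the tile is copied out with
// __builtin_mma_disassemble_acc, the matching block of the result is loaded, updated as
// C += alpha * tile, and written back; the partial/masked variants handle leftover rows.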
73 
74 template<typename DataMapper, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
75 EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, const Packet& pMask, __vector_quad* accReal, __vector_quad* accImag)
76 {
77  constexpr bool full = (accCols2 > accColsC);
78  PacketBlock<Packet, 4> resultReal, resultImag;
79  __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
80  __builtin_mma_disassemble_acc(&resultImag.packet, accImag);
81 
82  PacketBlock<Packetc, 8> tRes;
83  bload<DataMapper, Packetc, accColsC, ColMajor, true, 4, full>(tRes, data, i, 0);
84 
85  PacketBlock<Packet, 4> taccReal, taccImag;
86  bscalec<Packet, 4, (accCols != accCols2)>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag, pMask);
87 
88  PacketBlock<Packetc, 4> acc1, acc2;
89  bcouple<Packet, Packetc, 4, full>(taccReal, taccImag, tRes, acc1, acc2);
90 
91  bstore<DataMapper, Packetc, 4>(acc1, data, i);
92  if (full) {
93  bstore<DataMapper, Packetc, 4>(acc2, data, i + accColsC);
94  }
95 }
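// The complex store works on two accumulators, since real and imaginary parts are accumulated
// separately: bscalec applies the complex alpha to both, and bcouple re-interleaves the two
// planes into complex-valued packets before the result block is updated.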
96 
97 // Defaults to float32; since Eigen still supports C++03, we can't use default template arguments.
98 template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
99 EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b)
100 {
101  if(NegativeAccumulate)
102  {
103  __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
104  } else {
105  __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
106  }
107 }
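// xvf32gerpp accumulates the 4x4 outer product of the four floats in 'a' with the four floats
// in 'b'; xvf32gernp subtracts that product instead. A minimal sketch of the same builtin
// sequence outside Eigen (illustrative only; 'a_col'/'b_row' are hypothetical pointers to
// 4-float vectors and a Power10 target is assumed):
//
//   __vector_quad acc;
//   __builtin_mma_xxsetaccz(&acc);
//   for (int k = 0; k < depth; k++)
//     __builtin_mma_xvf32gerpp(&acc, (__vector unsigned char)a_col[k], (__vector unsigned char)b_row[k]);
//   __vector float tile[4];
//   __builtin_mma_disassemble_acc(tile, &acc); // tile[0..3] now hold the 4x4 result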
108 
109 template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
110 EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b)
111 {
112  if(NegativeAccumulate)
113  {
114  __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
115  } else {
116  __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
117  }
118 }
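// The f64 forms take a __vector_pair (four doubles spanning two VSX registers) as the first
// multiplicand and a Packet2d as the second, yielding a 4x2 double tile per instruction; this
// is why the double path below feeds the rhs to the kernel as __vector_pair values.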
119 
120 template<typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
121 EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, Packet& lhsVi, const RhsPacket& rhsV, RhsPacket& rhsVi)
122 {
123  pgerMMA<Packet, RhsPacket, false>(accReal, rhsV, lhsV);
124  if(LhsIsReal) {
125  pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
126  EIGEN_UNUSED_VARIABLE(lhsVi);
127  } else {
128  if(!RhsIsReal) {
129  pgerMMA<Packet, RhsPacket, ConjugateLhs == ConjugateRhs>(accReal, rhsVi, lhsVi);
130  pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
131  } else {
132  EIGEN_UNUSED_VARIABLE(rhsVi);
133  }
134  pgerMMA<Packet, RhsPacket, ConjugateLhs>(accImag, rhsV, lhsVi);
135  }
136 }
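// Complex rank-1 update assembled from real ger operations: accReal receives real*real and,
// with the sign chosen by the conjugation flags, imag*imag, while accImag receives the two
// cross terms; LhsIsReal/RhsIsReal prune the contributions that do not exist.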
137 
138 // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
139 template<typename Packet>
140 EIGEN_ALWAYS_INLINE Packet ploadRhs(const __UNPACK_TYPE__(Packet)* rhs)
141 {
142  return ploadu<Packet>(rhs);
143 }
144 
145 template<typename Scalar, typename Packet>
146 EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
147 {
148  rhsV = ploadRhs<Packet>(rhs);
149 }
150 
151 template<>
152 EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV)
153 {
154 #if EIGEN_COMP_LLVM
155  __builtin_vsx_assemble_pair(&rhsV,
156  reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
157  reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs)));
158 #else
159  rhsV = *reinterpret_cast<__vector_pair *>(const_cast<double *>(rhs));
160 #endif
161 }
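// With clang the pair is assembled from two 16-byte loads (the higher-addressed half is passed
// first), whereas the GCC path dereferences a __vector_pair directly, which typically maps onto
// a single paired load (lxvp).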
162 
163 EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
164 {
165  ploadRhsMMA(lhs, lhsV);
166 }
167 
168 #if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
169 #define VECTOR_PAIR_LOADS_LHS
170 #endif
171 
172 // PEEL_MMA loop factor.
173 #define PEEL_MMA 7
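// The main k loop advances PEEL_MMA iterations per pass; rhsV is sized 8 and every
// MICRO_MMA_TYPE_PEEL body is guarded by `PEEL_MMA > peel`, so peel slots beyond the factor are
// discarded at compile time.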
174 
175 #define MICRO_MMA_UNROLL(func) \
176  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
177 
178 #define MICRO_MMA_WORK(func, type, peel) \
179  func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
180  func(4,type,peel) func(5,type,peel) func(6,type,peel) func(7,type,peel)
181 
182 #define MICRO_MMA_WORK_ONE(iter, type, peel) \
183  if (unroll_factor > iter) { \
184  pgerMMA<Packet, type, false>(&accZero##iter, rhsV[peel], lhsV##iter); \
185  }
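// unroll_factor is a compile-time template parameter, so the `unroll_factor > iter` guards in
// these macros fold to constants and unused accumulators, pointers and loads are optimized away.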
186 
187 #ifdef VECTOR_PAIR_LOADS_LHS
188 #define MICRO_MMA_WORK_TWO(iter, type, peel) \
189  if (unroll_factor > iter) { \
190  pgerMMA<Packet, type, false>(&accZero##iter, rhsV[peel], lhsV2##iter.packet[peel & 1]); \
191  }
192 
193 #define MICRO_MMA_LOAD1_TWO(lhs_ptr, iter) \
194  if (unroll_factor > iter) { \
195  if (MICRO_NORMAL(iter)) { \
196  ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##iter), plhsV##iter); \
197  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsV2##iter.packet), &plhsV##iter); \
198  lhs_ptr##iter += accCols*2; \
199  } else { \
200  lhsV2##iter.packet[0] = ploadLhs<Packet>(lhs_ptr##iter); \
201  lhsV2##iter.packet[1] = ploadLhs<Packet>(lhs_ptr##iter + accCols2); \
202  lhs_ptr##iter += accCols2*2; \
203  EIGEN_UNUSED_VARIABLE(plhsV##iter) \
204  } \
205  } else { \
206  EIGEN_UNUSED_VARIABLE(lhsV2##iter); \
207  EIGEN_UNUSED_VARIABLE(plhsV##iter) \
208  }
209 
210 #define MICRO_MMA_LOAD_TWO(iter) MICRO_MMA_LOAD1_TWO(lhs_ptr, iter)
211 #endif
212 
213 #define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
214  if (PEEL_MMA > peel) { \
215  Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
216  ploadRhsMMA(rhs_ptr + (accRows * peel), rhsV[peel]); \
217  MICRO_MMA_UNROLL(funcl) \
218  MICRO_MMA_WORK(funcw, type, peel) \
219  }
220 
221 #ifndef VECTOR_PAIR_LOADS_LHS
222 #define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
223  type rhsV[8]; \
224  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,1) \
225  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,3) \
226  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,4) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,5) \
227  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,6) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,7)
228 #else
229 #define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
230  if (PEEL_MMA > peel2) { \
231  PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
232  __vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7; \
233  if (sizeof(type) == 16) { \
234  ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr + (accRows * peel1)), prhsV##peel1); \
235  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV[peel1]), &prhsV##peel1); \
236  } else { \
237  EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
238  ploadRhsMMA(rhs_ptr + (accRows * peel1), rhsV[peel1]); \
239  ploadRhsMMA(rhs_ptr + (accRows * peel2), rhsV[peel2]); \
240  } \
241  MICRO_MMA_UNROLL(funcl2) \
242  MICRO_MMA_WORK(funcw2, type, peel1) \
243  MICRO_MMA_WORK(funcw2, type, peel2) \
244  } else { \
245  EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
246  MICRO_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
247  }
248 
249 #define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
250  type rhsV[8]; \
251  __vector_pair prhsV0, prhsV2, prhsV4, prhsV6; \
252  MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
253  MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) \
254  MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,4,5) \
255  MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,6,7)
256 #endif
257 
258 #define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
259  type rhsV[1]; \
260  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0)
261 
262 #define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size) \
263  MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, RhsPacket) \
264  rhs_ptr += (accRows * size);
265 
266 #ifndef VECTOR_PAIR_LOADS_LHS
267 #define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_PEEL, PEEL_MMA)
268 #else
269 #define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size) \
270  MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
271  rhs_ptr += (accRows * size);
272 
273 #define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_UNROLL_TYPE_PEEL2, PEEL_MMA)
274 #endif
275 
276 #define MICRO_MMA_ONE MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_ONE, 1)
277 
278 #define MICRO_MMA_DST_PTR_ONE(iter) \
279  if (unroll_factor > iter) { \
280  bsetzeroMMA(&accZero##iter); \
281  } else { \
282  EIGEN_UNUSED_VARIABLE(accZero##iter); \
283  }
284 
285 #define MICRO_MMA_DST_PTR MICRO_MMA_UNROLL(MICRO_MMA_DST_PTR_ONE)
286 
287 #define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_SRC_PTR_ONE)
288 
289 #define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_PREFETCH_ONE)
290 
291 #ifdef USE_PARTIAL_PACKETS
292 #define MICRO_MMA_STORE_ONE(iter) \
293  if (unroll_factor > iter) { \
294  storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(iter)>(row + iter*accCols, res, pAlpha, accCols2, &accZero##iter); \
295  }
296 #else
297 #define MICRO_MMA_STORE_ONE(iter) \
298  if (unroll_factor > iter) { \
299  storeAccumulator<DataMapper, Packet, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlpha, pMask, &accZero##iter); \
300  }
301 #endif
302 
303 #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
304 
305 #ifdef USE_PARTIAL_PACKETS
306 template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool full>
307 #else
308 template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2>
309 #endif
310 EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
311  const DataMapper& res,
312  const Scalar* lhs_base,
313  const Scalar* rhs_base,
314  Index depth,
315  Index strideA,
316  Index offsetA,
317  Index& row,
318  const Packet& pAlpha,
319 #ifdef USE_PARTIAL_PACKETS
320  Index accCols2
321 #else
322  const Packet& pMask
323 #endif
324  )
325 {
326  const Scalar* rhs_ptr = rhs_base;
327  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
328  __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
329 
330  MICRO_MMA_SRC_PTR
331  MICRO_MMA_DST_PTR
332 
333  Index k = 0, depth2 = depth - PEEL_MMA;
334  for(; k <= depth2; k += PEEL_MMA)
335  {
336  EIGEN_POWER_PREFETCH(rhs_ptr);
337  MICRO_MMA_PREFETCH
338  MICRO_MMA_ONE_PEEL
339  }
340  for(; k < depth; k++)
341  {
342  MICRO_MMA_ONE
343  }
344  MICRO_MMA_STORE
345 
346  MICRO_UPDATE
347 }
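// gemm_unrolled_MMA_iteration above is the micro-kernel: for a fixed unroll_factor it keeps
// unroll_factor accumulators (one accRows x accCols tile each) live, streams the packed lhs/rhs
// through the peeled k loop plus a one-step remainder loop, and stores each tile scaled by alpha.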
348 
349 #ifdef USE_PARTIAL_PACKETS
350 #define MICRO_MMA_UNROLL_ITER2(N, M) \
351  gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M>(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, M ? remaining_rows : accCols); \
352  if (M) return;
353 #else
354 #define MICRO_MMA_UNROLL_ITER2(N, M) \
355  gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, row, pAlpha, pMask); \
356  if (M) return;
357 #endif
358 
359 template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
360 EIGEN_ALWAYS_INLINE void gemmMMA_cols(
361  const DataMapper& res,
362  const Scalar* blockA,
363  const Scalar* blockB,
364  Index depth,
365  Index strideA,
366  Index offsetA,
367  Index strideB,
368  Index offsetB,
369  Index col,
370  Index rows,
371  Index remaining_rows,
372  const Packet& pAlpha,
373  const Packet& pMask)
374 {
375  const DataMapper res3 = res.getSubMapper(0, col);
376 
377  const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
378  const Scalar* lhs_base = blockA + accCols*offsetA;
379  Index row = 0;
380 
381 #define MAX_MMA_UNROLL 7
382  while(row + MAX_MMA_UNROLL*accCols <= rows) {
383  MICRO_MMA_UNROLL_ITER2(MAX_MMA_UNROLL, 0);
384  }
385  switch( (rows-row)/accCols ) {
386 #if MAX_MMA_UNROLL > 7
387  case 7:
388  MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 7)
389  break;
390 #endif
391 #if MAX_MMA_UNROLL > 6
392  case 6:
393  MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 6)
394  break;
395 #endif
396 #if MAX_MMA_UNROLL > 5
397  case 5:
398  MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 5)
399  break;
400 #endif
401 #if MAX_MMA_UNROLL > 4
402  case 4:
403  MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 4)
404  break;
405 #endif
406 #if MAX_MMA_UNROLL > 3
407  case 3:
408  MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 3)
409  break;
410 #endif
411 #if MAX_MMA_UNROLL > 2
412  case 2:
413  MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 2)
414  break;
415 #endif
416 #if MAX_MMA_UNROLL > 1
417  case 1:
418  MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 1)
419  break;
420 #endif
421  default:
422  break;
423  }
424 #undef MAX_MMA_UNROLL
425 
426  if(remaining_rows > 0)
427  {
428  gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
429  }
430 }
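// gemmMMA_cols consumes row panels MAX_MMA_UNROLL at a time; the switch mops up the remaining
// full panels with a smaller unroll, and gemm_extra_row handles the final partial panel of
// remaining_rows.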
431 
432 template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
433 void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
434 {
435  const Index remaining_rows = rows % accCols;
436 
437  if( strideA == -1 ) strideA = depth;
438  if( strideB == -1 ) strideB = depth;
439 
440  const Packet pAlpha = pset1<Packet>(alpha);
441  const Packet pMask = bmask<Packet>(remaining_rows);
442 
443  typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
444 
445  Index col = 0;
446  for(; col + accRows <= cols; col += accRows)
447  {
448  gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
449  }
450 
451  if (col != cols)
452  {
453  gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
454  }
455 }
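// gemmMMA is the entry point for the real-valued MMA GEMM over packed blocks. RhsPacket2 picks
// __vector_pair for double so the rhs can be fed straight to the 64-bit ger form, while float
// keeps the plain RhsPacket; the AltiVec MatrixProduct code dispatches here when MMA is usable.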
456 
457 #define advanceRows ((LhsIsReal) ? 1 : 2)
458 #define advanceCols ((RhsIsReal) ? 1 : 2)
459 
460 // PEEL_COMPLEX_MMA loop factor.
461 #define PEEL_COMPLEX_MMA 3
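// The complex kernel peels fewer k iterations and unrolls only four tiles: each tile needs a
// real and an imaginary accumulator, and Power10 provides eight accumulator registers in total.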
462 
463 #define MICRO_COMPLEX_MMA_UNROLL(func) \
464  func(0) func(1) func(2) func(3)
465 
466 #define MICRO_COMPLEX_MMA_WORK(func, type, peel) \
467  func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel)
468 
469 #define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel) \
470  if (unroll_factor > iter) { \
471  pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV[peel], rhsVi[peel]); \
472  }
473 
474 #ifdef VECTOR_PAIR_LOADS_LHS
475 #define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel) \
476  if (unroll_factor > iter) { \
477  pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV2##iter.packet[peel & 1], lhsVi2##iter.packet[peel & 1], rhsV[peel], rhsVi[peel]); \
478  }
479 
480 #define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter) \
481  if (!LhsIsReal && (unroll_factor > iter)) { \
482  if (MICRO_NORMAL(iter)) { \
483  ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##iter + imag_delta), plhsVi##iter); \
484  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsVi2##iter.packet), &plhsVi##iter); \
485  } else { \
486  lhsVi2##iter.packet[0] = ploadLhs<Packet>(lhs_ptr_real##iter + imag_delta2); \
487  lhsVi2##iter.packet[1] = ploadLhs<Packet>(lhs_ptr_real##iter + imag_delta2 + accCols2); \
488  EIGEN_UNUSED_VARIABLE(plhsVi##iter) \
489  } \
490  } else { \
491  EIGEN_UNUSED_VARIABLE(lhsVi2##iter); \
492  EIGEN_UNUSED_VARIABLE(plhsVi##iter) \
493  } \
494  MICRO_MMA_LOAD1_TWO(lhs_ptr_real, iter)
495 
496 #define MICRO_COMPLEX_MMA_LOAD_TWO(iter) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, iter)
497 #endif
498 
499 #define MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
500  if (PEEL_COMPLEX_MMA > peel) { \
501  Packet lhsV0, lhsV1, lhsV2, lhsV3; \
502  Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
503  ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV[peel]); \
504  if(!RhsIsReal) { \
505  ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi[peel]); \
506  } \
507  MICRO_COMPLEX_MMA_UNROLL(funcl) \
508  MICRO_COMPLEX_MMA_WORK(funcw, type, peel) \
509  }
510 
511 #ifndef VECTOR_PAIR_LOADS_LHS
512 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
513  type rhsV[4], rhsVi[4]; \
514  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,1) \
515  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,3)
516 #else
517 #define MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
518  if (PEEL_COMPLEX_MMA > peel2) { \
519  PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23; \
520  PacketBlock<Packet,2> lhsVi20, lhsVi21, lhsVi22, lhsVi23; \
521  __vector_pair plhsV0, plhsV1, plhsV2, plhsV3; \
522  __vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3; \
523  if (sizeof(type) == 16) { \
524  ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real + (accRows * peel1)), prhsV##peel1); \
525  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV[peel1]), &prhsV##peel1); \
526  if(!RhsIsReal) { \
527  ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_imag + (accRows * peel1)), prhsVi##peel1); \
528  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi[peel1]), &prhsVi##peel1); \
529  } else { \
530  EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
531  } \
532  } else { \
533  EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
534  EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
535  ploadRhsMMA(rhs_ptr_real + (accRows * peel1), rhsV[peel1]); \
536  ploadRhsMMA(rhs_ptr_real + (accRows * peel2), rhsV[peel2]); \
537  if(!RhsIsReal) { \
538  ploadRhsMMA(rhs_ptr_imag + (accRows * peel1), rhsVi[peel1]); \
539  ploadRhsMMA(rhs_ptr_imag + (accRows * peel2), rhsVi[peel2]); \
540  } \
541  } \
542  MICRO_COMPLEX_MMA_UNROLL(funcl2) \
543  MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1) \
544  MICRO_COMPLEX_MMA_WORK(funcw2, type, peel2) \
545  } else { \
546  EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
547  EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
548  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
549  }
550 
551 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
552  type rhsV[4], rhsVi[4]; \
553  __vector_pair prhsV0, prhsV2; \
554  __vector_pair prhsVi0, prhsVi2; \
555  MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
556  MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3)
557 #endif
558 
559 #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
560  type rhsV[1], rhsVi[1]; \
561  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0)
562 
563 #define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size) \
564  MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, RhsPacket) \
565  rhs_ptr_real += (accRows * size); \
566  if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
567 
568 #ifndef VECTOR_PAIR_LOADS_LHS
569 #define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL, PEEL_COMPLEX_MMA)
570 #else
571 #define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size) \
572  MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
573  rhs_ptr_real += (accRows * size); \
574  if(!RhsIsReal) rhs_ptr_imag += (accRows * size);
575 
576 #define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2, PEEL_COMPLEX_MMA)
577 #endif
578 
579 #define MICRO_COMPLEX_MMA_ONE MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE, 1)
580 
581 #define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
582  if (unroll_factor > iter) { \
583  bsetzeroMMA(&accReal##iter); \
584  bsetzeroMMA(&accImag##iter); \
585  } else { \
586  EIGEN_UNUSED_VARIABLE(accReal##iter); \
587  EIGEN_UNUSED_VARIABLE(accImag##iter); \
588  }
589 
590 #define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE)
591 
592 #define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)
593 
594 #define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
595 
596 #define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
597  if (unroll_factor > iter) { \
598  storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (iter + 1)) ? accCols : accCols2>(row + iter*accCols, res, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
599  }
600 
601 #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
602 
603 template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
604 EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
605  const DataMapper& res,
606  const Scalar* lhs_base,
607  const Scalar* rhs_base,
608  Index depth,
609  Index strideA,
610  Index offsetA,
611  Index strideB,
612  Index& row,
613  const Packet& pAlphaReal,
614  const Packet& pAlphaImag,
615  const Packet& pMask)
616 {
617  const Scalar* rhs_ptr_real = rhs_base;
618  const Scalar* rhs_ptr_imag = NULL;
619  const Index imag_delta = accCols*strideA;
620  const Index imag_delta2 = accCols2*strideA;
621  if(!RhsIsReal) {
622  rhs_ptr_imag = rhs_base + accRows*strideB;
623  } else {
624  EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
625  }
626  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
627  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
628  __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
629 
630  MICRO_COMPLEX_MMA_SRC_PTR
631  MICRO_COMPLEX_MMA_DST_PTR
632 
633  Index k = 0, depth2 = depth - PEEL_COMPLEX_MMA;
634  for(; k <= depth2; k += PEEL_COMPLEX_MMA)
635  {
636  EIGEN_POWER_PREFETCH(rhs_ptr_real);
637  if(!RhsIsReal) {
638  EIGEN_POWER_PREFETCH(rhs_ptr_imag);
639  }
640  MICRO_COMPLEX_MMA_PREFETCH
641  MICRO_COMPLEX_MMA_ONE_PEEL
642  }
643  for(; k < depth; k++)
644  {
645  MICRO_COMPLEX_MMA_ONE
646  }
647  MICRO_COMPLEX_MMA_STORE
648 
649  MICRO_COMPLEX_UPDATE
650 }
651 
652 #define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M) \
653  gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
654  if (M) return;
655 
656 template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
657 EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
658  const DataMapper& res,
659  const Scalar* blockA,
660  const Scalar* blockB,
661  Index depth,
662  Index strideA,
663  Index offsetA,
664  Index strideB,
665  Index offsetB,
666  Index col,
667  Index rows,
668  Index remaining_rows,
669  const Packet& pAlphaReal,
670  const Packet& pAlphaImag,
671  const Packet& pMask)
672 {
673  const DataMapper res3 = res.getSubMapper(0, col);
674 
675  const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
676  const Scalar* lhs_base = blockA + accCols*offsetA;
677  Index row = 0;
678 
679 #define MAX_COMPLEX_MMA_UNROLL 4
680  while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) {
681  MICRO_COMPLEX_MMA_UNROLL_ITER2(MAX_COMPLEX_MMA_UNROLL, 0);
682  }
683  switch( (rows-row)/accCols ) {
684 #if MAX_COMPLEX_MMA_UNROLL > 4
685  case 4:
686  MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 4)
687  break;
688 #endif
689 #if MAX_COMPLEX_MMA_UNROLL > 3
690  case 3:
691  MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)
692  break;
693 #endif
694 #if MAX_COMPLEX_MMA_UNROLL > 2
695  case 2:
696  MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
697  break;
698 #endif
699 #if MAX_COMPLEX_MMA_UNROLL > 1
700  case 1:
701  MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
702  break;
703 #endif
704  default:
705  break;
706  }
707 #undef MAX_COMPLEX_MMA_UNROLL
708 
709  if(remaining_rows > 0)
710  {
711  gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
712  }
713 }
714 
715 template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
716 void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
717 {
718  const Index remaining_rows = rows % accCols;
719 
720  if( strideA == -1 ) strideA = depth;
721  if( strideB == -1 ) strideB = depth;
722 
723  const Packet pAlphaReal = pset1<Packet>(alpha.real());
724  const Packet pAlphaImag = pset1<Packet>(alpha.imag());
725  const Packet pMask = bmask<Packet>(remaining_rows);
726 
727  const Scalar* blockA = (Scalar *) blockAc;
728  const Scalar* blockB = (Scalar *) blockBc;
729 
730  typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
731 
732  Index col = 0;
733  for(; col + accRows <= cols; col += accRows)
734  {
735  gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
736  }
737 
738  if (col != cols)
739  {
740  gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
741  }
742 }
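// gemm_complexMMA is the complex entry point: blockAc/blockBc are reinterpreted as the
// underlying real scalar type because the Power packing stores real and imaginary parts in
// separate planes; advanceRows/advanceCols and the imag_delta offsets above account for that.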
743 
744 #undef accColsC
745 #undef advanceRows
746 #undef advanceCols
747 
748 } // end namespace internal
749 
750 } // end namespace Eigen
751 
752 #if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
753 #pragma GCC pop_options
754 #endif
755 
756 #endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
757 