MatrixProductCommon.h
Go to the documentation of this file.
// Uncomment to enable software prefetching in the gemm routines.
//#define EIGEN_POWER_USE_PREFETCH // Use prefetching in gemm routines
#ifdef EIGEN_POWER_USE_PREFETCH
#define EIGEN_POWER_PREFETCH(p) prefetch(p)
#else
// Expands to nothing when prefetching is disabled.
#define EIGEN_POWER_PREFETCH(p)
#endif

// POWER9 (and later) builds enable the partial (length-limited) packet
// load/store code paths guarded by USE_PARTIAL_PACKETS below.
#ifdef _ARCH_PWR9
#define USE_PARTIAL_PACKETS
#endif

12 #include "../../InternalHeaderCheck.h"
13 
14 namespace Eigen {
15 
16 namespace internal {
17 
18 template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
20  const DataMapper& res,
21  const Scalar* lhs_base,
22  const Scalar* rhs_base,
23  Index depth,
24  Index strideA,
25  Index offsetA,
26  Index strideB,
27  Index row,
28  Index rows,
29  Index remaining_rows,
30  const Packet& pAlpha,
31  const Packet& pMask);
32 
33 template<typename Scalar, typename Packet, typename DataMapper, const Index accCols>
35  const DataMapper& res,
36  const Scalar* blockA,
37  const Scalar* blockB,
38  Index depth,
39  Index strideA,
40  Index offsetA,
41  Index strideB,
42  Index offsetB,
43  Index col,
44  Index rows,
45  Index cols,
46  Index remaining_rows,
47  const Packet& pAlpha,
48  const Packet& pMask);
49 
50 template<typename Packet>
51 EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows);
52 
53 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
55  const DataMapper& res,
56  const Scalar* lhs_base,
57  const Scalar* rhs_base,
58  Index depth,
59  Index strideA,
60  Index offsetA,
61  Index strideB,
62  Index row,
63  Index rows,
64  Index remaining_rows,
65  const Packet& pAlphaReal,
66  const Packet& pAlphaImag,
67  const Packet& pMask);
68 
69 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
71  const DataMapper& res,
72  const Scalar* blockA,
73  const Scalar* blockB,
74  Index depth,
75  Index strideA,
76  Index offsetA,
77  Index strideB,
78  Index offsetB,
79  Index col,
80  Index rows,
81  Index cols,
82  Index remaining_rows,
83  const Packet& pAlphaReal,
84  const Packet& pAlphaImag,
85  const Packet& pMask);
86 
87 template<typename DataMapper>
88 EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float *result, Index cols, Index rows, const DataMapper& src);
89 
90 template<const Index size, bool non_unit_stride, Index delta>
92 
93 template<bool non_unit_stride = false>
94 EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float *result, Index cols, Index rows, bfloat16* src, Index resInc = 1);
95 
96 template<bool rhsExtraCols, bool lhsExtraRows>
97 EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result, Index extra_cols, Index extra_rows);
98 
99 template<Index num_acc, bool extraRows, Index size = 4>
100 EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha, Index extra_rows);
101 
102 template<Index num_acc, Index size = 4>
103 EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha);
104 
105 template<typename RhsMapper, bool linear>
107 
108 template<typename Packet>
109 EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs);
110 
111 template<typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full = true>
112 EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index col);
113 
114 template<typename DataMapper, typename Packet, int N>
115 EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row);
116 
#ifdef USE_PARTIAL_PACKETS
// POWER9+ only: load/store a block where each packet covers only
// `elements` valid lanes (length-limited vector instructions).
template<typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full = true>
EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index elements);

template<typename DataMapper, typename Packet, Index N>
EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index elements);
#endif

125 template<typename Packet, int N>
126 EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha);
127 
128 template<typename Packet, int N, bool mask>
129 EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha, const Packet& pMask);
130 
131 template<typename Packet, int N, bool mask>
132 EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag, const Packet& pMask);
133 
134 template<typename Packet, typename Packetc, int N, bool full>
135 EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2);
136 
// True when iteration `iter` processes a full accCols-wide panel (either no
// masking is needed or this is not the last unrolled iteration).
#define MICRO_NORMAL(iter) \
  (accCols == accCols2) || (unroll_factor != (iter + 1))

// Dispatches func(N, remaining_rows) over the possible leftover-row counts.
// Cases 2 and 3 only exist for float (4 lanes per packet); double packets
// hold 2 lanes, so remaining_rows can only be 0 or 1 there.
#define MICRO_UNROLL_ITER1(func, N) \
  switch (remaining_rows) { \
    default: \
      func(N, 0) \
      break; \
    case 1: \
      func(N, 1) \
      break; \
    case 2: \
      if (sizeof(Scalar) == sizeof(float)) { \
        func(N, 2) \
      } \
      break; \
    case 3: \
      if (sizeof(Scalar) == sizeof(float)) { \
        func(N, 3) \
      } \
      break; \
  }

#ifdef USE_PARTIAL_PACKETS
// With partial packets a simple boolean (any leftover rows?) is enough;
// the masked load/store handles the exact count at runtime.
#define MICRO_UNROLL_ITER(func, N) \
  if (remaining_rows) { \
    func(N, true); \
  } else { \
    func(N, false); \
  }

// Partial-packet analogue of MICRO_NORMAL (full flag instead of accCols2).
#define MICRO_NORMAL_PARTIAL(iter) \
  full || (unroll_factor != (iter + 1))
#else
// Without partial packets, fall back to the per-count switch dispatch.
#define MICRO_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N)
#endif

// The complex kernels always use the switch dispatch.
#define MICRO_COMPLEX_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N)

// Selects `a` for full panels and `b` for the masked last iteration.
#define MICRO_NORMAL_COLS(iter, a, b) ((MICRO_NORMAL(iter)) ? a : b)

// Loads lhsV<iter> from lhs_ptr<iter> and advances the pointer by the panel
// width; iterations beyond unroll_factor mark the variable unused instead.
#define MICRO_LOAD1(lhs_ptr, iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Packet>(lhs_ptr##iter); \
    lhs_ptr##iter += MICRO_NORMAL_COLS(iter, accCols, accCols2); \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
  }

#define MICRO_LOAD_ONE(iter) MICRO_LOAD1(lhs_ptr, iter)

// Complex variant: additionally loads the imaginary part (offset by
// imag_delta / imag_delta2) unless the lhs is purely real.
#define MICRO_COMPLEX_LOAD_ONE(iter) \
  if (!LhsIsReal && (unroll_factor > iter)) { \
    lhsVi##iter = ploadLhs<Packet>(lhs_ptr_real##iter + MICRO_NORMAL_COLS(iter, imag_delta, imag_delta2)); \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
  } \
  MICRO_LOAD1(lhs_ptr_real, iter)

// Initializes lhs_ptr<iter> to the start of its lhs panel; advRows doubles
// the stride for complex data (real+imag planes).
#define MICRO_SRC_PTR1(lhs_ptr, advRows, iter) \
  if (unroll_factor > iter) { \
    lhs_ptr##iter = lhs_base + (row+(iter*accCols))*strideA*advRows - MICRO_NORMAL_COLS(iter, 0, (accCols-accCols2)*offsetA); \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
  }

#define MICRO_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr, 1, iter)

#define MICRO_COMPLEX_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr_real, advanceRows, iter)

// Issues a prefetch for active iterations (no-op unless
// EIGEN_POWER_USE_PREFETCH is defined).
#define MICRO_PREFETCH1(lhs_ptr, iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
  }

#define MICRO_PREFETCH_ONE(iter) MICRO_PREFETCH1(lhs_ptr, iter)

#define MICRO_COMPLEX_PREFETCH_ONE(iter) MICRO_PREFETCH1(lhs_ptr_real, iter)

#ifdef USE_PARTIAL_PACKETS
// pMask is consumed by the partial load/store path, so nothing to silence.
#define MICRO_UPDATE_MASK
#else
#define MICRO_UPDATE_MASK EIGEN_UNUSED_VARIABLE(pMask);
#endif

// Advances `row` past the processed panel on the full-width path and
// silences the variables that path never reads.
#define MICRO_UPDATE \
  if (accCols == accCols2) { \
    MICRO_UPDATE_MASK \
    EIGEN_UNUSED_VARIABLE(offsetA); \
    row += unroll_factor*accCols; \
  }

// Complex variant: imag_delta2 is also unused when the lhs is real or the
// panel is full width.
#define MICRO_COMPLEX_UPDATE \
  MICRO_UPDATE \
  if(LhsIsReal || (accCols == accCols2)) { \
    EIGEN_UNUSED_VARIABLE(imag_delta2); \
  }


} // end namespace internal
} // end namespace Eigen
RowXpr row(Index i)
This is the const version of row().
ColXpr col(Index i)
This is the const version of col().
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:836
int data[]
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
EIGEN_ALWAYS_INLINE void storeResults(Packet4f(&acc)[4], Index rows, const Packet4f pAlpha, float *result, Index extra_cols, Index extra_rows)
EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows)
EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index rows, Index remaining_rows, const Packet &pAlphaReal, const Packet &pAlphaImag, const Packet &pMask)
EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16 *dst, Packet8bf data, Index resInc, Index extra=0)
EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f(&acc)[num_acc][size], float *result, Packet4f pAlpha, Index extra_rows)
EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float *result, Index cols, Index rows, bfloat16 *src, Index resInc)
EIGEN_ALWAYS_INLINE void bstore(PacketBlock< Packet, N > &acc, const DataMapper &res, Index row)
__UNPACK_TYPE__(Packet) pfirst_common(const Packet &a)
EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper &rhs, Index j)
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock< Packet, N > &aReal, PacketBlock< Packet, N > &aImag, const Packet &bReal, const Packet &bImag, PacketBlock< Packet, N > &cReal, PacketBlock< Packet, N > &cImag, const Packet &pMask)
EIGEN_ALWAYS_INLINE void gemm_extra_row(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index rows, Index remaining_rows, const Packet &pAlpha, const Packet &pMask)
EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB, Index col, Index rows, Index cols, Index remaining_rows, const Packet &pAlphaReal, const Packet &pAlphaImag, const Packet &pMask)
EIGEN_ALWAYS_INLINE void bload(PacketBlock< Packet, N *(Complex?2:1)> &acc, const DataMapper &res, Index row, Index col)
eigen_packet_wrapper< __vector unsigned short int, 0 > Packet8bf
EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f(&acc)[num_acc][size], float *result, Packet4f pAlpha)
EIGEN_ALWAYS_INLINE void bscale(PacketBlock< Packet, N > &acc, PacketBlock< Packet, N > &accZ, const Packet &pAlpha)
EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet) *lhs)
EIGEN_ALWAYS_INLINE void bcouple(PacketBlock< Packet, N > &taccReal, PacketBlock< Packet, N > &taccImag, PacketBlock< Packetc, N *2 > &tRes, PacketBlock< Packetc, N > &acc1, PacketBlock< Packetc, N > &acc2)
__vector float Packet4f
EIGEN_ALWAYS_INLINE void gemm_extra_cols(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB, Index col, Index rows, Index cols, Index remaining_rows, const Packet &pAlpha, const Packet &pMask)
EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float *result, Index cols, Index rows, const DataMapper &src)
: InteropHeaders
Definition: Core:139
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82
std::ptrdiff_t j