2 #ifdef EIGEN_POWER_USE_PREFETCH
// Prefetch enabled: EIGEN_POWER_PREFETCH(p) expands to a call to prefetch(p).
3 #define EIGEN_POWER_PREFETCH(p) prefetch(p)
// Empty form: EIGEN_POWER_PREFETCH(p) expands to nothing, so uses of it compile away.
5 #define EIGEN_POWER_PREFETCH(p)
// Feature switch tested by the `#ifdef USE_PARTIAL_PACKETS` sections further down
// (partial-packet declarations, MICRO_UNROLL_ITER and MICRO_UPDATE_MASK variants).
9 #define USE_PARTIAL_PACKETS
12 #include "../../InternalHeaderCheck.h"
18 template<
typename Scalar,
typename Packet,
typename DataMapper, const Index accRows, const Index accCols>
20 const DataMapper&
res,
21 const Scalar* lhs_base,
22 const Scalar* rhs_base,
33 template<
typename Scalar,
typename Packet,
typename DataMapper, const Index accCols>
35 const DataMapper&
res,
50 template<
typename Packet>
53 template<
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
55 const DataMapper&
res,
56 const Scalar* lhs_base,
57 const Scalar* rhs_base,
65 const Packet& pAlphaReal,
66 const Packet& pAlphaImag,
69 template<
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
71 const DataMapper&
res,
83 const Packet& pAlphaReal,
84 const Packet& pAlphaImag,
87 template<
typename DataMapper>
90 template<const Index size,
bool non_unit_str
ide, Index delta>
93 template<
bool non_unit_str
ide = false>
96 template<
bool rhsExtraCols,
bool lhsExtraRows>
99 template<Index num_acc,
bool extraRows, Index size = 4>
102 template<Index num_acc, Index size = 4>
105 template<
typename RhsMapper,
bool linear>
108 template<
typename Packet>
111 template<
typename DataMapper,
typename Packet, const Index accCols,
int StorageOrder,
bool Complex,
int N,
bool full = true>
114 template<
typename DataMapper,
typename Packet,
int N>
117 #ifdef USE_PARTIAL_PACKETS
118 template<
typename DataMapper,
typename Packet, const Index accCols,
bool Complex, Index N,
bool full = true>
121 template<
typename DataMapper,
typename Packet, Index N>
125 template<
typename Packet,
int N>
// Forward declaration of bscale (the definition is not in this header section).
//
// Template parameters:
//   Packet - SIMD packet type held by the blocks.
//   N      - number of packets in each PacketBlock.
//   mask   - compile-time flag; presumably selects whether pMask is applied
//            (TODO confirm against the definition).
// Parameters:
//   acc    - packet block passed by non-const reference.
//   accZ   - second packet block passed by non-const reference.
//   pAlpha - packet of scaling coefficients.
//   pMask  - packet mask.
// NOTE(review): presumably combines accZ scaled by pAlpha into acc; verify
// against the implementation before relying on this.
128 template<
typename Packet,
int N,
bool mask>
129 EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ,
const Packet& pAlpha,
const Packet& pMask);
// Forward declaration of bscalec (the definition is not in this header section).
//
// Template parameters:
//   Packet - SIMD packet type held by the blocks.
//   N      - number of packets in each PacketBlock.
//   mask   - compile-time flag; presumably selects whether pMask is applied
//            (TODO confirm against the definition).
// Parameters:
//   aReal, aImag - packet blocks passed by non-const reference.
//   bReal, bImag - single packets passed by const reference.
//   cReal, cImag - packet blocks passed by non-const reference.
//   pMask        - packet mask.
// NOTE(review): the Real/Imag naming suggests a complex scaling of (aReal, aImag)
// by (bReal, bImag) with results in (cReal, cImag); confirm with the definition.
131 template<
typename Packet,
int N,
bool mask>
132 EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag,
const Packet& bReal,
const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag,
const Packet& pMask);
// Forward declaration of bcouple (the definition is not in this header section).
//
// Template parameters:
//   Packet  - packet type of the separate real/imaginary blocks.
//   Packetc - packet type of the tRes/acc1/acc2 blocks.
//   N       - number of packets in taccReal/taccImag/acc1/acc2; tRes holds N*2.
//   full    - compile-time flag; meaning not visible here (TODO confirm).
// Parameters:
//   taccReal, taccImag - PacketBlock<Packet,N> passed by non-const reference.
//   tRes               - PacketBlock<Packetc,N*2> passed by non-const reference.
//   acc1, acc2         - PacketBlock<Packetc,N> passed by non-const reference.
// NOTE(review): presumably pairs the real and imaginary blocks into Packetc
// results; verify against the implementation.
134 template<
typename Packet,
typename Packetc,
int N,
bool full>
135 EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2);
// MICRO_NORMAL(iter): boolean expression that is true when accCols equals accCols2,
// or when `iter` is not the last unrolled iteration (unroll_factor != iter + 1).
// Expects accCols, accCols2 and unroll_factor to be visible where it is expanded.
// The expansion is not wrapped in parentheses; callers add their own.
137 #define MICRO_NORMAL(iter) \
138 (accCols == accCols2) || (unroll_factor != (iter + 1))
140 #define MICRO_UNROLL_ITER1(func, N) \
141 switch (remaining_rows) { \
149 if (sizeof(Scalar) == sizeof(float)) { \
154 if (sizeof(Scalar) == sizeof(float)) { \
160 #ifdef USE_PARTIAL_PACKETS
161 #define MICRO_UNROLL_ITER(func, N) \
162 if (remaining_rows) { \
// MICRO_NORMAL_PARTIAL(iter): boolean expression that is true when `full` is set,
// or when `iter` is not the last unrolled iteration (unroll_factor != iter + 1).
// Expects `full` and unroll_factor to be visible where it is expanded.
// The expansion is not wrapped in parentheses; callers add their own.
168 #define MICRO_NORMAL_PARTIAL(iter) \
169 full || (unroll_factor != (iter + 1))
// This form of MICRO_UNROLL_ITER forwards its arguments unchanged to MICRO_UNROLL_ITER1.
171 #define MICRO_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N)
// Complex variant: forwards its arguments unchanged to MICRO_UNROLL_ITER1.
174 #define MICRO_COMPLEX_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N)
// MICRO_NORMAL_COLS(iter, a, b): yields `a` when MICRO_NORMAL(iter) holds, otherwise `b`.
// The whole conditional is parenthesized, but `a` and `b` are substituted as written.
176 #define MICRO_NORMAL_COLS(iter, a, b) ((MICRO_NORMAL(iter)) ? a : b)
178 #define MICRO_LOAD1(lhs_ptr, iter) \
179 if (unroll_factor > iter) { \
180 lhsV##iter = ploadLhs<Packet>(lhs_ptr##iter); \
181 lhs_ptr##iter += MICRO_NORMAL_COLS(iter, accCols, accCols2); \
183 EIGEN_UNUSED_VARIABLE(lhsV##iter); \
// Real-valued LHS load for unroll slot `iter`: MICRO_LOAD1 with the lhs_ptr pointer prefix.
186 #define MICRO_LOAD_ONE(iter) MICRO_LOAD1(lhs_ptr, iter)
188 #define MICRO_COMPLEX_LOAD_ONE(iter) \
189 if (!LhsIsReal && (unroll_factor > iter)) { \
190 lhsVi##iter = ploadLhs<Packet>(lhs_ptr_real##iter + MICRO_NORMAL_COLS(iter, imag_delta, imag_delta2)); \
192 EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
194 MICRO_LOAD1(lhs_ptr_real, iter) \
196 #define MICRO_SRC_PTR1(lhs_ptr, advRows, iter) \
197 if (unroll_factor > iter) { \
198 lhs_ptr##iter = lhs_base + (row+(iter*accCols))*strideA*advRows - MICRO_NORMAL_COLS(iter, 0, (accCols-accCols2)*offsetA); \
200 EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
// Sets up the LHS source pointer for unroll slot `iter`: MICRO_SRC_PTR1 with the
// lhs_ptr prefix and advRows fixed to 1.
203 #define MICRO_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr, 1, iter)
// Complex variant: MICRO_SRC_PTR1 with the lhs_ptr_real prefix, passing advanceRows
// (taken from the expansion site) as the advRows multiplier.
205 #define MICRO_COMPLEX_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr_real, advanceRows, iter)
207 #define MICRO_PREFETCH1(lhs_ptr, iter) \
208 if (unroll_factor > iter) { \
209 EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
// Prefetch for unroll slot `iter`: MICRO_PREFETCH1 with the lhs_ptr pointer prefix.
212 #define MICRO_PREFETCH_ONE(iter) MICRO_PREFETCH1(lhs_ptr, iter)
// Complex variant: MICRO_PREFETCH1 with the lhs_ptr_real pointer prefix.
214 #define MICRO_COMPLEX_PREFETCH_ONE(iter) MICRO_PREFETCH1(lhs_ptr_real, iter)
216 #ifdef USE_PARTIAL_PACKETS
// Under USE_PARTIAL_PACKETS, MICRO_UPDATE_MASK expands to nothing.
217 #define MICRO_UPDATE_MASK
// This form of MICRO_UPDATE_MASK marks pMask as unused via EIGEN_UNUSED_VARIABLE.
// The trailing semicolon is part of the expansion.
219 #define MICRO_UPDATE_MASK EIGEN_UNUSED_VARIABLE(pMask);
222 #define MICRO_UPDATE \
223 if (accCols == accCols2) { \
225 EIGEN_UNUSED_VARIABLE(offsetA); \
226 row += unroll_factor*accCols; \
229 #define MICRO_COMPLEX_UPDATE \
231 if(LhsIsReal || (accCols == accCols2)) { \
232 EIGEN_UNUSED_VARIABLE(imag_delta2); \
RowXpr row(Index i)
This is the const version of row().
ColXpr col(Index i)
This is the const version of col().
#define EIGEN_ALWAYS_INLINE
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
EIGEN_ALWAYS_INLINE void storeResults(Packet4f(&acc)[4], Index rows, const Packet4f pAlpha, float *result, Index extra_cols, Index extra_rows)
EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows)
EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index rows, Index remaining_rows, const Packet &pAlphaReal, const Packet &pAlphaImag, const Packet &pMask)
EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16 *dst, Packet8bf data, Index resInc, Index extra=0)
EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f(&acc)[num_acc][size], float *result, Packet4f pAlpha, Index extra_rows)
EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float *result, Index cols, Index rows, bfloat16 *src, Index resInc)
EIGEN_ALWAYS_INLINE void bstore(PacketBlock< Packet, N > &acc, const DataMapper &res, Index row)
__UNPACK_TYPE__(Packet) pfirst_common(const Packet &a)
EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper &rhs, Index j)
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock< Packet, N > &aReal, PacketBlock< Packet, N > &aImag, const Packet &bReal, const Packet &bImag, PacketBlock< Packet, N > &cReal, PacketBlock< Packet, N > &cImag, const Packet &pMask)
EIGEN_ALWAYS_INLINE void gemm_extra_row(const DataMapper &res, const Scalar *lhs_base, const Scalar *rhs_base, Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index rows, Index remaining_rows, const Packet &pAlpha, const Packet &pMask)
EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB, Index col, Index rows, Index cols, Index remaining_rows, const Packet &pAlphaReal, const Packet &pAlphaImag, const Packet &pMask)
EIGEN_ALWAYS_INLINE void bload(PacketBlock< Packet, N *(Complex?2:1)> &acc, const DataMapper &res, Index row, Index col)
eigen_packet_wrapper< __vector unsigned short int, 0 > Packet8bf
EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f(&acc)[num_acc][size], float *result, Packet4f pAlpha)
EIGEN_ALWAYS_INLINE void bscale(PacketBlock< Packet, N > &acc, PacketBlock< Packet, N > &accZ, const Packet &pAlpha)
EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet) *lhs)
EIGEN_ALWAYS_INLINE void bcouple(PacketBlock< Packet, N > &taccReal, PacketBlock< Packet, N > &taccImag, PacketBlock< Packetc, N *2 > &tRes, PacketBlock< Packetc, N > &acc1, PacketBlock< Packetc, N > &acc2)
EIGEN_ALWAYS_INLINE void gemm_extra_cols(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB, Index col, Index rows, Index cols, Index remaining_rows, const Packet &pAlpha, const Packet &pMask)
EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float *result, Index cols, Index rows, const DataMapper &src)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.