#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
namespace Eigen {
namespace internal {

template<typename Shuffle, typename XprType>
struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
{
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef std::remove_reference_t<Nested> Nested_;
  static constexpr int NumDimensions = XprTraits::NumDimensions;
  static constexpr int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};
template<typename Shuffle, typename XprType>
struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
{
  typedef const TensorShufflingOp<Shuffle, XprType>& type;
};
template<typename Shuffle, typename XprType>
struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
{
  typedef TensorShufflingOp<Shuffle, XprType> type;
};

}  // end namespace internal
template<typename Shuffle, typename XprType>
class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
{
  public:
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
    typedef typename XprType::CoeffReturnType CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
        : m_xpr(expr), m_shuffle(shfl) {}

    EIGEN_DEVICE_FUNC const Shuffle& shufflePermutation() const { return m_shuffle; }
    EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }

  protected:
    typename XprType::Nested m_xpr;
    const Shuffle m_shuffle;
};
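// Usage sketch (illustrative, not part of the original header; assumes
// <unsupported/Eigen/CXX11/Tensor> is included). TensorShufflingOp is normally
// created through TensorBase::shuffle(); output dimension i takes the size of
// input dimension shuffle[i], and coefficient indices are permuted accordingly:
//
//   Eigen::Tensor<float, 3> input(20, 30, 50);
//   input.setRandom();
//   Eigen::array<int, 3> shuffle{1, 2, 0};
//   Eigen::Tensor<float, 3> output = input.shuffle(shuffle);
//   // output has dimensions (30, 50, 20), and output(3, 7, 11) == input(11, 3, 7).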
// Eval as rvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;

  enum {
    // ... (other evaluator flags)
    PreferBlockAccess = true,
    // ...
  };

  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                     Layout, Index>
      TensorBlock;
  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_device(device),
        m_impl(op.expression(), device)
  {
    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    const Shuffle& shuffle = op.shufflePermutation();
    m_is_identity = true;
    for (int i = 0; i < NumDims; ++i) {
      m_shuffle[i] = static_cast<int>(shuffle[i]);
      m_dimensions[i] = input_dims[shuffle[i]];
      m_inverseShuffle[shuffle[i]] = i;
      if (m_is_identity && shuffle[i] != i) {
        m_is_identity = false;
      }
    }
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_unshuffledInputStrides[0] = 1;
      m_outputStrides[0] = 1;

      for (int i = 1; i < NumDims; ++i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
            m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    } else {
      m_unshuffledInputStrides[NumDims - 1] = 1;
      m_outputStrides[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
            m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    }
    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
    }
  }
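  // Worked example (illustrative, not part of the original header): for a
  // column-major 2x3x5 input and shuffle = {2, 0, 1} the constructor computes
  //   m_dimensions             = {5, 2, 3}   (input dims permuted by the shuffle)
  //   m_unshuffledInputStrides = {1, 2, 6}   (strides of the input)
  //   m_outputStrides          = {1, 5, 10}  (strides of the shuffled output)
  //   m_inputStrides           = {6, 1, 2}   (input stride of each output dim)
  //   m_inverseShuffle         = {1, 2, 0}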
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }
#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
      EvaluatorPointerType, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
  }
#endif  // EIGEN_USE_THREADS
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if (m_is_identity) {
      return m_impl.coeff(index);
    } else {
      return m_impl.coeff(srcCoeff(index));
    }
  }
  template <int LoadMode, typename Self, bool ImplPacketAccess>
  struct PacketLoader {
    EIGEN_DEVICE_FUNC
    static PacketReturnType Run(const Self& self, Index index) {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = self.coeff(index + i);
      }
      return internal::pload<PacketReturnType>(values);
    }
  };
  template <int LoadMode, typename Self>
  struct PacketLoader<LoadMode, Self, true> {
    EIGEN_DEVICE_FUNC
    static PacketReturnType Run(const Self& self, Index index) {
      if (self.m_is_identity) {
        return self.m_impl.template packet<LoadMode>(index);
      } else {
        EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
        EIGEN_UNROLL_LOOP
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = self.coeff(index + i);
        }
        return internal::pload<PacketReturnType>(values);
      }
    }
  };
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  internal::TensorBlockResourceRequirements getResourceRequirements() const {
    static const int inner_dim =
        Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;

    const size_t target_size = m_device.firstLevelCacheSize();
    const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;

    // A shuffled inner dimension leads to random memory access, which is not
    // captured by the default cost model; add that cost explicitly.
    using BlockRequirements = internal::TensorBlockResourceRequirements;
    if (inner_dim_shuffled) {
      return BlockRequirements::uniform<Scalar>(target_size)
          .addCostPerCoeff({0, 0, NumDims * 28});
    } else {
      return BlockRequirements::skewed<Scalar>(target_size);
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
        bool root_of_expr_ast = false) const {
    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const typename TensorBlock::Storage block_storage =
        TensorBlock::prepareStorage(desc, scratch, root_of_expr_ast);

    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));

    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
                         block_storage.data());

    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    return block_storage.AsTensorMaterializedBlock();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                           2 * TensorOpCost::MulCost<Index>() +
                                           TensorOpCost::DivCost<Index>());
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
  }
 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
      Index input_index,
      const DSizes<Index, NumDims>& input_block_strides,
      const DSizes<Index, NumDims>& output_block_strides,
      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
    Index output_index = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
                                output_block_strides[m_inverseShuffle[0]];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
                                output_block_strides[m_inverseShuffle[NumDims - 1]];
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[0];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[NumDims - 1];
    }
  }

  Dimensions m_dimensions;
  bool m_is_identity;
  array<int, NumDims> m_shuffle;
  array<Index, NumDims> m_inverseShuffle;
  array<Index, NumDims> m_outputStrides;
  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_unshuffledInputStrides;

  const Device EIGEN_DEVICE_REF m_device;
  TensorEvaluator<ArgType, Device> m_impl;
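  // Worked example (illustrative, not part of the original header): continuing
  // the column-major 2x3x5 input with shuffle = {2, 0, 1}, an output coordinate
  // (i0, i1, i2) has linear index i0 + 5*i1 + 10*i2. srcCoeff() peels the
  // coordinates off with m_fastOutputStrides and re-accumulates them with
  // m_inputStrides, producing the input linear index 6*i0 + i1 + 2*i2, which is
  // exactly the coefficient input(i1, i2, i0).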
};


// Eval as lvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
    : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;
  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef typename XprType::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;

  enum {
    // ...
    PreferBlockAccess = true,
    // ...
  };

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : Base(op, device) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
  {
    return this->m_impl.coeffRef(this->srcCoeff(index));
  }
  template <int StoreMode> EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x) const
  {
    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < PacketSize; ++i) {
      this->coeffRef(index + i) = values[i];
    }
  }
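  // Usage sketch (illustrative, not part of the original header): because this
  // evaluator provides coeffRef/writePacket/writeBlock, a shuffle can also be
  // used as an lvalue, writing through the permutation into the target tensor:
  //
  //   Eigen::Tensor<float, 2> input(20, 30);
  //   input.setRandom();
  //   Eigen::Tensor<float, 2> output(30, 20);
  //   Eigen::array<int, 2> shuffle{1, 0};
  //   output.shuffle(shuffle) = input;
  //   // Now output(j, i) == input(i, j) for all valid (i, j).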
  template <typename TensorBlock>
  EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc,
                                      const TensorBlock& block) {
    eigen_assert(this->m_impl.data() != NULL);

    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const Scalar* block_buffer = block.data();

    // If the block does not expose a contiguous buffer, materialize it into a
    // temporary allocation first.
    void* mem = NULL;
    if (block_buffer == NULL) {
      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);

      typedef internal::TensorBlockAssignment<
          ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
          TensorBlockAssignment;

      TensorBlockAssignment::Run(
          TensorBlockAssignment::target(
              desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
              buf),
          block.expr());

      block_buffer = buf;
    }

    // Read from the block buffer.
    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
                         block_buffer);

    // Write to the destination in its unshuffled (input) layout.
    typename TensorBlockIO::Dimensions output_strides(
        this->m_unshuffledInputStrides);
    typename TensorBlockIO::Dimensions output_dimensions;
    for (int i = 0; i < NumDims; ++i) {
      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
    }
    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
                         this->srcCoeff(desc.offset()));

    // Reorder dimensions according to the inverse shuffle.
    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
    for (int i = 0; i < NumDims; ++i) {
      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
    }
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    // Deallocate the temporary buffer used for the block materialization.
    if (mem != NULL) this->m_device.deallocate(mem);
  }
};

}  // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H