TensorShuffling.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H

#include "./InternalHeaderCheck.h"

namespace Eigen {

/** \class TensorShuffling
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor shuffling class.
  */
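// Illustrative usage (a sketch, not part of the original header; requires
// <unsupported/Eigen/CXX11/Tensor>): shuffling permutes the dimensions of a
// tensor expression. For example, swapping the two dimensions of a 2x3
// tensor yields its 3x2 transpose:
//
//   Eigen::Tensor<float, 2> input(2, 3);
//   input.setRandom();
//   Eigen::array<int, 2> perm{{1, 0}};
//   Eigen::Tensor<float, 2> output = input.shuffle(perm);
//   // output.dimension(0) == 3, output.dimension(1) == 2,
//   // and output(i, j) == input(j, i) for all valid (i, j).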
namespace internal {
template<typename Shuffle, typename XprType>
struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
{
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef std::remove_reference_t<Nested> Nested_;
  static constexpr int NumDimensions = XprTraits::NumDimensions;
  static constexpr int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template<typename Shuffle, typename XprType>
struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
{
  typedef const TensorShufflingOp<Shuffle, XprType>& type;
};

template<typename Shuffle, typename XprType>
struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
{
  typedef TensorShufflingOp<Shuffle, XprType> type;
};

}  // end namespace internal


template<typename Shuffle, typename XprType>
class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
{
  public:
    typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename XprType::CoeffReturnType CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
        : m_xpr(expr), m_shuffle(shfl) {}

    EIGEN_DEVICE_FUNC
    const Shuffle& shufflePermutation() const { return m_shuffle; }

    EIGEN_DEVICE_FUNC
    const internal::remove_all_t<typename XprType::Nested>&
    expression() const { return m_xpr; }

    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)

  protected:
    typename XprType::Nested m_xpr;
    const Shuffle m_shuffle;
};


// Eval as rvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
  enum {
    IsAligned         = false,
    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    CoordAccess       = false,  // to be implemented
    RawAccess         = false
  };

  typedef std::remove_const_t<Scalar> ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;

  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                     Layout, Index>
      TensorBlock;
  //===--------------------------------------------------------------------===//
  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_device(device),
        m_impl(op.expression(), device)
  {
    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    const Shuffle& shuffle = op.shufflePermutation();
    m_is_identity = true;
    for (int i = 0; i < NumDims; ++i) {
      m_shuffle[i] = static_cast<int>(shuffle[i]);
      m_dimensions[i] = input_dims[shuffle[i]];
      m_inverseShuffle[shuffle[i]] = i;
      if (m_is_identity && shuffle[i] != i) {
        m_is_identity = false;
      }
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_unshuffledInputStrides[0] = 1;
      m_outputStrides[0] = 1;

      for (int i = 1; i < NumDims; ++i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
            m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    } else {
      m_unshuffledInputStrides[NumDims - 1] = 1;
      m_outputStrides[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
            m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
    }
  }
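
  // Worked example (illustrative, not in the original source): for a
  // column-major input of dimensions (2, 3, 4) and shuffle {2, 0, 1}:
  //   m_unshuffledInputStrides = (1, 2, 6)   // strides of the input
  //   m_dimensions             = (4, 2, 3)   // output dims = permuted input dims
  //   m_outputStrides          = (1, 4, 8)   // strides of the shuffled output
  //   m_inputStrides           = (6, 1, 2)   // input stride of each output dim
  // so stepping along output dimension i advances the input by
  // m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]] elements.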

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }

#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
      EvaluatorPointerType, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
  }
#endif  // EIGEN_USE_THREADS

  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if (m_is_identity) {
      return m_impl.coeff(index);
    } else {
      return m_impl.coeff(srcCoeff(index));
    }
  }

  template <int LoadMode, typename Self, bool ImplPacketAccess>
  struct PacketLoader {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    static PacketReturnType Run(const Self& self, Index index) {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = self.coeff(index + i);
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
      return rslt;
    }
  };

  template <int LoadMode, typename Self>
  struct PacketLoader<LoadMode, Self, true> {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    static PacketReturnType Run(const Self& self, Index index) {
      if (self.m_is_identity) {
        return self.m_impl.template packet<LoadMode>(index);
      } else {
        EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
        EIGEN_UNROLL_LOOP
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = self.coeff(index + i);
        }
        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
        return rslt;
      }
    }
  };

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
  }
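
  // Note on the packet path (explanatory comment, not in the original
  // source): a shuffle generally maps consecutive output indices to
  // non-consecutive input locations, so unless the permutation is the
  // identity, the loaders above gather PacketSize coefficients one by one
  // into an aligned scratch array and load it as a single packet via
  // internal::pload.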

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  internal::TensorBlockResourceRequirements getResourceRequirements() const {
    static const int inner_dim =
        Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;

    const size_t target_size = m_device.firstLevelCacheSize();
    const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;

    // A shuffled inner dimension leads to random memory accesses, which are
    // not captured by the default cost model's bytes loaded/stored, so we add
    // this cost explicitly. The number of cycles was picked based on
    // benchmarks.
    // TODO(ezhulenev): This number was picked based on very questionable
    // benchmarks; add benchmarks that are representative of real workloads.
    using BlockRequirements = internal::TensorBlockResourceRequirements;
    if (inner_dim_shuffled) {
      return BlockRequirements::uniform<Scalar>(target_size)
          .addCostPerCoeff({0, 0, NumDims * 28});
    } else {
      return BlockRequirements::skewed<Scalar>(target_size);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
        bool root_of_expr_ast = false) const {
    eigen_assert(m_impl.data() != NULL);

    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const typename TensorBlock::Storage block_storage =
        TensorBlock::prepareStorage(
            desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);

    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));

    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
                         block_storage.data());

    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                           2 * TensorOpCost::MulCost<Index>() +
                                           TensorOpCost::DivCost<Index>());
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
  }
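
  // Explanatory note (not in the original source): unless the shuffle is the
  // identity, every coefficient lookup pays for the srcCoeff index remap
  // below, roughly one division, two multiplies and two adds per dimension,
  // which is exactly what the compute_cost expression above charges.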

  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }

 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
      Index input_index,
      const DSizes<Index, NumDims>& input_block_strides,
      const DSizes<Index, NumDims>& output_block_strides,
      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
    Index output_index = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
             output_block_strides[m_inverseShuffle[0]];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
             output_block_strides[m_inverseShuffle[NumDims - 1]];
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[0];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[NumDims - 1];
    }
  }
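
  // Worked example (illustrative, not in the original source): with a
  // column-major 2x3 input and shuffle {1, 0}, the output is 3x2 with
  // m_outputStrides = (1, 3) and m_inputStrides = (2, 1). For output
  // index 4, i.e. output coords (1, 1):
  //   i = 1: idx = 4 / 3 = 1; inputIndex += 1 * 1; index = 4 - 1 * 3 = 1;
  //   return 1 + 1 * 2 = 3, i.e. input coords (1, 1),
  // as expected for a transpose, where output(1, 1) == input(1, 1).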

  Dimensions m_dimensions;
  bool m_is_identity;
  array<int, NumDims> m_shuffle;
  array<Index, NumDims> m_inverseShuffle;  // TODO(ezhulenev): Make it int type.
  array<Index, NumDims> m_outputStrides;
  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_unshuffledInputStrides;

  const Device EIGEN_DEVICE_REF m_device;
  TensorEvaluator<ArgType, Device> m_impl;
};


// Eval as lvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
    : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;

  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;

  enum {
    IsAligned         = false,
    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    RawAccess         = false
  };

  typedef std::remove_const_t<Scalar> ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : Base(op, device)
  { }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
  {
    return this->m_impl.coeffRef(this->srcCoeff(index));
  }

  template <int StoreMode> EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x) const
  {
    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < PacketSize; ++i) {
      this->coeffRef(index + i) = values[i];
    }
  }

  template <typename TensorBlock>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
      const TensorBlockDesc& desc, const TensorBlock& block) {
    eigen_assert(this->m_impl.data() != NULL);

    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const Scalar* block_buffer = block.data();

    // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
    // expression with coefficient and packet access as `src`.
    void* mem = NULL;
    if (block_buffer == NULL) {
      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);

      typedef internal::TensorBlockAssignment<
          ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
          TensorBlockAssignment;

      TensorBlockAssignment::Run(
          TensorBlockAssignment::target(
              desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
              buf),
          block.expr());

      block_buffer = buf;
    }

    // Read from block.
    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
                         block_buffer);

    // Write to the output buffer.
    typename TensorBlockIO::Dimensions output_strides(
        this->m_unshuffledInputStrides);
    typename TensorBlockIO::Dimensions output_dimensions;
    for (int i = 0; i < NumDims; ++i) {
      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
    }
    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
                         this->srcCoeff(desc.offset()));

    // Reorder dimensions according to the shuffle.
    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
    for (int i = 0; i < NumDims; ++i) {
      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
    }
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    // Deallocate temporary buffer used for the block materialization.
    if (mem != NULL) this->m_device.deallocate(mem);
  }
};
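
// Illustrative usage of the lvalue path (a sketch, not part of the original
// header): because the shuffled expression exposes coeffRef/writePacket, it
// can also appear on the left-hand side of an assignment:
//
//   Eigen::Tensor<float, 2> input(2, 3);
//   Eigen::Tensor<float, 2> output(3, 2);
//   input.setRandom();
//   Eigen::array<int, 2> perm{{1, 0}};
//   output.shuffle(perm) = input;  // writes through the inverse permutation
//   // output(i, j) == input(j, i) afterwards.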


} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H