TensorBlock.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // This Source Code Form is subject to the terms of the Mozilla
5 // Public License v. 2.0. If a copy of the MPL was not distributed
6 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 
8 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
9 #define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
10 
11 #include "./InternalHeaderCheck.h"
12 
13 namespace Eigen {
14 namespace internal {
15 
16 // -------------------------------------------------------------------------- //
17 // Forward declarations for templates defined below.
18 template <typename Scalar, typename IndexType, int NumDims, int Layout>
19 class TensorBlockIO;
20 
21 // -------------------------------------------------------------------------- //
22 // Helper function to compute strides for a densely stored buffer of given
23 // dimensions.
24 
25 // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
26 // this function instead everywhere.
27 template <int Layout, typename IndexType, int NumDims>
28 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
29  const DSizes<IndexType, NumDims>& dimensions) {
30  DSizes<IndexType, NumDims> strides;
31  if (NumDims == 0) return strides;
32 
33  // TODO(ezhulenev): Use templates to unroll this loop (similar to
34  // h_array_reduce in CXX11meta.h)? Benchmark it.
35  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
36  strides[0] = 1;
37  for (int i = 1; i < NumDims; ++i) {
38  strides[i] = strides[i - 1] * dimensions[i - 1];
39  }
40  } else {
41  strides[NumDims - 1] = 1;
42  for (int i = NumDims - 2; i >= 0; --i) {
43  strides[i] = strides[i + 1] * dimensions[i + 1];
44  }
45  }
46 
47  return strides;
48 }
49 
50 template <int Layout, typename IndexType, size_t NumDims>
51 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
52  const Eigen::array<IndexType, NumDims>& dimensions) {
53  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
54 }
55 
56 template <int Layout, std::ptrdiff_t... Indices>
57 EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
58  const Sizes<Indices...>& sizes) {
59  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
60 }
61 
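// Illustrative usage sketch (an addition for exposition, assuming a 2x3x4
// tensor): the helpers above compute the dense strides for either layout.
//
//   Eigen::DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   auto col = Eigen::internal::strides<Eigen::ColMajor>(dims);  // {1, 2, 6}
//   auto row = Eigen::internal::strides<Eigen::RowMajor>(dims);  // {12, 4, 1}
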
62 // -------------------------------------------------------------------------- //
63 
64 // Tensor block shape type defines the shape preference for the blocks
65 // extracted from the larger tensor.
66 //
67 // Example: blocks of 100 elements from the large 100x100 tensor:
68 // - tensor: 100x100
69 // - target_block_size: 100
70 //
71 // TensorBlockShapeType:
72 // - kUniformAllDims: 100 blocks of size 10x10
73 // - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
74 // or row major layout)
75 enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
76 
77 struct TensorBlockResourceRequirements {
78  TensorBlockShapeType shape_type; // target block shape
79  size_t size; // target block size
80  TensorOpCost cost_per_coeff; // cost of computing a single block element
81 
82 #ifdef EIGEN_HIPCC
83  // For HIPCC we need to explicitly declare the constructor that is implicitly
84  // invoked in the "merge" / "any" routines as a "device fun"; otherwise HIPCC
85  // errors out complaining about the lack of a matching constructor.
86  EIGEN_DEVICE_FUNC
87  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
88  TensorOpCost cost_)
89  : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
90  {}
91 #endif
92 
93  template <typename Scalar>
94  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
95  TensorBlockShapeType shape_type, size_t size_in_bytes,
96  TensorOpCost cost) {
97  const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
98  return {shape_type, size, cost};
99  }
100 
101  template <typename Scalar>
102  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
103  TensorBlockShapeType shape_type, size_t size_in_bytes) {
104  // This default cost per coefficient is valid for most materialized tensor
105  // block evaluation implementations, because they typically just read
106  // coefficients from the underlying tensor storage, and write to the tensor
107  // block buffer (scratch or destination memory; reads and writes have a linear
108  // access pattern). We ignore the fixed cost of block evaluation, because in
109  // practice it should be negligible.
110  //
111  // Lazy block evaluation adds the cost of calling a functor for each
112  // coefficient.
113  //
114  // All non-trivial block evaluation implementations must provide their own
115  // cost approximation (e.g. shuffling inner dimension has a much higher cost
116  // because it reads memory randomly, although the total number of moved
117  // bytes is the same).
118  return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
119  {/*bytes_loaded=*/sizeof(Scalar),
120  /*bytes_stored=*/sizeof(Scalar),
121  /*compute_cycles=*/0});
122  }
123 
124  template <typename Scalar>
125  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
126  size_t size_in_bytes) {
127  return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
128  size_in_bytes);
129  }
130 
131  template <typename Scalar>
132  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
133  size_t size_in_bytes) {
134  return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
135  size_in_bytes);
136  }
137 
138  EIGEN_DEVICE_FUNC
139  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
140  merge(const TensorBlockResourceRequirements& lhs,
141  const TensorBlockResourceRequirements& rhs) {
142  return {merge(lhs.shape_type, rhs.shape_type), // shape_type
143  merge(lhs.size, rhs.size), // size
144  merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
145  }
146 
147  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
148  TensorOpCost cost) {
149  cost_per_coeff += cost;
150  return *this;
151  }
152 
153  // This is a resource requirement that should be returned from expressions
154  // that do not have any block evaluation preference (e.g. default tensor
155  // expression with raw buffer access).
156  EIGEN_DEVICE_FUNC
157  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
158  return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
159  }
160 
161  private:
162  using Requirements = TensorBlockResourceRequirements;
163 
164  EIGEN_DEVICE_FUNC
165  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
166  return numext::maxi(lhs_size, rhs_size);
167  }
168 
169  EIGEN_DEVICE_FUNC
170  static EIGEN_STRONG_INLINE TensorBlockShapeType
171  merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
172  return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
173  rhs == TensorBlockShapeType::kSkewedInnerDims)
174  ? TensorBlockShapeType::kSkewedInnerDims
175  : TensorBlockShapeType::kUniformAllDims;
176  }
177 
178  EIGEN_DEVICE_FUNC
179  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
180  TensorOpCost rhs_cost) {
181  return lhs_cost + rhs_cost;
182  }
183 };
184 
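// Illustrative usage sketch (an addition for exposition): an evaluator that
// prefers ~64KB skewed float blocks, merged with a child that prefers smaller
// uniform blocks, ends up with the larger size and the skewed shape, because
// `merge` takes the max size and prefers kSkewedInnerDims.
//
//   auto lhs = TensorBlockResourceRequirements::skewed<float>(64 * 1024);
//   auto rhs = TensorBlockResourceRequirements::uniform<float>(32 * 1024);
//   auto req = TensorBlockResourceRequirements::merge(lhs, rhs);
//   // req.shape_type == TensorBlockShapeType::kSkewedInnerDims
//   // req.size == 64 * 1024 / sizeof(float)
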
185 // -------------------------------------------------------------------------- //
186 // TensorBlockDescriptor specifies a block offset within a tensor and the block
187 // sizes along each of the tensor dimensions.
188 
189 template <int NumDims, typename IndexType = Eigen::Index>
190 class TensorBlockDescriptor {
191  public:
192  typedef DSizes<IndexType, NumDims> Dimensions;
193 
194  // If we evaluate a Tensor assignment, and the expression on the left already
195  // has a memory buffer, then we might do a performance optimization and
196  // evaluate the root expression directly into the final output memory. Sometimes
197  // it's possible to reuse that buffer for materializing subexpressions inside an
198  // expression tree, to avoid dynamic memory allocation.
199  //
200  // The pointer type of the underlying storage is erased, because passing the
201  // Scalar type through all the expression evaluation layers would require way
202  // too many templates. In practice the destination buffer type should always
203  // match the evaluated expression scalar type.
204  class DestinationBuffer {
205  public:
206  enum DestinationBufferKind : int {
207  // The above explicit specification of "int" as the enum basetype is
208  // needed to get around a HIPCC link error ("the field type is not
209  // amp-compatible")
210  // which is issued for class members with the enum type.
211  // TODO(rocm):
212  // remove the "int" basetype once HIPCC has been fixed to not error out
213  // in the above scenario.
214 
215  // Destination buffer is not defined (`m_data` == nullptr).
216  kEmpty,
217 
218  // Tensor block defined by an owning tensor block descriptor can fit
219  // contiguously into the destination buffer. In this case it's safe to
220  // materialize the tensor block in the destination buffer, wrap it in a
221  // TensorMap, and use it to build an Eigen expression on top of it.
222  kContiguous,
223 
224  // Destination buffer strides do not match strides of the contiguously
225  // stored block, and it's impossible to define a TensorMap over this
226  // buffer. However, if we are evaluating a root of an expression tree, we
227  // can still materialize an output into this destination, because we can
228  // guarantee that no one will ever access it through the block API.
229  //
230  // In theory it is possible to build a valid TensorStriding<TensorMap>
231  // expression on top of this destination buffer, however it has
232  // inefficient coeff/packet access, and defeats the purpose of the fast
233  // block evaluation API.
234  kStrided
235  };
236 
237  template <typename Scalar>
238  Scalar* data() const {
239  eigen_assert(m_data_type_size == sizeof(Scalar));
240  return static_cast<Scalar*>(m_data);
241  }
242 
243  const Dimensions& strides() const { return m_strides; }
244  const DestinationBufferKind& kind() const { return m_kind; }
245 
246  private:
247  friend class TensorBlockDescriptor<NumDims, IndexType>;
248 
249  DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
250 
251  template <typename Scalar>
252  DestinationBuffer(Scalar* data, const Dimensions& strides,
253  DestinationBufferKind kind)
254  : m_data(static_cast<void*>(data)),
255  m_data_type_size(sizeof(Scalar)),
256  m_strides(strides),
257  m_kind(kind) {}
258 
259  template <int Layout, typename Scalar>
260  static DestinationBuffer make(const TensorBlockDescriptor& desc,
261  Scalar* data, const Dimensions& strides) {
262  return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
263  }
264 
265  template <int Layout>
266  static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
267  const Dimensions& strides) {
268  const Dimensions& desc_dims = desc.dimensions();
269  const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
270  for (int i = 0; i < NumDims; ++i) {
271  if (desc_dims[i] == 1) continue;
272  if (desc_strides[i] != strides[i]) return kStrided;
273  }
274  return kContiguous;
275  }
276 
277  // Storage pointer is type erased, to reduce template bloat, but we still
278  // keep the size of the underlying element type for error checking.
279  void* m_data;
280  size_t m_data_type_size;
281 
282  // Destination buffer dimensions always match the dimensions of a tensor
283  // block descriptor it belongs to, however strides might be different.
284  Dimensions m_strides;
285 
286  DestinationBufferKind m_kind;
287  };
288 
289  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
290  const DestinationBuffer& destination)
291  : m_offset(offset),
292  m_dimensions(dimensions),
293  m_destination(destination) {}
294 
295  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
296  : m_offset(offset),
297  m_dimensions(dimensions),
298  m_destination(DestinationBuffer()) {}
299 
300  IndexType offset() const { return m_offset; }
301  const Dimensions& dimensions() const { return m_dimensions; }
302  IndexType dimension(int index) const { return m_dimensions[index]; }
303  IndexType size() const { return array_prod<IndexType>(m_dimensions); }
304 
305  const DestinationBuffer& destination() const { return m_destination; }
306 
307  template <int Layout, typename Scalar>
308  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
309  eigen_assert(dst_base != NULL);
310  m_destination =
311  DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
312  }
313 
314  template <int Layout, typename Scalar, typename DstStridesIndexType>
315  void AddDestinationBuffer(
316  Scalar* dst_base,
317  const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
318  // DSizes constructor will do index type promotion if it's safe.
319  AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
320  }
321 
322  TensorBlockDescriptor& DropDestinationBuffer() {
323  m_destination.m_data = NULL;
324  m_destination.m_kind = DestinationBuffer::kEmpty;
325  return *this;
326  }
327 
328  bool HasDestinationBuffer() const {
329  return m_destination.kind() != DestinationBuffer::kEmpty;
330  }
331 
332  // Returns a copy of `*this` with updated offset.
333  TensorBlockDescriptor WithOffset(IndexType offset) const {
334  return TensorBlockDescriptor(offset, m_dimensions, m_destination);
335  }
336 
337  private:
338  // Offset and dimensions are immutable after construction. Block descriptor
339  // can only be mutated by adding or dropping destination.
340  const IndexType m_offset;
341  const Dimensions m_dimensions;
342  DestinationBuffer m_destination;
343 };
344 
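// Illustrative usage sketch (an addition for exposition): a descriptor for a
// 2x4 block starting at linear offset 16 of a larger tensor. `dst_ptr` and
// `dst_strides` are hypothetical names for an optional destination buffer
// that enables direct materialization into the output memory.
//
//   DSizes<Eigen::Index, 2> block_dims(2, 4);
//   TensorBlockDescriptor<2> desc(/*offset=*/16, block_dims);
//   desc.AddDestinationBuffer<ColMajor>(dst_ptr, dst_strides);
//   eigen_assert(desc.HasDestinationBuffer());
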
345 // -------------------------------------------------------------------------- //
346 // TensorBlockMapper is responsible for iterating over the blocks of a tensor.
347 
348 template <int NumDims, int Layout, typename IndexType = Eigen::Index>
349 class TensorBlockMapper {
350  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
351 
352  public:
353  typedef DSizes<IndexType, NumDims> Dimensions;
354 
355  TensorBlockMapper() = default;
356  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
357  const TensorBlockResourceRequirements& requirements)
358  : m_tensor_dimensions(dimensions), m_requirements(requirements) {
359  // Compute block dimensions and the total number of blocks.
360  InitializeBlockDimensions();
361  }
362 
363  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
364  return m_total_block_count;
365  }
366 
367  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
368  return m_block_dimensions.TotalSize();
369  }
370 
371  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
372  blockDimensions() const {
373  return m_block_dimensions;
374  }
375 
376  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
377  blockDescriptor(IndexType block_index) const {
378  static const bool isColMajor = Layout == static_cast<int>(ColMajor);
379 
380  IndexType offset = 0;
381  DSizes<IndexType, NumDims> dimensions;
382 
383  if (NumDims == 0) return BlockDescriptor(offset, dimensions);
384 
385  // Iterate outer -> inner dimensions.
386  for (int i = NumDims - 1; i >= 0; --i) {
387  const int dim = isColMajor ? i : NumDims - i - 1;
388 
389  const IndexType idx = block_index / m_block_strides[dim];
390  block_index -= idx * m_block_strides[dim];
391 
392  const IndexType coord = idx * m_block_dimensions[dim];
393  dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
394  m_block_dimensions[dim]);
395  offset += coord * m_tensor_strides[dim];
396  }
397 
398  return {offset, dimensions};
399  }
400 
401  private:
402  void InitializeBlockDimensions() {
403  // Requested block shape and size.
404  const TensorBlockShapeType shape_type = m_requirements.shape_type;
405  IndexType target_block_size =
406  numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
407 
408  IndexType tensor_size = m_tensor_dimensions.TotalSize();
409 
410  // Corner case: one of the dimensions is zero. The logic below is too complex
411  // to handle this case in general, so just use a unit block size.
412  // Note: we must not yield blocks with zero dimensions (recipe for
413  // overflows/underflows, divisions by zero and NaNs later).
414  if (tensor_size == 0) {
415  for (int i = 0; i < NumDims; ++i) {
416  m_block_dimensions[i] = 1;
417  }
418  m_total_block_count = 0;
419  return;
420  }
421 
422  // If tensor fits into a target block size, evaluate it as a single block.
423  if (tensor_size <= target_block_size) {
424  m_block_dimensions = m_tensor_dimensions;
425  m_total_block_count = 1;
426  // The only valid block index is `0`, and in this case we do not need
427  // to compute real strides for tensor or blocks (see blockDescriptor).
428  for (int i = 0; i < NumDims; ++i) {
429  m_tensor_strides[i] = 0;
430  m_block_strides[i] = 1;
431  }
432  return;
433  }
434 
435  static const bool isColMajor = Layout == static_cast<int>(ColMajor);
436 
437  // Block shape skewed towards inner dimension.
438  if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
439  IndexType coeff_to_allocate = target_block_size;
440 
441  for (int i = 0; i < NumDims; ++i) {
442  const int dim = isColMajor ? i : NumDims - i - 1;
443  m_block_dimensions[dim] =
444  numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
445  coeff_to_allocate = divup(
446  coeff_to_allocate,
447  numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
448  }
449  eigen_assert(coeff_to_allocate == 1);
450 
451  } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
452  // Tensor will not fit within 'target_block_size' budget: calculate tensor
453  // block dimension sizes based on "square" dimension size target.
454  const IndexType dim_size_target = convert_index<IndexType>(
455  std::pow(static_cast<float>(target_block_size),
456  1.0f / static_cast<float>(m_block_dimensions.rank())));
457 
458  for (int i = 0; i < NumDims; ++i) {
459  // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
460  // a multiple of the packet size. Note that reducing
461  // 'block_dim_size' in this manner can increase the number of
462  // blocks, and so will amplify any per-block overhead.
463  m_block_dimensions[i] =
464  numext::mini(dim_size_target, m_tensor_dimensions[i]);
465  }
466 
467  // Add any un-allocated coefficients to inner dimension(s).
468  IndexType total_size = m_block_dimensions.TotalSize();
469  for (int i = 0; i < NumDims; ++i) {
470  const int dim = isColMajor ? i : NumDims - i - 1;
471 
472  if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
473  const IndexType total_size_other_dims =
474  total_size / m_block_dimensions[dim];
475  const IndexType alloc_avail =
476  divup<IndexType>(target_block_size, total_size_other_dims);
477  if (alloc_avail == m_block_dimensions[dim]) {
478  // Insufficient excess coefficients to allocate.
479  break;
480  }
481  m_block_dimensions[dim] =
482  numext::mini(m_tensor_dimensions[dim], alloc_avail);
483  total_size = total_size_other_dims * m_block_dimensions[dim];
484  }
485  }
486 
487  } else {
488  eigen_assert(false); // unknown block shape
489  }
490 
491  eigen_assert(m_block_dimensions.TotalSize() >=
492  numext::mini<IndexType>(target_block_size,
493  m_tensor_dimensions.TotalSize()));
494 
495  // Calculate block counts by dimension and total block count.
496  DSizes<IndexType, NumDims> block_count;
497  for (int i = 0; i < NumDims; ++i) {
498  block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
499  }
500  m_total_block_count = array_prod(block_count);
501 
502  // Calculate block strides (used for enumerating blocks).
503  m_tensor_strides = strides<Layout>(m_tensor_dimensions);
504  m_block_strides = strides<Layout>(block_count);
505  }
506 
507  DSizes<IndexType, NumDims> m_tensor_dimensions;
508  TensorBlockResourceRequirements m_requirements;
509 
510  DSizes<IndexType, NumDims> m_block_dimensions;
511  IndexType m_total_block_count;
512 
513  DSizes<IndexType, NumDims> m_tensor_strides;
514  DSizes<IndexType, NumDims> m_block_strides;
515 };
516 
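// Illustrative usage sketch (an addition for exposition): splitting a 100x100
// float tensor into blocks of at most 100 coefficients and enumerating their
// descriptors.
//
//   DSizes<Eigen::Index, 2> dims(100, 100);
//   auto req = TensorBlockResourceRequirements::skewed<float>(100 * sizeof(float));
//   TensorBlockMapper<2, ColMajor> mapper(dims, req);
//   // With kSkewedInnerDims this yields 100 blocks of size 100x1.
//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
//     TensorBlockDescriptor<2> desc = mapper.blockDescriptor(i);
//     // desc.offset() / desc.dimensions() describe block `i`.
//   }
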
517 // -------------------------------------------------------------------------- //
518 // TensorBlockScratchAllocator is responsible for allocating temporary buffers
519 // for block evaluation (output or input block materialization). Given that
520 // Eigen expression traversal order is deterministic, all temporary allocations
521 // happen in the same order, and usually have exactly the same size. The
522 // scratch allocator keeps track of all dynamic allocations, and after the
523 // first block evaluation is completed, we should be able to reuse all the
524 // temporary buffers for the next block evaluation.
525 
526 template <typename Device>
527 class TensorBlockScratchAllocator {
528  public:
529  explicit TensorBlockScratchAllocator(const Device& device)
530  : m_device(device), m_allocation_index(0) {}
531 
532  ~TensorBlockScratchAllocator() {
533  for (size_t i = 0; i < m_allocations.size(); ++i) {
534  m_device.deallocate(m_allocations[i].ptr);
535  }
536  }
537 
538  void* allocate(size_t size) {
539  // TODO(ezhulenev): Remove when replaced with inlined vector.
540  if (m_allocations.capacity() == 0) m_allocations.reserve(8);
541 
542  // Check if we already have an existing allocation at the current index.
543  const int num_allocations = static_cast<int>(m_allocations.size());
544  const bool has_allocation = m_allocation_index < num_allocations;
545 
546  // Allocation index can't be larger than the number of allocations.
547  eigen_assert(m_allocation_index <= num_allocations);
548 
549  // If we have an existing allocation, and its size is larger than or equal
550  // to the requested size, we do nothing.
551 
552  // If the current allocation can't fit the requested size, we deallocate it
553  // and replace it with a larger allocation.
554  if (has_allocation && m_allocations[m_allocation_index].size < size) {
555  m_device.deallocate(m_allocations[m_allocation_index].ptr);
556  m_allocations[m_allocation_index].ptr = m_device.allocate(size);
557  m_allocations[m_allocation_index].size = size;
558  }
559 
560  // Make a new allocation if we don't have an existing one.
561  if (!has_allocation) {
562  Allocation allocation;
563  allocation.ptr = m_device.allocate(size);
564  allocation.size = size;
565  m_allocations.push_back(allocation);
566  }
567 
568  eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
569  eigen_assert(m_allocations[m_allocation_index].size >= size);
570 
571  return m_allocations[m_allocation_index++].ptr;
572  }
573 
574  void reset() { m_allocation_index = 0; }
575 
576  private:
577  struct Allocation {
578  void* ptr;
579  size_t size;
580  };
581 
582  const Device& m_device;
583  int m_allocation_index;
584  // TODO(ezhulenev): This should be an inlined vector.
585  std::vector<Allocation> m_allocations;
586 };
587 
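// Illustrative usage sketch (an addition for exposition): allocations are
// replayed in the same order for every block, so after reset() the same
// buffers are handed out again without new device allocations.
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   void* a = scratch.allocate(1024);
//   scratch.reset();
//   void* b = scratch.allocate(1024);  // reuses the buffer behind `a`
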
588 // -------------------------------------------------------------------------- //
589 // TensorBlockKind represents all possible block kinds that can be produced by
590 // the TensorEvaluator::evalBlock function.
591 enum TensorBlockKind {
592  // Tensor block that is a lazy expression that must be assigned to a
593  // destination using TensorBlockAssign.
594  kExpr,
595 
596  // Tensor block that is a view into a memory buffer owned by an underlying
597  // Tensor expression (e.g. it can be a view into a Tensor buffer).
598  kView,
599 
600  // Tensor block that was materialized in a scratch memory buffer, allocated
601  // with TensorBlockScratchAllocator. This block must be copied to a
602  // destination, similar to a block of `kExpr` type.
603  kMaterializedInScratch,
604 
605  // Tensor block that was materialized directly into the final output memory
606  // buffer. For example if the left side of an assignment is a Tensor, we can
607  // directly materialize the block in the destination memory.
608  //
609  // If strides in the output buffer do not match tensor block strides, the
610  // Tensor expression will be invalid, and should not be used by
611  // TensorBlockAssign or for constructing another block expression.
612  kMaterializedInOutput
613 };
614 
615 // -------------------------------------------------------------------------- //
616 // TensorBlockNotImplemented should be used to define the TensorBlock typedef in
617 // TensorEvaluators that do not support block evaluation.
618 
619 class TensorBlockNotImplemented {
620  public:
621  typedef void XprType;
622 };
623 
624 // -------------------------------------------------------------------------- //
625 // XprScalar extracts the Scalar type from an Eigen expression (if the expression
626 // type is not void). It's required to be able to define a lazy block expression
627 // for argument types that do not support block evaluation.
628 
629 template <typename XprType>
630 struct XprScalar {
631  typedef typename XprType::Scalar type;
632 };
633 template <>
634 struct XprScalar<void> {
635  typedef void type;
636 };
637 
638 // -------------------------------------------------------------------------- //
639 // TensorMaterializedBlock is a fully evaluated block of the original tensor,
640 // and XprType is just a TensorMap over the data. This block type is typically
641 // used to materialize blocks of tensor expressions that can't be efficiently
642 // represented as lazy Tensor expressions with fast coeff/packet operations,
643 // e.g. we materialize all broadcasts into evaluated blocks.
644 //
645 // TensorMaterializedBlock does not own its memory buffer: it's either a memory
646 // buffer that backs the original expression (e.g. block is just a view into a
647 // Tensor), or a memory buffer allocated with scratch allocator, and in this
648 // case the scratch allocator will deallocate it at the end of block based
649 // expression execution.
650 //
651 // If the block was evaluated directly into the output buffer, and strides in
652 // the output buffer do not match block strides, the TensorMap expression will
653 // be invalid, and should never be used in block assignment or any other tensor
654 // expression.
655 
656 template <typename Scalar, int NumDims, int Layout,
657  typename IndexType = Eigen::Index>
658 class TensorMaterializedBlock {
659  public:
660  typedef DSizes<IndexType, NumDims> Dimensions;
661  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
662 
663  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
664  const Dimensions& dimensions, bool valid_expr = true)
665  : m_kind(kind),
666  m_data(data),
667  m_dimensions(dimensions),
668  m_expr(m_data, m_dimensions),
669  m_valid_expr(valid_expr) {
670  eigen_assert(m_kind == internal::TensorBlockKind::kView ||
671  m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
672  m_kind == internal::TensorBlockKind::kMaterializedInOutput);
673  }
674 
675  TensorBlockKind kind() const { return m_kind; }
676  // NOTE(ezhulenev): Returning XprType by value like in other block types
677  // causes asan failures. The theory is that XprType::Nested doesn't work
678  // properly for TensorMap.
679  const XprType& expr() const {
680  eigen_assert(m_valid_expr);
681  return m_expr;
682  }
683  const Scalar* data() const { return m_data; }
684  void cleanup() {}
685 
686  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
687 
688  // TensorMaterializedBlock can be backed by different types of storage:
689  //
690  // (1) Contiguous block of memory allocated with scratch allocator.
691  // (2) Contiguous block of memory reused from tensor block descriptor
692  // destination buffer.
693  // (3) Strided block of memory reused from tensor block descriptor
694  // destination buffer.
695  //
696  class Storage {
697  public:
698  Scalar* data() const { return m_data; }
699  const Dimensions& dimensions() const { return m_dimensions; }
700  const Dimensions& strides() const { return m_strides; }
701 
702  TensorMaterializedBlock AsTensorMaterializedBlock() const {
703  return TensorMaterializedBlock(
704  m_materialized_in_output
705  ? internal::TensorBlockKind::kMaterializedInOutput
706  : internal::TensorBlockKind::kMaterializedInScratch,
707  m_data, m_dimensions, !m_strided_storage);
708  }
709 
710  private:
711  friend class TensorMaterializedBlock<Scalar, NumDims, Layout, IndexType>;
712 
713  Storage(Scalar* data, const Dimensions& dimensions,
714  const Dimensions& strides, bool materialized_in_output,
715  bool strided_storage)
716  : m_data(data),
717  m_dimensions(dimensions),
718  m_strides(strides),
719  m_materialized_in_output(materialized_in_output),
720  m_strided_storage(strided_storage) {}
721 
722  Scalar* m_data;
723  Dimensions m_dimensions;
724  Dimensions m_strides;
725  bool m_materialized_in_output;
726  bool m_strided_storage;
727  };
728 
729  // Creates a storage for materialized block either from the block descriptor
730  // destination buffer, or allocates a new buffer with scratch allocator.
731  template <typename TensorBlockScratch>
732  EIGEN_STRONG_INLINE static Storage prepareStorage(
733  TensorBlockDesc& desc, TensorBlockScratch& scratch,
734  bool allow_strided_storage = false) {
735  // Try to reuse destination as an output block buffer.
736  typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
737 
738  if (desc.destination().kind() == DestinationBuffer::kContiguous) {
739  Scalar* buffer = desc.destination().template data<Scalar>();
740  desc.DropDestinationBuffer();
741  return Storage(buffer, desc.dimensions(),
742  internal::strides<Layout>(desc.dimensions()),
743  /*materialized_in_output=*/true,
744  /*strided_storage=*/false);
745 
746  } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
747  allow_strided_storage) {
748  Scalar* buffer = desc.destination().template data<Scalar>();
749  desc.DropDestinationBuffer();
750  return Storage(buffer, desc.dimensions(), desc.destination().strides(),
751  /*materialized_in_output=*/true, /*strided_storage=*/true);
752 
753  } else {
754  void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
755  return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
756  internal::strides<Layout>(desc.dimensions()),
757  /*materialized_in_output=*/false,
758  /*strided_storage=*/false);
759  }
760  }
761 
762  // Creates a materialized block for the given descriptor from a memory buffer.
763  template <typename DataDimensions, typename TensorBlockScratch>
764  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
765  const Scalar* data, const DataDimensions& data_dims,
766  TensorBlockDesc& desc, TensorBlockScratch& scratch) {
767  eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
768 
769  // If the tensor block dimensions cover a contiguous block of the underlying
770  // memory, we can skip block buffer memory allocation, and construct a block
771  // from the existing `data` memory buffer.
772  //
773  // Example: (RowMajor layout)
774  // data_dims: [11, 12, 13, 14]
775  // desc.dimensions(): [1, 1, 3, 14]
776  //
777  // In this case we can construct a TensorBlock starting at
778  // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
779  static const bool is_col_major = Layout == ColMajor;
780 
781  // Find out how many inner dimensions have a matching size.
782  int num_matching_inner_dims = 0;
783  for (int i = 0; i < NumDims; ++i) {
784  int dim = is_col_major ? i : NumDims - i - 1;
785  if (data_dims[dim] != desc.dimensions()[dim]) break;
786  ++num_matching_inner_dims;
787  }
788 
789  // All the outer dimensions must be of size `1`, except a single dimension
790  // before the matching inner dimension (`3` in the example above).
791  bool can_use_direct_access = true;
792  for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
793  int dim = is_col_major ? i : NumDims - i - 1;
794  if (desc.dimension(dim) != 1) {
795  can_use_direct_access = false;
796  break;
797  }
798  }
799 
800  if (can_use_direct_access) {
801  const Scalar* block_start = data + desc.offset();
802  return TensorMaterializedBlock(internal::TensorBlockKind::kView,
803  block_start, desc.dimensions());
804 
805  } else {
806  // Reuse destination buffer or allocate new buffer with scratch allocator.
807  const Storage storage = prepareStorage(desc, scratch);
808 
809  typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
810  TensorBlockIO;
811  typedef typename TensorBlockIO::Dst TensorBlockIODst;
812  typedef typename TensorBlockIO::Src TensorBlockIOSrc;
813 
814  TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
815  data, desc.offset());
816  TensorBlockIODst dst(storage.dimensions(), storage.strides(),
817  storage.data());
818 
819  TensorBlockIO::Copy(dst, src);
820  return storage.AsTensorMaterializedBlock();
821  }
822  }
823 
824  private:
825  TensorBlockKind m_kind;
826  const Scalar* m_data;
827  Dimensions m_dimensions;
828  XprType m_expr;
829  bool m_valid_expr;
830 };
831 
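// Illustrative usage sketch (an addition for exposition): materializing the
// block described by `desc` from an existing buffer `data` with dimensions
// `data_dims` (`data`, `data_dims`, `desc` and `scratch` are assumed to come
// from the surrounding evaluator). The result is either a cheap kView into
// `data` or a copy into destination/scratch memory.
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> TensorBlock;
//   TensorBlock block = TensorBlock::materialize(data, data_dims, desc, scratch);
//   const float* block_data = block.data();  // contiguous block storage
//   block.cleanup();
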
832 // -------------------------------------------------------------------------- //
833 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
834 // functor to the blocks produced by the underlying Tensor expression.
835 
836 template <typename UnaryOp, typename ArgTensorBlock>
837 class TensorCwiseUnaryBlock {
838  static constexpr bool NoArgBlockAccess =
839  internal::is_void<typename ArgTensorBlock::XprType>::value;
840 
841  public:
842  typedef std::conditional_t<
843  NoArgBlockAccess, void,
844  TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >
845  XprType;
846 
847  typedef typename XprScalar<XprType>::type Scalar;
848 
849  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
850  : m_arg_block(arg_block), m_functor(functor) {}
851 
852  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
853 
854  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
855  const Scalar* data() const { return NULL; }
856  void cleanup() { m_arg_block.cleanup(); }
857 
858  private:
859  ArgTensorBlock m_arg_block;
860  UnaryOp m_functor;
861 };
862 
863 // -------------------------------------------------------------------------- //
864 // TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
865 // BinaryOp functor to the blocks produced by the underlying Tensor expressions.
866 
867 template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
868 class TensorCwiseBinaryBlock {
869  static constexpr bool NoArgBlockAccess =
870  internal::is_void<typename LhsTensorBlock::XprType>::value ||
871  internal::is_void<typename RhsTensorBlock::XprType>::value;
872 
873  public:
874  typedef std::conditional_t<
875  NoArgBlockAccess, void,
876  TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
877  const typename RhsTensorBlock::XprType> >
878  XprType;
879 
880  typedef typename XprScalar<XprType>::type Scalar;
881 
882  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
883  const RhsTensorBlock& right_block,
884  const BinaryOp& functor)
885  : m_left_block(left_block),
886  m_right_block(right_block),
887  m_functor(functor) {}
888 
889  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
890 
891  XprType expr() const {
892  return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
893  }
894 
895  const Scalar* data() const { return NULL; }
896 
897  void cleanup() {
898  m_left_block.cleanup();
899  m_right_block.cleanup();
900  }
901 
902  private:
903  LhsTensorBlock m_left_block;
904  RhsTensorBlock m_right_block;
905  BinaryOp m_functor;
906 };
907 
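// Illustrative usage sketch (an addition for exposition): given argument
// blocks `lhs_block` and `rhs_block` of hypothetical types LhsBlock and
// RhsBlock produced by child evaluators, a binary evaluator wraps them into a
// lazy sum block; its expr() can later be written out with
// TensorBlockAssignment.
//
//   typedef TensorCwiseBinaryBlock<scalar_sum_op<float>, LhsBlock, RhsBlock>
//       SumBlock;
//   SumBlock block(lhs_block, rhs_block, scalar_sum_op<float>());
//   auto sum_expr = block.expr();  // lazy TensorCwiseBinaryOp expression
//   block.cleanup();
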
908 // -------------------------------------------------------------------------- //
909 // TensorUnaryExprBlock is a lazy tensor expression block that can construct
910 // an arbitrary tensor expression from a block of the underlying type (this is a
911 // generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
912 
913 template <typename BlockFactory, typename ArgTensorBlock>
914 class TensorUnaryExprBlock {
915  typedef typename ArgTensorBlock::XprType ArgXprType;
916  static constexpr bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
917 
918  public:
919  typedef std::conditional_t<
920  NoArgBlockAccess, void,
921  typename BlockFactory::template XprType<ArgXprType>::type> XprType;
922 
923  typedef typename XprScalar<XprType>::type Scalar;
924 
925  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
926  const BlockFactory& factory)
927  : m_arg_block(arg_block), m_factory(factory) {}
928 
929  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
930  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
931  const Scalar* data() const { return NULL; }
932  void cleanup() { m_arg_block.cleanup(); }
933 
934  private:
935  ArgTensorBlock m_arg_block;
936  BlockFactory m_factory;
937 };
938 
939 // -------------------------------------------------------------------------- //
940 // TensorTernaryExprBlock is a lazy tensor expression block that can construct
941 // an arbitrary tensor expression from three blocks of the underlying type.
942 
943 template <typename BlockFactory, typename Arg1TensorBlock,
944  typename Arg2TensorBlock, typename Arg3TensorBlock>
945 class TensorTernaryExprBlock {
946  typedef typename Arg1TensorBlock::XprType Arg1XprType;
947  typedef typename Arg2TensorBlock::XprType Arg2XprType;
948  typedef typename Arg3TensorBlock::XprType Arg3XprType;
949 
950  static constexpr bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
951  internal::is_void<Arg2XprType>::value ||
952  internal::is_void<Arg3XprType>::value;
953 
954  public:
955  typedef std::conditional_t<
956  NoArgBlockAccess, void,
957  typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
958  Arg3XprType>::type> XprType;
959 
960  typedef typename XprScalar<XprType>::type Scalar;
961 
962  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
963  const Arg2TensorBlock& arg2_block,
964  const Arg3TensorBlock& arg3_block,
965  const BlockFactory& factory)
966  : m_arg1_block(arg1_block),
967  m_arg2_block(arg2_block),
968  m_arg3_block(arg3_block),
969  m_factory(factory) {}
970 
971  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
972  XprType expr() const {
973  return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
974  m_arg3_block.expr());
975  }
976  const Scalar* data() const { return NULL; }
977  void cleanup() {
978  m_arg1_block.cleanup();
979  m_arg2_block.cleanup();
980  m_arg3_block.cleanup();
981  }
982 
983  private:
984  Arg1TensorBlock m_arg1_block;
985  Arg2TensorBlock m_arg2_block;
986  Arg3TensorBlock m_arg3_block;
987  BlockFactory m_factory;
988 };
989 
990 // -------------------------------------------------------------------------- //
991 // StridedLinearBufferCopy provides a method to copy data between two linear
992 // buffers with different strides, with optimized paths for scatter/gather.
993 
994 template <typename Scalar, typename IndexType>
995 class StridedLinearBufferCopy {
996  typedef typename packet_traits<Scalar>::type Packet;
997  typedef typename unpacket_traits<Packet>::half HalfPacket;
998  enum {
999  Vectorizable = packet_traits<Scalar>::Vectorizable,
1000  PacketSize = packet_traits<Scalar>::size,
1001  HalfPacketSize = unpacket_traits<HalfPacket>::size,
1002  HasHalfPacket = static_cast<int>(HalfPacketSize) < static_cast<int>(PacketSize)
1003  };
1004 
1005  public:
1006  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
1007  enum class Kind {
1008  Linear = 0, // src_stride == 1 && dst_stride == 1
1009  Scatter = 1, // src_stride == 1 && dst_stride != 1
1010  FillLinear = 2, // src_stride == 0 && dst_stride == 1
1011  FillScatter = 3, // src_stride == 0 && dst_stride != 1
1012  Gather = 4, // dst_stride == 1
1013  Random = 5 // everything else
1014  };
1015 
1016  struct Dst {
1017  Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
1018 
1019  IndexType offset;
1020  IndexType stride;
1021  Scalar* data;
1022  };
1023 
1024  struct Src {
1025  Src(IndexType o, IndexType s, const Scalar* d)
1026  : offset(o), stride(s), data(d) {}
1027 
1028  IndexType offset;
1029  IndexType stride;
1030  const Scalar* data;
1031  };
1032 
1033  template <typename StridedLinearBufferCopy::Kind kind>
1034  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
1035  const Src& src,
1036  const size_t count) {
1037  Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
1038  src.data);
1039  }
1040 
1041  private:
1042  template <typename StridedLinearBufferCopy::Kind kind>
1043  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1044  const IndexType count, const IndexType dst_offset,
1045  const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
1046  const IndexType src_offset, const IndexType src_stride,
1047  const Scalar* EIGEN_RESTRICT src_data) {
1048  const Scalar* src = &src_data[src_offset];
1049  Scalar* dst = &dst_data[dst_offset];
1050 
1051  if (!Vectorizable) {
1052  for (Index i = 0; i < count; ++i) {
1053  dst[i * dst_stride] = src[i * src_stride];
1054  }
1055  return;
1056  }
1057 
1058  const IndexType vectorized_size = count - PacketSize;
1059  IndexType i = 0;
1060 
1061  if (kind == StridedLinearBufferCopy::Kind::Linear) {
1062  // ******************************************************************** //
1063  // Linear copy from `src` to `dst`.
1064  const IndexType unrolled_size = count - 4 * PacketSize;
1065  eigen_assert(src_stride == 1 && dst_stride == 1);
1066  for (; i <= unrolled_size; i += 4 * PacketSize) {
1067  for (int j = 0; j < 4; ++j) {
1068  Packet p = ploadu<Packet>(src + i + j * PacketSize);
1069  pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1070  }
1071  }
1072  for (; i <= vectorized_size; i += PacketSize) {
1073  Packet p = ploadu<Packet>(src + i);
1074  pstoreu<Scalar, Packet>(dst + i, p);
1075  }
1076  if (HasHalfPacket) {
1077  const IndexType vectorized_half_size = count - HalfPacketSize;
1078  if (i <= vectorized_half_size) {
1079  HalfPacket p = ploadu<HalfPacket>(src + i);
1080  pstoreu<Scalar, HalfPacket>(dst + i, p);
1081  i += HalfPacketSize;
1082  }
1083  }
1084  for (; i < count; ++i) {
1085  dst[i] = src[i];
1086  }
1087  // ******************************************************************** //
1088  } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
1089  // Scatter from `src` to `dst`.
1090  eigen_assert(src_stride == 1 && dst_stride != 1);
1091  for (; i <= vectorized_size; i += PacketSize) {
1092  Packet p = ploadu<Packet>(src + i);
1093  pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1094  }
1095  if (HasHalfPacket) {
1096  const IndexType vectorized_half_size = count - HalfPacketSize;
1097  if (i <= vectorized_half_size) {
1098  HalfPacket p = ploadu<HalfPacket>(src + i);
1099  pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);
1100  i += HalfPacketSize;
1101  }
1102  }
1103  for (; i < count; ++i) {
1104  dst[i * dst_stride] = src[i];
1105  }
1106  // ******************************************************************** //
1107  } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
1108  // Fill `dst` with value at `*src`.
1109  eigen_assert(src_stride == 0 && dst_stride == 1);
1110  const IndexType unrolled_size = count - 4 * PacketSize;
1111  Scalar s = *src;
1112  Packet p = pset1<Packet>(s);
1113  for (; i <= unrolled_size; i += 4 * PacketSize) {
1114  for (int j = 0; j < 4; ++j) {
1115  pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1116  }
1117  }
1118  for (; i <= vectorized_size; i += PacketSize) {
1119  pstoreu<Scalar, Packet>(dst + i, p);
1120  }
1121  if (HasHalfPacket) {
1122  const IndexType vectorized_half_size = count - HalfPacketSize;
1123  if (i <= vectorized_half_size) {
1124  HalfPacket hp = pset1<HalfPacket>(s);
1125  pstoreu<Scalar, HalfPacket>(dst + i, hp);
1126  i += HalfPacketSize;
1127  }
1128  }
1129  for (; i < count; ++i) {
1130  dst[i] = s;
1131  }
1132  // ******************************************************************** //
1133  } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
1134  // Scatter `*src` into `dst`.
1135  eigen_assert(src_stride == 0 && dst_stride != 1);
1136  Scalar s = *src;
1137  Packet p = pset1<Packet>(s);
1138  for (; i <= vectorized_size; i += PacketSize) {
1139  pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1140  }
1141  if (HasHalfPacket) {
1142  const IndexType vectorized_half_size = count - HalfPacketSize;
1143  if (i <= vectorized_half_size) {
1144  HalfPacket hp = pset1<HalfPacket>(s);
1145  pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);
1146  i += HalfPacketSize;
1147  }
1148  }
1149  for (; i < count; ++i) {
1150  dst[i * dst_stride] = s;
1151  }
1152  // ******************************************************************** //
1153  } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
1154  // Gather from `src` into `dst`.
1155  eigen_assert(dst_stride == 1);
1156  for (; i <= vectorized_size; i += PacketSize) {
1157  Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
1158  pstoreu<Scalar, Packet>(dst + i, p);
1159  }
1160  if (HasHalfPacket) {
1161  const IndexType vectorized_half_size = count - HalfPacketSize;
1162  if (i <= vectorized_half_size) {
1163  HalfPacket p =
1164  pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);
1165  pstoreu<Scalar, HalfPacket>(dst + i, p);
1166  i += HalfPacketSize;
1167  }
1168  }
1169  for (; i < count; ++i) {
1170  dst[i] = src[i * src_stride];
1171  }
1172  // ******************************************************************** //
1173  } else if (kind == StridedLinearBufferCopy::Kind::Random) {
1174  // Random.
1175  for (; i < count; ++i) {
1176  dst[i * dst_stride] = src[i * src_stride];
1177  }
1178  } else {
1179  eigen_assert(false);
1180  }
1181  }
1182 };
1183 
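// Illustrative usage sketch (an addition for exposition): copying 100
// contiguous floats from `src_buf` to `dst_buf` (hypothetical buffers) with
// the statically selected Linear kind.
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> LinCopy;
//   LinCopy::Run<LinCopy::Kind::Linear>(
//       LinCopy::Dst(/*offset=*/0, /*stride=*/1, dst_buf),
//       LinCopy::Src(/*offset=*/0, /*stride=*/1, src_buf),
//       /*count=*/100);
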
1184 // -------------------------------------------------------------------------- //
1185 // TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
1186 // block. It's possible to specify a src->dst dimension mapping for the copy.
1187 // Dimensions of `dst` specify how many elements have to be copied; for the
1188 // `src` we only need to know the strides to navigate the source memory buffer.
1189 
1190 template <typename Scalar, typename IndexType, int NumDims, int Layout>
1191 class TensorBlockIO {
1192  static constexpr bool IsColMajor = (Layout == ColMajor);
1193 
1194  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
1195 
1196  public:
1197  typedef DSizes<IndexType, NumDims> Dimensions;
1198  typedef DSizes<int, NumDims> DimensionsMap;
1199 
1200  struct Dst {
1201  Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
1202  IndexType dst_offset = 0)
1203  : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
1204 
1205  Dimensions dims;
1206  Dimensions strides;
1207  Scalar* data;
1208  IndexType offset;
1209  };
1210 
1211  struct Src {
1212  Src(const Dimensions& src_strides, const Scalar* src,
1213  IndexType src_offset = 0)
1214  : strides(src_strides), data(src), offset(src_offset) {}
1215 
1216  Dimensions strides;
1217  const Scalar* data;
1218  IndexType offset;
1219  };
1220 
1221  // Copies data to `dst` from `src`, using provided dimensions mapping:
1222  //
1223  // src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
1224  //
1225  // Returns the number of copied elements.
1226  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
1227  const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
1228  // Copy single scalar value from `src` to `dst`.
1229  if (NumDims == 0) {
1230  *(dst.data + dst.offset) = *(src.data + src.offset);
1231  return 1;
1232  }
1233 
1234  // Both `dst` and `src` must have a contiguous innermost dimension. We also
1235  // accept the special case with stride '0', because it's used as a trick to
1236  // implement broadcasting.
1237  {
1238  int inner_dim = IsColMajor ? 0 : NumDims - 1;
1239  EIGEN_UNUSED_VARIABLE(inner_dim);
1240  eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
1241  eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
1242  }
1243 
1244  // Give a shorter name to `dst_to_src_dim_map`.
1245  const DimensionsMap& dim_map = dst_to_src_dim_map;
1246 
1247  // Do not squeeze reordered inner dimensions.
1248  int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
1249 
1250  // NOTE: We find the innermost dimension (contiguous in memory) in the dst
1251  // block, and we write data linearly into that dimension, reading it from
1252  // the src. If dimensions are reordered, we might end up reading data from
1253  // the src with `stride != 1`.
1254  //
1255  // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
1256  // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
1257 
1258  // Find the innermost dimension in the dst whose size is not 1. This is the
1259  // effective inner dim.
1260  int num_size_one_inner_dims = 0;
1261  for (int i = 0; i < num_squeezable_dims; ++i) {
1262  const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1263  if (dst.dims[dst_dim] != 1) break;
1264  num_size_one_inner_dims++;
1265  }
1266 
1267  // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
1268  if (num_size_one_inner_dims == NumDims) {
1269  *(dst.data + dst.offset) = *(src.data + src.offset);
1270  return 1;
1271  }
1272 
1273  // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
1274  const int dst_stride1_dim = IsColMajor
1275  ? num_size_one_inner_dims
1276  : NumDims - num_size_one_inner_dims - 1;
1277 
1278  // Dimension in the src that corresponds to the dst innermost dimension.
1279  const int src_dim_for_dst_stride1_dim =
1280  NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
1281 
1282  // Size of the innermost dimension (length of contiguous blocks of memory).
1283  IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
1284 
1285  // Squeeze multiple inner dims into one if they are contiguous in `dst` and
1286  // `src` memory, so we can do less linear copy calls.
1287  for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
1288  const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1289  const IndexType dst_stride = dst.strides[dst_dim];
1290  const IndexType src_stride = src.strides[dim_map[dst_dim]];
1291  if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
1292  dst_inner_dim_size *= dst.dims[dst_dim];
1293  ++num_size_one_inner_dims;
1294  } else {
1295  break;
1296  }
1297  }
1298 
1299  // Setup strides to read data from `src` and write to `dst`.
1300  IndexType input_offset = src.offset;
1301  IndexType output_offset = dst.offset;
1302  IndexType input_stride =
1303  NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
1304  IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
1305 
1306  const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
1307  array<BlockIteratorState, at_least_1_dim> it;
1308 
1309  // Initialize block iterator state. Squeeze away any dimension of size 1.
1310  int idx = 0; // currently initialized iterator state index
1311  for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
1312  const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
1313  if (dst.dims[dst_dim] == 1) continue;
1314 
1315  it[idx].size = dst.dims[dst_dim];
1316  it[idx].input_stride = src.strides[dim_map[dst_dim]];
1317  it[idx].output_stride = dst.strides[dst_dim];
1318 
1319  it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
1320  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1321 
1322  idx++;
1323  }
1324 
1325  // Iterate copying data from src to dst.
1326  const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
1327 
1328 #define COPY_INNER_DIM(KIND) \
1329  IndexType num_copied = 0; \
1330  for (num_copied = 0; num_copied < block_total_size; \
1331  num_copied += dst_inner_dim_size) { \
1332  LinCopy::template Run<KIND>( \
1333  typename LinCopy::Dst(output_offset, output_stride, dst.data), \
1334  typename LinCopy::Src(input_offset, input_stride, src.data), \
1335  dst_inner_dim_size); \
1336  \
1337  for (int j = 0; j < idx; ++j) { \
1338  if (++it[j].count < it[j].size) { \
1339  input_offset += it[j].input_stride; \
1340  output_offset += it[j].output_stride; \
1341  break; \
1342  } \
1343  it[j].count = 0; \
1344  input_offset -= it[j].input_span; \
1345  output_offset -= it[j].output_span; \
1346  } \
1347  } \
1348  return num_copied;
1349 
1350  if (input_stride == 1 && output_stride == 1) {
1351  COPY_INNER_DIM(LinCopy::Kind::Linear);
1352  } else if (input_stride == 1 && output_stride != 1) {
1353  COPY_INNER_DIM(LinCopy::Kind::Scatter);
1354  } else if (input_stride == 0 && output_stride == 1) {
1355  COPY_INNER_DIM(LinCopy::Kind::FillLinear);
1356  } else if (input_stride == 0 && output_stride != 1) {
1357  COPY_INNER_DIM(LinCopy::Kind::FillScatter);
1358  } else if (output_stride == 1) {
1359  COPY_INNER_DIM(LinCopy::Kind::Gather);
1360  } else {
1361  COPY_INNER_DIM(LinCopy::Kind::Random);
1362  }
1363 
1364 #undef COPY_INNER_DIM
1365  }
1366 
1367  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
1368  // the number of copied elements.
1369  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
1370  const Src& src) {
1371  DimensionsMap dst_to_src_map;
1372  for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
1373  return Copy(dst, src, dst_to_src_map);
1374  }
1375 
1376  private:
1377  struct BlockIteratorState {
1378  BlockIteratorState()
1379  : size(0),
1380  count(0),
1381  input_stride(0),
1382  output_stride(0),
1383  input_span(0),
1384  output_span(0) {}
1385 
1386  IndexType size;
1387  IndexType count;
1388  IndexType input_stride;
1389  IndexType output_stride;
1390  IndexType input_span;
1391  IndexType output_span;
1392  };
1393 
1394  // Compute how many inner dimensions it's allowed to squeeze when doing IO
1395 // between two tensor blocks. It's safe to squeeze inner dimensions only
1396  // if they are not reordered.
1397  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
1398  int num_squeezable_dims = 0;
1399  for (int i = 0; i < NumDims; ++i) {
1400  const int dim = IsColMajor ? i : NumDims - i - 1;
1401  if (dim_map[dim] != dim) break;
1402  num_squeezable_dims++;
1403  }
1404  return num_squeezable_dims;
1405  }
1406 };
1407 
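// Illustrative usage sketch (an addition for exposition): copying a 2x3
// column-major float block between two buffers with different strides;
// `dst_buf` and `src_buf` are hypothetical pointers with enough room for the
// strided layouts.
//
//   typedef TensorBlockIO<float, Eigen::Index, 2, ColMajor> BlockIO;
//   BlockIO::Dst dst(BlockIO::Dimensions(2, 3),   // block sizes to copy
//                    BlockIO::Dimensions(1, 2),   // dst strides (contiguous)
//                    dst_buf);
//   BlockIO::Src src(BlockIO::Dimensions(1, 4),   // src strides (leading dim 4)
//                    src_buf);
//   BlockIO::Copy(dst, src);  // returns 6, the number of copied coefficients
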
1408 // -------------------------------------------------------------------------- //
1409 // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
1410 // a Tensor block defined by `desc`, backed by a memory buffer at `target`.
1411 //
1412 // Currently there is no way to write from a Tensor expression to a block of
1413 // memory, if dimensions are reordered. If you need to do that, you should
1414 // materialize a Tensor block expression into a memory buffer, and then use
1415 // TensorBlockIO to copy data between two memory buffers with a custom
1416 // `target->src` dimension map (see definition above).
1417 //
1418 // Also currently the innermost dimension of `target` must have a stride '1'
1419 // (contiguous in memory). This restriction could be lifted with a `pscatter`,
1420 // but in practice it's never needed, and there is a similar TensorBlockIO
1421 // workaround for that.
1422 //
1423 // TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
1424 // where `src` is a tensor expression. Explore if it is possible to rewrite IO
1425 // to use expressions instead of pointers, and after that TensorBlockAssignment
1426 // will become an alias to IO.
1427 template <typename Scalar, int NumDims, typename TensorBlockExpr,
1428  typename IndexType = Eigen::Index>
1429 class TensorBlockAssignment {
1430  // We will use coeff/packet path to evaluate block expressions.
1431  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
1432  TensorBlockEvaluator;
1433 
1434  typedef DSizes<IndexType, NumDims> Dimensions;
1435 
1436  enum {
1437  Vectorizable = packet_traits<Scalar>::Vectorizable,
1438  PacketSize = packet_traits<Scalar>::size
1439  };
1440 
1441  template <bool Vectorizable, typename Evaluator>
1442  struct InnerDimAssign {
1443  EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1444  const Evaluator& eval,
1445  IndexType eval_offset) {
1446  for (IndexType i = 0; i < count; ++i) {
1447  target[i] = eval.coeff(eval_offset + i);
1448  }
1449  }
1450  };
1451 
1452  template <typename Evaluator>
1453  struct InnerDimAssign<true, Evaluator> {
1454  EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1455  const Evaluator& eval,
1456  IndexType eval_offset) {
1457  typedef typename packet_traits<Scalar>::type Packet;
1458 
1459  const IndexType unrolled_size = count - 4 * PacketSize;
1460  const IndexType vectorized_size = count - PacketSize;
1461  IndexType i = 0;
1462 
1463  for (; i <= unrolled_size; i += 4 * PacketSize) {
1464  for (int j = 0; j < 4; ++j) {
1465  const IndexType idx = eval_offset + i + j * PacketSize;
1466  Packet p = eval.template packet<Unaligned>(idx);
1467  pstoreu<Scalar>(target + i + j * PacketSize, p);
1468  }
1469  }
1470 
1471  for (; i <= vectorized_size; i += PacketSize) {
1472  Packet p = eval.template packet<Unaligned>(eval_offset + i);
1473  pstoreu<Scalar>(target + i, p);
1474  }
1475 
1476  for (; i < count; ++i) {
1477  target[i] = eval.coeff(eval_offset + i);
1478  }
1479  }
1480  };
1481 
1482  public:
1483  struct Target {
1484  Target(const Dimensions& target_dims, const Dimensions& target_strides,
1485  Scalar* target_data, IndexType target_offset = 0)
1486  : dims(target_dims),
1487  strides(target_strides),
1488  data(target_data),
1489  offset(target_offset) {}
1490 
1491  Dimensions dims;
1492  Dimensions strides;
1493  Scalar* data;
1494  IndexType offset;
1495  };
1496 
1497  static Target target(const Dimensions& target_dims,
1498  const Dimensions& target_strides, Scalar* target_data,
1499  IndexType target_offset = 0) {
1500  return Target(target_dims, target_strides, target_data, target_offset);
1501  }
1502 
1503  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
1504  static Target target(
1505  const DSizes<TargetDimsIndexType, NumDims>& target_dims,
1506  const DSizes<TargetStridesIndexType, NumDims>& target_strides,
1507  Scalar* target_data, IndexType target_offset = 0) {
1508  // DSizes constructor will do index type promotion if it's safe.
1509  return Target(Dimensions(target_dims), Dimensions(target_strides),
1510  target_data, target_offset);
1511  }
1512 
1513  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1514  const Target& target, const TensorBlockExpr& expr) {
1515  // Prepare evaluator for block expression.
1516  DefaultDevice default_device;
1517  TensorBlockEvaluator eval(expr, default_device);
1518 
1519  // Tensor block expression dimension should match destination dimensions.
1520  eigen_assert(dimensions_match(target.dims, eval.dimensions()));
1521 
1522  static const int Layout = TensorBlockEvaluator::Layout;
1523  static const bool is_col_major = Layout == ColMajor;
1524 
1525  // Initialize output inner dimension size based on a layout.
1526  const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
1527  const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
1528  IndexType output_inner_dim_size = target.dims[inner_dim_idx];
1529 
1530  // Target inner dimension stride must be '1'.
1531  eigen_assert(target.strides[inner_dim_idx] == 1);
1532 
1533  // Squeeze multiple inner dims into one if they are contiguous in `target`.
1534  IndexType num_squeezed_dims = 0;
1535  for (Index i = 1; i < NumDims; ++i) {
1536  const Index dim = is_col_major ? i : NumDims - i - 1;
1537  const IndexType target_stride = target.strides[dim];
1538 
1539  if (output_inner_dim_size == target_stride) {
1540  output_inner_dim_size *= target.dims[dim];
1541  num_squeezed_dims++;
1542  } else {
1543  break;
1544  }
1545  }
1546 
1547  // Initialize output block iterator state. Dimensions in this array are
1548  // always in inner_most -> outer_most order (col major layout).
1549  array<BlockIteratorState, NumDims> it;
1550 
1551  int idx = 0; // currently initialized iterator state index
1552  for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
1553  const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
1554 
1555  it[idx].count = 0;
1556  it[idx].size = target.dims[dim];
1557  it[idx].output_stride = target.strides[dim];
1558  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1559  idx++;
1560  }
1561 
1562  // We read block expression from the beginning, and start writing data to
1563  // `target` at given offset.
1564  IndexType input_offset = 0;
1565  IndexType output_offset = target.offset;
1566 
1567  // Iterate copying data from `eval` to `target`.
1568  for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
1569  // Assign to `target` at current offset.
1570  InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
1571  TensorBlockEvaluator>::Run(target.data + output_offset,
1572  output_inner_dim_size, eval,
1573  input_offset);
1574 
1575  // Move input offset forward by the number of assigned coefficients.
1576  input_offset += output_inner_dim_size;
1577 
1578  // Update index.
1579  for (int j = 0; j < idx; ++j) {
1580  if (++it[j].count < it[j].size) {
1581  output_offset += it[j].output_stride;
1582  break;
1583  }
1584  it[j].count = 0;
1585  output_offset -= it[j].output_span;
1586  }
1587  }
1588  }
1589 
1590  private:
1591  struct BlockIteratorState {
1592  BlockIteratorState()
1593  : count(0), size(0), output_stride(0), output_span(0) {}
1594 
1595  IndexType count;
1596  IndexType size;
1597  IndexType output_stride;
1598  IndexType output_span;
1599  };
1600 };
1601 
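// Illustrative usage sketch (an addition for exposition): evaluating a lazy
// block expression `block_expr` of hypothetical type `BlockExpr` (e.g.
// obtained from TensorCwiseBinaryBlock::expr()) into a contiguous
// column-major 2x3 float buffer `out`.
//
//   typedef TensorBlockAssignment<float, 2, BlockExpr> BlockAssign;
//   BlockAssign::Run(
//       BlockAssign::target(DSizes<Eigen::Index, 2>(2, 3),   // target dims
//                           DSizes<Eigen::Index, 2>(1, 2),   // target strides
//                           out),
//       block_expr);
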
1602 // -------------------------------------------------------------------------- //
1603 
1604 } // namespace internal
1605 } // namespace Eigen
1606 
1607 #endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H