TensorExecutor.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
12 
13 #include "./InternalHeaderCheck.h"
14 
15 namespace Eigen {
16 
31 namespace internal {
32 
41 // TODO(ezhulenev): Add specializations for all other types of Tensor ops.
42 
43 template<typename Expression>
44 struct ExpressionHasTensorBroadcastingOp {
45  enum { value = false };
46 };
47 
48 template<typename LhsXprType, typename RhsXprType>
49 struct ExpressionHasTensorBroadcastingOp<
50  const TensorAssignOp<LhsXprType, RhsXprType> > {
51  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
52 };
53 
54 template<typename UnaryOp, typename XprType>
55 struct ExpressionHasTensorBroadcastingOp<
56  const TensorCwiseUnaryOp<UnaryOp, XprType> > {
57  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
58 };
59 
60 template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
61 struct ExpressionHasTensorBroadcastingOp<
62  const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
63  enum {
64  value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
65  ExpressionHasTensorBroadcastingOp<RhsXprType>::value
66  };
67 };
68 
69 template<typename Broadcast, typename XprType>
70 struct ExpressionHasTensorBroadcastingOp<
71  const TensorBroadcastingOp<Broadcast, XprType> > {
72  enum { value = true };
73 };
74 
75 // -------------------------------------------------------------------------- //
76 
81 template <typename Expression, typename Device, bool Vectorizable,
82  TiledEvaluation Tiling>
83 class TensorExecutor {
84  public:
85  typedef typename Expression::Index StorageIndex;
86 
87  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
88  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
89  // violation. If this template is instantiated with a non-default device, it
90  // means that this header file was included without defining
91  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
92  static_assert(std::is_same<Device, DefaultDevice>::value,
93  "Default executor instantiated with non-default device. "
94  "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
95  "EIGEN_USE_SYCL before including Eigen headers.");
96 
97  static EIGEN_STRONG_INLINE void run(const Expression& expr,
98  const Device& device = DefaultDevice()) {
99  TensorEvaluator<Expression, Device> evaluator(expr, device);
100  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
101  if (needs_assign) {
102  const StorageIndex size = array_prod(evaluator.dimensions());
103  for (StorageIndex i = 0; i < size; ++i) {
104  evaluator.evalScalar(i);
105  }
106  }
107  evaluator.cleanup();
108  }
109 };
110 
115 template <typename Expression, typename Device, typename DoneCallback,
116  bool Vectorizable, TiledEvaluation Tiling>
117 class TensorAsyncExecutor {};
118 
122 template <typename Expression>
123 class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
124  /*Tiling=*/TiledEvaluation::Off> {
125  public:
126  typedef typename Expression::Index StorageIndex;
127 
128  static EIGEN_STRONG_INLINE void run(
129  const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
130  TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
131  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
132  if (needs_assign) {
133  const StorageIndex size = array_prod(evaluator.dimensions());
134  const int PacketSize = unpacket_traits<typename TensorEvaluator<
135  Expression, DefaultDevice>::PacketReturnType>::size;
136 
137  // Give compiler a strong possibility to unroll the loop. But don't insist
138  // on unrolling, because if the function is expensive compiler should not
139  // unroll the loop at the expense of inlining.
140  const StorageIndex UnrolledSize =
141  (size / (4 * PacketSize)) * 4 * PacketSize;
142  for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
143  for (StorageIndex j = 0; j < 4; j++) {
144  evaluator.evalPacket(i + j * PacketSize);
145  }
146  }
147  const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
148  for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
149  evaluator.evalPacket(i);
150  }
151  for (StorageIndex i = VectorizedSize; i < size; ++i) {
152  evaluator.evalScalar(i);
153  }
154  }
155  evaluator.cleanup();
156  }
157 };
158 
163 template <typename Expression, bool Vectorizable>
164 class TensorExecutor<Expression, DefaultDevice, Vectorizable,
165  /*Tiling=*/TiledEvaluation::On> {
166  public:
167  typedef typename traits<Expression>::Scalar Scalar;
168  typedef std::remove_const_t<Scalar> ScalarNoConst;
169 
170  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
171  typedef typename traits<Expression>::Index StorageIndex;
172 
173  static constexpr int NumDims = traits<Expression>::NumDimensions;
174 
176  static EIGEN_STRONG_INLINE void run(const Expression& expr,
177  const DefaultDevice& device = DefaultDevice()) {
178  typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
179  TensorBlockMapper;
180 
181  typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
182  TensorBlockDesc;
183  typedef internal::TensorBlockScratchAllocator<DefaultDevice>
184  TensorBlockScratch;
185 
186  Evaluator evaluator(expr, device);
187 
188  // TODO(ezhulenev): Do not use tiling for small tensors?
189  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
190 
191  if (needs_assign) {
192  // Query expression tree for desired block size/shape.
193  const TensorBlockResourceRequirements requirements =
194  evaluator.getResourceRequirements();
195 
196  const TensorBlockMapper block_mapper(
197  typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
198  requirements);
199 
200  // Share scratch memory allocator between all blocks.
201  TensorBlockScratch scratch(device);
202 
203  const StorageIndex total_block_count = block_mapper.blockCount();
204  for (StorageIndex i = 0; i < total_block_count; ++i) {
205  TensorBlockDesc desc = block_mapper.blockDescriptor(i);
206  evaluator.evalBlock(desc, scratch);
207  scratch.reset();
208  }
209  }
210  evaluator.cleanup();
211  }
212 };
213 
225 #ifdef EIGEN_USE_THREADS
226 
227 template <typename TensorBlockMapper>
228 struct TensorExecutorTilingContext {
229  TensorExecutorTilingContext() = default;
230  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
231  const TensorOpCost& b_cost, size_t b_aligned_size)
232  : block_mapper(b_mapper),
233  cost(b_cost),
234  aligned_blocksize(b_aligned_size) {}
235 
236  TensorBlockMapper block_mapper; // navigate through blocks
237  TensorOpCost cost; // cost of computing a single block
238  size_t aligned_blocksize; // block size after memory alignment
239 };
240 
241 // Computes a block evaluation parameters, and allocates temporary memory buffer
242 // for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
243 template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
244 TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
245  const Evaluator& evaluator) {
246  // Query expression tree for desired block size/shape.
247  TensorBlockResourceRequirements requirements =
248  evaluator.getResourceRequirements();
249 
250  // Update target block size based on cost model.
252  1, requirements.cost_per_coeff);
253  requirements.size = static_cast<size_t>(1.0 / taskSize);
254 
255  TensorBlockMapper block_mapper(
256  typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
257  requirements);
258 
259  size_t block_size = block_mapper.blockTotalSize();
260  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
261  const size_t aligned_blocksize =
262  align *
263  divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
264 
265  return {block_mapper, requirements.cost_per_coeff * block_size,
266  aligned_blocksize};
267 }
268 
269 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
270 struct EvalRange {
271  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
272  const StorageIndex lastIdx) {
273  Evaluator evaluator = *evaluator_in;
274  eigen_assert(lastIdx >= firstIdx);
275  for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
276  evaluator.evalScalar(i);
277  }
278  }
279 
280  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
281 };
282 
283 template <typename Evaluator, typename StorageIndex>
284 struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
285  static constexpr int PacketSize =
286  unpacket_traits<typename Evaluator::PacketReturnType>::size;
287 
288  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
289  const StorageIndex lastIdx) {
290  Evaluator evaluator = *evaluator_in;
291  eigen_assert(lastIdx >= firstIdx);
292  StorageIndex i = firstIdx;
293  if (lastIdx - firstIdx >= PacketSize) {
294  eigen_assert(firstIdx % PacketSize == 0);
295  StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
296  // Give compiler a strong possibility to unroll the loop. But don't insist
297  // on unrolling, because if the function is expensive compiler should not
298  // unroll the loop at the expense of inlining.
299  for (; i <= last_chunk_offset; i += 4 * PacketSize) {
300  for (StorageIndex j = 0; j < 4; j++) {
301  evaluator.evalPacket(i + j * PacketSize);
302  }
303  }
304  last_chunk_offset = lastIdx - PacketSize;
305  for (; i <= last_chunk_offset; i += PacketSize) {
306  evaluator.evalPacket(i);
307  }
308  }
309  for (; i < lastIdx; ++i) {
310  evaluator.evalScalar(i);
311  }
312  }
313 
314  static StorageIndex alignBlockSize(StorageIndex size) {
315  // Align block size to packet size and account for unrolling in run above.
316  if (size >= 16 * PacketSize) {
317  return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
318  }
319  // Aligning to 4 * PacketSize would increase block size by more than 25%.
320  return (size + PacketSize - 1) & ~(PacketSize - 1);
321  }
322 };
323 
324 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
325 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
326  public:
327  typedef typename Expression::Index StorageIndex;
328 
329  static EIGEN_STRONG_INLINE void run(const Expression& expr,
330  const ThreadPoolDevice& device) {
331  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
332  typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
333 
334  Evaluator evaluator(expr, device);
335  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
336  if (needs_assign) {
337  const StorageIndex size = array_prod(evaluator.dimensions());
338  device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
339  EvalRange::alignBlockSize,
340  [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
341  EvalRange::run(&evaluator, firstIdx, lastIdx);
342  });
343  }
344  evaluator.cleanup();
345  }
346 };
347 
348 template <typename Expression, bool Vectorizable>
349 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
350  /*Tiling=*/TiledEvaluation::On> {
351  public:
352  typedef typename traits<Expression>::Index IndexType;
353  typedef typename traits<Expression>::Scalar Scalar;
354  typedef std::remove_const_t<Scalar> ScalarNoConst;
355 
356  static constexpr int NumDims = traits<Expression>::NumDimensions;
357 
358  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
359  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
360  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
361 
362  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
363  TensorBlockDesc;
364  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
365  TensorBlockScratch;
366 
367  static EIGEN_STRONG_INLINE void run(const Expression& expr,
368  const ThreadPoolDevice& device) {
369  Evaluator evaluator(expr, device);
370 
371  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
372  if (needs_assign) {
373  const TilingContext tiling =
374  internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
375  Vectorizable>(evaluator);
376 
377  auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
378  IndexType lastBlockIdx) {
379  TensorBlockScratch scratch(device);
380 
381  for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
382  ++block_idx) {
383  TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
384  evaluator.evalBlock(desc, scratch);
385  scratch.reset();
386  }
387  };
388 
389  // Evaluate small expressions directly as a single block.
390  if (tiling.block_mapper.blockCount() == 1) {
391  TensorBlockScratch scratch(device);
392  TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
393  evaluator.evalBlock(desc, scratch);
394  } else {
395  device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
396  eval_block);
397  }
398  }
399  evaluator.cleanup();
400  }
401 };
402 
403 template <typename Expression, typename DoneCallback, bool Vectorizable,
404  TiledEvaluation Tiling>
405 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
406  Vectorizable, Tiling> {
407  public:
408  typedef typename Expression::Index StorageIndex;
409  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
410 
411  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
412  const ThreadPoolDevice& device,
413  DoneCallback done) {
414  TensorAsyncExecutorContext* const ctx =
415  new TensorAsyncExecutorContext(expr, device, std::move(done));
416 
417  const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
418  if (!need_assign) {
419  delete ctx;
420  return;
421  }
422 
423  typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
424  const StorageIndex size = array_prod(ctx->evaluator.dimensions());
425  device.parallelForAsync(
426  size, ctx->evaluator.costPerCoeff(Vectorizable),
427  EvalRange::alignBlockSize,
428  [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
429  EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
430  },
431  [ctx]() { delete ctx; });
432  };
433 
434  ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
435  }
436 
437  private:
438  struct TensorAsyncExecutorContext {
439  TensorAsyncExecutorContext(const Expression& expr,
440  const ThreadPoolDevice& thread_pool,
441  DoneCallback done)
442  : evaluator(expr, thread_pool), on_done(std::move(done)) {}
443 
444  ~TensorAsyncExecutorContext() {
445  evaluator.cleanup();
446  on_done();
447  }
448 
449  Evaluator evaluator;
450 
451  private:
452  DoneCallback on_done;
453  };
454 };
455 
456 template <typename Expression, typename DoneCallback, bool Vectorizable>
457 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
458  Vectorizable, /*Tileable*/ TiledEvaluation::On> {
459  public:
460  typedef typename traits<Expression>::Index IndexType;
461  typedef typename traits<Expression>::Scalar Scalar;
462  typedef std::remove_const_t<Scalar> ScalarNoConst;
463 
464  static constexpr int NumDims = traits<Expression>::NumDimensions;
465 
466  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
467  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
468  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
469 
470  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
471  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
472  TensorBlockScratch;
473 
474  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
475  const ThreadPoolDevice& device,
476  DoneCallback done) {
477 
478  TensorAsyncExecutorContext* const ctx =
479  new TensorAsyncExecutorContext(expr, device, std::move(done));
480 
481  const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
482  if (!need_assign) {
483  delete ctx;
484  return;
485  }
486 
487  ctx->tiling = internal::GetTensorExecutorTilingContext<
488  Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
489 
490  auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
491  TensorBlockScratch scratch(ctx->device);
492 
493  for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
494  ++block_idx) {
495  TensorBlockDesc desc =
496  ctx->tiling.block_mapper.blockDescriptor(block_idx);
497  ctx->evaluator.evalBlock(desc, scratch);
498  scratch.reset();
499  }
500  };
501 
502  // Evaluate small expressions directly as a single block.
503  if (ctx->tiling.block_mapper.blockCount() == 1) {
504  TensorBlockScratch scratch(ctx->device);
505  TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
506  ctx->evaluator.evalBlock(desc, scratch);
507  delete ctx;
508  } else {
509  ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
510  ctx->tiling.cost, eval_block,
511  [ctx]() { delete ctx; });
512  }
513  };
514 
515  ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
516  }
517 
518  private:
519  struct TensorAsyncExecutorContext {
520  TensorAsyncExecutorContext(const Expression& expr,
521  const ThreadPoolDevice& thread_pool,
522  DoneCallback done)
523  : device(thread_pool),
524  evaluator(expr, thread_pool),
525  on_done(std::move(done)) {}
526 
527  ~TensorAsyncExecutorContext() {
528  evaluator.cleanup();
529  on_done();
530  }
531 
532  const ThreadPoolDevice& device;
533  Evaluator evaluator;
534  TilingContext tiling;
535 
536  private:
537  DoneCallback on_done;
538  };
539 };
540 
541 #endif // EIGEN_USE_THREADS
542 
543 // GPU: the evaluation of the expression is offloaded to a GPU.
544 #if defined(EIGEN_USE_GPU)
545 
546 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
547 class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
548  public:
549  typedef typename Expression::Index StorageIndex;
550  static void run(const Expression& expr, const GpuDevice& device);
551 };
552 
553 #if defined(EIGEN_GPUCC)
554 // Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
555 template <typename Index>
556 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs,
557  Index rhs) {
558  const Index highest = NumTraits<Index>::highest();
559  const Index lowest = NumTraits<Index>::lowest();
560  if (lhs > 0 && rhs > 0) {
561  return lhs > highest - rhs ? 1 : 0;
562  } else if (lhs < 0 && rhs < 0) {
563  return lhs < lowest - rhs ? -1 : 0;
564  } else {
565  return 0;
566  }
567 }
568 
569 // Returns lhs + rhs, saturating to the highest/lowest representable value on
570 // overflow/underflow respectively.
571 template <typename Index>
572 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
573  const Index highest = NumTraits<Index>::highest();
574  const Index lowest = NumTraits<Index>::lowest();
575  int overflow = sum_will_overflow(lhs, rhs);
576  return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
577 }
578 
579 // A functor that adds step_size to a given index, saturating to avoid
580 // overflow/underflow. If overflow/underflow is not possible, regular addition
581 // is used (for efficiency).
582 template <typename Index>
583 struct SafeStep {
584  // lastIdx is one past the end of the possible indexes.
585  // step_size is the value that will be added to the given index when the
586  // functor is called.
587  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
588  : can_overflow_(sum_will_overflow(lastIdx, step_size)),
589  step_size_(step_size) {}
590 
591  // Adds step_size to index, saturating on overflow (if overflow is possible).
593  return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
594  }
595 
596  private:
597  const bool can_overflow_;
598  const Index step_size_;
599 };
600 
601 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
602 struct EigenMetaKernelEval {
604  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
605  SafeStep<StorageIndex> safe_step(lastIdx, step_size);
606  for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
607  eval.evalScalar(i);
608  }
609  }
610 };
611 
612 template <typename Evaluator, typename StorageIndex>
613 struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
615  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
616  const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
617  const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
618  const StorageIndex vectorized_step_size = step_size * PacketSize;
619 
620  SafeStep<StorageIndex> safe_vectorized_step(vectorized_size,
621  vectorized_step_size);
622  // Use the vector path
623  for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
624  i = safe_vectorized_step(i)) {
625  eval.evalPacket(i);
626  }
627  SafeStep<StorageIndex> safe_step(lastIdx, step_size);
628  for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
629  i = safe_step(i)) {
630  eval.evalScalar(i);
631  }
632  }
633 };
634 
635 template <typename Evaluator, typename StorageIndex>
636 __global__ void
637 __launch_bounds__(1024)
638 EigenMetaKernel(Evaluator eval, StorageIndex size) {
639 
640  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
641  const StorageIndex step_size = blockDim.x * gridDim.x;
642 
643  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
644  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
645 }
646 
647 /*static*/
648 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
650  const Expression& expr, const GpuDevice& device) {
651  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
652  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
653  if (needs_assign) {
654 
655  const int block_size = device.maxGpuThreadsPerBlock();
656  const int max_blocks =
657  numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
658  device.maxGpuThreadsPerMultiProcessor(),
659  NumTraits<StorageIndex>::highest()) /
660  block_size;
661  const StorageIndex size = array_prod(evaluator.dimensions());
662  // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
663  const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
664 
665  LAUNCH_GPU_KERNEL(
666  (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
667  num_blocks, block_size, 0, device, evaluator, size);
668  }
669  evaluator.cleanup();
670 }
671 
672 #endif // EIGEN_GPUCC
673 #endif // EIGEN_USE_GPU
674 
675 // SYCL Executor policy
676 #ifdef EIGEN_USE_SYCL
677 
678 template <typename Evaluator>
679 struct ExecExprFunctorKernel {
680  typedef typename Evaluator::Index Index;
681  Evaluator evaluator;
682  const Index range;
683  template <typename Scratch>
684  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
685  const Scratch, Evaluator evaluator_, const Index range_)
686  : evaluator(evaluator_), range(range_) {}
687 
689  cl::sycl::nd_item<1> itemID) const {
690  compute(itemID);
691  }
692  template <bool is_vec = Evaluator::PacketAccess>
693  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec>
694  compute(const cl::sycl::nd_item<1>& itemID) const {
695  Index gId = static_cast<Index>(itemID.get_global_linear_id());
696  Index total_threads = itemID.get_global_range(0);
697 
698  for (Index i = gId; i < range; i += total_threads) {
699  evaluator.evalScalar(i);
700  }
701  }
702  template <bool is_vec = Evaluator::PacketAccess>
703  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec>
704  compute(const cl::sycl::nd_item<1>& itemID) const {
705  const Index vectorizedRange =
706  (range / Evaluator::PacketSize) * Evaluator::PacketSize;
707  Index gId = static_cast<Index>(itemID.get_global_linear_id());
708  const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
709  const Index start = Evaluator::PacketSize * gId;
710  for (Index i = start; i < vectorizedRange; i += step) {
711  evaluator.evalPacket(i);
712  }
713  gId += vectorizedRange;
714  for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
715  evaluator.evalScalar(i);
716  }
717  }
718 };
719 
720 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
721 class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
722  public:
723  typedef typename Expression::Index Index;
724  static EIGEN_STRONG_INLINE void run(const Expression& expr,
725  const Eigen::SyclDevice& dev) {
727  Evaluator evaluator(expr, dev);
728  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
729  if (needs_assign) {
730  Index range, GRange, tileSize;
731  Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
732  total_size = (total_size == 0) ? 1 : total_size;
733  const int PacketSize =
734  Eigen::PacketType<typename Evaluator::CoeffReturnType,
735  Eigen::SyclDevice>::size;
736  Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
737  dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
738  range = total_size;
739 
740  dev.template nullary_kernel_launcher<
741  typename Evaluator::CoeffReturnType,
742  ExecExprFunctorKernel<Evaluator> >(
743  evaluator,
744  cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
745  cl::sycl::range<1>(tileSize)),
746  Index(1), range).wait();
747  }
748  evaluator.cleanup();
749  }
750 };
751 
752 #endif
753 
754 } // end namespace internal
755 
756 } // end namespace Eigen
757 
758 #endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
solver compute(A)
int i
IndexedView_or_VectorBlock operator()(const Indices &indices)
#define EIGEN_ALWAYS_INLINE
#define EIGEN_DEVICE_FUNC
#define eigen_assert(x)
static double taskSize(double output_size, const TensorOpCost &cost_per_coeff)
The tensor executor class.
constexpr auto array_prod(const array< T, N > &arr) -> decltype(array_reduce< product_op, T, N >(arr, static_cast< T >(1)))
std::ptrdiff_t array_prod(const Sizes< Indices... > &)
EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
: TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
SparseMat::Index size
A cost model used to limit the number of threads used for evaluating tensor expression.
std::ptrdiff_t j