#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
template <typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};
template <typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};
template <typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};
template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};
template <typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};
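// ExpressionHasTensorBroadcastingOp recursively inspects an expression tree
// (assignment, coefficient-wise unary and binary nodes) and reports whether a
// TensorBroadcastingOp appears anywhere on the right-hand side. Node types
// without a specialization above fall back to the primary template and report
// `false`. A minimal illustrative check (`SomeBroadcast`/`SomeXpr` are
// placeholder types, not defined in this file):
//
//   static_assert(ExpressionHasTensorBroadcastingOp<
//                     const TensorBroadcastingOp<SomeBroadcast, SomeXpr> >::value,
//                 "broadcast detected");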
// Default strategy: evaluate the expression coefficient-by-coefficient with a
// single cpu thread, without vectorization or block (tiled) evaluation.
template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = DefaultDevice()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
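// A minimal sketch of how this executor is typically driven (this mirrors what
// tensor assignment does internally; `lhs` and `rhs` stand in for a writable
// tensor expression and the expression assigned to it):
//
//   typedef TensorAssignOp<LhsXpr, const RhsXpr> Assign;
//   Assign assign(lhs, rhs);
//   internal::TensorExecutor<const Assign, DefaultDevice,
//                            /*Vectorizable=*/false,
//                            internal::TiledEvaluation::Off>::run(
//       assign, DefaultDevice());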
// Default async execution strategy is not implemented: asynchronous execution
// is currently only available for ThreadPoolDevice (see below).
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};
// Process all the data with a single cpu thread, using vectorized instructions.
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<
          Expression, DefaultDevice>::PacketReturnType>::size;

      // Give the compiler a strong possibility to unroll the loop, without
      // insisting on it: if the evaluated function is expensive the compiler
      // should not unroll at the expense of inlining.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
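// Worked example of the three loops above: with PacketSize == 4 and
// size == 103, UnrolledSize == 96 and VectorizedSize == 100, so indices
// [0, 96) are evaluated in unrolled groups of four packets, [96, 100) with a
// single packet, and the tail [100, 103) one coefficient at a time.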
// Process all the data with a single cpu thread, using blocks of data: sizing
// a block to fit in cache gives better cache performance.
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;
    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);

    if (needs_assign) {
      // Query the expression tree for its preferred block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share the scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};
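// Illustration (the numbers are hypothetical, the real block shape comes from
// the evaluator's resource requirements): for a 1024x1024 tensor split into
// 256x256 blocks, blockCount() is 16 and each blockDescriptor(i) names one
// tile, which evalBlock() materializes using the shared scratch allocator.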
#ifdef EIGEN_USE_THREADS
template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost, size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};
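// The tiling context bundles everything the ThreadPoolDevice executors below
// need to schedule block evaluation: the block mapper to enumerate blocks, the
// per-block cost fed to parallelFor's cost model, and the aligned scratch size.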
// Computes the block evaluation parameters shared by the ThreadPoolDevice
// executors below: a block mapper, the cost of evaluating one block, and the
// aligned block size.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query the expression tree for its preferred block size/shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update the target block size based on the cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}
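// Example of the alignment arithmetic above: with a block of 1000 floats
// (block_size * sizeof(float) == 4000 bytes) and align == 64, the aligned
// block size is 64 * divup(4000, 64) == 64 * 63 == 4032 bytes.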
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable=*/true> {
  static constexpr int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);

    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong possibility to unroll the loop, without
      // insisting on it.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      // Process the remainder of the packet-aligned range.
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    // Scalar tail.
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align block size to packet size and account for unrolling in run above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase the block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
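// alignBlockSize example with PacketSize == 8: a requested block of 1000
// coefficients (>= 16 * 8) is rounded up to a multiple of 32, i.e. 1024, while
// a block of 100 is only rounded up to a multiple of 8, i.e. 104.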
// Multicore strategy: the index space is divided into block ranges and each
// range is evaluated by a single core.
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                         });
    }
    evaluator.cleanup();
  }
};
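// A minimal usage sketch of the thread-pool path (user-facing API; with
// EIGEN_USE_THREADS defined, the executor above is what runs the parallelFor):
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);
//   Eigen::Tensor<float, 2> a(512, 512), b(512, 512), c(512, 512);
//   a.setRandom(); b.setRandom();
//   c.device(device) = a + b;  // evaluated in parallel across the pool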
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
      TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};
// Async execution of the expression on a thread pool: the subexpressions are
// evaluated asynchronously first, then the output is written in parallel, and
// `done` is invoked (and the context deleted) once everything has completed.
template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};
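// Lifetime note: the heap-allocated TensorAsyncExecutorContext owns the
// evaluator and the done callback. It is deleted either immediately (when
// there is nothing to assign) or by the completion callback passed to
// parallelForAsync; its destructor runs evaluator.cleanup() and then invokes
// on_done().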
// Async execution with block (tiled) evaluation on a thread pool.
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};
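// A usage sketch of the async path, assuming the TensorAsyncDevice overload of
// device() (the user-facing wrapper that forwards to runAsync) is available:
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, 4);
//   Eigen::Tensor<float, 2> a(512, 512), b(512, 512), c(512, 512);
//   Eigen::Barrier done(1);
//   c.device(device, [&done]() { done.Notify(); }) = a + b;
//   done.Wait();  // evaluation runs asynchronously on the pool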
#endif  // EIGEN_USE_THREADS

// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(EIGEN_GPUCC)

// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
template <typename Index>
EIGEN_DEVICE_FUNC int sum_will_overflow(Index lhs, Index rhs) {
  const Index highest = NumTraits<Index>::highest();
  const Index lowest = NumTraits<Index>::lowest();
  if (lhs > 0 && rhs > 0) {
    return lhs > highest - rhs ? 1 : 0;
  } else if (lhs < 0 && rhs < 0) {
    return lhs < lowest - rhs ? -1 : 0;
  } else {
    return 0;
  }
}

// Returns lhs + rhs, saturating to the highest/lowest representable value on
// overflow/underflow respectively.
template <typename Index>
EIGEN_DEVICE_FUNC Index saturate_add(Index lhs, Index rhs) {
  const Index highest = NumTraits<Index>::highest();
  const Index lowest = NumTraits<Index>::lowest();
  int overflow = sum_will_overflow(lhs, rhs);
  return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
}

// A functor that adds step_size to a given index, saturating to avoid
// overflow. When overflow is not possible, the plain addition is used.
template <typename Index>
class SafeStep {
 public:
  EIGEN_DEVICE_FUNC SafeStep(Index lastIdx, Index step_size)
      : can_overflow_(sum_will_overflow(lastIdx, step_size)),
        step_size_(step_size) {}

  EIGEN_DEVICE_FUNC Index operator()(Index index) const {
    return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
  }

 private:
  const bool can_overflow_;
  const Index step_size_;
};
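// Example: with Index == int, lhs == INT_MAX - 2 and rhs == 5,
// sum_will_overflow() returns 1, so saturate_add() returns INT_MAX instead of
// wrapping around. SafeStep checks the overflow condition once in its
// constructor, so a loop whose bounds cannot overflow uses a plain add.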
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
    for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    SafeStep<StorageIndex> safe_vectorized_step(vectorized_size,
                                                vectorized_step_size);
    // Use the vector path.
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i = safe_vectorized_step(i)) {
      eval.evalPacket(i);
    }

    // Use the scalar path for the remainder.
    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
         i = safe_step(i)) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, StorageIndex size) {

  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}
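// EigenMetaKernel is a grid-stride kernel: thread t starts at index t (or at
// packet t in the vectorized path) and advances by blockDim.x * gridDim.x, so
// any launch configuration covers the whole [0, size) range, with SafeStep
// guarding the index increments against integer overflow.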
/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {
    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks =
        numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
                                  device.maxGpuThreadsPerMultiProcessor(),
                              NumTraits<StorageIndex>::highest()) /
        block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Launch at least one block so tensors of size 0 do not crash.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

#ifdef EIGEN_USE_SYCL
template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;

  template <typename Scratch>
  EIGEN_STRONG_INLINE ExecExprFunctorKernel(const Scratch, Evaluator evaluator_,
                                            const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    compute(itemID);
  }

  // Scalar path: each work-item strides over the range one coefficient at a time.
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_STRONG_INLINE std::enable_if_t<!is_vec> compute(
      const cl::sycl::nd_item<1>& itemID) const {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }

  // Vectorized path: packets first, then a scalar tail.
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_STRONG_INLINE std::enable_if_t<is_vec> compute(
      const cl::sycl::nd_item<1>& itemID) const {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
             typename Evaluator::CoeffReturnType,
             ExecExprFunctorKernel<Evaluator> >(
             evaluator,
             cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                   cl::sycl::range<1>(tileSize)),
             Index(1), range).wait();
    }
    evaluator.cleanup();
  }
};

#endif  // EIGEN_USE_SYCL

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H