TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.


#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace TensorSycl {
namespace internal {

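// OpDefiner maps the user-provided reducer to the operation actually used inside the SYCL
// kernels. For most reducers this is an identity mapping; for MeanReducer it swaps in a
// SumReducer and performs the division by the number of reduced values in finalise_op.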
template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};

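// SecondStepFullReducer reduces the per-work-group partial results produced by the first
// full-reduction kernel. It is launched with a single work-group whose size matches the
// number of partial results, so each work-item loads exactly one element before the
// local-memory tree reduction.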
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // Our empirical research shows that the best performance is achieved when there is
    // only one element per thread to reduce in the second step; the second-step reduction
    // time is then almost negligible. Hence, in the second step the input size is fixed to
    // the local size, so exactly one element is read per thread. The algorithm must be
    // changed if the number of reductions per thread in the second step is greater than
    // one; otherwise, the result will be wrong.
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI + localid;
    auto aOutPtr = outAcc;
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};

// Full reduction, first phase. In this version vectorization is enabled and the reduction
// accepts any generic reducer op, e.g. max, min, sum, mean, iamax, iamin, etc.
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
                             PacketReturnType, CoeffReturnType> OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // vectorizable part
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // non-vectorizable remainder
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    // tree-reduction part; the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    CoeffReturnType accumulator = op.initialize();
    // non-vectorized path: reduce one coefficient at a time
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

    // tree-reduction part; the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};

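// GenericNondeterministicReducer assigns one work-item per preserved output coefficient and
// reduces the corresponding values sequentially with GenericDimReducer. It serves the generic
// partial-reduction path (e.g. ArgMax/ArgMin); see GenericReducer below.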
template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // This copy is made to bypass the stateful check in the Eigen MeanReducer.
    Op non_const_functor;
    std::memcpy(&non_const_functor, &functor, sizeof(Op));
    auto output_accessor_ptr = output_accessor;
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};

// PartialReductionKernel: first step of a partial (dimension-preserving) reduction along the
// inner-most or outer-most dimension. Each work-group covers a panel of LocalThreadSizeP x
// LocalThreadSizeR work-items; every work-item accumulates a strided slice of the reduce
// dimension, and the per-item results are then combined with a tree reduction in local memory.
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) const {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    CoeffReturnType* scratchPtr = scratch.get_pointer();
    auto outPtr =
        output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    /* Apply the reduction operation between the current local
     * id and the one on the other half of the vector. */
    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // LocalThreadSizeR is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        // For the mean reducer the result has already been divided in the
        // previous reduction, so there is no need to divide it again here.
        *out_scratch_ptr = op.finalize(accumulator);
      }
      /* All threads collectively read from global memory into local.
       * The barrier ensures all threads' IO is resolved before
       * execution continues (strictly speaking, all threads within
       * a single work-group - there is no coordination between
       * work-groups, only work-items). */
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};

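// SecondStepPartialReduction combines, for every preserved coefficient, the partial results
// written by the rNumGroups work-groups of the first step (one work-item per preserved
// coefficient, accumulating with a stride of num_coeffs_to_preserve).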
template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor + globalId;

    OutScalar accumulator = op.initialize();
    // num_coeffs_to_reduce is not bigger than 256
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor[globalId] = op.finalize(accumulator);
  }
};

template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
  static EIGEN_CONSTEXPR bool BC = BC_;
};

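// PartialReducerLauncher computes the launch configuration for a partial reduction (the number
// of work-groups along the preserve and reduce dimensions) and dispatches PartialReductionKernel.
// When more than one work-group is used along the reduce dimension, it allocates a temporary
// buffer and runs SecondStepPartialReduction to combine the per-group results.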
template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    // getPowerOfTwo makes sure the local range is a power of 2 and <=
    // maxSyclThreadPerBlock; this helps us avoid an extra check in the
    // kernel.
    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");

    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    // In this step we force the code to be at most a two-step reduction: our empirical
    // research shows that if each thread reduces at least 64 elements individually, we get
    // better performance. However, this can change on different platforms. It also shows
    // that for the inner_most dim reducer it is better to have 8 groups in the reduce
    // dimension for sizes > 1024 to achieve the best performance.
    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    EIGEN_CONSTEXPR Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce).wait();
      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
          temp_accessor, output,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
          reducer, num_coeffs_to_preserve, rNumGroups).wait();
      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce).wait();
    }
    return false;
  }
};
} // namespace internal
} // namespace TensorSycl

namespace internal {

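// Full reduction for the SYCL device. Depending on the input size this is either a single
// FullReductionKernelFunctor launch or a two-step reduction in which SecondStepFullReducer
// combines the per-work-group partial results.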
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // In this step we force the code to be at most a two-step reduction: our empirical
    // research shows that if each thread reduces at least 512 elements individually, we
    // get better performance.
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer).wait();
      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
          tmp_global_accessor, data,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
          reducer).wait();
      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer).wait();
    }
  }
};
// vectorizable inner_most dim preserver
// col reduction
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
// row reduction
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// ArgMax uses this kernel for partial reduction.
// TODO(@mehdi.goli) come up with a better kernel
// generic partial reduction
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1)).wait();
    return false;
  }
};

} // namespace internal
} // namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
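/* Usage sketch (illustrative only, not part of this header). A minimal example of how the
 * reducers above are typically reached through the public Tensor API. It assumes the usual
 * Eigen SYCL setup helpers (Eigen::QueueInterface, Eigen::SyclDevice and its
 * allocate / memcpyHostToDevice / memcpyDeviceToHost / deallocate methods); the device
 * selector, names, and sizes below are arbitrary examples.
 *
 *   #define EIGEN_USE_SYCL
 *   #include <unsupported/Eigen/CXX11/Tensor>
 *
 *   Eigen::QueueInterface queue_interface{cl::sycl::default_selector{}};
 *   Eigen::SyclDevice sycl_device(&queue_interface);
 *
 *   Eigen::Tensor<float, 2> in(64, 128);
 *   in.setRandom();
 *   float* gpu_in  = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
 *   float* gpu_out = static_cast<float*>(sycl_device.allocate(64 * sizeof(float)));
 *   Eigen::TensorMap<Eigen::Tensor<float, 2>> in_gpu(gpu_in, 64, 128);
 *   Eigen::TensorMap<Eigen::Tensor<float, 1>> out_gpu(gpu_out, 64);
 *   sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
 *
 *   // Partial reduction over one dimension -> InnerReducer / OuterReducer and
 *   // PartialReductionKernel above (which one depends on layout and the reduced axis).
 *   Eigen::array<int, 1> red_axis{{1}};
 *   out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
 *
 *   // A full reduction, e.g. in_gpu.sum() assigned to a rank-0 TensorMap, instead goes
 *   // through FullReducer / FullReductionKernelFunctor above.
 *
 *   Eigen::Tensor<float, 1> out(64);
 *   sycl_device.memcpyDeviceToHost(out.data(), gpu_out, 64 * sizeof(float));
 *   sycl_device.deallocate(gpu_in);
 *   sycl_device.deallocate(gpu_out);
 */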