#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP

namespace Eigen {
namespace TensorSycl {
namespace internal {

#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
#endif
template <typename index_t>
struct ScanParameters {
  // ScanPerThread is referenced by every kernel below; it must be a power of two.
  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
  const index_t total_size;
  const index_t non_scan_size;
  const index_t scan_size;
  const index_t non_scan_stride;
  const index_t scan_stride;
  const index_t panel_threads;
  const index_t group_threads;
  const index_t block_threads;
  const index_t elements_per_group;
  const index_t elements_per_block;
  const index_t loop_range;

  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
      : total_size(total_size_),
        non_scan_size(non_scan_size_),
        scan_size(scan_size_),
        non_scan_stride(non_scan_stride_),
        scan_stride(scan_stride_),
        panel_threads(panel_threads_),
        group_threads(group_threads_),
        block_threads(block_threads_),
        elements_per_group(elements_per_group_),
        elements_per_block(elements_per_block_),
        loop_range(loop_range_) {}
};
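
// Illustrative example (hand-worked, not part of the original header): a cumulative
// sum over dimension 1 of a ColMajor 4x100x3 tensor is launched with
//   scan_size = 100, non_scan_size = 4, panel_size = 3, total_size = 1200,
//   scan_stride = 4, non_scan_stride = 1.
// ScanInfo (further below) derives the thread/block decomposition from these values.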

enum class scan_step { first, second };

template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
          scan_step stp>
struct ScanKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
  LocalAccessor scratch;
  Evaluator dev_eval;
  OutAccessor out_ptr;
  OutAccessor tmp_ptr;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  const bool inclusive;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
                                                          const bool inclusive_)
      : scratch(scratch_),
        dev_eval(dev_eval_),
        out_ptr(out_accessor_),
        tmp_ptr(temp_accessor_),
        scanParameters(scanParameters_),
        accumulator(accumulator_),
        inclusive(inclusive_) {}
  // On the first scan step the input is a tensor evaluator; read through coeff().
  template <scan_step sst = stp, typename Input>
  std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  read(const Input &inpt, Index global_id) const {
    return inpt.coeff(global_id);
  }

  // On the second (recursive) step the input is a raw device pointer; read by index.
  template <scan_step sst = stp, typename Input>
  std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  read(const Input &inpt, Index global_id) const {
    return inpt[global_id];
  }

  // The inclusive-scan fix-up only runs on the first step; on the second step it is a no-op.
  template <scan_step sst = stp, typename InclusiveOp>
  std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  first_step_inclusive_Operation(InclusiveOp inclusive_op) const {
    inclusive_op();
  }

  template <scan_step sst = stp, typename InclusiveOp>
  std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  first_step_inclusive_Operation(InclusiveOp) const {}
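
  // Overview of the kernel below: each thread (1) loads up to ScanPerThread elements,
  // (2) scans them in registers one PacketSize-wide chunk at a time with an
  // up-sweep/down-sweep (Blelloch) pass, (3) the per-chunk totals are scanned
  // cooperatively in local memory, (4) the local-memory results are folded back into
  // the private results, and (5) the results are written out, shifted by one element
  // for inclusive scans.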
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;
      // One element per packet goes into local (scratch) memory.
      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
      CoeffReturnType inclusive_scan;
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      Index next_elements = 0;
      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
                           (global_id < scanParameters.total_size))
                              ? read(dev_eval, global_id)
                              : accumulator.initialize();
        next_elements += scanParameters.scan_stride;
      }
      // For an inclusive scan on the first step, remember the last element loaded by this thread.
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
        }
      });
      // Per-packet Blelloch scan held entirely in registers.
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        Index private_offset = 1;
        // Up-sweep: build the sums in place up the tree.
        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[bi] = accumulator.finalize(accum);
          }
          private_offset *= 2;
        }
        scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
            private_scan[PacketSize - 1 + packetIndex];
        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
        // Down-sweep: traverse back down the tree to build the scan.
        for (Index d = 1; d < PacketSize; d *= 2) {
          private_offset >>= 1;
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[ai] = private_scan[bi];
            private_scan[bi] = accumulator.finalize(accum);
          }
        }
      }
      Index offset = 1;
      // Up-sweep over the per-packet totals held in local (scratch) memory.
      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch[ai], &accum);
          accumulator.reduce(scratch[bi], &accum);
          scratch[bi] = accumulator.finalize(accum);
        }
        offset *= 2;
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (local_id == 0) {
        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
                                    scanParameters.non_scan_size +
                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
                                block_id;
          tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
        }
        // Clear the last element before the down-sweep.
        scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
      }
      // Down-sweep over local memory.
      for (Index d = 1; d < scratch_stride; d *= 2) {
        offset >>= 1;
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch[ai], &accum);
          accumulator.reduce(scratch[bi], &accum);
          scratch[ai] = scratch[bi];
          scratch[bi] = accumulator.finalize(accum);
        }
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // Fold the block-level offsets held in scratch back into the private results.
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        for (Index i = 0; i < PacketSize; i++) {
          CoeffReturnType accum = private_scan[packetIndex + i];
          accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
          private_scan[packetIndex + i] = accumulator.finalize(accum);
        }
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
          private_scan[0] = accumulator.finalize(inclusive_scan);
        }
      });
      next_elements = 0;
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
          // An inclusive scan writes the element shifted by one within this thread's chunk.
          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
          out_ptr[global_id] = private_scan[private_id];
        }
        next_elements += scanParameters.scan_stride;
      }
    }
  }
};
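
// ScanAdjustmentKernelFunctor runs after the per-block totals have themselves been
// scanned: it adds the scanned block offset back onto every element of the
// corresponding block in the output.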
template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
struct ScanAdjustmentKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
  InAccessor in_ptr;
  OutAccessor out_ptr;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
                                                                    OutAccessor out_accessor_,
                                                                    const ScanParameters<Index> scanParameters_,
                                                                    Op accumulator_)
      : in_ptr(in_accessor_),
        out_ptr(out_accessor_),
        scanParameters(scanParameters_),
        accumulator(accumulator_) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
      CoeffReturnType adjust_val = in_ptr[in_id];
      Index next_elements = 0;
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
          CoeffReturnType accum = adjust_val;
          accumulator.reduce(out_ptr[global_id], &accum);
          out_ptr[global_id] = accumulator.finalize(accum);
        }
        next_elements += scanParameters.scan_stride;
      }
    }
  }
};

template <typename Index>
struct ScanInfo {
  const Index &total_size;
  const Index &scan_size;
  const Index &panel_size;
  const Index &non_scan_size;
  const Index &scan_stride;
  const Index &non_scan_stride;

  Index max_elements_per_block;
  Index block_size;
  Index panel_threads;
  Index group_threads;
  Index block_threads;
  Index elements_per_group;
  Index elements_per_block;
  Index loop_range;
  Index global_range;
  Index local_range;
  const Eigen::SyclDevice &dev;
  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
                               const Eigen::SyclDevice &dev_)
      : total_size(total_size_),
        scan_size(scan_size_),
        panel_size(panel_size_),
        non_scan_size(non_scan_size_),
        scan_stride(scan_stride_),
        non_scan_stride(non_scan_stride_),
        dev(dev_) {
    // The work-group size must be a power of two.
    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));

    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;

    elements_per_group =
        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
    const Index elements_per_panel = elements_per_group * non_scan_size;
    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
    block_size = elements_per_group / elements_per_block;
#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
#else
    const Index max_threads = panel_threads * panel_size;
#endif
    global_range = roundUp(max_threads, local_range);
    loop_range = Index(
        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
  }
  inline ScanParameters<Index> get_scan_parameter() {
    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
  }
  inline cl::sycl::nd_range<1> get_thread_range() {
    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
  }
};
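
// Continuing the illustrative example from above (scan_size = 100, non_scan_size = 4,
// panel_size = 3, ScanPerThread = 8): elements_per_group rounds 100 up to the next
// power of two, 128, so group_threads = 16 and panel_threads = 64; with a work-group
// of at least 16 threads elements_per_block = 128 as well, giving block_size = 1 and
// therefore no recursive second scan step. (Hand-derived numbers, for orientation only.)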

template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
struct SYCLAdjustBlockOffset {
  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
                                                           Reducer &accumulator, const Index total_size,
                                                           const Index scan_size, const Index panel_size,
                                                           const Index non_scan_size, const Index scan_stride,
                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);

    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
        AdjustFuctor;
    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
                                                                      scan_info.max_elements_per_block,
                                                                      scan_info.get_scan_parameter(), accumulator)
        .wait();
  }
};

template <typename CoeffReturnType, scan_step stp>
struct ScanLauncher_impl {
  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
                                             const Index total_size, const Index scan_size, const Index panel_size,
                                             const Index non_scan_size, const Index scan_stride,
                                             const Index non_scan_stride, const bool inclusive,
                                             const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
    CoeffReturnType *temp_pointer =
        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);

    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
           in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
           scan_info.get_scan_parameter(), accumulator, inclusive)
        .wait();
    if (scan_info.block_size > 1) {
      // The scan did not fit in a single block: scan the per-block totals recursively,
      // then add them back onto each block's results.
      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
          non_scan_size, Index(1), scan_info.block_size, false, dev);

      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
          non_scan_stride, dev);
    }
    dev.deallocate_temp(temp_pointer);
  }
};

}  // namespace internal
}  // namespace TensorSycl
namespace internal {
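
// The partial specialization below plugs the SYCL scan into Eigen's generic
// ScanLauncher. A sketch of how it is reached from user code (the device pointers
// gpu_in_ptr/gpu_out_ptr and the device selection are assumed, not taken from this header):
//
//   // `device` is any supported cl::sycl::device, e.g. from Eigen::get_sycl_supported_devices().
//   Eigen::QueueInterface queue_interface(device);
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in(gpu_in_ptr, 4, 100, 3);
//   Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(gpu_out_ptr, 4, 100, 3);
//   gpu_out.device(sycl_device) = gpu_in.cumsum(1);  // dispatches into ScanLauncher below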
template <typename Self, typename Reducer, bool vectorize>
struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
  typedef typename Self::Index Index;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  void operator()(Self &self, EvaluatorPointerType data) const {
    const Index total_size = internal::array_prod(self.dimensions());
    const Index scan_size = self.size();
    const Index scan_stride = self.stride();
    // The scan accumulator (e.g. a sum or product reducer).
    auto accumulator = self.accumulator();
    auto inclusive = !self.exclusive();
    auto consume_dim = self.consume_dim();
    auto dev = self.device();

    auto dims = self.inner().dimensions();

    Index non_scan_size = 1;
    Index panel_size = 1;
    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < consume_dim; i++) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
        panel_size *= dims[i];
      }
    } else {
      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim - 1; i >= 0; i--) {
        panel_size *= dims[i];
      }
    }
    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
    auto eval_impl = self.inner();
    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
        inclusive, dev);
  }
};
}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP