#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {
// Primary OpDefiner: for reducers other than MeanReducer the user functor is
// used as-is and finalise_op simply forwards the accumulator.
template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};
// MeanReducer is handled by swapping in a SumReducer and dividing the final
// accumulator by the number of reduced elements in finalise_op.
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};
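// Illustrative note (not part of the original header): OpDefiner lets every
// kernel below run a stateless SumReducer and fold the "mean" semantics into
// the epilogue. Conceptually, for a mean over N coefficients:
//
//   auto op           = OpDef::get_op(mean_reducer);        // SumReducer
//   CoeffReturnType s = /* tree/strided sum of N inputs */;
//   CoeffReturnType m = OpDef::finalise_op(s, N);            // s / N
//
// For any other reducer, finalise_op is the identity and get_op forwards the
// user-provided functor unchanged.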
// Second (and final) step of a full reduction: each work-item loads one
// partial result produced by the first step and the work-group collapses
// them with a tree reduction held entirely in local memory.
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // The input size of this step equals the local size, so the whole
    // reduction fits in local memory and each thread starts with one element.
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI + localid;
    auto aOutPtr = outAcc;
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};
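// Worked example (illustrative, not from the original source): with a local
// range of 8 and per-thread partial sums {3,1,4,1,5,9,2,6}, the loop above
// performs log2(8) = 3 barrier-separated rounds:
//   offset 4 -> {8, 10, 6, 7, ...}
//   offset 2 -> {14, 17, ...}
//   offset 1 -> {31, ...}
// and thread 0 writes the single finalized value 31 to the output.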
// First step of a full reduction. Each work-item accumulates a strided slice
// of the input; the work-group then tree-reduces those partial results in
// local memory and writes one value per group. Works for any generic reducer
// (max, min, sum, mean, etc.), with or without vectorization.
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
                             PacketReturnType, CoeffReturnType>
      OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }
  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  std::enable_if_t<Vect> compute_reduction(const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // vectorizable part of the input
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // scalar tail, widened to a packet so the same accumulator can be reused
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);

    // tree reduction in local memory; the local range is always a power of 2
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }
  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  std::enable_if_t<!Vect> compute_reduction(const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    // scalar grid-stride accumulation
    CoeffReturnType accumulator = op.initialize();
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

    // tree reduction in local memory; the local range is always a power of 2
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};
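// Note (illustrative, not from the original source): the two compute_reduction
// overloads are gated by std::enable_if_t on the defaulted template parameter
// Vect, so the call in operator() resolves to the packet path only when the
// evaluator supports packet access. For example, with PacketSize == 4 and
// rng == 10 the grid processes coefficients 0..7 in packet form and 8..9
// through the scalar tail loop above.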
// Generic (non-deterministic) reducer used when no specialized kernel applies:
// each work-item reduces one preserved output coefficient on its own.
template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // This copy bypasses the stateful condition in the Eigen MeanReducer.
    Op non_const_functor;
    std::memcpy(&non_const_functor, &functor, sizeof(Op));
    auto output_accessor_ptr = output_accessor;
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};
enum class reduction_dim { inner_most, outer_most };

// Partial (dimension-preserving) reduction kernel. Work-items are arranged in
// a P x R panel: P threads cover preserved coefficients while R threads
// cooperate along the reduced dimension and are combined through local memory.
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) const {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }
  void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    CoeffReturnType *scratchPtr = scratch.get_pointer();
    auto outPtr =
        output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    // Combine this thread's slot with the one on the other half of the panel.
    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // LocalThreadSizeR is always a power of 2.
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        // The result was already divided for the mean reducer above, so no
        // further division is needed here.
        *out_scratch_ptr = op.finalize(accumulator);
      }
      // The barrier only orders work-items within this work-group.
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};
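// Illustrative note (not in the original header): assuming LocalThreadSizeP =
// LocalThreadSizeR = 16 and BC = true, each work-group keeps a 16 x (16 + 1)
// scratch tile; the extra column of padding (BC) offsets the rows so that the
// column-strided accesses of the tree reduction (stride LocalThreadSizeP + BC
// = 17) are less prone to local-memory bank conflicts. A thread with
// pLocalThreadId = 3 and rLocalThreadId = 2 therefore writes its partial
// result at scratch index 3 + 2 * 17 = 37.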
// Second step of a partial reduction: combines the per-group partial rows
// produced by PartialReductionKernel when more than one reduce group was used.
template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor + globalId;

    OutScalar accumulator = op.initialize();
    // Walk down the column of partial results for this preserved coefficient.
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor[globalId] = op.finalize(accumulator);
  }
};
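// Illustrative note (not in the original header): the first step stores its
// partial results as an rNumGroups x num_coeffs_to_preserve row-major block,
// so the loop above reads one column with stride num_coeffs_to_preserve and
// writes a single finalized value per preserved coefficient.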
template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
  static EIGEN_CONSTEXPR bool BC = BC_;
};
template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");

    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    // Keep the launch to at most a two-step reduction: each thread should
    // reduce at least reductionPerThread elements before a second step pays off.
    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    EIGEN_CONSTEXPR Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce).wait();

      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;

      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
          temp_accessor, output,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
          reducer, num_coeffs_to_preserve, rNumGroups).wait();

      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce).wait();
    }
    return false;
  }
};
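// Illustrative note (not in the original header): assuming LocalThreadSizeP =
// LocalThreadSizeR = 16 (localRange = 256), num_coeffs_to_preserve = 100 and
// num_coeffs_to_reduce = 1 << 20, then roundUpP = 112 and pNumGroups = 7.
// Since 1 << 20 exceeds reductionPerThread * localRange = 16384, rNumGroups =
// std::min((cu + 6) / 7, 256); whenever that exceeds 1 the reduce dimension is
// split across groups and the SecondStepPartialReduction pass above folds the
// per-group rows into the final output.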
}  // namespace internal
}  // namespace TensorSycl

namespace internal {

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // Force at most a two-step reduction: each thread should reduce at least
    // reductionPerThread elements before a second kernel launch pays off.
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer).wait();

      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
          tmp_global_accessor, data,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
          reducer).wait();

      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer).wait();
    }
  }
};
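// Usage sketch (illustrative, not part of this header); the device setup and
// buffer names below are assumptions following the Eigen SYCL test style:
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector_v);
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   float *gpu_in  = static_cast<float *>(sycl_device.allocate(256 * 512 * sizeof(float)));
//   float *gpu_out = static_cast<float *>(sycl_device.allocate(sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(gpu_in, 256, 512);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out(gpu_out);
//   out.device(sycl_device) = in.sum();  // dispatches to FullReducer above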
// Outer-dimension (column-like) partial reduction.
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
// Inner-dimension (row-like) partial reduction.
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
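// Illustrative note (not in the original header): for a column-major rank-2
// tensor, reducing over the contiguous (inner-most) dimension is routed to
// InnerReducer, while reducing over the outer dimension is routed to
// OuterReducer. Both reuse PartialReducerLauncher and differ only in the
// reduction_dim template argument, which selects the indexing scheme inside
// PartialReductionKernel::element_wise_reduce.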
// Generic partial reduction; also used by e.g. the argmax/argmin evaluators.
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1)).wait();
    return false;
  }
};
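// Usage sketch (illustrative, not part of this header): an argmax along one
// dimension of a device tensor routes through GenericReducer above, e.g.
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(gpu_in, 64, 32);
//   Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 1>> out(gpu_idx, 32);
//   out.device(sycl_device) = in.argmax(0);
//
// where gpu_in, gpu_idx and sycl_device are assumed to be set up as in the
// FullReducer sketch earlier in this file.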
}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP