TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.


#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace TensorSycl {
namespace internal {

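// OpDefiner maps the user-provided reducer to the operation actually used inside the SYCL
// kernels. For most reducers this is an identity mapping; for MeanReducer it swaps in a
// SumReducer and performs the division by the number of reduced values in finalise_op.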
template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};

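// SecondStepFullReducer reduces the per-work-group partial results produced by the first
// full-reduction kernel. It is launched with a single work-group whose size matches the
// number of partial results, so each work-item loads exactly one element before the
// local-memory tree reduction.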
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // Our empirical research shows that the best performance is achieved when there is
    // only one element per thread to reduce in the second step; the second-step reduction
    // time is then almost negligible. Hence, in the second step the input size is fixed to
    // the local size, so exactly one element is read per thread. The algorithm must be
    // changed if the number of reductions per thread in the second step is greater than
    // one; otherwise, the result will be wrong.
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI + localid;
    auto aOutPtr = outAcc;
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};

// Full reduction, first phase. In this version vectorization is enabled and the reduction
// accepts any generic reducer op, e.g. max, min, sum, mean, iamax, iamin, etc.
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
                             PacketReturnType, CoeffReturnType> OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // vectorizable part
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // non-vectorizable remainder
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    // tree-reduction part; the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    CoeffReturnType accumulator = op.initialize();
    // non-vectorized path: reduce one coefficient at a time
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

    // tree-reduction part; the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};

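// GenericNondeterministicReducer assigns one work-item per preserved output coefficient and
// reduces the corresponding values sequentially with GenericDimReducer. It serves the generic
// partial-reduction path (e.g. ArgMax/ArgMin); see GenericReducer below.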
template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // This copy is made to bypass the stateful check in the Eigen MeanReducer.
    Op non_const_functor;
    std::memcpy(&non_const_functor, &functor, sizeof(Op));
    auto output_accessor_ptr = output_accessor;
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};

// PartialReductionKernel: first step of a partial (dimension-preserving) reduction along the
// inner-most or outer-most dimension. Each work-group covers a panel of LocalThreadSizeP x
// LocalThreadSizeR work-items; every work-item accumulates a strided slice of the reduce
// dimension, and the per-item results are then combined with a tree reduction in local memory.
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) const {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    CoeffReturnType* scratchPtr = scratch.get_pointer();
    auto outPtr =
        output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    /* Apply the reduction operation between the current local
     * id and the one on the other half of the vector. */
    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // LocalThreadSizeR is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        // For the mean reducer the result has already been divided in the
        // previous reduction, so there is no need to divide it again here.
        *out_scratch_ptr = op.finalize(accumulator);
      }
      /* All threads collectively read from global memory into local.
       * The barrier ensures all threads' IO is resolved before
       * execution continues (strictly speaking, all threads within
       * a single work-group - there is no coordination between
       * work-groups, only work-items). */
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};

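// SecondStepPartialReduction combines, for every preserved coefficient, the partial results
// written by the rNumGroups work-groups of the first step (one work-item per preserved
// coefficient, accumulating with a stride of num_coeffs_to_preserve).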
template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor + globalId;

    OutScalar accumulator = op.initialize();
    // num_coeffs_to_reduce is not bigger than 256
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor[globalId] = op.finalize(accumulator);
  }
};

template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
  static EIGEN_CONSTEXPR bool BC = BC_;
};

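// PartialReducerLauncher computes the launch configuration for a partial reduction (the number
// of work-groups along the preserve and reduce dimensions) and dispatches PartialReductionKernel.
// When more than one work-group is used along the reduce dimension, it allocates a temporary
// buffer and runs SecondStepPartialReduction to combine the per-group results.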
template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    // getPowerOfTwo makes sure the local range is a power of 2 and <=
    // maxSyclThreadPerBlock; this helps us avoid an extra check in the
    // kernel.
    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");

    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    // In this step we force the code to be at most a two-step reduction: our empirical
    // research shows that if each thread reduces at least 64 elements individually, we get
    // better performance. However, this can change on different platforms. It also shows
    // that for the inner_most dim reducer it is better to have 8 groups in the reduce
    // dimension for sizes > 1024 to achieve the best performance.
    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    EIGEN_CONSTEXPR Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce).wait();
      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
          temp_accessor, output,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
          reducer, num_coeffs_to_preserve, rNumGroups).wait();
      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce).wait();
    }
    return false;
  }
};
} // namespace internal
} // namespace TensorSycl

namespace internal {

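// Full reduction for the SYCL device. Depending on the input size this is either a single
// FullReductionKernelFunctor launch or a two-step reduction in which SecondStepFullReducer
// combines the per-work-group partial results.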
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // In this step we force the code to be at most a two-step reduction: our empirical
    // research shows that if each thread reduces at least 512 elements individually, we
    // get better performance.
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer).wait();
      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
          tmp_global_accessor, data,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
          reducer).wait();
      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer).wait();
    }
  }
};
// vectorizable inner_most dim preserver
// col reduction
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
// row reduction
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// ArgMax uses this kernel for partial reduction.
// TODO(@mehdi.goli) come up with a better kernel
// generic partial reduction
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1)).wait();
    return false;
  }
};

} // namespace internal
} // namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
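/* Usage sketch (illustrative only, not part of this header). A minimal example of how the
 * reducers above are typically reached through the public Tensor API. It assumes the usual
 * Eigen SYCL setup helpers (Eigen::QueueInterface, Eigen::SyclDevice and its
 * allocate / memcpyHostToDevice / memcpyDeviceToHost / deallocate methods); the device
 * selector, names, and sizes below are arbitrary examples.
 *
 *   #define EIGEN_USE_SYCL
 *   #include <unsupported/Eigen/CXX11/Tensor>
 *
 *   Eigen::QueueInterface queue_interface{cl::sycl::default_selector{}};
 *   Eigen::SyclDevice sycl_device(&queue_interface);
 *
 *   Eigen::Tensor<float, 2> in(64, 128);
 *   in.setRandom();
 *   float* gpu_in  = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
 *   float* gpu_out = static_cast<float*>(sycl_device.allocate(64 * sizeof(float)));
 *   Eigen::TensorMap<Eigen::Tensor<float, 2>> in_gpu(gpu_in, 64, 128);
 *   Eigen::TensorMap<Eigen::Tensor<float, 1>> out_gpu(gpu_out, 64);
 *   sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
 *
 *   // Partial reduction over one dimension -> InnerReducer / OuterReducer and
 *   // PartialReductionKernel above (which one depends on layout and the reduced axis).
 *   Eigen::array<int, 1> red_axis{{1}};
 *   out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
 *
 *   // A full reduction, e.g. in_gpu.sum() assigned to a rank-0 TensorMap, instead goes
 *   // through FullReducer / FullReductionKernelFunctor above.
 *
 *   Eigen::Tensor<float, 1> out(64);
 *   sycl_device.memcpyDeviceToHost(out.data(), gpu_out, 64 * sizeof(float));
 *   sycl_device.deallocate(gpu_in);
 *   sycl_device.deallocate(gpu_out);
 */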