TensorContractionSycl.h
1 // This file is part of Eigen, a lightweight C++ template library for linear algebra.
2 //
3 // Mehdi Goli Codeplay Software Ltd.
4 // Ralph Potter Codeplay Software Ltd.
5 // Luke Iwanski Codeplay Software Ltd.
6 // Contact: <eigen@codeplay.com>
7 //
8 // This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
9 // distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 
19 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
20 #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
21 
22 #include "./InternalHeaderCheck.h"
23 
24 namespace Eigen {
25 
26 namespace TensorSycl {
27 namespace internal {
28 
29 #ifndef EIGEN_SYCL_DISABLE_GEMV
44 template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
45 struct TVPanelSize {
46  // LocalThreadSizeC: determines the total number of threads per workgroup for the contracting dimension
47  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
48  // LocalThreadSizeNC: determines the total number of threads per workgroup for the non-contracting dimension
49  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
50  // TileSizeDimNC: determines the tile size for the non-contracting dimension
51  static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
52  // TileSizeDimC: determines the tile size for the contracting dimension
53  static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
54  // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
55  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
56  // WorkLoadPerThreadC: determines the workload per thread for loading the contracting dimension
57  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
58  // BC : determines whether padding for bank-conflict avoidance is required
59  static EIGEN_CONSTEXPR bool BC = false;
60 };
61 #endif
62 
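// Worked example (an informal sketch, not part of the original source): using float/int as stand-in scalar and index
// types and assuming the default workgroup shape EIGEN_SYCL_LOCAL_THREAD_DIM0 == EIGEN_SYCL_LOCAL_THREAD_DIM1 == 16,
// the panel used by LaunchVT later in this file, TVPanelSize<float, int, /*NCWindow=*/16, /*CFactor=*/1, /*NCFactor=*/1>,
// resolves to the following compile-time constants:
//
//   typedef TVPanelSize<float, int, 16, 1, 1> ExampleTVPanel;   // hypothetical alias for illustration only
//   static_assert(ExampleTVPanel::TileSizeDimNC == 16, "NCWindow / NCFactor = 16 / 1");
//   static_assert(ExampleTVPanel::TileSizeDimC == 256, "CFactor * LocalThreadSizeNC * LocalThreadSizeC = 1 * 16 * 16");
//   static_assert(ExampleTVPanel::WorkLoadPerThreadNC == 1, "TileSizeDimNC / LocalThreadSizeNC = 16 / 16");
//   static_assert(ExampleTVPanel::WorkLoadPerThreadC == 16, "TileSizeDimC / LocalThreadSizeC = 256 / 16");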
80 template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
81 struct TTPanelSize {
82  // TileSizeDimK: determines the tile size for the K dimension. It is assumed to already take the packet size into account.
83  static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
84  // WorkLoadPerThreadM: determines the workload per thread for loading the M dimension. This can be varied based on
85  // the registers available on the chosen device (it can be controlled by the EIGEN_SYCL_REG_M macro).
86 #ifndef EIGEN_SYCL_REG_M
87  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
88 #else
89  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
90 #endif
91 // WorkLoadPerThreadN: determines the workload per thread for loading the N dimension. This can be varied based on
92 // the registers available on the chosen device (it can be controlled by the EIGEN_SYCL_REG_N macro).
93 #ifndef EIGEN_SYCL_REG_N
94  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
95 #else
96  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
97 #endif
98  // LocalThreadSizeM: determines the total number of threads per workgroup for the M dimension
99  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
100  // LocalThreadSizeN: determines the total number of threads per workgroup for the N dimension
101  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
102  // TileSizeDimM: determines the tile size for the m dimension
103  static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
104  // TileSizeDimN: determines the tile size for the n dimension
105  static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
106  // LoadPerThreadLhs: determines the workload per thread for loading the Lhs tensor. This must be divisible by the packet size.
107  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
108  ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN));
109  // LoadPerThreadRhs: determines the workload per thread for loading the Rhs tensor. This must be divisible by the packet size.
110  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
111  ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM));
112  // BC : determines whether padding for bank-conflict avoidance is required
113  static EIGEN_CONSTEXPR bool BC = true;
114  // DoubleBuffer: determines whether the double-buffering technique should be used (this can be disabled via the
115  // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory).
116  static EIGEN_CONSTEXPR bool DoubleBuffer =
117 #ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
118  false;
119 #else
120  true;
121 #endif
122 };
123 
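// Worked example (an informal sketch, not part of the original source): with REG_SIZE_M == REG_SIZE_N == 4 and
// TSDK == 16 (the values adjustTT passes when local memory is available), no EIGEN_SYCL_REG_M / EIGEN_SYCL_REG_N
// overrides, and the default 16x16 workgroup, TTPanelSize resolves to:
//
//   typedef TTPanelSize<float, int, 4, 4, 16> ExampleTTPanel;   // hypothetical alias for illustration only
//   static_assert(ExampleTTPanel::TileSizeDimM == 64, "LocalThreadSizeM * WorkLoadPerThreadM = 16 * 4");
//   static_assert(ExampleTTPanel::TileSizeDimN == 64, "LocalThreadSizeN * WorkLoadPerThreadN = 16 * 4");
//   static_assert(ExampleTTPanel::LoadPerThreadLhs == 4, "(TileSizeDimK * 4 * 4) / TileSizeDimN = (16 * 16) / 64");
//   static_assert(ExampleTTPanel::LoadPerThreadRhs == 4, "(TileSizeDimK * 4 * 4) / TileSizeDimM = (16 * 16) / 64");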
124 /* !
125  * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
126  * specialize the contraction algorithm based on device support for dedicated local memory.
127  */
128 enum class contraction_type { local, no_local };
129 /* !
130  * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private).
131  */
132 enum class data_source { global_mem, local_mem, private_mem };
133 
159 template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
160  typename StorageIndex>
161 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<PacketLoad, PacketType> read(
162  const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
163  const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
164  const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
165  return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
166 }
167 
190 template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
191 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!PacketLoad, PacketType> read(
192  const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
193  const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
194  const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
195  return tensorMapper(row, col);
196 }
197 
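// Usage note (informal sketch): both read() overloads take the index pair as (non-contracting, contracting). For an
// LHS mapper (IsRhs == false) the element at non-contracting index m and contracting index k is fetched as
// tensorMapper(m, k); for an RHS mapper (IsRhs == true) the element at (n, k) is fetched as tensorMapper(k, n).
// The packet overload instead loads PacketSize consecutive scalars from the underlying tensor starting at the
// linear offset row + col * ld, which relies on the coalesced layout making those elements contiguous, e.g.:
//
//   // coalesced LHS block:  read<true, true, false, Packet>(lhs, m, k, ld)  ~ packet load at offset m + k * ld
//   // scalar RHS fallback:  read<false, _, true, Scalar>(rhs, n, k, _)      == rhs(k, n)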
219 template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
220 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
221  std::enable_if_t<dt != data_source::global_mem, void>
222  write(PacketType &packet_data, DataScalar ptr) {
223  EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
225  for (int i = 0; i < PacketSize; i++) {
226  *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data);
227  ptr += ld;
228  }
229 }
230 
246 template <data_source dt, typename PacketType, typename DataScalar>
247 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename std::enable_if_t<
248  Eigen::internal::unpacket_traits<PacketType>::size != 1 && dt == data_source::global_mem, void>
249 write(PacketType &packet_data, DataScalar *ptr) {
250  ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
251 }
252 
266 template <data_source dt, typename PacketType, typename DataScalar>
267 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename std::enable_if_t<
268  Eigen::internal::unpacket_traits<PacketType>::size == 1 && dt == data_source::global_mem, void>
269 write(PacketType &packet_data, DataScalar *ptr) {
270  *ptr = packet_data;
271 }
272 
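// Usage note (informal sketch): for local_mem / private_mem destinations, the first write() overload scatters the
// PacketSize lanes of packet_data with a compile-time stride of ld scalars, i.e. ptr[0], ptr[ld], ptr[2 * ld], ...
// With ld == 1 the packet lands contiguously; with ld equal to the tile leading dimension the packet is effectively
// transposed while being written into the tile. For global_mem destinations the two overloads directly above reduce
// to a plain unaligned packet store (pstoreu) or a single scalar assignment.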
278 template <bool is_internal>
279 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) {
280  return true;
281 }
282 
288 template <>
289 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) {
290  return cond;
291 }
292 
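// Usage note (informal sketch): check_boundary lets the same kernel body serve both interior and boundary tiles.
// For an interior tile the template argument is true and the call folds to a compile-time constant, so the bounds
// test and the scalar fallback path can be optimised away; only boundary tiles pay for the runtime condition:
//
//   check_boundary<true>(m < M);   // always true: interior tile, no runtime check
//   check_boundary<false>(m < M);  // evaluates (m < M): boundary tile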
319 template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
320 struct BlockProperties {
321  static EIGEN_CONSTEXPR bool packet_load = packet_load_;
322  typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar;
323  static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
324  typedef std::conditional_t<packet_load, PacketType, OutScalar> OutType;
325  static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
326  static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
327  static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
328  static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
329 };
330 
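// Illustrative summary (informal): is_coalesced_layout == !(is_transposed ^ is_rhs), and the packet worth of
// elements_per_access elements is spread along whichever dimension is contiguous:
//
//   is_transposed  is_rhs  is_coalesced_layout  nc_stride             c_stride
//   false          false   true                 elements_per_access   1
//   true           false   false                1                     elements_per_access
//   false          true    false                1                     elements_per_access
//   true           true    true                 elements_per_access   1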
370 template <typename StorageIndex>
371 struct ThreadProperties {
372  const StorageIndex linearLocalThreadId;
373  const StorageIndex kGroupId;
374  const StorageIndex mGroupOffset;
375  const StorageIndex nGroupOffset;
376  const StorageIndex kGroupOffset;
377  const StorageIndex mLocalOffset;
378  const StorageIndex nLocalOffset;
379  const StorageIndex mGlobalOffset;
380  const StorageIndex nGlobalOffset;
381  StorageIndex kSize;
382  const bool is_internal;
383  // this is used to adjust the last block
384  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
385  const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
386  const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
387  const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
388  StorageIndex kSize_, const bool is_internal_)
389  : linearLocalThreadId(linearLocalThreadId_),
390  kGroupId(kGroupId_),
391  mGroupOffset(mGroupOffset_),
392  nGroupOffset(nGroupOffset_),
393  kGroupOffset(kGroupOffset_),
394  mLocalOffset(mLocalOffset_),
395  nLocalOffset(nLocalOffset_),
396  mGlobalOffset(mGlobalOffset_),
397  nGlobalOffset(nGlobalOffset_),
398  kSize(kSize_),
399  is_internal(is_internal_) {}
400 };
401 
452 template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
453  typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
454  typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
455 class TensorContractionKernel {
456  public:
457  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
458  PacketReturnType;
459  static EIGEN_CONSTEXPR int PacketSize =
460  Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
461  static EIGEN_CONSTEXPR bool is_lhs_transposed =
462  !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous;
463  static EIGEN_CONSTEXPR bool is_rhs_transposed =
464  !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous;
465 
466  typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
467  PacketReturnType>
468  LHSBlockProperties;
469 
470  typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
471  PacketReturnType>
472  RHSBlockProperties;
473 
474  static EIGEN_CONSTEXPR StorageIndex NStride =
475  contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
476 
477  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
478  typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
479  typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
480  typedef std::conditional_t<contraction_tp == contraction_type::local, local_ptr, private_ptr>
481  tile_ptr;
482  static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
483  ? Properties::TileSizeDimM + Properties::BC
484  : Properties::WorkLoadPerThreadM;
485  static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
486  ? Properties::TileSizeDimN + Properties::BC
487  : Properties::WorkLoadPerThreadN;
488  static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
489 
502  template <contraction_type, StorageIndex>
503  struct MemHolder {
504  tile_ptr ptr;
505  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
506  };
510  template <StorageIndex MemSize>
511  struct MemHolder<contraction_type::no_local, MemSize> {
512  OutScalar ptr[MemSize] = {OutScalar{0}};
513  };
536  struct TiledMemory {
537  MemHolder<contraction_tp, Properties::WorkLoadPerThreadM * Properties::TileSizeDimK> lhs_scratch_extract;
538  MemHolder<contraction_tp, Properties::WorkLoadPerThreadN * Properties::TileSizeDimK> rhs_scratch_extract;
539  tile_ptr lhs_scratch_ptr_compute;
540  tile_ptr rhs_scratch_ptr_compute;
541  const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
542  const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
543  template <contraction_type tp = contraction_tp>
544  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
545  TiledMemory(const ThreadProperties<StorageIndex> &, local_ptr,
546  std::enable_if_t<tp == contraction_type::no_local> * = 0)
547  : lhs_scratch_extract{},
548  rhs_scratch_extract{},
549  lhs_scratch_ptr_compute(lhs_scratch_extract.ptr),
550  rhs_scratch_ptr_compute(rhs_scratch_extract.ptr),
551  lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
552  rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
553 
554  template <contraction_type tp = contraction_tp>
555  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
556  TiledMemory(const ThreadProperties<StorageIndex> &thread_properties, local_ptr block_start_ptr,
557  std::enable_if_t<tp == contraction_type::local> * = 0)
558  : lhs_scratch_extract{block_start_ptr},
559  rhs_scratch_extract{lhs_scratch_extract.ptr +
560  ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
561  lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
562  rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
563  lhs_extract_index(
564  local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
565  rhs_extract_index(
566  local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
567  };
568 
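// Layout sketch (informal): in the local-memory specialisation, the single workgroup scratch buffer is carved up as
//
//   [ (DoubleBuffer + 1) * LSDL * TileSizeDimK | (DoubleBuffer + 1) * LSDR * TileSizeDimK ]
//       LHS tile (plus its double buffer)           RHS tile (plus its double buffer)
//
// which matches the scratchSize computed in launchTT later in this file. For the 4x4x16 panel with double buffering
// and bank-conflict padding (LSDL == LSDR == 64 + 1), that is 2 * 65 * 16 + 2 * 16 * 65 = 4160 scalars per workgroup.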
569  Scratch scratch;
570  const LhsMapper lhs;
571  const RhsMapper rhs;
572  OutAccessor out_res;
573  const StorageIndex groupSizeM;
574  const StorageIndex groupSizeN;
575  const StorageIndex numTiles;
576  const TripleDim triple_dim;
577 
578  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
579  const RhsMapper rhs_, OutAccessor out_res_,
580  const StorageIndex groupSizeM_,
581  const StorageIndex groupSizeN_,
582  const StorageIndex numTiles_,
583  const TripleDim triple_dim_)
584  : scratch(scratch_),
585  lhs(lhs_),
586  rhs(rhs_),
587  out_res(out_res_),
588  groupSizeM(groupSizeM_),
589  groupSizeN(groupSizeN_),
590  numTiles(numTiles_),
591  triple_dim(triple_dim_) {}
592 
593  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
594  const RhsMapper rhs_, OutAccessor out_res_,
595  const StorageIndex groupSizeM_,
596  const StorageIndex numTiles_,
597  const TripleDim triple_dim_)
598  : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
599 
600  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
601  const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
602  const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
603  const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
604  const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
605  const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
606  const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
607  const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
608  const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
609  const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
610  const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
611  const StorageIndex nLocalOffset = NStride * nLocalThreadId;
612  const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
613  const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
614 
615  const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
616  StorageIndex kGroupOffset = kGroupId * kSizePerWG;
617  const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
618  triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
619  triple_dim.K - kGroupOffset >= kSizePerWG;
620  // this is used to adjust the last block
621  StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
622  // This is used to find out the last K offset so that kGroupOffset - kSize can compute the correct offset for loading
623  // into the tile
624  kGroupOffset += kSize;
625 
626  auto thread_properties =
627  ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
628  mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
629 
630  auto out_ptr = out_res + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
631 
632  (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
633  : compute_panel<false>(itemID, thread_properties, out_ptr);
634  }
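// Decomposition sketch (informal): the 1D work-group id above is unpacked as
//   itemID.get_group(0) == mGroupId + groupSizeM * (nGroupId + groupSizeN * kGroupId)
// e.g. with groupSizeM == groupSizeN == 16, group 275 maps to mGroupId == 3, nGroupId == 1, kGroupId == 1
// (275 % 16 == 3, 275 / 16 == 17, 17 % 16 == 1, 17 / 16 == 1). When IsFinal is true there is no K split, so
// kGroupId is fixed to 0 and the whole K range is handled in a single pass.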
635  // The compute block computes the contraction operation for the private block of each thread and stores the result in
636  // the privateRes memory of each thread. The compute block function is independent of the local and no-local concepts,
637  // as it only computes the block in each thread's private memory space.
638  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
639  PacketReturnType *privateRes) const {
640  StorageIndex idx = 0;
641  EIGEN_CONSTEXPR StorageIndex lhs_stride =
642  contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
644  for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
645  auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
646  StorageIndex lhs_index = 0;
648  for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
649  PacketReturnType lhsPack{};
650  Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::set_packet(lhsPack,
651  lhs_block_ptr + lhs_index);
652  privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
653 
654  lhs_index += lhs_stride;
655  idx++;
656  }
657  }
658  }
659  // The store function writes the result of the contraction operation from the private memory of each thread to the
660  // global memory. The store function is independent of the local and no-local concepts so that it can be abstracted
661  // out in the base class.
662  template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
663  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
664  StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const {
665  auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
666  return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
667  };
668  // When local memory is not used, M and N are both accessed in a coalesced way. However, when local memory is
669  // available, the K*N block is transposed in local memory to N*K; therefore, each block operates on a
670  // blockId * WorkLoadPerThreadN slice of N.
671  EIGEN_CONSTEXPR StorageIndex GlobalNStride =
672  contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
674  for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
675  // output leading dimension
676  StorageIndex outputLD = 0;
677  // When local memory is used, PrivateNStride is always 1 because the coalesced access on N is loaded into local
678  // memory, and extracting from local to global is the same as in the non-transposed version. However, when local
679  // memory is not used and the RHS is transposed, we packetize the load for the RHS.
681  for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
682  StorageIndex globalRow = mGlobalOffset;
684  for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
685  PacketReturnType privetOut = privateRes[wLPTM];
686  if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
687  // Store the final results in C. The C matrix always has M as its first StorageIndex and N as its second
688  // StorageIndex; therefore it always has a coalesced layout.
689  write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
690  } else {
692  for (StorageIndex mId = 0; mId < PacketSize; mId++) {
693  StorageIndex mOffset = globalRow + mId;
694  if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
695  out_ptr[mOffset + outputLD] =
696  Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::scalarize(mId, privetOut);
697  }
698  }
699  }
700  globalRow += (PacketSize * Properties::LocalThreadSizeM);
701  }
702  outputLD += triple_dim.M;
703  privateRes += Properties::WorkLoadPerThreadM / PacketSize;
704  }
705  out_ptr += (GlobalNStride * outputLD);
706 
707  nGlobalOffset += (PrivateNStride * GlobalNStride);
708  }
709  }
710  // when no local memory is used the following extract_block will be enabled
711  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
712  contraction_type contract_tp = contraction_tp>
713  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
714  std::enable_if_t<contract_tp == contraction_type::no_local>
715  extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
716  const StorageIndex &ncOffset, const StorageIndex cOffset) const {
717  EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
718  InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
719  EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
720  InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
721  const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
722 
723  auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
724  return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
725  (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
726  };
727  const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
728  StorageIndex cIndex = cOffset;
729 
731  for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
732  StorageIndex ncIndex = ncOffset;
734  for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
735  if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
736  auto val =
737  read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
738  InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
739 
740  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
741  data_source::private_mem>(val, private_ptr);
742  } else {
744  for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
745  const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
746  const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
747  OutScalar val =
748  (ncInd < NC && cInd < triple_dim.K)
749  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
750  inpt, ncInd, cInd, ld)
751  : OutScalar(0);
752  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
753  data_source::private_mem>(
754  val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
755  ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
756  }
757  }
758 
759  // If it is the lhs, we have to load it packetised when the packet size is > 1, because the output is coalesced. So
760  // even if M is not accessed in a coalesced mode, we have to load PacketSize elements of M per thread.
761  ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
762  ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
763  : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
764  private_ptr += InputBlockProperties::nc_stride;
765  }
766  // The previous for loop (private_ptr += (ncId * nc_stride)) has already advanced the pointer by one WorkLoadPerThreadNC
767  private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
768  cIndex += InputBlockProperties::c_stride;
769  }
770  }
771  template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
772  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
773  const StorageIndex &linearLocalThreadId) {
774  const StorageIndex localThreadNC =
775  (InputBlockProperties::is_coalesced_layout)
776  ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
777  : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
778  const StorageIndex localThreadC =
779  (InputBlockProperties::is_coalesced_layout)
780  ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
781  : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
782  return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
783  }
784 
785  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
786  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
787  std::enable_if_t<db && ctp == contraction_type::local>
788  sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
789  db_offset = !db_offset;
790  }
791 
792  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
793  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
794  std::enable_if_t<!db && ctp == contraction_type::local>
795  sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
796  itemID.barrier(cl::sycl::access::fence_space::local_space);
797  }
798 
799  template <contraction_type ctp = contraction_tp>
800  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
801  std::enable_if_t<ctp == contraction_type::no_local>
802  sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept {
803  return;
804  }
805 
806  template <bool need_sync, contraction_type ctp = contraction_tp>
807  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
808  std::enable_if_t<need_sync && ctp == contraction_type::no_local>
809  sync_thread(const cl::sycl::nd_item<1> &
810 #ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
811  itemID
812 #endif
813  ) noexcept {
814 #ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
815  itemID.barrier(cl::sycl::access::fence_space::local_space);
816 #else
817  return;
818 #endif
819  }
820  template <bool need_sync, contraction_type ctp = contraction_tp>
821  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
822  std::enable_if_t<need_sync && ctp == contraction_type::local>
823  sync_thread(const cl::sycl::nd_item<1> &itemID) {
824  itemID.barrier(cl::sycl::access::fence_space::local_space);
825  }
826  template <bool need_sync>
827  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!need_sync> sync_thread(
828  const cl::sycl::nd_item<1> &) {
829  return;
830  }
831 
832  template <bool is_internal_block>
833  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
834  ThreadProperties<StorageIndex> &thread_properties,
835  TiledMemory &tiled_input_block,
836  PacketReturnType *privateRes, bool &db_offset) const {
837 
838  // Tiling the Rhs block from global to local memory
839  extract_block<RHSBlockProperties, is_internal_block>(
840  rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
841  tiled_input_block.rhs_extract_index,
842  contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
843  thread_properties.kGroupOffset - thread_properties.kSize);
844 
845  sync_thread<contraction_tp == contraction_type::no_local>(itemID);
846 
847  // Tiling the Lhs block from global to local memory
848  extract_block<LHSBlockProperties, is_internal_block>(
849  lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
850  tiled_input_block.lhs_extract_index,
851  contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
852  thread_properties.kGroupOffset - thread_properties.kSize);
853 
854  // itemID.barrier(cl::sycl::access::fence_space::local_space);
855  sync_thread<contraction_tp == contraction_type::local>(itemID);
856  // switch to compute mode
857  StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
858  StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
859  // Loop over the values of a single tile
860  for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
861  compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
862  tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
863  lhs_offset += LSDL;
864  rhs_offset += LSDR;
865  }
866  // computing the K index for the next tile
867  thread_properties.kSize -= Properties::TileSizeDimK;
868  sync_mem(itemID, db_offset);
869  }
870 
871  // when local memory is available the following compute_panel will be enabled
872  template <bool is_internal_block, typename OutPtr>
873  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
874  ThreadProperties<StorageIndex> &thread_properties,
875  OutPtr out_ptr) const {
876  auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
877  // Allocate register space
878  PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
879  PacketReturnType{0}};
880  bool db_offset = 0;
881 
882  while (thread_properties.kSize >= Properties::TileSizeDimK) {
883  compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
884  }
885  if (thread_properties.kSize > 0) {
886  compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
887  }
888 
889  // Storing the final results in the output
890  store<is_internal_block,
891  contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
892  out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
893  thread_properties.nGlobalOffset);
894  }
895  // When local memory is available the following extract_block will be enabled
896  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
897  contraction_type contract_tp = contraction_tp>
898  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
899  std::enable_if_t<contract_tp == contraction_type::local>
900  extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
901  const StorageIndex &ncOffset, const StorageIndex cOffset) const {
902  EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
903  InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
904  EIGEN_CONSTEXPR StorageIndex LoadPerThread =
905  InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
906  EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
907  static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
908  (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
909  " LocalOffset must be divisible by stride");
910  const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
911  StorageIndex localThreadNC = local_index.first;
912  StorageIndex localThreadC = local_index.second;
913  auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
914  return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
915  (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
916  };
918  for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
919  const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
920  const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
921  const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
922  if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
923  auto val =
924  read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
925  InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
926  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
927  val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
928  (InputBlockProperties::c_stride * localThreadC * LSD));
929  } else {
931  for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
932  const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
933  const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
934  OutScalar val =
935  (nCInd < NC && cInd < triple_dim.K)
936  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
937  inpt, nCInd, cInd, ld)
938  : OutScalar(0);
939 
940  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
941  val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
942  (InputBlockProperties::is_coalesced_layout ? i : 0) +
943  ((InputBlockProperties::c_stride * localThreadC +
944  (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
945  LSD));
946  }
947  }
948  localThreadNC += (InputBlockProperties::is_coalesced_layout)
949  ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
950  : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
951  localThreadC += (InputBlockProperties::is_coalesced_layout)
952  ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
953  : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
954  }
955  }
956 };
957 
958 #ifndef EIGEN_SYCL_DISABLE_GEMV
959 
1001 template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
1002  typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
1003 struct GeneralVectorTensor {
1004  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
1005  PacketReturnType;
1006  static EIGEN_CONSTEXPR int PacketSize =
1007  Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
1008  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
1009 
1010  static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
1011  KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
1012 
1013  // Since the access layout for a vector can always be coalesced, when the LHS is a vector, we pass false and false to
1014  // make sure that the !^ is true. When the RHS is a vector, we pass true and true to make sure that the !^ is true.
1015  typedef BlockProperties<is_lhs_vec ? false : true, is_lhs_vec ? false : true, Vectorizable, PacketReturnType>
1016  VecBlockProperties;
1017 
1018  Scratch scratch;
1019  const VectorMapper vec;
1020  const TensorMapper mat;
1021  OutAccessor out_res;
1022  const StorageIndex nonContractGroupSize;
1023  const StorageIndex nonContractDim;
1024  const StorageIndex contractDim;
1025 
1026  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_,
1027  const TensorMapper mat_, OutAccessor out_res_,
1028  const StorageIndex nonContractGroupSize_,
1029  const StorageIndex nonContractDim_,
1030  const StorageIndex contractDim_)
1031  : scratch(scratch_),
1032  vec(vec_),
1033  mat(mat_),
1034  out_res(out_res_),
1035  nonContractGroupSize(nonContractGroupSize_),
1036  nonContractDim(nonContractDim_),
1037  contractDim(contractDim_) {}
1038 
1039  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
1040  auto scratch_ptr = scratch.get_pointer();
1041  const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
1042  StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
1043  : linearLocalThreadId % Properties::LocalThreadSizeNC;
1044  StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
1045  : linearLocalThreadId / Properties::LocalThreadSizeNC;
1046  const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
1047  const StorageIndex nonContractGroupId =
1048  is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
1049  const StorageIndex contractGroupId =
1050  is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
1051  auto out_ptr = out_res + (IsFinal ? 0 : contractGroupId * nonContractDim);
1052 
1053  const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
1054  const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
1055  auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
1056  const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
1057  const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
1058  auto local_output = scratch_ptr + OutScratchOffset;
1059  const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
1060  contractDim - contractGroupOffset >= Properties::TileSizeDimC;
1061  is_internal
1062  ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
1063 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1064  scratch_ptr, contractGroupOffset,
1065 #endif
1066  nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
1067  nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
1068  : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
1069 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1070  scratch_ptr, contractGroupOffset,
1071 #endif
1072  nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
1073  nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
1074  }
1075  template <bool is_internal_block, typename OutPtr>
1076  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(
1077  const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
1078  OutPtr out_ptr,
1079 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1080  OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
1081 #endif
1082  const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
1083  StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
1084  StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
1085  OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
1086  // Reading the vector
1087 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1088  const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
1089  extract_block<VecBlockProperties, is_internal_block, KFactor,
1090  Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
1091  vectorOffset, contractDim);
1092 
1093  itemID.barrier(cl::sycl::access::fence_space::local_space);
1094  auto in_scratch_ptr = scratch_ptr + contractId;
1095 #endif
1096 
1097  StorageIndex privateOffsetC = 0;
1099  for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
1100  StorageIndex privateOffsetNC = 0;
1101  bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
1102 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1103  auto vecScalar = *in_scratch_ptr;
1104 #else
1105  auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
1106  ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
1107  is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
1108  : OutScalar(0);
1109 #endif
1111  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1112  auto matScalar = (check_boundary<is_internal_block>(
1113  contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
1114  ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
1115  : globalNonContractDimOffset + privateOffsetNC,
1116  is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
1117  : globalContractDimOffset + privateOffsetC)
1118  : OutScalar(0);
1119 
1120  outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
1121  privateOffsetNC += Properties::LocalThreadSizeNC;
1122  }
1123  privateOffsetC += Properties::LocalThreadSizeC;
1124 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1125  in_scratch_ptr += Properties::LocalThreadSizeC;
1126 #endif
1127  }
1128 
1129  auto out_scratch_ptr = local_output + outScratchIndex;
1130  // Each block of 16*16 elements in shared memory should reduce to 16*1
1132  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1133  *out_scratch_ptr = outScalar[j];
1134 
1135  out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1136  }
1137  if (is_lhs_vec) {
1138  nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
1139  contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
1140  outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
1141  }
1142 
1143  out_scratch_ptr = local_output + outScratchIndex;
1145  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1147  for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
1148  itemID.barrier(cl::sycl::access::fence_space::local_space);
1149  if (contractId < offset) {
1150  StorageIndex myNeighbourId = (Properties::LocalThreadSizeNC * offset);
1151  *out_scratch_ptr += out_scratch_ptr[myNeighbourId];
1152  }
1153  }
1154  // moving to next 16 by 16 block
1155  out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1156  }
1157 
1158  if (contractId == 0) {
1159  out_scratch_ptr = local_output + nonContractId;
1160  StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
1161  out_ptr += global_final_offset;
1163  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1164  if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
1165  auto res = *out_scratch_ptr;
1166 
1167  *out_ptr = res;
1168  out_ptr += Properties::LocalThreadSizeNC;
1169  }
1170  // moving to the next 16 by 16 block to get the next 16 reduced elements
1171  out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1172  if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
1173  }
1174  }
1175  }
1176 
1177  template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
1178  typename Local>
1179  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
1180  const StorageIndex &linearLocalThreadId,
1181  const StorageIndex &cOffset, const StorageIndex &C) {
1182  local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
1183  StorageIndex cIndex = cOffset;
1184  for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
1185  if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
1186  auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
1187  InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
1188  cIndex, StorageIndex(1));
1189  write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
1190  } else {
1192  for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
1193  OutScalar val =
1194  (cIndex + i < C)
1195  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
1196  inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
1197  : OutScalar(0);
1198  write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
1199  }
1200  }
1201  local_ptr += InputBlockProperties::c_stride * GroupSize;
1202  cIndex += InputBlockProperties::c_stride * GroupSize;
1203  }
1204  }
1205 };
1206 #endif
1207 
1208 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1209 
1241 template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
1242  typename RhsMapper, typename StorageIndex, bool Vectorizable>
1243 struct GeneralScalarContraction {
1244  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
1245  Scratch scratch;
1246  const LhsMapper lhs;
1247  const RhsMapper rhs;
1248  OutAccessor out_res;
1249  const StorageIndex rng;
1250 
1252  GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_,
1253  const StorageIndex rng_)
1254  : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
1255 
1256  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
1257 
1258  auto out_ptr = out_res;
1259  OutScalar * scratch_ptr = scratch.get_pointer();
1260 
1261  StorageIndex globalid = itemID.get_global_id(0);
1262  StorageIndex localid = itemID.get_local_id(0);
1263  OutScalar accumulator = OutScalar(0);
1264  for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
1265  accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
1266  }
1267  auto out_scratch_ptr = scratch_ptr + localid;
1268  *out_scratch_ptr = accumulator;
1269  for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
1270  itemID.barrier(cl::sycl::access::fence_space::local_space);
1271  if (localid < offset) {
1272  *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
1273  }
1274  }
1275  if (localid == 0) {
1276  out_ptr[itemID.get_group(0)] = accumulator;
1277  }
1278  }
1279 };
1280 #endif
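// Reduction sketch (informal): each work-item of GeneralScalarContraction accumulates a strided slice of the dot
// product (elements globalid, globalid + globalRange, ...), parks its partial sum in local memory, and then the
// workgroup performs a halving tree reduction: for a local range of 64 the active offsets are 32, 16, 8, 4, 2, 1,
// i.e. log2(64) == 6 barrier-separated steps. Work-item 0 then writes one partial result per workgroup to
// out_ptr[group id]; the per-group partials are combined afterwards outside this kernel.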
1281 
1282 } // namespace internal
1283 } // namespace TensorSycl
1284 
1285 template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
1286 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>,
1287  Eigen::SyclDevice>
1288  : public TensorContractionEvaluatorBase<TensorEvaluator<
1289  const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
1290  static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
1291  "SYCL tensor contraction does not support output kernels.");
1292 
1293  typedef Eigen::SyclDevice Device;
1294 
1298  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
1299  typedef typename XprType::Index StorageIndex;
1302  typedef typename Base::Storage Storage;
1304  struct TripleDim {
1305  const StorageIndex M;
1306  const StorageIndex N;
1307  const StorageIndex K;
1308  TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
1309  };
1310  enum {
1312  BlockAccess = false,
1313  };
1314 
1316  static constexpr int LDims = Base::LDims;
1317  static constexpr int RDims = Base::RDims;
1318  static constexpr int ContractDims = Base::ContractDims;
1319 
1322 
1324  typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
1325  typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
1326 
1327  static constexpr int NumDims = LDims + RDims - 2 * ContractDims;
1328 
1330 
1333  typedef std::remove_const_t<typename LeftEvaluator::CoeffReturnType> LhsScalar;
1334  typedef std::remove_const_t<typename RightEvaluator::CoeffReturnType> RhsScalar;
1335 
1338 
1339  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
1340  struct input_mapper_propertis {
1341  static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
1342  static EIGEN_CONSTEXPR bool is_rhs_matrix =
1343  (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
1344  };
1345 
1346  TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
1347 
1348  // We need to redefine this method to make nvcc happy
1349  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) {
1350  this->m_leftImpl.evalSubExprsIfNeeded(NULL);
1351  this->m_rightImpl.evalSubExprsIfNeeded(NULL);
1352  if (!data) {
1353  this->m_result = this->m_device.get(
1354  static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
1355  data = this->m_result;
1356  }
1357  evalToSycl(data);
1358  return (this->m_result != NULL);
1359  }
1360  const Eigen::SyclDevice &device() const { return this->m_device; }
1361  void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
1362  if (this->m_lhs_inner_dim_contiguous) {
1363  if (this->m_rhs_inner_dim_contiguous) {
1364  if (this->m_rhs_inner_dim_reordered) {
1365  evalTyped<true, true, true, Unaligned>(buffer);
1366  } else {
1367  evalTyped<true, true, false, Unaligned>(buffer);
1368  }
1369  } else {
1370  if (this->m_rhs_inner_dim_reordered) {
1371  evalTyped<true, false, true, Unaligned>(buffer);
1372  } else {
1373  evalTyped<true, false, false, Unaligned>(buffer);
1374  }
1375  }
1376  } else {
1377  if (this->m_rhs_inner_dim_contiguous) {
1378  if (this->m_rhs_inner_dim_reordered) {
1379  evalTyped<false, true, true, Unaligned>(buffer);
1380  } else {
1381  evalTyped<false, true, false, Unaligned>(buffer);
1382  }
1383  } else {
1384  if (this->m_rhs_inner_dim_reordered) {
1385  evalTyped<false, false, true, Unaligned>(buffer);
1386  } else {
1387  evalTyped<false, false, false, Unaligned>(buffer);
1388  }
1389  }
1390  }
1391  }
1392 
1393  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
1394  void evalTyped(typename Base::EvaluatorPointerType buffer) const {
1395  const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
1396  typedef internal::TensorContractionInputMapper<
1397  LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
1398  PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakePointer>
1399  LhsMapper;
1400 
1401  typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
1402  right_nocontract_t, contract_t,
1403  PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
1404  rhs_inner_dim_reordered, Unaligned, MakePointer>
1405  RhsMapper;
1406 
1407  // initialize data mappers
1408  LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
1409  this->m_left_contracting_strides, this->m_k_strides);
1410 
1411  RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
1412  this->m_right_contracting_strides, this->m_k_strides);
1413 
1414 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1415  if (triple_dim.M == 1 && triple_dim.N == 1) {
1416  launchSC(buffer, lhs, rhs, triple_dim.K);
1417  } else
1418 #endif
1419 #ifndef EIGEN_SYCL_DISABLE_GEMV
1420  if (triple_dim.M != 1 && triple_dim.N == 1) {
1421  LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
1422  } else if (triple_dim.M == 1 && triple_dim.N != 1) {
1423  LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
1424  } else // This is equivalent to if (m != 1 && n != 1)
1425 #endif
1426  {
1427  typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
1428  inpt_mapper_properties;
1429 #ifndef EIGEN_SYCL_DISABLE_SKINNY
1430  bool skinny = false;
1431  auto platform_name = this->device().getPlatformName();
1432  // This is based on empirical calculation for AMD r9-nano and Fiji
1433  if (platform_name.find("AMD") == 0) {
1434  skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
1435  ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
1436  (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
1437  } else {
1438  skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
1439  ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
1440  ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
1441  }
1442  if (skinny)
1443  adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
1444  else
1445 #endif // EIGEN_SYCL_DISABLE_SKINNY
1446  adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
1447  }
1448  }
1449 
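// Dispatch summary (informal), following the branches in evalTyped above: a 1x1 output (M == 1 && N == 1) is a dot
// product and goes to launchSC; an Mx1 output uses LaunchVT<false> with the RHS acting as the vector; a 1xN output
// uses LaunchVT<true> with the LHS acting as the vector; everything else is a general tensor-tensor contraction,
// where adjustTT may enable the skinny (split-K) path when one dimension is much larger than the others.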
1450  template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
1451  void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1452  const TripleDim &triple_dim) const {
1453 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1454  if (device().has_local_memory()) {
1455  typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 16> PanelParameters;
1456  launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
1457  buffer, lhs, rhs, triple_dim);
1458  }
1459 #endif
1460 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
1461  if (!(device().has_local_memory())) {
1462  typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 4> PanelParameters;
1463  launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
1464  buffer, lhs, rhs, triple_dim);
1465  }
1466 #endif
1467  }
1468 
1469  template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
1470  typename Properties, typename LhsMapper, typename RhsMapper>
1471  void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1472  const TripleDim &triple_dim) const {
1473  const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
1474  const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
1475  const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
1476  const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
1477 
1478  const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
1479  StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
1480  StorageIndex groupSizeK =
1481  skinny
1482  ? std::max(std::min(totalTilesK,
1483  (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
1484  (groupSizeM * groupSizeN)),
1485  StorageIndex(1))
1486  : StorageIndex(1);
1487 
1488  const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
1489 
1490  const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
1491 
1492  const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
1493  const StorageIndex globalRange = totalGroupSize * localRange;
1494 
1495  const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local)
1496  ? ((Properties::DoubleBuffer + 1) *
1497  (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
1498  ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
1499  (Properties::TileSizeDimN + Properties::BC))
1500  : StorageIndex(1);
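 // [Editorial note, not part of the original source] scratchSize is the per-work-group local-memory budget,
 // in elements: room for an LHS tile of (TileSizeDimM + BC) x TileSizeDimK and an RHS tile of
 // TileSizeDimK x (TileSizeDimN + BC), each held (DoubleBuffer + 1) times so the next tile can be staged
 // while the current one is consumed; BC adds padding intended to avoid bank conflicts. The no_local path
 // keeps no tiles in local memory, so the size collapses to 1.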
1501 
1502  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
1503  if (groupSizeK == 1) {
1504  typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
1505  LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
1506  PacketAccess, input_mapper_properties, true, ct>
1507  ContractKernelName;
1508  device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1509  lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim).wait();
1510  } else {
1511  typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
1512  LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
1513  PacketAccess, input_mapper_properties, false, ct>
1514  ContractKernelName;
1515  CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
1516  device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
1517  EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
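 // [Editorial note, not part of the original source] In this split-K branch each of the groupSizeK slices
 // writes its own M x N block of partial sums into tmp_global_accessor; the SecondStepPartialReduction
 // launched below then folds the groupSizeK partials of every output coefficient into `buffer` before the
 // temporary allocation is released.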
1518 
1519  device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1520  lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
1521  triple_dim).wait();
1522 
1523  typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
1524  auto op = Op();
1525  typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
1526  EvaluatorPointerType, Op>
1527  ReductionKernel;
1528 
1529  device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
1530  tmp_global_accessor, buffer,
1531  cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
1532  Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
1533  cl::sycl::range<1>(localRange)),
1534  StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK).wait();
1535  device().deallocate_temp(temp_pointer);
1536  }
1537  }
1538 
1539 #ifndef EIGEN_SYCL_DISABLE_GEMV
1540  template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
1541  void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
1542  StorageIndex NC, StorageIndex C) const {
1543  const StorageIndex nonContractDim = NC;
1544  EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
1545  EIGEN_CONSTEXPR StorageIndex CFactor = 1;
1546  EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
1547  typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor>
1548  Properties;
1549  const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
1550  const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
1551  const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
1552  const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
1553  const StorageIndex globalRange =
1554  (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
1555  const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
1556  const StorageIndex scratchSize =
1557  (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
1558  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
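 // [Editorial note, not part of the original source] With NCWindow = 16, CFactor = NCFactor = 1 and the
 // (assumed) default 16x16 work-group, TVPanelSize gives TileSizeDimNC = 16, TileSizeDimC = 256,
 // WorkLoadPerThreadNC = 1 and WorkLoadPerThreadC = 16. For NC = 1024, C = 4096 that means
 // cNumGroups = 4096 / (16 * 16) = 16 and nCNumGroups = 1024 / (16 * 1) = 64, so the cNumGroups > 1 branch
 // below stores nonContractDim * cNumGroups partial results and sums them in a second reduction step; only
 // when a single group covers the contracting dimension is the result written to `buffer` directly.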
1559  if (cNumGroups > 1) {
1560  typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
1561  TensorMapper, StorageIndex, Properties, CFactor, false,
1562  is_lhs_vec, false>
1563  ContractKernelName;
1564  CoeffReturnType *temp_pointer =
1565  static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
1566  EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
1567 
1568  device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1569  vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C).wait();
1570 
1571  typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
1572  typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
1573  EvaluatorPointerType, Op>
1574  ReductionKernel;
1575 
1576  device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
1577  tmp_global_accessor, buffer,
1578  cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
1579  cl::sycl::range<1>(localRange)),
1580  StorageIndex(1), Op(), nonContractDim, cNumGroups).wait();
1581  device().deallocate_temp(temp_pointer);
1582  } else {
1583  typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
1584  TensorMapper, StorageIndex, Properties, CFactor, false,
1585  is_lhs_vec, true>
1586  ContractKernelName;
1587  device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1588  vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C).wait();
1589 
1590  }
1591  }
1592 #endif
1593 
1594 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1595  template <typename LhsMapper, typename RhsMapper>
1596  EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1597  StorageIndex K) const {
1598  EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
1599  (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
1600  "The Local thread size must be a power of 2 for the reduction "
1601  "operation");
1602  EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
1603 
1604  // Here we restrict the kernel to at most a two-step reduction: our empirical results show that we get
1605  // better performance when each thread reduces at least 512 elements on its own.
1606  const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
1607  const StorageIndex global_range = num_work_group * local_range;
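 // [Editorial note, not part of the original source] Assuming the default 16x16 local dimensions,
 // local_range = 256 and the threshold 512 * local_range = 131072. For K = 1,000,000 the quotient above is
 // 8 > 1, so num_work_group = 256 and global_range = 65536; the 256 per-group partial sums are then folded
 // by the SecondStepFullReducer launched below. For K <= 131072 a single work-group reduces straight into
 // `buffer`.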
1608 
1609  typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
1610  CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
1611  ContractKernelName;
1612  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
1613  if (num_work_group > 1) {
1614  CoeffReturnType *temp_pointer =
1615  static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
1616  EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
1617  device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
1618  thread_range, local_range, K).wait();
1619  typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
1620  typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
1621  EvaluatorPointerType, StorageIndex, local_range>
1622  GenericRKernel;
1623  device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
1624  tmp_global_accessor, buffer,
1625  cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()).wait();
1626  device().deallocate_temp(temp_pointer);
1627  } else {
1628  device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
1629  local_range, K).wait();
1630  }
1631  }
1632 #endif
1633 
1634  EIGEN_STRONG_INLINE void cleanup() {
1635  this->m_leftImpl.cleanup();
1636  this->m_rightImpl.cleanup();
1637 
1638  if (this->m_result) {
1639  this->m_device.deallocate_temp(this->m_result);
1640  this->m_result = NULL;
1641  }
1642  }
1643 };
1644 } // namespace Eigen
1645 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H