#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
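// This header implements the SYCL backend of TensorConvolutionOp: three EigenConvolutionKernel
// functors (1D, 2D and 3D convolution over a tile of the input cached in local memory) and the
// TensorEvaluator specialization that configures and launches them.
//
// A minimal usage sketch, assuming the usual Tensor SYCL setup (the queue/device construction and
// the gpu_* TensorMap names below are illustrative assumptions, not part of this header):
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector{});
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   Eigen::array<Eigen::Index, 2> dims{{0, 1}};  // convolve over the first two dimensions
//   // gpu_input (70x30x50), gpu_kernel (3x3) and gpu_result (68x28x50) stand for TensorMaps over
//   // device buffers; each convolved dimension shrinks to (input_dim - kernel_dim + 1).
//   gpu_result.device(sycl_device) = gpu_input.convolve(gpu_kernel, dims);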
enum class convolution_type { CONV1D, CONV2D, CONV3D };

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV1D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
  const size_t kernelSize;
  const cl::sycl::range<2> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernelSize(kernelSize_),
        input_range(input_range_) {}

  template <typename BooleanDim2>
  bool boundary_check(const BooleanDim2 boolean_check) const {
    return (boolean_check[0] && boolean_check[1]);
  }

  void operator()(cl::sycl::nd_item<2> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    // Number of input values each plane needs in local memory: the block plus the kernel halo.
    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
    // Fill local memory, padding out-of-range elements with zero.
    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
      const size_t local_index = i + plane_kernel_offset;
      const size_t tensor_index =
          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
      local_acc[local_index] =
          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
              ? device_evaluator.coeff(tensor_index)
              : CoeffReturnType(0);
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // Accumulate the convolution for this work-item's output element.
    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
      for (size_t k = 0; k < kernelSize; ++k) {
        result += (local_acc[k + index] * kernel_ptr[k]);
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
      buffer_ptr[tensor_index] = result;
    }
  }
};
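// Each 1D work-group convolves a contiguous block of one "plane" (all non-convolved dimensions
// collapsed into get_global_id(1)). It first stages local_range[0] + kernelSize - 1 input values
// per plane into local memory; the extra kernelSize - 1 entries are the halo needed by the last
// outputs of the block. For example, a 64-wide work-group with a kernel of size 5 stages 68
// values. Out-of-range loads are padded with CoeffReturnType(0) and are never written back, since
// the final store is guarded by boundary_check.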
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV2D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
  const cl::sycl::range<2> kernel_size;
  const cl::sycl::range<3> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_) {}

  template <typename BooleanDim3>
  bool boundary_check(const BooleanDim3 boolean_check) const {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

  void operator()(cl::sycl::nd_item<3> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    // Size of the local-memory tile for each plane: the work-group block plus the kernel halo.
    const auto num_input = cl::sycl::range<2>{
        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                 itemID.get_group(1) * itemID.get_local_range()[1]};

    // Fill local memory, padding out-of-range elements with zero.
    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
        const size_t local_index = i + local_input_offset;
        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                             i + input_offset[0], j + input_offset[1]);
        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
                                  in_range_dim1 && in_range_dim2)
                                     ? device_evaluator.coeff(tensor_index)
                                     : CoeffReturnType(0);
      }
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // Accumulate the 2D convolution for this work-item's output element.
    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                  itemID.get_group(1) * itemID.get_local_range()[1]};

    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      for (size_t j = 0; j < kernel_size[1]; j++) {
        size_t kernel_offset = kernel_size[0] * j;
        const size_t index =
            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
        for (size_t i = 0; i < kernel_size[0]; i++) {
          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
        }
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
                                                             itemID.get_local_id(1) + output_offset[1]);
      buffer_ptr[tensor_index] = result;
    }
  }
};
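// The 2D kernel applies the same tiling scheme in two dimensions: each work-group stages a
// (local_range[0] + kernel_size[0] - 1) x (local_range[1] + kernel_size[1] - 1) tile of its plane
// in local memory (e.g. a 16x16 work-group with a 3x3 kernel stages 18 * 18 = 324 values), and
// each work-item then accumulates kernel_size[0] * kernel_size[1] products from that tile.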
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV3D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
  const cl::sycl::range<3> kernel_size;
  const cl::sycl::range<3> input_range;
  const size_t numP;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
                         const size_t numP_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_),
        numP(numP_) {}

  template <typename BooleanDim3>
  bool boundary_check(const BooleanDim3 boolean_check) const {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

  void operator()(cl::sycl::nd_item<3> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

    const auto output_offset =
        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

    for (size_t p = 0; p < numP; p++) {
      // Fill local memory for plane p, padding out-of-range elements with zero.
      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
            const size_t local_index = local_index_dim1 + i;
            const size_t tensor_index =
                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
          }
        }
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);

      // Accumulate the 3D convolution for this work-item's output element in plane p.
      if (boundary_check(itemID.get_global_id() < input_range)) {
        CoeffReturnType result = static_cast<CoeffReturnType>(0);
        for (size_t k = 0; k < kernel_size[2]; k++) {
          for (size_t j = 0; j < kernel_size[1]; j++) {
            for (size_t i = 0; i < kernel_size[0]; i++) {
              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
              const size_t local_index =
                  ((i + itemID.get_local_id(0)) +
                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
            }
          }
        }
        const size_t tensor_index =
            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
        buffer_ptr[tensor_index] = result;
      }

      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
  }
};
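// The 3D kernel reuses a single local-memory tile for every plane p in [0, numP): the first
// barrier makes the freshly staged tile visible to all work-items before accumulation, and the
// second barrier at the end of the loop body keeps any work-item from overwriting the tile with
// data for plane p + 1 while others are still reading plane p.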
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
  static constexpr int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename InputArgType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Eigen::SyclDevice>::size;
  static constexpr int Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout;

  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false, BlockAccess = false, PreferBlockAccess = false,
    CoordAccess = false, RawAccess = false
  };

  TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);
    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions& input_dims =
        m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions& kernel_dims =
        m_kernelImpl.dimensions();

    // Each convolved dimension shrinks to input_dim - kernel_dim + 1.
    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }
  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    executeEval(data);
    return false;
  }

  void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  void preloadKernel() {
    // Use the kernel data in place when it is already materialized; otherwise evaluate the kernel
    // expression into a temporary device buffer first.
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar*)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }
  void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{1, 1};
        auto local_range = cl::sycl::range<2>{1, 1};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])).wait();
        break;
      }
      case 2: {
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};
        auto global_range = cl::sycl::range<3>{1, 1, 1};
        auto local_range = cl::sycl::range<3>{1, 1, 1};
        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}).wait();
        break;
      }
      case 3: {
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        auto global_range = cl::sycl::range<3>{1, 1, 1};
        auto local_range = cl::sycl::range<3>{1, 1, 1};
        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP).wait();
        break;
      }
      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }
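  // Launch-configuration note: parallel_for_setup picks the global/local nd-range for the
  // flattened problem, and the per-work-group local-memory budget is checked against
  // sharedMemPerBlock(). For example, with local_range = {16, 16, 1} and a 3x3 kernel in the 2D
  // case, the tile needs (16 + 2) * (16 + 2) * 1 = 324 elements of CoeffReturnType in local memory.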
  template <int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }
  EIGEN_DEVICE_FUNC TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
};

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H