#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
// Helper used by the GPU path below: maps per-plane and per-kernel GPU
// indices back to flat offsets in the input and output tensors.
template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {
    // The output shrinks by (kernel_dim - 1) along each convolved dimension.
    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }

    // Strides of the input and output tensors in their native layout.
    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
        outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }
    // Reorder the dimensions so that the convolved dimensions come first
    // (ColMajor) or last (RowMajor), and compute the GPU-side dimensions and
    // strides in that order.
    array<Index, NumDims> gpuInputDimensions;
    array<Index, NumDims> gpuOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      gpuInputDimensions[index] = input_dims[indices[i]];
      gpuOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                      ? NumKernelDims
                      : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        gpuInputDimensions[written] = input_dims[i];
        gpuOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_gpuInputStrides[i] =
              m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
          m_gpuOutputStrides[i] =
              m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (static_cast<size_t>(i + 1) < offset) {
          m_gpuInputStrides[i] =
              m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
          m_gpuOutputStrides[i] =
              m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      if (NumKernelDims < NumDims) {
        inputIndex += p * m_inputStrides[NumKernelDims];
      }
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      if (NumKernelDims < NumDims) {
        outputIndex += p * m_outputStrides[NumKernelDims];
      }
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index mapGpuInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] +
           j * m_inputStrides[offset + 1];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] +
           j * m_outputStrides[offset + 1];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] +
           j * m_inputStrides[offset + 1] +
           k * m_inputStrides[offset + 2];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] +
           j * m_outputStrides[offset + 1] +
           k * m_outputStrides[offset + 2];
  }
 private:
  static constexpr int NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_gpuInputStrides;
  array<Index, NumDims> m_gpuOutputStrides;
};
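// Worked example (added commentary, not part of the original header): for a
// ColMajor input of dimensions (5, 4, 3) convolved along dimension 0 with a
// kernel of size 2, the output dimensions are (4, 4, 3), the input strides
// are (1, 5, 20) and the output strides are (1, 4, 16). IndexMapper stores
// these strides reordered so that the convolved dimensions come first
// (ColMajor) or last (RowMajor), which is the ordering the GPU kernels below
// use to index shared-memory tiles.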
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
  // Type promotion to handle the case where the types of the lhs and the rhs
  // are different.
  typedef typename promote_storage_type<typename InputXprType::Scalar,
                                        typename KernelXprType::Scalar>::ret Scalar;
  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
  typedef typename promote_index_type<typename traits<InputXprType>::Index,
                                      typename traits<KernelXprType>::Index>::type Index;
  typedef typename InputXprType::Nested LhsNested;
  typedef typename KernelXprType::Nested RhsNested;
  typedef std::remove_reference_t<LhsNested> LhsNested_;
  typedef std::remove_reference_t<RhsNested> RhsNested_;
  typedef std::conditional_t<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
                             typename traits<InputXprType>::PointerType,
                             typename traits<KernelXprType>::PointerType> PointerType;
};
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};
template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
  public:
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                    typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;

    EIGEN_DEVICE_FUNC TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
        : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

    EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; }

    /** \returns the nested expressions */
    EIGEN_DEVICE_FUNC const internal::remove_all_t<typename InputXprType::Nested>& inputExpression() const { return m_input_xpr; }
    EIGEN_DEVICE_FUNC const internal::remove_all_t<typename KernelXprType::Nested>& kernelExpression() const { return m_kernel_xpr; }

  protected:
    typename InputXprType::Nested m_input_xpr;
    typename KernelXprType::Nested m_kernel_xpr;
    const Indices m_indices;
};
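// Usage sketch (added commentary, not part of the original header): the
// expression above is what TensorBase::convolve() builds. Dimensions below
// are illustrative only.
//
//   Eigen::Tensor<float, 3> input(128, 128, 16);
//   Eigen::Tensor<float, 2> kernel(3, 3);
//   Eigen::array<ptrdiff_t, 2> dims({0, 1});        // convolve along dims 0 and 1
//   Eigen::Tensor<float, 3> output(126, 126, 16);   // each convolved dim shrinks by kernel_dim - 1
//   output = input.convolve(kernel, dims);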
template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  static constexpr int Layout = TensorEvaluator<InputArgType, Device>::Layout;

  enum {
    IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) &
                int(TensorEvaluator<KernelArgType, Device>::IsAligned),
    PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) &
                   int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };
  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }
    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }
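  // Note (added commentary): each convolved dimension produces
  // input_dim - kernel_dim + 1 output coefficients (a "valid" convolution);
  // e.g. an input dimension of 10 convolved with a kernel dimension of 3
  // yields 8 outputs. Dimensions not listed in op.indices() are unchanged.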
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims - 1, result);
    return result;
  }
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
  {
    Index indices[2] = {index, index + PacketSize - 1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1] - startInputs[0] == PacketSize - 1) {
      // The whole packet maps to contiguous input locations: convolve it in
      // one vectorized pass.
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
      return result;
    } else {
      // Fall back to scalar convolutions and assemble the packet afterwards.
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
      for (int i = 1; i < PacketSize - 1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
      }
      data[PacketSize - 1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);
      return internal::pload<PacketReturnType>(data);
    }
  }
  EIGEN_DEVICE_FUNC TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
 private:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex - 1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }
  template <typename Packet>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex - 1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }
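  // Illustrative note (added commentary): for a 2-D kernel the recursion
  // above unrolls to two nested loops, conceptually
  //
  //   for (j1 = 0; j1 < kernel_dim1; ++j1)
  //     for (j0 = 0; j0 < kernel_dim0; ++j0)
  //       accum += input(first + j1 * indexStride1 + j0 * indexStride0)
  //              * kernel[j1 * kernelStride1 + j0 * kernelStride0];
  //
  // i.e. a plain dense correlation over the kernel support.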
  EIGEN_DEVICE_FUNC void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  array<Index, NumDims> m_inputStride;
  array<Index, NumDims> m_outputStride;
  array<Index, NumKernelDims> m_indexStride;
  array<Index, NumKernelDims> m_kernelStride;
  TensorEvaluator<InputArgType, Device> m_inputImpl;
  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
  Dimensions m_dimensions;

  KernelArgType m_kernelArg;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const Device EIGEN_DEVICE_REF m_device;
};
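// Design note (added commentary): the evaluator above reads the kernel
// coefficients once per output coefficient, so preloadKernel() materializes
// the kernel into a flat buffer exactly once when the kernel argument is an
// unevaluated expression (m_kernelImpl.data() == NULL); cleanup() releases
// that buffer when m_local_kernel is set.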
// Use an optimized GPU implementation of the evaluation code whenever possible.
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
    return StaticKernelSize;
  }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
    return kernelSize;
  }
};
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load the required input values into shared memory.
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    #pragma unroll
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution for this plane.
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      #pragma unroll
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
      buffer[tensor_index] = result;
    }

    __syncthreads();
  }
};
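// Sizing note (added commentary): each block stages a tile of
// maxX + kernelSize - 1 input values per plane in shared memory and produces
// maxX outputs from it; e.g. maxX = 128 with kernelSize = 4 loads 131 floats
// per plane, and each loaded value is reused by up to 4 output positions.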
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load the required 2-D input tile into shared memory.
    #pragma unroll
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Convolve the tile.
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        #pragma unroll
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          #pragma unroll
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
};
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  // Load inputs to shared memory.
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y, k + first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Convolution.
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y, k + first_z);
          buffer[tensor_index] = result;
        }
      }
    }

    __syncthreads();
  }
};
template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
  static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout;

  enum {
    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned &
                TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  typedef internal::TensorBlockNotImplemented TensorBlock;

  TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL),
        m_local_kernel(false), m_device(device)
  {
    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }
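  // Note (added commentary): unlike the generic evaluator, this GPU
  // specialization evaluates the whole convolution eagerly into m_buf during
  // evalSubExprsIfNeeded(); coeff() and packet() then simply read from that
  // buffer.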
  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }
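  // Added note: this is plain ceiling division, e.g. ceil(10, 4) == 3; it is
  // used below to size the launch grid so that every output element is covered.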
  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize reuse along the contiguous dimension.
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        } else {
          // Read as much as possible alongside the inner most dimension, that is the plane.
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        // Dispatch on the kernel size so that small, common sizes get a fully
        // unrolled inner loop.
        switch (kernel_size) {
          case 4: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }
      case 2: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX * numY);

        const float scaling_factor =
            sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap maxX to warp size.
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }
      case 3: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY = 1;
        const int idxZ =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX * numY * numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);

        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY], m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }
      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }
  EIGEN_DEVICE_FUNC TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }

 private:
  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const GpuDevice& m_device;
};

#endif  // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H