#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
// -------------------------------------------------------------------------- //
// Forward declaration for the block IO class defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO;
// -------------------------------------------------------------------------- //
// Helper function to compute strides for a densely stored buffer of given
// dimensions.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}
template <int Layout, typename IndexType, size_t NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const Eigen::array<IndexType, NumDims>& dimensions) {
  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
}
template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
    const Sizes<Indices...>& sizes) {
  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
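// Usage sketch for the helpers above (the 4x5x6 shape is an arbitrary
// assumption, chosen only to make the resulting strides easy to verify):
//
//   DSizes<Eigen::Index, 3> dims(4, 5, 6);
//   DSizes<Eigen::Index, 3> col = strides<ColMajor>(dims);  // [1, 4, 20]
//   DSizes<Eigen::Index, 3> row = strides<RowMajor>(dims);  // [30, 6, 1]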
// -------------------------------------------------------------------------- //
// TensorBlockResourceRequirements captures the block shape, the block size and
// the estimated per-coefficient cost requested by a tensor (sub)expression.
enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };

struct TensorBlockResourceRequirements {
  TensorBlockShapeType shape_type;  // target block shape
  size_t size;                      // target block size (in coefficients)
  TensorOpCost cost_per_coeff;      // cost of computing a single block element

#ifdef EIGEN_HIPCC
  // For HIPCC we need to explicitly declare as a "device function" the
  // constructor that is implicitly invoked in the "merge" / "any" static
  // functions below.
  EIGEN_DEVICE_FUNC
  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_,
                                  size_t size_, TensorOpCost cost_)
      : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {}
#endif

  template <typename Scalar>
  static TensorBlockResourceRequirements withShapeAndSize(
      TensorBlockShapeType shape_type, size_t size_in_bytes,
      TensorOpCost cost) {
    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
    return {shape_type, size, cost};
  }

  template <typename Scalar>
  static TensorBlockResourceRequirements withShapeAndSize(
      TensorBlockShapeType shape_type, size_t size_in_bytes) {
    // Default cost: one coefficient read and one coefficient write, with a
    // linear access pattern and no extra compute.
    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
                                    {/*bytes_loaded=*/sizeof(Scalar),
                                     /*bytes_stored=*/sizeof(Scalar),
                                     /*compute_cycles=*/0});
  }

  template <typename Scalar>
  static TensorBlockResourceRequirements skewed(size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
                                    size_in_bytes);
  }

  template <typename Scalar>
  static TensorBlockResourceRequirements uniform(size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
                                    size_in_bytes);
  }

  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements merge(
      const TensorBlockResourceRequirements& lhs,
      const TensorBlockResourceRequirements& rhs) {
    return {merge(lhs.shape_type, rhs.shape_type),
            merge(lhs.size, rhs.size),
            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};
  }

  TensorBlockResourceRequirements& addCostPerCoeff(TensorOpCost cost) {
    cost_per_coeff += cost;
    return *this;
  }

  // Requirements for expressions that have no block evaluation preference.
  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
  }

 private:
  using Requirements = TensorBlockResourceRequirements;

  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
    return numext::maxi(lhs_size, rhs_size);
  }

  static EIGEN_STRONG_INLINE TensorBlockShapeType merge(
      TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
    return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
            rhs == TensorBlockShapeType::kSkewedInnerDims)
               ? TensorBlockShapeType::kSkewedInnerDims
               : TensorBlockShapeType::kUniformAllDims;
  }

  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
                                                TensorOpCost rhs_cost) {
    return lhs_cost + rhs_cost;
  }
};
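// Usage sketch: merging the preferences of two sub-expressions (the block
// sizes and the float scalar type are arbitrary assumptions):
//
//   TensorBlockResourceRequirements lhs =
//       TensorBlockResourceRequirements::skewed<float>(/*size_in_bytes=*/48 << 10);
//   TensorBlockResourceRequirements rhs =
//       TensorBlockResourceRequirements::uniform<float>(/*size_in_bytes=*/16 << 10);
//   // Keeps the more constrained shape (skewed) and the larger coefficient
//   // count, and adds up the per-coefficient costs.
//   TensorBlockResourceRequirements merged =
//       TensorBlockResourceRequirements::merge(lhs, rhs);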
// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.
template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  // If the left-hand side of an assignment already owns a memory buffer, a
  // block may be materialized directly into that buffer. The pointer type of
  // the underlying storage is erased to keep the descriptor scalar-agnostic.
  class DestinationBuffer {
   public:
    enum DestinationBufferKind : int {
      // Destination buffer is not defined (`m_data` == NULL).
      kEmpty,

      // The block defined by the owning descriptor fits contiguously into the
      // destination buffer, so it is safe to materialize the block there and
      // wrap it in a TensorMap.
      kContiguous,

      // Destination buffer strides do not match the strides of a contiguously
      // stored block; only a root expression may be materialized into it.
      kStrided
    };

    template <typename Scalar>
    Scalar* data() const {
      return static_cast<Scalar*>(m_data);
    }

    const Dimensions& strides() const { return m_strides; }
    const DestinationBufferKind& kind() const { return m_kind; }

   private:
    friend class TensorBlockDescriptor<NumDims, IndexType>;

    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}

    template <typename Scalar>
    DestinationBuffer(Scalar* data, const Dimensions& strides,
                      DestinationBufferKind kind)
        : m_data(static_cast<void*>(data)),
          m_data_type_size(sizeof(Scalar)),
          m_strides(strides),
          m_kind(kind) {}

    template <int Layout, typename Scalar>
    static DestinationBuffer make(const TensorBlockDescriptor& desc,
                                  Scalar* data, const Dimensions& strides) {
      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
    }

    template <int Layout>
    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
                                      const Dimensions& strides) {
      const Dimensions& desc_dims = desc.dimensions();
      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
      for (int i = 0; i < NumDims; ++i) {
        if (desc_dims[i] == 1) continue;
        if (desc_strides[i] != strides[i]) return kStrided;
      }
      return kContiguous;
    }

    // Storage pointer is type erased, but we still keep the size of the
    // underlying element type for error checking.
    void* m_data;
    size_t m_data_type_size;

    // Destination buffer dimensions always match the dimensions of the owning
    // block descriptor, but the strides might be different.
    Dimensions m_strides;

    DestinationBufferKind m_kind;
  };
  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
                        const DestinationBuffer& destination)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(destination) {}

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(DestinationBuffer()) {}

  IndexType offset() const { return m_offset; }
  const Dimensions& dimensions() const { return m_dimensions; }
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod<IndexType>(m_dimensions); }

  const DestinationBuffer& destination() const { return m_destination; }

  template <int Layout, typename Scalar>
  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
    m_destination =
        DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
  }

  template <int Layout, typename Scalar, typename DstStridesIndexType>
  void AddDestinationBuffer(
      Scalar* dst_base,
      const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
    // The DSizes constructor will do index type promotion if it is safe.
    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
  }

  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
    m_destination.m_kind = DestinationBuffer::kEmpty;
    return *this;
  }

  bool HasDestinationBuffer() const {
    return m_destination.kind() != DestinationBuffer::kEmpty;
  }

  // Returns a copy of `*this` with an updated offset.
  TensorBlockDescriptor WithOffset(IndexType offset) const {
    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
  }

 private:
  // Offset and dimensions are immutable after construction. A block descriptor
  // can only be mutated by adding or dropping a destination buffer.
  const IndexType m_offset;
  const Dimensions m_dimensions;
  DestinationBuffer m_destination;
};
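// Usage sketch (the 10x20 block, ColMajor layout and float buffer below are
// arbitrary assumptions; `dst` is a hypothetical pointer to at least
// desc.size() floats):
//
//   typedef TensorBlockDescriptor<2> BlockDesc;
//   BlockDesc desc(/*offset=*/100, BlockDesc::Dimensions(10, 20));
//   desc.AddDestinationBuffer<ColMajor>(dst, BlockDesc::Dimensions(1, 10));
//   eigen_assert(desc.HasDestinationBuffer());  // kind() == kContiguous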
// -------------------------------------------------------------------------- //
// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
template <int NumDims, int Layout, typename IndexType = Eigen::Index>
class TensorBlockMapper {
  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  TensorBlockMapper() = default;
  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
                    const TensorBlockResourceRequirements& requirements)
      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
    // Compute block dimensions and the total number of blocks.
    InitializeBlockDimensions();
  }

  IndexType blockCount() const { return m_total_block_count; }

  IndexType blockTotalSize() const { return m_block_dimensions.TotalSize(); }

  const DSizes<IndexType, NumDims>& blockDimensions() const {
    return m_block_dimensions;
  }

  BlockDescriptor blockDescriptor(IndexType block_index) const {
    static const bool isColMajor = Layout == static_cast<int>(ColMajor);

    IndexType offset = 0;
    DSizes<IndexType, NumDims> dimensions;

    if (NumDims == 0) return BlockDescriptor(offset, dimensions);

    // Iterate outer -> inner dimensions.
    for (int i = NumDims - 1; i >= 0; --i) {
      const int dim = isColMajor ? i : NumDims - i - 1;

      const IndexType idx = block_index / m_block_strides[dim];
      block_index -= idx * m_block_strides[dim];

      const IndexType coord = idx * m_block_dimensions[dim];
      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
                                     m_block_dimensions[dim]);
      offset += coord * m_tensor_strides[dim];
    }

    return {offset, dimensions};
  }

 private:
  void InitializeBlockDimensions() {
    // Requested block shape and size.
    const TensorBlockShapeType shape_type = m_requirements.shape_type;
    IndexType target_block_size =
        numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));

    IndexType tensor_size = m_tensor_dimensions.TotalSize();

    // Corner case: one of the dimensions is zero. The logic below is too
    // complex to handle this case in general, so just use unit block sizes
    // (we must not produce blocks with zero dimensions).
    if (tensor_size == 0) {
      for (int i = 0; i < NumDims; ++i) {
        m_block_dimensions[i] = 1;
      }
      m_total_block_count = 0;
      return;
    }

    // If the tensor fits into the target block size, evaluate it as a single
    // block.
    if (tensor_size <= target_block_size) {
      m_block_dimensions = m_tensor_dimensions;
      m_total_block_count = 1;
      // The only valid block index is `0`, so we do not need real strides for
      // the tensor or the blocks (see blockDescriptor).
      for (int i = 0; i < NumDims; ++i) {
        m_tensor_strides[i] = 0;
        m_block_strides[i] = 1;
      }
      return;
    }

    static const bool isColMajor = Layout == static_cast<int>(ColMajor);

    // Block shape skewed towards the inner dimension.
    if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
      IndexType coeff_to_allocate = target_block_size;

      for (int i = 0; i < NumDims; ++i) {
        const int dim = isColMajor ? i : NumDims - i - 1;
        m_block_dimensions[dim] =
            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
        coeff_to_allocate = divup(
            coeff_to_allocate,
            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
      }

    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
      // The tensor does not fit within the target block size: compute block
      // dimension sizes based on a "square" dimension size target.
      const IndexType dim_size_target = convert_index<IndexType>(
          std::pow(static_cast<float>(target_block_size),
                   1.0f / static_cast<float>(m_block_dimensions.rank())));

      for (int i = 0; i < NumDims; ++i) {
        m_block_dimensions[i] =
            numext::mini(dim_size_target, m_tensor_dimensions[i]);
      }

      // Add any unallocated coefficients to the inner dimension(s).
      IndexType total_size = m_block_dimensions.TotalSize();
      for (int i = 0; i < NumDims; ++i) {
        const int dim = isColMajor ? i : NumDims - i - 1;

        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
          const IndexType total_size_other_dims =
              total_size / m_block_dimensions[dim];
          const IndexType alloc_avail =
              divup<IndexType>(target_block_size, total_size_other_dims);
          if (alloc_avail == m_block_dimensions[dim]) {
            // Insufficient excess coefficients to allocate.
            break;
          }
          m_block_dimensions[dim] =
              numext::mini(m_tensor_dimensions[dim], alloc_avail);
          total_size = total_size_other_dims * m_block_dimensions[dim];
        }
      }
    }

    eigen_assert(m_block_dimensions.TotalSize() >=
                 numext::mini<IndexType>(target_block_size,
                                         m_tensor_dimensions.TotalSize()));

    // Calculate block counts by dimension and the total block count.
    DSizes<IndexType, NumDims> block_count;
    for (int i = 0; i < NumDims; ++i) {
      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
    }
    m_total_block_count = array_prod(block_count);

    // Calculate block strides (used for enumerating blocks).
    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
    m_block_strides = strides<Layout>(block_count);
  }

  DSizes<IndexType, NumDims> m_tensor_dimensions;
  TensorBlockResourceRequirements m_requirements;

  DSizes<IndexType, NumDims> m_block_dimensions;
  IndexType m_total_block_count;

  DSizes<IndexType, NumDims> m_tensor_strides;
  DSizes<IndexType, NumDims> m_block_strides;
};
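// Usage sketch: enumerate the blocks of a 2-D tensor (the 128x128 shape, the
// 8KB budget and the float scalar type are arbitrary assumptions):
//
//   DSizes<Eigen::Index, 2> dims(128, 128);
//   TensorBlockMapper<2, ColMajor> mapper(
//       dims, TensorBlockResourceRequirements::uniform<float>(8 * 1024));
//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
//     TensorBlockDescriptor<2> desc = mapper.blockDescriptor(i);
//     // desc.offset() and desc.dimensions() describe one tile of the tensor.
//   }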
// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation. Because expression traversal order is deterministic,
// allocations happen in the same order for every block, so after the first
// block evaluation the buffers can be reused via reset().
template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device)
      : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }

  void* allocate(size_t size) {
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    // Check if we already have an allocation at the current index.
    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    // If the current allocation can't fit the requested size, deallocate it
    // and replace it with a larger allocation.
    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    // Make a new allocation if we don't have an existing one.
    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }

  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;
  std::vector<Allocation> m_allocations;
};
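// Usage sketch (DefaultDevice and the byte counts are arbitrary assumptions):
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   void* a = scratch.allocate(1024);  // first block: fresh allocations
//   void* b = scratch.allocate(2048);
//   scratch.reset();                   // next block: reuse in the same order
//   void* c = scratch.allocate(1024);  // returns the buffer behind `a`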
// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented should be used to define the TensorBlock typedef
// in TensorEvaluators that do not support block evaluation.
class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

// XprScalar extracts the Scalar type from an Eigen expression, or yields
// `void` when the expression type itself is void (no block access).
template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};
// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds that can be produced by
// the block evaluation machinery.
enum TensorBlockKind {
  // Lazy block expression that must be assigned to a destination.
  kExpr,

  // Block that is a view into a memory buffer owned by the underlying
  // expression (e.g. a view into a Tensor buffer).
  kView,

  // Block materialized in a scratch buffer allocated with the
  // TensorBlockScratchAllocator; it must still be copied to a destination.
  kMaterializedInScratch,

  // Block materialized directly into the final output memory buffer.
  kMaterializedInOutput
};

// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. It does not own its memory
// buffer: the buffer either backs the original expression or was allocated
// with the scratch allocator.
template <typename Scalar, int NumDims, int Layout,
          typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
                          const Dimensions& dimensions, bool valid_expr = true)
      : m_kind(kind),
        m_data(data),
        m_dimensions(dimensions),
        m_expr(m_data, m_dimensions),
        m_valid_expr(valid_expr) {}

  TensorBlockKind kind() const { return m_kind; }

  const XprType& expr() const {
    eigen_assert(m_valid_expr);
    return m_expr;
  }
  const Scalar* data() const { return m_data; }
  void cleanup() {}

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;

  // TensorMaterializedBlock can be backed by different types of storage:
  //
  //   (1) Contiguous block of memory allocated with the scratch allocator.
  //   (2) Contiguous block of memory reused from the block descriptor
  //       destination buffer.
  //   (3) Strided block of memory reused from the block descriptor
  //       destination buffer.
  //
  class Storage {
   public:
    Scalar* data() const { return m_data; }
    const Dimensions& dimensions() const { return m_dimensions; }
    const Dimensions& strides() const { return m_strides; }

    TensorMaterializedBlock AsTensorMaterializedBlock() const {
      return TensorMaterializedBlock(
          m_materialized_in_output
              ? internal::TensorBlockKind::kMaterializedInOutput
              : internal::TensorBlockKind::kMaterializedInScratch,
          m_data, m_dimensions, !m_strided_storage);
    }

   private:
    friend class TensorMaterializedBlock<Scalar, NumDims, Layout, IndexType>;

    Storage(Scalar* data, const Dimensions& dimensions,
            const Dimensions& strides, bool materialized_in_output,
            bool strided_storage)
        : m_data(data),
          m_dimensions(dimensions),
          m_strides(strides),
          m_materialized_in_output(materialized_in_output),
          m_strided_storage(strided_storage) {}

    Scalar* m_data;
    Dimensions m_dimensions;
    Dimensions m_strides;
    bool m_materialized_in_output;
    bool m_strided_storage;
  };

  // Creates storage for a materialized block, either from the block descriptor
  // destination buffer or from a new scratch allocation.
  template <typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static Storage prepareStorage(
      TensorBlockDesc& desc, TensorBlockScratch& scratch,
      bool allow_strided_storage = false) {
    // Try to reuse the destination as an output block buffer.
    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;

    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
      Scalar* buffer = desc.destination().template data<Scalar>();
      desc.DropDestinationBuffer();
      return Storage(buffer, desc.dimensions(),
                     internal::strides<Layout>(desc.dimensions()),
                     /*materialized_in_output=*/true,
                     /*strided_storage=*/false);

    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
               allow_strided_storage) {
      Scalar* buffer = desc.destination().template data<Scalar>();
      desc.DropDestinationBuffer();
      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
                     /*materialized_in_output=*/true,
                     /*strided_storage=*/true);

    } else {
      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
                     internal::strides<Layout>(desc.dimensions()),
                     /*materialized_in_output=*/false,
                     /*strided_storage=*/false);
    }
  }
  // Creates a materialized block for the given descriptor from a memory
  // buffer.
  template <typename DataDimensions, typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
      const Scalar* data, const DataDimensions& data_dims,
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());

    // If the block covers a contiguous region of the underlying memory, we can
    // skip the block buffer allocation and construct a view into the existing
    // `data` buffer.
    static const bool is_col_major = Layout == ColMajor;

    // Find out how many inner dimensions have a matching size.
    int num_matching_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (data_dims[dim] != desc.dimensions()[dim]) break;
      ++num_matching_inner_dims;
    }

    // All the outer dimensions must be of size `1`, except a single dimension
    // right before the matching inner dimensions.
    bool can_use_direct_access = true;
    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (desc.dimension(dim) != 1) {
        can_use_direct_access = false;
        break;
      }
    }

    if (can_use_direct_access) {
      const Scalar* block_start = data + desc.offset();
      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
                                     block_start, desc.dimensions());

    } else {
      // Reuse the destination buffer or allocate a new buffer with the scratch
      // allocator.
      const Storage storage = prepareStorage(desc, scratch);

      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
          TensorBlockIO;
      typedef typename TensorBlockIO::Dst TensorBlockIODst;
      typedef typename TensorBlockIO::Src TensorBlockIOSrc;

      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
                           data, desc.offset());
      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
                           storage.data());

      TensorBlockIO::Copy(dst, src);
      return storage.AsTensorMaterializedBlock();
    }
  }

 private:
  TensorBlockKind m_kind;
  const Scalar* m_data;
  Dimensions m_dimensions;
  XprType m_expr;
  bool m_valid_expr;
};
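// Usage sketch: materialize a block from a dense buffer (the 32x32 ColMajor
// layout, the [32, 8] block and the float scalar type are arbitrary
// assumptions; `src_buffer` is a hypothetical pointer to 32*32 floats):
//
//   DSizes<Eigen::Index, 2> data_dims(32, 32);
//   TensorBlockDescriptor<2> desc(/*offset=*/0, DSizes<Eigen::Index, 2>(32, 8));
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   typedef TensorMaterializedBlock<float, 2, ColMajor> TensorBlock;
//   TensorBlock block =
//       TensorBlock::materialize(src_buffer, data_dims, desc, scratch);
//   // Here block.kind() == kView: a [32, 8] ColMajor block at offset 0 is
//   // already contiguous in the source buffer, so no copy is required.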
// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies the
// UnaryOp functor to the block produced by the underlying expression.
template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
  static constexpr bool NoArgBlockAccess =
      internal::is_void<typename ArgTensorBlock::XprType>::value;

 public:
  typedef std::conditional_t<
      NoArgBlockAccess, void,
      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
      : m_arg_block(arg_block), m_functor(functor) {}

  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  UnaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies the
// BinaryOp functor to the blocks produced by the two underlying expressions.
template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
  static constexpr bool NoArgBlockAccess =
      internal::is_void<typename LhsTensorBlock::XprType>::value ||
      internal::is_void<typename RhsTensorBlock::XprType>::value;

 public:
  typedef std::conditional_t<
      NoArgBlockAccess, void,
      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
                          const typename RhsTensorBlock::XprType> >
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
                         const RhsTensorBlock& right_block,
                         const BinaryOp& functor)
      : m_left_block(left_block),
        m_right_block(right_block),
        m_functor(functor) {}

  XprType expr() const {
    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
  }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_left_block.cleanup();
    m_right_block.cleanup();
  }

 private:
  LhsTensorBlock m_left_block;
  RhsTensorBlock m_right_block;
  BinaryOp m_functor;
};
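// Usage sketch: block evaluators for a binary expression such as `lhs + rhs`
// typically wrap the blocks produced by their argument evaluators. Here
// `LhsBlock`, `RhsBlock`, `lhs_block` and `rhs_block` are hypothetical:
//
//   typedef TensorCwiseBinaryBlock<internal::scalar_sum_op<float>, LhsBlock,
//                                  RhsBlock> SumBlock;
//   SumBlock block(lhs_block, rhs_block, internal::scalar_sum_op<float>());
//   // block.expr() is a lazy TensorCwiseBinaryOp over the argument block
//   // expressions; block.cleanup() must be called once the block is consumed.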
// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor expression block that can construct an
// arbitrary tensor expression from a block of the underlying type.
template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
  typedef typename ArgTensorBlock::XprType ArgXprType;
  static constexpr bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;

 public:
  typedef std::conditional_t<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<ArgXprType>::type> XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
                       const BlockFactory& factory)
      : m_arg_block(arg_block), m_factory(factory) {}

  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying types.
template <typename BlockFactory, typename Arg1TensorBlock,
          typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
  typedef typename Arg1TensorBlock::XprType Arg1XprType;
  typedef typename Arg2TensorBlock::XprType Arg2XprType;
  typedef typename Arg3TensorBlock::XprType Arg3XprType;

  static constexpr bool NoArgBlockAccess =
      internal::is_void<Arg1XprType>::value ||
      internal::is_void<Arg2XprType>::value ||
      internal::is_void<Arg3XprType>::value;

 public:
  typedef std::conditional_t<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
                                              Arg3XprType>::type> XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
                         const Arg2TensorBlock& arg2_block,
                         const Arg3TensorBlock& arg3_block,
                         const BlockFactory& factory)
      : m_arg1_block(arg1_block),
        m_arg2_block(arg2_block),
        m_arg3_block(arg3_block),
        m_factory(factory) {}

  XprType expr() const {
    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
                          m_arg3_block.expr());
  }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_arg1_block.cleanup();
    m_arg2_block.cleanup();
    m_arg3_block.cleanup();
  }

 private:
  Arg1TensorBlock m_arg1_block;
  Arg2TensorBlock m_arg2_block;
  Arg3TensorBlock m_arg3_block;
  BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for scatter and gather
// access patterns.
template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename unpacket_traits<Packet>::half HalfPacket;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size,
    HalfPacketSize = unpacket_traits<HalfPacket>::size,
    HasHalfPacket =
        static_cast<int>(HalfPacketSize) < static_cast<int>(PacketSize)
  };

 public:
  // The kind of copy, selected from the source and destination strides.
  enum class Kind {
    Linear = 0,       // src_stride == 1 && dst_stride == 1
    Scatter = 1,      // src_stride == 1 && dst_stride != 1
    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
    Gather = 4,       // dst_stride == 1 && src_stride != 1
    Random = 5        // everything else
  };

  struct Dst {
    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    Scalar* data;
  };

  struct Src {
    Src(IndexType o, IndexType s, const Scalar* d)
        : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    const Scalar* data;
  };

  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_ALWAYS_INLINE void Run(const Dst& dst, const Src& src,
                                      const size_t count) {
    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
              src.data);
  }

 private:
  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_ALWAYS_INLINE void Run(
      const IndexType count, const IndexType dst_offset,
      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const IndexType src_offset, const IndexType src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_offset];
    Scalar* dst = &dst_data[dst_offset];

    if (!Vectorizable) {
      for (Index i = 0; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    const IndexType vectorized_size = count - PacketSize;
    IndexType i = 0;

    if (kind == StridedLinearBufferCopy::Kind::Linear) {
      // Linear copy from `src` to `dst`.
      const IndexType unrolled_size = count - 4 * PacketSize;
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          Packet p = ploadu<Packet>(src + i + j * PacketSize);
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      if (HasHalfPacket) {
        const IndexType vectorized_half_size = count - HalfPacketSize;
        if (i <= vectorized_half_size) {
          HalfPacket p = ploadu<HalfPacket>(src + i);
          pstoreu<Scalar, HalfPacket>(dst + i, p);
          i += HalfPacketSize;
        }
      }
      for (; i < count; ++i) {
        dst[i] = src[i];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
      // Scatter from `src` to `dst`.
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      if (HasHalfPacket) {
        const IndexType vectorized_half_size = count - HalfPacketSize;
        if (i <= vectorized_half_size) {
          HalfPacket p = ploadu<HalfPacket>(src + i);
          pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);
          i += HalfPacketSize;
        }
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
      // Fill `dst` with the value at `*src`.
      const IndexType unrolled_size = count - 4 * PacketSize;
      const Scalar s = *src;
      Packet p = pset1<Packet>(s);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      if (HasHalfPacket) {
        const IndexType vectorized_half_size = count - HalfPacketSize;
        if (i <= vectorized_half_size) {
          HalfPacket hp = pset1<HalfPacket>(s);
          pstoreu<Scalar, HalfPacket>(dst + i, hp);
          i += HalfPacketSize;
        }
      }
      for (; i < count; ++i) {
        dst[i] = s;
      }

    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
      // Scatter the value at `*src` into `dst`.
      const Scalar s = *src;
      Packet p = pset1<Packet>(s);
      for (; i <= vectorized_size; i += PacketSize) {
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      if (HasHalfPacket) {
        const IndexType vectorized_half_size = count - HalfPacketSize;
        if (i <= vectorized_half_size) {
          HalfPacket hp = pset1<HalfPacket>(s);
          pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);
          i += HalfPacketSize;
        }
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = s;
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
      // Gather from `src` into `dst`.
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      if (HasHalfPacket) {
        const IndexType vectorized_half_size = count - HalfPacketSize;
        if (i <= vectorized_half_size) {
          HalfPacket p =
              pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);
          pstoreu<Scalar, HalfPacket>(dst + i, p);
          i += HalfPacketSize;
        }
      }
      for (; i < count; ++i) {
        dst[i] = src[i * src_stride];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
      // Random access pattern: fall back to a scalar loop.
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
    } else {
      eigen_assert(false);
    }
  }
};
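// Usage sketch: copy 64 contiguous floats (pointers are hypothetical; the Kind
// tag must match the source and destination strides):
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> LinCopy;
//   LinCopy::Run<LinCopy::Kind::Linear>(
//       LinCopy::Dst(/*offset=*/0, /*stride=*/1, dst_ptr),
//       LinCopy::Src(/*offset=*/0, /*stride=*/1, src_ptr),
//       /*count=*/64);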
// -------------------------------------------------------------------------- //
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It is possible to specify a dst->src dimension mapping for the copy.
// The dimensions of `dst` specify how many elements have to be copied; for
// `src` we only need the strides to navigate through the source memory.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO {
  static constexpr bool IsColMajor = (Layout == ColMajor);

  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef DSizes<int, NumDims> DimensionsMap;

  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
        IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  struct Src {
    Src(const Dimensions& src_strides, const Scalar* src,
        IndexType src_offset = 0)
        : strides(src_strides), data(src), offset(src_offset) {}

    Dimensions strides;
    const Scalar* data;
    IndexType offset;
  };

  // Copies data to `dst` from `src`, using the provided dimension mapping:
  //
  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
  //
  // Returns the number of copied elements.
  static EIGEN_ALWAYS_INLINE IndexType Copy(
      const Dst& dst, const Src& src,
      const DimensionsMap& dst_to_src_dim_map) {
    // Copy a single scalar value from `src` to `dst`.
    if (NumDims == 0) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    // Both `dst` and `src` must have a contiguous innermost dimension. We also
    // accept the special case of stride '0', because it is used as a trick to
    // implement broadcasting.
    {
      int inner_dim = IsColMajor ? 0 : NumDims - 1;
      EIGEN_UNUSED_VARIABLE(inner_dim);
      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
    }

    // Give a shorter name to `dst_to_src_dim_map`.
    const DimensionsMap& dim_map = dst_to_src_dim_map;

    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);

    // We write linearly into the innermost (contiguous) dimension of the dst
    // block, reading from the src; if dimensions are reordered, reads from the
    // src may have `stride != 1`.

    // Find the innermost dimension in the dst whose size is not 1. This is the
    // effective inner dimension.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      if (dst.dims[dst_dim] != 1) break;
      num_size_one_inner_dims++;
    }

    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
    if (num_size_one_inner_dims == NumDims) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    // Outermost dimension in the dst with `stride == 1` (modulo the effective
    // inner dimension).
    const int dst_stride1_dim = IsColMajor
                                    ? num_size_one_inner_dims
                                    : NumDims - num_size_one_inner_dims - 1;

    // Dimension in the src that corresponds to the dst innermost dimension.
    const int src_dim_for_dst_stride1_dim =
        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];

    // Size of the innermost dimension (length of contiguous blocks of memory).
    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];

    // Squeeze multiple inner dims into one if they are contiguous in the `dst`
    // and the `src` memory, so we can do fewer linear copy calls.
    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dst_dim];
      const IndexType src_stride = src.strides[dim_map[dst_dim]];
      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
        dst_inner_dim_size *= dst.dims[dst_dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    // Setup strides to read data from `src` and write to `dst`.
    IndexType input_offset = src.offset;
    IndexType output_offset = dst.offset;
    IndexType input_stride =
        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> it;

    // Initialize the block iterator state. Squeeze away any dimension of size 1.
    int idx = 0;  // currently initialized iterator state index
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
      if (dst.dims[dst_dim] == 1) continue;

      it[idx].size = dst.dims[dst_dim];
      it[idx].input_stride = src.strides[dim_map[dst_dim]];
      it[idx].output_stride = dst.strides[dst_dim];

      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

      idx++;
    }

    // Iterate copying data from `src` to `dst`.
    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();

#define COPY_INNER_DIM(KIND)                                           \
  IndexType num_copied = 0;                                            \
  for (num_copied = 0; num_copied < block_total_size;                  \
       num_copied += dst_inner_dim_size) {                             \
    LinCopy::template Run<KIND>(                                       \
        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
        typename LinCopy::Src(input_offset, input_stride, src.data),   \
        dst_inner_dim_size);                                           \
                                                                       \
    for (int j = 0; j < idx; ++j) {                                    \
      if (++it[j].count < it[j].size) {                                \
        input_offset += it[j].input_stride;                            \
        output_offset += it[j].output_stride;                          \
        break;                                                         \
      }                                                                \
      it[j].count = 0;                                                 \
      input_offset -= it[j].input_span;                                \
      output_offset -= it[j].output_span;                              \
    }                                                                  \
  }                                                                    \
  return num_copied;

    if (input_stride == 1 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::Linear);
    } else if (input_stride == 1 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Kind::Scatter);
    } else if (input_stride == 0 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::FillLinear);
    } else if (input_stride == 0 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Kind::FillScatter);
    } else if (output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::Gather);
    } else {
      COPY_INNER_DIM(LinCopy::Kind::Random);
    }

#undef COPY_INNER_DIM
  }

  // Copy from `src` to `dst` with an identity dst->src dimension map. Returns
  // the number of copied elements.
  static EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, const Src& src) {
    DimensionsMap dst_to_src_map;
    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
    return Copy(dst, src, dst_to_src_map);
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : size(0),
          count(0),
          input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0) {}

    IndexType size;
    IndexType count;
    IndexType input_stride;
    IndexType output_stride;
    IndexType input_span;
    IndexType output_span;
  };

  // Compute how many inner dimensions may be squeezed when doing IO between
  // two tensor blocks. It is safe to squeeze inner dimensions only if they are
  // not reordered.
  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
    int num_squeezable_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      if (dim_map[dim] != dim) break;
      num_squeezable_dims++;
    }
    return num_squeezable_dims;
  }
};
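// Usage sketch: copy an 8x4 ColMajor block from a transposed 4x8 source buffer
// (pointers are hypothetical; both buffers hold 32 floats):
//
//   typedef TensorBlockIO<float, Eigen::Index, 2, ColMajor> BlockIO;
//   BlockIO::Dimensions dst_dims(8, 4), dst_strides(1, 8);
//   BlockIO::Dimensions src_strides(1, 4);  // source stored as 4x8
//   BlockIO::DimensionsMap dim_map;         // src_dim = dim_map[dst_dim]
//   dim_map[0] = 1;
//   dim_map[1] = 0;
//   BlockIO::Copy(BlockIO::Dst(dst_dims, dst_strides, dst_ptr),
//                 BlockIO::Src(src_strides, src_ptr), dim_map);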
// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr`
// to a tensor block backed by the memory buffer described by `target`. The
// innermost dimension of `target` must have stride '1' (be contiguous in
// memory); reordered writes must go through TensorBlockIO instead.
template <typename Scalar, int NumDims, typename TensorBlockExpr,
          typename IndexType = Eigen::Index>
class TensorBlockAssignment {
  // We will use the coeff/packet path to evaluate block expressions.
  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
      TensorBlockEvaluator;

  typedef DSizes<IndexType, NumDims> Dimensions;

  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

  template <bool Vectorizable, typename Evaluator>
  struct InnerDimAssign {
    static EIGEN_ALWAYS_INLINE void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      for (IndexType i = 0; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

  template <typename Evaluator>
  struct InnerDimAssign<true, Evaluator> {
    static EIGEN_ALWAYS_INLINE void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      typedef typename packet_traits<Scalar>::type Packet;

      const IndexType unrolled_size = count - 4 * PacketSize;
      const IndexType vectorized_size = count - PacketSize;
      IndexType i = 0;

      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          const IndexType idx = eval_offset + i + j * PacketSize;
          Packet p = eval.template packet<Unaligned>(idx);
          pstoreu<Scalar>(target + i + j * PacketSize, p);
        }
      }

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = eval.template packet<Unaligned>(eval_offset + i);
        pstoreu<Scalar>(target + i, p);
      }

      for (; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

 public:
  struct Target {
    Target(const Dimensions& target_dims, const Dimensions& target_strides,
           Scalar* target_data, IndexType target_offset = 0)
        : dims(target_dims),
          strides(target_strides),
          data(target_data),
          offset(target_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  static Target target(const Dimensions& target_dims,
                       const Dimensions& target_strides, Scalar* target_data,
                       IndexType target_offset = 0) {
    return Target(target_dims, target_strides, target_data, target_offset);
  }

  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
  static Target target(
      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
      Scalar* target_data, IndexType target_offset = 0) {
    // The DSizes constructor will do index type promotion if it is safe.
    return Target(Dimensions(target_dims), Dimensions(target_strides),
                  target_data, target_offset);
  }

  static EIGEN_STRONG_INLINE void Run(const Target& target,
                                      const TensorBlockExpr& expr) {
    // Prepare an evaluator for the block expression.
    DefaultDevice default_device;
    TensorBlockEvaluator eval(expr, default_device);

    // Tensor block expression dimensions should match the destination
    // dimensions.
    eigen_assert(dimensions_match(target.dims, eval.dimensions()));

    static constexpr int Layout = TensorBlockEvaluator::Layout;
    static const bool is_col_major = Layout == ColMajor;

    // Initialize the output inner dimension size based on the layout.
    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
    IndexType output_inner_dim_size = target.dims[inner_dim_idx];

    // The target inner dimension stride must be '1'.
    eigen_assert(target.strides[inner_dim_idx] == 1);

    // Squeeze multiple inner dims into one if they are contiguous in `target`.
    IndexType num_squeezed_dims = 0;
    for (Index i = 1; i < NumDims; ++i) {
      const Index dim = is_col_major ? i : NumDims - i - 1;
      const IndexType target_stride = target.strides[dim];

      if (output_inner_dim_size == target_stride) {
        output_inner_dim_size *= target.dims[dim];
        num_squeezed_dims++;
      } else {
        break;
      }
    }

    // Initialize the output block iterator state. Dimensions in this array are
    // always in inner_most -> outer_most order (col major layout).
    array<BlockIteratorState, NumDims> it;

    int idx = 0;  // currently initialized iterator state index
    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;

      it[idx].count = 0;
      it[idx].size = target.dims[dim];
      it[idx].output_stride = target.strides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    // We read the block expression from the beginning, and start writing data
    // to `target` at the given offset.
    IndexType input_offset = 0;
    IndexType output_offset = target.offset;

    // Iterate assigning data from `eval` to `target`.
    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
      // Assign to `target` at the current offset.
      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
                     TensorBlockEvaluator>::Run(target.data + output_offset,
                                                output_inner_dim_size, eval,
                                                input_offset);

      // Move the input offset forward by the number of assigned coefficients.
      input_offset += output_inner_dim_size;

      // Update the index.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0), size(0), output_stride(0), output_span(0) {}

    IndexType count;
    IndexType size;
    IndexType output_stride;
    IndexType output_span;
  };
};
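// Usage sketch: evaluate a block expression into a raw output buffer (the 8x8
// shape is arbitrary; `src_ptr` and `dst_ptr` are hypothetical pointers to
// 64 floats each):
//
//   TensorMap<Tensor<float, 2> > src_map(src_ptr, 8, 8);
//   auto expr = src_map + src_map;  // any block-evaluable expression
//
//   typedef TensorBlockAssignment<float, 2, decltype(expr)> BlockAssign;
//   BlockAssign::Run(
//       BlockAssign::target(DSizes<Eigen::Index, 2>(8, 8),  // target dims
//                           DSizes<Eigen::Index, 2>(1, 8),  // target strides
//                           dst_ptr),
//       expr);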