eigen/InverseSize4_8h_source.html

 // This file is part of Eigen, a lightweight C++ template library

 // for linear algebra.

 //

 // Copyright (C) 2001 Intel Corporation

 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>

 // Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>

 //

 // This Source Code Form is subject to the terms of the Mozilla

 // Public License v. 2.0. If a copy of the MPL was not distributed

 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

 //

 // The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath.

 // inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,

 // adjugate of M and determinant of M respectively. M# is computed block-wise

 // using specific formulae. For proof, see:

 // https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html

 // Variable names are adopted from \src\LU\Inverse_SSE.h.

 //

 // The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h

 // comes from the following Intel's library:

 // http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/

 //

 // Here is the respective copyright and license statement:

 //

 //   Copyright (c) 2001 Intel Corporation.

 //

 // Permition is granted to use, copy, distribute and prepare derivative works

 // of this library for any purpose and without fee, provided, that the above

 // copyright notice and this statement appear in all copies.

 // Intel makes no representations about the suitability of this software for

 // any purpose, and specifically disclaims all warranties.

 // See LEGAL.TXT for all the legal information.

 //

 // TODO: Unify implementations of different data types (i.e. float and double).

 #ifndef EIGEN_INVERSE_SIZE_4_H

 #define EIGEN_INVERSE_SIZE_4_H


 #include "../InternalHeaderCheck.h"


 #if EIGEN_COMP_GNUC_STRICT

 // These routines requires bit manipulation of the sign, which is not compatible

 // with fastmath.

 #pragma GCC push_options

 #pragma GCC optimize ("no-fast-math")

 #endif


 namespace Eigen

 {

 namespace internal

 {

 template <typename MatrixType, typename ResultType>

 struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>

 {

   enum

   {

     MatrixAlignment = traits<MatrixType>::Alignment,

     ResultAlignment = traits<ResultType>::Alignment,

     StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)

   };

   typedef std::conditional_t<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject> ActualMatrixType;


   static void run(const MatrixType &mat, ResultType &result)

   {

     ActualMatrixType matrix(mat);


     const float* data = matrix.data();

     const Index stride = matrix.innerStride();

     Packet4f L1 = ploadt<Packet4f,MatrixAlignment>(data);

     Packet4f L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4);

     Packet4f L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8);

     Packet4f L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12);


     // Four 2x2 sub-matrices of the input matrix

     // input = [[A, B],

     //          [C, D]]

     Packet4f A, B, C, D;


     if (!StorageOrdersMatch)

     {

       A = vec4f_unpacklo(L1, L2);

       B = vec4f_unpacklo(L3, L4);

       C = vec4f_unpackhi(L1, L2);

       D = vec4f_unpackhi(L3, L4);

     }

     else

     {

       A = vec4f_movelh(L1, L2);

       B = vec4f_movehl(L2, L1);

       C = vec4f_movelh(L3, L4);

       D = vec4f_movehl(L4, L3);

     }


     Packet4f AB, DC;


     // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.

     AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);

     AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));


     // DC = D#*C

     DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);

     DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));


     // determinants of the sub-matrices

     Packet4f dA, dB, dC, dD;


     dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);

     dA = psub(dA, vec4f_movehl(dA, dA));


     dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);

     dB = psub(dB, vec4f_movehl(dB, dB));


     dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);

     dC = psub(dC, vec4f_movehl(dC, dC));


     dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);

     dD = psub(dD, vec4f_movehl(dD, dD));


     Packet4f d, d1, d2;


     d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);

     d = padd(d, vec4f_movehl(d, d));

     d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));

     d1 = pmul(dA, dD);

     d2 = pmul(dB, dC);


     // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)

     Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);


     // reciprocal of the determinant of the input matrix, rd = 1/det

     Packet4f rd = preciprocal(det);


     // Four sub-matrices of the inverse

     Packet4f iA, iB, iC, iD;


     // iD = D*|A| - C*A#*B

     iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));

     iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));

     iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);


     // iA = A*|D| - B*D#*C

     iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));

     iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));

     iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);


     // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A

     iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));

     iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));

     iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);


     // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D

     iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));

     iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));

     iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);


     EIGEN_ALIGN_MAX const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f};

     const Packet4f p4f_sign_PNNP = pload<Packet4f>(sign_mask);

     rd = pxor(rd, p4f_sign_PNNP);

     iA = pmul(iA, rd);

     iB = pmul(iB, rd);

     iC = pmul(iC, rd);

     iD = pmul(iD, rd);


     Index res_stride = result.outerStride();

     float *res = result.data();


     pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));

     pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));

     pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));

     pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));

   }

 };


 #if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))

 // same algorithm as above, except that each operand is split into

 // halves for two registers to hold.

 template <typename MatrixType, typename ResultType>

 struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>

 {

   enum

   {

     MatrixAlignment = traits<MatrixType>::Alignment,

     ResultAlignment = traits<ResultType>::Alignment,

     StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)

   };

   typedef std::conditional_t<(MatrixType::Flags & LinearAccessBit),

                          MatrixType const &,

                          typename MatrixType::PlainObject>

       ActualMatrixType;


   static void run(const MatrixType &mat, ResultType &result)

   {

     ActualMatrixType matrix(mat);


     // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower

     // row e.g. A1, upper row of A, A2, lower row of A

     // input = [[A, B],  =  [[[A1, [B1,

     //          [C, D]]        A2], B2]],

     //                       [[C1, [D1,

     //                         C2], D2]]]


     Packet2d A1, A2, B1, B2, C1, C2, D1, D2;


     const double* data = matrix.data();

     const Index stride = matrix.innerStride();

     if (StorageOrdersMatch)

     {

       A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);

       B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);

       A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);

       B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);

       C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);

       D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);

       C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);

       D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);

     }

     else

     {

       Packet2d temp;

       A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);

       C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);

       A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);

       C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);

       temp = A1;

       A1 = vec2d_unpacklo(A1, A2);

       A2 = vec2d_unpackhi(temp, A2);


       temp = C1;

       C1 = vec2d_unpacklo(C1, C2);

       C2 = vec2d_unpackhi(temp, C2);


       B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);

       D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);

       B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);

       D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);


       temp = B1;

       B1 = vec2d_unpacklo(B1, B2);

       B2 = vec2d_unpackhi(temp, B2);


       temp = D1;

       D1 = vec2d_unpacklo(D1, D2);

       D2 = vec2d_unpackhi(temp, D2);

     }


     // determinants of the sub-matrices

     Packet2d dA, dB, dC, dD;


     dA = vec2d_swizzle2(A2, A2, 1);

     dA = pmul(A1, dA);

     dA = psub(dA, vec2d_duplane(dA, 1));


     dB = vec2d_swizzle2(B2, B2, 1);

     dB = pmul(B1, dB);

     dB = psub(dB, vec2d_duplane(dB, 1));


     dC = vec2d_swizzle2(C2, C2, 1);

     dC = pmul(C1, dC);

     dC = psub(dC, vec2d_duplane(dC, 1));


     dD = vec2d_swizzle2(D2, D2, 1);

     dD = pmul(D1, dD);

     dD = psub(dD, vec2d_duplane(dD, 1));


     Packet2d DC1, DC2, AB1, AB2;


     // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.

     AB1 = pmul(B1, vec2d_duplane(A2, 1));

     AB2 = pmul(B2, vec2d_duplane(A1, 0));

     AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));

     AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));


     // DC = D#*C

     DC1 = pmul(C1, vec2d_duplane(D2, 1));

     DC2 = pmul(C2, vec2d_duplane(D1, 0));

     DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));

     DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));


     Packet2d d1, d2;


     // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)

     Packet2d det;


     // reciprocal of the determinant of the input matrix, rd = 1/det

     Packet2d rd;


     d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));

     d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));

     rd = padd(d1, d2);

     rd = padd(rd, vec2d_duplane(rd, 1));


     d1 = pmul(dA, dD);

     d2 = pmul(dB, dC);


     det = padd(d1, d2);

     det = psub(det, rd);

     det = vec2d_duplane(det, 0);

     rd = pdiv(pset1<Packet2d>(1.0), det);


     // rows of four sub-matrices of the inverse

     Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;


     // iD = D*|A| - C*A#*B

     iD1 = pmul(AB1, vec2d_duplane(C1, 0));

     iD2 = pmul(AB1, vec2d_duplane(C2, 0));

     iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));

     iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));

     dA = vec2d_duplane(dA, 0);

     iD1 = psub(pmul(D1, dA), iD1);

     iD2 = psub(pmul(D2, dA), iD2);


     // iA = A*|D| - B*D#*C

     iA1 = pmul(DC1, vec2d_duplane(B1, 0));

     iA2 = pmul(DC1, vec2d_duplane(B2, 0));

     iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));

     iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));

     dD = vec2d_duplane(dD, 0);

     iA1 = psub(pmul(A1, dD), iA1);

     iA2 = psub(pmul(A2, dD), iA2);


     // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A

     iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));

     iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));

     iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));

     iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));

     dB = vec2d_duplane(dB, 0);

     iB1 = psub(pmul(C1, dB), iB1);

     iB2 = psub(pmul(C2, dB), iB2);


     // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D

     iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));

     iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));

     iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));

     iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));

     dC = vec2d_duplane(dC, 0);

     iC1 = psub(pmul(B1, dC), iC1);

     iC2 = psub(pmul(B2, dC), iC2);


     EIGEN_ALIGN_MAX const double sign_mask1[2] = {0.0, -0.0};

     EIGEN_ALIGN_MAX const double sign_mask2[2] = {-0.0, 0.0};

     const Packet2d sign_PN = pload<Packet2d>(sign_mask1);

     const Packet2d sign_NP = pload<Packet2d>(sign_mask2);

     d1 = pxor(rd, sign_PN);

     d2 = pxor(rd, sign_NP);


     Index res_stride = result.outerStride();

     double *res = result.data();

     pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));

     pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));

     pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));

     pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));

     pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));

     pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));

     pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));

     pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));

   }

 };

 #endif

 } // namespace internal

 } // namespace Eigen


 #if EIGEN_COMP_GNUC_STRICT

 #pragma GCC pop_options

 #endif


 #endif

A
MatrixXcf A
Definition: ComplexEigenSolver_compute.cpp:1

EIGEN_ALIGN_MAX
#define EIGEN_ALIGN_MAX
Definition: ConfigureVectorization.h:137

D
MatrixXcd D
Definition: EigenSolver_EigenSolver_MatrixType.cpp:14

B
MatrixXf B
Definition: GeneralizedEigenSolver.cpp:3

data
int data[]
Definition: Map_placement_new.cpp:1

vec4f_duplane
#define vec4f_duplane(a, p)
Definition: NEON/PacketMath.h:152

res
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition: PartialRedux_count.cpp:3

vec2d_duplane
#define vec2d_duplane(a, p)
Definition: SSE/PacketMath.h:120

vec2d_swizzle2
#define vec2d_swizzle2(a, b, mask)
Definition: SSE/PacketMath.h:109

mat
MatrixXf mat
Definition: Tutorial_AdvancedInitialization_CommaTemporary.cpp:1

MatrixType
Matrix< float, 1, Dynamic > MatrixType
Definition: Tutorial_Map_using.cpp:1

Eigen::Matrix::PlainObject
Base::PlainObject PlainObject
Definition: Matrix.h:194

Eigen::LinearAccessBit
const unsigned int LinearAccessBit
Definition: Constants.h:132

Eigen::RowMajorBit
const unsigned int RowMajorBit
Definition: Constants.h:68

Eigen::Architecture::Target
@ Target
Definition: Constants.h:496

Eigen::internal::Packet2d
v2f64 Packet2d
Definition: MSA/PacketMath.h:820

Eigen::internal::padd
Packet padd(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:308

Eigen::internal::vec4f_unpackhi
Packet4f vec4f_unpackhi(const Packet4f &a, const Packet4f &b)
Definition: NEON/PacketMath.h:148

Eigen::internal::vec4f_unpacklo
Packet4f vec4f_unpacklo(const Packet4f &a, const Packet4f &b)
Definition: NEON/PacketMath.h:144

Eigen::internal::pmul
Packet pmul(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:339

Eigen::internal::vec2d_unpackhi
Packet2d vec2d_unpackhi(const Packet2d &a, const Packet2d &b)
Definition: SSE/PacketMath.h:116

Eigen::internal::psub
Packet psub(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:324

Eigen::internal::vec4f_swizzle2
Packet4f vec4f_swizzle2(const Packet4f &a, const Packet4f &b, int p, int q, int r, int s)
Definition: NEON/PacketMath.h:132

Eigen::internal::pset1< Packet2d >
Packet2d pset1< Packet2d >(const double &from)
Definition: MSA/PacketMath.h:871

Eigen::internal::pxor
Packet8h pxor(const Packet8h &a, const Packet8h &b)
Definition: AVX/PacketMath.h:1880

Eigen::internal::pdiv
Packet pdiv(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:346

Eigen::internal::preciprocal
Packet preciprocal(const Packet &a)
Definition: GenericPacketMath.h:1284

Eigen::internal::vec2d_unpacklo
Packet2d vec2d_unpacklo(const Packet2d &a, const Packet2d &b)
Definition: SSE/PacketMath.h:112

Eigen::internal::pload< Packet2d >
Packet2d pload< Packet2d >(const double *from)
Definition: MSA/PacketMath.h:966

Eigen::internal::pload< Packet4f >
Packet4f pload< Packet4f >(const float *from)
Definition: AltiVec/PacketMath.h:461

Eigen::internal::Packet4f
__vector float Packet4f
Definition: AltiVec/PacketMath.h:32

Eigen::internal::vec4f_movelh
Packet4f vec4f_movelh(const Packet4f &a, const Packet4f &b)
Definition: NEON/PacketMath.h:136

Eigen::internal::vec4f_movehl
Packet4f vec4f_movehl(const Packet4f &a, const Packet4f &b)
Definition: NEON/PacketMath.h:140

Eigen
: InteropHeaders
Definition: Core:139

Eigen::Index
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82

internal
Definition: Eigen_Colamd.h:50