InverseSize4.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2001 Intel Corporation
5 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
6 // Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
7 //
8 // This Source Code Form is subject to the terms of the Mozilla
9 // Public License v. 2.0. If a copy of the MPL was not distributed
10 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 //
12 // The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath.
13 // inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
14 // adjugate of M and determinant of M respectively. M# is computed block-wise
15 // using specific formulae. For proof, see:
16 // https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
17 // Variable names are adopted from \src\LU\Inverse_SSE.h.
18 //
19 // The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h
20 // comes from the following Intel's library:
21 // http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
22 //
23 // Here is the respective copyright and license statement:
24 //
25 // Copyright (c) 2001 Intel Corporation.
26 //
27 // Permition is granted to use, copy, distribute and prepare derivative works
28 // of this library for any purpose and without fee, provided, that the above
29 // copyright notice and this statement appear in all copies.
30 // Intel makes no representations about the suitability of this software for
31 // any purpose, and specifically disclaims all warranties.
32 // See LEGAL.TXT for all the legal information.
33 //
34 // TODO: Unify implementations of different data types (i.e. float and double).
35 #ifndef EIGEN_INVERSE_SIZE_4_H
36 #define EIGEN_INVERSE_SIZE_4_H
37 
38 #include "../InternalHeaderCheck.h"
39 
40 #if EIGEN_COMP_GNUC_STRICT
41 // These routines requires bit manipulation of the sign, which is not compatible
42 // with fastmath.
43 #pragma GCC push_options
44 #pragma GCC optimize ("no-fast-math")
45 #endif
46 
47 namespace Eigen
48 {
49 namespace internal
50 {
51 template <typename MatrixType, typename ResultType>
52 struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>
53 {
54  enum
55  {
56  MatrixAlignment = traits<MatrixType>::Alignment,
57  ResultAlignment = traits<ResultType>::Alignment,
58  StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
59  };
60  typedef std::conditional_t<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject> ActualMatrixType;
61 
62  static void run(const MatrixType &mat, ResultType &result)
63  {
64  ActualMatrixType matrix(mat);
65 
66  const float* data = matrix.data();
67  const Index stride = matrix.innerStride();
68  Packet4f L1 = ploadt<Packet4f,MatrixAlignment>(data);
69  Packet4f L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4);
70  Packet4f L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8);
71  Packet4f L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12);
72 
73  // Four 2x2 sub-matrices of the input matrix
74  // input = [[A, B],
75  // [C, D]]
76  Packet4f A, B, C, D;
77 
78  if (!StorageOrdersMatch)
79  {
80  A = vec4f_unpacklo(L1, L2);
81  B = vec4f_unpacklo(L3, L4);
82  C = vec4f_unpackhi(L1, L2);
83  D = vec4f_unpackhi(L3, L4);
84  }
85  else
86  {
87  A = vec4f_movelh(L1, L2);
88  B = vec4f_movehl(L2, L1);
89  C = vec4f_movelh(L3, L4);
90  D = vec4f_movehl(L4, L3);
91  }
92 
93  Packet4f AB, DC;
94 
95  // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
96  AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
97  AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
98 
99  // DC = D#*C
100  DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
101  DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
102 
103  // determinants of the sub-matrices
104  Packet4f dA, dB, dC, dD;
105 
106  dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
107  dA = psub(dA, vec4f_movehl(dA, dA));
108 
109  dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
110  dB = psub(dB, vec4f_movehl(dB, dB));
111 
112  dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
113  dC = psub(dC, vec4f_movehl(dC, dC));
114 
115  dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
116  dD = psub(dD, vec4f_movehl(dD, dD));
117 
118  Packet4f d, d1, d2;
119 
120  d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
121  d = padd(d, vec4f_movehl(d, d));
122  d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
123  d1 = pmul(dA, dD);
124  d2 = pmul(dB, dC);
125 
126  // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
127  Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
128 
129  // reciprocal of the determinant of the input matrix, rd = 1/det
130  Packet4f rd = preciprocal(det);
131 
132  // Four sub-matrices of the inverse
133  Packet4f iA, iB, iC, iD;
134 
135  // iD = D*|A| - C*A#*B
136  iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
137  iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
138  iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
139 
140  // iA = A*|D| - B*D#*C
141  iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
142  iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
143  iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
144 
145  // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
146  iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
147  iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
148  iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
149 
150  // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
151  iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
152  iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
153  iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
154 
155  EIGEN_ALIGN_MAX const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f};
156  const Packet4f p4f_sign_PNNP = pload<Packet4f>(sign_mask);
157  rd = pxor(rd, p4f_sign_PNNP);
158  iA = pmul(iA, rd);
159  iB = pmul(iB, rd);
160  iC = pmul(iC, rd);
161  iD = pmul(iD, rd);
162 
163  Index res_stride = result.outerStride();
164  float *res = result.data();
165 
166  pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
167  pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
168  pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
169  pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
170  }
171 };
172 
173 #if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
174 // same algorithm as above, except that each operand is split into
175 // halves for two registers to hold.
176 template <typename MatrixType, typename ResultType>
177 struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>
178 {
179  enum
180  {
181  MatrixAlignment = traits<MatrixType>::Alignment,
182  ResultAlignment = traits<ResultType>::Alignment,
183  StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
184  };
185  typedef std::conditional_t<(MatrixType::Flags & LinearAccessBit),
186  MatrixType const &,
187  typename MatrixType::PlainObject>
188  ActualMatrixType;
189 
190  static void run(const MatrixType &mat, ResultType &result)
191  {
192  ActualMatrixType matrix(mat);
193 
194  // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
195  // row e.g. A1, upper row of A, A2, lower row of A
196  // input = [[A, B], = [[[A1, [B1,
197  // [C, D]] A2], B2]],
198  // [[C1, [D1,
199  // C2], D2]]]
200 
201  Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
202 
203  const double* data = matrix.data();
204  const Index stride = matrix.innerStride();
205  if (StorageOrdersMatch)
206  {
207  A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
208  B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
209  A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
210  B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
211  C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
212  D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
213  C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
214  D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
215  }
216  else
217  {
218  Packet2d temp;
219  A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
220  C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
221  A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
222  C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
223  temp = A1;
224  A1 = vec2d_unpacklo(A1, A2);
225  A2 = vec2d_unpackhi(temp, A2);
226 
227  temp = C1;
228  C1 = vec2d_unpacklo(C1, C2);
229  C2 = vec2d_unpackhi(temp, C2);
230 
231  B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
232  D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
233  B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
234  D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
235 
236  temp = B1;
237  B1 = vec2d_unpacklo(B1, B2);
238  B2 = vec2d_unpackhi(temp, B2);
239 
240  temp = D1;
241  D1 = vec2d_unpacklo(D1, D2);
242  D2 = vec2d_unpackhi(temp, D2);
243  }
244 
245  // determinants of the sub-matrices
246  Packet2d dA, dB, dC, dD;
247 
248  dA = vec2d_swizzle2(A2, A2, 1);
249  dA = pmul(A1, dA);
250  dA = psub(dA, vec2d_duplane(dA, 1));
251 
252  dB = vec2d_swizzle2(B2, B2, 1);
253  dB = pmul(B1, dB);
254  dB = psub(dB, vec2d_duplane(dB, 1));
255 
256  dC = vec2d_swizzle2(C2, C2, 1);
257  dC = pmul(C1, dC);
258  dC = psub(dC, vec2d_duplane(dC, 1));
259 
260  dD = vec2d_swizzle2(D2, D2, 1);
261  dD = pmul(D1, dD);
262  dD = psub(dD, vec2d_duplane(dD, 1));
263 
264  Packet2d DC1, DC2, AB1, AB2;
265 
266  // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
267  AB1 = pmul(B1, vec2d_duplane(A2, 1));
268  AB2 = pmul(B2, vec2d_duplane(A1, 0));
269  AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
270  AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
271 
272  // DC = D#*C
273  DC1 = pmul(C1, vec2d_duplane(D2, 1));
274  DC2 = pmul(C2, vec2d_duplane(D1, 0));
275  DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
276  DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
277 
278  Packet2d d1, d2;
279 
280  // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
281  Packet2d det;
282 
283  // reciprocal of the determinant of the input matrix, rd = 1/det
284  Packet2d rd;
285 
286  d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
287  d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
288  rd = padd(d1, d2);
289  rd = padd(rd, vec2d_duplane(rd, 1));
290 
291  d1 = pmul(dA, dD);
292  d2 = pmul(dB, dC);
293 
294  det = padd(d1, d2);
295  det = psub(det, rd);
296  det = vec2d_duplane(det, 0);
297  rd = pdiv(pset1<Packet2d>(1.0), det);
298 
299  // rows of four sub-matrices of the inverse
300  Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
301 
302  // iD = D*|A| - C*A#*B
303  iD1 = pmul(AB1, vec2d_duplane(C1, 0));
304  iD2 = pmul(AB1, vec2d_duplane(C2, 0));
305  iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
306  iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
307  dA = vec2d_duplane(dA, 0);
308  iD1 = psub(pmul(D1, dA), iD1);
309  iD2 = psub(pmul(D2, dA), iD2);
310 
311  // iA = A*|D| - B*D#*C
312  iA1 = pmul(DC1, vec2d_duplane(B1, 0));
313  iA2 = pmul(DC1, vec2d_duplane(B2, 0));
314  iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
315  iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
316  dD = vec2d_duplane(dD, 0);
317  iA1 = psub(pmul(A1, dD), iA1);
318  iA2 = psub(pmul(A2, dD), iA2);
319 
320  // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
321  iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
322  iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
323  iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
324  iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
325  dB = vec2d_duplane(dB, 0);
326  iB1 = psub(pmul(C1, dB), iB1);
327  iB2 = psub(pmul(C2, dB), iB2);
328 
329  // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
330  iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
331  iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
332  iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
333  iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
334  dC = vec2d_duplane(dC, 0);
335  iC1 = psub(pmul(B1, dC), iC1);
336  iC2 = psub(pmul(B2, dC), iC2);
337 
338  EIGEN_ALIGN_MAX const double sign_mask1[2] = {0.0, -0.0};
339  EIGEN_ALIGN_MAX const double sign_mask2[2] = {-0.0, 0.0};
340  const Packet2d sign_PN = pload<Packet2d>(sign_mask1);
341  const Packet2d sign_NP = pload<Packet2d>(sign_mask2);
342  d1 = pxor(rd, sign_PN);
343  d2 = pxor(rd, sign_NP);
344 
345  Index res_stride = result.outerStride();
346  double *res = result.data();
347  pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
348  pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
349  pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
350  pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
351  pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
352  pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
353  pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
354  pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
355  }
356 };
357 #endif
358 } // namespace internal
359 } // namespace Eigen
360 
361 #if EIGEN_COMP_GNUC_STRICT
362 #pragma GCC pop_options
363 #endif
364 
365 #endif
MatrixXcf A
#define EIGEN_ALIGN_MAX
MatrixXf B
int data[]
#define vec4f_duplane(a, p)
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
#define vec2d_duplane(a, p)
#define vec2d_swizzle2(a, b, mask)
Matrix< float, 1, Dynamic > MatrixType
Base::PlainObject PlainObject
Definition: Matrix.h:194
const unsigned int LinearAccessBit
Definition: Constants.h:132
const unsigned int RowMajorBit
Definition: Constants.h:68
Packet padd(const Packet &a, const Packet &b)
Packet4f vec4f_unpackhi(const Packet4f &a, const Packet4f &b)
Packet4f vec4f_unpacklo(const Packet4f &a, const Packet4f &b)
Packet pmul(const Packet &a, const Packet &b)
Packet2d vec2d_unpackhi(const Packet2d &a, const Packet2d &b)
Packet psub(const Packet &a, const Packet &b)
Packet4f vec4f_swizzle2(const Packet4f &a, const Packet4f &b, int p, int q, int r, int s)
Packet2d pset1< Packet2d >(const double &from)
Packet8h pxor(const Packet8h &a, const Packet8h &b)
Packet pdiv(const Packet &a, const Packet &b)
Packet preciprocal(const Packet &a)
Packet2d vec2d_unpacklo(const Packet2d &a, const Packet2d &b)
Packet2d pload< Packet2d >(const double *from)
Packet4f pload< Packet4f >(const float *from)
__vector float Packet4f
Packet4f vec4f_movelh(const Packet4f &a, const Packet4f &b)
Packet4f vec4f_movehl(const Packet4f &a, const Packet4f &b)
: InteropHeaders
Definition: Core:139
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82