AltiVec/PacketMath.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
11 #define EIGEN_PACKET_MATH_ALTIVEC_H
12 
13 #include "../../InternalHeaderCheck.h"
14 
15 namespace Eigen {
16 
17 namespace internal {
18 
// Fallback value, used only when the build has not already defined it.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif

// Defined (without a value) unless something earlier already defined it.
#if !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

// NOTE Altivec has 32 registers, and 32 is the default register count set here.
#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
31 
32 typedef __vector float Packet4f;
33 typedef __vector int Packet4i;
34 typedef __vector unsigned int Packet4ui;
35 typedef __vector __bool int Packet4bi;
36 typedef __vector short int Packet8s;
37 typedef __vector unsigned short int Packet8us;
38 typedef __vector __bool short Packet8bi;
39 typedef __vector signed char Packet16c;
40 typedef __vector unsigned char Packet16uc;
41 typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf;
42 
// We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead.
//
// Each macro declares a variable named <prefix>_NAME with every lane set to X.
// The FAST variants use a brace initializer (vec_splat_s32 for Packet4i).
#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) Packet4f p4f_##NAME = {X, X, X, X}
#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) Packet4i p4i_##NAME = vec_splat_s32(X)
#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) Packet4ui p4ui_##NAME = {X, X, X, X}
#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}

// The plain variants route through pset1<>().
#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
#define EIGEN_DECLARE_CONST_Packet2l(NAME,X) Packet2l p2l_##NAME = pset1<Packet2l>(X)

// Float packet whose lanes carry the bit pattern of the integer X (note: this one is const).
#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))

#define DST_CHAN 1
// Packs size / count / stride into one word: size in bits 24 and up, count from bit 16, stride in the low bits.
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
// Scalar type associated with a packet type (dependent name, hence `typename`).
#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
78 
79 // These constants are endian-agnostic
80 static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
81 static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
82 static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
83 static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
84 static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
85 static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
86 static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
87 static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
88 static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
89 #ifndef __VSX__
90 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
91 #endif
92 
93 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
94 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
95 static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
96 static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
97 
98 static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
99  8, 9, 10, 11, 12, 13, 14, 15};
100 static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
101  8, 9, 10, 11, 12, 13, 14, 15};
102 
103 static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
104 static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
105 #ifndef _ARCH_PWR9
106 static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
107 #endif
108 
109 #ifdef _BIG_ENDIAN
110 static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
111 #endif
112 static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
113 static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
114 
115 static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
116 
117 static Packet16uc p16uc_MERGEE16 = { 0,1, 16,17, 4,5, 20,21, 8,9, 24,25, 12,13, 28,29 };
118 static Packet16uc p16uc_MERGEO16 = { 2,3, 18,19, 6,7, 22,23, 10,11, 26,27, 14,15, 30,31 };
119 #ifdef _BIG_ENDIAN
120 static Packet16uc p16uc_MERGEH16 = { 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29 };
121 #else
122 static Packet16uc p16uc_MERGEL16 = { 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31 };
123 #endif
124 
125 // Handle endianness properly while loading constants
126 // Define global static constants:
127 #ifdef _BIG_ENDIAN
128 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
129 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
130 static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
131 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
132 #else
134 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
135 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
136 static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
137 #endif // _BIG_ENDIAN
138 
139 static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
140 static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
141 static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
142 static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
143 
144 static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
145 
146 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
147  #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
148 #else
149  #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
150 #endif
151 
152 #if EIGEN_COMP_LLVM
153 #define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
154 #else
155 #define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
156 #endif
157 
158 template <>
159 struct packet_traits<float> : default_packet_traits {
160  typedef Packet4f type;
161  typedef Packet4f half;
162  enum {
163  Vectorizable = 1,
164  AlignedOnScalar = 1,
165  size = 4,
166 
167  HasAdd = 1,
168  HasSub = 1,
169  HasMul = 1,
170  HasDiv = 1,
171  HasMin = 1,
172  HasMax = 1,
173  HasAbs = 1,
174  HasSin = EIGEN_FAST_MATH,
175  HasCos = EIGEN_FAST_MATH,
176  HasACos = 1,
177  HasASin = 1,
178  HasATan = 1,
179  HasATanh = 1,
180  HasLog = 1,
181  HasExp = 1,
182 #ifdef EIGEN_VECTORIZE_VSX
183  HasSqrt = 1,
184 #if !EIGEN_COMP_CLANG
185  HasRsqrt = 1,
186 #else
187  HasRsqrt = 0,
188 #endif
189  HasTanh = EIGEN_FAST_MATH,
190  HasErf = EIGEN_FAST_MATH,
191  HasRint = 1,
192 #else
193  HasSqrt = 0,
194  HasRsqrt = 0,
195  HasTanh = 0,
196  HasErf = 0,
197  HasRint = 0,
198 #endif
199  HasRound = 1,
200  HasFloor = 1,
201  HasCeil = 1,
202  HasNegate = 1,
203  HasBlend = 1
204  };
205 };
206 template <>
207 struct packet_traits<bfloat16> : default_packet_traits {
208  typedef Packet8bf type;
209  typedef Packet8bf half;
210  enum {
211  Vectorizable = 1,
212  AlignedOnScalar = 1,
213  size = 8,
214 
215  HasAdd = 1,
216  HasSub = 1,
217  HasMul = 1,
218  HasDiv = 1,
219  HasMin = 1,
220  HasMax = 1,
221  HasAbs = 1,
222  HasSin = EIGEN_FAST_MATH,
223  HasCos = EIGEN_FAST_MATH,
224  HasLog = 1,
225  HasExp = 1,
226 #ifdef EIGEN_VECTORIZE_VSX
227  HasSqrt = 1,
228 #if !EIGEN_COMP_CLANG
229  HasRsqrt = 1,
230 #else
231  HasRsqrt = 0,
232 #endif
233  HasRint = 1,
234 #else
235  HasSqrt = 0,
236  HasRsqrt = 0,
237  HasRint = 0,
238 #endif
239  HasTanh = 0,
240  HasErf = 0,
241  HasRound = 1,
242  HasFloor = 1,
243  HasCeil = 1,
244  HasNegate = 1,
245  HasBlend = 1
246  };
247 };
248 
249 template <>
250 struct packet_traits<int> : default_packet_traits {
251  typedef Packet4i type;
252  typedef Packet4i half;
253  enum {
254  Vectorizable = 1,
255  AlignedOnScalar = 1,
256  size = 4,
257 
258  HasAdd = 1,
259  HasSub = 1,
260  HasShift = 1,
261  HasMul = 1,
262 #if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11,0,0))
263  HasDiv = 1,
264 #else
265  HasDiv = 0,
266 #endif
267  HasBlend = 1,
268  HasCmp = 1
269  };
270 };
271 
272 template <>
273 struct packet_traits<short int> : default_packet_traits {
274  typedef Packet8s type;
275  typedef Packet8s half;
276  enum {
277  Vectorizable = 1,
278  AlignedOnScalar = 1,
279  size = 8,
280 
281  HasAdd = 1,
282  HasSub = 1,
283  HasMul = 1,
284  HasDiv = 0,
285  HasBlend = 1,
286  HasCmp = 1
287  };
288 };
289 
290 template <>
291 struct packet_traits<unsigned short int> : default_packet_traits {
292  typedef Packet8us type;
293  typedef Packet8us half;
294  enum {
295  Vectorizable = 1,
296  AlignedOnScalar = 1,
297  size = 8,
298 
299  HasAdd = 1,
300  HasSub = 1,
301  HasMul = 1,
302  HasDiv = 0,
303  HasBlend = 1,
304  HasCmp = 1
305  };
306 };
307 
308 template <>
309 struct packet_traits<signed char> : default_packet_traits {
310  typedef Packet16c type;
311  typedef Packet16c half;
312  enum {
313  Vectorizable = 1,
314  AlignedOnScalar = 1,
315  size = 16,
316 
317  HasAdd = 1,
318  HasSub = 1,
319  HasMul = 1,
320  HasDiv = 0,
321  HasBlend = 1,
322  HasCmp = 1
323  };
324 };
325 
326 template <>
327 struct packet_traits<unsigned char> : default_packet_traits {
328  typedef Packet16uc type;
329  typedef Packet16uc half;
330  enum {
331  Vectorizable = 1,
332  AlignedOnScalar = 1,
333  size = 16,
334 
335  HasAdd = 1,
336  HasSub = 1,
337  HasMul = 1,
338  HasDiv = 0,
339  HasBlend = 1,
340  HasCmp = 1
341  };
342 };
343 
344 template<> struct unpacket_traits<Packet4f>
345 {
346  typedef float type;
347  typedef Packet4f half;
348  typedef Packet4i integer_packet;
349  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
350 };
351 template<> struct unpacket_traits<Packet4i>
352 {
353  typedef int type;
354  typedef Packet4i half;
355  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
356 };
357 template<> struct unpacket_traits<Packet8s>
358 {
359  typedef short int type;
360  typedef Packet8s half;
361  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
362 };
363 template<> struct unpacket_traits<Packet8us>
364 {
365  typedef unsigned short int type;
366  typedef Packet8us half;
367  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
368 };
369 
370 template<> struct unpacket_traits<Packet16c>
371 {
372  typedef signed char type;
373  typedef Packet16c half;
374  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
375 };
376 template<> struct unpacket_traits<Packet16uc>
377 {
378  typedef unsigned char type;
379  typedef Packet16uc half;
380  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
381 };
382 
383 template<> struct unpacket_traits<Packet8bf>
384 {
385  typedef bfloat16 type;
386  typedef Packet8bf half;
387  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
388 };
389 inline std::ostream & operator <<(std::ostream & s, const Packet16c & v)
390 {
391  union {
392  Packet16c v;
393  signed char n[16];
394  } vt;
395  vt.v = v;
396  for (int i=0; i< 16; i++)
397  s << vt.n[i] << ", ";
398  return s;
399 }
400 
401 inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
402 {
403  union {
404  Packet16uc v;
405  unsigned char n[16];
406  } vt;
407  vt.v = v;
408  for (int i=0; i< 16; i++)
409  s << vt.n[i] << ", ";
410  return s;
411 }
412 
413 inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
414 {
415  union {
416  Packet4f v;
417  float n[4];
418  } vt;
419  vt.v = v;
420  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
421  return s;
422 }
423 
424 inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
425 {
426  union {
427  Packet4i v;
428  int n[4];
429  } vt;
430  vt.v = v;
431  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
432  return s;
433 }
434 
435 inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
436 {
437  union {
438  Packet4ui v;
439  unsigned int n[4];
440  } vt;
441  vt.v = v;
442  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
443  return s;
444 }
445 
446 template <typename Packet>
447 EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
448 {
449  // some versions of GCC throw "unused-but-set-parameter".
450  // ignoring these warnings for now.
451  EIGEN_UNUSED_VARIABLE(from);
453 #ifdef EIGEN_VECTORIZE_VSX
454  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
455 #else
456  return vec_ld(0, from);
457 #endif
458 }
459 
460 // Need to define them first or we get specialization after instantiation errors
461 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
462 {
463  return pload_common<Packet4f>(from);
464 }
465 
466 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
467 {
468  return pload_common<Packet4i>(from);
469 }
470 
471 template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from)
472 {
473  return pload_common<Packet8s>(from);
474 }
475 
476 template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from)
477 {
478  return pload_common<Packet8us>(from);
479 }
480 
481 template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from)
482 {
483  return pload_common<Packet16c>(from);
484 }
485 
486 template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from)
487 {
488  return pload_common<Packet16uc>(from);
489 }
490 
491 template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from)
492 {
493  return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
494 }
495 
496 template <typename Packet>
498 {
499  // some versions of GCC throw "unused-but-set-parameter".
500  // ignoring these warnings for now.
501  EIGEN_UNUSED_VARIABLE(from);
503  // Ignore partial input memory initialized
504 #if !EIGEN_COMP_LLVM
505  #pragma GCC diagnostic push
506  #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
507 #endif
508 #ifdef EIGEN_VECTORIZE_VSX
509  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
510 #else
511  return vec_ld(0, from);
512 #endif
513 #if !EIGEN_COMP_LLVM
514  #pragma GCC diagnostic pop
515 #endif
516 }
517 
519 {
520  return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
521 }
522 
523 template <typename Packet>
524 EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
525 {
526  // some versions of GCC throw "unused-but-set-parameter".
527  // ignoring these warnings for now.
528  const Index packet_size = unpacket_traits<Packet>::size;
529  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
530  const Index size = sizeof(__UNPACK_TYPE__(Packet));
531 #ifdef _ARCH_PWR9
532  EIGEN_UNUSED_VARIABLE(packet_size);
534  EIGEN_UNUSED_VARIABLE(from);
535  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
536  if (offset) {
537  Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
538 #ifdef _BIG_ENDIAN
539  load = Packet(vec_sro(Packet16uc(load), shift));
540 #else
541  load = Packet(vec_slo(Packet16uc(load), shift));
542 #endif
543  }
544  return load;
545 #else
546  if (n) {
547  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
548  unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
549  unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
550  Index n2 = n * size;
551  if (16 <= n2) {
552  pstoreu(load2, ploadu<Packet16uc>(from2));
553  } else {
554  memcpy((void *)load2, (void *)from2, n2);
555  }
556  return pload_ignore<Packet>(load);
557  } else {
558  return Packet(pset1<Packet16uc>(0));
559  }
560 #endif
561 }
562 
563 template<> EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset)
564 {
565  return pload_partial_common<Packet4f>(from, n, offset);
566 }
567 
568 template<> EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset)
569 {
570  return pload_partial_common<Packet4i>(from, n, offset);
571 }
572 
573 template<> EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset)
574 {
575  return pload_partial_common<Packet8s>(from, n, offset);
576 }
577 
578 template<> EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
579 {
580  return pload_partial_common<Packet8us>(from, n, offset);
581 }
582 
583 template<> EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
584 {
585  return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
586 }
587 
588 template<> EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
589 {
590  return pload_partial_common<Packet16c>(from, n, offset);
591 }
592 
593 template<> EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
594 {
595  return pload_partial_common<Packet16uc>(from, n, offset);
596 }
597 
598 template <typename Packet>
599 EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
600  // some versions of GCC throw "unused-but-set-parameter" (float *to).
601  // ignoring these warnings for now.
604 #ifdef EIGEN_VECTORIZE_VSX
605  vec_xst(from, 0, to);
606 #else
607  vec_st(from, 0, to);
608 #endif
609 }
610 
611 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
612 {
613  pstore_common<Packet4f>(to, from);
614 }
615 
616 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
617 {
618  pstore_common<Packet4i>(to, from);
619 }
620 
621 template<> EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from)
622 {
623  pstore_common<Packet8s>(to, from);
624 }
625 
626 template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from)
627 {
628  pstore_common<Packet8us>(to, from);
629 }
630 
631 template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from)
632 {
633  pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
634 }
635 
636 template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from)
637 {
638  pstore_common<Packet16c>(to, from);
639 }
640 
641 template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from)
642 {
643  pstore_common<Packet16uc>(to, from);
644 }
645 
646 template<typename Packet> EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset)
647 {
648  // some versions of GCC throw "unused-but-set-parameter" (float *to).
649  // ignoring these warnings for now.
650  const Index packet_size = unpacket_traits<Packet>::size;
651  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
652  const Index size = sizeof(__UNPACK_TYPE__(Packet));
653 #ifdef _ARCH_PWR9
654  EIGEN_UNUSED_VARIABLE(packet_size);
657  Packet store = from;
658  if (offset) {
659  Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
660 #ifdef _BIG_ENDIAN
661  store = Packet(vec_slo(Packet16uc(store), shift));
662 #else
663  store = Packet(vec_sro(Packet16uc(store), shift));
664 #endif
665  }
666  vec_xst_len(store, to, n * size);
667 #else
668  if (n) {
669  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
670  pstore(store, from);
671  unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
672  unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
673  Index n2 = n * size;
674  if (16 <= n2) {
675  pstore(to2, ploadu<Packet16uc>(store2));
676  } else {
677  memcpy((void *)to2, (void *)store2, n2);
678  }
679  }
680 #endif
681 }
682 
683 template<> EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset)
684 {
685  pstore_partial_common<Packet4f>(to, from, n, offset);
686 }
687 
688 template<> EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset)
689 {
690  pstore_partial_common<Packet4i>(to, from, n, offset);
691 }
692 
693 template<> EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n, const Index offset)
694 {
695  pstore_partial_common<Packet8s>(to, from, n, offset);
696 }
697 
698 template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n, const Index offset)
699 {
700  pstore_partial_common<Packet8us>(to, from, n, offset);
701 }
702 
703 template<> EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n, const Index offset)
704 {
705  pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val, n, offset);
706 }
707 
708 template<> EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n, const Index offset)
709 {
710  pstore_partial_common<Packet16c>(to, from, n, offset);
711 }
712 
713 template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n, const Index offset)
714 {
715  pstore_partial_common<Packet16uc>(to, from, n, offset);
716 }
717 
718 template<typename Packet>
719 EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
720 {
721  Packet v = {from, from, from, from};
722  return v;
723 }
724 
725 template<typename Packet>
726 EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from)
727 {
728  Packet v = {from, from, from, from, from, from, from, from};
729  return v;
730 }
731 
732 template<typename Packet>
733 EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from)
734 {
735  Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
736  return v;
737 }
738 
739 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
740  return pset1_size4<Packet4f>(from);
741 }
742 
743 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
744  return pset1_size4<Packet4i>(from);
745 }
746 
747 template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
748  return pset1_size8<Packet8s>(from);
749 }
750 
751 template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
752  return pset1_size8<Packet8us>(from);
753 }
754 
755 template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
756  return pset1_size16<Packet16c>(from);
757 }
758 
759 template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
760  return pset1_size16<Packet16uc>(from);
761 }
762 
763 template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
764  return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
765 }
766 
767 template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
768  return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
769 }
770 
771 template<typename Packet> EIGEN_STRONG_INLINE void
773  Packet& a0, Packet& a1, Packet& a2, Packet& a3)
774 {
775  a3 = pload<Packet>(a);
776  a0 = vec_splat(a3, 0);
777  a1 = vec_splat(a3, 1);
778  a2 = vec_splat(a3, 2);
779  a3 = vec_splat(a3, 3);
780 }
781 
782 template<> EIGEN_STRONG_INLINE void
783 pbroadcast4<Packet4f>(const float *a,
784  Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
785 {
786  pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
787 }
788 template<> EIGEN_STRONG_INLINE void
789 pbroadcast4<Packet4i>(const int *a,
790  Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
791 {
792  pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
793 }
794 
795 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride, const Index n = unpacket_traits<Packet>::size)
796 {
798  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
799  if (stride == 1) {
801  return ploadu<Packet>(from);
802  } else {
803  return ploadu_partial<Packet>(from, n);
804  }
805  } else {
807  for (Index i = 0; i < n; i++) {
808  a[i] = from[i*stride];
809  }
810  // Leave rest of the array uninitialized
811  return pload_ignore<Packet>(a);
812  }
813 }
814 
816 {
817  return pgather_common<Packet4f>(from, stride);
818 }
819 
821 {
822  return pgather_common<Packet4i>(from, stride);
823 }
824 
826 {
827  return pgather_common<Packet8s>(from, stride);
828 }
829 
831 {
832  return pgather_common<Packet8us>(from, stride);
833 }
834 
836 {
837  return pgather_common<Packet8bf>(from, stride);
838 }
839 
841 {
842  return pgather_common<Packet16c>(from, stride);
843 }
844 
846 {
847  return pgather_common<Packet16uc>(from, stride);
848 }
849 
851 {
852  return pgather_common<Packet4f>(from, stride, n);
853 }
854 
856 {
857  return pgather_common<Packet4i>(from, stride, n);
858 }
859 
861 {
862  return pgather_common<Packet8s>(from, stride, n);
863 }
864 
866 {
867  return pgather_common<Packet8us>(from, stride, n);
868 }
869 
871 {
872  return pgather_common<Packet8bf>(from, stride, n);
873 }
874 
876 {
877  return pgather_common<Packet16c>(from, stride, n);
878 }
879 
881 {
882  return pgather_common<Packet16uc>(from, stride, n);
883 }
884 
885 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride, const Index n = unpacket_traits<Packet>::size)
886 {
888  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
889  if (stride == 1) {
891  return pstoreu(to, from);
892  } else {
893  return pstoreu_partial(to, from, n);
894  }
895  } else {
896  pstore<__UNPACK_TYPE__(Packet)>(a, from);
898  for (Index i = 0; i < n; i++) {
899  to[i*stride] = a[i];
900  }
901  }
902 }
903 
904 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
905 {
906  pscatter_common<Packet4f>(to, from, stride);
907 }
908 
909 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
910 {
911  pscatter_common<Packet4i>(to, from, stride);
912 }
913 
914 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
915 {
916  pscatter_common<Packet8s>(to, from, stride);
917 }
918 
919 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
920 {
921  pscatter_common<Packet8us>(to, from, stride);
922 }
923 
925 {
926  pscatter_common<Packet8bf>(to, from, stride);
927 }
928 
929 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
930 {
931  pscatter_common<Packet16c>(to, from, stride);
932 }
933 
934 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
935 {
936  pscatter_common<Packet16uc>(to, from, stride);
937 }
938 
939 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from, Index stride, const Index n)
940 {
941  pscatter_common<Packet4f>(to, from, stride, n);
942 }
943 
944 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride, const Index n)
945 {
946  pscatter_common<Packet4i>(to, from, stride, n);
947 }
948 
949 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from, Index stride, const Index n)
950 {
951  pscatter_common<Packet8s>(to, from, stride, n);
952 }
953 
954 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride, const Index n)
955 {
956  pscatter_common<Packet8us>(to, from, stride, n);
957 }
958 
960 {
961  pscatter_common<Packet8bf>(to, from, stride, n);
962 }
963 
964 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride, const Index n)
965 {
966  pscatter_common<Packet16c>(to, from, stride, n);
967 }
968 
969 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride, const Index n)
970 {
971  pscatter_common<Packet16uc>(to, from, stride, n);
972 }
973 
// plset: broadcasts the scalar a to every lane with pset1 and adds the per-type
// COUNTDOWN constant (defined elsewhere in this file; presumably the lane
// indices 0..N-1, confirm at their definitions).
974 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
975 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
976 template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
977 template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
978 template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) { return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
979 template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
980 
// padd: packet addition for each supported packet type, written with the
// compiler's built-in operator+ on vector types.
981 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f> (const Packet4f& a, const Packet4f& b) { return a + b; }
982 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i> (const Packet4i& a, const Packet4i& b) { return a + b; }
983 template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui> (const Packet4ui& a, const Packet4ui& b) { return a + b; }
984 template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s> (const Packet8s& a, const Packet8s& b) { return a + b; }
985 template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us> (const Packet8us& a, const Packet8us& b) { return a + b; }
986 template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c> (const Packet16c& a, const Packet16c& b) { return a + b; }
987 template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; }
988 
// psub: packet subtraction (a - b) for each supported packet type, written with
// the compiler's built-in operator- on vector types.
989 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f> (const Packet4f& a, const Packet4f& b) { return a - b; }
990 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i> (const Packet4i& a, const Packet4i& b) { return a - b; }
991 template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s> (const Packet8s& a, const Packet8s& b) { return a - b; }
992 template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us> (const Packet8us& a, const Packet8us& b) { return a - b; }
993 template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (const Packet16c& a, const Packet16c& b) { return a - b; }
994 template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
995 
996 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
997 {
998 #ifdef __POWER8_VECTOR__
999  return vec_neg(a);
1000 #else
1001  return vec_xor(a, p4f_MZERO);
1002 #endif
1003 }
1004 template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a)
1005 {
1006 #ifdef __POWER8_VECTOR__
1007  return vec_neg(a);
1008 #else
1009  return reinterpret_cast<Packet16c>(p4i_ZERO) - a;
1010 #endif
1011 }
1012 template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a)
1013 {
1014 #ifdef __POWER8_VECTOR__
1015  return vec_neg(a);
1016 #else
1017  return reinterpret_cast<Packet8s>(p4i_ZERO) - a;
1018 #endif
1019 }
1020 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
1021 {
1022 #ifdef __POWER8_VECTOR__
1023  return vec_neg(a);
1024 #else
1025  return p4i_ZERO - a;
1026 #endif
1027 }
1028 
// pconj: for these real-valued packets the argument is returned unchanged.
1029 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
1030 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
1031 
// pmul: packet multiplication.
// - Packet4f uses vec_madd with p4f_MZERO as the addend (presumably -0.0f so the
//   addend does not disturb the product; confirm at the definition of p4f_MZERO).
// - Packet4i uses the built-in operator*.
// - The 16-bit and 8-bit packets use vec_mul.
1032 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
1033 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i> (const Packet4i& a, const Packet4i& b) { return a * b; }
1034 template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s> (const Packet8s& a, const Packet8s& b) { return vec_mul(a,b); }
1035 template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us> (const Packet8us& a, const Packet8us& b) { return vec_mul(a,b); }
1036 template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c> (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); }
1037 template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
1038 
1039 
1040 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
1041 {
1042 #ifndef __VSX__ // VSX actually provides a div instruction
1043  Packet4f t, y_0, y_1;
1044 
1045  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
1046  y_0 = vec_re(b);
1047 
1048  // Do one Newton-Raphson iteration to get the needed accuracy
1049  t = vec_nmsub(y_0, b, p4f_ONE);
1050  y_1 = vec_madd(y_0, t, y_0);
1051 
1052  return vec_madd(a, y_1, p4f_MZERO);
1053 #else
1054  return vec_div(a, b);
1055 #endif
1056 }
1057 
1058 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b)
1059 {
1060 #if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11,0,0))
1061  return vec_div(a, b);
1062 #else
1065  eigen_assert(false && "packet integer division are not supported by AltiVec");
1066  return pset1<Packet4i>(0);
1067 #endif
1068 }
1069 
1070 // for some weird reasons, it has to be overloaded for packets of integers
// pmadd: computes a*b + c per lane. The float, short and unsigned short packets
// use vec_madd; the Packet4i overload is written with the built-in operators.
1071 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
1072 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
1073 template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
1074 template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
1075 
// Fused multiply-subtract variants for float packets, only provided when
// EIGEN_VECTORIZE_VSX is defined.
// Note the deliberate name cross-over between the Eigen names and the
// intrinsics: pnmadd maps to vec_nmsub and pnmsub maps to vec_nmadd.
// NOTE(review): per the Power vector intrinsic reference vec_nmsub is
// -(a*b - c) and vec_nmadd is -(a*b + c); confirm against Eigen's generic
// pnmadd/pnmsub definitions.
1076 #ifdef EIGEN_VECTORIZE_VSX
1077 template<> EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_msub(a,b,c); }
1078 template<> EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmsub(a,b,c); }
1079 template<> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmadd(a,b,c); }
1080 #endif
1081 
// pmin: lane-wise minimum.
// The float specialization uses a hand-written compare + select sequence under
// VSX and vec_min otherwise; all integer specializations use vec_min.
1082 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
1083 {
1084  #ifdef EIGEN_VECTORIZE_VSX
1085  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
1086  Packet4f ret;
  // xvcmpgesp writes the comparison mask of (a, b) into ret, then xxsel picks
  // between a and b using that mask. ret is an early-clobber output ("=&wa")
  // because it is written before the second instruction reads a and b.
1087  __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
1088  return ret;
1089  #else
1090  return vec_min(a, b);
1091  #endif
1092 }
1093 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
1094 template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
1095 template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
1096 template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
1097 template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
1098 
1099 
// pmax: lane-wise maximum.
// The float specialization uses a hand-written compare + select sequence under
// VSX and vec_max otherwise; all integer specializations use vec_max.
1100 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
1101 {
1102  #ifdef EIGEN_VECTORIZE_VSX
1103  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
1104  Packet4f ret;
  // xvcmpgtsp is issued with the operands swapped (b, a) relative to pmin, and
  // xxsel then picks between a and b using the resulting mask. ret is an
  // early-clobber output ("=&wa") because it is written before a and b are
  // read by the select.
1105  __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
1106  return ret;
1107  #else
1108  return vec_max(a, b);
1109  #endif
1110 }
1111 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
1112 template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); }
1113 template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); }
1114 template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); }
1115 template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
1116 
// Float comparisons. Each returns the intrinsic's comparison mask reinterpreted
// as a Packet4f. pcmp_lt is only specialized when EIGEN_VECTORIZE_VSX is
// defined (see the note below).
1117 template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
1118 // To fix bug with vec_cmplt on older versions
1119 #ifdef EIGEN_VECTORIZE_VSX
1120 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
1121 #endif
1122 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
1123 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
1124  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
1125  return vec_nor(c,c);
1126 }
1127 
// Integer comparisons for the 32-, 16- and 8-bit packets. Each returns the
// intrinsic's comparison mask reinterpreted as the operand packet type.
// pcmp_le is only specialized when EIGEN_VECTORIZE_VSX is defined; pcmp_lt and
// pcmp_eq are always available.
1128 #ifdef EIGEN_VECTORIZE_VSX
1129 template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
1130 #endif
1131 template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
1132 template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
1133 #ifdef EIGEN_VECTORIZE_VSX
1134 template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
1135 #endif
1136 template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
1137 template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
1138 #ifdef EIGEN_VECTORIZE_VSX
1139 template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
1140 #endif
1141 template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
1142 template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
1143 #ifdef EIGEN_VECTORIZE_VSX
1144 template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
1145 #endif
1146 template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
1147 template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
1148 #ifdef EIGEN_VECTORIZE_VSX
1149 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
1150 #endif
1151 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
1152 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
1153 
// pand: bitwise AND via vec_and. The bfloat16 packet reuses the Packet8us
// specialization on its underlying 16-bit lanes.
1154 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
1155 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
1156 template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); }
1157 template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); }
1158 template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1159  return pand<Packet8us>(a, b);
1160 }
1161 
1162 
// por: bitwise OR via vec_or. The bfloat16 packet reuses the Packet8us
// specialization on its underlying 16-bit lanes.
1163 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
1164 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
1165 template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); }
1166 template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); }
1167 template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1168  return por<Packet8us>(a, b);
1169 }
1170 
// pxor: bitwise XOR via vec_xor. The bfloat16 packet reuses the Packet8us
// specialization on its underlying 16-bit lanes.
1171 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
1172 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
1173 template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_xor(a, b); }
1174 template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1175  return pxor<Packet8us>(a, b);
1176 }
1177 
// pandnot: forwards to vec_andc(a, b), the AND-with-complement intrinsic
// (a AND NOT b per the AltiVec intrinsic reference).
1178 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); }
1179 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); }
1180 
// pselect: chooses between a and b under control of mask using vec_sel.
// Note the operand order passed to vec_sel is (b, a, mask), with the mask
// reinterpreted as unsigned 32-bit lanes.
1181 template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
1182  return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
1183 }
1184 
// pround for 4 floats.
// Step 1: build an offset that carries the sign bit of each lane of a (masked
// with p4ui_SIGN) OR-ed with p4ui_PREV0DOT5, and add it to a.
// (p4ui_PREV0DOT5 is presumably the bit pattern of the largest float below 0.5;
// confirm at its definition.)
// Step 2: convert the sum to an integral value with a single instruction:
// xvrspiz under VSX, vrfiz otherwise (both round toward zero per the Power ISA).
1185 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
1186 {
1187  Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
1188  Packet4f res;
1189
1190 #ifdef EIGEN_VECTORIZE_VSX
1191  __asm__("xvrspiz %x0, %x1\n\t"
1192  : "=&wa" (res)
1193  : "wa" (t));
1194 #else
1195  __asm__("vrfiz %0, %1\n\t"
1196  : "=v" (res)
1197  : "v" (t));
1198 #endif
1199
1200  return res;
1201 }
// pceil / pfloor: forward directly to the vec_ceil and vec_floor intrinsics.
1202 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
1203 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
// print (round-to-integral) for 4 floats, only available with VSX.
// Implemented with the single instruction xvrspic, which per the Power ISA
// rounds to an integral value using the current rounding mode.
1204 #ifdef EIGEN_VECTORIZE_VSX
1205 template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
1206 {
1207  Packet4f res;
1208
1209  __asm__("xvrspic %x0, %x1\n\t"
1210  : "=&wa" (res)
1211  : "wa" (a));
1212
1213  return res;
1214 }
1215 #endif
1216 
// Shared implementation of the unaligned packet load.
// - With VSX, or on any little-endian target, a single vec_xl performs the load.
// - On big-endian AltiVec without VSX, the two aligned quadwords that straddle
//   `from` are loaded and combined with a permute whose mask comes from vec_lvsl.
1217 template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
1218 {
1220 #if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
1222  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
1223 #else
1224  Packet16uc MSQ, LSQ;
1225  Packet16uc mask;
1226  MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
1227  LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
1228  mask = vec_lvsl(0, from); // create the permute mask
1229  //TODO: Add static_cast here
1230  return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
1231 #endif
1232 }
1233 
// ploadu: unaligned load specializations. Each forwards to ploadu_common for the
// matching packet type. The bfloat16 version loads the data as unsigned 16-bit
// lanes (Packet8us) after reinterpreting the pointer.
1234 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
1235 {
1236  return ploadu_common<Packet4f>(from);
1237 }
1238 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
1239 {
1240  return ploadu_common<Packet4i>(from);
1241 }
1242 template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
1243 {
1244  return ploadu_common<Packet8s>(from);
1245 }
1246 template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
1247 {
1248  return ploadu_common<Packet8us>(from);
1249 }
1250 template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from)
1251 {
1252  return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1253 }
1254 template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from)
1255 {
1256  return ploadu_common<Packet16c>(from);
1257 }
1258 template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from)
1259 {
1260  return ploadu_common<Packet16uc>(from);
1261 }
1262 
// Shared implementation of the partial unaligned load: reads n elements from
// `from` and places them in the packet starting at lane `offset`.
// Precondition (checked by eigen_internal_assert): n + offset <= packet size.
//
// POWER9 path: vec_xl_len loads exactly n * size bytes; when offset is non-zero
// the loaded bytes are moved by offset * size bytes with a whole-vector octet
// shift (vec_sro on big-endian, vec_slo on little-endian).
//
// Generic path: the bytes are staged through an aligned stack buffer. The
// buffer is not initialized, so lanes outside [offset, offset + n) hold
// unspecified values. When n is zero a zero-filled packet is returned instead.
1263 template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
1264 {
1265  const Index packet_size = unpacket_traits<Packet>::size;
1266  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
  // size is the byte width of one element.
1267  const Index size = sizeof(__UNPACK_TYPE__(Packet));
1268 #ifdef _ARCH_PWR9
1269  EIGEN_UNUSED_VARIABLE(packet_size);
1272  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
1273  if (offset) {
  // Shift amount expressed in bits and splatted to every byte of the vector.
1274  Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
1275 #ifdef _BIG_ENDIAN
1276  load = Packet(vec_sro(Packet16uc(load), shift));
1277 #else
1278  load = Packet(vec_slo(Packet16uc(load), shift));
1279 #endif
1280  }
1281  return load;
1282 #else
1283  if (n) {
1284  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
1285  unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
1286  unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
  // n2 is the number of bytes to copy.
1287  Index n2 = n * size;
1288  if (16 <= n2) {
  // A full 16-byte request is copied with one vector load/store.
1289  pstoreu(load2, ploadu<Packet16uc>(from2));
1290  } else {
1291  memcpy((void *)load2, (void *)from2, n2);
1292  }
1293  return pload_ignore<Packet>(load);
1294  } else {
1295  return Packet(pset1<Packet16uc>(0));
1296  }
1297 #endif
1298 }
1299 
// ploadu_partial: partial unaligned load specializations. Each forwards from, n
// and offset to ploadu_partial_common for the matching packet type. The
// bfloat16 version loads the data as unsigned 16-bit lanes (Packet8us) after
// reinterpreting the pointer.
1300 template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset)
1301 {
1302  return ploadu_partial_common<Packet4f>(from, n, offset);
1303 }
1304 template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset)
1305 {
1306  return ploadu_partial_common<Packet4i>(from, n, offset);
1307 }
1308 template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset)
1309 {
1310  return ploadu_partial_common<Packet8s>(from, n, offset);
1311 }
1312 template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
1313 {
1314  return ploadu_partial_common<Packet8us>(from, n, offset);
1315 }
1316 template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
1317 {
1318  return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
1319 }
1320 template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
1321 {
1322  return ploadu_partial_common<Packet16c>(from, n, offset);
1323 }
1324 template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
1325 {
1326  return ploadu_partial_common<Packet16uc>(from, n, offset);
1327 }
1328 
1329 template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
1330 {
1331  Packet p;
1332  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from);
1333  else p = ploadu<Packet>(from);
1334  return vec_mergeh(p, p);
1335 }
// ploaddup for 32-bit lanes (float and int): both forward to ploaddup_common.
1336 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
1337 {
1338  return ploaddup_common<Packet4f>(from);
1339 }
1340 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
1341 {
1342  return ploaddup_common<Packet4i>(from);
1343 }
1344 
1345 template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from)
1346 {
1347  Packet8s p;
1348  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1349  else p = ploadu<Packet8s>(from);
1350  return vec_mergeh(p, p);
1351 }
1352 
1353 template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from)
1354 {
1355  Packet8us p;
1356  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1357  else p = ploadu<Packet8us>(from);
1358  return vec_mergeh(p, p);
1359 }
1360 
1361 template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from)
1362 {
1363  Packet8s p;
1364  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
1365  else p = ploadu<Packet8s>(from);
1366  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1367 }
1368 
1369 template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from)
1370 {
1371  Packet8us p;
1372  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
1373  else p = ploadu<Packet8us>(from);
1374  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1375 }
1376 
1377 template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from)
1378 {
1379  return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1380 }
1381 
1382 template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from)
1383 {
1384  Packet16c p;
1385  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16c>(from);
1386  else p = ploadu<Packet16c>(from);
1387  return vec_mergeh(p, p);
1388 }
1389 
1390 template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from)
1391 {
1392  Packet16uc p;
1393  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16uc>(from);
1394  else p = ploadu<Packet16uc>(from);
1395  return vec_mergeh(p, p);
1396 }
1397 
// Shared implementation of the unaligned packet store.
// - With VSX, or on any little-endian target, a single vec_xst performs the store.
// - On big-endian AltiVec without VSX, the two aligned quadwords that straddle
//   `to` are read, the new data is merged into them with permutes, and both
//   quadwords are written back. Bytes outside the 16-byte destination are
//   rewritten with the values that were read, which is why the sequence is
//   flagged as not thread safe below.
1398 template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
1399 {
1401 #if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
1402  vec_xst(from, 0, to);
1403 #else
1404  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
1405  // Warning: not thread safe!
1406  Packet16uc MSQ, LSQ, edges;
1407  Packet16uc edgeAlign, align;
1408
1409  MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
1410  LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
1411  edgeAlign = vec_lvsl(0, to); // permute map to extract edges
1412  edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges
1413  align = vec_lvsr( 0, to ); // permute map to misalign data
1414  MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
1415  LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
1416  vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
1417  vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second
1418 #endif
1419 }
// pstoreu: unaligned store specializations. Each forwards to pstoreu_common for
// the matching packet type. The bfloat16 version stores the wrapped raw vector
// (from.m_val) as unsigned 16-bit lanes after reinterpreting the pointer.
1420 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
1421 {
1422  pstoreu_common<Packet4f>(to, from);
1423 }
1424 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
1425 {
1426  pstoreu_common<Packet4i>(to, from);
1427 }
1428 template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from)
1429 {
1430  pstoreu_common<Packet8s>(to, from);
1431 }
1432 template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from)
1433 {
1434  pstoreu_common<Packet8us>(to, from);
1435 }
1436 template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from)
1437 {
1438  pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
1439 }
1440 template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from)
1441 {
1442  pstoreu_common<Packet16c>(to, from);
1443 }
1444 template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from)
1445 {
1446  pstoreu_common<Packet16uc>(to, from);
1447 }
1448 
// Shared implementation of the partial unaligned store: writes n elements of
// `from`, starting at lane `offset`, to the memory at `to`.
// Precondition (checked by eigen_internal_assert): n + offset <= packet size.
//
// POWER9 path: when offset is non-zero the packet is first moved by
// offset * size bytes with a whole-vector octet shift (vec_slo on big-endian,
// vec_sro on little-endian), then vec_xst_len writes exactly n * size bytes.
//
// Generic path: the packet is spilled to an aligned stack buffer and the
// requested bytes are copied out from there. Nothing is written when n is zero.
1449 template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset)
1450 {
1451  const Index packet_size = unpacket_traits<Packet>::size;
1452  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
  // size is the byte width of one element.
1453  const Index size = sizeof(__UNPACK_TYPE__(Packet));
1454 #ifdef _ARCH_PWR9
1455  EIGEN_UNUSED_VARIABLE(packet_size);
1457  Packet store = from;
1458  if (offset) {
  // Shift amount expressed in bits and splatted to every byte of the vector.
1459  Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
1460 #ifdef _BIG_ENDIAN
1461  store = Packet(vec_slo(Packet16uc(store), shift));
1462 #else
1463  store = Packet(vec_sro(Packet16uc(store), shift));
1464 #endif
1465  }
1466  vec_xst_len(store, to, n * size);
1467 #else
1468  if (n) {
1469  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
1470  pstore(store, from);
1471  unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
1472  unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
  // n2 is the number of bytes to copy.
1473  Index n2 = n * size;
1474  if (16 <= n2) {
  // A full 16-byte request is copied with one vector load/store.
1475  pstoreu(to2, ploadu<Packet16uc>(store2));
1476  } else {
1477  memcpy((void *)to2, (void *)store2, n2);
1478  }
1479  }
1480 #endif
1481 }
1482 
// pstoreu_partial: partial unaligned store specializations. Each forwards to,
// from, n and offset to pstoreu_partial_common for the matching packet type.
// The bfloat16 version stores through the Packet8us implementation after
// reinterpreting the pointer; the Packet8bf argument is passed as-is and
// converts to the Packet8us parameter.
1483 template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset)
1484 {
1485  pstoreu_partial_common<Packet4f>(to, from, n, offset);
1486 }
1487 template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset)
1488 {
1489  pstoreu_partial_common<Packet4i>(to, from, n, offset);
1490 }
1491 template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n, const Index offset)
1492 {
1493  pstoreu_partial_common<Packet8s>(to, from, n, offset);
1494 }
1495 template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n, const Index offset)
1496 {
1497  pstoreu_partial_common<Packet8us>(to, from, n, offset);
1498 }
1499 template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n, const Index offset)
1500 {
1501  pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
1502 }
1503 template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n, const Index offset)
1504 {
1505  pstoreu_partial_common<Packet16c>(to, from, n, offset);
1506 }
1507 template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n, const Index offset)
1508 {
1509  pstoreu_partial_common<Packet16uc>(to, from, n, offset);
1510 }
1511 
// prefetch: expands the EIGEN_PPC_PREFETCH macro (defined elsewhere) on the
// given address.
1512 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
1513 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
1514 
// pfirst for 32-bit lanes: stores a single element of the packet into an
// aligned scalar with vec_ste and returns that scalar.
1515 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
1516 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
1517 
1518 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
1520  vec_ste(a, 0, &x);
1521  return x;
1522 }
1523 
// pfirst for the 16- and 8-bit packet types: each forwards to pfirst_common.
1524 template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
1525  return pfirst_common<Packet8s>(a);
1526 }
1527
1528 template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
1529  return pfirst_common<Packet8us>(a);
1530 }
1531
1532 template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a)
1533 {
1534  return pfirst_common<Packet16c>(a);
1535 }
1536
1537 template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a)
1538 {
1539  return pfirst_common<Packet16uc>(a);
1540 }
1541 
1542 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
1543 {
1544  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1545 }
1546 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
1547 {
1548  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1549 }
1550 template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
1551 {
1552  return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1553 }
1554 template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
1555 {
1556  return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1557 }
1558 template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
1559 {
1560 #ifdef _ARCH_PWR9
1561  return vec_revb(a);
1562 #else
1563  return vec_perm(a, a, p16uc_REVERSE8);
1564 #endif
1565 }
1566 template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
1567 {
1568 #ifdef _ARCH_PWR9
1569  return vec_revb(a);
1570 #else
1571  return vec_perm(a, a, p16uc_REVERSE8);
1572 #endif
1573 }
// preverse for bfloat16: reuses the Packet8us specialization on the underlying
// 16-bit lanes.
1574 template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
1575 {
1576  return preverse<Packet8us>(a);
1577 }
1578 
// pabs: lane-wise absolute value.
// - Float and signed integer packets use vec_abs.
// - Unsigned packets are returned unchanged.
// - bfloat16 clears the top bit of every 16-bit lane by AND-ing with 0x7FFF.
1579 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
1580 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
1581 template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); }
1582 template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
1583 template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); }
1584 template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
1585 template<> EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
1586  EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF);
1587  return pand<Packet8us>(p8us_abs_mask, a);
1588 }
1589 
// psignbit: shifts every lane right with vec_sra by (lane width - 1) bits, i.e.
// 15 for the 16-bit bfloat16 lanes and 31 for the 32-bit float lanes, so the
// result depends only on each lane's top bit.
1590 template<> EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) { return vec_sra(a.m_val, vec_splat_u16(15)); }
1591 template<> EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31))); }
1592 
// Compile-time shifts for Packet4i. The shift count N is broadcast with pset1
// and reinterpreted as unsigned lanes before being handed to vec_sra
// (arithmetic right), vec_sr (logical right) or vec_sl (left).
1593 template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
1594 { return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1595 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
1596 { return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1597 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a)
1598 { return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
1599 template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a)
1600 {
1601  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1602  Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1603  return reinterpret_cast<Packet4f>(r);
1604 }
1605 
1606 template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a)
1607 {
1608  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1609  Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1610  return reinterpret_cast<Packet4f>(r);
1611 }
1612 
1613 template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a)
1614 {
1615  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1616  return vec_sr(a, p4ui_mask);
1617 }
1618 
1619 template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a)
1620 {
1621  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1622  return vec_sl(a, p4ui_mask);
1623 }
1624 
1625 template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a)
1626 {
1627  const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1628  return vec_sl(a, p8us_mask);
1629 }
1630 template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a)
1631 {
1632  const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1633  return vec_sr(a, p8us_mask);
1634 }
1635 
1636 EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
1637  return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
1638 }
1639 
1640 EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){
1641  const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1642  return pand<Packet4f>(
1643  reinterpret_cast<Packet4f>(bf.m_val),
1644  reinterpret_cast<Packet4f>(p4ui_high_mask)
1645  );
1646 }
1647 
1649 #ifdef _BIG_ENDIAN
1650  return vec_perm(reinterpret_cast<Packet8us>(odd), reinterpret_cast<Packet8us>(even), p16uc_MERGEO16);
1651 #else
1652  return vec_perm(reinterpret_cast<Packet8us>(even), reinterpret_cast<Packet8us>(odd), p16uc_MERGEE16);
1653 #endif
1654 }
1655 
1656 // Simple interleaving of bool masks, prevents true values from being
1657 // converted to NaNs.
1658 EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
1659  return pmerge(reinterpret_cast<Packet4ui>(even), reinterpret_cast<Packet4ui>(odd));
1660 }
1661 
// Define SUPPORT_BF16_SUBNORMALS to make the float -> bfloat16 conversions below
// skip the rounding bias for subnormal inputs.
//#define SUPPORT_BF16_SUBNORMALS

// Data-class selector bits for vec_test_data_class, provided here only when the
// toolchain headers do not already define them.
#ifndef __VEC_CLASS_FP_NAN
#define __VEC_CLASS_FP_NAN (1 << 6)
#endif

#if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)

#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
#endif
1674 
1675 EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
1676 #ifdef _ARCH_PWR10
1677  return reinterpret_cast<Packet8us>(__builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(p4f)));
1678 #else
1679  Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
1680  Packet4ui lsb = plogical_shift_right<16>(input);
1681  lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
1682 
1683  EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu);
1684  Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
1685  input = padd<Packet4ui>(input, rounding_bias);
1686 
1687  const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
1688 #ifdef _ARCH_PWR9
1689  Packet4bi nan_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_NAN);
1690  input = vec_sel(input, p4ui_nan, nan_selector);
1691 
1692 #ifdef SUPPORT_BF16_SUBNORMALS
1693  Packet4bi subnormal_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_SUBNORMAL);
1694  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
1695 #endif
1696 #else
1697 #ifdef SUPPORT_BF16_SUBNORMALS
1698  //Test NaN and Subnormal
1699  const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
1700  Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
1701 
1702  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
1703  Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
1704 
1705  Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_exp_mask);
1706  Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
1707 
1708  Packet4ui nan_selector = pandnot<Packet4ui>(
1709  reinterpret_cast<Packet4ui>(is_max_exp),
1710  reinterpret_cast<Packet4ui>(is_mant_zero)
1711  );
1712 
1713  Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
1714 
1715  Packet4ui subnormal_selector = pandnot<Packet4ui>(
1716  reinterpret_cast<Packet4ui>(is_zero_exp),
1717  reinterpret_cast<Packet4ui>(is_mant_zero)
1718  );
1719 
1720  input = vec_sel(input, p4ui_nan, nan_selector);
1721  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
1722 #else
1723  //Test only NaN
1724  Packet4bi nan_selector = vec_cmpeq(p4f, p4f);
1725 
1726  input = vec_sel(p4ui_nan, input, nan_selector);
1727 #endif
1728 #endif
1729 
1730  input = plogical_shift_right<16>(input);
1731  return reinterpret_cast<Packet8us>(input);
1732 #endif
1733 }
1734 
1735 #ifdef _BIG_ENDIAN
1741 template<bool lohi>
1743 {
1744  if (lohi) {
1745  return vec_perm(reinterpret_cast<Packet8us>(lo), reinterpret_cast<Packet8us>(hi), p16uc_MERGEH16);
1746  } else {
1747  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
1748  }
1749 }
1750 
1756 template<bool lohi>
1758 {
1759  if (lohi) {
1760  return vec_pack(reinterpret_cast<Packet4ui>(lo), reinterpret_cast<Packet4ui>(hi));
1761  } else {
1762  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
1763  }
1764 }
1765 #else
1766 template<bool lohi>
1768 {
1769  if (lohi) {
1770  return vec_pack(reinterpret_cast<Packet4ui>(hi), reinterpret_cast<Packet4ui>(lo));
1771  } else {
1772  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
1773  }
1774 }
1775 
1776 template<bool lohi>
1778 {
1779  if (lohi) {
1780  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEL16);
1781  } else {
1782  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
1783  }
1784 }
1785 #endif
1786 
1792 template<bool lohi = true>
1794 {
1795  Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);
1796  Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);
1797 
1798  Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
1799  EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS,0x7FFFu);
1800  lsb = padd<Packet8us>(lsb, p8us_BIAS);
1801  lsb = padd<Packet8us>(lsb, p4f2);
1802 
1803  Packet8bi rounding_bias = vec_cmplt(lsb, p4f2);
1804  Packet8us input = psub<Packet8us>(p4f, reinterpret_cast<Packet8us>(rounding_bias));
1805 
1806 #ifdef _ARCH_PWR9
1807  Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
1808  Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
1809  Packet8us nan_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
1810 
1811  input = vec_sel(input, p8us_BIAS, nan_selector);
1812 
1813 #ifdef SUPPORT_BF16_SUBNORMALS
1814  Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
1815  Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
1816  Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo), reinterpret_cast<Packet4f>(subnormal_selector_hi));
1817 
1818  input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
1819 #endif
1820 #else
1821 #ifdef SUPPORT_BF16_SUBNORMALS
1822  //Test NaN and Subnormal
1823  const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
1824  Packet8us exp = pand<Packet8us>(p8us_exp_mask, p4f);
1825 
1826  const EIGEN_DECLARE_CONST_FAST_Packet8us(mantissa_mask, 0x7Fu);
1827  Packet8us mantissa = pand<Packet8us>(p8us_mantissa_mask, p4f);
1828 
1829  Packet8bi is_max_exp = vec_cmpeq(exp, p8us_exp_mask);
1830  Packet8bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet8us>(p4i_ZERO));
1831 
1832  Packet8us nan_selector = pandnot<Packet8us>(
1833  reinterpret_cast<Packet8us>(is_max_exp),
1834  reinterpret_cast<Packet8us>(is_mant_zero)
1835  );
1836 
1837  Packet8bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet8us>(p4i_ZERO));
1838 
1839  Packet8us subnormal_selector = pandnot<Packet8us>(
1840  reinterpret_cast<Packet8us>(is_zero_exp),
1841  reinterpret_cast<Packet8us>(is_mant_zero)
1842  );
1843 
1844  // Using BIAS as NaN (since any or all of the last 7 bits can be set)
1845  input = vec_sel(input, p8us_BIAS, nan_selector);
1846  input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
1847 #else
1848  //Test only NaN
1849  Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
1850  Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
1851  Packet8us nan_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
1852 
1853  input = vec_sel(p8us_BIAS, input, nan_selector);
1854 #endif
1855 #endif
1856 
1857  return input;
1858 }
1859 
1863 EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi)
1864 {
1865 #ifdef _ARCH_PWR10
1866  Packet8bf fp16_0 = F32ToBf16(lo);
1867  Packet8bf fp16_1 = F32ToBf16(hi);
1868  return vec_pack(reinterpret_cast<Packet4ui>(fp16_0.m_val), reinterpret_cast<Packet4ui>(fp16_1.m_val));
1869 #else
1870  return F32ToBf16Two(lo, hi);
1871 #endif
1872 }
1873 
1877 EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){
1878 #ifdef _ARCH_PWR10
1879  return pmerge(reinterpret_cast<Packet4ui>(F32ToBf16(even).m_val), reinterpret_cast<Packet4ui>(F32ToBf16(odd).m_val));
1880 #else
1881  return F32ToBf16Two<false>(even, odd);
1882 #endif
1883 }
// Function body: widen A to two float packets, apply OP to each, and return the
// result narrowed back to bfloat16.
#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
  const Packet4f in_even = Bf16ToF32Even(A); \
  const Packet4f in_odd = Bf16ToF32Odd(A); \
  const Packet4f out_even = OP(in_even); \
  const Packet4f out_odd = OP(in_odd); \
  return F32ToBf16(out_even, out_odd);
1890 
// Function body: widen A and B to float packets, apply OP pairwise, and return
// the result narrowed back to bfloat16.
#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
  const Packet4f lhs_even = Bf16ToF32Even(A); \
  const Packet4f lhs_odd = Bf16ToF32Odd(A); \
  const Packet4f rhs_even = Bf16ToF32Even(B); \
  const Packet4f rhs_odd = Bf16ToF32Odd(B); \
  const Packet4f out_even = OP(lhs_even, rhs_even); \
  const Packet4f out_odd = OP(lhs_odd, rhs_odd); \
  return F32ToBf16(out_even, out_odd);
1899 
// Function body: like the binary wrapper, but OP yields lane masks, so the
// results are interleaved with F32ToBf16Bool instead of being rounded.
#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
  const Packet4f lhs_even = Bf16ToF32Even(A); \
  const Packet4f lhs_odd = Bf16ToF32Odd(A); \
  const Packet4f rhs_even = Bf16ToF32Even(B); \
  const Packet4f rhs_odd = Bf16ToF32Odd(B); \
  const Packet4f mask_even = OP(lhs_even, rhs_even); \
  const Packet4f mask_odd = OP(lhs_odd, rhs_odd); \
  return F32ToBf16Bool(mask_even, mask_odd);
1908 
1909 template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1911 }
1912 
1913 template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1915 }
1916 
1917 template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1919 }
1920 
1921 template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
1922  EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask,0x8000);
1923  return pxor<Packet8us>(p8us_neg_mask, a);
1924 }
1925 
1926 template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1928 }
1929 
1930 template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
1932 }
1933 
1934 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
1935  return pldexp_generic(a,exponent);
1936 }
1937 template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
1939 }
1940 
1941 template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
1942  return pfrexp_generic(a,exponent);
1943 }
1944 template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
1945  Packet4f a_even = Bf16ToF32Even(a);
1946  Packet4f a_odd = Bf16ToF32Odd(a);
1947  Packet4f e_even;
1948  Packet4f e_odd;
1949  Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
1950  Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
1951  e = F32ToBf16(e_even, e_odd);
1952  return F32ToBf16(op_even, op_odd);
1953 }
1954 
1955 template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){
1957 }
1958 template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){
1960 }
1961 template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){
1963 }
1964 template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){
1966 }
1967 template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){
1969 }
1970 template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
1972 }
#ifdef EIGEN_VECTORIZE_VSX
// bfloat16 round-to-integral (print), evaluated in float and rounded back. VSX only.
template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a)
{
  BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
}
#endif
1978 template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
1979  Packet4f a_even = Bf16ToF32Even(a);
1980  Packet4f a_odd = Bf16ToF32Odd(a);
1981  Packet4f b_even = Bf16ToF32Even(b);
1982  Packet4f b_odd = Bf16ToF32Odd(b);
1983  Packet4f c_even = Bf16ToF32Even(c);
1984  Packet4f c_odd = Bf16ToF32Odd(c);
1985  Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
1986  Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
1987  return F32ToBf16(pmadd_even, pmadd_odd);
1988 }
1989 
1990 template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1992 }
1993 
1994 template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1996 }
1997 
1998 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
2000 }
2001 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
2003 }
2004 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
2006 }
2007 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
2009 }
2010 
2011 template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
2013 }
2014 
2015 template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from)
2016 {
2017  return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
2018 }
2019 
2020 template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
2021  bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
2022  bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) };
2023  return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
2024 }
2025 
2026 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
2027 {
2028  Packet4f b, sum;
2029  b = vec_sld(a, a, 8);
2030  sum = a + b;
2031  b = vec_sld(sum, sum, 4);
2032  sum += b;
2033  return pfirst(sum);
2034 }
2035 
2036 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
2037 {
2038  Packet4i sum;
2039  sum = vec_sums(a, p4i_ZERO);
2040 #ifdef _BIG_ENDIAN
2041  sum = vec_sld(sum, p4i_ZERO, 12);
2042 #else
2043  sum = vec_sld(p4i_ZERO, sum, 4);
2044 #endif
2045  return pfirst(sum);
2046 }
2047 
2048 template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a)
2049 {
2050  float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
2051  float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
2052  float f32_result = redux_even + redux_odd;
2053  return bfloat16(f32_result);
2054 }
2055 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a)
2056 {
2057  union{
2058  Packet v;
2059  __UNPACK_TYPE__(Packet) n[8];
2060  } vt;
2061  vt.v = a;
2062 
2063  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
2064  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
2065  Packet4i first_half = pload<Packet4i>(first_loader);
2066  Packet4i second_half = pload<Packet4i>(second_loader);
2067 
2068  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
2069 }
2070 
2071 template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a)
2072 {
2073  return predux_size8<Packet8s>(a);
2074 }
2075 
2076 template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a)
2077 {
2078  return predux_size8<Packet8us>(a);
2079 }
2080 
2081 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a)
2082 {
2083  union{
2084  Packet v;
2085  __UNPACK_TYPE__(Packet) n[16];
2086  } vt;
2087  vt.v = a;
2088 
2089  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
2090  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
2091  EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
2092  EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
2093 
2094  Packet4i first_quarter = pload<Packet4i>(first_loader);
2095  Packet4i second_quarter = pload<Packet4i>(second_loader);
2096  Packet4i third_quarter = pload<Packet4i>(third_loader);
2097  Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
2098 
2099  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter)
2100  + predux(third_quarter) + predux(fourth_quarter));
2101 }
2102 
2103 template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a)
2104 {
2105  return predux_size16<Packet16c>(a);
2106 }
2107 
2108 template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a)
2109 {
2110  return predux_size16<Packet16uc>(a);
2111 }
2112 
2113 // Other reduction functions:
2114 // mul
2115 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
2116 {
2117  Packet4f prod;
2118  prod = pmul(a, vec_sld(a, a, 8));
2119  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
2120 }
2121 
2122 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
2123 {
2124  EIGEN_ALIGN16 int aux[4];
2125  pstore(aux, a);
2126  return aux[0] * aux[1] * aux[2] * aux[3];
2127 }
2128 
2129 template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
2130 {
2131  Packet8s pair, quad, octo;
2132 
2133  pair = vec_mul(a, vec_sld(a, a, 8));
2134  quad = vec_mul(pair, vec_sld(pair, pair, 4));
2135  octo = vec_mul(quad, vec_sld(quad, quad, 2));
2136 
2137  return pfirst(octo);
2138 }
2139 
2140 template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a)
2141 {
2142  Packet8us pair, quad, octo;
2143 
2144  pair = vec_mul(a, vec_sld(a, a, 8));
2145  quad = vec_mul(pair, vec_sld(pair, pair, 4));
2146  octo = vec_mul(quad, vec_sld(quad, quad, 2));
2147 
2148  return pfirst(octo);
2149 }
2150 
2151 template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a)
2152 {
2153  float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
2154  float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
2155  float f32_result = redux_even * redux_odd;
2156  return bfloat16(f32_result);
2157 }
2158 
2159 
2160 template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a)
2161 {
2162  Packet16c pair, quad, octo, result;
2163 
2164  pair = vec_mul(a, vec_sld(a, a, 8));
2165  quad = vec_mul(pair, vec_sld(pair, pair, 4));
2166  octo = vec_mul(quad, vec_sld(quad, quad, 2));
2167  result = vec_mul(octo, vec_sld(octo, octo, 1));
2168 
2169  return pfirst(result);
2170 }
2171 
2172 template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a)
2173 {
2174  Packet16uc pair, quad, octo, result;
2175 
2176  pair = vec_mul(a, vec_sld(a, a, 8));
2177  quad = vec_mul(pair, vec_sld(pair, pair, 4));
2178  octo = vec_mul(quad, vec_sld(quad, quad, 2));
2179  result = vec_mul(octo, vec_sld(octo, octo, 1));
2180 
2181  return pfirst(result);
2182 }
2183 
2184 // min
2185 template<typename Packet> EIGEN_STRONG_INLINE
2186 __UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
2187 {
2188  Packet b, res;
2189  b = vec_min(a, vec_sld(a, a, 8));
2190  res = vec_min(b, vec_sld(b, b, 4));
2191  return pfirst(res);
2192 }
2193 
2194 
2195 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
2196 {
2197  return predux_min4<Packet4f>(a);
2198 }
2199 
2200 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
2201 {
2202  return predux_min4<Packet4i>(a);
2203 }
2204 
2205 template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a)
2206 {
2207  float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
2208  float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
2209  float f32_result = (std::min)(redux_even, redux_odd);
2210  return bfloat16(f32_result);
2211 }
2212 
2213 template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a)
2214 {
2215  Packet8s pair, quad, octo;
2216 
2217  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
2218  pair = vec_min(a, vec_sld(a, a, 8));
2219 
2220  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
2221  quad = vec_min(pair, vec_sld(pair, pair, 4));
2222 
2223  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
2224  octo = vec_min(quad, vec_sld(quad, quad, 2));
2225  return pfirst(octo);
2226 }
2227 
2228 template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a)
2229 {
2230  Packet8us pair, quad, octo;
2231 
2232  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
2233  pair = vec_min(a, vec_sld(a, a, 8));
2234 
2235  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
2236  quad = vec_min(pair, vec_sld(pair, pair, 4));
2237 
2238  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
2239  octo = vec_min(quad, vec_sld(quad, quad, 2));
2240  return pfirst(octo);
2241 }
2242 
2243 template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a)
2244 {
2245  Packet16c pair, quad, octo, result;
2246 
2247  pair = vec_min(a, vec_sld(a, a, 8));
2248  quad = vec_min(pair, vec_sld(pair, pair, 4));
2249  octo = vec_min(quad, vec_sld(quad, quad, 2));
2250  result = vec_min(octo, vec_sld(octo, octo, 1));
2251 
2252  return pfirst(result);
2253 }
2254 
2255 template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a)
2256 {
2257  Packet16uc pair, quad, octo, result;
2258 
2259  pair = vec_min(a, vec_sld(a, a, 8));
2260  quad = vec_min(pair, vec_sld(pair, pair, 4));
2261  octo = vec_min(quad, vec_sld(quad, quad, 2));
2262  result = vec_min(octo, vec_sld(octo, octo, 1));
2263 
2264  return pfirst(result);
2265 }
2266 // max
2267 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
2268 {
2269  Packet b, res;
2270  b = vec_max(a, vec_sld(a, a, 8));
2271  res = vec_max(b, vec_sld(b, b, 4));
2272  return pfirst(res);
2273 }
2274 
2275 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
2276 {
2277  return predux_max4<Packet4f>(a);
2278 }
2279 
2280 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
2281 {
2282  return predux_max4<Packet4i>(a);
2283 }
2284 
2285 template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a)
2286 {
2287  float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
2288  float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
2289  float f32_result = (std::max)(redux_even, redux_odd);
2290  return bfloat16(f32_result);
2291 }
2292 
2293 template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a)
2294 {
2295  Packet8s pair, quad, octo;
2296 
2297  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
2298  pair = vec_max(a, vec_sld(a, a, 8));
2299 
2300  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
2301  quad = vec_max(pair, vec_sld(pair, pair, 4));
2302 
2303  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
2304  octo = vec_max(quad, vec_sld(quad, quad, 2));
2305  return pfirst(octo);
2306 }
2307 
2308 template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a)
2309 {
2310  Packet8us pair, quad, octo;
2311 
2312  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
2313  pair = vec_max(a, vec_sld(a, a, 8));
2314 
2315  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
2316  quad = vec_max(pair, vec_sld(pair, pair, 4));
2317 
2318  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
2319  octo = vec_max(quad, vec_sld(quad, quad, 2));
2320  return pfirst(octo);
2321 }
2322 
2323 template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a)
2324 {
2325  Packet16c pair, quad, octo, result;
2326 
2327  pair = vec_max(a, vec_sld(a, a, 8));
2328  quad = vec_max(pair, vec_sld(pair, pair, 4));
2329  octo = vec_max(quad, vec_sld(quad, quad, 2));
2330  result = vec_max(octo, vec_sld(octo, octo, 1));
2331 
2332  return pfirst(result);
2333 }
2334 
2335 template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a)
2336 {
2337  Packet16uc pair, quad, octo, result;
2338 
2339  pair = vec_max(a, vec_sld(a, a, 8));
2340  quad = vec_max(pair, vec_sld(pair, pair, 4));
2341  octo = vec_max(quad, vec_sld(quad, quad, 2));
2342  result = vec_max(octo, vec_sld(octo, octo, 1));
2343 
2344  return pfirst(result);
2345 }
2346 
2347 template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
2348 {
2349  return vec_any_ne(x, pzero(x));
2350 }
2351 
2352 template <typename T> EIGEN_DEVICE_FUNC inline void
2353 ptranpose_common(PacketBlock<T,4>& kernel){
2354  T t0, t1, t2, t3;
2355  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2356  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2357  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2358  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2359  kernel.packet[0] = vec_mergeh(t0, t2);
2360  kernel.packet[1] = vec_mergel(t0, t2);
2361  kernel.packet[2] = vec_mergeh(t1, t3);
2362  kernel.packet[3] = vec_mergel(t1, t3);
2363 }
2364 
2365 EIGEN_DEVICE_FUNC inline void
2366 ptranspose(PacketBlock<Packet4f,4>& kernel) {
2367  ptranpose_common<Packet4f>(kernel);
2368 }
2369 
2370 EIGEN_DEVICE_FUNC inline void
2371 ptranspose(PacketBlock<Packet4i,4>& kernel) {
2372  ptranpose_common<Packet4i>(kernel);
2373 }
2374 
2375 EIGEN_DEVICE_FUNC inline void
2376 ptranspose(PacketBlock<Packet8s,4>& kernel) {
2377  Packet8s t0, t1, t2, t3;
2378  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2379  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2380  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2381  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2382  kernel.packet[0] = vec_mergeh(t0, t2);
2383  kernel.packet[1] = vec_mergel(t0, t2);
2384  kernel.packet[2] = vec_mergeh(t1, t3);
2385  kernel.packet[3] = vec_mergel(t1, t3);
2386 }
2387 
2388 EIGEN_DEVICE_FUNC inline void
2389 ptranspose(PacketBlock<Packet8us,4>& kernel) {
2390  Packet8us t0, t1, t2, t3;
2391  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2392  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2393  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2394  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2395  kernel.packet[0] = vec_mergeh(t0, t2);
2396  kernel.packet[1] = vec_mergel(t0, t2);
2397  kernel.packet[2] = vec_mergeh(t1, t3);
2398  kernel.packet[3] = vec_mergel(t1, t3);
2399 }
2400 
2401 
2402 EIGEN_DEVICE_FUNC inline void
2403 ptranspose(PacketBlock<Packet8bf,4>& kernel) {
2404  Packet8us t0, t1, t2, t3;
2405 
2406  t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
2407  t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
2408  t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
2409  t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
2410  kernel.packet[0] = vec_mergeh(t0, t2);
2411  kernel.packet[1] = vec_mergel(t0, t2);
2412  kernel.packet[2] = vec_mergeh(t1, t3);
2413  kernel.packet[3] = vec_mergel(t1, t3);
2414 }
2415 
2416 EIGEN_DEVICE_FUNC inline void
2417 ptranspose(PacketBlock<Packet16c,4>& kernel) {
2418  Packet16c t0, t1, t2, t3;
2419  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2420  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2421  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2422  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2423  kernel.packet[0] = vec_mergeh(t0, t2);
2424  kernel.packet[1] = vec_mergel(t0, t2);
2425  kernel.packet[2] = vec_mergeh(t1, t3);
2426  kernel.packet[3] = vec_mergel(t1, t3);
2427 }
2428 
2429 
2430 EIGEN_DEVICE_FUNC inline void
2431 ptranspose(PacketBlock<Packet16uc,4>& kernel) {
2432  Packet16uc t0, t1, t2, t3;
2433  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2434  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2435  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2436  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2437  kernel.packet[0] = vec_mergeh(t0, t2);
2438  kernel.packet[1] = vec_mergel(t0, t2);
2439  kernel.packet[2] = vec_mergeh(t1, t3);
2440  kernel.packet[3] = vec_mergel(t1, t3);
2441 }
2442 
2443 EIGEN_DEVICE_FUNC inline void
2444 ptranspose(PacketBlock<Packet8s,8>& kernel) {
2445  Packet8s v[8], sum[8];
2446 
2447  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2448  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2449  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2450  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2451  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2452  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2453  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2454  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2455  sum[0] = vec_mergeh(v[0], v[4]);
2456  sum[1] = vec_mergel(v[0], v[4]);
2457  sum[2] = vec_mergeh(v[1], v[5]);
2458  sum[3] = vec_mergel(v[1], v[5]);
2459  sum[4] = vec_mergeh(v[2], v[6]);
2460  sum[5] = vec_mergel(v[2], v[6]);
2461  sum[6] = vec_mergeh(v[3], v[7]);
2462  sum[7] = vec_mergel(v[3], v[7]);
2463 
2464  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2465  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2466  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2467  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2468  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2469  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2470  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2471  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
2472 }
2473 
2474 EIGEN_DEVICE_FUNC inline void
2475 ptranspose(PacketBlock<Packet8us,8>& kernel) {
2476  Packet8us v[8], sum[8];
2477 
2478  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2479  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2480  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2481  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2482  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2483  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2484  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2485  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2486  sum[0] = vec_mergeh(v[0], v[4]);
2487  sum[1] = vec_mergel(v[0], v[4]);
2488  sum[2] = vec_mergeh(v[1], v[5]);
2489  sum[3] = vec_mergel(v[1], v[5]);
2490  sum[4] = vec_mergeh(v[2], v[6]);
2491  sum[5] = vec_mergel(v[2], v[6]);
2492  sum[6] = vec_mergeh(v[3], v[7]);
2493  sum[7] = vec_mergel(v[3], v[7]);
2494 
2495  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2496  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2497  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2498  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2499  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2500  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2501  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2502  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
2503 }
2504 
// In-place transpose of an 8x8 block of 16-bit bfloat16 lanes.
// Operates on the raw `m_val` vectors of the Packet8bf wrappers and applies
// three rounds of vec_mergeh/vec_mergel interleaving.
2505 EIGEN_DEVICE_FUNC inline void
2506 ptranspose(PacketBlock<Packet8bf,8>& kernel) {
 // NOTE: despite its name, `sum` only holds second-round merge results; nothing is added.
2507  Packet8bf v[8], sum[8];
2508 
 // Round 1: interleave input row i with input row i+4.
2509  v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
2510  v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
2511  v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
2512  v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
2513  v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
2514  v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
2515  v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
2516  v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
 // Round 2: interleave v[i] with v[i+4].
2517  sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
2518  sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
2519  sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
2520  sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
2521  sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
2522  sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
2523  sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
2524  sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
2525 
 // Round 3: interleave sum[i] with sum[i+4] and write the result back into the block.
2526  kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
2527  kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
2528  kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
2529  kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
2530  kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
2531  kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
2532  kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
2533  kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
2534 }
2535 
// In-place transpose of a 16x16 block of signed 8-bit lanes.
// Four rounds of vec_mergeh/vec_mergel interleaving are applied; each round
// pairs entry i of the previous stage with entry i+8 and emits the high/low
// merges as entries 2i and 2i+1 of the next stage.
2536 EIGEN_DEVICE_FUNC inline void
2537 ptranspose(PacketBlock<Packet16c,16>& kernel) {
2538  Packet16c step1[16], step2[16], step3[16];
2539 
 // Round 1: kernel.packet[i] with kernel.packet[i+8].
2540  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2541  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2542  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2543  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2544  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2545  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2546  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2547  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2548  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2549  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2550  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2551  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2552  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2553  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2554  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2555  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
2556 
 // Round 2: step1[i] with step1[i+8].
2557  step2[0] = vec_mergeh(step1[0], step1[8]);
2558  step2[1] = vec_mergel(step1[0], step1[8]);
2559  step2[2] = vec_mergeh(step1[1], step1[9]);
2560  step2[3] = vec_mergel(step1[1], step1[9]);
2561  step2[4] = vec_mergeh(step1[2], step1[10]);
2562  step2[5] = vec_mergel(step1[2], step1[10]);
2563  step2[6] = vec_mergeh(step1[3], step1[11]);
2564  step2[7] = vec_mergel(step1[3], step1[11]);
2565  step2[8] = vec_mergeh(step1[4], step1[12]);
2566  step2[9] = vec_mergel(step1[4], step1[12]);
2567  step2[10] = vec_mergeh(step1[5], step1[13]);
2568  step2[11] = vec_mergel(step1[5], step1[13]);
2569  step2[12] = vec_mergeh(step1[6], step1[14]);
2570  step2[13] = vec_mergel(step1[6], step1[14]);
2571  step2[14] = vec_mergeh(step1[7], step1[15]);
2572  step2[15] = vec_mergel(step1[7], step1[15]);
2573 
 // Round 3: step2[i] with step2[i+8].
2574  step3[0] = vec_mergeh(step2[0], step2[8]);
2575  step3[1] = vec_mergel(step2[0], step2[8]);
2576  step3[2] = vec_mergeh(step2[1], step2[9]);
2577  step3[3] = vec_mergel(step2[1], step2[9]);
2578  step3[4] = vec_mergeh(step2[2], step2[10]);
2579  step3[5] = vec_mergel(step2[2], step2[10]);
2580  step3[6] = vec_mergeh(step2[3], step2[11]);
2581  step3[7] = vec_mergel(step2[3], step2[11]);
2582  step3[8] = vec_mergeh(step2[4], step2[12]);
2583  step3[9] = vec_mergel(step2[4], step2[12]);
2584  step3[10] = vec_mergeh(step2[5], step2[13]);
2585  step3[11] = vec_mergel(step2[5], step2[13]);
2586  step3[12] = vec_mergeh(step2[6], step2[14]);
2587  step3[13] = vec_mergel(step2[6], step2[14]);
2588  step3[14] = vec_mergeh(step2[7], step2[15]);
2589  step3[15] = vec_mergel(step2[7], step2[15]);
2590 
 // Round 4: step3[i] with step3[i+8], written straight back into the block.
2591  kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2592  kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2593  kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2594  kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2595  kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2596  kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2597  kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2598  kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2599  kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2600  kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2601  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2602  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2603  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2604  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2605  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2606  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2607 }
2608 
// In-place transpose of a 16x16 block of unsigned 8-bit lanes.
// Four rounds of vec_mergeh/vec_mergel interleaving are applied; each round
// pairs entry i of the previous stage with entry i+8 and emits the high/low
// merges as entries 2i and 2i+1 of the next stage.
2609 EIGEN_DEVICE_FUNC inline void
2610 ptranspose(PacketBlock<Packet16uc,16>& kernel) {
2611  Packet16uc step1[16], step2[16], step3[16];
2612 
 // Round 1: kernel.packet[i] with kernel.packet[i+8].
2613  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2614  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2615  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2616  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2617  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2618  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2619  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2620  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2621  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2622  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2623  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2624  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2625  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2626  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2627  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2628  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
2629 
 // Round 2: step1[i] with step1[i+8].
2630  step2[0] = vec_mergeh(step1[0], step1[8]);
2631  step2[1] = vec_mergel(step1[0], step1[8]);
2632  step2[2] = vec_mergeh(step1[1], step1[9]);
2633  step2[3] = vec_mergel(step1[1], step1[9]);
2634  step2[4] = vec_mergeh(step1[2], step1[10]);
2635  step2[5] = vec_mergel(step1[2], step1[10]);
2636  step2[6] = vec_mergeh(step1[3], step1[11]);
2637  step2[7] = vec_mergel(step1[3], step1[11]);
2638  step2[8] = vec_mergeh(step1[4], step1[12]);
2639  step2[9] = vec_mergel(step1[4], step1[12]);
2640  step2[10] = vec_mergeh(step1[5], step1[13]);
2641  step2[11] = vec_mergel(step1[5], step1[13]);
2642  step2[12] = vec_mergeh(step1[6], step1[14]);
2643  step2[13] = vec_mergel(step1[6], step1[14]);
2644  step2[14] = vec_mergeh(step1[7], step1[15]);
2645  step2[15] = vec_mergel(step1[7], step1[15]);
2646 
 // Round 3: step2[i] with step2[i+8].
2647  step3[0] = vec_mergeh(step2[0], step2[8]);
2648  step3[1] = vec_mergel(step2[0], step2[8]);
2649  step3[2] = vec_mergeh(step2[1], step2[9]);
2650  step3[3] = vec_mergel(step2[1], step2[9]);
2651  step3[4] = vec_mergeh(step2[2], step2[10]);
2652  step3[5] = vec_mergel(step2[2], step2[10]);
2653  step3[6] = vec_mergeh(step2[3], step2[11]);
2654  step3[7] = vec_mergel(step2[3], step2[11]);
2655  step3[8] = vec_mergeh(step2[4], step2[12]);
2656  step3[9] = vec_mergel(step2[4], step2[12]);
2657  step3[10] = vec_mergeh(step2[5], step2[13]);
2658  step3[11] = vec_mergel(step2[5], step2[13]);
2659  step3[12] = vec_mergeh(step2[6], step2[14]);
2660  step3[13] = vec_mergel(step2[6], step2[14]);
2661  step3[14] = vec_mergeh(step2[7], step2[15]);
2662  step3[15] = vec_mergel(step2[7], step2[15]);
2663 
 // Round 4: step3[i] with step3[i+8], written straight back into the block.
2664  kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2665  kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2666  kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2667  kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2668  kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2669  kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2670  kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2671  kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2672  kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2673  kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2674  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2675  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2676  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2677  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2678  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2679  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2680 }
2681 
2682 template<typename Packet> EIGEN_STRONG_INLINE
2683 Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
2684  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
2685  Packet4ui mask = reinterpret_cast<Packet4ui>(pnegate(reinterpret_cast<Packet4i>(select)));
2686  return vec_sel(elsePacket, thenPacket, mask);
2687 }
2688 
2689 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
2690  return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
2691 }
2692 
2693 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
2694  return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
2695 }
2696 
2697 template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
2698  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2699  ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
2700  Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
2701  Packet8s result = vec_sel(elsePacket, thenPacket, mask);
2702  return result;
2703 }
2704 
2705 template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
2706  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2707  ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
2708  Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
2709  return vec_sel(elsePacket, thenPacket, mask);
2710 }
2711 
2712 template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) {
2713  return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
2714 }
2715 
2716 template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) {
2717  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2718  ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
2719  ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
2720  ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
2721 
2722  Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
2723  return vec_sel(elsePacket, thenPacket, mask);
2724 }
2725 
2726 template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
2727  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
2728  ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
2729  ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
2730  ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
2731 
2732  Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
2733  return vec_sel(elsePacket, thenPacket, mask);
2734 }
2735 
2736 
2737 //---------- double ----------
// Everything from here to the matching #endif is only compiled when VSX vectorization is enabled.
2738 #ifdef EIGEN_VECTORIZE_VSX
// 128-bit packets holding two 64-bit lanes.
2739 typedef __vector double Packet2d;
2740 typedef __vector unsigned long long Packet2ul;
2741 typedef __vector long long Packet2l;
// 64-bit boolean packet; under Clang the unsigned packet type is used instead of `__vector __bool long`.
2742 #if EIGEN_COMP_CLANG
2743 typedef Packet2ul Packet2bl;
2744 #else
2745 typedef __vector __bool long Packet2bl;
2746 #endif
2747 
// Two-lane constants, several of them bit-cast from the 4-lane constants defined earlier in the file.
2748 static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
// Only the top (sign) bit of each 64-bit lane is set.
2749 static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull };
// One less than the bit pattern of 0.5 (0x3FE0000000000000), i.e. the largest double below 0.5; used by pround.
2750 static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull };
2751 static Packet2d p2d_ONE = { 1.0, 1.0 };
2752 static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
// Negative zero in both lanes; used as the addend in pmul and as the XOR mask in the pnegate fallback.
2753 static Packet2d p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull),
2754  numext::bit_cast<double>(0x8000000000000000ull) };
2755 
// Per-lane offsets added by plset<Packet2d>. Built by splicing 8-byte halves of
// p2d_ZERO and p2d_ONE with vec_sld; the operand order is swapped by endianness.
2756 #ifdef _BIG_ENDIAN
2757 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
2758 #else
2759 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
2760 #endif
2761 
2762 template<int index> Packet2d vec_splat_dbl(Packet2d& a)
2763 {
2764  return vec_splat(a, index);
2765 }
2766 
2767 template<> struct packet_traits<double> : default_packet_traits
2768 {
2769  typedef Packet2d type;
2770  typedef Packet2d half;
2771  enum {
2772  Vectorizable = 1,
2773  AlignedOnScalar = 1,
2774  size=2,
2775 
2776  HasAdd = 1,
2777  HasSub = 1,
2778  HasMul = 1,
2779  HasDiv = 1,
2780  HasMin = 1,
2781  HasMax = 1,
2782  HasAbs = 1,
2783  HasSin = 0,
2784  HasCos = 0,
2785  HasATan = 0,
2786  HasLog = 0,
2787  HasExp = 1,
2788  HasSqrt = 1,
2789 #if !EIGEN_COMP_CLANG
2790  HasRsqrt = 1,
2791 #else
2792  HasRsqrt = 0,
2793 #endif
2794  HasRound = 1,
2795  HasFloor = 1,
2796  HasCeil = 1,
2797  HasRint = 1,
2798  HasNegate = 1,
2799  HasBlend = 1
2800  };
2801 };
2802 
2803 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
2804 
2805 inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
2806 {
2807  union {
2808  Packet2l v;
2809  int64_t n[2];
2810  } vt;
2811  vt.v = v;
2812  s << vt.n[0] << ", " << vt.n[1];
2813  return s;
2814 }
2815 
2816 inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
2817 {
2818  union {
2819  Packet2d v;
2820  double n[2];
2821  } vt;
2822  vt.v = v;
2823  s << vt.n[0] << ", " << vt.n[1];
2824  return s;
2825 }
2826 
2827 // Need to define them first or we get specialization after instantiation errors
2828 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
2829 {
2831  return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
2832 }
2833 
2834 template<> EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset)
2835 {
2836  return pload_partial_common<Packet2d>(from, n, offset);
2837 }
2838 
2839 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
2840 {
2842  vec_xst(from, 0, to);
2843 }
2844 
2845 template<> EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset)
2846 {
2847  pstore_partial_common<Packet2d>(to, from, n, offset);
2848 }
2849 
2850 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
2851  Packet2d v = {from, from};
2852  return v;
2853 }
2854 
2855 template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
2856  Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
2857  return reinterpret_cast<Packet2d>(v);
2858 }
2859 
2860 template<> EIGEN_STRONG_INLINE void
2861 pbroadcast4<Packet2d>(const double *a,
2862  Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
2863 {
2864  //This way is faster than vec_splat (at least for doubles in Power 9)
2865  a0 = pset1<Packet2d>(a[0]);
2866  a1 = pset1<Packet2d>(a[1]);
2867  a2 = pset1<Packet2d>(a[2]);
2868  a3 = pset1<Packet2d>(a[3]);
2869 }
2870 
2871 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
2872 {
2873  return pgather_common<Packet2d>(from, stride);
2874 }
2875 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride, const Index n)
2876 {
2877  return pgather_common<Packet2d>(from, stride, n);
2878 }
2879 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
2880 {
2881  pscatter_common<Packet2d>(to, from, stride);
2882 }
2883 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from, Index stride, const Index n)
2884 {
2885  pscatter_common<Packet2d>(to, from, stride, n);
2886 }
2887 
2888 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
2889 
2890 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
2891 
2892 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
2893 
2894 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
2895 {
2896 #ifdef __POWER8_VECTOR__
2897  return vec_neg(a);
2898 #else
2899  return vec_xor(a, p2d_MZERO);
2900 #endif
2901 }
2902 
2903 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
2904 
2905 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
2906 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
2907 
2908 // for some weird raisons, it has to be overloaded for packet of integers
2909 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
2910 template<> EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_msub(a,b,c); }
2911 template<> EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_nmsub(a,b,c); }
2912 template<> EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_nmadd(a,b,c); }
2913 
2914 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
2915 {
2916  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
2917  Packet2d ret;
2918  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
2919  return ret;
2920  }
2921 
2922 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
2923 {
2924  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
2925  Packet2d ret;
2926  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
2927  return ret;
2928 }
2929 
2930 template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
2931 template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
2932 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
2933 template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
2934  Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
2935  return vec_nor(c,c);
2936 }
2937 
2938 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
2939 
2940 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
2941 
2942 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
2943 
2944 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
2945 
2946 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
2947 {
2948  Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
2949  Packet2d res;
2950 
2951  __asm__("xvrdpiz %x0, %x1\n\t"
2952  : "=&wa" (res)
2953  : "wa" (t));
2954 
2955  return res;
2956 }
2957 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
2958 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
2959 template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
2960 {
2961  Packet2d res;
2962 
2963  __asm__("xvrdpic %x0, %x1\n\t"
2964  : "=&wa" (res)
2965  : "wa" (a));
2966 
2967  return res;
2968 }
2969 
2970 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
2971 {
2973  return vec_xl(0, const_cast<double*>(from));
2974 }
2975 
2976 template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset)
2977 {
2978  return ploadu_partial_common<Packet2d>(from, n, offset);
2979 }
2980 
2981 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
2982 {
2983  Packet2d p;
2984  if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
2985  else p = ploadu<Packet2d>(from);
2986  return vec_splat_dbl<0>(p);
2987 }
2988 
2989 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
2990 {
2992  vec_xst(from, 0, to);
2993 }
2994 
2995 template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset)
2996 {
2997  pstoreu_partial_common<Packet2d>(to, from, n, offset);
2998 }
2999 
3000 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
3001 
3002 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
3003 
3004 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
3005 {
3006  return vec_sld(a, a, 8);
3007 }
3008 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
3009 #ifdef __POWER8_VECTOR__
3010 template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) { return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63))); }
3011 #else
3012 #ifdef _BIG_ENDIAN
3013 static Packet16uc p16uc_DUPSIGN = { 0,0,0,0, 0,0,0,0, 8,8,8,8, 8,8,8,8 };
3014 #else
3015 static Packet16uc p16uc_DUPSIGN = { 7,7,7,7, 7,7,7,7, 15,15,15,15, 15,15,15,15 };
3016 #endif
3017 
3018 template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a)
3019 {
3020  Packet16c tmp = vec_sra(reinterpret_cast<Packet16c>(a), vec_splats((unsigned char)(7)));
3021  return reinterpret_cast<Packet2d>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
3022 }
3023 #endif
3024 
// Forward declarations of the Packet2d <-> Packet2l conversions, so that
// pldexp and the biased-exponent helper further down can call them.
// The definitions are not in this part of the file.
3025 template<> inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);
3026 
3027 template<> inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
3028 
3029 // Packet2l shifts.
3030 // For POWER8 we simply use vec_sr/l.
3031 //
3032 // Things are more complicated for POWER7. There is actually a
3033 // vec_xxsxdi intrinsic but it is not supported by some gcc versions.
3034 // So we need to shift by N % 32 and rearrange bytes.
3035 #ifdef __POWER8_VECTOR__
3036 
3037 template<int N>
3038 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
3039  const Packet2ul shift = { N, N };
3040  return vec_sl(a, shift);
3041 }
3042 
3043 template<int N>
3044 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
3045  const Packet2ul shift = { N, N };
3046  return vec_sr(a, shift);
3047 }
3048 
3049 #else
3050 
3051 // Shifts [A, B, C, D] to [B, 0, D, 0].
3052 // Used to implement left shifts for Packet2l.
3053 EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
3054  static const Packet16uc perm = {
3055  0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
3056  0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
3057  #ifdef _BIG_ENDIAN
3058  return vec_perm(p4i_ZERO, a, perm);
3059  #else
3060  return vec_perm(a, p4i_ZERO, perm);
3061  #endif
3062 }
3063 
3064 // Shifts [A, B, C, D] to [0, A, 0, C].
3065 // Used to implement right shifts for Packet2l.
3066 EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
3067  static const Packet16uc perm = {
3068  0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
3069  0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
3070  #ifdef _BIG_ENDIAN
3071  return vec_perm(p4i_ZERO, a, perm);
3072  #else
3073  return vec_perm(a, p4i_ZERO, perm);
3074  #endif
3075 }
3076 
3077 template<int N, typename EnableIf = void>
3078 struct plogical_shift_left_impl;
3079 
3080 template<int N>
3081 struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)>> {
3082  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3083  static const unsigned n = static_cast<unsigned>(N);
3084  const Packet4ui shift = {n, n, n, n};
3085  const Packet4i ai = reinterpret_cast<Packet4i>(a);
3086  static const unsigned m = static_cast<unsigned>(32 - N);
3087  const Packet4ui shift_right = {m, m, m, m};
3088  const Packet4i out_hi = vec_sl(ai, shift);
3089  const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
3090  return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
3091  }
3092 };
3093 
3094 template<int N>
3095 struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)>> {
3096  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3097  static const unsigned m = static_cast<unsigned>(N - 32);
3098  const Packet4ui shift = {m, m, m, m};
3099  const Packet4i ai = reinterpret_cast<Packet4i>(a);
3100  return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
3101  }
3102 };
3103 
3104 template<int N>
3105 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
3106  return plogical_shift_left_impl<N>::run(a);
3107 }
3108 
3109 template<int N, typename EnableIf = void>
3110 struct plogical_shift_right_impl;
3111 
3112 template<int N>
3113 struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)>> {
3114  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3115  static const unsigned n = static_cast<unsigned>(N);
3116  const Packet4ui shift = {n, n, n, n};
3117  const Packet4i ai = reinterpret_cast<Packet4i>(a);
3118  static const unsigned m = static_cast<unsigned>(32 - N);
3119  const Packet4ui shift_left = {m, m, m, m};
3120  const Packet4i out_lo = vec_sr(ai, shift);
3121  const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
3122  return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
3123  }
3124 };
3125 
3126 template<int N>
3127 struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)>> {
3128  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3129  static const unsigned m = static_cast<unsigned>(N - 32);
3130  const Packet4ui shift = {m, m, m, m};
3131  const Packet4i ai = reinterpret_cast<Packet4i>(a);
3132  return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
3133  }
3134 };
3135 
3136 template<int N>
3137 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
3138  return plogical_shift_right_impl<N>::run(a);
3139 }
3140 #endif
3141 
3142 template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
3143  // Clamp exponent to [-2099, 2099]
3144  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
3145  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
3146 
3147  // Split 2^e into four factors and multiply:
3148  const Packet2l bias = { 1023, 1023 };
3149  Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
3150  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
3151  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
3152  b = psub(psub(psub(e, b), b), b); // e - 3b
3153  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
3154  out = pmul(out, c); // a * 2^e
3155  return out;
3156 }
3157 
3158 
3159 // Extract exponent without existence of Packet2l.
3160 template<>
3161 EIGEN_STRONG_INLINE
3163  return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
3164 }
3165 
3166 template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
3167  return pfrexp_generic(a, exponent);
3168 }
3169 
3170 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
3171 {
3172  Packet2d b, sum;
3173  b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
3174  sum = a + b;
3175  return pfirst<Packet2d>(sum);
3176 }
3177 
3178 // Other reduction functions:
3179 // mul
3180 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
3181 {
3182  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3183 }
3184 
3185 // min
3186 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
3187 {
3188  return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3189 }
3190 
3191 // max
3192 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
3193 {
3194  return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3195 }
3196 
3197 EIGEN_DEVICE_FUNC inline void
3198 ptranspose(PacketBlock<Packet2d,2>& kernel) {
3199  Packet2d t0, t1;
3200  t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
3201  t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
3202  kernel.packet[0] = t0;
3203  kernel.packet[1] = t1;
3204 }
3205 
3206 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
3207  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
3208  Packet2ul mask = reinterpret_cast<Packet2ul>(pnegate(reinterpret_cast<Packet2l>(select)));
3209  return vec_sel(elsePacket, thenPacket, mask);
3210 }
3211 
3212 
3213 #endif // EIGEN_VECTORIZE_VSX
3214 } // end namespace internal
3215 
3216 } // end namespace Eigen
3217 
3218 #endif // EIGEN_PACKET_MATH_ALTIVEC_H
#define EIGEN_PPC_PREFETCH(ADDR)
#define LOAD_STORE_UNROLL_16
#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A)
#define __VEC_CLASS_FP_NAN
#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B)
#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B)
Matrix3f m
Array< int, Dynamic, 1 > v
Array< int, 3, 1 > b
int n
#define EIGEN_ALIGN16
Array< double, 1, 3 > e(1./3., 0.5, 2.)
Array33i c
#define EIGEN_DEBUG_ALIGNED_STORE
#define EIGEN_DEBUG_ALIGNED_LOAD
#define EIGEN_DEBUG_UNALIGNED_STORE
#define EIGEN_DEBUG_UNALIGNED_LOAD
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:836
#define eigen_internal_assert(x)
Definition: Macros.h:908
#define EIGEN_UNUSED_VARIABLE(var)
Definition: Macros.h:957
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:883
#define eigen_assert(x)
Definition: Macros.h:902
#define EIGEN_FAST_MATH
Definition: Macros.h:50
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
float * p
@ Aligned16
Definition: Constants.h:237
bfloat16() max(const bfloat16 &a, const bfloat16 &b)
Definition: BFloat16.h:690
EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value)
bfloat16() min(const bfloat16 &a, const bfloat16 &b)
Definition: BFloat16.h:684
EIGEN_ALWAYS_INLINE Packet4f pload_partial< Packet4f >(const float *from, const Index n, const Index offset)
Packet4f pcmp_lt< Packet4f >(const Packet4f &a, const Packet4f &b)
std::ostream & operator<<(std::ostream &s, const Packet16c &v)
void pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a, Packet &a0, Packet &a1, Packet &a2, Packet &a3)
signed char pfirst< Packet16c >(const Packet16c &a)
EIGEN_ALWAYS_INLINE void pstore_partial< signed char >(signed char *to, const Packet16c &from, const Index n, const Index offset)
Packet pmin(const Packet &a, const Packet &b)
Packet8us pand< Packet8us >(const Packet8us &a, const Packet8us &b)
Packet2d pdiv< Packet2d >(const Packet2d &a, const Packet2d &b)
Packet pnmsub(const Packet &a, const Packet &b, const Packet &c)
Packet8bf pldexp< Packet8bf >(const Packet8bf &a, const Packet8bf &exponent)
Packet16uc psub< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
Packet8us pandnot< Packet8us >(const Packet8us &a, const Packet8us &b)
Packet16uc ploadu< Packet16uc >(const unsigned char *from)
Packet8s pmax< Packet8s >(const Packet8s &a, const Packet8s &b)
EIGEN_ALWAYS_INLINE void pstore_partial< unsigned short int >(unsigned short int *to, const Packet8us &from, const Index n, const Index offset)
double predux_max< Packet2d >(const Packet2d &a)
Packet8f pzero(const Packet8f &)
void pstore(Scalar *to, const Packet &from)
Packet8bf ploadquad< Packet8bf >(const bfloat16 *from)
EIGEN_ALWAYS_INLINE void pscatter< unsigned char, Packet16uc >(unsigned char *to, const Packet16uc &from, Index stride)
void pstore< float >(float *to, const Packet4f &from)
static Packet8us p8us_COUNTDOWN
Packet2d pset1frombits< Packet2d >(uint64_t from)
float predux_max< Packet4f >(const Packet4f &a)
double predux_min< Packet2d >(const Packet2d &a)
EIGEN_ALWAYS_INLINE void pstore_partial< int >(int *to, const Packet4i &from, const Index n, const Index offset)
static Packet4i p4i_COUNTDOWN
Packet2d plset< Packet2d >(const double &a)
EIGEN_ALWAYS_INLINE void pscatter_partial< float, Packet4f >(float *to, const Packet4f &from, Index stride, const Index n)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet &x)
static Packet4f p4f_ONE
__vector int Packet4i
EIGEN_ALWAYS_INLINE Packet16c pload_partial< Packet16c >(const signed char *from, const Index n, const Index offset)
Packet2d pmin< Packet2d >(const Packet2d &a, const Packet2d &b)
Packet2d pmul< Packet2d >(const Packet2d &a, const Packet2d &b)
unpacket_traits< Packet >::type predux(const Packet &a)
Packet16c pload< Packet16c >(const signed char *from)
Packet4f pcmp_lt_or_nan(const Packet4f &a, const Packet4f &b)
static Packet16uc p16uc_MERGEO16
static Packet8s p8s_COUNTDOWN
void pstore< int >(int *to, const Packet4i &from)
static Packet16uc p16uc_REVERSE8
static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0)
EIGEN_ALWAYS_INLINE Packet16uc pload_partial< Packet16uc >(const unsigned char *from, const Index n, const Index offset)
Packet4f pxor< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet2d pfrexp< Packet2d >(const Packet2d &a, Packet2d &exponent)
__vector unsigned char Packet16uc
Packet8bf F32ToBf16(Packet4f p4f)
static Packet16uc p16uc_MERGEE16
EIGEN_ALWAYS_INLINE void pstore_partial< unsigned char >(unsigned char *to, const Packet16uc &from, const Index n, const Index offset)
Packet16uc plset< Packet16uc >(const unsigned char &a)
static Packet16uc p16uc_REVERSE16
EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet) *from)
Packet8us ploaddup< Packet8us >(const unsigned short int *from)
void pstoreu< signed char >(signed char *to, const Packet16c &from)
EIGEN_ALWAYS_INLINE void pscatter< signed char, Packet16c >(signed char *to, const Packet16c &from, Index stride)
static Packet16c p16c_COUNTDOWN
short int predux_mul< Packet8s >(const Packet8s &a)
Packet8s padd< Packet8s >(const Packet8s &a, const Packet8s &b)
Packet8us por< Packet8us >(const Packet8us &a, const Packet8us &b)
Packet8s ploaddup< Packet8s >(const short int *from)
Packet16uc pset1< Packet16uc >(const unsigned char &from)
int predux< Packet4i >(const Packet4i &a)
Packet4f pmin< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet8bf pnegate< Packet8bf >(const Packet8bf &a)
Packet8s ploadu< Packet8s >(const short int *from)
Packet pload_common(const __UNPACK_TYPE__(Packet) *from)
float predux_min< Packet4f >(const Packet4f &a)
Packet4f pset1frombits< Packet4f >(unsigned int from)
Packet2d pceil< Packet2d >(const Packet2d &a)
EIGEN_ALWAYS_INLINE void pstoreu_partial< signed char >(signed char *to, const Packet16c &from, const Index n, const Index offset)
EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet) *to, const Packet &from, Index stride, const Index n=unpacket_traits< Packet >::size)
EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi)
Packet4f por< Packet4f >(const Packet4f &a, const Packet4f &b)
EIGEN_ALWAYS_INLINE Packet4i pgather< int, Packet4i >(const int *from, Index stride)
Packet4f Bf16ToF32Even(const Packet8bf &bf)
EIGEN_ALWAYS_INLINE void pstoreu_partial< bfloat16 >(bfloat16 *to, const Packet8bf &from, const Index n, const Index offset)
EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet) *to, const Packet &from, const Index n, const Index offset)
Packet8bf pdiv< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd)
EIGEN_ALWAYS_INLINE void pscatter_partial< int, Packet4i >(int *to, const Packet4i &from, Index stride, const Index n)
bfloat16 predux_max< Packet8bf >(const Packet8bf &a)
EIGEN_ALWAYS_INLINE void pstoreu_partial< int >(int *to, const Packet4i &from, const Index n, const Index offset)
EIGEN_ALWAYS_INLINE void pstore_partial< bfloat16 >(bfloat16 *to, const Packet8bf &from, const Index n, const Index offset)
static Packet16uc p16uc_TRANSPOSE64_LO
Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd)
Packet4f pandnot< Packet4f >(const Packet4f &a, const Packet4f &b)
EIGEN_ALWAYS_INLINE Packet8s pgather< short int, Packet8s >(const short int *from, Index stride)
Packet4f pldexp< Packet4f >(const Packet4f &a, const Packet4f &exponent)
void pstoreu< short int >(short int *to, const Packet8s &from)
Packet4f pabs(const Packet4f &a)
static Packet16uc p16uc_REVERSE32
__vector unsigned short int Packet8us
static Packet16uc p16uc_MERGEL16
Packet pmax(const Packet &a, const Packet &b)
Packet8s psub< Packet8s >(const Packet8s &a, const Packet8s &b)
EIGEN_ALWAYS_INLINE Packet16c pgather< signed char, Packet16c >(const signed char *from, Index stride)
float predux< Packet4f >(const Packet4f &a)
Packet4f plset< Packet4f >(const float &a)
Packet8s ploadquad< Packet8s >(const short int *from)
unsigned short int predux< Packet8us >(const Packet8us &a)
__vector __bool short Packet8bi
Packet ploaddup_common(const __UNPACK_TYPE__(Packet) *from)
Packet2cf pnegate(const Packet2cf &a)
void pstore< signed char >(signed char *to, const Packet16c &from)
void pstoreu< unsigned short int >(unsigned short int *to, const Packet8us &from)
double predux< Packet2d >(const Packet2d &a)
Packet2d pround< Packet2d >(const Packet2d &a)
EIGEN_ALWAYS_INLINE void pstoreu_partial< unsigned char >(unsigned char *to, const Packet16uc &from, const Index n, const Index offset)
Packet8bf pfloor< Packet8bf >(const Packet8bf &a)
Packet4f pand< Packet4f >(const Packet4f &a, const Packet4f &b)
unsigned short int predux_min< Packet8us >(const Packet8us &a)
Packet8s pset1< Packet8s >(const short int &from)
Packet16c plset< Packet16c >(const signed char &a)
EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial< Packet8bf >(const bfloat16 *from, const Index n, const Index offset)
EIGEN_ALWAYS_INLINE Packet8s pload_partial< Packet8s >(const short int *from, const Index n, const Index offset)
Packet16uc pmax< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
int predux_max< Packet4i >(const Packet4i &a)
Packet8bf print< Packet8bf >(const Packet8bf &a)
Packet4f pcmp_eq< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet8bf pload< Packet8bf >(const bfloat16 *from)
Packet4i plogical_shift_right(const Packet4i &a)
signed char predux< Packet16c >(const Packet16c &a)
Packet8us ploadquad< Packet8us >(const unsigned short int *from)
Packet pset1_size16(const __UNPACK_TYPE__(Packet)&from)
signed char predux_mul< Packet16c >(const Packet16c &a)
Packet4f pcmp_le< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet2d pand< Packet2d >(const Packet2d &a, const Packet2d &b)
EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet) *to, const Packet &from, const Index n, const Index offset)
double predux_mul< Packet2d >(const Packet2d &a)
Packet4f pdiv< Packet4f >(const Packet4f &a, const Packet4f &b)
static Packet16uc p16uc_PSET32_WEVEN
void pstore< unsigned short int >(unsigned short int *to, const Packet8us &from)
Packet16uc pmin< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
__UNPACK_TYPE__(Packet) pfirst_common(const Packet &a)
Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
EIGEN_ALWAYS_INLINE Packet16uc pgather_partial< unsigned char, Packet16uc >(const unsigned char *from, Index stride, const Index n)
Packet16c pmul< Packet16c >(const Packet16c &a, const Packet16c &b)
EIGEN_ALWAYS_INLINE Packet4i pload_partial< Packet4i >(const int *from, const Index n, const Index offset)
EIGEN_ALWAYS_INLINE Packet8us pgather_partial< unsigned short int, Packet8us >(const unsigned short int *from, Index stride, const Index n)
Packet8bf pmul< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet &x)
Packet2d ploaddup< Packet2d >(const double *from)
static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u)
void pstoreu(Scalar *to, const Packet &from)
Packet4i pmul< Packet4i >(const Packet4i &a, const Packet4i &b)
__vector signed char Packet16c
Packet4ui padd< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Packet2cf pcmp_eq(const Packet2cf &a, const Packet2cf &b)
bfloat16 pfirst(const Packet8bf &a)
Packet16c padd< Packet16c >(const Packet16c &a, const Packet16c &b)
Packet2d pgather< double, Packet2d >(const double *from, Index stride)
Packet8us padd< Packet8us >(const Packet8us &a, const Packet8us &b)
void pstoreu_partial(Scalar *to, const Packet &from, const Index n, const Index offset=0)
Packet8s pmin< Packet8s >(const Packet8s &a, const Packet8s &b)
Packet ploadu_common(const __UNPACK_TYPE__(Packet) *from)
void pstoreu< double >(double *to, const Packet4d &from)
Packet4f pselect(const Packet4f &mask, const Packet4f &a, const Packet4f &b)
Packet8bf plog< Packet8bf >(const Packet8bf &a)
EIGEN_ALWAYS_INLINE void pscatter< int, Packet4i >(int *to, const Packet4i &from, Index stride)
Packet16c ploaddup< Packet16c >(const signed char *from)
__vector unsigned int Packet4ui
Packet16uc ploaddup< Packet16uc >(const unsigned char *from)
Packet pmul(const Packet &a, const Packet &b)
__vector __bool int Packet4bi
void ptranspose(PacketBlock< Packet2cf, 2 > &kernel)
Packet4d pfrexp_generic_get_biased_exponent(const Packet4d &a)
Packet pmsub(const Packet &a, const Packet &b, const Packet &c)
Packet8bf psub< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet4i ploadu< Packet4i >(const int *from)
signed char predux_max< Packet16c >(const Packet16c &a)
float pfirst< Packet4f >(const Packet4f &a)
Packet8bf pceil< Packet8bf >(const Packet8bf &a)
Packet pfrexp_generic(const Packet &a, Packet &exponent)
EIGEN_ALWAYS_INLINE void pscatter_partial< bfloat16, Packet8bf >(bfloat16 *to, const Packet8bf &from, Index stride, const Index n)
unsigned char predux< Packet16uc >(const Packet16uc &a)
double pfirst< Packet2d >(const Packet2d &a)
Packet pldexp_generic(const Packet &a, const Packet &exponent)
Packet4i pxor< Packet4i >(const Packet4i &a, const Packet4i &b)
bfloat16 predux_mul< Packet8bf >(const Packet8bf &a)
Packet8bf por< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
void pstoreu< bfloat16 >(bfloat16 *to, const Packet8bf &from)
eigen_packet_wrapper< __vector unsigned short int, 0 > Packet8bf
Packet16c pmin< Packet16c >(const Packet16c &a, const Packet16c &b)
Packet4i pandnot< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet8bf pmin< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet4i pset1< Packet4i >(const int &from)
static Packet16uc p16uc_HALF64_0_16
Packet8s pload< Packet8s >(const short int *from)
Packet16c pset1< Packet16c >(const signed char &from)
Packet4f padd< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet4i pload< Packet4i >(const int *from)
Packet4f ploadu< Packet4f >(const float *from)
EIGEN_ALWAYS_INLINE Packet4f pgather_partial< float, Packet4f >(const float *from, Index stride, const Index n)
float predux_mul< Packet4f >(const Packet4f &a)
Packet2d padd< Packet2d >(const Packet2d &a, const Packet2d &b)
short int pfirst< Packet8s >(const Packet8s &a)
unsigned char predux_min< Packet16uc >(const Packet16uc &a)
Packet2d pandnot< Packet2d >(const Packet2d &a, const Packet2d &b)
EIGEN_ALWAYS_INLINE void pscatter< float, Packet4f >(float *to, const Packet4f &from, Index stride)
Packet8us plset< Packet8us >(const unsigned short int &a)
Packet pnmadd(const Packet &a, const Packet &b, const Packet &c)
Packet8bf pxor< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
__vector short int Packet8s
void pbroadcast4< Packet4i >(const int *a, Packet4i &a0, Packet4i &a1, Packet4i &a2, Packet4i &a3)
EIGEN_ALWAYS_INLINE void pstoreu_partial< float >(float *to, const Packet4f &from, const Index n, const Index offset)
Packet psub(const Packet &a, const Packet &b)
static Packet4f p4f_MZERO
bfloat16 predux_min< Packet8bf >(const Packet8bf &a)
void pstore< unsigned char >(unsigned char *to, const Packet16uc &from)
void pstore_common(__UNPACK_TYPE__(Packet) *to, const Packet &from)
Packet4i ploaddup< Packet4i >(const int *from)
void prefetch< float >(const float *addr)
void prefetch< double >(const double *addr)
Packet8us pset1< Packet8us >(const unsigned short int &from)
Packet4f pmul< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet2d pset1< Packet2d >(const double &from)
Packet4i pdiv< Packet4i >(const Packet4i &a, const Packet4i &b)
bfloat16 predux< Packet8bf >(const Packet8bf &a)
Packet4f pfrexp< Packet4f >(const Packet4f &a, Packet4f &exponent)
Packet2d ploadu< Packet2d >(const double *from)
Packet8bf pfrexp< Packet8bf >(const Packet8bf &a, Packet8bf &e)
Packet8us pmul< Packet8us >(const Packet8us &a, const Packet8us &b)
short int predux_min< Packet8s >(const Packet8s &a)
Packet4i plset< Packet4i >(const int &a)
Packet16uc pload< Packet16uc >(const unsigned char *from)
EIGEN_ALWAYS_INLINE Packet8bf pgather_partial< bfloat16, Packet8bf >(const bfloat16 *from, Index stride, const Index n)
Packet8bf pexp< Packet8bf >(const Packet8bf &a)
Packet16c ploadu< Packet16c >(const signed char *from)
EIGEN_ALWAYS_INLINE void pstoreu_partial< short int >(short int *to, const Packet8s &from, const Index n, const Index offset)
void pstoreu< float >(float *to, const Packet4f &from)
static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0)
unsigned char predux_max< Packet16uc >(const Packet16uc &a)
Packet16c pmax< Packet16c >(const Packet16c &a, const Packet16c &b)
static Packet2d p2d_ONE
EIGEN_ALWAYS_INLINE void pstore_partial< short int >(short int *to, const Packet8s &from, const Index n, const Index offset)
EIGEN_ALWAYS_INLINE void pscatter_partial< unsigned short int, Packet8us >(unsigned short int *to, const Packet8us &from, Index stride, const Index n)
Packet8s pmul< Packet8s >(const Packet8s &a, const Packet8s &b)
EIGEN_ALWAYS_INLINE void pscatter_partial< signed char, Packet16c >(signed char *to, const Packet16c &from, Index stride, const Index n)
void pstoreu_common(__UNPACK_TYPE__(Packet) *to, const Packet &from)
EIGEN_ALWAYS_INLINE Packet16uc pgather< unsigned char, Packet16uc >(const unsigned char *from, Index stride)
Packet2d psub< Packet2d >(const Packet2d &a, const Packet2d &b)
Packet2d pfloor< Packet2d >(const Packet2d &a)
static Packet16uc p16uc_PSET64_HI
Packet8s plset< Packet8s >(const short int &a)
EIGEN_ALWAYS_INLINE Packet4f pgather< float, Packet4f >(const float *from, Index stride)
EIGEN_ALWAYS_INLINE Packet8us pgather< unsigned short int, Packet8us >(const unsigned short int *from, Index stride)
Packet2d pldexp< Packet2d >(const Packet2d &a, const Packet2d &exponent)
void ptranpose_common(PacketBlock< T, 4 > &kernel)
EIGEN_ALWAYS_INLINE Packet4i pgather_partial< int, Packet4i >(const int *from, Index stride, const Index n)
Packet8bf plset< Packet8bf >(const bfloat16 &a)
Packet8bf padd< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet8us psub< Packet8us >(const Packet8us &a, const Packet8us &b)
Packet8bf ploadu< Packet8bf >(const bfloat16 *from)
Packet16c psub< Packet16c >(const Packet16c &a, const Packet16c &b)
int predux_min< Packet4i >(const Packet4i &a)
Packet8us pmax< Packet8us >(const Packet8us &a, const Packet8us &b)
static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1)
Packet4i padd< Packet4i >(const Packet4i &a, const Packet4i &b)
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x)
Packet4f Bf16ToF32Odd(const Packet8bf &bf)
Packet4i psub< Packet4i >(const Packet4i &a, const Packet4i &b)
EIGEN_ALWAYS_INLINE Packet8bf pload_partial< Packet8bf >(const bfloat16 *from, const Index n, const Index offset)
Packet8bf pset1< Packet8bf >(const bfloat16 &from)
EIGEN_ALWAYS_INLINE void pscatter< unsigned short int, Packet8us >(unsigned short int *to, const Packet8us &from, Index stride)
Packet2d pxor< Packet2d >(const Packet2d &a, const Packet2d &b)
unsigned short int pfirst< Packet8us >(const Packet8us &a)
EIGEN_ALWAYS_INLINE void pscatter_partial< unsigned char, Packet16uc >(unsigned char *to, const Packet16uc &from, Index stride, const Index n)
static Packet16uc p16uc_COUNTDOWN
Packet8bf psignbit(const Packet8bf &a)
int predux_mul< Packet4i >(const Packet4i &a)
void pstoreu< unsigned char >(unsigned char *to, const Packet16uc &from)
Packet4i pblend(const Selector< 4 > &ifPacket, const Packet4i &thenPacket, const Packet4i &elsePacket)
static Packet16uc p16uc_FORWARD
int pfirst< Packet4i >(const Packet4i &a)
EIGEN_ALWAYS_INLINE Packet4i ploadu_partial< Packet4i >(const int *from, const Index n, const Index offset)
void pstore< short int >(short int *to, const Packet8s &from)
Packet4i pmax< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet4i por< Packet4i >(const Packet4i &a, const Packet4i &b)
Packet2cf pconj(const Packet2cf &a)
EIGEN_ALWAYS_INLINE void pscatter< bfloat16, Packet8bf >(bfloat16 *to, const Packet8bf &from, Index stride)
Packet8bf psin< Packet8bf >(const Packet8bf &a)
Packet2d por< Packet2d >(const Packet2d &a, const Packet2d &b)
Packet pset1_size4(const __UNPACK_TYPE__(Packet)&from)
void pstoreu< int >(int *to, const Packet4i &from)
short int predux< Packet8s >(const Packet8s &a)
EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo)
unsigned char pfirst< Packet16uc >(const Packet16uc &a)
Packet8bf pcos< Packet8bf >(const Packet8bf &a)
Packet8bf pmax< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet4f pmax< Packet4f >(const Packet4f &a, const Packet4f &b)
Packet4i pmin< Packet4i >(const Packet4i &a, const Packet4i &b)
EIGEN_ALWAYS_INLINE void pstoreu_partial< unsigned short int >(unsigned short int *to, const Packet8us &from, const Index n, const Index offset)
Packet4i pand< Packet4i >(const Packet4i &a, const Packet4i &b)
static const Packet16uc p16uc_DUPLICATE16_EVEN
Packet8us pxor< Packet8us >(const Packet8us &a, const Packet8us &b)
EIGEN_ALWAYS_INLINE Packet4f ploadu_partial< Packet4f >(const float *from, const Index n, const Index offset)
Packet2d pload< Packet2d >(const double *from)
Packet8bf pand< Packet8bf >(const Packet8bf &a, const Packet8bf &b)
Packet4f psub< Packet4f >(const Packet4f &a, const Packet4f &b)
static Packet2d p2d_COUNTDOWN
Packet4i plogical_shift_left(const Packet4i &a)
static Packet16uc p16uc_PSET32_WODD
EIGEN_ALWAYS_INLINE Packet8bf pload_ignore< Packet8bf >(const bfloat16 *from)
Packet2cf preverse(const Packet2cf &a)
Packet4i parithmetic_shift_right(const Packet4i &a)
EIGEN_ALWAYS_INLINE Packet8us pload_partial< Packet8us >(const unsigned short int *from, const Index n, const Index offset)
static Packet16uc p16uc_PSET64_LO
EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial< Packet16uc >(const unsigned char *from, const Index n, const Index offset)
Packet pblend4(const Selector< 4 > &ifPacket, const Packet &thenPacket, const Packet &elsePacket)
Packet4f pload< Packet4f >(const float *from)
EIGEN_ALWAYS_INLINE void pscatter_partial< short int, Packet8s >(short int *to, const Packet8s &from, Index stride, const Index n)
Packet4f pcmp_lt_or_nan< Packet4f >(const Packet4f &a, const Packet4f &b)
static Packet16uc p16uc_COMPLEX32_REV
Packet4f pset1< Packet4f >(const float &from)
Packet4i pcmp_lt(const Packet4i &a, const Packet4i &b)
Packet4f ploaddup< Packet4f >(const float *from)
static Packet16uc p16uc_QUADRUPLICATE16_HI
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x)
unsigned char predux_mul< Packet16uc >(const Packet16uc &a)
static const Packet16uc p16uc_DUPLICATE16_ODD
EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet) *from, const Index n, const Index offset)
EIGEN_ALWAYS_INLINE void pstore_partial< float >(float *to, const Packet4f &from, const Index n, const Index offset)
EIGEN_ALWAYS_INLINE Packet8s ploadu_partial< Packet8s >(const short int *from, const Index n, const Index offset)
__vector float Packet4f
Packet pset1_size8(const __UNPACK_TYPE__(Packet)&from)
static Packet4f p4f_COUNTDOWN
Packet4f pround< Packet4f >(const Packet4f &a)
Packet4f pfloor< Packet4f >(const Packet4f &a)
short int predux_max< Packet8s >(const Packet8s &a)
Packet16uc padd< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
EIGEN_ALWAYS_INLINE Packet16c pgather_partial< signed char, Packet16c >(const signed char *from, Index stride, const Index n)
EIGEN_ALWAYS_INLINE Packet8us ploadu_partial< Packet8us >(const unsigned short int *from, const Index n, const Index offset)
unsigned short int predux_max< Packet8us >(const Packet8us &a)
EIGEN_ALWAYS_INLINE Packet16c ploadu_partial< Packet16c >(const signed char *from, const Index n, const Index offset)
Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi)
Packet8s por< Packet8s >(const Packet8s &a, const Packet8s &b)
EIGEN_ALWAYS_INLINE void pscatter< short int, Packet8s >(short int *to, const Packet8s &from, Index stride)
void pstore< bfloat16 >(bfloat16 *to, const Packet8bf &from)
Packet16uc pmul< Packet16uc >(const Packet16uc &a, const Packet16uc &b)
void prefetch< int >(const int *addr)
void pbroadcast4< Packet4f >(const float *a, Packet4f &a0, Packet4f &a1, Packet4f &a2, Packet4f &a3)
EIGEN_ALWAYS_INLINE Packet8bf pgather< bfloat16, Packet8bf >(const bfloat16 *from, Index stride)
Packet4ui pand< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
void pscatter< double, Packet2d >(double *to, const Packet2d &from, Index stride)
void pbroadcast4< Packet2d >(const double *a, Packet2d &a0, Packet2d &a1, Packet2d &a2, Packet2d &a3)
unsigned short int predux_mul< Packet8us >(const Packet8us &a)
Packet8us ploadu< Packet8us >(const unsigned short int *from)
Packet8us pload< Packet8us >(const unsigned short int *from)
signed char predux_min< Packet16c >(const Packet16c &a)
bool predux_any(const Packet4f &x)
EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo)
Packet4ui pandnot< Packet4ui >(const Packet4ui &a, const Packet4ui &b)
Packet8us pmin< Packet8us >(const Packet8us &a, const Packet8us &b)
Packet8bf ploaddup< Packet8bf >(const bfloat16 *from)
void pstore< double >(double *to, const Packet4d &from)
Packet4f pcmp_le(const Packet4f &a, const Packet4f &b)
Packet4f pceil< Packet4f >(const Packet4f &a)
EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet) *from, const Index n, const Index offset)
Packet8bf pround< Packet8bf >(const Packet8bf &a)
EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet) *from, Index stride, const Index n=unpacket_traits< Packet >::size)
EIGEN_ALWAYS_INLINE Packet8s pgather_partial< short int, Packet8s >(const short int *from, Index stride, const Index n)
static Packet16uc p16uc_DUPLICATE32_HI
Packet2d pmax< Packet2d >(const Packet2d &a, const Packet2d &b)
static Packet16uc p16uc_TRANSPOSE64_HI
std::int64_t int64_t
Definition: Meta.h:42
: InteropHeaders
Definition: Core:139
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_exp_op< typename Derived::Scalar >, const Derived > exp(const Eigen::ArrayBase< Derived > &x)
Definition: BFloat16.h:222