arch/NEON/GeneralBlockPanelKernel.h
Go to the documentation of this file.
1 #include "../../InternalHeaderCheck.h"
2 
3 namespace Eigen {
4 namespace internal {
5 
6 #if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
7 
8 // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
9 // Here we specialize gebp_traits to eliminate these register spills.
10 // See #2138.
11 template<>
12 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
13  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
14 {
15  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
16  {
17  // This volatile inline ASM both acts as a barrier to prevent reordering,
18  // as well as enforces strict register use.
19  asm volatile(
20  "vmla.f32 %q[r], %q[c], %q[alpha]"
21  : [r] "+w" (r)
22  : [c] "w" (c),
23  [alpha] "w" (alpha)
24  : );
25  }
26 
27  template <typename LaneIdType>
28  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
29  Packet4f& c, Packet4f&,
30  const LaneIdType&) const {
31  acc(a, b, c);
32  }
33 
34  template <typename LaneIdType>
35  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
36  Packet4f& c, Packet4f& tmp,
37  const LaneIdType& lane) const {
38  madd(a, b.get(lane), c, tmp, lane);
39  }
40 };
41 
42 #endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
43 
44 #if EIGEN_ARCH_ARM64
45 
// Default for EIGEN_NEON_GEBP_NR, which the ARM64 NEON gebp_traits
// specializations in this file use as their `nr` enum value. The #ifndef
// guard keeps any value that was already defined before this point.
46 #ifndef EIGEN_NEON_GEBP_NR
47 #define EIGEN_NEON_GEBP_NR 8
48 #endif
49 
50 template<>
51 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
52  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
53 {
54  typedef float RhsPacket;
55  typedef float32x4_t RhsPacketx4;
56  enum { nr = EIGEN_NEON_GEBP_NR };
57  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const {
58  dest = *b;
59  }
60 
61  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
62  {
63  dest = vld1q_f32(b);
64  }
65 
66  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
67  {
68  dest = *b;
69  }
70 
71  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
72  {}
73 
74  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
75  {
76  loadRhs(b,dest);
77  }
78 
79  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
80  {
81  c = vfmaq_n_f32(c, a, b);
82  }
83  // NOTE: Template parameter inference failed when compiled with Android NDK:
84  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
85 
86  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
87  { madd_helper<0>(a, b, c); }
88  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
89  { madd_helper<1>(a, b, c); }
90  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
91  { madd_helper<2>(a, b, c); }
92  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
93  { madd_helper<3>(a, b, c); }
94 
95  private:
96  template<int LaneID>
97  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
98  {
99  #if EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
100  // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
101  // vfmaq_laneq_f32 is implemented through a costly dup, which was fixed in gcc9
102  // 2. workaround the gcc register split problem on arm64-neon
103  if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
104  else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
105  else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
106  else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
107  #else
108  c = vfmaq_laneq_f32(c, a, b, LaneID);
109  #endif
110  }
111 };
112 
113 
114 template<>
115 struct gebp_traits <double,double,false,false,Architecture::NEON>
116  : gebp_traits<double,double,false,false,Architecture::Generic>
117 {
118  typedef double RhsPacket;
119  enum { nr = EIGEN_NEON_GEBP_NR };
120  struct RhsPacketx4 {
121  float64x2_t B_0, B_1;
122  };
123 
124  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
125  {
126  dest = *b;
127  }
128 
129  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
130  {
131  dest.B_0 = vld1q_f64(b);
132  dest.B_1 = vld1q_f64(b+2);
133  }
134 
135  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
136  {
137  loadRhs(b,dest);
138  }
139 
140  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
141  {}
142 
143  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
144  {
145  loadRhs(b,dest);
146  }
147 
148  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
149  {
150  c = vfmaq_n_f64(c, a, b);
151  }
152 
153  // NOTE: Template parameter inference failed when compiled with Android NDK:
154  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
155 
156  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
157  { madd_helper<0>(a, b, c); }
158  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
159  { madd_helper<1>(a, b, c); }
160  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
161  { madd_helper<2>(a, b, c); }
162  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
163  { madd_helper<3>(a, b, c); }
164 
165  private:
166  template <int LaneID>
167  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
168  {
169  #if EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
170  // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
171  // vfmaq_laneq_f64 is implemented through a costly dup, which was fixed in gcc9
172  // 2. workaround the gcc register split problem on arm64-neon
173  if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
174  else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
175  else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
176  else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
177  #else
178  if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
179  else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
180  else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
181  else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
182  #endif
183  }
184 };
185 
186 // The register at operand 3 of fmla for data type half must be v0~v15. The compiler may not
187 // allocate a suitable register for the '%2' of inline asm 'fmla %0.8h, %1.8h, %2.h[id]',
188 // so inline assembly can't be used here to avoid the bug that vfmaq_lane_f16 is implemented
189 // through a costly dup in the gcc compiler.
190 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
191 
192 template<>
193 struct gebp_traits <half,half,false,false,Architecture::NEON>
194  : gebp_traits<half,half,false,false,Architecture::Generic>
195 {
196  typedef half RhsPacket;
197  typedef float16x4_t RhsPacketx4;
198  typedef float16x4_t PacketHalf;
199  enum { nr = EIGEN_NEON_GEBP_NR };
200 
201  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
202  {
203  dest = *b;
204  }
205 
206  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
207  {
208  dest = vld1_f16((const __fp16 *)b);
209  }
210 
211  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
212  {
213  dest = *b;
214  }
215 
216  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
217  {}
218 
219  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar*, RhsPacket&) const
220  {
221  // If LHS is a Packet8h, we cannot correctly mimic a ploadquad of the RHS
222  // using a single scalar value.
223  eigen_assert(false && "Cannot loadRhsQuad for a scalar RHS.");
224  }
225 
226  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
227  {
228  c = vfmaq_n_f16(c, a, b);
229  }
230  EIGEN_STRONG_INLINE void madd(const PacketHalf& a, const RhsPacket& b, PacketHalf& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
231  {
232  c = vfma_n_f16(c, a, b);
233  }
234 
235  // NOTE: Template parameter inference failed when compiled with Android NDK:
236  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
237  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
238  { madd_helper<0>(a, b, c); }
239  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
240  { madd_helper<1>(a, b, c); }
241  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
242  { madd_helper<2>(a, b, c); }
243  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
244  { madd_helper<3>(a, b, c); }
245  private:
246  template<int LaneID>
247  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
248  {
249  c = vfmaq_lane_f16(c, a, b, LaneID);
250  }
251 };
252 #endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
253 #endif // EIGEN_ARCH_ARM64
254 
255 } // namespace internal
256 } // namespace Eigen
Array< int, 3, 1 > b
Array33i c
#define eigen_assert(x)
Definition: Macros.h:902
__vector float Packet4f
: InteropHeaders
Definition: Core:139