OpenShot Audio Library | OpenShotAudio  0.3.2
juce_avx_SIMDNativeOps.h
1 /*
2  ==============================================================================
3 
4  This file is part of the JUCE library.
5  Copyright (c) 2017 - ROLI Ltd.
6 
7  JUCE is an open source library subject to commercial or open-source
8  licensing.
9 
10  By using JUCE, you agree to the terms of both the JUCE 5 End-User License
11  Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
12  27th April 2017).
13 
14  End User License Agreement: www.juce.com/juce-5-licence
15  Privacy Policy: www.juce.com/juce-5-privacy-policy
16 
17  Or: You may also use this code under the terms of the GPL v3 (see
18  www.gnu.org/licenses).
19 
20  JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
21  EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
22  DISCLAIMED.
23 
24  ==============================================================================
25 */
26 
27 namespace juce
28 {
29 namespace dsp
30 {
31 
32 #ifndef DOXYGEN
33 
34 #if JUCE_GCC && (__GNUC__ >= 6)
35  #pragma GCC diagnostic push
36  #pragma GCC diagnostic ignored "-Wignored-attributes"
37 #endif
38 
39 #ifdef _MSC_VER
40  #define DECLARE_AVX_SIMD_CONST(type, name) \
41  static __declspec(align(32)) const type name[32 / sizeof (type)]
42 
43  #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
44  __declspec(align(32)) const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)]
45 
46 #else
47  #define DECLARE_AVX_SIMD_CONST(type, name) \
48  static const type name[32 / sizeof (type)] __attribute__((aligned(32)))
49 
50  #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
51  const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)] __attribute__((aligned(32)))
52 
53 #endif
54 
55 template <typename type>
56 struct SIMDNativeOps;
57 
58 //==============================================================================
63 template <>
64 struct SIMDNativeOps<float>
65 {
66  using vSIMDType = __m256;
67 
68  //==============================================================================
69  DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
70  DECLARE_AVX_SIMD_CONST (int32_t, kEvenHighBit);
71  DECLARE_AVX_SIMD_CONST (float, kOne);
72 
73  //==============================================================================
74  static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return load (a); }
75  static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return _mm256_castsi256_ps (_mm256_load_si256 (reinterpret_cast <const __m256i*> (a))); }
76  static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); }
77  static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); }
78  static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); }
79  static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); }
80  static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); }
81  static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); }
82  static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_and (__m256 a, __m256 b) noexcept { return _mm256_and_ps (a, b); }
83  static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_or (__m256 a, __m256 b) noexcept { return _mm256_or_ps (a, b); }
84  static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_xor (__m256 a, __m256 b) noexcept { return _mm256_xor_ps (a, b); }
85  static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_notand (__m256 a, __m256 b) noexcept { return _mm256_andnot_ps (a, b); }
86  static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_not (__m256 a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
87  static forcedinline __m256 JUCE_VECTOR_CALLTYPE min (__m256 a, __m256 b) noexcept { return _mm256_min_ps (a, b); }
88  static forcedinline __m256 JUCE_VECTOR_CALLTYPE max (__m256 a, __m256 b) noexcept { return _mm256_max_ps (a, b); }
89  static forcedinline __m256 JUCE_VECTOR_CALLTYPE equal (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_EQ_OQ); }
90  static forcedinline __m256 JUCE_VECTOR_CALLTYPE notEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ); }
91  static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThan (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GT_OQ); }
92  static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GE_OQ); }
93  static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256 a, __m256 b) noexcept { return (_mm256_movemask_ps (equal (a, b)) == 0xff); }
94  static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
95  static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
96  static forcedinline __m256 JUCE_VECTOR_CALLTYPE swapevenodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
97  static forcedinline float JUCE_VECTOR_CALLTYPE get (__m256 v, size_t i) noexcept { return SIMDFallbackOps<float, __m256>::get (v, i); }
98  static forcedinline __m256 JUCE_VECTOR_CALLTYPE set (__m256 v, size_t i, float s) noexcept { return SIMDFallbackOps<float, __m256>::set (v, i, s); }
99  static forcedinline __m256 JUCE_VECTOR_CALLTYPE truncate (__m256 a) noexcept { return _mm256_cvtepi32_ps (_mm256_cvttps_epi32 (a)); }
100 
101  static forcedinline __m256 JUCE_VECTOR_CALLTYPE multiplyAdd (__m256 a, __m256 b, __m256 c) noexcept
102  {
103  #if __FMA__
104  return _mm256_fmadd_ps (b, c, a);
105  #else
106  return add (a, mul (b, c));
107  #endif
108  }
109 
110  static forcedinline __m256 JUCE_VECTOR_CALLTYPE oddevensum (__m256 a) noexcept
111  {
112  a = _mm256_add_ps (_mm256_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a);
113  return add (_mm256_permute2f128_ps (a, a, 1), a);
114  }
115 
116  //==============================================================================
117  static forcedinline __m256 JUCE_VECTOR_CALLTYPE cmplxmul (__m256 a, __m256 b) noexcept
118  {
119  __m256 rr_ir = mul (a, dupeven (b));
120  __m256 ii_ri = mul (swapevenodd (a), dupodd (b));
121  return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
122  }
123 
124  static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m256 a) noexcept
125  {
126  __m256 retval = _mm256_dp_ps (a, vconst (kOne), 0xff);
127  __m256 tmp = _mm256_permute2f128_ps (retval, retval, 1);
128  retval = _mm256_add_ps (retval, tmp);
129 
130  #if JUCE_GCC
131  return retval[0];
132  #else
133  return _mm256_cvtss_f32 (retval);
134  #endif
135  }
136 };
137 
138 //==============================================================================
143 template <>
144 struct SIMDNativeOps<double>
145 {
146  using vSIMDType = __m256d;
147 
148  //==============================================================================
149  DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
150  DECLARE_AVX_SIMD_CONST (int64_t, kEvenHighBit);
151  DECLARE_AVX_SIMD_CONST (double, kOne);
152 
153  //==============================================================================
154  static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return load (a); }
155  static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return _mm256_castsi256_pd (_mm256_load_si256 (reinterpret_cast <const __m256i*> (a))); }
156  static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
157  static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); }
158  static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); }
159  static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
160  static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
161  static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
162  static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
163  static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
164  static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
165  static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
166  static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
167  static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
168  static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
169  static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
170  static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
171  static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
172  static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
173  static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept { return (_mm256_movemask_pd (equal (a, b)) == 0xf); }
174  static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
175  static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
176  static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
177  static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
178  static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }
179  static forcedinline double JUCE_VECTOR_CALLTYPE get (__m256d v, size_t i) noexcept { return SIMDFallbackOps<double, __m256d>::get (v, i); }
180  static forcedinline __m256d JUCE_VECTOR_CALLTYPE set (__m256d v, size_t i, double s) noexcept { return SIMDFallbackOps<double, __m256d>::set (v, i, s); }
181  static forcedinline __m256d JUCE_VECTOR_CALLTYPE truncate (__m256d a) noexcept { return _mm256_cvtepi32_pd (_mm256_cvttpd_epi32 (a)); }
182 
183  //==============================================================================
184  static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
185  {
186  __m256d rr_ir = mul (a, dupeven (b));
187  __m256d ii_ri = mul (swapevenodd (a), dupodd (b));
188  return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
189  }
190 
191  static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m256d a) noexcept
192  {
193  __m256d retval = _mm256_hadd_pd (a, a);
194  __m256d tmp = _mm256_permute2f128_pd (retval, retval, 1);
195  retval = _mm256_add_pd (retval, tmp);
196 
197  #if JUCE_GCC
198  return retval[0];
199  #else
200  return _mm256_cvtsd_f64 (retval);
201  #endif
202  }
203 };
204 
205 //==============================================================================
210 template <>
211 struct SIMDNativeOps<int8_t>
212 {
213  using vSIMDType = __m256i;
214 
215  //==============================================================================
216  DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);
217 
218  static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
219  static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
220  static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
221  static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
222  static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
223  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
224  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
225  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
226  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
227  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
228  static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
229  static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
230  static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
231  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
232  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
233  static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return _mm256_movemask_epi8 (equal (a, b)) == -1; }
234  static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
235  static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
236  static forcedinline int8_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int8_t, __m256i>::get (v, i); }
237  static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int8_t s) noexcept { return SIMDFallbackOps<int8_t, __m256i>::set (v, i, s); }
238  static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
239 
240  //==============================================================================
241  static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
242  {
243  __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
244  __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());
245 
246  for (int i = 0; i < 3; ++i)
247  {
248  lo = _mm256_hadd_epi16 (lo, lo);
249  hi = _mm256_hadd_epi16 (hi, hi);
250  }
251 
252  #if JUCE_GCC
253  return (int8_t) ((lo[0] & 0xff) +
254  (hi[0] & 0xff) +
255  (lo[2] & 0xff) +
256  (hi[2] & 0xff));
257  #else
258  constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
259 
260  return (int8_t) ((_mm256_cvtsi256_si32 (lo) & 0xff) +
261  (_mm256_cvtsi256_si32 (hi) & 0xff) +
262  (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask)) & 0xff) +
263  (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask)) & 0xff));
264  #endif
265  }
266 
267  static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
268  {
269  // unpack and multiply
270  __m256i even = _mm256_mullo_epi16 (a, b);
271  __m256i odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));
272 
273  return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
274  _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
275  }
276 };
277 
278 //==============================================================================
283 template <>
284 struct SIMDNativeOps<uint8_t>
285 {
286  //==============================================================================
287  using vSIMDType = __m256i;
288 
289  //==============================================================================
290  DECLARE_AVX_SIMD_CONST (uint8_t, kHighBit);
291  DECLARE_AVX_SIMD_CONST (uint8_t, kAllBitsSet);
292 
293  static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
294  static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm256_set1_epi8 ((int8_t) s); }
295  static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
296  static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
297  static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
298  static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
299  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
300  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
301  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
302  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
303  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
304  static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu8 (a, b); }
305  static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu8 (a, b); }
306  static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
307  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (ssign (a), ssign (b)); }
308  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
309  static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
310  static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
311  static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
312  static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint8_t, __m256i>::get (v, i); }
313  static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint8_t s) noexcept { return SIMDFallbackOps<uint8_t, __m256i>::set (v, i, s); }
314  static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
315 
316  //==============================================================================
317  static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
318  {
319  __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
320  __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());
321 
322  for (int i = 0; i < 3; ++i)
323  {
324  lo = _mm256_hadd_epi16 (lo, lo);
325  hi = _mm256_hadd_epi16 (hi, hi);
326  }
327 
328  #if JUCE_GCC
329  return (uint8_t) ((static_cast<uint32_t> (lo[0]) & 0xffu) +
330  (static_cast<uint32_t> (hi[0]) & 0xffu) +
331  (static_cast<uint32_t> (lo[2]) & 0xffu) +
332  (static_cast<uint32_t> (hi[2]) & 0xffu));
333  #else
334  constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
335 
336  return (uint8_t) ((static_cast<uint32_t> (_mm256_cvtsi256_si32 (lo)) & 0xffu) +
337  (static_cast<uint32_t> (_mm256_cvtsi256_si32 (hi)) & 0xffu) +
338  (static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask))) & 0xffu) +
339  (static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask))) & 0xffu));
340  #endif
341  }
342 
343  static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
344  {
345  // unpack and multiply
346  __m256i even = _mm256_mullo_epi16 (a, b);
347  __m256i odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));
348 
349  return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
350  _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
351  }
352 };
353 
354 //==============================================================================
359 template <>
360 struct SIMDNativeOps<int16_t>
361 {
362  //==============================================================================
363  using vSIMDType = __m256i;
364 
365  //==============================================================================
366  DECLARE_AVX_SIMD_CONST (int16_t, kAllBitsSet);
367 
368  //==============================================================================
369  static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm256_set1_epi16 (s); }
370  static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
371  static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
372  static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
373  static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
374  static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
375  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
376  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
377  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
378  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
379  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
380  static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi16 (a, b); }
381  static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi16 (a, b); }
382  static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
383  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (a, b); }
384  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
385  static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
386  static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
387  static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
388  static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int16_t, __m256i>::get (v, i); }
389  static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int16_t s) noexcept { return SIMDFallbackOps<int16_t, __m256i>::set (v, i, s); }
390  static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
391 
392  //==============================================================================
393  static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
394  {
395  __m256i tmp = _mm256_hadd_epi16 (a, a);
396  tmp = _mm256_hadd_epi16 (tmp, tmp);
397  tmp = _mm256_hadd_epi16 (tmp, tmp);
398 
399  #if JUCE_GCC
400  return (int16_t) ((tmp[0] & 0xffff) + (tmp[2] & 0xffff));
401  #else
402  constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
403 
404  return (int16_t) ((_mm256_cvtsi256_si32 (tmp) & 0xffff) +
405  (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)) & 0xffff));
406  #endif
407  }
408 };
409 
410 //==============================================================================
415 template <>
416 struct SIMDNativeOps<uint16_t>
417 {
418  //==============================================================================
419  using vSIMDType = __m256i;
420 
421  //==============================================================================
422  DECLARE_AVX_SIMD_CONST (uint16_t, kHighBit);
423  DECLARE_AVX_SIMD_CONST (uint16_t, kAllBitsSet);
424 
425  //==============================================================================
426  static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
427  static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); }
428  static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
429  static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
430  static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
431  static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
432  static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
433  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
434  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
435  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
436  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
437  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
438  static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); }
439  static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); }
440  static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
441  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); }
442  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
443  static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
444  static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
445  static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
446  static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint16_t, __m256i>::get (v, i); }
447  static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint16_t s) noexcept { return SIMDFallbackOps<uint16_t, __m256i>::set (v, i, s); }
448  static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
449 
450  //==============================================================================
451  static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
452  {
453  __m256i tmp = _mm256_hadd_epi16 (a, a);
454  tmp = _mm256_hadd_epi16 (tmp, tmp);
455  tmp = _mm256_hadd_epi16 (tmp, tmp);
456 
457  #if JUCE_GCC
458  return (uint16_t) ((static_cast<uint32_t> (tmp[0]) & 0xffffu) +
459  (static_cast<uint32_t> (tmp[2]) & 0xffffu));
460  #else
461  constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
462 
463  return (uint16_t) ((static_cast<uint32_t> (_mm256_cvtsi256_si32 (tmp)) & 0xffffu) +
464  (static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask))) & 0xffffu));
465  #endif
466  }
467 };
468 
469 //==============================================================================
474 template <>
475 struct SIMDNativeOps<int32_t>
476 {
477  //==============================================================================
478  using vSIMDType = __m256i;
479 
480  //==============================================================================
481  DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
482 
483  //==============================================================================
484  static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm256_set1_epi32 (s); }
485  static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
486  static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
487  static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
488  static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
489  static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
490  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
491  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
492  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
493  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
494  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
495  static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi32 (a, b); }
496  static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi32 (a, b); }
497  static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
498  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (a, b); }
499  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
500  static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
501  static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
502  static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
503  static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int32_t, __m256i>::get (v, i); }
504  static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int32_t s) noexcept { return SIMDFallbackOps<int32_t, __m256i>::set (v, i, s); }
505  static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
506 
507  //==============================================================================
508  static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
509  {
510  __m256i tmp = _mm256_hadd_epi32 (a, a);
511  tmp = _mm256_hadd_epi32 (tmp, tmp);
512 
513  #if JUCE_GCC
514  return (int32_t) (tmp[0] + tmp[2]);
515  #else
516  constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
517 
518  return _mm256_cvtsi256_si32 (tmp) + _mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask));
519  #endif
520  }
521 };
522 
523 //==============================================================================
528 template <>
529 struct SIMDNativeOps<uint32_t>
530 {
531  //==============================================================================
532  using vSIMDType = __m256i;
533 
534  //==============================================================================
535  DECLARE_AVX_SIMD_CONST (uint32_t, kAllBitsSet);
536  DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit);
537 
538  //==============================================================================
539  static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
540  static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
541  static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
542  static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
543  static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
544  static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
545  static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
546  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
547  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
548  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
549  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
550  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
551  static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
552  static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
553  static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
554  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
555  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
556  static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
557  static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
558  static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
559  static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint32_t, __m256i>::get (v, i); }
560  static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint32_t s) noexcept { return SIMDFallbackOps<uint32_t, __m256i>::set (v, i, s); }
561  static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
562 
563  //==============================================================================
564  static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
565  {
566  __m256i tmp = _mm256_hadd_epi32 (a, a);
567  tmp = _mm256_hadd_epi32 (tmp, tmp);
568 
569  #if JUCE_GCC
570  return static_cast<uint32_t> (tmp[0]) + static_cast<uint32_t> (tmp[2]);
571  #else
572  constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
573 
574  return static_cast<uint32_t> (_mm256_cvtsi256_si32 (tmp))
575  + static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)));
576  #endif
577  }
578 };
579 
580 //==============================================================================
585 template <>
586 struct SIMDNativeOps<int64_t>
587 {
588  //==============================================================================
589  using vSIMDType = __m256i;
590 
591  //==============================================================================
592  DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
593 
594  static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { return _mm256_set1_epi64x ((int64_t) s); }
595  static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
596  static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
597  static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
598  static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
599  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
600  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
601  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
602  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
603  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
604  static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
605  static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
606  static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
607  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (a, b); }
608  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
609  static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
610  static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
611  static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
612  static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int64_t, __m256i>::get (v, i); }
613  static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int64_t s) noexcept { return SIMDFallbackOps<int64_t, __m256i>::set (v, i, s); }
614  static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { return SIMDFallbackOps<int64_t, __m256i>::sum (a); }
615  static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return SIMDFallbackOps<int64_t, __m256i>::mul (a, b); }
616  static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
617 };
618 
619 //==============================================================================
624 template <>
625 struct SIMDNativeOps<uint64_t>
626 {
627  //==============================================================================
628  using vSIMDType = __m256i;
629 
630  //==============================================================================
631  DECLARE_AVX_SIMD_CONST (uint64_t, kAllBitsSet);
632  DECLARE_AVX_SIMD_CONST (uint64_t, kHighBit);
633 
634  static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { return _mm256_set1_epi64x ((int64_t) s); }
635  static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
636  static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
637  static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
638  static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
639  static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
640  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
641  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
642  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
643  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
644  static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
645  static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
646  static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
647  static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
648  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
649  static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
650  static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
651  static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
652  static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
653  static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::get (v, i); }
654  static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint64_t s) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::set (v, i, s); }
655  static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::sum (a); }
656  static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::mul (a, b); }
657  static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
658 };
659 
660 #endif
661 
662 #if JUCE_GCC && (__GNUC__ >= 6)
663  #pragma GCC diagnostic pop
664 #endif
665 
666 } // namespace dsp
667 } // namespace juce