BFGraph
libpopcnt.h
/*
 * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit
 * population count) in an array as quickly as possible using
 * specialized CPU instructions e.g. POPCNT, AVX2.
 *
 * Copyright (c) 2016 - 2017, Kim Walisch
 * Copyright (c) 2016 - 2017, Wojciech Muła
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef LIBPOPCNT_H
#define LIBPOPCNT_H

#include <stdint.h>

#ifndef __has_builtin
  #define __has_builtin(x) 0
#endif

#ifndef __has_attribute
  #define __has_attribute(x) 0
#endif

#ifdef __GNUC__
  #define GNUC_PREREQ(x, y) \
      (__GNUC__ > x || (__GNUC__ == x && __GNUC_MINOR__ >= y))
#else
  #define GNUC_PREREQ(x, y) 0
#endif

#ifdef __clang__
  #define CLANG_PREREQ(x, y) \
      (__clang_major__ > x || (__clang_major__ == x && __clang_minor__ >= y))
#else
  #define CLANG_PREREQ(x, y) 0
#endif

#if (defined(__i386__) || \
     defined(__x86_64__) || \
     defined(_M_IX86) || \
     defined(_M_X64))
  #define X86_OR_X64
#endif

#if defined(X86_OR_X64) && \
    (defined(__cplusplus) || \
     (GNUC_PREREQ(4, 2) || \
      __has_builtin(__sync_val_compare_and_swap)))
  #define HAVE_CPUID
#endif

#if GNUC_PREREQ(4, 2) || \
    __has_builtin(__builtin_popcount)
  #define HAVE_BUILTIN_POPCOUNT
#endif

#if GNUC_PREREQ(4, 2) || \
    CLANG_PREREQ(3, 0)
  #define HAVE_ASM_POPCNT
#endif

#if defined(HAVE_CPUID) && \
    (defined(HAVE_ASM_POPCNT) || \
     defined(_MSC_VER))
  #define HAVE_POPCNT
#endif

#if defined(HAVE_CPUID) && \
    GNUC_PREREQ(4, 9)
  #define HAVE_AVX2
#endif

#if defined(HAVE_CPUID) && \
    CLANG_PREREQ(3, 8) && \
    __has_attribute(target) && \
    (!defined(_MSC_VER) || defined(__AVX2__)) && \
    (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
  #define HAVE_AVX2
#endif

/*
 * This uses fewer arithmetic operations than any other known
 * implementation on machines with fast multiplication.
 * It uses 12 arithmetic operations, one of which is a multiply.
 * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
 */
static inline uint64_t popcount64(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ll;
  uint64_t m2 = 0x3333333333333333ll;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll;
  uint64_t h01 = 0x0101010101010101ll;

  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;

  return (x * h01) >> 56;
}
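/*
 * Illustrative trace (added note, not in the original source): for
 * x = 0xFF the steps above yield 0xAA (each 2-bit pair counts 2 ones),
 * then 0x44 (each nibble counts 4), then 0x08 (the low byte counts 8);
 * the multiply by h01 sums all byte counts into the top byte, so
 * (x * h01) >> 56 == 8.
 */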

#if defined(HAVE_ASM_POPCNT) && \
    defined(__x86_64__)

static inline uint64_t popcnt64(uint64_t x)
{
  __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x));
  return x;
}

#elif defined(HAVE_ASM_POPCNT) && \
      defined(__i386__)

static inline uint32_t popcnt32(uint32_t x)
{
  __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x));
  return x;
}

static inline uint64_t popcnt64(uint64_t x)
{
  return popcnt32((uint32_t) x) +
         popcnt32((uint32_t)(x >> 32));
}

#elif defined(_MSC_VER) && \
      defined(_M_X64)

#include <nmmintrin.h>

static inline uint64_t popcnt64(uint64_t x)
{
  return _mm_popcnt_u64(x);
}

#elif defined(_MSC_VER) && \
      defined(_M_IX86)

#include <nmmintrin.h>

static inline uint64_t popcnt64(uint64_t x)
{
  return _mm_popcnt_u32((uint32_t) x) +
         _mm_popcnt_u32((uint32_t)(x >> 32));
}

/* non x86 CPUs */
#elif defined(HAVE_BUILTIN_POPCOUNT)

static inline uint64_t popcnt64(uint64_t x)
{
  return __builtin_popcountll(x);
}

/* no hardware POPCNT,
 * use pure integer algorithm */
#else

static inline uint64_t popcnt64(uint64_t x)
{
  return popcount64(x);
}

#endif

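/*
 * Added note: processes the array in blocks of four 64-bit words to
 * reduce loop overhead; the second loop handles the 0-3 remaining words.
 */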
static inline uint64_t popcnt64_unrolled(const uint64_t* data, uint64_t size)
{
  uint64_t i = 0;
  uint64_t limit = size - size % 4;
  uint64_t cnt = 0;

  for (; i < limit; i += 4)
  {
    cnt += popcnt64(data[i+0]);
    cnt += popcnt64(data[i+1]);
    cnt += popcnt64(data[i+2]);
    cnt += popcnt64(data[i+3]);
  }

  for (; i < size; i++)
    cnt += popcnt64(data[i]);

  return cnt;
}

#if defined(HAVE_CPUID)

#if defined(_MSC_VER)
  #include <intrin.h>
  #include <immintrin.h>
#endif

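/*
 * Added note: CPUID leaf 1 reports POPCNT in ECX bit 23, CPUID leaf 7
 * (sub-leaf 0) reports AVX2 in EBX bit 5, and XGETBV(0) reports in bits
 * 1-2 whether the OS saves XMM/YMM register state.
 */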
/* %ecx bit flags */
#define bit_POPCNT (1 << 23)

/* %ebx bit flags */
#define bit_AVX2 (1 << 5)

/* xgetbv bit flags */
#define XSTATE_SSE (1 << 1)
#define XSTATE_YMM (1 << 2)

static inline void run_cpuid(int eax, int ecx, int* abcd)
{
  int ebx = 0;
  int edx = 0;

#if defined(_MSC_VER)
  __cpuidex(abcd, eax, ecx);
#elif defined(__i386__) && \
      defined(__PIC__)
  /* in case of PIC under 32-bit EBX cannot be clobbered */
  __asm__ ("movl %%ebx, %%edi;"
           "cpuid;"
           "xchgl %%ebx, %%edi;"
           : "=D" (ebx),
             "+a" (eax),
             "+c" (ecx),
             "=d" (edx));
#else
  __asm__ ("cpuid;"
           : "+b" (ebx),
             "+a" (eax),
             "+c" (ecx),
             "=d" (edx));
#endif
  abcd[0] = eax;
  abcd[1] = ebx;
  abcd[2] = ecx;
  abcd[3] = edx;
}

static inline int has_POPCNT()
{
  int abcd[4];

  run_cpuid(1, 0, abcd);
  if ((abcd[2] & bit_POPCNT) != bit_POPCNT)
    return 0;

  return bit_POPCNT;
}

#if defined(HAVE_AVX2)

static inline int check_xcr0_ymm()
{
  int xcr0;
  int mask = XSTATE_SSE | XSTATE_YMM;
#if defined(_MSC_VER)
  xcr0 = (int) _xgetbv(0);
#else
  __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
#endif
  return (xcr0 & mask) == mask;
}

static inline int has_AVX2()
{
  int abcd[4];
  int osxsave_mask = (1 << 27);

  /* ensure OS supports extended processor state management */
  run_cpuid(1, 0, abcd);
  if ((abcd[2] & osxsave_mask) != osxsave_mask)
    return 0;

  /* ensure OS supports YMM registers (and XMM) */
  if (!check_xcr0_ymm())
    return 0;

  run_cpuid(7, 0, abcd);
  if ((abcd[1] & bit_AVX2) != bit_AVX2)
    return 0;

  return bit_AVX2;
}

#endif /* has_AVX2 */

static inline int get_cpuid()
{
#if defined(HAVE_AVX2)
  return has_POPCNT() | has_AVX2();
#else
  return has_POPCNT();
#endif
}

#endif

#if defined(HAVE_AVX2)

#include <immintrin.h>

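/*
 * Added note: CSA256() is a bitwise carry-save adder. For every bit
 * position it computes the sum and carry of a + b + c, i.e.
 * *l = a ^ b ^ c and *h = MAJ(a, b, c). Chaining these adders lets
 * popcnt_avx2() combine 16 input vectors per iteration while invoking
 * the more expensive popcnt256() only once.
 */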
__attribute__ ((target ("avx2")))
static inline void CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c)
{
  __m256i u = a ^ b;
  *h = (a & b) | (u & c);
  *l = u ^ c;
}

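/*
 * Added note: per-byte popcount via two 4-bit table lookups.
 * lookup1[i] = popcount(i) + 4 and lookup2[i] = 4 - popcount(i), so
 * the sum of absolute differences computed by _mm256_sad_epu8() is
 * (popcount(lo) + 4) - (4 - popcount(hi)) = popcount of each byte,
 * accumulated into one 64-bit count per group of 8 bytes.
 */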
__attribute__ ((target ("avx2")))
static inline __m256i popcnt256(__m256i v)
{
  __m256i lookup1 = _mm256_setr_epi8(
      4, 5, 5, 6, 5, 6, 6, 7,
      5, 6, 6, 7, 6, 7, 7, 8,
      4, 5, 5, 6, 5, 6, 6, 7,
      5, 6, 6, 7, 6, 7, 7, 8
  );

  __m256i lookup2 = _mm256_setr_epi8(
      4, 3, 3, 2, 3, 2, 2, 1,
      3, 2, 2, 1, 2, 1, 1, 0,
      4, 3, 3, 2, 3, 2, 2, 1,
      3, 2, 2, 1, 2, 1, 1, 0
  );

  __m256i low_mask = _mm256_set1_epi8(0x0f);
  __m256i lo = v & low_mask;
  __m256i hi = _mm256_srli_epi16(v, 4) & low_mask;
  __m256i popcnt1 = _mm256_shuffle_epi8(lookup1, lo);
  __m256i popcnt2 = _mm256_shuffle_epi8(lookup2, hi);

  return _mm256_sad_epu8(popcnt1, popcnt2);
}

/*
 * AVX2 Harley-Seal popcount (4th iteration).
 * The algorithm is based on the paper "Faster Population Counts
 * using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and
 * Wojciech Mula (23 Nov 2016).
 * @see https://arxiv.org/abs/1611.07612
 */
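/*
 * Added note: after the main loop the accumulators hold bits of weight
 * 1 (ones), 2 (twos), 4 (fours) and 8 (eights), while cnt holds the
 * popcounts of the "sixteens" carry vectors (weight 16). The total is
 * 16*cnt + 8*popcnt(eights) + 4*popcnt(fours) + 2*popcnt(twos) + popcnt(ones),
 * which is what the shift-and-add sequence following the loop computes.
 */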
__attribute__ ((target ("avx2")))
static inline uint64_t popcnt_avx2(const __m256i* data, uint64_t size)
{
  __m256i cnt = _mm256_setzero_si256();
  __m256i ones = _mm256_setzero_si256();
  __m256i twos = _mm256_setzero_si256();
  __m256i fours = _mm256_setzero_si256();
  __m256i eights = _mm256_setzero_si256();
  __m256i sixteens = _mm256_setzero_si256();
  __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;

  uint64_t i = 0;
  uint64_t limit = size - size % 16;
  uint64_t* cnt64;

  for(; i < limit; i += 16)
  {
    CSA256(&twosA, &ones, ones, data[i+0], data[i+1]);
    CSA256(&twosB, &ones, ones, data[i+2], data[i+3]);
    CSA256(&foursA, &twos, twos, twosA, twosB);
    CSA256(&twosA, &ones, ones, data[i+4], data[i+5]);
    CSA256(&twosB, &ones, ones, data[i+6], data[i+7]);
    CSA256(&foursB, &twos, twos, twosA, twosB);
    CSA256(&eightsA, &fours, fours, foursA, foursB);
    CSA256(&twosA, &ones, ones, data[i+8], data[i+9]);
    CSA256(&twosB, &ones, ones, data[i+10], data[i+11]);
    CSA256(&foursA, &twos, twos, twosA, twosB);
    CSA256(&twosA, &ones, ones, data[i+12], data[i+13]);
    CSA256(&twosB, &ones, ones, data[i+14], data[i+15]);
    CSA256(&foursB, &twos, twos, twosA, twosB);
    CSA256(&eightsB, &fours, fours, foursA, foursB);
    CSA256(&sixteens, &eights, eights, eightsA, eightsB);

    cnt = _mm256_add_epi64(cnt, popcnt256(sixteens));
  }

  cnt = _mm256_slli_epi64(cnt, 4);
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(eights), 3));
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(fours), 2));
  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(popcnt256(twos), 1));
  cnt = _mm256_add_epi64(cnt, popcnt256(ones));

  for(; i < size; i++)
    cnt = _mm256_add_epi64(cnt, popcnt256(data[i]));

  cnt64 = (uint64_t*) &cnt;

  return cnt64[0] +
         cnt64[1] +
         cnt64[2] +
         cnt64[3];
}

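/*
 * Added note: popcnt_avx2() dereferences its input as __m256i, which
 * compiles to aligned 256-bit loads, so the buffer is first advanced
 * byte-wise to an 8-byte boundary and then word-wise to a 32-byte
 * boundary, counting the skipped bytes along the way.
 */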
/* Align memory to 32 bytes boundary */
static inline void align_avx2(const uint8_t** p, uint64_t* size, uint64_t* cnt)
{
  for (; (uintptr_t) *p % 8; (*p)++)
  {
    *cnt += popcnt64(**p);
    *size -= 1;
  }
  for (; (uintptr_t) *p % 32; (*p) += 8)
  {
    *cnt += popcnt64(
        *(const uint64_t*) *p);
    *size -= 8;
  }
}

#endif

/* x86 CPUs */
#if defined(X86_OR_X64)

/*
 * Count the number of 1 bits in the data array
 * @data: An array
 * @size: Size of data in bytes
 */
static inline uint64_t popcnt(const void* data, uint64_t size)
{
  const uint8_t* ptr = (const uint8_t*) data;
  uint64_t cnt = 0;
  uint64_t i;

#if defined(HAVE_CPUID)
  #if defined(__cplusplus)
    /* C++11 thread-safe singleton */
    static const int cpuid = get_cpuid();
  #else
    static int cpuid_ = -1;
    int cpuid = cpuid_;
    if (cpuid == -1)
    {
      cpuid = get_cpuid();
      __sync_val_compare_and_swap(&cpuid_, -1, cpuid);
    }
  #endif
#endif

#if defined(HAVE_AVX2)

  /* AVX2 requires arrays >= 512 bytes */
  if ((cpuid & bit_AVX2) &&
      size >= 512)
  {
    align_avx2(&ptr, &size, &cnt);
    cnt += popcnt_avx2((const __m256i*) ptr, size / 32);
    ptr += size - size % 32;
    size = size % 32;
  }

#endif

#if defined(HAVE_POPCNT)

  if (cpuid & bit_POPCNT)
  {
    cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8);
    ptr += size - size % 8;
    size = size % 8;
    for (i = 0; i < size; i++)
      cnt += popcnt64(ptr[i]);

    return cnt;
  }

#endif

  /* pure integer popcount algorithm */
  for (i = 0; i < size; i++)
    cnt += popcount64(ptr[i]);

  return cnt;
}

#elif defined(__ARM_NEON) || \
      defined(__aarch64__)

#include <arm_neon.h>

/*
 * Count the number of 1 bits in the data array
 * @data: An array
 * @size: Size of data in bytes
 */
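/*
 * Added note: vcntq_u8() yields per-byte popcounts; eight of them are
 * accumulated into byte lanes (at most 8 * 8 = 64, so no overflow),
 * then widened with vpaddlq_u8/vpaddlq_u16 and folded into a 64x2-bit
 * accumulator with vpadalq_u32 for each 128-byte chunk.
 */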
static inline uint64_t popcnt(const void* data, uint64_t size)
{
  const uint8_t* ptr = (const uint8_t*) data;
  uint64_t cnt = 0;
  uint64_t tmp[2];
  uint64_t chunk_size = 128;
  uint64_t n = size / chunk_size;
  uint64_t i;

  uint8x16x4_t input0;
  uint8x16x4_t input1;
  uint8x16_t t0;
  uint32x4_t t1;

  uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0));

  for (i = 0; i < n; i++, ptr += chunk_size)
  {
    input0 = vld4q_u8(ptr);
    input1 = vld4q_u8(ptr + 64);

    t0 = vcntq_u8(input0.val[0]);
    t0 = vaddq_u8(t0, vcntq_u8(input0.val[1]));
    t0 = vaddq_u8(t0, vcntq_u8(input0.val[2]));
    t0 = vaddq_u8(t0, vcntq_u8(input0.val[3]));
    t0 = vaddq_u8(t0, vcntq_u8(input1.val[0]));
    t0 = vaddq_u8(t0, vcntq_u8(input1.val[1]));
    t0 = vaddq_u8(t0, vcntq_u8(input1.val[2]));
    t0 = vaddq_u8(t0, vcntq_u8(input1.val[3]));
    t1 = vpaddlq_u16(vpaddlq_u8(t0));

    sum = vpadalq_u32(sum, t1);
  }

  vst1q_u64(tmp, sum);
  for (i = 0; i < 2; i++)
    cnt += tmp[i];

  size %= chunk_size;
  cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8);
  ptr += size - size % 8;
  size = size % 8;
  for (i = 0; i < size; i++)
    cnt += popcnt64(ptr[i]);

  return cnt;
}

/* all other CPUs */
#else

/* Align memory to 8 bytes boundary */
static inline void align(const uint8_t** p, uint64_t* size, uint64_t* cnt)
{
  for (; *size > 0 && (uintptr_t) *p % 8; (*p)++)
  {
    *cnt += popcnt64(**p);
    *size -= 1;
  }
}

/*
 * Count the number of 1 bits in the data array
 * @data: An array
 * @size: Size of data in bytes
 */
static inline uint64_t popcnt(const void* data, uint64_t size)
{
  const uint8_t* ptr = (const uint8_t*) data;
  uint64_t cnt = 0;
  uint64_t i;

  align(&ptr, &size, &cnt);
  cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8);
  ptr += size - size % 8;
  size = size % 8;
  for (i = 0; i < size; i++)
    cnt += popcnt64(ptr[i]);

  return cnt;
}

#endif

#endif /* LIBPOPCNT_H */
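
/*
 * Example usage (illustrative sketch, not part of the original header):
 * compile the snippet below as a separate .c file that includes
 * libpopcnt.h.
 *
 *   #include <stdio.h>
 *   #include "libpopcnt.h"
 *
 *   int main(void)
 *   {
 *     uint8_t data[16] = { 0xFF, 0x0F, 0x01 };    // remaining bytes are 0
 *     uint64_t bits = popcnt(data, sizeof(data)); // counts 8 + 4 + 1 = 13
 *     printf("%llu\n", (unsigned long long) bits);
 *     return 0;
 *   }
 */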