simdjson  3.11.0
Ridiculously Fast JSON
simd.h
1 #ifndef SIMDJSON_PPC64_SIMD_H
2 #define SIMDJSON_PPC64_SIMD_H
3 
4 #ifndef SIMDJSON_CONDITIONAL_INCLUDE
5 #include "simdjson/ppc64/base.h"
6 #include "simdjson/ppc64/bitmanipulation.h"
7 #include "simdjson/internal/simdprune_tables.h"
8 #endif // SIMDJSON_CONDITIONAL_INCLUDE
9 
10 #include <type_traits>
11 
12 namespace simdjson {
13 namespace ppc64 {
14 namespace {
15 namespace simd {
16 
17 using __m128i = __vector unsigned char;
18 
19 template <typename Child> struct base {
20  __m128i value;
21 
22  // Zero constructor
23  simdjson_inline base() : value{__m128i()} {}
24 
25  // Conversion from SIMD register
26  simdjson_inline base(const __m128i _value) : value(_value) {}
27 
28  // Conversion to SIMD register
29  simdjson_inline operator const __m128i &() const {
30  return this->value;
31  }
32  simdjson_inline operator __m128i &() { return this->value; }
33 
34  // Bit operations
35  simdjson_inline Child operator|(const Child other) const {
36  return vec_or(this->value, (__m128i)other);
37  }
38  simdjson_inline Child operator&(const Child other) const {
39  return vec_and(this->value, (__m128i)other);
40  }
41  simdjson_inline Child operator^(const Child other) const {
42  return vec_xor(this->value, (__m128i)other);
43  }
44  simdjson_inline Child bit_andnot(const Child other) const {
45  return vec_andc(this->value, (__m128i)other);
46  }
47  simdjson_inline Child &operator|=(const Child other) {
48  auto this_cast = static_cast<Child*>(this);
49  *this_cast = *this_cast | other;
50  return *this_cast;
51  }
52  simdjson_inline Child &operator&=(const Child other) {
53  auto this_cast = static_cast<Child*>(this);
54  *this_cast = *this_cast & other;
55  return *this_cast;
56  }
57  simdjson_inline Child &operator^=(const Child other) {
58  auto this_cast = static_cast<Child*>(this);
59  *this_cast = *this_cast ^ other;
60  return *this_cast;
61  }
62 };
63 
// Common base for all byte-wide SIMD types. `Mask` is the type returned
// by comparisons (a byte mask, 0xFF per true lane).
template <typename T, typename Mask = simd8<bool>>
struct base8 : base<simd8<T>> {
  typedef uint16_t bitmask_t;  // one bit per lane (16 lanes)
  typedef uint32_t bitmask2_t; // two chunks' worth of lane bits

  simdjson_inline base8() : base<simd8<T>>() {}
  simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}

  // Lane-wise equality; each equal lane becomes 0xFF, others 0x00.
  friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) {
    return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
  }

  // Number of bytes per register (16).
  static const int SIZE = sizeof(base<simd8<T>>::value);

  // Returns this chunk shifted forward by N bytes, with the vacated low
  // bytes filled from the end of `prev_chunk` — i.e. byte i of the result
  // is byte i-N of the concatenation prev_chunk:this.
  template <int N = 1>
  simdjson_inline simd8<T> prev(simd8<T> prev_chunk) const {
    __m128i chunk = this->value;
#ifdef __LITTLE_ENDIAN__
    // vec_sld concatenates/shifts in big-endian byte order; on LE we
    // byte-reverse the operands first (and the result afterwards) so the
    // shift behaves in logical (memory) byte order.
    chunk = (__m128i)vec_reve(this->value);
    prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
#endif
    chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
#ifdef __LITTLE_ENDIAN__
    chunk = (__m128i)vec_reve((__m128i)chunk);
#endif
    return chunk;
  }
};
92 
// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base8<bool> {
  // Broadcast 0xFF (true) or 0x00 (false) into every lane.
  static simdjson_inline simd8<bool> splat(bool _value) {
    return (__m128i)vec_splats((unsigned char)(-(!!_value)));
  }

  simdjson_inline simd8() : base8<bool>() {}
  simdjson_inline simd8(const __m128i _value)
      : base8<bool>(_value) {}
  // Splat constructor
  simdjson_inline simd8(bool _value)
      : base8<bool>(splat(_value)) {}

  // Collapse the byte mask to a 16-bit integer: bit i of the result is
  // the most significant bit of lane i.
  simdjson_inline int to_bitmask() const {
    __vector unsigned long long result;
    // Bit indices for vec_vbpermq: pick the MSB of each byte, lowest
    // memory byte first, so the gathered bits land in lane order.
    const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
                               0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

    result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
                                                       (__m128i)perm_mask));
    // vbpermq deposits the gathered bits in one 64-bit half; which half
    // holds them depends on endianness.
#ifdef __LITTLE_ENDIAN__
    return static_cast<int>(result[1]);
#else
    return static_cast<int>(result[0]);
#endif
  }
  // True if any lane is nonzero.
  simdjson_inline bool any() const {
    return !vec_all_eq(this->value, (__m128i)vec_splats(0));
  }
  // Lane-wise logical NOT (flips every bit; lanes are all-ones/all-zeros).
  simdjson_inline simd8<bool> operator~() const {
    return this->value ^ (__m128i)splat(true);
  }
};
126 
// Arithmetic, load/store, lookup and compress operations shared by the
// signed and unsigned byte specializations.
template <typename T> struct base8_numeric : base8<T> {
  // Broadcast one value into all 16 lanes.
  static simdjson_inline simd8<T> splat(T value) {
    (void)value;
    return (__m128i)vec_splats(value);
  }
  static simdjson_inline simd8<T> zero() { return splat(0); }
  // Load 16 bytes from (possibly unaligned) memory.
  static simdjson_inline simd8<T> load(const T values[16]) {
    return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t *>(values)));
  }
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdjson_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                            T v5, T v6, T v7, T v8, T v9,
                                            T v10, T v11, T v12, T v13,
                                            T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
                    v14, v15);
  }

  simdjson_inline base8_numeric() : base8<T>() {}
  simdjson_inline base8_numeric(const __m128i _value)
      : base8<T>(_value) {}

  // Store to array (16 bytes, possibly unaligned)
  simdjson_inline void store(T dst[16]) const {
    vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst));
  }

  // Override to distinguish from bool version
  simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  // Addition/subtraction are the same for signed and unsigned
  simdjson_inline simd8<T> operator+(const simd8<T> other) const {
    return (__m128i)((__m128i)this->value + (__m128i)other);
  }
  simdjson_inline simd8<T> operator-(const simd8<T> other) const {
    return (__m128i)((__m128i)this->value - (__m128i)other);
  }
  simdjson_inline simd8<T> &operator+=(const simd8<T> other) {
    *this = *this + other;
    return *static_cast<simd8<T> *>(this);
  }
  simdjson_inline simd8<T> &operator-=(const simd8<T> other) {
    *this = *this - other;
    return *static_cast<simd8<T> *>(this);
  }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values)
  template <typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
  }

  // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted
  // as a bitset). Passing a 0 value for mask would be equivalent to writing out
  // every byte to output. Only the first 16 - count_ones(mask) bytes of the
  // result are significant but 16 bytes get written. Design consideration: it
  // seems like a function with the signature simd8<L> compress(uint32_t mask)
  // would be sensible, but the AVX ISA makes this kind of approach difficult.
  template <typename L>
  simdjson_inline void compress(uint16_t mask, L *output) const {
    using internal::BitsSetTable256mul2;
    using internal::pshufb_combine_table;
    using internal::thintable_epi8;
    // this particular implementation was inspired by work done by @animetosho
    // we do it in two steps, first 8 bytes and then second 8 bytes
    uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
    uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
    // next line just loads the 64-bit values thintable_epi8[mask1] and
    // thintable_epi8[mask2] into a 128-bit register, using only
    // two instructions on most compilers.
#ifdef __LITTLE_ENDIAN__
    __m128i shufmask = (__m128i)(__vector unsigned long long){
        thintable_epi8[mask1], thintable_epi8[mask2]};
#else
    // big-endian: swap the halves, then byte-reverse so the shuffle
    // indices end up in the same memory order as on little-endian.
    __m128i shufmask = (__m128i)(__vector unsigned long long){
        thintable_epi8[mask2], thintable_epi8[mask1]};
    shufmask = (__m128i)vec_reve((__m128i)shufmask);
#endif
    // we increment by 0x08 the second half of the mask so its indices
    // point into the upper 8 source bytes
    shufmask = ((__m128i)shufmask) +
               ((__m128i)(__vector int){0, 0, 0x08080808, 0x08080808});

    // this is the version "nearly pruned": each 8-byte half compacted
    // independently
    __m128i pruned = vec_perm(this->value, this->value, shufmask);
    // we still need to put the two halves together.
    // we compute the popcount of the first half:
    int pop1 = BitsSetTable256mul2[mask1];
    // then load the corresponding mask, what it does is to write
    // only the first pop1 bytes from the first 8 bytes, and then
    // it fills in with the bytes from the second 8 bytes + some filling
    // at the end.
    __m128i compactmask =
        vec_vsx_ld(0, reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
    __m128i answer = vec_perm(pruned, (__m128i)vec_splats(0), compactmask);
    vec_vsx_st(answer, 0, reinterpret_cast<__m128i *>(output));
  }

  // Convenience overload: build the 16-entry lookup table inline.
  template <typename L>
  simdjson_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }
};
237 
238 // Signed bytes
239 template <> struct simd8<int8_t> : base8_numeric<int8_t> {
240  simdjson_inline simd8() : base8_numeric<int8_t>() {}
241  simdjson_inline simd8(const __m128i _value)
242  : base8_numeric<int8_t>(_value) {}
243  // Splat constructor
244  simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
245  // Array constructor
246  simdjson_inline simd8(const int8_t *values) : simd8(load(values)) {}
247  // Member-by-member initialization
248  simdjson_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
249  int8_t v4, int8_t v5, int8_t v6, int8_t v7,
250  int8_t v8, int8_t v9, int8_t v10, int8_t v11,
251  int8_t v12, int8_t v13, int8_t v14, int8_t v15)
252  : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
253  v8, v9, v10, v11, v12, v13, v14,
254  v15}) {}
255  // Repeat 16 values as many times as necessary (usually for lookup tables)
256  simdjson_inline static simd8<int8_t>
257  repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
258  int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
259  int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
260  return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
261  v13, v14, v15);
262  }
263 
264  // Order-sensitive comparisons
265  simdjson_inline simd8<int8_t>
266  max_val(const simd8<int8_t> other) const {
267  return (__m128i)vec_max((__vector signed char)this->value,
268  (__vector signed char)(__m128i)other);
269  }
270  simdjson_inline simd8<int8_t>
271  min_val(const simd8<int8_t> other) const {
272  return (__m128i)vec_min((__vector signed char)this->value,
273  (__vector signed char)(__m128i)other);
274  }
275  simdjson_inline simd8<bool>
276  operator>(const simd8<int8_t> other) const {
277  return (__m128i)vec_cmpgt((__vector signed char)this->value,
278  (__vector signed char)(__m128i)other);
279  }
280  simdjson_inline simd8<bool>
281  operator<(const simd8<int8_t> other) const {
282  return (__m128i)vec_cmplt((__vector signed char)this->value,
283  (__vector signed char)(__m128i)other);
284  }
285 };
286 
287 // Unsigned bytes
288 template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
289  simdjson_inline simd8() : base8_numeric<uint8_t>() {}
290  simdjson_inline simd8(const __m128i _value)
291  : base8_numeric<uint8_t>(_value) {}
292  // Splat constructor
293  simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
294  // Array constructor
295  simdjson_inline simd8(const uint8_t *values) : simd8(load(values)) {}
296  // Member-by-member initialization
297  simdjson_inline
298  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
299  uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
300  uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
301  : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
302  v13, v14, v15}) {}
303  // Repeat 16 values as many times as necessary (usually for lookup tables)
304  simdjson_inline static simd8<uint8_t>
305  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
306  uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
307  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
308  uint8_t v15) {
309  return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
310  v13, v14, v15);
311  }
312 
313  // Saturated math
314  simdjson_inline simd8<uint8_t>
315  saturating_add(const simd8<uint8_t> other) const {
316  return (__m128i)vec_adds(this->value, (__m128i)other);
317  }
318  simdjson_inline simd8<uint8_t>
319  saturating_sub(const simd8<uint8_t> other) const {
320  return (__m128i)vec_subs(this->value, (__m128i)other);
321  }
322 
323  // Order-specific operations
324  simdjson_inline simd8<uint8_t>
325  max_val(const simd8<uint8_t> other) const {
326  return (__m128i)vec_max(this->value, (__m128i)other);
327  }
328  simdjson_inline simd8<uint8_t>
329  min_val(const simd8<uint8_t> other) const {
330  return (__m128i)vec_min(this->value, (__m128i)other);
331  }
332  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
333  simdjson_inline simd8<uint8_t>
334  gt_bits(const simd8<uint8_t> other) const {
335  return this->saturating_sub(other);
336  }
337  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
338  simdjson_inline simd8<uint8_t>
339  lt_bits(const simd8<uint8_t> other) const {
340  return other.saturating_sub(*this);
341  }
342  simdjson_inline simd8<bool>
343  operator<=(const simd8<uint8_t> other) const {
344  return other.max_val(*this) == other;
345  }
346  simdjson_inline simd8<bool>
347  operator>=(const simd8<uint8_t> other) const {
348  return other.min_val(*this) == other;
349  }
350  simdjson_inline simd8<bool>
351  operator>(const simd8<uint8_t> other) const {
352  return this->gt_bits(other).any_bits_set();
353  }
354  simdjson_inline simd8<bool>
355  operator<(const simd8<uint8_t> other) const {
356  return this->gt_bits(other).any_bits_set();
357  }
358 
359  // Bit-specific operations
360  simdjson_inline simd8<bool> bits_not_set() const {
361  return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
362  }
363  simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
364  return (*this & bits).bits_not_set();
365  }
366  simdjson_inline simd8<bool> any_bits_set() const {
367  return ~this->bits_not_set();
368  }
369  simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
370  return ~this->bits_not_set(bits);
371  }
372  simdjson_inline bool bits_not_set_anywhere() const {
373  return vec_all_eq(this->value, (__m128i)vec_splats(0));
374  }
375  simdjson_inline bool any_bits_set_anywhere() const {
376  return !bits_not_set_anywhere();
377  }
378  simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
379  return vec_all_eq(vec_and(this->value, (__m128i)bits),
380  (__m128i)vec_splats(0));
381  }
382  simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
383  return !bits_not_set_anywhere(bits);
384  }
385  template <int N> simdjson_inline simd8<uint8_t> shr() const {
386  return simd8<uint8_t>(
387  (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
388  }
389  template <int N> simdjson_inline simd8<uint8_t> shl() const {
390  return simd8<uint8_t>(
391  (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
392  }
393 };
394 
// Four SIMD registers covering one 64-byte block of input, processed as a unit.
template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 4,
                "PPC64 kernel should use four registers per 64-byte block.");
  const simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T>& other) = delete; // no assignment allowed
  simd8x64() = delete; // no default constructor allowed

  simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
                           const simd8<T> chunk2, const simd8<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  // Load 64 consecutive bytes, 16 per register.
  simdjson_inline simd8x64(const T ptr[64])
      : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr + 16),
               simd8<T>::load(ptr + 32), simd8<T>::load(ptr + 48)} {}

  // Store all 64 bytes back to memory.
  simdjson_inline void store(T ptr[64]) const {
    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1);
    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2);
    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3);
  }

  // OR of all four registers (e.g. to test a property over the whole block).
  simdjson_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  // Writes to `output` the bytes whose corresponding mask bit is 0,
  // contiguously; returns the number of bytes kept. Each chunk writes at
  // 16*k minus the count of bytes already removed in lower chunks, so the
  // surviving bytes pack together.
  simdjson_inline uint64_t compress(uint64_t mask, T *output) const {
    this->chunks[0].compress(uint16_t(mask), output);
    this->chunks[1].compress(uint16_t(mask >> 16),
                             output + 16 - count_ones(mask & 0xFFFF));
    this->chunks[2].compress(uint16_t(mask >> 32),
                             output + 32 - count_ones(mask & 0xFFFFFFFF));
    this->chunks[3].compress(uint16_t(mask >> 48),
                             output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
    return 64 - count_ones(mask);
  }

  // One bit per byte of the 64-byte block: chunk k supplies bits
  // [16k, 16k+16).
  simdjson_inline uint64_t to_bitmask() const {
    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r1 = this->chunks[1].to_bitmask();
    uint64_t r2 = this->chunks[2].to_bitmask();
    uint64_t r3 = this->chunks[3].to_bitmask();
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }

  // 64-bit mask of bytes equal to m.
  simdjson_inline uint64_t eq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
                          this->chunks[2] == mask, this->chunks[3] == mask)
        .to_bitmask();
  }

  // 64-bit mask of bytes equal to the corresponding byte of `other`.
  simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
    return simd8x64<bool>(this->chunks[0] == other.chunks[0],
                          this->chunks[1] == other.chunks[1],
                          this->chunks[2] == other.chunks[2],
                          this->chunks[3] == other.chunks[3])
        .to_bitmask();
  }

  // 64-bit mask of bytes less than or equal to m.
  simdjson_inline uint64_t lteq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
                          this->chunks[2] <= mask, this->chunks[3] <= mask)
        .to_bitmask();
  }
}; // struct simd8x64<T>
466 
467 } // namespace simd
468 } // unnamed namespace
469 } // namespace ppc64
470 } // namespace simdjson
471 
#endif // SIMDJSON_PPC64_SIMD_H
simdjson_unused simdjson_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept
Comparisons between raw_json_string and std::string_view instances are potentially unsafe: the user i...
The top level simdjson namespace, containing everything the library provides.
Definition: base.h:8