simdjson  3.11.0
Ridiculously Fast JSON
simd.h
1 #ifndef SIMDJSON_HASWELL_SIMD_H
2 #define SIMDJSON_HASWELL_SIMD_H
3 
4 #ifndef SIMDJSON_CONDITIONAL_INCLUDE
5 #include "simdjson/haswell/base.h"
6 #include "simdjson/haswell/intrinsics.h"
7 #include "simdjson/haswell/bitmanipulation.h"
8 #include "simdjson/internal/simdprune_tables.h"
9 #endif // SIMDJSON_CONDITIONAL_INCLUDE
10 
11 namespace simdjson {
12 namespace haswell {
13 namespace {
14 namespace simd {
15 
16  // Forward-declared so they can be used by splat and friends.
17  template<typename Child>
18  struct base {
19  __m256i value;
20 
21  // Zero constructor
22  simdjson_inline base() : value{__m256i()} {}
23 
24  // Conversion from SIMD register
25  simdjson_inline base(const __m256i _value) : value(_value) {}
26 
27  // Conversion to SIMD register
28  simdjson_inline operator const __m256i&() const { return this->value; }
29  simdjson_inline operator __m256i&() { return this->value; }
30 
31  // Bit operations
32  simdjson_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); }
33  simdjson_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
34  simdjson_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
35  simdjson_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); }
36  simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
37  simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
38  simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
39  };
40 
41  // Forward declaration, so that base8 and helpers such as splat can name simd8<T>
// before any of its specializations are defined.
42  template<typename T>
43  struct simd8;
44 
45  template<typename T, typename Mask=simd8<bool>>
46  struct base8: base<simd8<T>> {
47  typedef uint32_t bitmask_t;
48  typedef uint64_t bitmask2_t;
49 
50  simdjson_inline base8() : base<simd8<T>>() {}
51  simdjson_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
52 
53  friend simdjson_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm256_cmpeq_epi8(lhs, rhs); }
54 
55  static const int SIZE = sizeof(base<T>::value);
56 
57  template<int N=1>
58  simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
59  return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
60  }
61  };
62 
63  // SIMD byte mask type (returned by things like eq and gt)
64  template<>
65  struct simd8<bool>: base8<bool> {
66  static simdjson_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); }
67 
68  simdjson_inline simd8() : base8() {}
69  simdjson_inline simd8(const __m256i _value) : base8<bool>(_value) {}
70  // Splat constructor
71  simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
72 
73  simdjson_inline int to_bitmask() const { return _mm256_movemask_epi8(*this); }
74  simdjson_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
75  simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
76  };
77 
// Arithmetic, load/store and table-lookup operations shared by the numeric byte
// vectors: simd8<int8_t> and simd8<uint8_t> both derive from this template.
78  template<typename T>
79  struct base8_numeric: base8<T> {
// Broadcast a single value to every byte lane.
80  static simdjson_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
// All-zero vector.
81  static simdjson_inline simd8<T> zero() { return _mm256_setzero_si256(); }
// Unaligned load of 32 bytes starting at values.
82  static simdjson_inline simd8<T> load(const T values[32]) {
83  return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
84  }
85  // Repeat 16 values as many times as necessary (usually for lookup tables):
// the same 16 values fill lanes 0-15 and lanes 16-31.
86  static simdjson_inline simd8<T> repeat_16(
87  T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
88  T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
89  ) {
90  return simd8<T>(
91  v0, v1, v2, v3, v4, v5, v6, v7,
92  v8, v9, v10,v11,v12,v13,v14,v15,
93  v0, v1, v2, v3, v4, v5, v6, v7,
94  v8, v9, v10,v11,v12,v13,v14,v15
95  );
96  }
97 
98  simdjson_inline base8_numeric() : base8<T>() {}
99  simdjson_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
100 
101  // Store to array (unaligned write of the full 32-byte register to dst)
102  simdjson_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
103 
104  // Addition/subtraction are the same for signed and unsigned
105  simdjson_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
106  simdjson_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
107  simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
108  simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
109 
110  // Override to distinguish from bool version (flips every bit by XOR-ing with a splat of 0xFF)
111  simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
112 
113  // Perform a lookup assuming the value is between 0 and 15, i.e. a valid index into the
// 16-entry table held in each 128-bit half of lookup_table (results for out of range
// values are not defined by this interface)
114  template<typename L>
115  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
116  return _mm256_shuffle_epi8(lookup_table, *this);
117  }
118 
119  // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
120  // Passing a 0 value for mask would be equivalent to writing out every byte to output.
121  // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
122  // get written.
123  // Design consideration: it seems like a function with the
124  // signature simd8<L> compress(uint32_t mask) would be
125  // sensible, but the AVX ISA makes this kind of approach difficult.
126  template<typename L>
127  simdjson_inline void compress(uint32_t mask, L * output) const {
128  using internal::thintable_epi8;
129  using internal::BitsSetTable256mul2;
130  using internal::pshufb_combine_table;
131  // this particular implementation was inspired by work done by @animetosho
132  // we do it in four steps, first 8 bytes and then second 8 bytes...
133  uint8_t mask1 = uint8_t(mask); // least significant 8 bits
134  uint8_t mask2 = uint8_t(mask >> 8); // second least significant 8 bits
135  uint8_t mask3 = uint8_t(mask >> 16); // ...
136  uint8_t mask4 = uint8_t(mask >> 24); // ...
137  // next line gathers the four 64-bit values thintable_epi8[mask1] through
138  // thintable_epi8[mask4] into one 256-bit register, with the mask1 entry
139  // in the lowest 64 bits.
140  __m256i shufmask = _mm256_set_epi64x(thintable_epi8[mask4], thintable_epi8[mask3],
141  thintable_epi8[mask2], thintable_epi8[mask1]);
142  // we increment by 0x08 the second half of the mask and so forth
// (0x08, 0x10 and 0x18 are added to the 2nd, 3rd and 4th group of 8 bytes)
143  shufmask =
144  _mm256_add_epi8(shufmask, _mm256_set_epi32(0x18181818, 0x18181818,
145  0x10101010, 0x10101010, 0x08080808, 0x08080808, 0, 0));
146  // this is the version "nearly pruned"
147  __m256i pruned = _mm256_shuffle_epi8(*this, shufmask);
148  // we still need to put the pieces back together.
149  // we compute the popcount of the first words:
// NOTE(review): going by its name, BitsSetTable256mul2 presumably holds twice the
// bit count of its index, so that pop * 8 below is a byte offset into
// pshufb_combine_table -- confirm against simdprune_tables.h.
150  int pop1 = BitsSetTable256mul2[mask1];
151  int pop3 = BitsSetTable256mul2[mask3];
152 
153  // then load the corresponding mask
154  // could be done with _mm256_loadu2_m128i but many standard libraries omit this intrinsic.
155  __m256i v256 = _mm256_castsi128_si256(
156  _mm_loadu_si128(reinterpret_cast<const __m128i *>(pshufb_combine_table + pop1 * 8)));
157  __m256i compactmask = _mm256_insertf128_si256(v256,
158  _mm_loadu_si128(reinterpret_cast<const __m128i *>(pshufb_combine_table + pop3 * 8)), 1);
159  __m256i almostthere = _mm256_shuffle_epi8(pruned, compactmask);
160  // We just need to write out the result.
161  // This is the tricky bit that is hard to do
162  // if we want to return a SIMD register, since there
163  // is no single-instruction approach to recombine
164  // the two 128-bit lanes with an offset.
165  __m128i v128;
166  v128 = _mm256_castsi256_si128(almostthere);
167  _mm_storeu_si128( reinterpret_cast<__m128i *>(output), v128);
168  v128 = _mm256_extractf128_si256(almostthere, 1);
// the upper 128-bit half is written right after the bytes kept from the lower half:
// 16 minus the number of bits set in the low 16 bits of the mask.
169  _mm_storeu_si128( reinterpret_cast<__m128i *>(output + 16 - count_ones(mask & 0xFFFF)), v128);
170  }
171 
// Convenience overload: builds the table with simd8<L>::repeat_16 from the 16 given
// values and forwards to the lookup_16 overload that takes a ready-made table.
172  template<typename L>
173  simdjson_inline simd8<L> lookup_16(
174  L replace0, L replace1, L replace2, L replace3,
175  L replace4, L replace5, L replace6, L replace7,
176  L replace8, L replace9, L replace10, L replace11,
177  L replace12, L replace13, L replace14, L replace15) const {
178  return lookup_16(simd8<L>::repeat_16(
179  replace0, replace1, replace2, replace3,
180  replace4, replace5, replace6, replace7,
181  replace8, replace9, replace10, replace11,
182  replace12, replace13, replace14, replace15
183  ));
184  }
185  };
186 
187  // Signed bytes
188  template<>
189  struct simd8<int8_t> : base8_numeric<int8_t> {
190  simdjson_inline simd8() : base8_numeric<int8_t>() {}
191  simdjson_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
192  // Splat constructor
193  simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
194  // Array constructor
195  simdjson_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
196  // Member-by-member initialization
197  simdjson_inline simd8(
198  int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
199  int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
200  int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
201  int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
202  ) : simd8(_mm256_setr_epi8(
203  v0, v1, v2, v3, v4, v5, v6, v7,
204  v8, v9, v10,v11,v12,v13,v14,v15,
205  v16,v17,v18,v19,v20,v21,v22,v23,
206  v24,v25,v26,v27,v28,v29,v30,v31
207  )) {}
208  // Repeat 16 values as many times as necessary (usually for lookup tables)
209  simdjson_inline static simd8<int8_t> repeat_16(
210  int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
211  int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
212  ) {
213  return simd8<int8_t>(
214  v0, v1, v2, v3, v4, v5, v6, v7,
215  v8, v9, v10,v11,v12,v13,v14,v15,
216  v0, v1, v2, v3, v4, v5, v6, v7,
217  v8, v9, v10,v11,v12,v13,v14,v15
218  );
219  }
220 
221  // Order-sensitive comparisons
222  simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }
223  simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
224  simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
225  simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); }
226  };
227 
228  // Unsigned bytes
229  template<>
230  struct simd8<uint8_t>: base8_numeric<uint8_t> {
231  simdjson_inline simd8() : base8_numeric<uint8_t>() {}
232  simdjson_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
233  // Splat constructor
234  simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
235  // Array constructor
236  simdjson_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
237  // Member-by-member initialization
238  simdjson_inline simd8(
239  uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
240  uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
241  uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
242  uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
243  ) : simd8(_mm256_setr_epi8(
244  v0, v1, v2, v3, v4, v5, v6, v7,
245  v8, v9, v10,v11,v12,v13,v14,v15,
246  v16,v17,v18,v19,v20,v21,v22,v23,
247  v24,v25,v26,v27,v28,v29,v30,v31
248  )) {}
249  // Repeat 16 values as many times as necessary (usually for lookup tables)
250  simdjson_inline static simd8<uint8_t> repeat_16(
251  uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
252  uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
253  ) {
254  return simd8<uint8_t>(
255  v0, v1, v2, v3, v4, v5, v6, v7,
256  v8, v9, v10,v11,v12,v13,v14,v15,
257  v0, v1, v2, v3, v4, v5, v6, v7,
258  v8, v9, v10,v11,v12,v13,v14,v15
259  );
260  }
261 
262  // Saturated math
263  simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
264  simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
265 
266  // Order-specific operations
267  simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); }
268  simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); }
269  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
270  simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
271  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
272  simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
273  simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
274  simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
275  simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
276  simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
277 
278  // Bit-specific operations
279  simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
280  simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
281  simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
282  simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
283  simdjson_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
284  simdjson_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
285  simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
286  simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); }
287  simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
288  template<int N>
289  simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
290  template<int N>
291  simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
292  // Get one of the bits and make a bitmask out of it.
293  // e.g. value.get_bit<7>() gets the high bit
294  template<int N>
295  simdjson_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); }
296  };
297 
298  template<typename T>
299  struct simd8x64 {
300  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
301  static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
302  const simd8<T> chunks[NUM_CHUNKS];
303 
304  simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
305  simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
306  simd8x64() = delete; // no default constructor allowed
307 
308  simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
309  simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
310 
311  simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
312  uint32_t mask1 = uint32_t(mask);
313  uint32_t mask2 = uint32_t(mask >> 32);
314  this->chunks[0].compress(mask1, output);
315  this->chunks[1].compress(mask2, output + 32 - count_ones(mask1));
316  return 64 - count_ones(mask);
317  }
318 
319  simdjson_inline void store(T ptr[64]) const {
320  this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
321  this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
322  }
323 
324  simdjson_inline uint64_t to_bitmask() const {
325  uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
326  uint64_t r_hi = this->chunks[1].to_bitmask();
327  return r_lo | (r_hi << 32);
328  }
329 
330  simdjson_inline simd8<T> reduce_or() const {
331  return this->chunks[0] | this->chunks[1];
332  }
333 
334  simdjson_inline simd8x64<T> bit_or(const T m) const {
335  const simd8<T> mask = simd8<T>::splat(m);
336  return simd8x64<T>(
337  this->chunks[0] | mask,
338  this->chunks[1] | mask
339  );
340  }
341 
342  simdjson_inline uint64_t eq(const T m) const {
343  const simd8<T> mask = simd8<T>::splat(m);
344  return simd8x64<bool>(
345  this->chunks[0] == mask,
346  this->chunks[1] == mask
347  ).to_bitmask();
348  }
349 
350  simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
351  return simd8x64<bool>(
352  this->chunks[0] == other.chunks[0],
353  this->chunks[1] == other.chunks[1]
354  ).to_bitmask();
355  }
356 
357  simdjson_inline uint64_t lteq(const T m) const {
358  const simd8<T> mask = simd8<T>::splat(m);
359  return simd8x64<bool>(
360  this->chunks[0] <= mask,
361  this->chunks[1] <= mask
362  ).to_bitmask();
363  }
364  }; // struct simd8x64<T>
365 
366 } // namespace simd
367 
368 } // unnamed namespace
369 } // namespace haswell
370 } // namespace simdjson
371 
372 #endif // SIMDJSON_HASWELL_SIMD_H
simdjson_unused simdjson_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept
Comparisons between raw_json_string and std::string_view instances are potentially unsafe: the user i...
The top level simdjson namespace, containing everything the library provides.
Definition: base.h:8