simdjson  3.11.0
Ridiculously Fast JSON
simd.h
1 #ifndef SIMDJSON_WESTMERE_SIMD_H
2 #define SIMDJSON_WESTMERE_SIMD_H
3 
4 #ifndef SIMDJSON_CONDITIONAL_INCLUDE
5 #include "simdjson/westmere/base.h"
6 #include "simdjson/westmere/bitmanipulation.h"
7 #include "simdjson/internal/simdprune_tables.h"
8 #endif // SIMDJSON_CONDITIONAL_INCLUDE
9 
10 namespace simdjson {
11 namespace westmere {
12 namespace {
13 namespace simd {
14 
15  template<typename Child>
16  struct base {
17  __m128i value;
18 
19  // Zero constructor
20  simdjson_inline base() : value{__m128i()} {}
21 
22  // Conversion from SIMD register
23  simdjson_inline base(const __m128i _value) : value(_value) {}
24 
25  // Conversion to SIMD register
26  simdjson_inline operator const __m128i&() const { return this->value; }
27  simdjson_inline operator __m128i&() { return this->value; }
28 
29  // Bit operations
30  simdjson_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
31  simdjson_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
32  simdjson_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
33  simdjson_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
34  simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
35  simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
36  simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
37  };
38 
39  template<typename T, typename Mask=simd8<bool>>
40  struct base8: base<simd8<T>> {
41  typedef uint16_t bitmask_t;
42  typedef uint32_t bitmask2_t;
43 
44  simdjson_inline base8() : base<simd8<T>>() {}
45  simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
46 
47  friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
48 
49  static const int SIZE = sizeof(base<simd8<T>>::value);
50 
51  template<int N=1>
52  simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
53  return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
54  }
55  };
56 
57  // SIMD byte mask type (returned by things like eq and gt)
58  template<>
59  struct simd8<bool>: base8<bool> {
60  static simdjson_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
61 
62  simdjson_inline simd8() : base8() {}
63  simdjson_inline simd8(const __m128i _value) : base8<bool>(_value) {}
64  // Splat constructor
65  simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
66 
67  simdjson_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
68  simdjson_inline bool any() const { return !_mm_testz_si128(*this, *this); }
69  simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
70  };
71 
72  template<typename T>
73  struct base8_numeric: base8<T> {
74  static simdjson_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
75  static simdjson_inline simd8<T> zero() { return _mm_setzero_si128(); }
76  static simdjson_inline simd8<T> load(const T values[16]) {
77  return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
78  }
79  // Repeat 16 values as many times as necessary (usually for lookup tables)
80  static simdjson_inline simd8<T> repeat_16(
81  T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
82  T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
83  ) {
84  return simd8<T>(
85  v0, v1, v2, v3, v4, v5, v6, v7,
86  v8, v9, v10,v11,v12,v13,v14,v15
87  );
88  }
89 
90  simdjson_inline base8_numeric() : base8<T>() {}
91  simdjson_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
92 
93  // Store to array
94  simdjson_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
95 
96  // Override to distinguish from bool version
97  simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
98 
99  // Addition/subtraction are the same for signed and unsigned
100  simdjson_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
101  simdjson_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
102  simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
103  simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
104 
105  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
106  template<typename L>
107  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
108  return _mm_shuffle_epi8(lookup_table, *this);
109  }
110 
111  // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
112  // Passing a 0 value for mask would be equivalent to writing out every byte to output.
113  // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
114  // get written.
115  // Design consideration: it seems like a function with the
116  // signature simd8<L> compress(uint32_t mask) would be
117  // sensible, but the AVX ISA makes this kind of approach difficult.
118  template<typename L>
119  simdjson_inline void compress(uint16_t mask, L * output) const {
120  using internal::thintable_epi8;
121  using internal::BitsSetTable256mul2;
122  using internal::pshufb_combine_table;
123  // this particular implementation was inspired by work done by @animetosho
124  // we do it in two steps, first 8 bytes and then second 8 bytes
125  uint8_t mask1 = uint8_t(mask); // least significant 8 bits
126  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
127  // next line just loads the 64-bit values thintable_epi8[mask1] and
128  // thintable_epi8[mask2] into a 128-bit register, using only
129  // two instructions on most compilers.
130  __m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]);
131  // we increment by 0x08 the second half of the mask
132  shufmask =
133  _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
134  // this is the version "nearly pruned"
135  __m128i pruned = _mm_shuffle_epi8(*this, shufmask);
136  // we still need to put the two halves together.
137  // we compute the popcount of the first half:
138  int pop1 = BitsSetTable256mul2[mask1];
139  // then load the corresponding mask, what it does is to write
140  // only the first pop1 bytes from the first 8 bytes, and then
141  // it fills in with the bytes from the second 8 bytes + some filling
142  // at the end.
143  __m128i compactmask =
144  _mm_loadu_si128(reinterpret_cast<const __m128i *>(pshufb_combine_table + pop1 * 8));
145  __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
146  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
147  }
148 
149  template<typename L>
150  simdjson_inline simd8<L> lookup_16(
151  L replace0, L replace1, L replace2, L replace3,
152  L replace4, L replace5, L replace6, L replace7,
153  L replace8, L replace9, L replace10, L replace11,
154  L replace12, L replace13, L replace14, L replace15) const {
155  return lookup_16(simd8<L>::repeat_16(
156  replace0, replace1, replace2, replace3,
157  replace4, replace5, replace6, replace7,
158  replace8, replace9, replace10, replace11,
159  replace12, replace13, replace14, replace15
160  ));
161  }
162  };
163 
164  // Signed bytes
165  template<>
166  struct simd8<int8_t> : base8_numeric<int8_t> {
167  simdjson_inline simd8() : base8_numeric<int8_t>() {}
168  simdjson_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
169  // Splat constructor
170  simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
171  // Array constructor
172  simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
173  // Member-by-member initialization
174  simdjson_inline simd8(
175  int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
176  int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
177  ) : simd8(_mm_setr_epi8(
178  v0, v1, v2, v3, v4, v5, v6, v7,
179  v8, v9, v10,v11,v12,v13,v14,v15
180  )) {}
181  // Repeat 16 values as many times as necessary (usually for lookup tables)
182  simdjson_inline static simd8<int8_t> repeat_16(
183  int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
184  int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
185  ) {
186  return simd8<int8_t>(
187  v0, v1, v2, v3, v4, v5, v6, v7,
188  v8, v9, v10,v11,v12,v13,v14,v15
189  );
190  }
191 
192  // Order-sensitive comparisons
193  simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
194  simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
195  simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
196  simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
197  };
198 
199  // Unsigned bytes
200  template<>
201  struct simd8<uint8_t>: base8_numeric<uint8_t> {
202  simdjson_inline simd8() : base8_numeric<uint8_t>() {}
203  simdjson_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
204  // Splat constructor
205  simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
206  // Array constructor
207  simdjson_inline simd8(const uint8_t* values) : simd8(load(values)) {}
208  // Member-by-member initialization
209  simdjson_inline simd8(
210  uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
211  uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
212  ) : simd8(_mm_setr_epi8(
213  v0, v1, v2, v3, v4, v5, v6, v7,
214  v8, v9, v10,v11,v12,v13,v14,v15
215  )) {}
216  // Repeat 16 values as many times as necessary (usually for lookup tables)
217  simdjson_inline static simd8<uint8_t> repeat_16(
218  uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
219  uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
220  ) {
221  return simd8<uint8_t>(
222  v0, v1, v2, v3, v4, v5, v6, v7,
223  v8, v9, v10,v11,v12,v13,v14,v15
224  );
225  }
226 
227  // Saturated math
228  simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
229  simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); }
230 
231  // Order-specific operations
232  simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); }
233  simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); }
234  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
235  simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
236  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
237  simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
238  simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
239  simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
240  simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
241  simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
242 
243  // Bit-specific operations
244  simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
245  simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
246  simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
247  simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
248  simdjson_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
249  simdjson_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
250  simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
251  simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); }
252  simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
253  template<int N>
254  simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
255  template<int N>
256  simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
257  // Get one of the bits and make a bitmask out of it.
258  // e.g. value.get_bit<7>() gets the high bit
259  template<int N>
260  simdjson_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
261  };
262 
263  template<typename T>
264  struct simd8x64 {
265  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
266  static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
267  const simd8<T> chunks[NUM_CHUNKS];
268 
269  simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
270  simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
271  simd8x64() = delete; // no default constructor allowed
272 
273  simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
274  simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
275 
276  simdjson_inline void store(T ptr[64]) const {
277  this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
278  this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
279  this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
280  this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
281  }
282 
283  simdjson_inline simd8<T> reduce_or() const {
284  return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
285  }
286 
287  simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
288  this->chunks[0].compress(uint16_t(mask), output);
289  this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF));
290  this->chunks[2].compress(uint16_t(mask >> 32), output + 32 - count_ones(mask & 0xFFFFFFFF));
291  this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
292  return 64 - count_ones(mask);
293  }
294 
295  simdjson_inline uint64_t to_bitmask() const {
296  uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
297  uint64_t r1 = this->chunks[1].to_bitmask() ;
298  uint64_t r2 = this->chunks[2].to_bitmask() ;
299  uint64_t r3 = this->chunks[3].to_bitmask() ;
300  return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
301  }
302 
303  simdjson_inline uint64_t eq(const T m) const {
304  const simd8<T> mask = simd8<T>::splat(m);
305  return simd8x64<bool>(
306  this->chunks[0] == mask,
307  this->chunks[1] == mask,
308  this->chunks[2] == mask,
309  this->chunks[3] == mask
310  ).to_bitmask();
311  }
312 
313  simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
314  return simd8x64<bool>(
315  this->chunks[0] == other.chunks[0],
316  this->chunks[1] == other.chunks[1],
317  this->chunks[2] == other.chunks[2],
318  this->chunks[3] == other.chunks[3]
319  ).to_bitmask();
320  }
321 
322  simdjson_inline uint64_t lteq(const T m) const {
323  const simd8<T> mask = simd8<T>::splat(m);
324  return simd8x64<bool>(
325  this->chunks[0] <= mask,
326  this->chunks[1] <= mask,
327  this->chunks[2] <= mask,
328  this->chunks[3] <= mask
329  ).to_bitmask();
330  }
331  }; // struct simd8x64<T>
332 
333 } // namespace simd
334 } // unnamed namespace
335 } // namespace westmere
336 } // namespace simdjson
337 
#endif // SIMDJSON_WESTMERE_SIMD_H
simdjson_unused simdjson_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept
Comparisons between raw_json_string and std::string_view instances are potentially unsafe: the user i...
The top level simdjson namespace, containing everything the library provides.
Definition: base.h:8