simdjson  3.11.0
Ridiculously Fast JSON
simd.h
#ifndef SIMDJSON_ARM64_SIMD_H
#define SIMDJSON_ARM64_SIMD_H

#ifndef SIMDJSON_CONDITIONAL_INCLUDE
#include "simdjson/arm64/base.h"
#include "simdjson/arm64/bitmanipulation.h"
#include "simdjson/internal/simdprune_tables.h"
#endif // SIMDJSON_CONDITIONAL_INCLUDE

namespace simdjson {
namespace arm64 {
namespace {
namespace simd {

#if SIMDJSON_REGULAR_VISUAL_STUDIO
namespace {
// Start of private section with Visual Studio workaround


#ifndef simdjson_make_uint8x16_t
#define simdjson_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                 x13, x14, x15, x16) \
  ([=]() { \
    uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
                         x9, x10, x11, x12, x13, x14, x15, x16}; \
    return vld1q_u8(array); \
  }())
#endif
#ifndef simdjson_make_int8x16_t
#define simdjson_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                x13, x14, x15, x16) \
  ([=]() { \
    int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
                        x9, x10, x11, x12, x13, x14, x15, x16}; \
    return vld1q_s8(array); \
  }())
#endif

#ifndef simdjson_make_uint8x8_t
#define simdjson_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() { \
    uint8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
    return vld1_u8(array); \
  }())
#endif
#ifndef simdjson_make_int8x8_t
#define simdjson_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() { \
    int8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
    return vld1_s8(array); \
  }())
#endif
#ifndef simdjson_make_uint16x8_t
#define simdjson_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() { \
    uint16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
    return vld1q_u16(array); \
  }())
#endif
#ifndef simdjson_make_int16x8_t
#define simdjson_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() { \
    int16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
    return vld1q_s16(array); \
  }())
#endif

// End of private section with Visual Studio workaround
} // namespace
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
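// Illustrative note (added commentary, not part of the original header): MSVC does not
// accept brace initialization of NEON vector types, so these macros build the constant by
// loading from a stack array inside an immediately-invoked lambda. On other compilers the
// two forms below produce the same value; only the second is used outside MSVC builds.
//   uint8x16_t a = simdjson_make_uint8x16_t(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
//   uint8x16_t b = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};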


  template<typename T>
  struct simd8;

  //
  // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
  //
  template<typename T, typename Mask=simd8<bool>>
  struct base_u8 {
    uint8x16_t value;
    static const int SIZE = sizeof(value);

    // Conversion from/to SIMD register
    simdjson_inline base_u8(const uint8x16_t _value) : value(_value) {}
    simdjson_inline operator const uint8x16_t&() const { return this->value; }
    simdjson_inline operator uint8x16_t&() { return this->value; }

    // Bit operations
    simdjson_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
    simdjson_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
    simdjson_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
    simdjson_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
    simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
    simdjson_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
    simdjson_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
    simdjson_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }

    friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }

    template<int N=1>
    simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
      return vextq_u8(prev_chunk, *this, 16 - N);
    }
  };
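  // Illustrative note (added commentary, not part of the original header): prev<N>() uses
  // vextq_u8 to view [prev_chunk | *this] as 32 consecutive bytes and extract the 16 bytes
  // starting N lanes before *this. With N = 1, lane 0 of the result is prev_chunk[15] and
  // lane i is (*this)[i-1], which lets later stages look at the byte preceding each input
  // byte across 16-byte chunk boundaries (as in UTF-8 validation).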

  // SIMD byte mask type (returned by things like eq and gt)
  template<>
  struct simd8<bool>: base_u8<bool> {
    typedef uint16_t bitmask_t;
    typedef uint32_t bitmask2_t;

    static simdjson_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }

    simdjson_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
    // False constructor
    simdjson_inline simd8() : simd8(vdupq_n_u8(0)) {}
    // Splat constructor
    simdjson_inline simd8(bool _value) : simd8(splat(_value)) {}

    // We return uint32_t instead of uint16_t because that seems to be more efficient for most
    // purposes (cutting it down to uint16_t costs performance in some compilers).
    simdjson_inline uint32_t to_bitmask() const {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
      const uint8x16_t bit_mask = simdjson_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                                           0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
      const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                   0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
      auto minput = *this & bit_mask;
      uint8x16_t tmp = vpaddq_u8(minput, minput);
      tmp = vpaddq_u8(tmp, tmp);
      tmp = vpaddq_u8(tmp, tmp);
      return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
    }
    simdjson_inline bool any() const { return vmaxvq_u32(vreinterpretq_u32_u8(*this)) != 0; }
  };
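  // Worked example (added commentary, not part of the original header): to_bitmask()
  // emulates the x86 movemask. Suppose lanes 0, 3 and 5 of the mask are true (0xFF) and the
  // rest are 0x00. ANDing with bit_mask leaves {0x01, 0, 0, 0x08, 0, 0x20, 0, ...}; the three
  // vpaddq_u8 passes keep summing adjacent bytes until byte 0 holds the OR of bits 0-7 and
  // byte 1 holds bits 8-15, so the uint16_t read at lane 0 is 0b101001 = 0x29, i.e. bit i is
  // set exactly when lane i was true.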

  // Unsigned bytes
  template<>
  struct simd8<uint8_t>: base_u8<uint8_t> {
    static simdjson_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); }
    static simdjson_inline uint8x16_t zero() { return vdupq_n_u8(0); }
    static simdjson_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); }

    simdjson_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
    // Zero constructor
    simdjson_inline simd8() : simd8(zero()) {}
    // Array constructor
    simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
    // Splat constructor
    simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
    // Member-by-member initialization
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_inline simd8(
      uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) : simd8(simdjson_make_uint8x16_t(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    )) {}
#else
    simdjson_inline simd8(
      uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) : simd8(uint8x16_t{
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    }) {}
#endif

    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_inline static simd8<uint8_t> repeat_16(
      uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) {
      return simd8<uint8_t>(
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10,v11,v12,v13,v14,v15
      );
    }

    // Store to array
    simdjson_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }

    // Saturated math
    simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
    simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }

    // Addition/subtraction are the same for signed and unsigned
    simdjson_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
    simdjson_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
    simdjson_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
    simdjson_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }

    // Order-specific operations
    simdjson_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
    simdjson_inline uint8_t min_val() const { return vminvq_u8(*this); }
    simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
    simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
    simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
    simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
    simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
    simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
    // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
    simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
    // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
    simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }

    // Bit-specific operations
    simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
    simdjson_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
    simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
    template<int N>
    simdjson_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
    template<int N>
    simdjson_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }

    // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
    template<typename L>
    simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
      return lookup_table.apply_lookup_16_to(*this);
    }
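    // Usage sketch (added commentary, not part of the original header): lookup_16 maps each
    // byte, treated as an index in 0..15, through a 16-entry table via vqtbl1q_u8. A typical
    // classification pass builds two tables with repeat_16(...) and combines nibble lookups,
    // roughly:
    //   simd8<uint8_t> classified = (in & 0xF).lookup_16(table_lo) & in.shr<4>().lookup_16(table_hi);
    // Taking a nibble first keeps every index within the 0..15 range the comment above requires.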


    // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
    // Passing a 0 value for mask would be equivalent to writing out every byte to output.
    // Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
    // get written.
    // Design consideration: it seems like a function with the
    // signature simd8<L> compress(uint16_t mask) would be
    // sensible, but the AVX ISA makes this kind of approach difficult.
    template<typename L>
    simdjson_inline void compress(uint16_t mask, L * output) const {
      using internal::thintable_epi8;
      using internal::BitsSetTable256mul2;
      using internal::pshufb_combine_table;
      // this particular implementation was inspired by work done by @animetosho
      // we do it in two steps, first 8 bytes and then second 8 bytes
      uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
      uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
      // next line just loads the 64-bit values thintable_epi8[mask1] and
      // thintable_epi8[mask2] into a 128-bit register, using only
      // two instructions on most compilers.
      uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
      uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
      // we increment by 0x08 the second half of the mask
#if SIMDJSON_REGULAR_VISUAL_STUDIO
      uint8x16_t inc = simdjson_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
      uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
      shufmask = vaddq_u8(shufmask, inc);
      // this is the version "nearly pruned"
      uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
      // we still need to put the two halves together.
      // we compute the popcount of the first half:
      int pop1 = BitsSetTable256mul2[mask1];
      // then load the corresponding mask: it writes only the first pop1 bytes
      // from the first 8 bytes, and then fills in with the bytes from the
      // second 8 bytes + some filling at the end.
      uint8x16_t compactmask = vld1q_u8(reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
      uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
      vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
    }
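    // Worked example (added commentary, not part of the original header): with input bytes
    // "abcdefghijklmnop" and mask = 0x0005 (bits 0 and 2 set), compress() writes
    // "bdefghijklmnop" followed by two don't-care bytes; only the first
    // 16 - count_ones(mask) = 14 bytes are meaningful. A caller can remove flagged bytes,
    // such as whitespace positions when minifying, by advancing its output pointer by that
    // count after each chunk.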

    // Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a
    // bitset) to output1, then those corresponding to a 0 in the high half to output2.
    template<typename L>
    simdjson_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
      using internal::thintable_epi8;
      uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
      uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
      uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]);
      uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]);
      // we increment by 0x08 the second half of the mask
#if SIMDJSON_REGULAR_VISUAL_STUDIO
      uint8x8_t inc = simdjson_make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
      uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
      compactmask2 = vadd_u8(compactmask2, inc);
      // store each result (with the second store possibly overlapping the first)
      vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1));
      vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2));
    }
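    // Note (added commentary, not part of the original header): each vst1_u8 above writes a
    // full 8 bytes, so the second store may overwrite the tail of the first. Callers therefore
    // place output2 exactly 8 - count_ones(mask & 0xFF) bytes past output1, i.e. just after
    // the bytes kept from the low half, as simd8x64<T>::compress() does below.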

    template<typename L>
    simdjson_inline simd8<L> lookup_16(
        L replace0,  L replace1,  L replace2,  L replace3,
        L replace4,  L replace5,  L replace6,  L replace7,
        L replace8,  L replace9,  L replace10, L replace11,
        L replace12, L replace13, L replace14, L replace15) const {
      return lookup_16(simd8<L>::repeat_16(
        replace0,  replace1,  replace2,  replace3,
        replace4,  replace5,  replace6,  replace7,
        replace8,  replace9,  replace10, replace11,
        replace12, replace13, replace14, replace15
      ));
    }

    template<typename T>
    simdjson_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
      return vqtbl1q_u8(*this, simd8<uint8_t>(original));
    }
  };

  // Signed bytes
  template<>
  struct simd8<int8_t> {
    int8x16_t value;

    static simdjson_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
    static simdjson_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
    static simdjson_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }

    // Conversion from/to SIMD register
    simdjson_inline simd8(const int8x16_t _value) : value{_value} {}
    simdjson_inline operator const int8x16_t&() const { return this->value; }
    simdjson_inline operator int8x16_t&() { return this->value; }

    // Zero constructor
    simdjson_inline simd8() : simd8(zero()) {}
    // Splat constructor
    simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
    // Array constructor
    simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
    // Member-by-member initialization
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_inline simd8(
      int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) : simd8(simdjson_make_int8x16_t(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    )) {}
#else
    simdjson_inline simd8(
      int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) : simd8(int8x16_t{
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    }) {}
#endif
    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_inline static simd8<int8_t> repeat_16(
      int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) {
      return simd8<int8_t>(
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10,v11,v12,v13,v14,v15
      );
    }

    // Store to array
    simdjson_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }

    // Explicit conversion to/from unsigned
    //
    // Under Visual Studio/ARM64, uint8x16_t and int8x16_t are apparently the same type.
    // In theory, we could detect this case with std::is_same and std::enable_if, but that is
    // relatively ugly and hard to read in C++14.
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
#endif
    simdjson_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }

    // Math
    simdjson_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
    simdjson_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); }
    simdjson_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
    simdjson_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }

    // Order-sensitive comparisons
    simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); }
    simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(*this, other); }
    simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); }
    simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); }
    simdjson_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); }

    template<int N=1>
    simdjson_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
      return vextq_s8(prev_chunk, *this, 16 - N);
    }

    // Perform a lookup assuming no value is larger than 16
    template<typename L>
    simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
      return lookup_table.apply_lookup_16_to(*this);
    }
    template<typename L>
    simdjson_inline simd8<L> lookup_16(
        L replace0,  L replace1,  L replace2,  L replace3,
        L replace4,  L replace5,  L replace6,  L replace7,
        L replace8,  L replace9,  L replace10, L replace11,
        L replace12, L replace13, L replace14, L replace15) const {
      return lookup_16(simd8<L>::repeat_16(
        replace0,  replace1,  replace2,  replace3,
        replace4,  replace5,  replace6,  replace7,
        replace8,  replace9,  replace10, replace11,
        replace12, replace13, replace14, replace15
      ));
    }

    template<typename T>
    simdjson_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
      return vqtbl1q_s8(*this, simd8<uint8_t>(original));
    }
  };

  template<typename T>
  struct simd8x64 {
    static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
    static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
    const simd8<T> chunks[NUM_CHUNKS];

    simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
    simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
    simd8x64() = delete; // no default constructor allowed

    simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
    simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}

    simdjson_inline void store(T ptr[64]) const {
      this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
      this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
      this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
      this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
    }

    simdjson_inline simd8<T> reduce_or() const {
      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
    }

    simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
      uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
      // compute the prefix sum of the popcounts of each byte
      uint64_t offsets = popcounts * 0x0101010101010101;
      this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]);
      this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]);
      this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]);
      this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]);
      return offsets >> 56;
    }
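    // Worked example (added commentary, not part of the original header): vcnt_u8 over ~mask
    // gives, for each of the eight mask bytes, how many of that byte's input positions are
    // kept. Multiplying by 0x0101010101010101 turns those eight counts into running totals:
    // byte k of `offsets` is the sum of the popcounts of bytes 0..k, which is exactly where
    // the (k+1)-th group of 8 input bytes should start writing. The final (offsets >> 56) is
    // the total number of bytes written for the whole 64-byte block.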

    simdjson_inline uint64_t to_bitmask() const {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
      const uint8x16_t bit_mask = simdjson_make_uint8x16_t(
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
      );
#else
      const uint8x16_t bit_mask = {
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
      };
#endif
      // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
      uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
      uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
      sum0 = vpaddq_u8(sum0, sum1);
      sum0 = vpaddq_u8(sum0, sum0);
      return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
    }

    simdjson_inline uint64_t eq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return simd8x64<bool>(
        this->chunks[0] == mask,
        this->chunks[1] == mask,
        this->chunks[2] == mask,
        this->chunks[3] == mask
      ).to_bitmask();
    }

    simdjson_inline uint64_t lteq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return simd8x64<bool>(
        this->chunks[0] <= mask,
        this->chunks[1] <= mask,
        this->chunks[2] <= mask,
        this->chunks[3] <= mask
      ).to_bitmask();
    }
  }; // struct simd8x64<T>
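
  // Usage sketch (hypothetical helper added for illustration, not part of the original
  // header): roughly how a scanner consumes a 64-byte block, turning byte comparisons into
  // 64-bit masks for later bit manipulation. The names are invented for this example.
  simdjson_inline uint64_t example_quote_and_space_bits(const uint8_t block[64], uint64_t& whitespace) {
    simd8x64<uint8_t> in(block);
    whitespace = in.lteq(' ');   // crude check: bit i set when block[i] <= 0x20
    return in.eq('"');           // bit i set exactly when block[i] == '"'
  }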

} // namespace simd
} // unnamed namespace
} // namespace arm64
} // namespace simdjson

#endif // SIMDJSON_ARM64_SIMD_H