simdjson 4.0.7
Ridiculously Fast JSON
simd.h
#ifndef SIMDJSON_ARM64_SIMD_H
#define SIMDJSON_ARM64_SIMD_H

#ifndef SIMDJSON_CONDITIONAL_INCLUDE
#include "simdjson/arm64/base.h"
#include "simdjson/arm64/bitmanipulation.h"
#include "simdjson/internal/simdprune_tables.h"
#endif // SIMDJSON_CONDITIONAL_INCLUDE

namespace simdjson {
namespace arm64 {
namespace {
namespace simd {

#if SIMDJSON_REGULAR_VISUAL_STUDIO
namespace {
// Start of private section with Visual Studio workaround
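// Regular Visual Studio does not accept brace initialization of NEON vector
// types, so these macros build each vector by loading from a temporary array.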

#ifndef simdjson_make_uint8x16_t
#define simdjson_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                 x13, x14, x15, x16) \
  ([=]() { \
    uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
                         x9, x10, x11, x12, x13, x14, x15, x16}; \
    return vld1q_u8(array); \
  }())
#endif
#ifndef simdjson_make_int8x16_t
#define simdjson_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                x13, x14, x15, x16) \
  ([=]() { \
    int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
                        x9, x10, x11, x12, x13, x14, x15, x16}; \
    return vld1q_s8(array); \
  }())
#endif

#ifndef simdjson_make_uint8x8_t
#define simdjson_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() { \
    uint8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
    return vld1_u8(array); \
  }())
#endif
#ifndef simdjson_make_int8x8_t
#define simdjson_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() { \
    int8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
    return vld1_s8(array); \
  }())
#endif
#ifndef simdjson_make_uint16x8_t
#define simdjson_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() { \
    uint16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
    return vld1q_u16(array); \
  }())
#endif
#ifndef simdjson_make_int16x8_t
#define simdjson_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() { \
    int16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8}; \
    return vld1q_s16(array); \
  }())
#endif

// End of private section with Visual Studio workaround
} // namespace
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO

  template<typename T>
  struct simd8;

  //
  // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
  //
  template<typename T, typename Mask=simd8<bool>>
  struct base_u8 {
    uint8x16_t value;
    static const int SIZE = sizeof(value);

    // Conversion from/to SIMD register
    simdjson_inline base_u8(const uint8x16_t _value) : value(_value) {}
    simdjson_inline operator const uint8x16_t&() const { return this->value; }
    simdjson_inline operator uint8x16_t&() { return this->value; }

    // Bit operations
    simdjson_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
    simdjson_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
    simdjson_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
    simdjson_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
    simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
    simdjson_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
    simdjson_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
    simdjson_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }

    friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }

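    // Returns, at each byte position, the byte that appears N positions earlier in
    // the input stream: vextq concatenates prev_chunk with this chunk and extracts
    // 16 bytes, so the first N bytes come from the end of prev_chunk.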
    template<int N=1>
    simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
      return vextq_u8(prev_chunk, *this, 16 - N);
    }
  };

  // SIMD byte mask type (returned by things like eq and gt)
  template<>
  struct simd8<bool>: base_u8<bool> {
    typedef uint16_t bitmask_t;
    typedef uint32_t bitmask2_t;

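    // -(!!_value) is 0xFF when _value is true and 0x00 when it is false, so the
    // splat fills every lane with the canonical all-ones/all-zeroes mask byte.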
    static simdjson_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }

    simdjson_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
    // False constructor
    simdjson_inline simd8() : simd8(vdupq_n_u8(0)) {}
    // Splat constructor
    simdjson_inline simd8(bool _value) : simd8(splat(_value)) {}

    // We return uint32_t instead of uint16_t because that seems to be more efficient for most
    // purposes (cutting it down to uint16_t costs performance in some compilers).
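    // NEON has no direct movemask instruction. Each mask byte is either 0x00 or
    // 0xFF, so ANDing with per-lane bit weights (0x01..0x80, repeated for each
    // 8-byte half) leaves either zero or that lane's weight; three successive
    // pairwise byte additions then fold the eight weights of each half into one
    // byte, leaving the 16-bit movemask in the lowest 16-bit lane.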
    simdjson_inline uint32_t to_bitmask() const {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
      const uint8x16_t bit_mask = simdjson_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                                           0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
      const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                   0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
      auto minput = *this & bit_mask;
      uint8x16_t tmp = vpaddq_u8(minput, minput);
      tmp = vpaddq_u8(tmp, tmp);
      tmp = vpaddq_u8(tmp, tmp);
      return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
    }
    // Returns 4 bits from each byte, alternating between the high 4 bits and the
    // low 4 bits; the result is a 64-bit value.
    simdjson_inline uint64_t to_bitmask64() const {
      return vget_lane_u64(
          vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
    }
    simdjson_inline bool any() const { return vmaxvq_u32(vreinterpretq_u32_u8(*this)) != 0; }
  };

  // Unsigned bytes
  template<>
  struct simd8<uint8_t>: base_u8<uint8_t> {
    static simdjson_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); }
    static simdjson_inline uint8x16_t zero() { return vdupq_n_u8(0); }
    static simdjson_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); }

    simdjson_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
    // Zero constructor
    simdjson_inline simd8() : simd8(zero()) {}
    // Array constructor
    simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
    // Splat constructor
    simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
    // Member-by-member initialization
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_inline simd8(
      uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) : simd8(simdjson_make_uint8x16_t(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    )) {}
#else
    simdjson_inline simd8(
      uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) : simd8(uint8x16_t{
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    }) {}
#endif

    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_inline static simd8<uint8_t> repeat_16(
      uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) {
      return simd8<uint8_t>(
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10, v11, v12, v13, v14, v15
      );
    }

    // Store to array
    simdjson_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }

    // Saturated math
    simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
    simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }

    // Addition/subtraction are the same for signed and unsigned
    simdjson_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
    simdjson_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
    simdjson_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
    simdjson_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }

    // Order-specific operations
    simdjson_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
    simdjson_inline uint8_t min_val() const { return vminvq_u8(*this); }
    simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
    simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
    simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
    simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
    simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
    simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
    // Same as >, but only guarantees that false = 0 and true = nonzero (rather than true = all 1's). On ARM, it does return all 1's.
    simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
    // Same as <, but only guarantees that false = 0 and true = nonzero (rather than true = all 1's). On ARM, it does return all 1's.
    simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }

    // Bit-specific operations
    simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
    simdjson_inline bool any_bits_set_anywhere() const { return vmaxvq_u32(vreinterpretq_u32_u8(*this)) != 0; }
    simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
    template<int N>
    simdjson_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
    template<int N>
    simdjson_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }

    // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
    template<typename L>
    simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
      return lookup_table.apply_lookup_16_to(*this);
    }

    // Returns 4 bits from each byte, alternating between the high 4 bits and the
    // low 4 bits; the result is a 64-bit value.
    simdjson_inline uint64_t to_bitmask64() const {
      return vget_lane_u64(
          vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
    }
    // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
    // Passing a 0 value for mask would be equivalent to writing out every byte to output.
    // Only the first 16 - count_ones(mask) bytes of the result are significant, but 16 bytes
    // get written.
    // Design consideration: it seems like a function with the
    // signature simd8<L> compress(uint16_t mask) would be
    // sensible, but the AVX ISA makes this kind of approach difficult.
    template<typename L>
    simdjson_inline void compress(uint16_t mask, L * output) const {
      using internal::thintable_epi8;
      using internal::BitsSetTable256mul2;
      using internal::pshufb_combine_table;
      // this particular implementation was inspired by work done by @animetosho
      // we do it in two steps, first 8 bytes and then second 8 bytes
      uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
      uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
      // next line just loads the 64-bit values thintable_epi8[mask1] and
      // thintable_epi8[mask2] into a 128-bit register, using only
      // two instructions on most compilers.
      uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
      uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
      // we increment by 0x08 the second half of the mask
#if SIMDJSON_REGULAR_VISUAL_STUDIO
      uint8x16_t inc = simdjson_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
      uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
      shufmask = vaddq_u8(shufmask, inc);
      // this is the version "nearly pruned"
      uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
      // we still need to put the two halves together.
      // we compute the popcount of the first half:
      int pop1 = BitsSetTable256mul2[mask1];
      // then load the corresponding mask, what it does is to write
      // only the first pop1 bytes from the first 8 bytes, and then
      // it fills in with the bytes from the second 8 bytes + some filling
      // at the end.
      uint8x16_t compactmask = vld1q_u8(reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
      uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
      vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
    }

    // Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a
    // bitset) to output1, then those corresponding to a 0 in the high half to output2.
    template<typename L>
    simdjson_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
      using internal::thintable_epi8;
      uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
      uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
      uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]);
      uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]);
      // we increment by 0x08 the second half of the mask
#if SIMDJSON_REGULAR_VISUAL_STUDIO
      uint8x8_t inc = simdjson_make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
      uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
      compactmask2 = vadd_u8(compactmask2, inc);
      // store each result (with the second store possibly overlapping the first)
      vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1));
      vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2));
    }

    template<typename L>
    simdjson_inline simd8<L> lookup_16(
        L replace0, L replace1, L replace2, L replace3,
        L replace4, L replace5, L replace6, L replace7,
        L replace8, L replace9, L replace10, L replace11,
        L replace12, L replace13, L replace14, L replace15) const {
      return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3,
        replace4, replace5, replace6, replace7,
        replace8, replace9, replace10, replace11,
        replace12, replace13, replace14, replace15
      ));
    }

    template<typename T>
    simdjson_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
      return vqtbl1q_u8(*this, simd8<uint8_t>(original));
    }
  };

  // Signed bytes
  template<>
  struct simd8<int8_t> {
    int8x16_t value;

    static simdjson_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
    static simdjson_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
    static simdjson_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }

    // Conversion from/to SIMD register
    simdjson_inline simd8(const int8x16_t _value) : value{_value} {}
    simdjson_inline operator const int8x16_t&() const { return this->value; }
    simdjson_inline operator int8x16_t&() { return this->value; }

    // Zero constructor
    simdjson_inline simd8() : simd8(zero()) {}
    // Splat constructor
    simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
    // Array constructor
    simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
    // Member-by-member initialization
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_inline simd8(
      int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) : simd8(simdjson_make_int8x16_t(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    )) {}
#else
    simdjson_inline simd8(
      int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) : simd8(int8x16_t{
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    }) {}
#endif
    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_inline static simd8<int8_t> repeat_16(
      int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) {
      return simd8<int8_t>(
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10, v11, v12, v13, v14, v15
      );
    }

    // Store to array
    simdjson_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }

    // Explicit conversion to/from unsigned
    //
    // Under Visual Studio/ARM64, uint8x16_t and int8x16_t are apparently the same type.
    // In theory, we could check for this case with std::is_same and std::enable_if, but
    // the result would be relatively ugly and hard to read.
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
    simdjson_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
#endif
    simdjson_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }

    // Math
    simdjson_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
    simdjson_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); }
    simdjson_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
    simdjson_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }

    // Order-sensitive comparisons
    simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); }
    simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(*this, other); }
    simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); }
    simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); }
    simdjson_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); }

    template<int N=1>
    simdjson_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
      return vextq_s8(prev_chunk, *this, 16 - N);
    }

    // Perform a lookup assuming no value is larger than 16
    template<typename L>
    simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
      return lookup_table.apply_lookup_16_to(*this);
    }
    template<typename L>
    simdjson_inline simd8<L> lookup_16(
        L replace0, L replace1, L replace2, L replace3,
        L replace4, L replace5, L replace6, L replace7,
        L replace8, L replace9, L replace10, L replace11,
        L replace12, L replace13, L replace14, L replace15) const {
      return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3,
        replace4, replace5, replace6, replace7,
        replace8, replace9, replace10, replace11,
        replace12, replace13, replace14, replace15
      ));
    }

    template<typename T>
    simdjson_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
      return vqtbl1q_s8(*this, simd8<uint8_t>(original));
    }
  };

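  // Each 64-byte block of input is processed as four 16-byte SIMD registers.
  // simd8x64 bundles those four chunks and provides block-wide operations such as
  // 64-bit bitmask extraction, comparison against a splatted byte, and compression
  // of unmasked bytes.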
  template<typename T>
  struct simd8x64 {
    static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
    static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
    const simd8<T> chunks[NUM_CHUNKS];

    simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
    simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
    simd8x64() = delete; // no default constructor allowed

    simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
    simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}

    simdjson_inline void store(T ptr[64]) const {
      this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
      this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
      this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
      this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
    }

    simdjson_inline simd8<T> reduce_or() const {
      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
    }


    simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
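      // A set bit in 'mask' means "discard this byte". vcnt_u8 on ~mask gives, for
      // each 8-byte group, the number of bytes that will be kept; multiplying the
      // packed counts by 0x0101010101010101 turns them into running (prefix-sum)
      // offsets, so each half-chunk can be compressed directly to its final position
      // in 'output'. The returned value is the total number of bytes kept.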
      uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
      // compute the prefix sum of the popcounts of each byte
      uint64_t offsets = popcounts * 0x0101010101010101;
      this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]);
      this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]);
      this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]);
      this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]);
      return offsets >> 56;
    }

    simdjson_inline uint64_t to_bitmask() const {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
      const uint8x16_t bit_mask = simdjson_make_uint8x16_t(
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
      );
#else
      const uint8x16_t bit_mask = {
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
      };
#endif
      // Pairwise-add neighboring bytes, successively, so that each group of eight
      // input bytes collapses into one mask byte, yielding the 64-bit bitmask in the low lane.
      uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
      uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
      sum0 = vpaddq_u8(sum0, sum1);
      sum0 = vpaddq_u8(sum0, sum0);
      return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
    }

    simdjson_inline uint64_t eq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return simd8x64<bool>(
        this->chunks[0] == mask,
        this->chunks[1] == mask,
        this->chunks[2] == mask,
        this->chunks[3] == mask
      ).to_bitmask();
    }

    simdjson_inline uint64_t lteq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return simd8x64<bool>(
        this->chunks[0] <= mask,
        this->chunks[1] <= mask,
        this->chunks[2] <= mask,
        this->chunks[3] <= mask
      ).to_bitmask();
    }
  }; // struct simd8x64<T>
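
  // Illustrative sketch (not part of the original header): within an arm64 kernel
  // translation unit where these types are visible, a 64-byte block can be turned
  // into a 64-bit classification mask, one bit per input byte. The helper name
  // find_quotes is hypothetical and shown only to illustrate how the constructor,
  // eq() and to_bitmask() compose:
  //
  //   uint64_t find_quotes(const uint8_t block[64]) {
  //     simd8x64<uint8_t> in(block);
  //     return in.eq('"'); // bit i is set exactly when block[i] == '"'
  //   }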

} // namespace simd
} // unnamed namespace
} // namespace arm64
} // namespace simdjson

#endif // SIMDJSON_ARM64_SIMD_H