#ifndef SIMDJSON_ARM64_SIMD_H
#define SIMDJSON_ARM64_SIMD_H

#ifndef SIMDJSON_CONDITIONAL_INCLUDE
#include "simdjson/arm64/base.h"
#include "simdjson/arm64/bitmanipulation.h"
#include "simdjson/internal/simdprune_tables.h"
#endif // SIMDJSON_CONDITIONAL_INCLUDE

namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {
namespace simd {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
// Visual Studio does not support direct initialization of NEON vector types, so each
// simdjson_make_* macro builds the vector by loading it from a stack array inside a lambda.
#ifndef simdjson_make_uint8x16_t
#define simdjson_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                 x13, x14, x15, x16)                                \
  ([=]() {                                                                          \
    uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,                            \
                         x9, x10, x11, x12, x13, x14, x15, x16};                    \
    return vld1q_u8(array);                                                         \
  }())
#endif
#ifndef simdjson_make_int8x16_t
#define simdjson_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                x13, x14, x15, x16)                                \
  ([=]() {                                                                         \
    int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,                            \
                        x9, x10, x11, x12, x13, x14, x15, x16};                    \
    return vld1q_s8(array);                                                        \
  }())
#endif
#ifndef simdjson_make_uint8x8_t
#define simdjson_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                      \
    uint8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};        \
    return vld1_u8(array);                                      \
  }())
#endif
#ifndef simdjson_make_int8x8_t
#define simdjson_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                     \
    int8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};        \
    return vld1_s8(array);                                     \
  }())
#endif
#ifndef simdjson_make_uint16x8_t
#define simdjson_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                       \
    uint16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};        \
    return vld1q_u16(array);                                     \
  }())
#endif
#ifndef simdjson_make_int16x8_t
#define simdjson_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                      \
    int16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};        \
    return vld1q_s16(array);                                    \
  }())
#endif
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
// Forward declaration so that simd8<bool> can serve as the default mask type below.
template<typename T>
struct simd8;

// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
template<typename T, typename Mask=simd8<bool>>
struct base_u8 {
  uint8x16_t value;
  static const int SIZE = sizeof(value);

  // Conversion from/to SIMD register
  simdjson_inline base_u8(const uint8x16_t _value) : value(_value) {}
  simdjson_inline operator const uint8x16_t&() const { return this->value; }
  simdjson_inline operator uint8x16_t&() { return this->value; }
  simdjson_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
  simdjson_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
  simdjson_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
  simdjson_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
  simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
  simdjson_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
  simdjson_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
  simdjson_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }

  friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }
  template<int N=1>
  simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return vextq_u8(prev_chunk, *this, 16 - N);
  }
};
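// Note on prev(): vextq_u8 extracts 16 bytes from the concatenation (prev_chunk, *this)
// starting at byte 16 - N, so each lane receives the byte that appeared N positions
// earlier in the input stream. This lets scanners look back across 16-byte chunk
// boundaries without reloading memory.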
// SIMD byte mask type (returned by things like eq and gt)
template<>
struct simd8<bool>: base_u8<bool> {
  typedef uint16_t bitmask_t;
  typedef uint32_t bitmask2_t;

  static simdjson_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }
  simdjson_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
  // False constructor
  simdjson_inline simd8() : simd8(vdupq_n_u8(0)) {}
  // Splat constructor
  simdjson_inline simd8(bool _value) : simd8(splat(_value)) {}
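// to_bitmask() below reduces the 16 lane masks to one bit each: every lane is ANDed
// with a per-lane weight (0x01..0x80, repeated for each 8-byte half), then three
// rounds of pairwise addition (vpaddq_u8) fold the weights of each half into a
// single byte, leaving the 16-bit result in the low lanes.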
  // We return uint32_t instead of uint16_t because it tends to be more efficient
  // on most compilers (narrowing to uint16_t can cost an extra instruction).
  simdjson_inline uint32_t to_bitmask() const {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask = simdjson_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                                         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
    auto minput = *this & bit_mask;
    uint8x16_t tmp = vpaddq_u8(minput, minput);
    tmp = vpaddq_u8(tmp, tmp);
    tmp = vpaddq_u8(tmp, tmp);
    return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
  }
  simdjson_inline bool any() const { return vmaxvq_u32(vreinterpretq_u32_u8(*this)) != 0; }
};
// Unsigned bytes
template<>
struct simd8<uint8_t>: base_u8<uint8_t> {
  static simdjson_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); }
  static simdjson_inline uint8x16_t zero() { return vdupq_n_u8(0); }
  static simdjson_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); }

  simdjson_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
  // Zero constructor
  simdjson_inline simd8() : simd8(zero()) {}
  // Array constructor
  simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
  // Splat constructor
  simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Member-by-member initialization
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  simdjson_inline simd8(
    uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
    uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) : simd8(simdjson_make_uint8x16_t(
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  )) {}
#else
  simdjson_inline simd8(
    uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
    uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) : simd8(uint8x16_t{
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  }) {}
#endif

  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdjson_inline static simd8<uint8_t> repeat_16(
    uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
    uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) {
    return simd8<uint8_t>(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    );
  }
  // Store to array
  simdjson_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
  // Saturated math
  simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }

  // Addition/subtraction
  simdjson_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
  simdjson_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
  simdjson_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }
  // Order-specific operations
  simdjson_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
  simdjson_inline uint8_t min_val() const { return vminvq_u8(*this); }
  simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
  simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
  simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
  simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
  simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
  // Same as >, but only guarantees true is nonzero (> guarantees true = all bits set)
  simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
  // Same as <, but only guarantees true is nonzero (< guarantees true = all bits set)
  simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }

  // Bit-specific operations
  simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
  simdjson_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
  simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
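  // Note: any_bits_set() above relies on vtstq_u8, which computes (a & b) != 0 per
  // lane, so no separate comparison against zero is needed.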
  // Shift each byte right by N bits (N must be a compile-time constant)
  template<int N>
  simdjson_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
  // Shift each byte left by N bits
  template<int N>
  simdjson_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
  // Perform a table lookup: each byte of *this selects a byte of lookup_table
  // (vqtbl1q semantics: out-of-range indices yield zero).
  template<typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }
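  // Illustrative use (not part of this header; names are hypothetical): simdjson's
  // structural scanning classifies bytes by table lookups on their nibbles, along
  // the lines of
  //   simd8<uint8_t> lo  = input & 0xF;          // low nibble of each byte
  //   simd8<uint8_t> cls = lo.lookup_16(table);  // table[lo] for each lane
  // where 'table' is a 16-entry classification table.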
  // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a
  // bitset). Passing a 0 value for mask would be equivalent to writing out every byte
  // to output. Only the first 16 - count_ones(mask) bytes of the result are
  // significant, but 16 bytes get written.
  template<typename L>
  simdjson_inline void compress(uint16_t mask, L * output) const {
    using internal::thintable_epi8;
    using internal::BitsSetTable256mul2;
    using internal::pshufb_combine_table;
    // The mask is processed in two 8-bit halves.
    uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
    uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
    // Load the 64-bit shuffle entries thintable_epi8[mask1] and thintable_epi8[mask2]
    // into a single 128-bit register.
    uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
    uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
    // Increment the second half of the shuffle mask by 0x08 so it indexes the high 8 bytes.
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    uint8x16_t inc = simdjson_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
    uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
    shufmask = vaddq_u8(shufmask, inc);
    // This is the "nearly pruned" version: each half is compacted, but the two
    // halves still need to be spliced together.
    uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
    int pop1 = BitsSetTable256mul2[mask1]; // 2 * popcount(mask1)
    // Load the combine mask: it keeps the surviving bytes of the low half, then
    // pulls in the bytes of the high half right behind them.
    uint8x16_t compactmask = vld1q_u8(reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
    uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
    vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
  }
  // Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as
  // a bitset) to output1, and all bytes corresponding to a 0 in the high half to output2.
  template<typename L>
  simdjson_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
    using internal::thintable_epi8;
    uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
    uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
    uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]);
    uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]);
    // Increment the second shuffle mask by 0x08 so it indexes the high half of *this.
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    uint8x8_t inc = simdjson_make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
    uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
    compactmask2 = vadd_u8(compactmask2, inc);
    vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1));
    vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2));
  }
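  // Note: each vst1_u8 above writes a full 8 bytes. Callers place output2 at
  // output1 + (number of kept bytes in the low half), so the second store overlaps
  // the junk tail of the first and the kept bytes splice together contiguously.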
  template<typename L>
  simdjson_inline simd8<L> lookup_16(
      L replace0,  L replace1,  L replace2,  L replace3,
      L replace4,  L replace5,  L replace6,  L replace7,
      L replace8,  L replace9,  L replace10, L replace11,
      L replace12, L replace13, L replace14, L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
      replace0,  replace1,  replace2,  replace3,
      replace4,  replace5,  replace6,  replace7,
      replace8,  replace9,  replace10, replace11,
      replace12, replace13, replace14, replace15
    ));
  }
  template<typename T>
  simdjson_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
    return vqtbl1q_u8(*this, simd8<uint8_t>(original));
  }
};
// Signed bytes
template<>
struct simd8<int8_t> {
  int8x16_t value;

  static simdjson_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
  static simdjson_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
  static simdjson_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }

  // Conversion from/to SIMD register
  simdjson_inline simd8(const int8x16_t _value) : value{_value} {}
  simdjson_inline operator const int8x16_t&() const { return this->value; }
  simdjson_inline operator int8x16_t&() { return this->value; }

  // Zero constructor
  simdjson_inline simd8() : simd8(zero()) {}
  // Splat constructor
  simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
  // Member-by-member initialization
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  simdjson_inline simd8(
    int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
    int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) : simd8(simdjson_make_int8x16_t(
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  )) {}
#else
  simdjson_inline simd8(
    int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
    int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) : simd8(int8x16_t{
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  }) {}
#endif

  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdjson_inline static simd8<int8_t> repeat_16(
    int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
    int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) {
    return simd8<int8_t>(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    );
  }
  // Store to array
  simdjson_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }
  // Explicit conversion to/from unsigned
  //
  // Under Visual Studio/ARM64, uint8x16_t and int8x16_t are apparently the same type,
  // so this constructor would collide with the int8x16_t one above.
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
  simdjson_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
#endif
  simdjson_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
  // Math
  simdjson_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
  simdjson_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); }
  simdjson_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
  simdjson_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }
  // Order-sensitive comparisons
  simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); }
  simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(*this, other); }
  simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); }
  simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); }
  simdjson_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); }
  template<int N=1>
  simdjson_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
    return vextq_s8(prev_chunk, *this, 16 - N);
  }
  // Perform a table lookup: each byte of *this selects a byte of lookup_table.
  template<typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }
  template<typename L>
  simdjson_inline simd8<L> lookup_16(
      L replace0,  L replace1,  L replace2,  L replace3,
      L replace4,  L replace5,  L replace6,  L replace7,
      L replace8,  L replace9,  L replace10, L replace11,
      L replace12, L replace13, L replace14, L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
      replace0,  replace1,  replace2,  replace3,
      replace4,  replace5,  replace6,  replace7,
      replace8,  replace9,  replace10, replace11,
      replace12, replace13, replace14, replace15
    ));
  }
  template<typename T>
  simdjson_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
    return vqtbl1q_s8(*this, simd8<uint8_t>(original));
  }
};
// Each 64-byte block of input is processed as four 16-byte NEON registers.
template<typename T>
struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
  const simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
  simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
  simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}

  simdjson_inline void store(T ptr[64]) const {
    this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
    this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
    this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
    this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
  }
  simdjson_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
  }
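  // The 64-byte compress() below uses a prefix-sum trick: vcnt_u8 yields the popcount
  // of each byte of ~mask (the count of kept bytes per 8-byte slice), and multiplying
  // those eight counts by 0x0101010101010101 turns byte k of 'offsets' into the total
  // number of kept bytes in slices 0..k, i.e. the output position where the next
  // slice must start.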
  simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
    uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
    // Compute the prefix sum of the popcounts of each byte.
    uint64_t offsets = popcounts * 0x0101010101010101;
    this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]);
    this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]);
    this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]);
    this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]);
    return offsets >> 56; // total number of bytes written
  }
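  // Illustrative use (not part of this header; names are hypothetical): pruning the
  // bytes flagged in a 64-bit mask while preserving the order of everything else:
  //   simd8x64<uint8_t> in(buf);               // load 64 input bytes
  //   uint64_t kept = in.compress(mask, out);  // writes kept bytes, returns count
  // simdjson's minifier works this way, building 'mask' from whitespace classification.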
  simdjson_inline uint64_t to_bitmask() const {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask = simdjson_make_uint8x16_t(
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
    );
#else
    const uint8x16_t bit_mask = {
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
    };
#endif
    // Add adjacent elements together, successively, to stuff each 8-byte mask into one byte.
    uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
    uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
    return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
  }
  simdjson_inline uint64_t eq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(
      this->chunks[0] == mask,
      this->chunks[1] == mask,
      this->chunks[2] == mask,
      this->chunks[3] == mask
    ).to_bitmask();
  }

  simdjson_inline uint64_t lteq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(
      this->chunks[0] <= mask,
      this->chunks[1] <= mask,
      this->chunks[2] <= mask,
      this->chunks[3] <= mask
    ).to_bitmask();
  }
}; // struct simd8x64<T>

} // namespace simd
} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson

#endif // SIMDJSON_ARM64_SIMD_H