#ifndef SIMDJSON_ARM64_SIMD_H
#define SIMDJSON_ARM64_SIMD_H

#ifndef SIMDJSON_CONDITIONAL_INCLUDE
#include "simdjson/arm64/base.h"
#include "simdjson/arm64/bitmanipulation.h"
#include "simdjson/internal/simdprune_tables.h"
#endif // SIMDJSON_CONDITIONAL_INCLUDE

namespace simdjson {
namespace arm64 {
namespace {
namespace simd {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
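// Visual Studio does not accept brace initialization of NEON vector types such as
// uint8x16_t{...}, so the simdjson_make_* macros below emulate it: each one fills a
// temporary array inside an immediately invoked lambda and loads it back into a
// register (vld1q_u8, vld1q_s8, and so on).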
#ifndef simdjson_make_uint8x16_t
#define simdjson_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                 x13, x14, x15, x16) ([=]() {                       \
  uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,                              \
                       x9, x10, x11, x12, x13, x14, x15, x16};                      \
  return vld1q_u8(array);                                                           \
}())
#endif
#ifndef simdjson_make_int8x16_t
#define simdjson_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                x13, x14, x15, x16) ([=]() {                       \
  int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,                              \
                      x9, x10, x11, x12, x13, x14, x15, x16};                      \
  return vld1q_s8(array);                                                          \
}())
#endif
#ifndef simdjson_make_uint8x8_t
#define simdjson_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) ([=]() { \
  uint8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};                   \
  return vld1_u8(array);                                                 \
}())
#endif
#ifndef simdjson_make_int8x8_t
#define simdjson_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) ([=]() { \
  int8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};                   \
  return vld1_s8(array);                                                \
}())
#endif
#ifndef simdjson_make_uint16x8_t
#define simdjson_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) ([=]() { \
  uint16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};                   \
  return vld1q_u16(array);                                                \
}())
#endif
#ifndef simdjson_make_int16x8_t
#define simdjson_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) ([=]() { \
  int16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};                   \
  return vld1q_s16(array);                                               \
}())
#endif

#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
// Forward declaration so that simd8<bool> can serve as the default Mask type.
template<typename T> struct simd8;

// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
template<typename T, typename Mask=simd8<bool>>
struct base_u8 {
  uint8x16_t value;
  static const int SIZE = sizeof(value);
  // Conversion from/to SIMD register
  simdjson_inline base_u8(const uint8x16_t _value) : value(_value) {}
  simdjson_inline operator const uint8x16_t&() const { return this->value; }
  simdjson_inline operator uint8x16_t&() { return this->value; }
  // Bit operations
  simdjson_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
  simdjson_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
  simdjson_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
  simdjson_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
  simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
  simdjson_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
  simdjson_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
  simdjson_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
  friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }

  // Each output byte is the byte N positions earlier in the stream, with the last N bytes
  // of prev_chunk filling the front of the register.
  template<int N=1>
  simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return vextq_u8(prev_chunk, *this, 16 - N);
  }
};
// SIMD byte mask type (returned by things like eq and gt)
template<>
struct simd8<bool>: base_u8<bool> {
  typedef uint16_t bitmask_t;
  typedef uint32_t bitmask2_t;

  // Splat: -(!!_value) is 0xFF when _value is true and 0x00 when it is false.
  static simdjson_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }

  simdjson_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
  // False constructor
  simdjson_inline simd8() : simd8(vdupq_n_u8(0)) {}
  // Splat constructor
  simdjson_inline simd8(bool _value) : simd8(splat(_value)) {}
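  // to_bitmask() turns a byte mask (0xFF or 0x00 per lane) into a 16-bit integer with one
  // bit per lane. Each lane is ANDed with its bit weight (0x01..0x80, repeated for the upper
  // eight lanes), and three rounds of pairwise additions (vpaddq_u8) fold the sixteen
  // weighted bytes into two bytes that are read back as a single uint16_t. For example,
  // if only lanes 0 and 9 are true, the result is 0x0201.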
  simdjson_inline uint32_t to_bitmask() const {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask = simdjson_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                                         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
    auto minput = *this & bit_mask;
    uint8x16_t tmp = vpaddq_u8(minput, minput);
    tmp = vpaddq_u8(tmp, tmp);
    tmp = vpaddq_u8(tmp, tmp);
    return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
  }
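  // to_bitmask64() relies on the narrowing shift trick: vshrn_n_u16(..., 4) keeps bits 4..11
  // of each 16-bit pair, so every input byte contributes one nibble to the 64-bit result
  // (0xF where the byte was 0xFF, 0x0 where it was 0x00). It is typically cheaper than
  // to_bitmask() and is equivalent as long as the register holds comparison results.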
  simdjson_inline uint64_t to_bitmask64() const {
    return vget_lane_u64(
        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
  }
  simdjson_inline bool any() const { return vmaxvq_u32(vreinterpretq_u32_u8(*this)) != 0; }
};
// Unsigned bytes
template<>
struct simd8<uint8_t>: base_u8<uint8_t> {
  static simdjson_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); }
  static simdjson_inline uint8x16_t zero() { return vdupq_n_u8(0); }
  static simdjson_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); }

  simdjson_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
  // Zero constructor
  simdjson_inline simd8() : simd8(zero()) {}
  // Array constructor
  simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
  // Splat constructor
  simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Member-by-member initialization
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  simdjson_inline simd8(
    uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
    uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) : simd8(simdjson_make_uint8x16_t(
    v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  )) {}
#else
  simdjson_inline simd8(
    uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
    uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) : simd8(uint8x16_t{
    v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  }) {}
#endif
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdjson_inline static simd8<uint8_t> repeat_16(
    uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
    uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) {
    return simd8<uint8_t>(
      v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    );
  }
  // Store to array
  simdjson_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
  // Saturated math
  simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }

  // Addition/subtraction are the same for signed and unsigned
  simdjson_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
  simdjson_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
  simdjson_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }
  // Order-specific operations
  simdjson_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
  simdjson_inline uint8_t min_val() const { return vminvq_u8(*this); }
  simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
  simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
  simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
  simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
  simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
  simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
  // Same as > and <, but the comparison result is returned as unsigned bytes rather than a boolean mask.
  simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
  simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }
  // Bit-specific operations
  simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
  simdjson_inline bool any_bits_set_anywhere() const { return vmaxvq_u32(vreinterpretq_u32_u8(*this)) != 0; }
  simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
  template<int N>
  simdjson_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
  template<int N>
  simdjson_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
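  // lookup_16() treats the argument as a 16-entry table and replaces every byte of *this
  // (expected to be in 0..15) with the corresponding table entry; apply_lookup_16_to(),
  // defined further down, performs the actual vqtbl1q_u8 with the table as the first operand.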
  template<typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }
  // Same narrowing-shift trick as simd8<bool>::to_bitmask64(): one nibble per input byte.
  simdjson_inline uint64_t to_bitmask64() const {
    return vget_lane_u64(
        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
  }
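  // compress() writes to 'output' the bytes whose mask bit is 0, packed together at the front;
  // bytes whose mask bit is 1 are discarded, and a full 16 bytes are stored even when fewer
  // survive. The tables from simdjson/internal/simdprune_tables.h drive the work: as used here,
  // thintable_epi8[m] packs the kept bytes of an 8-byte half, BitsSetTable256mul2[m] is twice
  // the popcount of m, and pshufb_combine_table supplies the shuffle that closes the gap
  // between the two compacted halves.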
  template<typename L>
  simdjson_inline void compress(uint16_t mask, L * output) const {
    using internal::thintable_epi8;
    using internal::BitsSetTable256mul2;
    using internal::pshufb_combine_table;
    // work in two steps: the low 8 bytes, then the high 8 bytes
    uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
    uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
    // load the 64-bit values thintable_epi8[mask1] and thintable_epi8[mask2] into one 128-bit register
    uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
    uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
    // add 0x08 to the high half of the shuffle mask so it indexes the high 8 bytes
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    uint8x16_t inc = simdjson_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
    uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
    shufmask = vaddq_u8(shufmask, inc);
    // "nearly pruned": each half is compacted, but the two halves are still 8 bytes apart
    uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
    // the popcount of the low half selects the combine row that shifts the high half left
    int pop1 = BitsSetTable256mul2[mask1];
    uint8x16_t compactmask = vld1q_u8(reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
    uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
    vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
  }
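  // compress_halves() is the cheaper two-destination variant used by simd8x64<T>::compress():
  // survivors of the low 8 bytes go to output1 and survivors of the high 8 bytes to output2,
  // each through a single vqtbl1_u8 and an 8-byte store, with no need to stitch the halves
  // back together.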
  template<typename L>
  simdjson_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
    using internal::thintable_epi8;
    uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
    uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
    uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]);
    uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]);
    // add 0x08 to the second shuffle mask so it indexes the high 8 bytes
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    uint8x8_t inc = simdjson_make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
#else
    uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
#endif
    compactmask2 = vadd_u8(compactmask2, inc);
    vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1));
    vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2));
  }
  template<typename L>
  simdjson_inline simd8<L> lookup_16(
      L replace0,  L replace1,  L replace2,  L replace3,
      L replace4,  L replace5,  L replace6,  L replace7,
      L replace8,  L replace9,  L replace10, L replace11,
      L replace12, L replace13, L replace14, L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
      replace0,  replace1,  replace2,  replace3,
      replace4,  replace5,  replace6,  replace7,
      replace8,  replace9,  replace10, replace11,
      replace12, replace13, replace14, replace15
    ));
  }

  template<typename T>
  simdjson_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
    return vqtbl1q_u8(*this, simd8<uint8_t>(original));
  }
};
// Signed bytes
template<>
struct simd8<int8_t> {
  int8x16_t value;

  static simdjson_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
  static simdjson_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
  static simdjson_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }

  // Conversion from/to SIMD register
  simdjson_inline simd8(const int8x16_t _value) : value{_value} {}
  simdjson_inline operator const int8x16_t&() const { return this->value; }
  simdjson_inline operator int8x16_t&() { return this->value; }

  // Zero constructor
  simdjson_inline simd8() : simd8(zero()) {}
  // Splat constructor
  simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
  // Member-by-member initialization
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  simdjson_inline simd8(
    int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
    int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) : simd8(simdjson_make_int8x16_t(
    v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  )) {}
#else
  simdjson_inline simd8(
    int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
    int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) : simd8(int8x16_t{
    v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
    v8, v9, v10, v11, v12, v13, v14, v15
  }) {}
#endif
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdjson_inline static simd8<int8_t> repeat_16(
    int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
    int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) {
    return simd8<int8_t>(
      v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    );
  }
  // Store to array
  simdjson_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }
  // Explicit conversion to/from unsigned
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
  // Under Visual Studio/ARM64, uint8x16_t and int8x16_t are apparently the same type,
  // so this constructor would be a redefinition there.
  simdjson_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
#endif
  simdjson_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
  // Math
  simdjson_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
  simdjson_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); }
  simdjson_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
  simdjson_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }
  // Order-sensitive comparisons
  simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); }
  simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(*this, other); }
  simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); }
  simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); }
  simdjson_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); }
  // Same shifting trick as base_u8::prev(): each output byte is the byte N positions earlier.
  template<int N=1>
  simdjson_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
    return vextq_s8(prev_chunk, *this, 16 - N);
  }
  // Perform a lookup assuming every byte value is between 0 and 15 (see simd8<uint8_t>::lookup_16)
  template<typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }
  template<typename L>
  simdjson_inline simd8<L> lookup_16(
      L replace0,  L replace1,  L replace2,  L replace3,
      L replace4,  L replace5,  L replace6,  L replace7,
      L replace8,  L replace9,  L replace10, L replace11,
      L replace12, L replace13, L replace14, L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
      replace0,  replace1,  replace2,  replace3,
      replace4,  replace5,  replace6,  replace7,
      replace8,  replace9,  replace10, replace11,
      replace12, replace13, replace14, replace15
    ));
  }

  template<typename T>
  simdjson_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
    return vqtbl1q_s8(*this, simd8<uint8_t>(original));
  }
};
// A 64-byte block viewed as four 16-byte SIMD registers.
template<typename T>
struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
  const simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
  simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed

  simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
  simdjson_inline void store(T ptr[64]) const {
    this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
    this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
    this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
    this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
  }
  simdjson_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
  }
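  // compress() applies compress_halves() to all four chunks, writing survivors contiguously into
  // 'output' and returning the number of bytes written. vcnt_u8 over ~mask gives the number of
  // kept bytes in each 8-byte group; multiplying by 0x0101010101010101 turns those counts into
  // running prefix sums, so byte k of 'offsets' holds the total number of survivors in groups
  // 0..k. For example, per-group counts of 3, 5, 2, ... yield offsets bytes 3, 8, 10, ..., and
  // each group's starting position in 'output' is extracted with a shift and mask; the top byte
  // (offsets >> 56) is the grand total.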
  simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
    uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
    // compute the prefix sums of the per-byte popcounts
    uint64_t offsets = popcounts * 0x0101010101010101;
    this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]);
    this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]);
    this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]);
    this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]);
    return offsets >> 56;
  }
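  // to_bitmask() builds a 64-bit mask with one bit per byte of the 64-byte block: each chunk is
  // ANDed with the 0x01..0x80 weights and successive pairwise additions (vpaddq_u8) fold the four
  // 16-byte registers into a single 8-byte value, with chunk 0 ending up in the low 16 bits and
  // chunk 3 in the high 16 bits.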
  simdjson_inline uint64_t to_bitmask() const {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask = simdjson_make_uint8x16_t(
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
    );
#else
    const uint8x16_t bit_mask = {
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
    };
#endif
    uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
    uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
    return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
  }
  simdjson_inline uint64_t eq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(
      this->chunks[0] == mask,
      this->chunks[1] == mask,
      this->chunks[2] == mask,
      this->chunks[3] == mask
    ).to_bitmask();
  }
  simdjson_inline uint64_t lteq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(
      this->chunks[0] <= mask,
      this->chunks[1] <= mask,
      this->chunks[2] <= mask,
      this->chunks[3] <= mask
    ).to_bitmask();
  }
}; // struct simd8x64<T>

} // namespace simd
} // unnamed namespace
} // namespace arm64
} // namespace simdjson

#endif // SIMDJSON_ARM64_SIMD_H