1 #ifndef SIMDJSON_ICELAKE_SIMD_H
2 #define SIMDJSON_ICELAKE_SIMD_H
4 #ifndef SIMDJSON_CONDITIONAL_INCLUDE
5 #include "simdjson/icelake/base.h"
6 #include "simdjson/icelake/intrinsics.h"
7 #include "simdjson/icelake/bitmanipulation.h"
8 #include "simdjson/internal/simdprune_tables.h"
11 #if defined(__GNUC__) && !defined(__clang__)
13 #define SIMDJSON_GCC8 1
// Local definition of _mm512_set_epi8 expressed through _mm512_set_epi64.
// Every run of eight consecutive byte arguments is packed into one 64-bit
// value, with the earlier argument placed in the more significant byte
// (a0 is shifted by 56, a7 is not shifted). The eight packed values are
// handed to _mm512_set_epi64 in argument order (a0..a7 first, a56..a63 last).
// NOTE(review): presumably a shim for GCC builds that lack the intrinsic
// (see the SIMDJSON_GCC8 macro defined just before) - confirm.
21 inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
22 return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
23 uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
24 uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
25 uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
26 uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
27 uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
28 uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
29 uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56));
41 template<
typename Child>
46 simdjson_inline base() : value{__m512i()} {}
49 simdjson_inline base(
const __m512i _value) : value(_value) {}
52 simdjson_inline
operator const __m512i&()
const {
return this->value; }
53 simdjson_inline
operator __m512i&() {
return this->value; }
56 simdjson_inline Child operator|(
const Child other)
const {
return _mm512_or_si512(*
this, other); }
57 simdjson_inline Child operator&(
const Child other)
const {
return _mm512_and_si512(*
this, other); }
58 simdjson_inline Child operator^(
const Child other)
const {
return _mm512_xor_si512(*
this, other); }
59 simdjson_inline Child bit_andnot(
const Child other)
const {
return _mm512_andnot_si512(other, *
this); }
60 simdjson_inline Child& operator|=(
const Child other) {
auto this_cast =
static_cast<Child*
>(
this); *this_cast = *this_cast | other;
return *this_cast; }
61 simdjson_inline Child& operator&=(
const Child other) {
auto this_cast =
static_cast<Child*
>(
this); *this_cast = *this_cast & other;
return *this_cast; }
62 simdjson_inline Child& operator^=(
const Child other) {
auto this_cast =
static_cast<Child*
>(
this); *this_cast = *this_cast ^ other;
return *this_cast; }
69 template<
typename T,
typename Mask=simd8<
bool>>
70 struct base8: base<simd8<T>> {
// Integer aliases for the two bitmask widths.
using bitmask_t = uint32_t;
using bitmask2_t = uint64_t;
74 simdjson_inline base8() : base<simd8<T>>() {}
75 simdjson_inline base8(
const __m512i _value) : base<simd8<T>>(_value) {}
// Byte-wise equality of two vectors. Returns the mask produced by
// _mm512_cmpeq_epi8_mask as a uint64_t: one bit per byte lane, set where the
// corresponding bytes of lhs and rhs are equal.
77 friend simdjson_really_inline uint64_t
operator==(
const simd8<T> lhs,
const simd8<T> rhs) {
78 return _mm512_cmpeq_epi8_mask(lhs, rhs);
81 static const int SIZE =
sizeof(base<T>::value);
// Combines this chunk with `prev_chunk`.
// The _mm512_permutex2var_epi64 call selects 64-bit elements 6 and 7 of
// prev_chunk followed by elements 0..5 of *this (indices 8..13 address the
// second source). _mm512_alignr_epi8 then concatenates, per 128-bit lane,
// *this with that permuted vector and shifts right by `shift` bytes.
// NOTE(review): presumably the net effect is "each byte taken from N
// positions earlier in the prev_chunk|*this byte stream" - confirm.
// NOTE(review): N is not declared in the visible lines; presumably a
// template parameter declared immediately above - confirm.
84 simdjson_inline simd8<T> prev(
const simd8<T> prev_chunk)
const {
// Named constant so the shift is seen as a compile-time value by alignr.
86 constexpr
int shift = 16 - N;
87 return _mm512_alignr_epi8(*
this, _mm512_permutex2var_epi64(prev_chunk, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), *
this), shift);
93 struct simd8<bool>: base8<bool> {
94 static simdjson_inline simd8<bool> splat(
bool _value) {
return _mm512_set1_epi8(uint8_t(-(!!_value))); }
96 simdjson_inline simd8() : base8() {}
97 simdjson_inline simd8(
const __m512i _value) : base8<bool>(_value) {}
99 simdjson_inline simd8(
bool _value) : base8<bool>(splat(_value)) {}
100 simdjson_inline
bool any()
const {
return !!_mm512_test_epi8_mask (*
this, *
this); }
101 simdjson_inline simd8<bool> operator~()
const {
return *
this ^
true; }
105 struct base8_numeric: base8<T> {
106 static simdjson_inline simd8<T> splat(T _value) {
return _mm512_set1_epi8(_value); }
107 static simdjson_inline simd8<T> zero() {
return _mm512_setzero_si512(); }
// Loads a full register from `values` using the unaligned-load intrinsic,
// so the pointer does not need 64-byte alignment.
108 static simdjson_inline simd8<T> load(
const T values[64]) {
109 return _mm512_loadu_si512(
reinterpret_cast<const __m512i *
>(values));
// Takes 16 values v0..v15. The argument list that follows repeats v0..v15
// four times in the same order, i.e. 64 values in total.
// NOTE(review): the lines that open the body and receive this argument list
// are not visible here - confirm against the full file.
112 static simdjson_inline simd8<T> repeat_16(
113 T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
114 T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
117 v0, v1, v2, v3, v4, v5, v6, v7,
118 v8, v9, v10,v11,v12,v13,v14,v15,
119 v0, v1, v2, v3, v4, v5, v6, v7,
120 v8, v9, v10,v11,v12,v13,v14,v15,
121 v0, v1, v2, v3, v4, v5, v6, v7,
122 v8, v9, v10,v11,v12,v13,v14,v15,
123 v0, v1, v2, v3, v4, v5, v6, v7,
124 v8, v9, v10,v11,v12,v13,v14,v15
128 simdjson_inline base8_numeric() : base8<T>() {}
129 simdjson_inline base8_numeric(
const __m512i _value) : base8<T>(_value) {}
132 simdjson_inline
void store(T dst[64])
const {
return _mm512_storeu_si512(
reinterpret_cast<__m512i *
>(dst), *
this); }
135 simdjson_inline simd8<T> operator+(
const simd8<T> other)
const {
return _mm512_add_epi8(*
this, other); }
136 simdjson_inline simd8<T> operator-(
const simd8<T> other)
const {
return _mm512_sub_epi8(*
this, other); }
137 simdjson_inline simd8<T>& operator+=(
const simd8<T> other) { *
this = *
this + other;
return *
static_cast<simd8<T>*
>(
this); }
138 simdjson_inline simd8<T>& operator-=(
const simd8<T> other) { *
this = *
this - other;
return *
static_cast<simd8<T>*
>(
this); }
141 simdjson_inline simd8<T> operator~()
const {
return *
this ^ 0xFFu; }
// Table lookup: each byte of *this is used as a shuffle index into
// `lookup_table` via _mm512_shuffle_epi8, which operates independently
// within each 128-bit lane of the table.
// NOTE(review): L is not declared in the visible lines; presumably a
// template parameter declared immediately above - confirm.
145 simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table)
const {
146 return _mm512_shuffle_epi8(lookup_table, *
this);
// Compress-stores bytes of this vector to `output`. The mask is inverted
// before being handed to _mm512_mask_compressstoreu_epi8, so the bytes that
// get written (contiguously) are those whose bit in `mask` is CLEAR.
// NOTE(review): L is not declared in the visible lines; presumably a
// template parameter declared immediately above - confirm.
157 simdjson_inline
void compress(uint64_t mask, L * output)
const {
158 _mm512_mask_compressstoreu_epi8 (output,~mask,*
this);
// Convenience overload of lookup_16 taking the 16 table entries as separate
// arguments: they are passed, in order, to simd8<L>::repeat_16 and the
// resulting vector is used as the lookup table.
// NOTE(review): L is not declared in the visible lines, and the closing of
// this call is not visible either - confirm against the full file.
162 simdjson_inline simd8<L> lookup_16(
163 L replace0, L replace1, L replace2, L replace3,
164 L replace4, L replace5, L replace6, L replace7,
165 L replace8, L replace9, L replace10, L replace11,
166 L replace12, L replace13, L replace14, L replace15)
const {
167 return lookup_16(simd8<L>::repeat_16(
168 replace0, replace1, replace2, replace3,
169 replace4, replace5, replace6, replace7,
170 replace8, replace9, replace10, replace11,
171 replace12, replace13, replace14, replace15
178 struct simd8<int8_t> : base8_numeric<int8_t> {
179 simdjson_inline simd8() : base8_numeric<int8_t>() {}
180 simdjson_inline simd8(
const __m512i _value) : base8_numeric<int8_t>(_value) {}
182 simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
184 simdjson_inline simd8(
const int8_t values[64]) : simd8(load(values)) {}
// Member-wise constructor taking all 64 signed bytes v0..v63.
// The values are forwarded to _mm512_set_epi8 in reverse order (v63 first,
// v0 last).
// NOTE(review): presumably this puts v0 in the lowest byte, since
// _mm512_set_epi8 takes its most significant byte first - confirm.
186 simdjson_inline simd8(
187 int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
188 int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
189 int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
190 int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31,
191 int8_t v32, int8_t v33, int8_t v34, int8_t v35, int8_t v36, int8_t v37, int8_t v38, int8_t v39,
192 int8_t v40, int8_t v41, int8_t v42, int8_t v43, int8_t v44, int8_t v45, int8_t v46, int8_t v47,
193 int8_t v48, int8_t v49, int8_t v50, int8_t v51, int8_t v52, int8_t v53, int8_t v54, int8_t v55,
194 int8_t v56, int8_t v57, int8_t v58, int8_t v59, int8_t v60, int8_t v61, int8_t v62, int8_t v63
195 ) : simd8(_mm512_set_epi8(
196 v63, v62, v61, v60, v59, v58, v57, v56,
197 v55, v54, v53, v52, v51, v50, v49, v48,
198 v47, v46, v45, v44, v43, v42, v41, v40,
199 v39, v38, v37, v36, v35, v34, v33, v32,
200 v31, v30, v29, v28, v27, v26, v25, v24,
201 v23, v22, v21, v20, v19, v18, v17, v16,
202 v15, v14, v13, v12, v11, v10, v9, v8,
203 v7, v6, v5, v4, v3, v2, v1, v0
// Builds a vector from 16 signed bytes by passing v0..v15 four times, in the
// same order, to the 64-value simd8<int8_t> constructor.
// NOTE(review): the closing of the parameter list and of the call are not
// visible here - confirm against the full file.
207 simdjson_inline
static simd8<int8_t> repeat_16(
208 int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
209 int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
211 return simd8<int8_t>(
212 v0, v1, v2, v3, v4, v5, v6, v7,
213 v8, v9, v10,v11,v12,v13,v14,v15,
214 v0, v1, v2, v3, v4, v5, v6, v7,
215 v8, v9, v10,v11,v12,v13,v14,v15,
216 v0, v1, v2, v3, v4, v5, v6, v7,
217 v8, v9, v10,v11,v12,v13,v14,v15,
218 v0, v1, v2, v3, v4, v5, v6, v7,
219 v8, v9, v10,v11,v12,v13,v14,v15
224 simdjson_inline simd8<int8_t> max_val(
const simd8<int8_t> other)
const {
return _mm512_max_epi8(*
this, other); }
225 simdjson_inline simd8<int8_t> min_val(
const simd8<int8_t> other)
const {
return _mm512_min_epi8(*
this, other); }
227 simdjson_inline simd8<bool> operator>(
const simd8<int8_t> other)
const {
return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(*
this, other),_mm512_set1_epi8(uint8_t(0x80))); }
228 simdjson_inline simd8<bool> operator<(
const simd8<int8_t> other)
const {
return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(other, *
this),_mm512_set1_epi8(uint8_t(0x80))); }
233 struct simd8<uint8_t>: base8_numeric<uint8_t> {
234 simdjson_inline simd8() : base8_numeric<uint8_t>() {}
235 simdjson_inline simd8(
const __m512i _value) : base8_numeric<uint8_t>(_value) {}
237 simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
239 simdjson_inline simd8(
const uint8_t values[64]) : simd8(load(values)) {}
// Member-wise constructor taking all 64 unsigned bytes v0..v63.
// The values are forwarded to _mm512_set_epi8 in reverse order (v63 first,
// v0 last).
// NOTE(review): presumably this puts v0 in the lowest byte, since
// _mm512_set_epi8 takes its most significant byte first - confirm.
241 simdjson_inline simd8(
242 uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
243 uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
244 uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
245 uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31,
246 uint8_t v32, uint8_t v33, uint8_t v34, uint8_t v35, uint8_t v36, uint8_t v37, uint8_t v38, uint8_t v39,
247 uint8_t v40, uint8_t v41, uint8_t v42, uint8_t v43, uint8_t v44, uint8_t v45, uint8_t v46, uint8_t v47,
248 uint8_t v48, uint8_t v49, uint8_t v50, uint8_t v51, uint8_t v52, uint8_t v53, uint8_t v54, uint8_t v55,
249 uint8_t v56, uint8_t v57, uint8_t v58, uint8_t v59, uint8_t v60, uint8_t v61, uint8_t v62, uint8_t v63
250 ) : simd8(_mm512_set_epi8(
251 v63, v62, v61, v60, v59, v58, v57, v56,
252 v55, v54, v53, v52, v51, v50, v49, v48,
253 v47, v46, v45, v44, v43, v42, v41, v40,
254 v39, v38, v37, v36, v35, v34, v33, v32,
255 v31, v30, v29, v28, v27, v26, v25, v24,
256 v23, v22, v21, v20, v19, v18, v17, v16,
257 v15, v14, v13, v12, v11, v10, v9, v8,
258 v7, v6, v5, v4, v3, v2, v1, v0
// Builds a vector from 16 unsigned bytes by passing v0..v15 four times, in
// the same order, to the 64-value simd8<uint8_t> constructor.
// NOTE(review): the closing of the parameter list and of the call are not
// visible here - confirm against the full file.
262 simdjson_inline
static simd8<uint8_t> repeat_16(
263 uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
264 uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
266 return simd8<uint8_t>(
267 v0, v1, v2, v3, v4, v5, v6, v7,
268 v8, v9, v10,v11,v12,v13,v14,v15,
269 v0, v1, v2, v3, v4, v5, v6, v7,
270 v8, v9, v10,v11,v12,v13,v14,v15,
271 v0, v1, v2, v3, v4, v5, v6, v7,
272 v8, v9, v10,v11,v12,v13,v14,v15,
273 v0, v1, v2, v3, v4, v5, v6, v7,
274 v8, v9, v10,v11,v12,v13,v14,v15
279 simdjson_inline simd8<uint8_t> saturating_add(
const simd8<uint8_t> other)
const {
return _mm512_adds_epu8(*
this, other); }
280 simdjson_inline simd8<uint8_t> saturating_sub(
const simd8<uint8_t> other)
const {
return _mm512_subs_epu8(*
this, other); }
283 simdjson_inline simd8<uint8_t> max_val(
const simd8<uint8_t> other)
const {
return _mm512_max_epu8(*
this, other); }
284 simdjson_inline simd8<uint8_t> min_val(
const simd8<uint8_t> other)
const {
return _mm512_min_epu8(other, *
this); }
286 simdjson_inline simd8<uint8_t> gt_bits(
const simd8<uint8_t> other)
const {
return this->saturating_sub(other); }
288 simdjson_inline simd8<uint8_t> lt_bits(
const simd8<uint8_t> other)
const {
return other.saturating_sub(*
this); }
289 simdjson_inline uint64_t operator<=(
const simd8<uint8_t> other)
const {
return other.max_val(*
this) == other; }
290 simdjson_inline uint64_t operator>=(
const simd8<uint8_t> other)
const {
return other.min_val(*
this) == other; }
291 simdjson_inline simd8<bool> operator>(
const simd8<uint8_t> other)
const {
return this->gt_bits(other).any_bits_set(); }
292 simdjson_inline simd8<bool> operator<(
const simd8<uint8_t> other)
const {
return this->lt_bits(other).any_bits_set(); }
295 simdjson_inline simd8<bool> bits_not_set()
const {
return _mm512_mask_blend_epi8(*
this == uint8_t(0), _mm512_set1_epi8(0), _mm512_set1_epi8(-1)); }
296 simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits)
const {
return (*
this & bits).bits_not_set(); }
297 simdjson_inline simd8<bool> any_bits_set()
const {
return ~this->bits_not_set(); }
298 simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits)
const {
return ~this->bits_not_set(bits); }
300 simdjson_inline
bool is_ascii()
const {
return _mm512_movepi8_mask(*
this) == 0; }
// True when the whole vector is zero: the vector is tested against itself,
// so the resulting lane mask is empty only if no byte has any bit set.
301 simdjson_inline
bool bits_not_set_anywhere()
const {
302 return !_mm512_test_epi8_mask(*
this, *
this);
304 simdjson_inline
bool any_bits_set_anywhere()
const {
return !bits_not_set_anywhere(); }
305 simdjson_inline
bool bits_not_set_anywhere(simd8<uint8_t> bits)
const {
return !_mm512_test_epi8_mask(*
this, bits); }
306 simdjson_inline
bool any_bits_set_anywhere(simd8<uint8_t> bits)
const {
return !bits_not_set_anywhere(bits); }
308 simdjson_inline simd8<uint8_t> shr()
const {
return simd8<uint8_t>(_mm512_srli_epi16(*
this, N)) & uint8_t(0xFFu >> N); }
310 simdjson_inline simd8<uint8_t> shl()
const {
return simd8<uint8_t>(_mm512_slli_epi16(*
this, N)) & uint8_t(0xFFu << N); }
314 simdjson_inline uint64_t get_bit()
const {
return _mm512_movepi8_mask(_mm512_slli_epi16(*
this, 7-N)); }
319 static constexpr
int NUM_CHUNKS = 64 /
sizeof(simd8<T>);
320 static_assert(NUM_CHUNKS == 1,
"Icelake kernel should use one register per 64-byte block.");
321 const simd8<T> chunks[NUM_CHUNKS];
323 simd8x64(
const simd8x64<T>& o) =
delete;
324 simd8x64<T>& operator=(
const simd8<T>& other) =
delete;
327 simdjson_inline simd8x64(
const simd8<T> chunk0,
const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
328 simdjson_inline simd8x64(
const simd8<T> chunk0) : chunks{chunk0} {}
329 simdjson_inline simd8x64(
const T ptr[64]) : chunks{simd8<T>::load(ptr)} {}
// Forwards to chunks[0].compress(mask, output) and returns
// 64 - count_ones(mask).
// NOTE(review): presumably the return value is the number of bytes written
// (count_ones is defined elsewhere) - confirm.
331 simdjson_inline uint64_t compress(uint64_t mask, T * output)
const {
332 this->chunks[0].compress(mask, output);
333 return 64 - count_ones(mask);
// Stores the block to `ptr` by storing chunks[0] at offset 0.
336 simdjson_inline
void store(T ptr[64])
const {
337 this->chunks[0].store(ptr+
sizeof(simd8<T>)*0);
// OR-reduction over the chunks; with a single chunk this is chunks[0] itself.
340 simdjson_inline simd8<T> reduce_or()
const {
341 return this->chunks[0];
// Broadcasts `m` to every lane and ORs it with chunks[0].
// NOTE(review): the lines that wrap the OR expression into the returned
// simd8x64<T> are not visible here - confirm against the full file.
344 simdjson_inline simd8x64<T> bit_or(
const T m)
const {
345 const simd8<T> mask = simd8<T>::splat(m);
347 this->chunks[0] | mask
// Compares every element of the block with `m` (broadcast to all lanes) and
// returns the result of chunks[0] == mask as a 64-bit value.
351 simdjson_inline uint64_t eq(
const T m)
const {
352 const simd8<T> mask = simd8<T>::splat(m);
353 return this->chunks[0] == mask;
// Compares this block with another block element by element and returns the
// result of chunks[0] == other.chunks[0] as a 64-bit value.
// The parameter type is fixed to simd8x64<uint8_t> regardless of T.
356 simdjson_inline uint64_t eq(
const simd8x64<uint8_t> &other)
const {
357 return this->chunks[0] == other.chunks[0];
// Compares every element of the block with `m` (broadcast to all lanes) and
// returns the result of chunks[0] <= mask as a 64-bit value.
360 simdjson_inline uint64_t lteq(
const T m)
const {
361 const simd8<T> mask = simd8<T>::splat(m);
362 return this->chunks[0] <= mask;
simdjson_unused simdjson_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept
Comparisons between raw_json_string and std::string_view instances are potentially unsafe: the user i...
The top level simdjson namespace, containing everything the library provides.