1#ifndef SIMDJSON_ICELAKE_SIMD_H
2#define SIMDJSON_ICELAKE_SIMD_H
4#ifndef SIMDJSON_CONDITIONAL_INCLUDE
5#include "simdjson/icelake/base.h"
6#include "simdjson/icelake/intrinsics.h"
7#include "simdjson/icelake/bitmanipulation.h"
8#include "simdjson/internal/simdprune_tables.h"
11#if defined(__GNUC__) && !defined(__clang__)
13#define SIMDJSON_GCC8 1
// Fallback definition of _mm512_set_epi8 implemented with _mm512_set_epi64.
// NOTE(review): the SIMDJSON_GCC8 macro defined just above suggests this is a
// GCC 8 workaround; the guarding #if is not visible here — confirm.
// Arguments are given from a0 to a63. Each run of eight bytes is packed into
// one 64-bit value with the lower-numbered argument in the more significant
// byte (a0 << 56 ... a7 << 0), and a0..a7 form the first argument passed to
// _mm512_set_epi64.
21inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
22 return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
23 uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
24 uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
25 uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
26 uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
27 uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
28 uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
29 uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56));
41 template<
typename Child>
46 simdjson_inline base() : value{__m512i()} {}
// Converting constructor: wraps an existing 512-bit register value.
49 simdjson_inline base(
const __m512i _value) : value(_value) {}
52 simdjson_inline
operator const __m512i&()
const {
return this->value; }
53 simdjson_inline
operator __m512i&() {
return this->value; }
56 simdjson_inline Child operator|(
const Child other)
const {
return _mm512_or_si512(*
this, other); }
57 simdjson_inline Child operator&(
const Child other)
const {
return _mm512_and_si512(*
this, other); }
58 simdjson_inline Child operator^(
const Child other)
const {
return _mm512_xor_si512(*
this, other); }
59 simdjson_inline Child bit_andnot(
const Child other)
const {
return _mm512_andnot_si512(other, *
this); }
60 simdjson_inline Child& operator|=(
const Child other) {
auto this_cast =
static_cast<Child*
>(
this); *this_cast = *this_cast | other;
return *this_cast; }
61 simdjson_inline Child& operator&=(
const Child other) {
auto this_cast =
static_cast<Child*
>(
this); *this_cast = *this_cast & other;
return *this_cast; }
62 simdjson_inline Child& operator^=(
const Child other) {
auto this_cast =
static_cast<Child*
>(
this); *this_cast = *this_cast ^ other;
return *this_cast; }
69 template<
typename T,
typename Mask=simd8<
bool>>
70 struct base8: base<simd8<T>> {
// Mask aliases: a 32-bit and a 64-bit unsigned integer type.
using bitmask_t = uint32_t;
using bitmask2_t = uint64_t;
74 simdjson_inline base8() : base<simd8<T>>() {}
75 simdjson_inline base8(
const __m512i _value) : base<simd8<T>>(_value) {}
// Byte-wise equality of two registers. Returns the mask produced by
// _mm512_cmpeq_epi8_mask as a uint64_t rather than a simd8<bool>.
// (The closing brace of this definition is not part of the visible lines.)
77 friend simdjson_really_inline uint64_t
operator==(
const simd8<T> lhs,
const simd8<T> rhs) {
78 return _mm512_cmpeq_epi8_mask(lhs, rhs);
80 static const int SIZE =
sizeof(base<T>::value);
// Combines this register with prev_chunk: _mm512_permutex2var_epi64 first
// builds an intermediate vector from 64-bit lanes of prev_chunk and *this
// (indices 6..13), then _mm512_alignr_epi8 aligns *this against that vector
// using a byte shift of 16 - N.
// N is declared outside the visible lines (presumably a template parameter of
// this member — confirm); the closing brace is also not visible.
83 simdjson_inline simd8<T> prev(
const simd8<T> prev_chunk)
const {
// Stored in a constexpr local before being passed as the shift argument.
85 constexpr int shift = 16 - N;
86 return _mm512_alignr_epi8(*
this, _mm512_permutex2var_epi64(prev_chunk, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), *
this), shift);
92 struct simd8<bool>: base8<bool> {
93 static simdjson_inline simd8<bool> splat(
bool _value) {
return _mm512_set1_epi8(uint8_t(-(!!_value))); }
95 simdjson_inline simd8() : base8() {}
96 simdjson_inline simd8(
const __m512i _value) : base8<bool>(_value) {}
98 simdjson_inline simd8(
bool _value) : base8<bool>(splat(_value)) {}
99 simdjson_inline
bool any()
const {
return !!_mm512_test_epi8_mask (*
this, *
this); }
100 simdjson_inline simd8<bool> operator~()
const {
return *
this ^
true; }
104 struct base8_numeric: base8<T> {
105 static simdjson_inline simd8<T> splat(T _value) {
return _mm512_set1_epi8(_value); }
106 static simdjson_inline simd8<T> zero() {
return _mm512_setzero_si512(); }
// Loads a full register from `values` with _mm512_loadu_si512 (the unaligned
// load intrinsic). The parameter is declared as an array of 64 elements.
// (The closing brace of this definition is not part of the visible lines.)
107 static simdjson_inline simd8<T> load(
const T values[64]) {
108 return _mm512_loadu_si512(
reinterpret_cast<const __m512i *
>(values));
111 static simdjson_inline simd8<T> repeat_16(
112 T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
113 T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
116 v0, v1, v2, v3, v4, v5, v6, v7,
117 v8, v9, v10,v11,v12,v13,v14,v15,
118 v0, v1, v2, v3, v4, v5, v6, v7,
119 v8, v9, v10,v11,v12,v13,v14,v15,
120 v0, v1, v2, v3, v4, v5, v6, v7,
121 v8, v9, v10,v11,v12,v13,v14,v15,
122 v0, v1, v2, v3, v4, v5, v6, v7,
123 v8, v9, v10,v11,v12,v13,v14,v15
127 simdjson_inline base8_numeric() : base8<T>() {}
128 simdjson_inline base8_numeric(
const __m512i _value) : base8<T>(_value) {}
131 simdjson_inline
void store(T dst[64])
const {
return _mm512_storeu_si512(
reinterpret_cast<__m512i *
>(dst), *
this); }
134 simdjson_inline simd8<T> operator+(
const simd8<T> other)
const {
return _mm512_add_epi8(*
this, other); }
135 simdjson_inline simd8<T> operator-(
const simd8<T> other)
const {
return _mm512_sub_epi8(*
this, other); }
136 simdjson_inline simd8<T>& operator+=(
const simd8<T> other) { *
this = *
this + other;
return *
static_cast<simd8<T>*
>(
this); }
137 simdjson_inline simd8<T>& operator-=(
const simd8<T> other) { *
this = *
this - other;
return *
static_cast<simd8<T>*
>(
this); }
140 simdjson_inline simd8<T> operator~()
const {
return *
this ^ 0xFFu; }
// Table lookup via _mm512_shuffle_epi8: the bytes of this register are used as
// the shuffle control applied to lookup_table.
// L is declared outside the visible lines (presumably a template parameter —
// confirm); the closing brace is also not visible.
144 simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table)
const {
145 return _mm512_shuffle_epi8(lookup_table, *
this);
// Compresses the register with the inverted mask and stores the result.
// _mm512_maskz_compress_epi8 is called with ~mask, so the selected bytes are
// those whose bit in `mask` is clear. The full 512-bit register is then
// written to `output` with _mm512_storeu_si512, regardless of how many bytes
// were selected.
// L is declared outside the visible lines (presumably a template parameter —
// confirm); the closing brace is also not visible.
156 simdjson_inline
void compress(uint64_t mask, L * output)
const {
160 __m512i compressed = _mm512_maskz_compress_epi8(~mask, *
this);
161 _mm512_storeu_si512(output, compressed);
165 simdjson_inline simd8<L> lookup_16(
166 L replace0, L replace1, L replace2, L replace3,
167 L replace4, L replace5, L replace6, L replace7,
168 L replace8, L replace9, L replace10, L replace11,
169 L replace12, L replace13, L replace14, L replace15)
const {
170 return lookup_16(simd8<L>::repeat_16(
171 replace0, replace1, replace2, replace3,
172 replace4, replace5, replace6, replace7,
173 replace8, replace9, replace10, replace11,
174 replace12, replace13, replace14, replace15
181 struct simd8<int8_t> : base8_numeric<int8_t> {
182 simdjson_inline simd8() : base8_numeric<int8_t>() {}
183 simdjson_inline simd8(
const __m512i _value) : base8_numeric<int8_t>(_value) {}
185 simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
187 simdjson_inline simd8(
const int8_t values[64]) : simd8(load(values)) {}
189 simdjson_inline simd8(
190 int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
191 int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
192 int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
193 int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31,
194 int8_t v32, int8_t v33, int8_t v34, int8_t v35, int8_t v36, int8_t v37, int8_t v38, int8_t v39,
195 int8_t v40, int8_t v41, int8_t v42, int8_t v43, int8_t v44, int8_t v45, int8_t v46, int8_t v47,
196 int8_t v48, int8_t v49, int8_t v50, int8_t v51, int8_t v52, int8_t v53, int8_t v54, int8_t v55,
197 int8_t v56, int8_t v57, int8_t v58, int8_t v59, int8_t v60, int8_t v61, int8_t v62, int8_t v63
198 ) : simd8(_mm512_set_epi8(
199 v63, v62, v61, v60, v59, v58, v57, v56,
200 v55, v54, v53, v52, v51, v50, v49, v48,
201 v47, v46, v45, v44, v43, v42, v41, v40,
202 v39, v38, v37, v36, v35, v34, v33, v32,
203 v31, v30, v29, v28, v27, v26, v25, v24,
204 v23, v22, v21, v20, v19, v18, v17, v16,
205 v15, v14, v13, v12, v11, v10, v9, v8,
206 v7, v6, v5, v4, v3, v2, v1, v0
210 simdjson_inline
static simd8<int8_t> repeat_16(
211 int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
212 int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
214 return simd8<int8_t>(
215 v0, v1, v2, v3, v4, v5, v6, v7,
216 v8, v9, v10,v11,v12,v13,v14,v15,
217 v0, v1, v2, v3, v4, v5, v6, v7,
218 v8, v9, v10,v11,v12,v13,v14,v15,
219 v0, v1, v2, v3, v4, v5, v6, v7,
220 v8, v9, v10,v11,v12,v13,v14,v15,
221 v0, v1, v2, v3, v4, v5, v6, v7,
222 v8, v9, v10,v11,v12,v13,v14,v15
227 simdjson_inline simd8<int8_t> max_val(
const simd8<int8_t> other)
const {
return _mm512_max_epi8(*
this, other); }
228 simdjson_inline simd8<int8_t> min_val(
const simd8<int8_t> other)
const {
return _mm512_min_epi8(*
this, other); }
230 simdjson_inline simd8<bool> operator>(
const simd8<int8_t> other)
const {
return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(*
this, other),_mm512_set1_epi8(uint8_t(0x80))); }
231 simdjson_inline simd8<bool> operator<(
const simd8<int8_t> other)
const {
return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(other, *
this),_mm512_set1_epi8(uint8_t(0x80))); }
236 struct simd8<uint8_t>: base8_numeric<uint8_t> {
237 simdjson_inline simd8() : base8_numeric<uint8_t>() {}
238 simdjson_inline simd8(
const __m512i _value) : base8_numeric<uint8_t>(_value) {}
240 simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
242 simdjson_inline simd8(
const uint8_t values[64]) : simd8(load(values)) {}
244 simdjson_inline simd8(
245 uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
246 uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
247 uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
248 uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31,
249 uint8_t v32, uint8_t v33, uint8_t v34, uint8_t v35, uint8_t v36, uint8_t v37, uint8_t v38, uint8_t v39,
250 uint8_t v40, uint8_t v41, uint8_t v42, uint8_t v43, uint8_t v44, uint8_t v45, uint8_t v46, uint8_t v47,
251 uint8_t v48, uint8_t v49, uint8_t v50, uint8_t v51, uint8_t v52, uint8_t v53, uint8_t v54, uint8_t v55,
252 uint8_t v56, uint8_t v57, uint8_t v58, uint8_t v59, uint8_t v60, uint8_t v61, uint8_t v62, uint8_t v63
253 ) : simd8(_mm512_set_epi8(
254 v63, v62, v61, v60, v59, v58, v57, v56,
255 v55, v54, v53, v52, v51, v50, v49, v48,
256 v47, v46, v45, v44, v43, v42, v41, v40,
257 v39, v38, v37, v36, v35, v34, v33, v32,
258 v31, v30, v29, v28, v27, v26, v25, v24,
259 v23, v22, v21, v20, v19, v18, v17, v16,
260 v15, v14, v13, v12, v11, v10, v9, v8,
261 v7, v6, v5, v4, v3, v2, v1, v0
265 simdjson_inline
static simd8<uint8_t> repeat_16(
266 uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
267 uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
269 return simd8<uint8_t>(
270 v0, v1, v2, v3, v4, v5, v6, v7,
271 v8, v9, v10,v11,v12,v13,v14,v15,
272 v0, v1, v2, v3, v4, v5, v6, v7,
273 v8, v9, v10,v11,v12,v13,v14,v15,
274 v0, v1, v2, v3, v4, v5, v6, v7,
275 v8, v9, v10,v11,v12,v13,v14,v15,
276 v0, v1, v2, v3, v4, v5, v6, v7,
277 v8, v9, v10,v11,v12,v13,v14,v15
282 simdjson_inline simd8<uint8_t> saturating_add(
const simd8<uint8_t> other)
const {
return _mm512_adds_epu8(*
this, other); }
283 simdjson_inline simd8<uint8_t> saturating_sub(
const simd8<uint8_t> other)
const {
return _mm512_subs_epu8(*
this, other); }
286 simdjson_inline simd8<uint8_t> max_val(
const simd8<uint8_t> other)
const {
return _mm512_max_epu8(*
this, other); }
287 simdjson_inline simd8<uint8_t> min_val(
const simd8<uint8_t> other)
const {
return _mm512_min_epu8(other, *
this); }
289 simdjson_inline simd8<uint8_t> gt_bits(
const simd8<uint8_t> other)
const {
return this->saturating_sub(other); }
291 simdjson_inline simd8<uint8_t> lt_bits(
const simd8<uint8_t> other)
const {
return other.saturating_sub(*
this); }
292 simdjson_inline uint64_t operator<=(
const simd8<uint8_t> other)
const {
return other.max_val(*
this) == other; }
293 simdjson_inline uint64_t operator>=(
const simd8<uint8_t> other)
const {
return other.min_val(*
this) == other; }
294 simdjson_inline simd8<bool> operator>(
const simd8<uint8_t> other)
const {
return this->gt_bits(other).any_bits_set(); }
295 simdjson_inline simd8<bool> operator<(
const simd8<uint8_t> other)
const {
return this->lt_bits(other).any_bits_set(); }
298 simdjson_inline simd8<bool> bits_not_set()
const {
return _mm512_mask_blend_epi8(*
this == uint8_t(0), _mm512_set1_epi8(0), _mm512_set1_epi8(-1)); }
299 simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits)
const {
return (*
this & bits).bits_not_set(); }
300 simdjson_inline simd8<bool> any_bits_set()
const {
return ~this->bits_not_set(); }
301 simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits)
const {
return ~this->bits_not_set(bits); }
303 simdjson_inline
bool is_ascii()
const {
return _mm512_movepi8_mask(*
this) == 0; }
// True when _mm512_test_epi8_mask of the register against itself yields an
// empty mask, i.e. no byte lane has any bit set.
// (The closing brace of this definition is not part of the visible lines.)
304 simdjson_inline
bool bits_not_set_anywhere()
const {
305 return !_mm512_test_epi8_mask(*
this, *
this);
307 simdjson_inline
bool any_bits_set_anywhere()
const {
return !bits_not_set_anywhere(); }
308 simdjson_inline
bool bits_not_set_anywhere(simd8<uint8_t> bits)
const {
return !_mm512_test_epi8_mask(*
this, bits); }
309 simdjson_inline
bool any_bits_set_anywhere(simd8<uint8_t> bits)
const {
return !bits_not_set_anywhere(bits); }
311 simdjson_inline simd8<uint8_t> shr()
const {
return simd8<uint8_t>(_mm512_srli_epi16(*
this, N)) & uint8_t(0xFFu >> N); }
313 simdjson_inline simd8<uint8_t> shl()
const {
return simd8<uint8_t>(_mm512_slli_epi16(*
this, N)) & uint8_t(0xFFu << N); }
317 simdjson_inline uint64_t get_bit()
const {
return _mm512_movepi8_mask(_mm512_slli_epi16(*
this, 7-N)); }
// Number of simd8<T> registers needed to cover 64 bytes.
322 static constexpr int NUM_CHUNKS = 64 /
sizeof(simd8<T>);
// Compile-time check that a single register covers the whole 64-byte block.
323 static_assert(NUM_CHUNKS == 1,
"Icelake kernel should use one register per 64-byte block.");
// The register(s) holding the block; declared const, so set only at construction.
324 const simd8<T> chunks[NUM_CHUNKS];
// Copy construction is explicitly disabled.
326 simd8x64(
const simd8x64<T>& o) =
delete;
327 simd8x64<T>& operator=(
const simd8<T>& other) =
delete;
// Two-register constructor.
// NOTE(review): NUM_CHUNKS is asserted to equal 1, so `chunks` has a single
// element while this initializer list supplies two. Confirm whether this
// overload can ever be instantiated or should be removed.
330 simdjson_inline simd8x64(
const simd8<T> chunk0,
const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
// Single-register constructor: the given register becomes chunks[0].
331 simdjson_inline simd8x64(
const simd8<T> chunk0) : chunks{chunk0} {}
// Pointer constructor: fills chunks[0] with simd8<T>::load(ptr). The parameter
// is declared as an array of 64 elements.
332 simdjson_inline simd8x64(
const T ptr[64]) : chunks{simd8<T>::load(ptr)} {}
// Forwards to chunks[0].compress(mask, output) and returns 64 minus the number
// of set bits in `mask` (presumably the count of bytes kept — confirm against
// the register-level compress).
// (The closing brace of this definition is not part of the visible lines.)
334 simdjson_inline uint64_t compress(uint64_t mask, T * output)
const {
335 this->chunks[0].compress(mask, output);
336 return 64 - count_ones(mask);
// Stores chunks[0] at `ptr`; the offset expression multiplies by 0, so the
// write starts at ptr itself.
// (The closing brace of this definition is not part of the visible lines.)
339 simdjson_inline
void store(T ptr[64])
const {
340 this->chunks[0].store(ptr+
sizeof(simd8<T>)*0);
// Returns chunks[0] unchanged; with a single chunk there is nothing to combine.
// (The closing brace of this definition is not part of the visible lines.)
343 simdjson_inline simd8<T> reduce_or()
const {
344 return this->chunks[0];
// Splats `m` into a register and ORs it with chunks[0].
// NOTE(review): the statement that wraps the OR expression into the returned
// simd8x64<T>, and the closing brace, are missing from the visible lines.
347 simdjson_inline simd8x64<T> bit_or(
const T m)
const {
348 const simd8<T> mask = simd8<T>::splat(m);
350 this->chunks[0] | mask
// Compares chunks[0] for equality against the splat of `m` and returns the
// comparison result as a uint64_t.
// (The closing brace of this definition is not part of the visible lines.)
354 simdjson_inline uint64_t eq(
const T m)
const {
355 const simd8<T> mask = simd8<T>::splat(m);
356 return this->chunks[0] == mask;
// Compares chunks[0] of this block against chunks[0] of `other` and returns
// the comparison result as a uint64_t. Note that the parameter type is fixed
// to simd8x64<uint8_t> rather than simd8x64<T>.
// (The closing brace of this definition is not part of the visible lines.)
359 simdjson_inline uint64_t eq(
const simd8x64<uint8_t> &other)
const {
360 return this->chunks[0] == other.chunks[0];
// Applies operator<= between chunks[0] and the splat of `m` and returns the
// result as a uint64_t.
// (The closing brace of this definition is not part of the visible lines.)
363 simdjson_inline uint64_t lteq(
const T m)
const {
364 const simd8<T> mask = simd8<T>::splat(m);
365 return this->chunks[0] <= mask;
simdjson_unused simdjson_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept
Comparisons between raw_json_string and std::string_view instances are potentially unsafe: the user i...
The top level simdjson namespace, containing everything the library provides.