#ifndef SIMDJSON_LASX_SIMD_H
#define SIMDJSON_LASX_SIMD_H

#ifndef SIMDJSON_CONDITIONAL_INCLUDE
#include "simdjson/lasx/base.h"
#include "simdjson/lasx/bitmanipulation.h"
#include "simdjson/internal/simdprune_tables.h"
#endif // SIMDJSON_CONDITIONAL_INCLUDE

namespace simdjson {
namespace lasx {
namespace {
namespace simd {
template<typename Child>
struct base {
  __m256i value;

  // Zero constructor
  simdjson_inline base() : value{__m256i()} {}

  // Conversion from SIMD register
  simdjson_inline base(const __m256i _value) : value(_value) {}

  // Conversion to SIMD register
  simdjson_inline operator const __m256i&() const { return this->value; }
  simdjson_inline operator __m256i&() { return this->value; }
  simdjson_inline operator const v32i8&() const { return (v32i8&)this->value; }
  simdjson_inline operator v32i8&() { return (v32i8&)this->value; }

  // Bit operations
  simdjson_inline Child operator|(const Child other) const { return __lasx_xvor_v(*this, other); }
  simdjson_inline Child operator&(const Child other) const { return __lasx_xvand_v(*this, other); }
  simdjson_inline Child operator^(const Child other) const { return __lasx_xvxor_v(*this, other); }
  simdjson_inline Child bit_andnot(const Child other) const { return __lasx_xvandn_v(other, *this); }
  simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
  simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
  simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
};

// Forward-declared so it can be used by splat and friends.
template<typename T>
struct simd8;
template<typename T, typename Mask=simd8<bool>>
struct base8: base<simd8<T>> {
  simdjson_inline base8() : base<simd8<T>>() {}
  simdjson_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}

  friend simdjson_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return __lasx_xvseq_b(lhs, rhs); }

  static const int SIZE = sizeof(base<simd8<T>>::value);
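
  // prev<N>() returns this register shifted forward by N bytes, with the last N
  // bytes of prev_chunk shifted in, so byte i of the result is byte i-N of the
  // 64-byte stream formed by prev_chunk followed by *this. The LASX byte shifts
  // (xvbsll_v/xvbsrl_v) act within each 128-bit lane, so the cross-lane carry is
  // stitched back together with xvpermi_q.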
  template<int N=1>
  simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    __m256i hi = __lasx_xvbsll_v(*this, N);
    __m256i lo = __lasx_xvbsrl_v(*this, 16 - N);
    __m256i tmp = __lasx_xvbsrl_v(prev_chunk, 16 - N);
    lo = __lasx_xvpermi_q(lo, tmp, 0x21);
    return __lasx_xvor_v(hi, lo);
  }
};

template<>
struct simd8<bool>: base8<bool> {
  static simdjson_inline simd8<bool> splat(bool _value) { return __lasx_xvreplgr2vr_b(uint8_t(-(!!_value))); }

  simdjson_inline simd8() : base8() {}
  simdjson_inline simd8(const __m256i _value) : base8<bool>(_value) {}
  // Splat constructor
  simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
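
  // Note: xvmskltz_b gathers the sign bit of every byte, leaving one 16-bit mask
  // per 128-bit lane (in word elements 0 and 4); to_bitmask() combines the two
  // lane masks into a single 32-bit result, bit i corresponding to byte i.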
  simdjson_inline int to_bitmask() const {
    __m256i mask = __lasx_xvmskltz_b(*this);
    return (__lasx_xvpickve2gr_w(mask, 4) << 16) | (__lasx_xvpickve2gr_w(mask, 0));
  }
  simdjson_inline bool any() const {
    __m256i v = __lasx_xvmsknz_b(*this);
    return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4));
  }
  simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
};

template<typename T>
struct base8_numeric: base8<T> {
  static simdjson_inline simd8<T> splat(T _value) {
    return __lasx_xvreplgr2vr_b(_value);
  }
  static simdjson_inline simd8<T> zero() { return __lasx_xvldi(0); }
  static simdjson_inline simd8<T> load(const T values[32]) {
    return __lasx_xvld(reinterpret_cast<const __m256i *>(values), 0);
  }
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdjson_inline simd8<T> repeat_16(
    T v0, T v1, T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
    T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
  ) {
    return simd8<T>(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15,
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    );
  }

  simdjson_inline base8_numeric() : base8<T>() {}
  simdjson_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}

  // Store to array
  simdjson_inline void store(T dst[32]) const {
    return __lasx_xvst(*this, reinterpret_cast<__m256i *>(dst), 0);
  }

  // Addition/subtraction are the same for signed and unsigned
  simdjson_inline simd8<T> operator+(const simd8<T> other) const { return __lasx_xvadd_b(*this, other); }
  simdjson_inline simd8<T> operator-(const simd8<T> other) const { return __lasx_xvsub_b(*this, other); }
  simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
  simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }

  // Override to distinguish from the bool version
  simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
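
  // lookup_16 treats each byte of *this as an index into the 16-entry table
  // (the table is passed twice so both 128-bit lanes see the same entries);
  // as elsewhere in simdjson, indices are assumed to be in the range 0..15.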
  template<typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return __lasx_xvshuf_b(lookup_table, lookup_table, *this);
  }
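
  // compress() copies to 'output' the bytes whose corresponding bit in 'mask' is 0,
  // keeping their order; 32 bytes are written but only the first
  // 32 - count_ones(mask) are meaningful. The pruning is done 8 input bytes at a
  // time via the precomputed thintable_epi8 shuffles, then the pieces are stitched
  // together with a second shuffle (pshufb_combine_table) and a scalar fix-up for
  // the upper half.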
  template<typename L>
  simdjson_inline void compress(uint32_t mask, L * output) const {
    using internal::thintable_epi8;
    using internal::BitsSetTable256mul2;
    using internal::pshufb_combine_table;
    // Split the mask into four 8-bit pieces, one per 8 input bytes.
    uint8_t mask1 = uint8_t(mask);
    uint8_t mask2 = uint8_t(mask >> 8);
    uint8_t mask3 = uint8_t(mask >> 16);
    uint8_t mask4 = uint8_t(mask >> 24);
    // Load the per-8-byte shuffle masks; the +0x0808... makes the second group index bytes 8..15.
    __m256i shufmask = {
      int64_t(thintable_epi8[mask1]),
      int64_t(thintable_epi8[mask2]) + 0x0808080808080808,
      int64_t(thintable_epi8[mask3]),
      int64_t(thintable_epi8[mask4]) + 0x0808080808080808
    };
    // "Nearly pruned" vector: each 8-byte half of each 16-byte lane is compacted independently.
    __m256i pruned = __lasx_xvshuf_b(*this, *this, shufmask);
    // Popcounts (times two) of the first three mask bytes tell us how much was removed.
    int pop1 = BitsSetTable256mul2[mask1];
    int pop2 = BitsSetTable256mul2[mask2];
    int pop3 = BitsSetTable256mul2[mask3];
    // Load the combining shuffle masks and merge the 8-byte halves of each lane.
    __m256i masklo = __lasx_xvldx(reinterpret_cast<void*>(reinterpret_cast<unsigned long>(pshufb_combine_table)), pop1 * 8);
    __m256i maskhi = __lasx_xvldx(reinterpret_cast<void*>(reinterpret_cast<unsigned long>(pshufb_combine_table)), pop3 * 8);
    __m256i compactmask = __lasx_xvpermi_q(maskhi, masklo, 0x20);
    __m256i answer = __lasx_xvshuf_b(pruned, pruned, compactmask);
    __lasx_xvst(answer, reinterpret_cast<uint8_t*>(output), 0);
    // Move the compressed upper 16 bytes so they directly follow the kept bytes of the lower 16.
    uint64_t value3 = __lasx_xvpickve2gr_du(answer, 2);
    uint64_t value4 = __lasx_xvpickve2gr_du(answer, 3);
    uint64_t *pos = reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(output) + 16 - (pop1 + pop2) / 2);
    pos[0] = value3;
    pos[1] = value4;
  }

  template<typename L>
  simdjson_inline simd8<L> lookup_16(
      L replace0,  L replace1,  L replace2,  L replace3,
      L replace4,  L replace5,  L replace6,  L replace7,
      L replace8,  L replace9,  L replace10, L replace11,
      L replace12, L replace13, L replace14, L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
      replace0,  replace1,  replace2,  replace3,
      replace4,  replace5,  replace6,  replace7,
      replace8,  replace9,  replace10, replace11,
      replace12, replace13, replace14, replace15
    ));
  }
};

// Signed bytes
template<>
struct simd8<int8_t> : base8_numeric<int8_t> {
  simdjson_inline simd8() : base8_numeric<int8_t>() {}
  simdjson_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
  // Splat constructor
  simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdjson_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
  // Member-by-member initialization
  simdjson_inline simd8(
    int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
    int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
    int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
    int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
  ) : simd8(__m256i(v32i8{
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10,v11,v12,v13,v14,v15,
    v16,v17,v18,v19,v20,v21,v22,v23,
    v24,v25,v26,v27,v28,v29,v30,v31
  })) {}
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdjson_inline static simd8<int8_t> repeat_16(
    int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
    int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
  ) {
    return simd8<int8_t>(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15,
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    );
  }

  // Order-sensitive operations
  simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return __lasx_xvmax_b(*this, other); }
  simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return __lasx_xvmin_b(*this, other); }
  simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return __lasx_xvslt_b(other, *this); }
  simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return __lasx_xvslt_b(*this, other); }
};

// Unsigned bytes
template<>
struct simd8<uint8_t>: base8_numeric<uint8_t> {
  simdjson_inline simd8() : base8_numeric<uint8_t>() {}
  simdjson_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
  // Splat constructor
  simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdjson_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
  // Member-by-member initialization
  simdjson_inline simd8(
    uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
    uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
    uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
    uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
  ) : simd8(__m256i(v32u8{
    v0, v1, v2, v3, v4, v5, v6, v7,
    v8, v9, v10,v11,v12,v13,v14,v15,
    v16,v17,v18,v19,v20,v21,v22,v23,
    v24,v25,v26,v27,v28,v29,v30,v31
  })) {}
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdjson_inline static simd8<uint8_t> repeat_16(
    uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
    uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
  ) {
    return simd8<uint8_t>(
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15,
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15
    );
  }

  // Saturated math
  simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return __lasx_xvsadd_bu(*this, other); }
  simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return __lasx_xvssub_bu(*this, other); }

  // Order-specific operations
  simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return __lasx_xvmax_bu(*this, other); }
  simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return __lasx_xvmin_bu(other, *this); }
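
  // gt_bits/lt_bits rely on unsigned saturating subtraction: the result byte is
  // nonzero exactly when the comparison holds, which is cheaper than producing a
  // full 0xFF/0x00 mask and is sufficient for any_bits_set().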
  simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
  simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
  simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
  simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
  simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
  simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }

  // Bit-specific operations
  simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
  simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
  simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
  simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
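
  // is_ascii(): true when no byte has its most significant bit set, i.e. every
  // byte is below 0x80; bits_not_set_anywhere(): true when every byte is zero
  // (both lane masks from xvmskltz_b/xvmsknz_b must come back as zero).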
  simdjson_inline bool is_ascii() const {
    __m256i mask = __lasx_xvmskltz_b(*this);
    return (0 == __lasx_xvpickve2gr_w(mask, 0)) && (0 == __lasx_xvpickve2gr_w(mask, 4));
  }
  simdjson_inline bool bits_not_set_anywhere() const {
    __m256i v = __lasx_xvmsknz_b(*this);
    return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4));
  }
  simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
  simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
    __m256i v = __lasx_xvmsknz_b(__lasx_xvand_v(*this, bits));
    return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4));
  }
  simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
  template<int N>
  simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(__lasx_xvsrli_b(*this, N)); }
  template<int N>
  simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(__lasx_xvslli_b(*this, N)); }
};
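
// simd8x64<T> views a 64-byte block of input as NUM_CHUNKS (here two) 32-byte
// registers and exposes 64-bit results with one bit per input byte. For
// illustration only (a typical stage-1 style use, not part of this header):
//   simd8x64<uint8_t> in(buf);      // load 64 bytes
//   uint64_t quotes = in.eq('"');   // bit i set when buf[i] == '"'
//   uint64_t ws     = in.lteq(' '); // bit i set when buf[i] <= 0x20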

template<typename T>
struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 2, "LASX kernel should use two registers per 64-byte block.");
  const simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
  simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed

  simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
  simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
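
  // compress(): writes to 'output' the bytes whose bit in 'mask' is 0, in order,
  // and returns how many bytes were kept. The popcounts of the two 32-bit halves
  // of ~mask give the number of bytes each chunk contributes.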
  simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
    uint32_t mask1 = uint32_t(mask);
    uint32_t mask2 = uint32_t(mask >> 32);
    __m256i zcnt = __lasx_xvpcnt_w(__m256i(v4u64{~mask, 0, 0, 0}));
    uint64_t zcnt1 = __lasx_xvpickve2gr_wu(zcnt, 0);
    uint64_t zcnt2 = __lasx_xvpickve2gr_wu(zcnt, 1);
    this->chunks[0].compress(mask1, output);
    this->chunks[1].compress(mask2, output + zcnt1);
    return zcnt1 + zcnt2;
  }

  simdjson_inline void store(T ptr[64]) const {
    this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
    this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
  }
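
  // to_bitmask(): gathers the most significant bit of each of the 64 bytes into a
  // uint64_t (bit i corresponds to byte i). Each xvmskltz_b call yields one 16-bit
  // mask per 128-bit lane; the insert/pack sequence below stitches the four lane
  // masks back into input order before extracting the low 64 bits.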

  simdjson_inline uint64_t to_bitmask() const {
    __m256i mask0 = __lasx_xvmskltz_b(this->chunks[0]);
    __m256i mask1 = __lasx_xvmskltz_b(this->chunks[1]);
    __m256i mask_tmp = __lasx_xvpickve_w(mask0, 4);
    __m256i tmp = __lasx_xvpickve_w(mask1, 4);
    mask0 = __lasx_xvinsve0_w(mask0, mask1, 1);
    mask_tmp = __lasx_xvinsve0_w(mask_tmp, tmp, 1);
    return __lasx_xvpickve2gr_du(__lasx_xvpackev_h(mask_tmp, mask0), 0);
  }

  simdjson_inline simd8<T> reduce_or() const {
    return this->chunks[0] | this->chunks[1];
  }

  simdjson_inline uint64_t eq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(
      this->chunks[0] == mask,
      this->chunks[1] == mask
    ).to_bitmask();
  }

  simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
    return simd8x64<bool>(
      this->chunks[0] == other.chunks[0],
      this->chunks[1] == other.chunks[1]
    ).to_bitmask();
  }

  simdjson_inline uint64_t lteq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(
      this->chunks[0] <= mask,
      this->chunks[1] <= mask
    ).to_bitmask();
  }
}; // struct simd8x64<T>

} // namespace simd
} // unnamed namespace
} // namespace lasx
} // namespace simdjson

#endif // SIMDJSON_LASX_SIMD_H