simdjson  3.11.0
Ridiculously Fast JSON
simd.h
#ifndef SIMDJSON_LASX_SIMD_H
#define SIMDJSON_LASX_SIMD_H

#ifndef SIMDJSON_CONDITIONAL_INCLUDE
#include "simdjson/lasx/base.h"
#include "simdjson/lasx/bitmanipulation.h"
#include "simdjson/internal/simdprune_tables.h"
#endif // SIMDJSON_CONDITIONAL_INCLUDE

namespace simdjson {
namespace lasx {
namespace {
namespace simd {

  // Forward-declared so they can be used by splat and friends.
  template<typename Child>
  struct base {
    __m256i value;

    // Zero constructor
    simdjson_inline base() : value{__m256i()} {}

    // Conversion from SIMD register
    simdjson_inline base(const __m256i _value) : value(_value) {}

    // Conversion to SIMD register
    simdjson_inline operator const __m256i&() const { return this->value; }
    simdjson_inline operator __m256i&() { return this->value; }
    simdjson_inline operator const v32i8&() const { return (v32i8&)this->value; }
    simdjson_inline operator v32i8&() { return (v32i8&)this->value; }

    // Bit operations
    simdjson_inline Child operator|(const Child other) const { return __lasx_xvor_v(*this, other); }
    simdjson_inline Child operator&(const Child other) const { return __lasx_xvand_v(*this, other); }
    simdjson_inline Child operator^(const Child other) const { return __lasx_xvxor_v(*this, other); }
    simdjson_inline Child bit_andnot(const Child other) const { return __lasx_xvandn_v(other, *this); }
    simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
    simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
    simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
  };

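  // base<Child> follows the curiously recurring template pattern (CRTP): the
  // operators return Child rather than base, so expressions such as a | b on a
  // simd8<uint8_t> keep their concrete type. A minimal sketch of the idiom
  // (illustrative names, not part of simdjson):
  //
  //   template<typename Child>
  //   struct crtp_base {
  //     int v;
  //     Child doubled() const { return Child(v * 2); } // returns the derived type
  //   };
  //   struct concrete : crtp_base<concrete> {
  //     explicit concrete(int x) { v = x; }
  //   };
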
  // Forward-declared so they can be used by splat and friends.
  template<typename T>
  struct simd8;

  template<typename T, typename Mask=simd8<bool>>
  struct base8: base<simd8<T>> {
    simdjson_inline base8() : base<simd8<T>>() {}
    simdjson_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}

    friend simdjson_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return __lasx_xvseq_b(lhs, rhs); }

    static const int SIZE = sizeof(base<simd8<T>>::value);

    template<int N=1>
    simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
      __m256i hi = __lasx_xvbsll_v(*this, N);
      __m256i lo = __lasx_xvbsrl_v(*this, 16 - N);
      __m256i tmp = __lasx_xvbsrl_v(prev_chunk, 16 - N);
      lo = __lasx_xvpermi_q(lo, tmp, 0x21);
      return __lasx_xvor_v(hi, lo);
    }
  };

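  // How prev<N> works: __lasx_xvbsll_v / __lasx_xvbsrl_v shift bytes within each
  // 128-bit lane only, so a plain shift would drop the bytes that cross the lane
  // boundary. The xvpermi_q with immediate 0x21 builds a carry register whose low
  // lane holds prev_chunk's shifted high lane and whose high lane holds this
  // chunk's shifted low lane. With N=1, bytes p0..p31 followed by c0..c31 yield
  // p31, c0, c1, ..., c30: every byte is paired with its predecessor, which is
  // what lets scanners look back one byte across 32-byte chunk boundaries.
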
  // SIMD byte mask type (returned by things like eq and gt)
  template<>
  struct simd8<bool>: base8<bool> {
    static simdjson_inline simd8<bool> splat(bool _value) { return __lasx_xvreplgr2vr_b(uint8_t(-(!!_value))); }

    simdjson_inline simd8() : base8() {}
    simdjson_inline simd8(const __m256i _value) : base8<bool>(_value) {}
    // Splat constructor
    simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}

    simdjson_inline int to_bitmask() const {
      __m256i mask = __lasx_xvmskltz_b(*this);
      return (__lasx_xvpickve2gr_w(mask, 4) << 16) | (__lasx_xvpickve2gr_w(mask, 0));
    }
    simdjson_inline bool any() const {
      __m256i v = __lasx_xvmsknz_b(*this);
      return (0 != __lasx_xvpickve2gr_w(v, 0)) || (0 != __lasx_xvpickve2gr_w(v, 4));
    }
    simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
  };

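  // Layout note: __lasx_xvmskltz_b gathers the sign bit of each byte into one
  // 16-bit mask per 128-bit lane, leaving the low-lane mask in word 0 and the
  // high-lane mask in word 4. to_bitmask() reads those two words and splices them
  // into a single 32-bit mask in which bit i corresponds to byte i; for example,
  // if only bytes 0 and 17 compare true, the result is 0x00020001.
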
  template<typename T>
  struct base8_numeric: base8<T> {
    static simdjson_inline simd8<T> splat(T _value) {
      return __lasx_xvreplgr2vr_b(_value);
    }
    static simdjson_inline simd8<T> zero() { return __lasx_xvldi(0); }
    static simdjson_inline simd8<T> load(const T values[32]) {
      return __lasx_xvld(reinterpret_cast<const __m256i *>(values), 0);
    }
    // Repeat 16 values as many times as necessary (usually for lookup tables)
    static simdjson_inline simd8<T> repeat_16(
      T v0, T v1, T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
      T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
    ) {
      return simd8<T>(
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10,v11,v12,v13,v14,v15,
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10,v11,v12,v13,v14,v15
      );
    }

    simdjson_inline base8_numeric() : base8<T>() {}
    simdjson_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}

    // Store to array
    simdjson_inline void store(T dst[32]) const {
      return __lasx_xvst(*this, reinterpret_cast<__m256i *>(dst), 0);
    }

    // Addition/subtraction are the same for signed and unsigned
    simdjson_inline simd8<T> operator+(const simd8<T> other) const { return __lasx_xvadd_b(*this, other); }
    simdjson_inline simd8<T> operator-(const simd8<T> other) const { return __lasx_xvsub_b(*this, other); }
    simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
    simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }

    // Override to distinguish from bool version
    simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

    // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
    template<typename L>
    simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
      return __lasx_xvshuf_b(lookup_table, lookup_table, *this);
    }

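    // Typical lookup_16 use: each input byte's low nibble indexes a 16-entry
    // table. A hedged sketch (this table is illustrative, not the one simdjson's
    // scanner actually uses):
    //
    //   const simd8<uint8_t> table = simd8<uint8_t>::repeat_16(
    //     0, 0, 0, 0, 0, 0, 0, 0,
    //     0, 0, 1, 0, 2, 1, 0, 0   // e.g. tag nibbles 0xA, 0xC, 0xD
    //   );
    //   simd8<uint8_t> classes = (input & 0xF).lookup_16(table);
    //
    // Masking to the low nibble first keeps every index in the defined 0..15 range.
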
    // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
    // Passing a 0 value for mask would be equivalent to writing out every byte to output.
    // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
    // get written.
    template<typename L>
    simdjson_inline void compress(uint32_t mask, L * output) const {
      using internal::thintable_epi8;
      using internal::BitsSetTable256mul2;
      using internal::pshufb_combine_table;
      // This particular implementation was inspired by the haswell kernel;
      // LASX does it in four steps: the first 8 bytes, then the second 8 bytes, and so on.
      uint8_t mask1 = uint8_t(mask);       // least significant 8 bits
      uint8_t mask2 = uint8_t(mask >> 8);  // second least significant 8 bits
      uint8_t mask3 = uint8_t(mask >> 16); // ...
      uint8_t mask4 = uint8_t(mask >> 24); // ...
      // The next line just loads the 64-bit values thintable_epi8[mask{1,2,3,4}]
      // into a 256-bit register.
      __m256i shufmask = {int64_t(thintable_epi8[mask1]), int64_t(thintable_epi8[mask2]) + 0x0808080808080808, int64_t(thintable_epi8[mask3]), int64_t(thintable_epi8[mask4]) + 0x0808080808080808};
      // This is the "nearly pruned" version.
      __m256i pruned = __lasx_xvshuf_b(*this, *this, shufmask);
      // We still need to put the pieces back together.
      // We compute the popcounts of the first three mask bytes
      // (BitsSetTable256mul2 stores popcount * 2):
      int pop1 = BitsSetTable256mul2[mask1];
      int pop2 = BitsSetTable256mul2[mask2];
      int pop3 = BitsSetTable256mul2[mask3];

      // then load the corresponding combine masks
      __m256i masklo = __lasx_xvldx(reinterpret_cast<void*>(reinterpret_cast<unsigned long>(pshufb_combine_table)), pop1 * 8);
      __m256i maskhi = __lasx_xvldx(reinterpret_cast<void*>(reinterpret_cast<unsigned long>(pshufb_combine_table)), pop3 * 8);
      __m256i compactmask = __lasx_xvpermi_q(maskhi, masklo, 0x20);
      __m256i answer = __lasx_xvshuf_b(pruned, pruned, compactmask);
      __lasx_xvst(answer, reinterpret_cast<uint8_t*>(output), 0);
      uint64_t value3 = __lasx_xvpickve2gr_du(answer, 2);
      uint64_t value4 = __lasx_xvpickve2gr_du(answer, 3);
      uint64_t *pos = reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(output) + 16 - (pop1 + pop2) / 2);
      pos[0] = value3;
      pos[1] = value4;
    }

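    // Worked example for compress(): with mask = 0b0110 (bits 1 and 2 set),
    // bytes 1 and 2 are dropped and the survivors close ranks, so input
    // c0 c1 c2 ... c31 becomes c0 c3 c4 ... c31 in the first 30 bytes of output.
    // thintable_epi8 supplies the per-8-byte shuffle that deletes flagged bytes,
    // pshufb_combine_table shifts the second 8-byte group of each 16-byte half
    // left so it lands right after the survivors of the first group, and the
    // final unaligned stores of value3/value4 splice the upper half in behind
    // the lower one.
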
    template<typename L>
    simdjson_inline simd8<L> lookup_16(
        L replace0,  L replace1,  L replace2,  L replace3,
        L replace4,  L replace5,  L replace6,  L replace7,
        L replace8,  L replace9,  L replace10, L replace11,
        L replace12, L replace13, L replace14, L replace15) const {
      return lookup_16(simd8<L>::repeat_16(
        replace0,  replace1,  replace2,  replace3,
        replace4,  replace5,  replace6,  replace7,
        replace8,  replace9,  replace10, replace11,
        replace12, replace13, replace14, replace15
      ));
    }
  };

  // Signed bytes
  template<>
  struct simd8<int8_t> : base8_numeric<int8_t> {
    simdjson_inline simd8() : base8_numeric<int8_t>() {}
    simdjson_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
    // Splat constructor
    simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
    // Array constructor
    simdjson_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
    // Member-by-member initialization
    simdjson_inline simd8(
      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
      int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
      int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
    ) : simd8(__m256i(v32i8{
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15,
      v16,v17,v18,v19,v20,v21,v22,v23,
      v24,v25,v26,v27,v28,v29,v30,v31
    })) {}
    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_inline static simd8<int8_t> repeat_16(
      int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) {
      return simd8<int8_t>(
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10,v11,v12,v13,v14,v15,
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10,v11,v12,v13,v14,v15
      );
    }

    // Order-sensitive comparisons
    simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return __lasx_xvmax_b(*this, other); }
    simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return __lasx_xvmin_b(*this, other); }
    simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return __lasx_xvslt_b(other, *this); }
    simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return __lasx_xvslt_b(*this, other); }
  };

  // Unsigned bytes
  template<>
  struct simd8<uint8_t>: base8_numeric<uint8_t> {
    simdjson_inline simd8() : base8_numeric<uint8_t>() {}
    simdjson_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
    // Splat constructor
    simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
    // Array constructor
    simdjson_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
    // Member-by-member initialization
    simdjson_inline simd8(
      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
      uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
      uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
    ) : simd8(__m256i(v32u8{
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10,v11,v12,v13,v14,v15,
      v16,v17,v18,v19,v20,v21,v22,v23,
      v24,v25,v26,v27,v28,v29,v30,v31
    })) {}
    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_inline static simd8<uint8_t> repeat_16(
      uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) {
      return simd8<uint8_t>(
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10,v11,v12,v13,v14,v15,
        v0, v1, v2, v3, v4, v5, v6, v7,
        v8, v9, v10,v11,v12,v13,v14,v15
      );
    }

    // Saturated math
    simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return __lasx_xvsadd_bu(*this, other); }
    simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return __lasx_xvssub_bu(*this, other); }

    // Order-specific operations
    simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return __lasx_xvmax_bu(*this, other); }
    simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return __lasx_xvmin_bu(other, *this); }
    // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
    simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
    simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
    simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
    simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
    simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
    simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }

    // Bit-specific operations
    simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
    simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
    simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
    simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
    simdjson_inline bool is_ascii() const {
      __m256i mask = __lasx_xvmskltz_b(*this);
      return (0 == __lasx_xvpickve2gr_w(mask, 0)) && (0 == __lasx_xvpickve2gr_w(mask, 4));
    }
    simdjson_inline bool bits_not_set_anywhere() const {
      __m256i v = __lasx_xvmsknz_b(*this);
      return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4));
    }
    simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
    simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
      __m256i v = __lasx_xvmsknz_b(__lasx_xvand_v(*this, bits));
      return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4));
    }
    simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
    template<int N>
    simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(__lasx_xvsrli_b(*this, N)); }
    template<int N>
    simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(__lasx_xvslli_b(*this, N)); }
  };

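  // Why gt_bits/lt_bits use saturating subtraction: for unsigned bytes,
  // a.saturating_sub(b) clamps to 0 whenever a <= b, so a nonzero result byte
  // means a > b in that position. For example, 200 minus 50 saturates to 150
  // (nonzero, true) while 50 minus 200 saturates to 0 (false). This yields a
  // per-byte comparison in a single instruction, at the cost of the "true"
  // value being merely nonzero rather than the canonical 0xFF.
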
  template<typename T>
  struct simd8x64 {
    static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
    static_assert(NUM_CHUNKS == 2, "LASX kernel should use two registers per 64-byte block.");
    const simd8<T> chunks[NUM_CHUNKS];

    simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
    simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
    simd8x64() = delete; // no default constructor allowed

    simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
    simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}

    simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
      uint32_t mask1 = uint32_t(mask);
      uint32_t mask2 = uint32_t(mask >> 32);
      __m256i zcnt = __lasx_xvpcnt_w(__m256i(v4u64{~mask, 0, 0, 0}));
      uint64_t zcnt1 = __lasx_xvpickve2gr_wu(zcnt, 0);
      uint64_t zcnt2 = __lasx_xvpickve2gr_wu(zcnt, 1);
      // There is likely a threshold below which processing the mask in scalar
      // code would be faster.
      if (zcnt1)
        this->chunks[0].compress(mask1, output);
      if (zcnt2)
        this->chunks[1].compress(mask2, output + zcnt1);
      return zcnt1 + zcnt2;
    }

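    // Note on the counts: zcnt1 and zcnt2 are popcounts of the zero bits of each
    // mask half (hence the popcount of ~mask), i.e. how many bytes each chunk
    // keeps. The second chunk is therefore written at output + zcnt1, and the
    // return value zcnt1 + zcnt2 is the total number of surviving bytes.
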
    simdjson_inline void store(T ptr[64]) const {
      this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
      this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
    }

    simdjson_inline uint64_t to_bitmask() const {
      __m256i mask0 = __lasx_xvmskltz_b(this->chunks[0]);
      __m256i mask1 = __lasx_xvmskltz_b(this->chunks[1]);
      __m256i mask_tmp = __lasx_xvpickve_w(mask0, 4);
      __m256i tmp = __lasx_xvpickve_w(mask1, 4);
      mask0 = __lasx_xvinsve0_w(mask0, mask1, 1);
      mask_tmp = __lasx_xvinsve0_w(mask_tmp, tmp, 1);
      return __lasx_xvpickve2gr_du(__lasx_xvpackev_h(mask_tmp, mask0), 0);
    }

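    // How the 64-bit mask is assembled: each __lasx_xvmskltz_b call leaves one
    // 16-bit lane mask in word 0 and another in word 4. The xvpickve_w and
    // xvinsve0_w steps gather the four 16-bit pieces into two registers, words
    // {chunk0.low, chunk1.low} and {chunk0.high, chunk1.high}; xvpackev_h then
    // interleaves their even halfwords so the low 64 bits read chunk0.low,
    // chunk0.high, chunk1.low, chunk1.high, making bit i of the result
    // correspond to byte i of the 64-byte block.
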
    simdjson_inline simd8<T> reduce_or() const {
      return this->chunks[0] | this->chunks[1];
    }

    simdjson_inline uint64_t eq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return simd8x64<bool>(
        this->chunks[0] == mask,
        this->chunks[1] == mask
      ).to_bitmask();
    }

    simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
      return simd8x64<bool>(
        this->chunks[0] == other.chunks[0],
        this->chunks[1] == other.chunks[1]
      ).to_bitmask();
    }

    simdjson_inline uint64_t lteq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return simd8x64<bool>(
        this->chunks[0] <= mask,
        this->chunks[1] <= mask
      ).to_bitmask();
    }
  }; // struct simd8x64<T>

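  // A hedged usage sketch (not part of this header, names are illustrative):
  // this is the typical way a stage-1 scanner consumes simd8x64: load 64 bytes,
  // compare against a splatted character, and get back one bit per byte.
  //
  //   uint64_t quote_bits_of(const uint8_t block[64]) {
  //     simd8x64<uint8_t> in(block); // two 32-byte LASX registers
  //     return in.eq('"');           // bit i set <=> block[i] == '"'
  //   }
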
} // namespace simd
} // unnamed namespace
} // namespace lasx
} // namespace simdjson

#endif // SIMDJSON_LASX_SIMD_H