simdjson 4.2.3
Ridiculously Fast JSON
simd.h
#ifndef SIMDJSON_LSX_SIMD_H
#define SIMDJSON_LSX_SIMD_H

#ifndef SIMDJSON_CONDITIONAL_INCLUDE
#include "simdjson/lsx/base.h"
#include "simdjson/lsx/bitmanipulation.h"
#include "simdjson/internal/simdprune_tables.h"
#endif // SIMDJSON_CONDITIONAL_INCLUDE

namespace simdjson {
namespace lsx {
namespace {
namespace simd {

  // Forward-declared so they can be used by splat and friends.
  template<typename Child>
  struct base {
    __m128i value;

    // Zero constructor
    simdjson_inline base() : value{__m128i()} {}

    // Conversion from SIMD register
    simdjson_inline base(const __m128i _value) : value(_value) {}

    // Conversion to SIMD register
    simdjson_inline operator const __m128i&() const { return this->value; }
    simdjson_inline operator __m128i&() { return this->value; }
    simdjson_inline operator const v16i8&() const { return (v16i8&)this->value; }
    simdjson_inline operator v16i8&() { return (v16i8&)this->value; }

    // Bit operations
    simdjson_inline Child operator|(const Child other) const { return __lsx_vor_v(*this, other); }
    simdjson_inline Child operator&(const Child other) const { return __lsx_vand_v(*this, other); }
    simdjson_inline Child operator^(const Child other) const { return __lsx_vxor_v(*this, other); }
    simdjson_inline Child bit_andnot(const Child other) const { return __lsx_vandn_v(other, *this); }
    simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
    simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
    simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
  };

  // Forward-declared so they can be used by splat and friends.
  template<typename T>
  struct simd8;

  template<typename T, typename Mask=simd8<bool>>
  struct base8: base<simd8<T>> {
    simdjson_inline base8() : base<simd8<T>>() {}
    simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}

    friend simdjson_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return __lsx_vseq_b(lhs, rhs); }

    static const int SIZE = sizeof(base<simd8<T>>::value);

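    // Shifts *this forward by N bytes, pulling in the last N bytes of prev_chunk:
    // byte i of the result is byte i-N of the concatenated (prev_chunk, *this) stream.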
    template<int N=1>
    simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
      return __lsx_vor_v(__lsx_vbsll_v(*this, N), __lsx_vbsrl_v(prev_chunk, 16 - N));
    }
  };

  // SIMD byte mask type (returned by things like eq and gt)
  template<>
  struct simd8<bool>: base8<bool> {
    static simdjson_inline simd8<bool> splat(bool _value) {
      return __lsx_vreplgr2vr_b(uint8_t(-(!!_value)));
    }

    simdjson_inline simd8() : base8() {}
    simdjson_inline simd8(const __m128i _value) : base8<bool>(_value) {}
    // Splat constructor
    simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}

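    // __lsx_vmskltz_b gathers the sign bit of each byte into a 16-bit mask
    // (the LSX analogue of SSE2's _mm_movemask_epi8); element 0 of the result
    // holds the mask, which __lsx_vpickve2gr_w then extracts.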
    simdjson_inline int to_bitmask() const { return __lsx_vpickve2gr_w(__lsx_vmskltz_b(*this), 0); }
    simdjson_inline bool any() const { return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0); }
    simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
  };

  template<typename T>
  struct base8_numeric: base8<T> {
    static simdjson_inline simd8<T> splat(T _value) { return __lsx_vreplgr2vr_b(_value); }
    static simdjson_inline simd8<T> zero() { return __lsx_vldi(0); }
    static simdjson_inline simd8<T> load(const T values[16]) {
      return __lsx_vld(reinterpret_cast<const __m128i *>(values), 0);
    }
    // Repeat 16 values as many times as necessary (usually for lookup tables)
    static simdjson_inline simd8<T> repeat_16(
      T v0, T v1, T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
      T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
    ) {
      return simd8<T>(
        v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
        v8, v9, v10, v11, v12, v13, v14, v15
      );
    }

    simdjson_inline base8_numeric() : base8<T>() {}
    simdjson_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}

    // Store to array
    simdjson_inline void store(T dst[16]) const {
      return __lsx_vst(*this, reinterpret_cast<__m128i *>(dst), 0);
    }

    // Addition/subtraction are the same for signed and unsigned
    simdjson_inline simd8<T> operator+(const simd8<T> other) const { return __lsx_vadd_b(*this, other); }
    simdjson_inline simd8<T> operator-(const simd8<T> other) const { return __lsx_vsub_b(*this, other); }
    simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
    simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }

    // Override to distinguish from bool version
    simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

    // Perform a lookup assuming each value is between 0 and 15 (undefined behavior for out-of-range values)
    template<typename L>
    simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
      return __lsx_vshuf_b(lookup_table, lookup_table, *this);
    }

    // Copies to 'output' all bytes corresponding to a 0 in the mask (interpreted as a bitset).
    // Passing a 0 value for mask would be equivalent to writing out every byte to output.
    // Only the first 16 - count_ones(mask) bytes of the result are significant, but 16 bytes
    // get written.
    template<typename L>
    simdjson_inline void compress(uint16_t mask, L * output) const {
      using internal::thintable_epi8;
      using internal::BitsSetTable256mul2;
      using internal::pshufb_combine_table;
      // This particular implementation was inspired by the haswell kernel.
      // LSX does it in two steps: first the low 8 bytes, then the high 8 bytes.
      uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
      uint8_t mask2 = uint8_t(mask >> 8); // second least significant 8 bits
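      // For example, if mask1 == 0b00000101, bytes 0 and 2 of the low half are dropped,
      // so thintable_epi8[mask1] begins with the indices 1, 3, 4, 5, 6, 7 that gather
      // the surviving bytes to the front; adding 0x0808080808080808 to the second entry
      // turns indices 0-7 into 8-15 so the same trick works on the high half.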
      // The next line just loads the 64-bit values thintable_epi8[mask1] and
      // thintable_epi8[mask2] into a 128-bit register.
      __m128i shufmask = {int64_t(thintable_epi8[mask1]), int64_t(thintable_epi8[mask2]) + 0x0808080808080808};
      // This is the "nearly pruned" version: each half is compacted, but the two halves are not yet joined.
      __m128i pruned = __lsx_vshuf_b(*this, *this, shufmask);
      // We still need to put the two halves back together.
      // We compute the popcount of the first half:
      int pop1 = BitsSetTable256mul2[mask1];
      // then load the corresponding mask
      __m128i compactmask = __lsx_vldx(reinterpret_cast<void*>(reinterpret_cast<unsigned long>(pshufb_combine_table)), pop1 * 8);
      __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask);
      __lsx_vst(answer, reinterpret_cast<uint8_t*>(output), 0);
    }

    template<typename L>
    simdjson_inline simd8<L> lookup_16(
        L replace0,  L replace1,  L replace2,  L replace3,
        L replace4,  L replace5,  L replace6,  L replace7,
        L replace8,  L replace9,  L replace10, L replace11,
        L replace12, L replace13, L replace14, L replace15) const {
      return lookup_16(simd8<L>::repeat_16(
        replace0,  replace1,  replace2,  replace3,
        replace4,  replace5,  replace6,  replace7,
        replace8,  replace9,  replace10, replace11,
        replace12, replace13, replace14, replace15
      ));
    }
  };

  // Signed bytes
  template<>
  struct simd8<int8_t> : base8_numeric<int8_t> {
    simdjson_inline simd8() : base8_numeric<int8_t>() {}
    simdjson_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
    // Splat constructor
    simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
    // Array constructor
    simdjson_inline simd8(const int8_t values[16]) : simd8(load(values)) {}
    // Member-by-member initialization
    simdjson_inline simd8(
      int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) : simd8(__m128i(v16i8{
      v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    })) {}
    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_inline static simd8<int8_t> repeat_16(
      int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
    ) {
      return simd8<int8_t>(
        v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
        v8, v9, v10, v11, v12, v13, v14, v15
      );
    }

    // Order-sensitive comparisons
    simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return __lsx_vmax_b(*this, other); }
    simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return __lsx_vmin_b(*this, other); }
    simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return __lsx_vslt_b(other, *this); }
    simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return __lsx_vslt_b(*this, other); }
  };

  // Unsigned bytes
  template<>
  struct simd8<uint8_t>: base8_numeric<uint8_t> {
    simdjson_inline simd8() : base8_numeric<uint8_t>() {}
    simdjson_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
    // Splat constructor
    simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
    // Array constructor
    simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
    // Member-by-member initialization
    simdjson_inline simd8(
      uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) : simd8(__m128i(v16u8{
      v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    })) {}
    // Repeat 16 values as many times as necessary (usually for lookup tables)
    simdjson_inline static simd8<uint8_t> repeat_16(
      uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
    ) {
      return simd8<uint8_t>(
        v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
        v8, v9, v10, v11, v12, v13, v14, v15
      );
    }

    // Saturated math
    simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return __lsx_vsadd_bu(*this, other); }
    simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return __lsx_vssub_bu(*this, other); }

    // Order-specific operations
    simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return __lsx_vmax_bu(*this, other); }
    simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return __lsx_vmin_bu(other, *this); }
    // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
    simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
    simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
    simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
    simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
    simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
    simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }

    // Bit-specific operations
    simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
    simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
    simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
    simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
    simdjson_inline bool is_ascii() const { return 0 == __lsx_vpickve2gr_w(__lsx_vmskltz_b(*this), 0); }
    simdjson_inline bool bits_not_set_anywhere() const { return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0); }
    simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
    simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
      return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(__lsx_vand_v(*this, bits)), 0);
    }
    simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
    template<int N>
    simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(__lsx_vsrli_b(*this, N)); }
    template<int N>
    simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(__lsx_vslli_b(*this, N)); }
  };

  template<typename T>
  struct simd8x64 {
    static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
    static_assert(NUM_CHUNKS == 4, "LSX kernel should use four registers per 64-byte block.");
    const simd8<T> chunks[NUM_CHUNKS];

    simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
    simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
    simd8x64() = delete; // no default constructor allowed

    simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
    simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}

    simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
      uint16_t mask1 = uint16_t(mask);
      uint16_t mask2 = uint16_t(mask >> 16);
      uint16_t mask3 = uint16_t(mask >> 32);
      uint16_t mask4 = uint16_t(mask >> 48);
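      // __lsx_vpcnt_h counts the set bits of each 16-bit word of ~mask, i.e. the
      // number of bytes kept in each chunk; these counts advance the output pointer.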
      __m128i zcnt = __lsx_vpcnt_h(__m128i(v2u64{~mask, 0}));
      uint64_t zcnt1 = __lsx_vpickve2gr_hu(zcnt, 0);
      uint64_t zcnt2 = __lsx_vpickve2gr_hu(zcnt, 1);
      uint64_t zcnt3 = __lsx_vpickve2gr_hu(zcnt, 2);
      uint64_t zcnt4 = __lsx_vpickve2gr_hu(zcnt, 3);
      uint8_t *voutput = reinterpret_cast<uint8_t*>(output);
      // There is likely a threshold below which processing a chunk with scalar code would be faster.
      if (zcnt1)
        this->chunks[0].compress(mask1, reinterpret_cast<T*>(voutput));
      voutput += zcnt1;
      if (zcnt2)
        this->chunks[1].compress(mask2, reinterpret_cast<T*>(voutput));
      voutput += zcnt2;
      if (zcnt3)
        this->chunks[2].compress(mask3, reinterpret_cast<T*>(voutput));
      voutput += zcnt3;
      if (zcnt4)
        this->chunks[3].compress(mask4, reinterpret_cast<T*>(voutput));
      voutput += zcnt4;
      return reinterpret_cast<uint64_t>(voutput) - reinterpret_cast<uint64_t>(output);
    }

    simdjson_inline void store(T ptr[64]) const {
      this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
      this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
      this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
      this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
    }

    simdjson_inline uint64_t to_bitmask() const {
      __m128i mask1 = __lsx_vmskltz_b(this->chunks[0]);
      __m128i mask2 = __lsx_vmskltz_b(this->chunks[1]);
      __m128i mask3 = __lsx_vmskltz_b(this->chunks[2]);
      __m128i mask4 = __lsx_vmskltz_b(this->chunks[3]);
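      // Each mask register holds a 16-bit result in element 0; interleaving the
      // half-words and then the words packs them so chunk 0 ends up in the
      // least-significant 16 bits of the 64-bit bitmask.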
      mask1 = __lsx_vilvl_h(mask2, mask1);
      mask2 = __lsx_vilvl_h(mask4, mask3);
      return __lsx_vpickve2gr_du(__lsx_vilvl_w(mask2, mask1), 0);
    }

    simdjson_inline simd8<T> reduce_or() const {
      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
    }

    simdjson_inline uint64_t eq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return simd8x64<bool>(
        this->chunks[0] == mask,
        this->chunks[1] == mask,
        this->chunks[2] == mask,
        this->chunks[3] == mask
      ).to_bitmask();
    }

    simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
      return simd8x64<bool>(
        this->chunks[0] == other.chunks[0],
        this->chunks[1] == other.chunks[1],
        this->chunks[2] == other.chunks[2],
        this->chunks[3] == other.chunks[3]
      ).to_bitmask();
    }

    simdjson_inline uint64_t lteq(const T m) const {
      const simd8<T> mask = simd8<T>::splat(m);
      return simd8x64<bool>(
        this->chunks[0] <= mask,
        this->chunks[1] <= mask,
        this->chunks[2] <= mask,
        this->chunks[3] <= mask
      ).to_bitmask();
    }
  }; // struct simd8x64<T>

} // namespace simd
} // unnamed namespace
} // namespace lsx
} // namespace simdjson

#endif // SIMDJSON_LSX_SIMD_H
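
For orientation, the following is not part of the header: a minimal, portable sketch of the semantics of simd8x64<uint8_t>::eq(m).to_bitmask(), which is roughly how simdjson's structural scanner builds its per-character bitmasks over each 64-byte block. The scalar_eq64 helper is hypothetical and exists only for illustration.

// Illustration only (not simdjson code): bit i of the result is set exactly when
// byte i of the 64-byte block equals m, with byte 0 in the least significant bit.
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint64_t scalar_eq64(const uint8_t block[64], uint8_t m) {
  uint64_t bits = 0;
  for (int i = 0; i < 64; i++) {
    bits |= uint64_t(block[i] == m) << i;
  }
  return bits;
}

int main() {
  uint8_t block[64];
  std::memset(block, ' ', sizeof(block));
  std::memcpy(block, "{\"key\":\"value\"}", 15);
  // Quotes sit at byte positions 1, 5, 7 and 13, so bits 1, 5, 7 and 13 are set: 0x20a2.
  std::printf("%016llx\n", (unsigned long long)scalar_eq64(block, '"'));
}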