simdjson 4.1.0
Ridiculously Fast JSON
Loading...
Searching...
No Matches
simd.h
1#ifndef SIMDJSON_PPC64_SIMD_H
2#define SIMDJSON_PPC64_SIMD_H
3
4#ifndef SIMDJSON_CONDITIONAL_INCLUDE
5#include "simdjson/ppc64/base.h"
6#include "simdjson/ppc64/bitmanipulation.h"
7#include "simdjson/internal/simdprune_tables.h"
8#endif // SIMDJSON_CONDITIONAL_INCLUDE
9
10#include <type_traits>
11
12namespace simdjson {
13namespace ppc64 {
14namespace {
15namespace simd {
16
17using __m128i = __vector unsigned char;
18
19template <typename Child> struct base {
20 __m128i value;
21
22 // Zero constructor
23 simdjson_inline base() : value{__m128i()} {}
24
25 // Conversion from SIMD register
26 simdjson_inline base(const __m128i _value) : value(_value) {}
27
28 // Conversion to SIMD register
29 simdjson_inline operator const __m128i &() const {
30 return this->value;
31 }
32 simdjson_inline operator __m128i &() { return this->value; }
33
34 // Bit operations
35 simdjson_inline Child operator|(const Child other) const {
36 return vec_or(this->value, (__m128i)other);
37 }
38 simdjson_inline Child operator&(const Child other) const {
39 return vec_and(this->value, (__m128i)other);
40 }
41 simdjson_inline Child operator^(const Child other) const {
42 return vec_xor(this->value, (__m128i)other);
43 }
44 simdjson_inline Child bit_andnot(const Child other) const {
45 return vec_andc(this->value, (__m128i)other);
46 }
47 simdjson_inline Child &operator|=(const Child other) {
48 auto this_cast = static_cast<Child*>(this);
49 *this_cast = *this_cast | other;
50 return *this_cast;
51 }
52 simdjson_inline Child &operator&=(const Child other) {
53 auto this_cast = static_cast<Child*>(this);
54 *this_cast = *this_cast & other;
55 return *this_cast;
56 }
57 simdjson_inline Child &operator^=(const Child other) {
58 auto this_cast = static_cast<Child*>(this);
59 *this_cast = *this_cast ^ other;
60 return *this_cast;
61 }
62};
63
// Common behavior for the 16x8-bit lane types (bool, int8_t, uint8_t).
// `Mask` is the type produced by lane-wise comparisons.
template <typename T, typename Mask = simd8<bool>>
struct base8 : base<simd8<T>> {
  // One bit per byte lane: 16 lanes -> 16 bits.
  typedef uint16_t bitmask_t;
  // Two concatenated 16-bit lane masks.
  typedef uint32_t bitmask2_t;

  simdjson_inline base8() : base<simd8<T>>() {}
  simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}

  // Lane-wise equality: each equal byte lane becomes all-ones, others zero.
  friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) {
    return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
  }

  // Width of the register in bytes (16).
  static const int SIZE = sizeof(base<simd8<T>>::value);

  // Sliding-window shift: returns this chunk shifted ahead by N bytes, with
  // the vacated leading bytes filled from the tail of `prev_chunk` (as used
  // when scanning 64-byte blocks that need the last bytes of the previous
  // block).  vec_sld concatenates/shifts in big-endian byte order, so on
  // little-endian targets both inputs are byte-reversed before the shift
  // and the result reversed back afterwards.
  template <int N = 1>
  simdjson_inline simd8<T> prev(simd8<T> prev_chunk) const {
    __m128i chunk = this->value;
#ifdef __LITTLE_ENDIAN__
    chunk = (__m128i)vec_reve(this->value);
    prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
#endif
    chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
#ifdef __LITTLE_ENDIAN__
    chunk = (__m128i)vec_reve((__m128i)chunk);
#endif
    return chunk;
  }
};
92
// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base8<bool> {
  // Broadcast a bool to every lane: true -> 0xFF, false -> 0x00.
  static simdjson_inline simd8<bool> splat(bool _value) {
    return (__m128i)vec_splats((unsigned char)(-(!!_value)));
  }

  simdjson_inline simd8() : base8<bool>() {}
  simdjson_inline simd8(const __m128i _value)
      : base8<bool>(_value) {}
  // Splat constructor
  simdjson_inline simd8(bool _value)
      : base8<bool>(splat(_value)) {}

  // Pack the lane mask down to 16 bits: bit i of the result is taken from
  // byte lane i.  Each entry of perm_mask is a bit index into the 128-bit
  // register for vec_vbpermq (0x78, 0x70, ... select the high bit of
  // successive bytes).
  simdjson_inline int to_bitmask() const {
    __vector unsigned long long result;
    const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
                               0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

    result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
                                                       (__m128i)perm_mask));
    // vbpermq deposits the gathered bits in one 64-bit element; which
    // element that is depends on the target's endianness.
#ifdef __LITTLE_ENDIAN__
    return static_cast<int>(result[1]);
#else
    return static_cast<int>(result[0]);
#endif
  }
  // True if any lane (any byte) is non-zero.
  simdjson_inline bool any() const {
    return !vec_all_eq(this->value, (__m128i)vec_splats(0));
  }
  // Lane-wise logical NOT, implemented as XOR with an all-ones mask.
  simdjson_inline simd8<bool> operator~() const {
    return this->value ^ (__m128i)splat(true);
  }
};
126
127template <typename T> struct base8_numeric : base8<T> {
128 static simdjson_inline simd8<T> splat(T value) {
129 (void)value;
130 return (__m128i)vec_splats(value);
131 }
132 static simdjson_inline simd8<T> zero() { return splat(0); }
133 static simdjson_inline simd8<T> load(const T values[16]) {
134 return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t *>(values)));
135 }
136 // Repeat 16 values as many times as necessary (usually for lookup tables)
137 static simdjson_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
138 T v5, T v6, T v7, T v8, T v9,
139 T v10, T v11, T v12, T v13,
140 T v14, T v15) {
141 return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
142 v14, v15);
143 }
144
145 simdjson_inline base8_numeric() : base8<T>() {}
146 simdjson_inline base8_numeric(const __m128i _value)
147 : base8<T>(_value) {}
148
149 // Store to array
150 simdjson_inline void store(T dst[16]) const {
151 vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst));
152 }
153
154 // Override to distinguish from bool version
155 simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
156
157 // Addition/subtraction are the same for signed and unsigned
158 simdjson_inline simd8<T> operator+(const simd8<T> other) const {
159 return (__m128i)((__m128i)this->value + (__m128i)other);
160 }
161 simdjson_inline simd8<T> operator-(const simd8<T> other) const {
162 return (__m128i)((__m128i)this->value - (__m128i)other);
163 }
164 simdjson_inline simd8<T> &operator+=(const simd8<T> other) {
165 *this = *this + other;
166 return *static_cast<simd8<T> *>(this);
167 }
168 simdjson_inline simd8<T> &operator-=(const simd8<T> other) {
169 *this = *this - other;
170 return *static_cast<simd8<T> *>(this);
171 }
172
173 // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
174 // for out of range values)
175 template <typename L>
176 simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
177 return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
178 }
179
180 // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted
181 // as a bitset). Passing a 0 value for mask would be equivalent to writing out
182 // every byte to output. Only the first 16 - count_ones(mask) bytes of the
183 // result are significant but 16 bytes get written. Design consideration: it
184 // seems like a function with the signature simd8<L> compress(uint32_t mask)
185 // would be sensible, but the AVX ISA makes this kind of approach difficult.
186 template <typename L>
187 simdjson_inline void compress(uint16_t mask, L *output) const {
188 using internal::BitsSetTable256mul2;
189 using internal::pshufb_combine_table;
190 using internal::thintable_epi8;
191 // this particular implementation was inspired by work done by @animetosho
192 // we do it in two steps, first 8 bytes and then second 8 bytes
193 uint8_t mask1 = uint8_t(mask); // least significant 8 bits
194 uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
195 // next line just loads the 64-bit values thintable_epi8[mask1] and
196 // thintable_epi8[mask2] into a 128-bit register, using only
197 // two instructions on most compilers.
198#ifdef __LITTLE_ENDIAN__
199 __m128i shufmask = (__m128i)(__vector unsigned long long){
200 thintable_epi8[mask1], thintable_epi8[mask2]};
201#else
202 __m128i shufmask = (__m128i)(__vector unsigned long long){
203 thintable_epi8[mask2], thintable_epi8[mask1]};
204 shufmask = (__m128i)vec_reve((__m128i)shufmask);
205#endif
206 // we increment by 0x08 the second half of the mask
207 shufmask = ((__m128i)shufmask) +
208 ((__m128i)(__vector int){0, 0, 0x08080808, 0x08080808});
209
210 // this is the version "nearly pruned"
211 __m128i pruned = vec_perm(this->value, this->value, shufmask);
212 // we still need to put the two halves together.
213 // we compute the popcount of the first half:
214 int pop1 = BitsSetTable256mul2[mask1];
215 // then load the corresponding mask, what it does is to write
216 // only the first pop1 bytes from the first 8 bytes, and then
217 // it fills in with the bytes from the second 8 bytes + some filling
218 // at the end.
219 __m128i compactmask =
220 vec_vsx_ld(0, reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
221 __m128i answer = vec_perm(pruned, (__m128i)vec_splats(0), compactmask);
222 vec_vsx_st(answer, 0, reinterpret_cast<__m128i *>(output));
223 }
224
225 template <typename L>
226 simdjson_inline simd8<L>
227 lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
228 L replace5, L replace6, L replace7, L replace8, L replace9,
229 L replace10, L replace11, L replace12, L replace13, L replace14,
230 L replace15) const {
231 return lookup_16(simd8<L>::repeat_16(
232 replace0, replace1, replace2, replace3, replace4, replace5, replace6,
233 replace7, replace8, replace9, replace10, replace11, replace12,
234 replace13, replace14, replace15));
235 }
236};
237
238// Signed bytes
239template <> struct simd8<int8_t> : base8_numeric<int8_t> {
240 simdjson_inline simd8() : base8_numeric<int8_t>() {}
241 simdjson_inline simd8(const __m128i _value)
242 : base8_numeric<int8_t>(_value) {}
243 // Splat constructor
244 simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
245 // Array constructor
246 simdjson_inline simd8(const int8_t *values) : simd8(load(values)) {}
247 // Member-by-member initialization
248 simdjson_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
249 int8_t v4, int8_t v5, int8_t v6, int8_t v7,
250 int8_t v8, int8_t v9, int8_t v10, int8_t v11,
251 int8_t v12, int8_t v13, int8_t v14, int8_t v15)
252 : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
253 v8, v9, v10, v11, v12, v13, v14,
254 v15}) {}
255 // Repeat 16 values as many times as necessary (usually for lookup tables)
256 simdjson_inline static simd8<int8_t>
257 repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
258 int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
259 int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
260 return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
261 v13, v14, v15);
262 }
263
264 // Order-sensitive comparisons
265 simdjson_inline simd8<int8_t>
266 max_val(const simd8<int8_t> other) const {
267 return (__m128i)vec_max((__vector signed char)this->value,
268 (__vector signed char)(__m128i)other);
269 }
270 simdjson_inline simd8<int8_t>
271 min_val(const simd8<int8_t> other) const {
272 return (__m128i)vec_min((__vector signed char)this->value,
273 (__vector signed char)(__m128i)other);
274 }
275 simdjson_inline simd8<bool>
276 operator>(const simd8<int8_t> other) const {
277 return (__m128i)vec_cmpgt((__vector signed char)this->value,
278 (__vector signed char)(__m128i)other);
279 }
280 simdjson_inline simd8<bool>
281 operator<(const simd8<int8_t> other) const {
282 return (__m128i)vec_cmplt((__vector signed char)this->value,
283 (__vector signed char)(__m128i)other);
284 }
285};
286
287// Unsigned bytes
288template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
289 simdjson_inline simd8() : base8_numeric<uint8_t>() {}
290 simdjson_inline simd8(const __m128i _value)
291 : base8_numeric<uint8_t>(_value) {}
292 // Splat constructor
293 simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
294 // Array constructor
295 simdjson_inline simd8(const uint8_t *values) : simd8(load(values)) {}
296 // Member-by-member initialization
297 simdjson_inline
298 simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
299 uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
300 uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
301 : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
302 v13, v14, v15}) {}
303 // Repeat 16 values as many times as necessary (usually for lookup tables)
304 simdjson_inline static simd8<uint8_t>
305 repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
306 uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
307 uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
308 uint8_t v15) {
309 return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
310 v13, v14, v15);
311 }
312
313 // Saturated math
314 simdjson_inline simd8<uint8_t>
315 saturating_add(const simd8<uint8_t> other) const {
316 return (__m128i)vec_adds(this->value, (__m128i)other);
317 }
318 simdjson_inline simd8<uint8_t>
319 saturating_sub(const simd8<uint8_t> other) const {
320 return (__m128i)vec_subs(this->value, (__m128i)other);
321 }
322
323 // Order-specific operations
324 simdjson_inline simd8<uint8_t>
325 max_val(const simd8<uint8_t> other) const {
326 return (__m128i)vec_max(this->value, (__m128i)other);
327 }
328 simdjson_inline simd8<uint8_t>
329 min_val(const simd8<uint8_t> other) const {
330 return (__m128i)vec_min(this->value, (__m128i)other);
331 }
332 // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
333 simdjson_inline simd8<uint8_t>
334 gt_bits(const simd8<uint8_t> other) const {
335 return this->saturating_sub(other);
336 }
337 // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
338 simdjson_inline simd8<uint8_t>
339 lt_bits(const simd8<uint8_t> other) const {
340 return other.saturating_sub(*this);
341 }
342 simdjson_inline simd8<bool>
343 operator<=(const simd8<uint8_t> other) const {
344 return other.max_val(*this) == other;
345 }
346 simdjson_inline simd8<bool>
347 operator>=(const simd8<uint8_t> other) const {
348 return other.min_val(*this) == other;
349 }
350 simdjson_inline simd8<bool>
351 operator>(const simd8<uint8_t> other) const {
352 return this->gt_bits(other).any_bits_set();
353 }
354 simdjson_inline simd8<bool>
355 operator<(const simd8<uint8_t> other) const {
356 return this->gt_bits(other).any_bits_set();
357 }
358
359 // Bit-specific operations
360 simdjson_inline simd8<bool> bits_not_set() const {
361 return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
362 }
363 simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
364 return (*this & bits).bits_not_set();
365 }
366 simdjson_inline simd8<bool> any_bits_set() const {
367 return ~this->bits_not_set();
368 }
369 simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
370 return ~this->bits_not_set(bits);
371 }
372 simdjson_inline bool bits_not_set_anywhere() const {
373 return vec_all_eq(this->value, (__m128i)vec_splats(0));
374 }
375 simdjson_inline bool any_bits_set_anywhere() const {
376 return !bits_not_set_anywhere();
377 }
378 simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
379 return vec_all_eq(vec_and(this->value, (__m128i)bits),
380 (__m128i)vec_splats(0));
381 }
382 simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
383 return !bits_not_set_anywhere(bits);
384 }
385 template <int N> simdjson_inline simd8<uint8_t> shr() const {
386 return simd8<uint8_t>(
387 (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
388 }
389 template <int N> simdjson_inline simd8<uint8_t> shl() const {
390 return simd8<uint8_t>(
391 (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
392 }
393};
394
395template <typename T> struct simd8x64 {
396 static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
397 static_assert(NUM_CHUNKS == 4,
398 "PPC64 kernel should use four registers per 64-byte block.");
399 const simd8<T> chunks[NUM_CHUNKS];
400
401 simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
402 simd8x64<T> &
403 operator=(const simd8<T>& other) = delete; // no assignment allowed
404 simd8x64() = delete; // no default constructor allowed
405
406 simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
407 const simd8<T> chunk2, const simd8<T> chunk3)
408 : chunks{chunk0, chunk1, chunk2, chunk3} {}
409 simdjson_inline simd8x64(const T ptr[64])
410 : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr + 16),
411 simd8<T>::load(ptr + 32), simd8<T>::load(ptr + 48)} {}
412
413 simdjson_inline void store(T ptr[64]) const {
414 this->chunks[0].store(ptr + sizeof(simd8<T>) * 0);
415 this->chunks[1].store(ptr + sizeof(simd8<T>) * 1);
416 this->chunks[2].store(ptr + sizeof(simd8<T>) * 2);
417 this->chunks[3].store(ptr + sizeof(simd8<T>) * 3);
418 }
419
420 simdjson_inline simd8<T> reduce_or() const {
421 return (this->chunks[0] | this->chunks[1]) |
422 (this->chunks[2] | this->chunks[3]);
423 }
424
425 simdjson_inline uint64_t compress(uint64_t mask, T *output) const {
426 this->chunks[0].compress(uint16_t(mask), output);
427 this->chunks[1].compress(uint16_t(mask >> 16),
428 output + 16 - count_ones(mask & 0xFFFF));
429 this->chunks[2].compress(uint16_t(mask >> 32),
430 output + 32 - count_ones(mask & 0xFFFFFFFF));
431 this->chunks[3].compress(uint16_t(mask >> 48),
432 output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
433 return 64 - count_ones(mask);
434 }
435
436 simdjson_inline uint64_t to_bitmask() const {
437 uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
438 uint64_t r1 = this->chunks[1].to_bitmask();
439 uint64_t r2 = this->chunks[2].to_bitmask();
440 uint64_t r3 = this->chunks[3].to_bitmask();
441 return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
442 }
443
444 simdjson_inline uint64_t eq(const T m) const {
445 const simd8<T> mask = simd8<T>::splat(m);
446 return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
447 this->chunks[2] == mask, this->chunks[3] == mask)
448 .to_bitmask();
449 }
450
451 simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
452 return simd8x64<bool>(this->chunks[0] == other.chunks[0],
453 this->chunks[1] == other.chunks[1],
454 this->chunks[2] == other.chunks[2],
455 this->chunks[3] == other.chunks[3])
456 .to_bitmask();
457 }
458
459 simdjson_inline uint64_t lteq(const T m) const {
460 const simd8<T> mask = simd8<T>::splat(m);
461 return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
462 this->chunks[2] <= mask, this->chunks[3] <= mask)
463 .to_bitmask();
464 }
465}; // struct simd8x64<T>
466
467} // namespace simd
468} // unnamed namespace
469} // namespace ppc64
470} // namespace simdjson
471
#endif // SIMDJSON_PPC64_SIMD_H
simdjson_unused simdjson_inline bool operator==(const raw_json_string &a, std::string_view c) noexcept
Comparisons between raw_json_string and std::string_view instances are potentially unsafe: the user i...
The top level simdjson namespace, containing everything the library provides.
Definition base.h:8