simdjson 4.2.3
Ridiculously Fast JSON
Loading...
Searching...
No Matches
compile_time_json-inl.h
Go to the documentation of this file.
1
10#ifndef SIMDJSON_GENERIC_COMPILE_TIME_JSON_INL_H
11
12#if SIMDJSON_STATIC_REFLECTION
13
15#include <array>
16#include <cstdint>
17#include <meta>
18#include <string_view>
19
20#include <algorithm>
21#include <array>
22#include <charconv>
23#include <cstdint>
24#include <expected>
25#include <meta>
26#include <string>
27#include <string_view>
28#include <vector>
29
// Signals an unrecoverable error while parsing at compile time.
// The arguments (by convention a human-readable message) are discarded.
// std::abort() is not a constexpr function, so reaching this statement during
// constant evaluation makes the evaluation fail, i.e., compilation stops.
#define simdjson_consteval_error(...)                                          \
  { std::abort(); }
34
35namespace simdjson {
36namespace compile_time {
37
47namespace number_parsing {
48
49// Counts the number of leading zeros in a 64-bit integer.
50consteval int leading_zeroes(uint64_t input_num, int last_bit = 0) {
51 if (input_num & uint64_t(0xffffffff00000000)) {
52 input_num >>= 32;
53 last_bit |= 32;
54 }
55 if (input_num & uint64_t(0xffff0000)) {
56 input_num >>= 16;
57 last_bit |= 16;
58 }
59 if (input_num & uint64_t(0xff00)) {
60 input_num >>= 8;
61 last_bit |= 8;
62 }
63 if (input_num & uint64_t(0xf0)) {
64 input_num >>= 4;
65 last_bit |= 4;
66 }
67 if (input_num & uint64_t(0xc)) {
68 input_num >>= 2;
69 last_bit |= 2;
70 }
71 if (input_num & uint64_t(0x2)) { /* input_num >>= 1; */
72 last_bit |= 1;
73 }
74 return 63 - last_bit;
75}
76// Multiplies two 32-bit unsigned integers and returns a 64-bit result.
77consteval uint64_t emulu(uint32_t x, uint32_t y) { return x * (uint64_t)y; }
78consteval uint64_t umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) {
79 uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd);
80 uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd);
81 uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32));
82 uint64_t adbc_carry = (uint64_t)(adbc < ad);
83 uint64_t lo = bd + (adbc << 32);
84 *hi = emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) +
85 (adbc_carry << 32) + (uint64_t)(lo < bd);
86 return lo;
87}
88
// A 128-bit unsigned integer stored as two 64-bit halves.
// simdjson has another value128 struct elsewhere; a dedicated one is kept
// here for clarity.
struct value128 {
  uint64_t low;  // least significant 64 bits
  uint64_t high; // most significant 64 bits

  // Zero-initialized value.
  constexpr value128() : value128(0, 0) {}
  constexpr value128(uint64_t _low, uint64_t _high) : low{_low}, high{_high} {}
};
99
100// Multiplies two 64-bit integers and returns a 128-bit result as value128.
101consteval value128 full_multiplication(uint64_t a, uint64_t b) {
102 value128 answer;
103 answer.low = umul128_generic(a, b, &answer.high);
104 return answer;
105}
106
107// Converts mantissa and exponent to a double, considering the sign.
108consteval double to_double(uint64_t mantissa, int64_t exponent, bool negative) {
109 uint64_t sign_bit = negative ? (1ULL << 63) : 0;
110 uint64_t exponent_bits = (uint64_t(exponent) & 0x7FF) << 52;
111 uint64_t bits = sign_bit | exponent_bits | (mantissa & ((1ULL << 52) - 1));
112 return std::bit_cast<double>(bits);
113}
114
115// Attempts to compute i * 10^(power) exactly; and if "negative" is
116// true, negate the result.
117// Returns true on success, false on failure.
118// Failure suggests and invalid input or out-of-range result.
119consteval bool compute_float_64(int64_t power, uint64_t i, bool negative,
120 double &d) {
121 if (i == 0) {
122 d = negative ? -0.0 : 0.0;
123 return true;
124 }
125 int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
126 int lz = leading_zeroes(i);
127 i <<= lz;
128 const uint32_t index =
129 2 * uint32_t(power - simdjson::internal::smallest_power);
130 value128 firstproduct = full_multiplication(
131 i, simdjson::internal::powers_template<>::power_of_five_128[index]);
132 if ((firstproduct.high & 0x1FF) == 0x1FF) {
133 value128 secondproduct = full_multiplication(
134 i, simdjson::internal::powers_template<>::power_of_five_128[index + 1]);
135 firstproduct.low += secondproduct.high;
136 if (secondproduct.high > firstproduct.low) {
137 firstproduct.high++;
138 }
139 }
140 uint64_t lower = firstproduct.low;
141 uint64_t upper = firstproduct.high;
142 uint64_t upperbit = upper >> 63;
143 uint64_t mantissa = upper >> (upperbit + 9);
144 lz += int(1 ^ upperbit);
145 int64_t real_exponent = exponent - lz;
146 if (real_exponent <= 0) {
147 if (-real_exponent + 1 >= 64) {
148 d = negative ? -0.0 : 0.0;
149 return true;
150 }
151 mantissa >>= -real_exponent + 1;
152 mantissa += (mantissa & 1);
153 mantissa >>= 1;
154 real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
155 d = to_double(mantissa, real_exponent, negative);
156 return true;
157 }
158 if ((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1)) {
159 if ((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
160 mantissa &= ~1;
161 }
162 }
163 mantissa += mantissa & 1;
164 mantissa >>= 1;
165 if (mantissa >= (1ULL << 53)) {
166 mantissa = (1ULL << 52);
167 real_exponent++;
168 }
169 mantissa &= ~(1ULL << 52);
170 if (real_exponent > 2046) {
171 return false;
172 }
173 d = to_double(mantissa, real_exponent, negative);
174 return true;
175}
176
177// Parses a single digit character and updates the integer value.
178consteval bool parse_digit(const char c, uint64_t &i) {
179 const uint8_t digit = static_cast<uint8_t>(c - '0');
180 if (digit > 9) {
181 return false;
182 }
183 i = 10 * i + digit;
184 return true;
185}
186
187// Parses a JSON float from a string starting at src.
188// Returns the parsed double and the number of characters consumed.
189consteval std::pair<double, size_t> parse_double(const char *src,
190 const char *end) {
191 auto get_value = [&](const char *pointer) -> char {
192 if (pointer == end) {
193 return '\0';
194 }
195 return *pointer;
196 };
197 const char *srcinit = src;
198 bool negative = (get_value(src) == '-');
199 src += uint8_t(negative);
200 uint64_t i = 0;
201 const char *p = src;
202 p += parse_digit(get_value(p), i);
203 bool leading_zero = (i == 0);
204 while (parse_digit(get_value(p), i)) {
205 p++;
206 }
207 if (p == src) {
208 simdjson_consteval_error("Invalid float value");
209 }
210 if ((leading_zero && p != src + 1)) {
211 simdjson_consteval_error("Invalid float value");
212 }
213 int64_t exponent = 0;
214 bool overflow;
215 if (get_value(p) == '.') {
216 p++;
217 const char *start_decimal_digits = p;
218 if (!parse_digit(get_value(p), i)) {
219 simdjson_consteval_error("Invalid float value");
220 } // no decimal digits
221 p++;
222 while (parse_digit(get_value(p), i)) {
223 p++;
224 }
225 exponent = -(p - start_decimal_digits);
226 overflow = p - src - 1 > 19;
227 if (overflow && leading_zero) {
228 const char *start_digits = src + 2;
229 while (get_value(start_digits) == '0') {
230 start_digits++;
231 }
232 overflow = p - start_digits > 19;
233 }
234 } else {
235 overflow = p - src > 19;
236 }
237 if (overflow) {
238 simdjson_consteval_error(
239 "Overflow while computing the float value: too many digits");
240 }
241 if (get_value(p) == 'e' || get_value(p) == 'E') {
242 p++;
243 bool exp_neg = get_value(p) == '-';
244 p += exp_neg || get_value(p) == '+';
245 uint64_t exp = 0;
246 const char *start_exp_digits = p;
247 while (parse_digit(get_value(p), exp)) {
248 p++;
249 }
250 if (p - start_exp_digits == 0 || p - start_exp_digits > 19) {
251 simdjson_consteval_error("Invalid float value");
252 }
253 exponent += exp_neg ? 0 - exp : exp;
254 }
255
256 overflow = overflow || exponent < simdjson::internal::smallest_power ||
257 exponent > simdjson::internal::largest_power;
258 if (overflow) {
259 simdjson_consteval_error("Overflow while computing the float value");
260 }
261 double d;
262 if (!compute_float_64(exponent, i, negative, d)) {
263 simdjson_consteval_error("Overflow while computing the float value");
264 }
265 return {d, size_t(p - srcinit)};
266}
267} // namespace number_parsing
268
// A compile-time string whose characters are encoded as template arguments.
// The original rationale: JSON strings may contain embedded nulls, and C++26
// reflection does not yet support std::string_view as a data member type.
// This type has no data members at all: the characters live in a static
// array, and the length is tracked separately from the null terminator.
//
// All accessors are const so that they can be used on const/constexpr
// instances.
template <char... Vals> struct fixed_json_string {
  // Statically-allocated array holding the characters and a null terminator.
  static constexpr char inner_data[] = {Vals..., '\0'};

  // Number of characters, excluding the null terminator.
  static constexpr std::size_t inner_size = sizeof...(Vals);

  // View over the characters. It is built from pointer and size so that
  // embedded nulls are kept and the terminator is excluded.
  static constexpr std::string_view view = {inner_data, inner_size};

  // Implicit conversion to a view over the characters.
  constexpr operator std::string_view() const { return view; }
  // Pointer to the null-terminated character array.
  constexpr const char *c_str() const { return view.data(); }
  // Pointer to the first character (same storage as c_str()).
  constexpr const char *data() const { return view.data(); }
  // Number of characters, excluding the null terminator.
  constexpr size_t size() const { return view.size(); }
};
288
289consteval std::meta::info to_fixed_json_string(std::string_view in) {
290 std::vector<std::meta::info> Args = {};
291 for (char c : in) {
292 Args.push_back(std::meta::reflect_constant(c));
293 }
294 return std::meta::substitute(^^fixed_json_string, Args);
295}
296
300template <std::meta::info... meta_info> struct type_builder {
301 struct constructed_type;
302 consteval {
303 std::meta::define_aggregate(^^constructed_type, {
304 meta_info...});
305 }
306};
307
314template <std::meta::info... meta_info>
315using class_type = type_builder<meta_info...>::constructed_type;
316
// A constant of type T, brace-initialized from the values Vs in order.
// The parsers refer to specializations of this variable template to represent
// fully constructed objects and arrays.
template <class T, auto... Vs> constexpr auto construct_from = T{Vs...};
324
// Returns true for the four characters that JSON allows as insignificant
// whitespace between tokens: space, line feed, horizontal tab and carriage
// return.
[[nodiscard]] constexpr bool is_whitespace(char c) {
  switch (c) {
  case ' ':
  case '\n':
  case '\t':
  case '\r':
    return true;
  default:
    return false;
  }
}
330
331[[nodiscard]] constexpr std::string_view trim_whitespace(std::string_view str) {
332 size_t start = 0;
333 size_t end = str.size();
334
335 while (start < end && is_whitespace(str[start])) {
336 start++;
337 }
338 while (end > start && is_whitespace(str[end - 1])) {
339 end--;
340 }
341 return str.substr(start, end - start);
342}
343
344// Forward declaration
345consteval std::pair<std::meta::info, size_t>
346parse_json_object_impl(std::string_view json);
347
351
352// Parses a JSON number from a string view.
353// Returns the number of characters consumed and the parsed value
354// as a variant of int64_t, uint64_t, or double.
355[[nodiscard]] consteval size_t
356parse_number(std::string_view json,
357 std::variant<int64_t, uint64_t, double> &out) {
358 if (json.empty()) {
359 simdjson_consteval_error("Empty string is not a valid JSON number");
360 }
361 if (json[0] == '+') {
362 simdjson_consteval_error("Invalid number: leading '+' sign is not allowed");
363 }
364 // Compute the range and determine the type.
365 auto it = json.begin();
366 if (it != json.end() && (*it == '-')) {
367 it++;
368 }
369 while (it != json.end() && (*it >= '0' && *it <= '9')) {
370 it++;
371 }
372 bool is_float = false;
373 if (it != json.end() && (*it == '.' || *it == 'e' || *it == 'E')) {
374 is_float = true;
375 if (*it == '.') {
376 it++;
377 while (it != json.end() && (*it >= '0' && *it <= '9')) {
378 it++;
379 }
380 }
381 if (it != json.end() && (*it == 'e' || *it == 'E')) {
382 it++;
383 if (it != json.end() && (*it == '+' || *it == '-')) {
384 it++;
385 }
386 while (it != json.end() && (*it >= '0' && *it <= '9')) {
387 it++;
388 }
389 }
390 }
391 size_t scope = it - json.begin();
392
393 bool is_negative = json[0] == '-';
394 // Note that we consider -0 to be an integer unless it has a decimal point or
395 // exponent.
396 if (is_float) {
397 // It would be cool to use std::from_chars in a consteval context, but it is
398 // not supported yet for floating point types. :-(
399 auto [value, offset] =
400 number_parsing::parse_double(json.data(), json.data() + json.size());
401 if (offset != scope) {
402 simdjson_consteval_error(
403 "Internal error: cannot agree on the character range of the float");
404 }
405 out = value;
406 return offset;
407 } else if (is_negative) {
408 int64_t int_value = 0;
409 std::from_chars_result res =
410 std::from_chars(json.data(), json.data() + json.size(), int_value);
411 if (res.ec == std::errc()) {
412 out = int_value;
413 if ((res.ptr - json.data()) != scope) {
414 simdjson_consteval_error(
415 "Internal error: cannot agree on the character range of the float");
416 }
417 return (res.ptr - json.data());
418 } else {
419 simdjson_consteval_error("Invalid integer value");
420 }
421 } else {
422 uint64_t uint_value = 0;
423 std::from_chars_result res =
424 std::from_chars(json.data(), json.data() + json.size(), uint_value);
425 if (res.ec == std::errc()) {
426 out = uint_value;
427 if ((res.ptr - json.data()) != scope) {
428 simdjson_consteval_error(
429 "Internal error: cannot agree on the character range of the float");
430 }
431 return (res.ptr - json.data());
432 } else {
433 simdjson_consteval_error("Invalid unsigned integer value");
434 }
435 }
436}
437
441
442// parse a JSON string value, handling escape sequences and validating UTF-8
443// Returns the created string and the number of characters consumed.
444// Note that the number of characters consumed includes the surrounding quotes.
445// The number of bytes written to out differs from the number of characters
446// consumed in general because of escape sequences and UTF-8 encoding.
447[[nodiscard]] consteval std::pair<std::string, size_t>
448parse_string(std::string_view json) {
449 auto cursor = json.begin();
450 auto end = json.end();
451 std::string out;
452
453 // Expect starting quote
454 if (cursor == end || *(cursor++) != '"') {
455 simdjson_consteval_error("Expected opening quote for string");
456 }
457 // Notice that the quote is not appended.
458
459 // Process \uXXXX escape sequences, writes out to out.
460 // Returns true on success, false on error.
461 auto process_escaped_unicode = [&] [[nodiscard]] () -> bool {
462 // Processing \uXXXX escape sequence is a bit complicated, so we
463 // define helper lambdas here.
464 //
465 // consume the XXXX in \uXXXX and return the corresponding code point.
466 // In case of error, a value greater than 0xFFFF is returned.
467 // The caller should check!
468 auto hex_to_u32 = [&] [[nodiscard]] () -> uint32_t {
469 auto digit = [](uint8_t c) -> uint32_t {
470 if (c >= '0' && c <= '9')
471 return c - '0';
472 if (c >= 'A' && c <= 'F')
473 return c - 'A' + 10;
474 if (c >= 'a' && c <= 'f')
475 return c - 'a' + 10;
476 return 0xFFFFFFFF;
477 };
478 if (end - cursor < 4) {
479 return 0xFFFFFFFF;
480 }
481 uint32_t d0 = digit(*cursor++);
482 uint32_t d1 = digit(*cursor++);
483 uint32_t d2 = digit(*cursor++);
484 uint32_t d3 = digit(*cursor++);
485
486 return (d0 << 12) | (d1 << 8) | (d2 << 4) | d3;
487 };
488 // Write code point as UTF-8 into out.
489 // The caller must ensure that the code point is valid,
490 // i.e., not in the surrogate range or greater than 0x10FFFF.
491 auto codepoint_to_utf8 = [&out] [[nodiscard]] (uint32_t cp) -> bool {
492 // the high and low surrogates U+D800 through U+DFFF are invalid code
493 // points
494 if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
495 return false;
496 }
497 if (cp <= 0x7F) {
498 out.push_back(uint8_t(cp));
499 } else if (cp <= 0x7FF) {
500 out.push_back(uint8_t((cp >> 6) + 192));
501 out.push_back(uint8_t((cp & 63) + 128));
502 } else if (cp <= 0xFFFF) {
503 out.push_back(uint8_t((cp >> 12) + 224));
504 out.push_back(uint8_t(((cp >> 6) & 63) + 128));
505 out.push_back(uint8_t((cp & 63) + 128));
506 } else if (cp <= 0x10FFFF) {
507 out.push_back(uint8_t((cp >> 18) + 240));
508 out.push_back(uint8_t(((cp >> 12) & 63) + 128));
509 out.push_back(uint8_t(((cp >> 6) & 63) + 128));
510 out.push_back(uint8_t((cp & 63) + 128));
511 } else {
512 return false;
513 }
514 return true;
515 };
516 // We begin the implementation of process_escaped_unicode here.
517 // The substitution_code_point is used for invalid sequences.
518 // (This is U+FFFD, the Unicode replacement character.)
519 constexpr uint32_t substitution_code_point = 0xfffd;
520 // We need at least 4 characters for the hex digits
521 if (end - cursor < 4) {
522 return false;
523 }
524 // Consume the 4 hex digits
525 uint32_t code_point = hex_to_u32();
526 // Check for validity
527 if (code_point > 0xFFFF) {
528 return false;
529 }
530 // If we have a high surrogate, we need to process a low surrogate
531 if (code_point >= 0xd800 && code_point < 0xdc00) {
532 // We need at least 6 more characters for \uXXXX, if they are NOT
533 // present, we have an error (isolated high surrogate), which we
534 // tolerate by substituting the substitution_code_point.
535 if (end - cursor < 6 || *cursor != '\\' ||
536 *(cursor + 1) != 'u' > 0xFFFF) {
537 code_point = substitution_code_point;
538 } else { // we have \u following the high surrogate
539 cursor += 2; // skip \u
540 uint32_t code_point_2 = hex_to_u32();
541 if (code_point_2 > 0xFFFF) {
542 return false;
543 }
544 uint32_t low_bit = code_point_2 - 0xdc00;
545 if (low_bit >> 10) {
546 // Not a valid low surrogate
547 code_point = substitution_code_point;
548 } else {
549 code_point = ((code_point - 0xd800) << 10 | low_bit) + 0x10000;
550 }
551 }
552 } else if (code_point >= 0xdc00 && code_point < 0xe000) {
553 // Isolated low surrogate, invalid
554 code_point = substitution_code_point;
555 }
556 // Now we have the final code point, write it as UTF-8 to out.
557 return codepoint_to_utf8(code_point);
558 };
559
560 while (cursor != end && *cursor != '"') {
561 // If we find the end of input before closing quote, it's an error
562 if (cursor == end) {
563 simdjson_consteval_error("Unterminated string");
564 }
565 // capture the next character and move forward
566 char c = *(cursor++);
567 // The one case where we don't want to append to the string directly
568 // is when the string contains escape sequences.
569
570 if (c == '\\') {
571 // Assume that it is the start of an escape sequence
572 size_t how_many_backslashes = 1;
573 while (cursor != end && *(cursor) == '\\') {
574 cursor++;
575 how_many_backslashes++;
576 }
577 size_t backslashes_to_add = how_many_backslashes / 2;
578 // Append the backslashes
579 for (size_t i = 0; i < backslashes_to_add; i++) {
580 out += '\\';
581 }
582 // If we have an odd number of backslashes, we have an escape sequence
583 // besides the backslashes we have already added.
584 if (how_many_backslashes % 2 == 1) {
585 // If we have an odd number of backslashes, we must be in an escape
586 // sequence check that what follows is a valid escape character
587 if (cursor == end) {
588 simdjson_consteval_error("Truncated escape sequence in string");
589 }
590 char next_char = *cursor;
591 cursor++;
592 switch (next_char) {
593 case '"':
594 out += '"';
595 break;
596 case '/':
597 out += '/';
598 break;
599 case 'b':
600 out += '\b';
601 break;
602 case 'f':
603 out += '\f';
604 break;
605 case 'n':
606 out += '\n';
607 break;
608 case 'r':
609 out += '\r';
610 break;
611 case 't':
612 out += '\t';
613 break;
614 case 'u':
615 if (!process_escaped_unicode()) {
616 simdjson_consteval_error(
617 "Invalid unicode escape sequence in string");
618 }
619 break;
620 default:
621 simdjson_consteval_error("Invalid escape character in string");
622 }
623 }
624 continue; // continue to next iteration
625 }
626 // Handle escape sequences and UTF-8 validation. We do not process
627 // the escape sequences here, just validate them.
628 out += c;
629 if (static_cast<unsigned char>(c) < 0x20) {
630 simdjson_consteval_error("Invalid control character in string");
631 }
632 if (static_cast<unsigned char>(c) >= 0x80) {
633 // We have a non-ASCII character inside a string
634 // We must validate it.
635 uint8_t first_byte = static_cast<uint8_t>(c);
636
637 if ((first_byte & 0b11100000) == 0b11000000) {
638 if (cursor == end) {
639 simdjson_consteval_error("Truncated UTF-8 sequence in string");
640 }
641
642 char second_byte = *cursor;
643 out += second_byte;
644 ++cursor;
645 if ((static_cast<uint8_t>(second_byte) & 0b11000000) != 0b10000000) {
646 simdjson_consteval_error("Invalid UTF-8 continuation byte in string");
647 }
648 // range check
649 uint32_t code_point = (first_byte & 0b00011111) << 6 |
650 (static_cast<uint8_t>(second_byte) & 0b00111111);
651 if ((code_point < 0x80) || (0x7ff < code_point)) {
652 simdjson_consteval_error("Invalid UTF-8 code point in string");
653 }
654 } else if ((first_byte & 0b11110000) == 0b11100000) {
655 if (cursor == end) {
656 simdjson_consteval_error("Truncated UTF-8 sequence in string");
657 }
658 char second_byte = *cursor;
659 ++cursor;
660 out += second_byte;
661 if ((static_cast<uint8_t>(second_byte) & 0b11000000) != 0b10000000) {
662 simdjson_consteval_error("Invalid UTF-8 continuation byte in string");
663 }
664 if (cursor == end) {
665 simdjson_consteval_error("Truncated UTF-8 sequence in string");
666 }
667 char third_byte = *cursor;
668 ++cursor;
669 out += third_byte;
670 if ((static_cast<uint8_t>(third_byte) & 0b11000000) != 0b10000000) {
671 simdjson_consteval_error("Invalid UTF-8 continuation byte in string");
672 }
673 // range check
674 uint32_t code_point = (first_byte & 0b00001111) << 12 |
675 (static_cast<uint8_t>(second_byte) & 0b00111111)
676 << 6 |
677 (static_cast<uint8_t>(third_byte) & 0b00111111);
678 if ((code_point < 0x800) || (0xffff < code_point) ||
679 (0xd7ff < code_point && code_point < 0xe000)) {
680 simdjson_consteval_error("Invalid UTF-8 code point in string");
681 }
682 } else if ((first_byte & 0b11111000) == 0b11110000) { // 0b11110000
683 if (cursor == end) {
684 simdjson_consteval_error("Truncated UTF-8 sequence in string");
685 }
686 char second_byte = *cursor;
687 ++cursor;
688 out += second_byte;
689 if (cursor == end) {
690 simdjson_consteval_error("Truncated UTF-8 sequence in string");
691 }
692 char third_byte = *cursor;
693 ++cursor;
694 out += third_byte;
695 if (cursor == end) {
696 simdjson_consteval_error("Truncated UTF-8 sequence in string");
697 }
698 char fourth_byte = *cursor;
699 ++cursor;
700 out += fourth_byte;
701 if ((static_cast<uint8_t>(second_byte) & 0b11000000) != 0b10000000) {
702 simdjson_consteval_error("Invalid UTF-8 continuation byte in string");
703 }
704 if ((static_cast<uint8_t>(third_byte) & 0b11000000) != 0b10000000) {
705 simdjson_consteval_error("Invalid UTF-8 continuation byte in string");
706 }
707 if ((static_cast<uint8_t>(fourth_byte) & 0b11000000) != 0b10000000) {
708 simdjson_consteval_error("Invalid UTF-8 continuation byte in string");
709 }
710 // range check
711 uint32_t code_point =
712 (first_byte & 0b00000111) << 18 |
713 (static_cast<uint8_t>(second_byte) & 0b00111111) << 12 |
714 (static_cast<uint8_t>(third_byte) & 0b00111111) << 6 |
715 (static_cast<uint8_t>(fourth_byte) & 0b00111111);
716 if (code_point <= 0xffff || 0x10ffff < code_point) {
717 simdjson_consteval_error("Invalid UTF-8 code point in string");
718 }
719 } else {
720 // we have a continuation
721 simdjson_consteval_error("Invalid UTF-8 continuation byte in string");
722 }
723 continue;
724 }
725 }
726 if (cursor == end) {
727 simdjson_consteval_error("Unterminated string");
728 }
729 if (*cursor != '"') {
730 simdjson_consteval_error("Internal error: expected closing quote");
731 }
732 cursor++; // consume the closing quote
733 // We get here if and only if we have seen the closing quote.
734
735 return {out, size_t(cursor - json.begin())};
736}
737
741
742// Parses a JSON array and returns a std::meta::info representing the array as
743// well as the number of characters consumed.
744consteval std::pair<std::meta::info, size_t>
745parse_json_array_impl(const std::string_view json) {
746 size_t consumed = 0;
747 auto cursor = json.begin();
748 auto end = json.end();
749 auto is_whitespace = [](char c) {
750 return c == ' ' || c == '\n' || c == '\t' || c == '\r';
751 };
752
753 auto skip_whitespace = [&]() -> void {
754 while (cursor != end && is_whitespace(*cursor))
755 cursor++;
756 };
757
758 auto expect_consume = [&] [[nodiscard]] (char c) -> bool {
759 skip_whitespace();
760 if (cursor == end || *(cursor++) != c) {
761 return false;
762 };
763 return true;
764 };
765
766 if (!expect_consume('[')) {
767 simdjson_consteval_error("Expected '['");
768 }
769
770 std::vector<std::meta::info> values = {^^void};
771 skip_whitespace();
772 if (cursor != end && *cursor == ']') {
773 if (!expect_consume(']')) {
774 simdjson_consteval_error("Expected ']'");
775 }
776 // Empty array - use int as placeholder type since void doesn't work
777 auto array_type = std::meta::substitute(
778 ^^std::array, {
779 ^^int, std::meta::reflect_constant(0uz)});
780 values[0] = array_type;
781 consumed = size_t(cursor - json.begin());
782 if (json[consumed - 1] != ']') {
783 simdjson_consteval_error("Expected ']'");
784 }
785 return {std::meta::substitute(^^construct_from, values), consumed};
786 }
787 while (cursor != end && *cursor != ']') {
788 char c = *cursor;
789 switch (c) {
790 case '{': {
791 std::string_view value(cursor, end);
792 auto [parsed, object_size] = parse_json_object_impl(value);
793 if (*(cursor + object_size - 1) != '}') {
794 simdjson_consteval_error("Expected '}'");
795 }
796 values.push_back(parsed);
797 cursor += object_size;
798 break;
799 }
800 case '[': {
801 std::string_view value(cursor, end);
802 auto [parsed, array_size] = parse_json_array_impl(value);
803 if (*(cursor + array_size - 1) != ']') {
804 simdjson_consteval_error("Expected ']'");
805 }
806 values.push_back(parsed);
807 cursor += array_size;
808 break;
809 }
810 case '"': {
811 auto res = parse_string(std::string_view(cursor, end));
812 cursor += res.second;
813 for (char ch : res.first) {
814 if (ch == '\0') {
815 simdjson_consteval_error(
816 "Field string values cannot contain embedded nulls");
817 }
818 }
819 values.push_back(std::meta::reflect_constant_string(res.first));
820 break;
821 }
822 case 't': {
823 if (end - cursor < 4 || std::string_view(cursor, 4) != "true") {
824 simdjson_consteval_error("Invalid value");
825 }
826 cursor += 4;
827 values.push_back(std::meta::reflect_constant(true));
828 break;
829 }
830 case 'f': {
831 if (end - cursor < 5 || std::string_view(cursor, 5) != "false") {
832 simdjson_consteval_error("Invalid value");
833 }
834 cursor += 5;
835 values.push_back(std::meta::reflect_constant(false));
836 break;
837 }
838 case 'n': {
839 if (end - cursor < 4 || std::string_view(cursor, 4) != "null") {
840 simdjson_consteval_error("Invalid value");
841 }
842 cursor += 4;
843 values.push_back(std::meta::reflect_constant(nullptr));
844 break;
845 }
846 // We deliberately do not include '+' here, as per JSON spec
847 case '-':
848 case '0':
849 case '1':
850 case '2':
851 case '3':
852 case '4':
853 case '5':
854 case '6':
855 case '7':
856 case '8':
857 case '9': {
858 std::string_view suffix = std::string_view(cursor, end);
859 std::variant<int64_t, uint64_t, double> out;
860 size_t r = parse_number(suffix, out);
861 cursor += r;
862 if (std::holds_alternative<int64_t>(out)) {
863 int64_t int_value = std::get<int64_t>(out);
864 values.push_back(std::meta::reflect_constant(int_value));
865 } else if (std::holds_alternative<uint64_t>(out)) {
866 uint64_t uint_value = std::get<uint64_t>(out);
867 values.push_back(std::meta::reflect_constant(uint_value));
868 } else {
869 double float_value = std::get<double>(out);
870 values.push_back(std::meta::reflect_constant(float_value));
871 }
872 break;
873 }
874 default:
875 simdjson_consteval_error("Invalid character starting value");
876 }
877 skip_whitespace();
878 if (cursor != end && *cursor == ',') {
879 ++cursor;
880 skip_whitespace();
881 }
882 }
883
884 if (!expect_consume(']')) {
885 simdjson_consteval_error("Expected ']'");
886 }
887 std::size_t count = values.size() - 1;
888 // We assume all elements have the same type as the first element.
889 // However, if the array is heterogeneous, we should use std::variant.
890 auto array_type = std::meta::substitute(
891 ^^std::array,
892 {
893 std::meta::type_of(values[1]), std::meta::reflect_constant(count)});
894
895 // Create array instance with values
896 values[0] = array_type;
897 consumed = size_t(cursor - json.begin());
898 if (json[consumed - 1] != ']') {
899 simdjson_consteval_error("Expected ']'");
900 }
901 return {std::meta::substitute(^^construct_from, values), consumed};
902}
903
907
908// Parses a JSON object and returns a std::meta::info representing the object
909// type as well as the number of characters consumed.
910consteval std::pair<std::meta::info, size_t>
911parse_json_object_impl(std::string_view json) {
912 size_t consumed = 0;
913 auto cursor = json.begin();
914 auto end = json.end();
915
916 auto skip_whitespace = [&]() -> void {
917 while (cursor != end && is_whitespace(*cursor))
918 cursor++;
919 };
920
921 auto expect_consume = [&] [[nodiscard]] (char c) -> bool {
922 skip_whitespace();
923 if (cursor == end || *(cursor++) != c) {
924 return false;
925 };
926 return true;
927 };
928
929 if (!expect_consume('{')) {
930 simdjson_consteval_error("Expected '{'");
931 }
932
933 std::vector<std::meta::info> members;
934 std::vector<std::meta::info> values = {^^void};
935
936 while (cursor != end && *cursor != '}') {
937 skip_whitespace();
938 // Not all strings can be identifiers, but in JSON field names can be any
939 // string. Thus we may have a problem if the field name contains characters
940 // not allowed in identifiers. Let the standard library handle that case.
941 auto field = parse_string(std::string_view(cursor, end));
942 std::string field_name = field.first;
943 cursor += field.second;
944 if (!expect_consume(':')) {
945 simdjson_consteval_error("Expected ':'");
946 }
947 skip_whitespace();
948 if (cursor == end) {
949 simdjson_consteval_error("Expected value after colon");
950 }
951 char c = *cursor;
952 switch (c) {
953 case '{': {
954 std::string_view value(cursor, end);
955 auto [parsed, object_size] = parse_json_object_impl(value);
956 if (*(cursor + object_size - 1) != '}') {
957 simdjson_consteval_error("Expected '}'");
958 }
959 cursor += object_size;
960 auto dms = std::meta::data_member_spec(std::meta::type_of(parsed),
961 {.name = field_name});
962 members.push_back(std::meta::reflect_constant(dms));
963 values.push_back(parsed);
964
965 break;
966 }
967 case '[': {
968 std::string_view value(cursor, end);
969 auto [parsed, array_size] = parse_json_array_impl(value);
970 auto dms = std::meta::data_member_spec(std::meta::type_of(parsed),
971 {.name = field_name});
972 members.push_back(std::meta::reflect_constant(dms));
973 values.push_back(parsed);
974 if (*(cursor + array_size - 1) != ']') {
975 simdjson_consteval_error("Expected ']'");
976 }
977 cursor += array_size;
978
979 break;
980 }
981 case '"': {
982 auto res = parse_string(std::string_view(cursor, end));
983 std::string_view value = res.first;
984 cursor += res.second;
985 for (char ch : value) {
986 if (ch == '\0') {
987 simdjson_consteval_error(
988 "Field string values cannot contain embedded nulls");
989 }
990 }
991 auto dms =
992 std::meta::data_member_spec(^^const char *, {
993 .name = field_name});
994 members.push_back(std::meta::reflect_constant(dms));
995 values.push_back(std::meta::reflect_constant_string(value));
996 break;
997 }
998 case 't': {
999 if (end - cursor < 4 || std::string_view(cursor, 4) != "true") {
1000 simdjson_consteval_error("Invalid value");
1001 }
1002 cursor += 4;
1003
1004 auto dms = std::meta::data_member_spec(^^bool, {
1005 .name = field_name});
1006 members.push_back(std::meta::reflect_constant(dms));
1007 values.push_back(std::meta::reflect_constant(true));
1008 break;
1009 }
1010 case 'f': {
1011 if (end - cursor < 5 || std::string_view(cursor, 5) != "false") {
1012 simdjson_consteval_error("Invalid value");
1013 }
1014 cursor += 5;
1015
1016 auto dms = std::meta::data_member_spec(^^bool, {
1017 .name = field_name});
1018 members.push_back(std::meta::reflect_constant(dms));
1019 values.push_back(std::meta::reflect_constant(false));
1020 break;
1021 }
1022 case 'n': {
1023 if (end - cursor < 4 || std::string_view(cursor, 4) != "null") {
1024 simdjson_consteval_error("Invalid value");
1025 }
1026 cursor += 4;
1027
1028 auto dms = std::meta::data_member_spec(^^std::nullptr_t,
1029 {
1030 .name = field_name});
1031 members.push_back(std::meta::reflect_constant(dms));
1032 values.push_back(std::meta::reflect_constant(nullptr));
1033 break;
1034 }
1035 // We deliberately do not include '+' here, as per JSON spec
1036 case '-':
1037 case '0':
1038 case '1':
1039 case '2':
1040 case '3':
1041 case '4':
1042 case '5':
1043 case '6':
1044 case '7':
1045 case '8':
1046 case '9': {
1047 std::string_view suffix = std::string_view(cursor, end);
1048 std::variant<int64_t, uint64_t, double> out;
1049 size_t r = parse_number(suffix, out);
1050 cursor += r;
1051 if (std::holds_alternative<int64_t>(out)) {
1052 int64_t int_value = std::get<int64_t>(out);
1053 auto dms =
1054 std::meta::data_member_spec(^^int64_t, {
1055 .name = field_name});
1056 members.push_back(std::meta::reflect_constant(dms));
1057 values.push_back(std::meta::reflect_constant(int_value));
1058 } else if (std::holds_alternative<uint64_t>(out)) {
1059 uint64_t uint_value = std::get<uint64_t>(out);
1060 auto dms =
1061 std::meta::data_member_spec(^^uint64_t, {
1062 .name = field_name});
1063 members.push_back(std::meta::reflect_constant(dms));
1064 values.push_back(std::meta::reflect_constant(uint_value));
1065 } else {
1066 double float_value = std::get<double>(out);
1067 auto dms =
1068 std::meta::data_member_spec(^^double, {
1069 .name = field_name});
1070 members.push_back(std::meta::reflect_constant(dms));
1071 values.push_back(std::meta::reflect_constant(float_value));
1072 }
1073 break;
1074 }
1075 default:
1076 simdjson_consteval_error("Invalid character starting value");
1077 }
1078 skip_whitespace();
1079 if (cursor == end) {
1080 simdjson_consteval_error("Expected '}' or ','");
1081 }
1082 if (*cursor == ',') {
1083 ++cursor;
1084 skip_whitespace();
1085 } else if (*cursor != '}') {
1086 simdjson_consteval_error("Expected '}'");
1087 }
1088 }
1089
1090 if (!expect_consume('}')) {
1091 simdjson_consteval_error("Expected '}'");
1092 }
1093 values[0] = std::meta::substitute(^^class_type, members);
1094 consumed = size_t(cursor - json.begin());
1095 if (json[consumed - 1] != '}') {
1096 simdjson_consteval_error("Expected '}'");
1097 }
1098 return {std::meta::substitute(^^construct_from, values), consumed};
1099}
1100
1107template <constevalutil::fixed_string json_str> consteval auto parse_json() {
1108 constexpr std::string_view json = trim_whitespace(json_str.view());
1109 static_assert(!json.empty(), "JSON string cannot be empty");
1110 /*static_assert(json.front() != '{' && json.front() != '[',
1111 "Only JSON objects and arrays are supported at the top level, this "
1112 "limitation will be lifted in the future.");*/
1113
1114 constexpr auto result = json.front() == '['
1115 ? parse_json_array_impl(json)
1116 : parse_json_object_impl(json);
1117 return [: result.first :];
1118 /*
1119 if(json.front() == '[') {
1120 return [:parse_json_array_impl(json).first:];
1121 } else if(json.front() == '{') {
1122 // return [:parse_json_object_impl(json).first:];
1123 }*/
1124}
1125
1126} // namespace compile_time
1127} // namespace simdjson
1128
1129#endif // SIMDJSON_STATIC_REFLECTION
1130#endif // SIMDJSON_GENERIC_COMPILE_TIME_JSON_INL_H
Compile-time JSON parsing using C++26 reflection with std::meta::substitute()
The top level simdjson namespace, containing everything the library provides.
Definition base.h:8