simdjson 4.2.3
Ridiculously Fast JSON
Loading...
Searching...
No Matches
numberparsing.h
1#ifndef SIMDJSON_GENERIC_NUMBERPARSING_H
2
3#ifndef SIMDJSON_CONDITIONAL_INCLUDE
4#define SIMDJSON_GENERIC_NUMBERPARSING_H
5#include "simdjson/generic/base.h"
6#include "simdjson/generic/jsoncharutils.h"
7#include "simdjson/internal/numberparsing_tables.h"
8#endif // SIMDJSON_CONDITIONAL_INCLUDE
9
10#include <limits>
11#include <ostream>
12#include <cstring>
13
14namespace simdjson {
15namespace SIMDJSON_IMPLEMENTATION {
16namespace numberparsing {
17
// Reporting and writer hooks used by the number parser.
//
// In a regular build each hook reduces to the plain action: yielding an error
// code or appending a value to the writer; the SRC argument is unused.
// When JSON_TEST_NUMBERS is defined (unit testing), each hook first calls the
// matching found_* callback with the value and/or source pointer, and then
// performs the same plain action.
#ifndef JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
#define BIGINT_NUMBER(SRC) (BIGINT_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
#else // JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
#define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
#endif // JSON_TEST_NUMBERS
31
32namespace {
33
34// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
35// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
36// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed.
37simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
38 double d;
39 mantissa &= ~(1ULL << 52);
40 mantissa |= real_exponent << 52;
41 mantissa |= ((static_cast<uint64_t>(negative)) << 63);
42 std::memcpy(&d, &mantissa, sizeof(d));
43 return d;
44}
45
46// Attempts to compute i * 10^(power) exactly; and if "negative" is
47// true, negate the result.
48// This function will only work in some cases, when it does not work, success is
49// set to false. This should work *most of the time* (like 99% of the time).
50// We assume that power is in the [smallest_power,
51// largest_power] interval: the caller is responsible for this check.
52simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
53 // we start with a fast path
54 // It was described in
55 // Clinger WD. How to read floating point numbers accurately.
56 // ACM SIGPLAN Notices. 1990
57#ifndef FLT_EVAL_METHOD
58#error "FLT_EVAL_METHOD should be defined, please include cfloat."
59#endif
60#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
61 // We cannot be certain that x/y is rounded to nearest.
62 if (0 <= power && power <= 22 && i <= 9007199254740991)
63#else
64 if (-22 <= power && power <= 22 && i <= 9007199254740991)
65#endif
66 {
67 // convert the integer into a double. This is lossless since
68 // 0 <= i <= 2^53 - 1.
69 d = double(i);
70 //
71 // The general idea is as follows.
72 // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
73 // 1) Both s and p can be represented exactly as 64-bit floating-point
74 // values
75 // (binary64).
76 // 2) Because s and p can be represented exactly as floating-point values,
77 // then s * p
78 // and s / p will produce correctly rounded values.
79 //
80 if (power < 0) {
81 d = d / simdjson::internal::power_of_ten[-power];
82 } else {
83 d = d * simdjson::internal::power_of_ten[power];
84 }
85 if (negative) {
86 d = -d;
87 }
88 return true;
89 }
90 // When 22 < power && power < 22 + 16, we could
91 // hope for another, secondary fast path. It was
92 // described by David M. Gay in "Correctly rounded
93 // binary-decimal and decimal-binary conversions." (1990)
94 // If you need to compute i * 10^(22 + x) for x < 16,
95 // first compute i * 10^x, if you know that result is exact
96 // (e.g., when i * 10^x < 2^53),
97 // then you can still proceed and do (i * 10^x) * 10^22.
98 // Is this worth your time?
99 // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
100 // for this second fast path to work.
101 // If you you have 22 < power *and* power < 22 + 16, and then you
102 // optimistically compute "i * 10^(x-22)", there is still a chance that you
103 // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
104 // this optimization maybe less common than we would like. Source:
105 // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
106 // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
107
108 // The fast path has now failed, so we are failing back on the slower path.
109
110 // In the slow path, we need to adjust i so that it is > 1<<63 which is always
111 // possible, except if i == 0, so we handle i == 0 separately.
112 if(i == 0) {
113 d = negative ? -0.0 : 0.0;
114 return true;
115 }
116
117
118 // The exponent is 1024 + 63 + power
119 // + floor(log(5**power)/log(2)).
120 // The 1024 comes from the ieee64 standard.
121 // The 63 comes from the fact that we use a 64-bit word.
122 //
123 // Computing floor(log(5**power)/log(2)) could be
124 // slow. Instead we use a fast function.
125 //
126 // For power in (-400,350), we have that
127 // (((152170 + 65536) * power ) >> 16);
128 // is equal to
129 // floor(log(5**power)/log(2)) + power when power >= 0
130 // and it is equal to
131 // ceil(log(5**-power)/log(2)) + power when power < 0
132 //
133 // The 65536 is (1<<16) and corresponds to
134 // (65536 * power) >> 16 ---> power
135 //
136 // ((152170 * power ) >> 16) is equal to
137 // floor(log(5**power)/log(2))
138 //
139 // Note that this is not magic: 152170/(1<<16) is
140 // approximately equal to log(5)/log(2).
141 // The 1<<16 value is a power of two; we could use a
142 // larger power of 2 if we wanted to.
143 //
144 int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
145
146
147 // We want the most significant bit of i to be 1. Shift if needed.
148 int lz = leading_zeroes(i);
149 i <<= lz;
150
151
152 // We are going to need to do some 64-bit arithmetic to get a precise product.
153 // We use a table lookup approach.
154 // It is safe because
155 // power >= smallest_power
156 // and power <= largest_power
157 // We recover the mantissa of the power, it has a leading 1. It is always
158 // rounded down.
159 //
160 // We want the most significant 64 bits of the product. We know
161 // this will be non-zero because the most significant bit of i is
162 // 1.
163 const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
164 // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
165 //
166 // The full_multiplication function computes the 128-bit product of two 64-bit words
167 // with a returned value of type value128 with a "low component" corresponding to the
168 // 64-bit least significant bits of the product and with a "high component" corresponding
169 // to the 64-bit most significant bits of the product.
170#if SIMDJSON_STATIC_REFLECTION
171 simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::powers_template<>::power_of_five_128[index]);
172#else
173 simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]);
174#endif
175
176 // Both i and power_of_five_128[index] have their most significant bit set to 1 which
177 // implies that the either the most or the second most significant bit of the product
178 // is 1. We pack values in this manner for efficiency reasons: it maximizes the use
179 // we make of the product. It also makes it easy to reason about the product: there
180 // is 0 or 1 leading zero in the product.
181
182 // Unless the least significant 9 bits of the high (64-bit) part of the full
183 // product are all 1s, then we know that the most significant 55 bits are
184 // exact and no further work is needed. Having 55 bits is necessary because
185 // we need 53 bits for the mantissa but we have to have one rounding bit and
186 // we can waste a bit if the most significant bit of the product is zero.
187 if((firstproduct.high & 0x1FF) == 0x1FF) {
188 // We want to compute i * 5^q, but only care about the top 55 bits at most.
189 // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
190 // the full computation is wasteful. So we do what is called a "truncated
191 // multiplication".
192 // We take the most significant 64-bits, and we put them in
193 // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
194 // to the desired approximation using one multiplication. Sometimes it does not suffice.
195 // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
196 // then we get a better approximation to i * 5^q.
197 //
198 // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
199 // more complicated.
200 //
201 // There is an extra layer of complexity in that we need more than 55 bits of
202 // accuracy in the round-to-even scenario.
203 //
204 // The full_multiplication function computes the 128-bit product of two 64-bit words
205 // with a returned value of type value128 with a "low component" corresponding to the
206 // 64-bit least significant bits of the product and with a "high component" corresponding
207 // to the 64-bit most significant bits of the product.
208#if SIMDJSON_STATIC_REFLECTION
209 simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::powers_template<>::power_of_five_128[index + 1]);
210#else
211 simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
212#endif
213 firstproduct.low += secondproduct.high;
214 if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
215 // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
216 // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
217 // is sufficiently accurate, and more computation is not needed.
218 }
219 uint64_t lower = firstproduct.low;
220 uint64_t upper = firstproduct.high;
221 // The final mantissa should be 53 bits with a leading 1.
222 // We shift it so that it occupies 54 bits with a leading 1.
224 uint64_t upperbit = upper >> 63;
225 uint64_t mantissa = upper >> (upperbit + 9);
226 lz += int(1 ^ upperbit);
227
228 // Here we have mantissa < (1<<54).
229 int64_t real_exponent = exponent - lz;
230 if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
231 // Here have that real_exponent <= 0 so -real_exponent >= 0
232 if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
233 d = negative ? -0.0 : 0.0;
234 return true;
235 }
236 // next line is safe because -real_exponent + 1 < 0
237 mantissa >>= -real_exponent + 1;
238 // Thankfully, we can't have both "round-to-even" and subnormals because
239 // "round-to-even" only occurs for powers close to 0.
240 mantissa += (mantissa & 1); // round up
241 mantissa >>= 1;
242 // There is a weird scenario where we don't have a subnormal but just.
243 // Suppose we start with 2.2250738585072013e-308, we end up
244 // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
245 // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
246 // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
247 // subnormal, but we can only know this after rounding.
248 // So we only declare a subnormal if we are smaller than the threshold.
249 real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
250 d = to_double(mantissa, real_exponent, negative);
251 return true;
252 }
253 // We have to round to even. The "to even" part
254 // is only a problem when we are right in between two floats
255 // which we guard against.
256 // If we have lots of trailing zeros, we may fall right between two
257 // floating-point values.
258 //
259 // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
260 // times a power of two. That is, it is right between a number with binary significand
261 // m and another number with binary significand m+1; and it must be the case
262 // that it cannot be represented by a float itself.
263 //
264 // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
265 // Recall that 10^q = 5^q * 2^q.
266 // When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that
267 // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
268 // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
269 // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
270 // 2^{53} x 5^{-q} < 2^{64}.
271 // Hence we have 5^{-q} < 2^{11}$ or q>= -4.
272 //
273 // We require lower <= 1 and not lower == 0 because we could not prove that
274 // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
275 if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
276 if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
277 mantissa &= ~1; // flip it so that we do not round up
278 }
279 }
280
281 mantissa += mantissa & 1;
282 mantissa >>= 1;
283
284 // Here we have mantissa < (1<<53), unless there was an overflow
285 if (mantissa >= (1ULL << 53)) {
287 // This will happen when parsing values such as 7.2057594037927933e+16
289 mantissa = (1ULL << 52);
290 real_exponent++;
291 }
292 mantissa &= ~(1ULL << 52);
293 // we have to check that real_exponent is in range, otherwise we bail out
294 if (simdjson_unlikely(real_exponent > 2046)) {
295 // We have an infinite value!!! We could actually throw an error here if we could.
296 return false;
297 }
298 d = to_double(mantissa, real_exponent, negative);
299 return true;
300}
301
302// We call a fallback floating-point parser that might be slow. Note
303// it will accept JSON numbers, but the JSON spec. is more restrictive so
304// before you call parse_float_fallback, you need to have validated the input
305// string with the JSON grammar.
306// It will return an error (false) if the parsed number is infinite.
307// The string parsing itself always succeeds. We know that there is at least
308// one digit.
309static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
310 *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
311 // We do not accept infinite values.
312
313 // Detecting finite values in a portable manner is ridiculously hard, ideally
314 // we would want to do:
315 // return !std::isfinite(*outDouble);
316 // but that mysteriously fails under legacy/old libc++ libraries, see
317 // https://github.com/simdjson/simdjson/issues/1286
318 //
319 // Therefore, fall back to this solution (the extra parens are there
320 // to handle that max may be a macro on windows).
321 return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
322}
323
324static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
325 *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr), reinterpret_cast<const char *>(end_ptr));
326 // We do not accept infinite values.
327
328 // Detecting finite values in a portable manner is ridiculously hard, ideally
329 // we would want to do:
330 // return !std::isfinite(*outDouble);
331 // but that mysteriously fails under legacy/old libc++ libraries, see
332 // https://github.com/simdjson/simdjson/issues/1286
333 //
334 // Therefore, fall back to this solution (the extra parens are there
335 // to handle that max may be a macro on windows).
336 return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
337}
338
339// check quickly whether the next 8 chars are made of digits
340// at a glance, it looks better than Mula's
341// http://0x80.pl/articles/swar-digits-validate.html
342simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
343 uint64_t val;
344 // this can read up to 7 bytes beyond the buffer size, but we require
345 // SIMDJSON_PADDING of padding
346 static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
347 std::memcpy(&val, chars, 8);
348 // a branchy method might be faster:
349 // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
350 // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
351 // 0x3030303030303030);
352 return (((val & 0xF0F0F0F0F0F0F0F0) |
353 (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
354 0x3333333333333333);
355}
356
357template<typename I>
358SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
359simdjson_inline bool parse_digit(const uint8_t c, I &i) {
360 const uint8_t digit = static_cast<uint8_t>(c - '0');
361 if (digit > 9) {
362 return false;
363 }
364 // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
365 i = 10 * i + digit; // might overflow, we will handle the overflow later
366 return true;
367}
368
369simdjson_inline bool is_digit(const uint8_t c) {
370 return static_cast<uint8_t>(c - '0') <= 9;
371}
372
373simdjson_warn_unused simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
374 // we continue with the fiction that we have an integer. If the
375 // floating point number is representable as x * 10^z for some integer
376 // z that fits in 53 bits, then we will be able to convert back the
377 // the integer into a float in a lossless manner.
378 const uint8_t *const first_after_period = p;
379
380#ifdef SIMDJSON_SWAR_NUMBER_PARSING
381#if SIMDJSON_SWAR_NUMBER_PARSING
382 // this helps if we have lots of decimals!
383 // this turns out to be frequent enough.
384 if (is_made_of_eight_digits_fast(p)) {
385 i = i * 100000000 + parse_eight_digits_unrolled(p);
386 p += 8;
387 }
388#endif // SIMDJSON_SWAR_NUMBER_PARSING
389#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING
390 // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
391 if (parse_digit(*p, i)) { ++p; }
392 while (parse_digit(*p, i)) { p++; }
393 exponent = first_after_period - p;
394 // Decimal without digits (123.) is illegal
395 if (exponent == 0) {
396 return INVALID_NUMBER(src);
397 }
398 return SUCCESS;
399}
400
401simdjson_warn_unused simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
402 // Exp Sign: -123.456e[-]78
403 bool neg_exp = ('-' == *p);
404 if (neg_exp || '+' == *p) { p++; } // Skip + as well
405
406 // Exponent: -123.456e-[78]
407 auto start_exp = p;
408 int64_t exp_number = 0;
409 while (parse_digit(*p, exp_number)) { ++p; }
410 // It is possible for parse_digit to overflow.
411 // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
412 // Thus we *must* check for possible overflow before we negate exp_number.
413
414 // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
415 // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
416 // not oblige and may, in fact, generate two distinct paths in any case. It might be
417 // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
418 // instructions for a simdjson_likely branch, an unconclusive gain.
419
420 // If there were no digits, it's an error.
421 if (simdjson_unlikely(p == start_exp)) {
422 return INVALID_NUMBER(src);
423 }
424 // We have a valid positive exponent in exp_number at this point, except that
425 // it may have overflowed.
426
427 // If there were more than 18 digits, we may have overflowed the integer. We have to do
428 // something!!!!
429 if (simdjson_unlikely(p > start_exp+18)) {
430 // Skip leading zeroes: 1e000000000000000000001 is technically valid and does not overflow
431 while (*start_exp == '0') { start_exp++; }
432 // 19 digits could overflow int64_t and is kind of absurd anyway. We don't
433 // support exponents smaller than -999,999,999,999,999,999 and bigger
434 // than 999,999,999,999,999,999.
435 // We can truncate.
436 // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before
437 // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could
438 // truncate at 324.
439 // Note that there is no reason to fail per se at this point in time.
440 // E.g., 0e999999999999999999999 is a fine number.
441 if (p > start_exp+18) { exp_number = 999999999999999999; }
442 }
443 // At this point, we know that exp_number is a sane, positive, signed integer.
444 // It is <= 999,999,999,999,999,999. As long as 'exponent' is in
445 // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent'
446 // is bounded in magnitude by the size of the JSON input, we are fine in this universe.
447 // To sum it up: the next line should never overflow.
448 exponent += (neg_exp ? -exp_number : exp_number);
449 return SUCCESS;
450}
451
452simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) {
453 const uint8_t *const srcend = src + max_length;
454 bool negative = (*src == '-'); // we can always read at least one character after the '-'
455 const uint8_t *p = src + uint8_t(negative);
456 if(p == srcend) { return false; }
457 if(*p == '0') {
458 ++p;
459 if(p == srcend) { return true; }
460 if(jsoncharutils::is_not_structural_or_whitespace(*p)) { return false; }
461 return true;
462 }
463 while(p != srcend && is_digit(*p)) { ++p; }
464 if(p == srcend) { return true; }
465 if(jsoncharutils::is_not_structural_or_whitespace(*p)) { return false; }
466 return true;
467}
468
469simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
470 // It is possible that the integer had an overflow.
471 // We have to handle the case where we have 0.0000somenumber.
472 const uint8_t *start = start_digits;
473 while ((*start == '0') || (*start == '.')) { ++start; }
474 // we over-decrement by one when there is a '.'
475 return digit_count - size_t(start - start_digits);
476}
477
478} // unnamed namespace
479
481static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) {
482 if (parse_float_fallback(src, answer)) {
483 return SUCCESS;
484 }
485 return INVALID_NUMBER(src);
486}
487
489template<typename W>
490simdjson_warn_unused simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
491 // If we frequently had to deal with long strings of digits,
492 // we could extend our code by using a 128-bit integer instead
493 // of a 64-bit integer. However, this is uncommon in practice.
494 //
495 // 9999999999999999999 < 2**64 so we can accommodate 19 digits.
496 // If we have a decimal separator, then digit_count - 1 is the number of digits, but we
497 // may not have a decimal separator!
498 if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
499 // Ok, chances are good that we had an overflow!
500 // this is almost never going to get called!!!
501 // we start anew, going slowly!!!
502 // This will happen in the following examples:
503 // 10000000000000000000000000000000000000000000e+308
504 // 3.1415926535897932384626433832795028841971693993751
505 //
506 // NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer
507 // reference to it, it would force it to be stored in memory, preventing the compiler from
508 // picking it apart and putting into registers. i.e. if we pass it as reference,
509 // it gets slow.
510 double d;
511 error_code error = slow_float_parsing(src, &d);
512 writer.append_double(d);
513 return error;
514 }
515 // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
516 // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
517 // To future reader: we'd love if someone found a better way, or at least could explain this result!
518 if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
519 //
520 // Important: smallest_power is such that it leads to a zero value.
521 // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
522 // so something x 10^-343 goes to zero, but not so with something x 10^-342.
523 static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
524 //
525 if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
526 // E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero
527 WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer);
528 return SUCCESS;
529 } else { // (exponent > largest_power) and (i != 0)
530 // We have, for sure, an infinite value and simdjson refuses to parse infinite values.
531 return INVALID_NUMBER(src);
532 }
533 }
534 double d;
535 if (!compute_float_64(exponent, i, negative, d)) {
536 // we are almost never going to get here.
537 if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
538 }
539 WRITE_DOUBLE(d, src, writer);
540 return SUCCESS;
541}
542
543// parse the number at src
544// define JSON_TEST_NUMBERS for unit testing
545//
546// It is assumed that the number is followed by a structural ({,},],[) character
547// or a white space character. If that is not the case (e.g., when the JSON
548// document is made of a single number), then it is necessary to copy the
549// content and append a space before calling this function.
550//
551// Our objective is accurate parsing (ULP of 0) at high speed.
552template<typename W>
553simdjson_warn_unused simdjson_inline error_code parse_number(const uint8_t *const src, W &writer);
554
555// for performance analysis, it is sometimes useful to skip parsing
556#ifdef SIMDJSON_SKIPNUMBERPARSING
557
558template<typename W>
559simdjson_warn_unused simdjson_inline error_code parse_number(const uint8_t *const, W &writer) {
560 writer.append_s64(0); // always write zero
561 return SUCCESS; // always succeeds
562}
563
564simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept { return 0; }
565simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept { return 0; }
566simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept { return 0; }
567simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; }
568simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const src) noexcept { return 0; }
569simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const src) noexcept { return 0; }
570simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; }
571simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept { return false; }
572simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept { return number_type::signed_integer; }
573#else
574
575// parse the number at src
576// define JSON_TEST_NUMBERS for unit testing
577//
578// It is assumed that the number is followed by a structural ({,},],[) character
579// or a white space character. If that is not the case (e.g., when the JSON
580// document is made of a single number), then it is necessary to copy the
581// content and append a space before calling this function.
582//
583// Our objective is accurate parsing (ULP of 0) at high speed.
584template<typename W>
585simdjson_warn_unused simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) {
586 //
587 // Check for minus sign
588 //
589 bool negative = (*src == '-');
590 const uint8_t *p = src + uint8_t(negative);
591
592 //
593 // Parse the integer part.
594 //
595 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
596 const uint8_t *const start_digits = p;
597 uint64_t i = 0;
598 while (parse_digit(*p, i)) { p++; }
599
600 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
601 // Optimization note: size_t is expected to be unsigned.
602 size_t digit_count = size_t(p - start_digits);
603 if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
604
605 //
606 // Handle floats if there is a . or e (or both)
607 //
608 int64_t exponent = 0;
609 bool is_float = false;
610 if ('.' == *p) {
611 is_float = true;
612 ++p;
613 SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) );
614 digit_count = int(p - start_digits); // used later to guard against overflows
615 }
616 if (('e' == *p) || ('E' == *p)) {
617 is_float = true;
618 ++p;
619 SIMDJSON_TRY( parse_exponent(src, p, exponent) );
620 }
621 if (is_float) {
622 const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
623 SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
624 if (dirty_end) { return INVALID_NUMBER(src); }
625 return SUCCESS;
626 }
627
628 // The longest negative 64-bit number is 19 digits.
629 // The longest positive 64-bit number is 20 digits.
630 // We do it this way so we don't trigger this branch unless we must.
631 size_t longest_digit_count = negative ? 19 : 20;
632 if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); }
633 if (digit_count == longest_digit_count) {
634 if (negative) {
635 // Anything negative above INT64_MAX+1 is invalid
636 if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); }
637 WRITE_INTEGER(~i+1, src, writer);
638 if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
639 return SUCCESS;
640 // Positive overflow check:
641 // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
642 // biggest uint64_t.
643 // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
644 // If we got here, it's a 20 digit number starting with the digit "1".
645 // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
646 // than 1,553,255,926,290,448,384.
647 // - That is smaller than the smallest possible 20-digit number the user could write:
648 // 10,000,000,000,000,000,000.
649 // - Therefore, if the number is positive and lower than that, it's overflow.
650 // - The value we are looking at is less than or equal to INT64_MAX.
651 //
652 } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
653 }
654
655 // Write unsigned if it does not fit in a signed integer.
656 if (i > uint64_t(INT64_MAX)) {
657 WRITE_UNSIGNED(i, src, writer);
658 } else {
659#if SIMDJSON_MINUS_ZERO_AS_FLOAT
660 if(i == 0 && negative) {
661 // We have to write -0.0 instead of 0
662 WRITE_DOUBLE(-0.0, src, writer);
663 } else {
664 WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
665 }
666#else
667 WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
668#endif
669 }
670 if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
671 return SUCCESS;
672}
673
674// Inlineable functions
675namespace {
676
677// This table can be used to characterize the final character of an integer
678// string. For JSON structural character and allowable white space characters,
679// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
680// we return NUMBER_ERROR.
681// Optimization note: we could easily reduce the size of the table by half (to 128)
682// at the cost of an extra branch.
683// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
684static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
685static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
686static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
687
688const uint8_t integer_string_finisher[256] = {
741
742// Parse any number from 0 to 18,446,744,073,709,551,615
743simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
744 const uint8_t *p = src;
745 //
746 // Parse the integer part.
747 //
748 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
749 const uint8_t *const start_digits = p;
750 uint64_t i = 0;
751 while (parse_digit(*p, i)) { p++; }
752
753 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
754 // Optimization note: size_t is expected to be unsigned.
755 size_t digit_count = size_t(p - start_digits);
756 // The longest positive 64-bit number is 20 digits.
757 // We do it this way so we don't trigger this branch unless we must.
758 // Optimization note: the compiler can probably merge
759 // ((digit_count == 0) || (digit_count > 20))
760 // into a single branch since digit_count is unsigned.
761 if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
762 // Here digit_count > 0.
763 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
764 // We can do the following...
765 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
766 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
767 // }
768 // as a single table lookup:
769 if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
770
771 if (digit_count == 20) {
772 // Positive overflow check:
773 // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
774 // biggest uint64_t.
775 // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
776 // If we got here, it's a 20 digit number starting with the digit "1".
777 // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
778 // than 1,553,255,926,290,448,384.
779 // - That is smaller than the smallest possible 20-digit number the user could write:
780 // 10,000,000,000,000,000,000.
781 // - Therefore, if the number is positive and lower than that, it's overflow.
782 // - The value we are looking at is less than or equal to INT64_MAX.
783 //
784 if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
785 }
786
787 return i;
788}
789
790
791// Parse any number from 0 to 18,446,744,073,709,551,615
792// Never read at src_end or beyond
793simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept {
794 const uint8_t *p = src;
795 //
796 // Parse the integer part.
797 //
798 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
799 const uint8_t *const start_digits = p;
800 uint64_t i = 0;
801 while ((p != src_end) && parse_digit(*p, i)) { p++; }
802
803 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
804 // Optimization note: size_t is expected to be unsigned.
805 size_t digit_count = size_t(p - start_digits);
806 // The longest positive 64-bit number is 20 digits.
807 // We do it this way so we don't trigger this branch unless we must.
808 // Optimization note: the compiler can probably merge
809 // ((digit_count == 0) || (digit_count > 20))
810 // into a single branch since digit_count is unsigned.
811 if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
812 // Here digit_count > 0.
813 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
814 // We can do the following...
815 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
816 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
817 // }
818 // as a single table lookup:
819 if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
820
821 if (digit_count == 20) {
822 // Positive overflow check:
823 // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
824 // biggest uint64_t.
825 // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
826 // If we got here, it's a 20 digit number starting with the digit "1".
827 // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
828 // than 1,553,255,926,290,448,384.
829 // - That is smaller than the smallest possible 20-digit number the user could write:
830 // 10,000,000,000,000,000,000.
831 // - Therefore, if the number is positive and lower than that, it's overflow.
832 // - The value we are looking at is less than or equal to INT64_MAX.
833 //
834 if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
835 }
836
837 return i;
838}
839
840// Parse any number from 0 to 18,446,744,073,709,551,615
841simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
842 const uint8_t *p = src + 1;
843 //
844 // Parse the integer part.
845 //
846 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
847 const uint8_t *const start_digits = p;
848 uint64_t i = 0;
849 while (parse_digit(*p, i)) { p++; }
850
851 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
852 // Optimization note: size_t is expected to be unsigned.
853 size_t digit_count = size_t(p - start_digits);
854 // The longest positive 64-bit number is 20 digits.
855 // We do it this way so we don't trigger this branch unless we must.
856 // Optimization note: the compiler can probably merge
857 // ((digit_count == 0) || (digit_count > 20))
858 // into a single branch since digit_count is unsigned.
859 if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
860 // Here digit_count > 0.
861 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
862 // We can do the following...
863 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
864 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
865 // }
866 // as a single table lookup:
867 if (*p != '"') { return NUMBER_ERROR; }
868
869 if (digit_count == 20) {
870 // Positive overflow check:
871 // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
872 // biggest uint64_t.
873 // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
874 // If we got here, it's a 20 digit number starting with the digit "1".
875 // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
876 // than 1,553,255,926,290,448,384.
877 // - That is smaller than the smallest possible 20-digit number the user could write:
878 // 10,000,000,000,000,000,000.
879 // - Therefore, if the number is positive and lower than that, it's overflow.
880 // - The value we are looking at is less than or equal to INT64_MAX.
881 //
882 // Note: we use src[1] and not src[0] because src[0] is the quote character in this
883 // instance.
884 if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
885 }
886
887 return i;
888}
889
890// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
891simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
892 //
893 // Check for minus sign
894 //
895 bool negative = (*src == '-');
896 const uint8_t *p = src + uint8_t(negative);
897
898 //
899 // Parse the integer part.
900 //
901 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
902 const uint8_t *const start_digits = p;
903 uint64_t i = 0;
904 while (parse_digit(*p, i)) { p++; }
905
906 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
907 // Optimization note: size_t is expected to be unsigned.
908 size_t digit_count = size_t(p - start_digits);
909 // We go from
910 // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
911 // so we can never represent numbers that have more than 19 digits.
912 size_t longest_digit_count = 19;
913 // Optimization note: the compiler can probably merge
914 // ((digit_count == 0) || (digit_count > longest_digit_count))
915 // into a single branch since digit_count is unsigned.
916 if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
917 // Here digit_count > 0.
918 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
919 // We can do the following...
920 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
921 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
922 // }
923 // as a single table lookup:
924 if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
925 // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
926 // Performance note: This check is only needed when digit_count == longest_digit_count but it is
927 // so cheap that we might as well always make it.
928 if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
929 return negative ? (~i+1) : i;
930}
931
932// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
933// Never read at src_end or beyond
934simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept {
935 //
936 // Check for minus sign
937 //
938 if(src == src_end) { return NUMBER_ERROR; }
939 bool negative = (*src == '-');
940 const uint8_t *p = src + uint8_t(negative);
941
942 //
943 // Parse the integer part.
944 //
945 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
946 const uint8_t *const start_digits = p;
947 uint64_t i = 0;
948 while ((p != src_end) && parse_digit(*p, i)) { p++; }
949
950 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
951 // Optimization note: size_t is expected to be unsigned.
952 size_t digit_count = size_t(p - start_digits);
953 // We go from
954 // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
955 // so we can never represent numbers that have more than 19 digits.
956 size_t longest_digit_count = 19;
957 // Optimization note: the compiler can probably merge
958 // ((digit_count == 0) || (digit_count > longest_digit_count))
959 // into a single branch since digit_count is unsigned.
960 if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
961 // Here digit_count > 0.
962 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
963 // We can do the following...
964 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
965 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
966 // }
967 // as a single table lookup:
968 if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
969 // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
970 // Performance note: This check is only needed when digit_count == longest_digit_count but it is
971 // so cheap that we might as well always make it.
972 if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
973 return negative ? (~i+1) : i;
974}
975
976// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
977simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept {
978 //
979 // Check for minus sign
980 //
981 bool negative = (*(src + 1) == '-');
982 src += uint8_t(negative) + 1;
983
984 //
985 // Parse the integer part.
986 //
987 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
988 const uint8_t *const start_digits = src;
989 uint64_t i = 0;
990 while (parse_digit(*src, i)) { src++; }
991
992 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
993 // Optimization note: size_t is expected to be unsigned.
994 size_t digit_count = size_t(src - start_digits);
995 // We go from
996 // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
997 // so we can never represent numbers that have more than 19 digits.
998 size_t longest_digit_count = 19;
999 // Optimization note: the compiler can probably merge
1000 // ((digit_count == 0) || (digit_count > longest_digit_count))
1001 // into a single branch since digit_count is unsigned.
1002 if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
1003 // Here digit_count > 0.
1004 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
1005 // We can do the following...
1006 // if (!jsoncharutils::is_structural_or_whitespace(*src)) {
1007 // return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
1008 // }
1009 // as a single table lookup:
1010 if(*src != '"') { return NUMBER_ERROR; }
1011 // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
1012 // Performance note: This check is only needed when digit_count == longest_digit_count but it is
1013 // so cheap that we might as well always make it.
1014 if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
1015 return negative ? (~i+1) : i;
1016}
1017
1018simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
1019 //
1020 // Check for minus sign
1021 //
1022 bool negative = (*src == '-');
1023 src += uint8_t(negative);
1024
1025 //
1026 // Parse the integer part.
1027 //
1028 uint64_t i = 0;
1029 const uint8_t *p = src;
1030 p += parse_digit(*p, i);
1031 bool leading_zero = (i == 0);
1032 while (parse_digit(*p, i)) { p++; }
1033 // no integer digits, or 0123 (zero must be solo)
1034 if ( p == src ) { return INCORRECT_TYPE; }
1035 if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
1036
1037 //
1038 // Parse the decimal part.
1039 //
1040 int64_t exponent = 0;
1041 bool overflow;
1042 if (simdjson_likely(*p == '.')) {
1043 p++;
1044 const uint8_t *start_decimal_digits = p;
1045 if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
1046 p++;
1047 while (parse_digit(*p, i)) { p++; }
1048 exponent = -(p - start_decimal_digits);
1049
1050 // Overflow check. More than 19 digits (minus the decimal) may be overflow.
1051 overflow = p-src-1 > 19;
1052 if (simdjson_unlikely(overflow && leading_zero)) {
1053 // Skip leading 0.00000 and see if it still overflows
1054 const uint8_t *start_digits = src + 2;
1055 while (*start_digits == '0') { start_digits++; }
1056 overflow = p-start_digits > 19;
1057 }
1058 } else {
1059 overflow = p-src > 19;
1060 }
1061
1062 //
1063 // Parse the exponent
1064 //
1065 if (*p == 'e' || *p == 'E') {
1066 p++;
1067 bool exp_neg = *p == '-';
1068 p += exp_neg || *p == '+';
1069
1070 uint64_t exp = 0;
1071 const uint8_t *start_exp_digits = p;
1072 while (parse_digit(*p, exp)) { p++; }
1073 // no exp digits, or 20+ exp digits
1074 if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
1075
1076 exponent += exp_neg ? 0-exp : exp;
1077 }
1078
1079 if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
1080
1081 overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
1082
1083 //
1084 // Assemble (or slow-parse) the float
1085 //
1086 double d;
1087 if (simdjson_likely(!overflow)) {
1088 if (compute_float_64(exponent, i, negative, d)) { return d; }
1089 }
1090 if (!parse_float_fallback(src - uint8_t(negative), &d)) {
1091 return NUMBER_ERROR;
1092 }
1093 return d;
1094}
1095
1096simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
1097 return (*src == '-');
1098}
1099
1100simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
1101 bool negative = (*src == '-');
1102 src += uint8_t(negative);
1103 const uint8_t *p = src;
1104 while(static_cast<uint8_t>(*p - '0') <= 9) { p++; }
1105 if ( p == src ) { return NUMBER_ERROR; }
1106 if (jsoncharutils::is_structural_or_whitespace(*p)) { return true; }
1107 return false;
1108}
1109
1110simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
1111 bool negative = (*src == '-');
1112 src += uint8_t(negative);
1113 const uint8_t *p = src;
1114 while(static_cast<uint8_t>(*p - '0') <= 9) { p++; }
1115 size_t digit_count = size_t(p - src);
1116 if ( p == src ) { return NUMBER_ERROR; }
1117 if (jsoncharutils::is_structural_or_whitespace(*p)) {
1118 static const uint8_t * smaller_big_integer = reinterpret_cast<const uint8_t *>("9223372036854775808");
1119 // We have an integer.
1120 if(simdjson_unlikely(digit_count > 20)) {
1121 return number_type::big_integer;
1122 }
1123 // If the number is negative and valid, it must be a signed integer.
1124 if(negative) {
1125 if (simdjson_unlikely(digit_count > 19)) return number_type::big_integer;
1126 if (simdjson_unlikely(digit_count == 19 && memcmp(src, smaller_big_integer, 19) > 0)) {
1127 return number_type::big_integer;
1128 }
1129#if SIMDJSON_MINUS_ZERO_AS_FLOAT
1130 if(digit_count == 1 && src[0] == '0') {
1131 // We have to write -0.0 instead of 0
1132 return number_type::floating_point_number;
1133 }
1134#endif
1135 return number_type::signed_integer;
1136 }
1137 // Let us check if we have a big integer (>=2**64).
1138 static const uint8_t * two_to_sixtyfour = reinterpret_cast<const uint8_t *>("18446744073709551616");
1139 if((digit_count > 20) || (digit_count == 20 && memcmp(src, two_to_sixtyfour, 20) >= 0)) {
1140 return number_type::big_integer;
1141 }
1142 // The number is positive and smaller than 18446744073709551616 (or 2**64).
1143 // We want values larger or equal to 9223372036854775808 to be unsigned
1144 // integers, and the other values to be signed integers.
1145 if((digit_count == 20) || (digit_count >= 19 && memcmp(src, smaller_big_integer, 19) >= 0)) {
1146 return number_type::unsigned_integer;
1147 }
1148 return number_type::signed_integer;
1149 }
1150 // Hopefully, we have 'e' or 'E' or '.'.
1151 return number_type::floating_point_number;
1152}
1153
1154// Never read at src_end or beyond
1155simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept {
1156 if(src == src_end) { return NUMBER_ERROR; }
1157 //
1158 // Check for minus sign
1159 //
1160 bool negative = (*src == '-');
1161 src += uint8_t(negative);
1162
1163 //
1164 // Parse the integer part.
1165 //
1166 uint64_t i = 0;
1167 const uint8_t *p = src;
1168 if(p == src_end) { return NUMBER_ERROR; }
1169 p += parse_digit(*p, i);
1170 bool leading_zero = (i == 0);
1171 while ((p != src_end) && parse_digit(*p, i)) { p++; }
1172 // no integer digits, or 0123 (zero must be solo)
1173 if ( p == src ) { return INCORRECT_TYPE; }
1174 if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
1175
1176 //
1177 // Parse the decimal part.
1178 //
1179 int64_t exponent = 0;
1180 bool overflow;
1181 if (simdjson_likely((p != src_end) && (*p == '.'))) {
1182 p++;
1183 const uint8_t *start_decimal_digits = p;
1184 if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
1185 p++;
1186 while ((p != src_end) && parse_digit(*p, i)) { p++; }
1187 exponent = -(p - start_decimal_digits);
1188
1189 // Overflow check. More than 19 digits (minus the decimal) may be overflow.
1190 overflow = p-src-1 > 19;
1191 if (simdjson_unlikely(overflow && leading_zero)) {
1192 // Skip leading 0.00000 and see if it still overflows
1193 const uint8_t *start_digits = src + 2;
1194 while (*start_digits == '0') { start_digits++; }
1195 overflow = start_digits-src > 19;
1196 }
1197 } else {
1198 overflow = p-src > 19;
1199 }
1200
1201 //
1202 // Parse the exponent
1203 //
1204 if ((p != src_end) && (*p == 'e' || *p == 'E')) {
1205 p++;
1206 if(p == src_end) { return NUMBER_ERROR; }
1207 bool exp_neg = *p == '-';
1208 p += exp_neg || *p == '+';
1209
1210 uint64_t exp = 0;
1211 const uint8_t *start_exp_digits = p;
1212 while ((p != src_end) && parse_digit(*p, exp)) { p++; }
1213 // no exp digits, or 20+ exp digits
1214 if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
1215
1216 exponent += exp_neg ? 0-exp : exp;
1217 }
1218
1219 if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
1220
1221 overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
1222
1223 //
1224 // Assemble (or slow-parse) the float
1225 //
1226 double d;
1227 if (simdjson_likely(!overflow)) {
1228 if (compute_float_64(exponent, i, negative, d)) { return d; }
1229 }
1230 if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) {
1231 return NUMBER_ERROR;
1232 }
1233 return d;
1234}
1235
1236simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept {
1237 //
1238 // Check for minus sign
1239 //
1240 bool negative = (*(src + 1) == '-');
1241 src += uint8_t(negative) + 1;
1242
1243 //
1244 // Parse the integer part.
1245 //
1246 uint64_t i = 0;
1247 const uint8_t *p = src;
1248 p += parse_digit(*p, i);
1249 bool leading_zero = (i == 0);
1250 while (parse_digit(*p, i)) { p++; }
1251 // no integer digits, or 0123 (zero must be solo)
1252 if ( p == src ) { return INCORRECT_TYPE; }
1253 if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
1254
1255 //
1256 // Parse the decimal part.
1257 //
1258 int64_t exponent = 0;
1259 bool overflow;
1260 if (simdjson_likely(*p == '.')) {
1261 p++;
1262 const uint8_t *start_decimal_digits = p;
1263 if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
1264 p++;
1265 while (parse_digit(*p, i)) { p++; }
1266 exponent = -(p - start_decimal_digits);
1267
1268 // Overflow check. More than 19 digits (minus the decimal) may be overflow.
1269 overflow = p-src-1 > 19;
1270 if (simdjson_unlikely(overflow && leading_zero)) {
1271 // Skip leading 0.00000 and see if it still overflows
1272 const uint8_t *start_digits = src + 2;
1273 while (*start_digits == '0') { start_digits++; }
1274 overflow = p-start_digits > 19;
1275 }
1276 } else {
1277 overflow = p-src > 19;
1278 }
1279
1280 //
1281 // Parse the exponent
1282 //
1283 if (*p == 'e' || *p == 'E') {
1284 p++;
1285 bool exp_neg = *p == '-';
1286 p += exp_neg || *p == '+';
1287
1288 uint64_t exp = 0;
1289 const uint8_t *start_exp_digits = p;
1290 while (parse_digit(*p, exp)) { p++; }
1291 // no exp digits, or 20+ exp digits
1292 if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
1293
1294 exponent += exp_neg ? 0-exp : exp;
1295 }
1296
1297 if (*p != '"') { return NUMBER_ERROR; }
1298
1299 overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
1300
1301 //
1302 // Assemble (or slow-parse) the float
1303 //
1304 double d;
1305 if (simdjson_likely(!overflow)) {
1306 if (compute_float_64(exponent, i, negative, d)) { return d; }
1307 }
1308 if (!parse_float_fallback(src - uint8_t(negative), &d)) {
1309 return NUMBER_ERROR;
1310 }
1311 return d;
1312}
1313
1314} // unnamed namespace
1315#endif // SIMDJSON_SKIPNUMBERPARSING
1316
1317} // namespace numberparsing
1318
1319inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept {
1320 switch (type) {
1321 case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break;
1322 case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break;
1323 case number_type::floating_point_number: out << "floating-point number (binary64)"; break;
1324 case number_type::big_integer: out << "big integer"; break;
1325 default: SIMDJSON_UNREACHABLE();
1326 }
1327 return out;
1328}
1329
1330} // namespace SIMDJSON_IMPLEMENTATION
1331} // namespace simdjson
1332
1333#endif // SIMDJSON_GENERIC_NUMBERPARSING_H
The top level simdjson namespace, containing everything the library provides.
Definition base.h:8
error_code
All possible errors returned by simdjson.
Definition error.h:19
@ INCORRECT_TYPE
JSON element has a different type than user expected.
Definition error.h:37
@ SUCCESS
No error.
Definition error.h:20
@ NUMBER_ERROR
Problem while parsing a number.
Definition error.h:29
constexpr size_t SIMDJSON_PADDING
The amount of padding needed in a buffer to parse JSON.
Definition base.h:33