simdjson 4.1.0
Ridiculously Fast JSON
Loading...
Searching...
No Matches
numberparsing.h
1#ifndef SIMDJSON_GENERIC_NUMBERPARSING_H
2
3#ifndef SIMDJSON_CONDITIONAL_INCLUDE
4#define SIMDJSON_GENERIC_NUMBERPARSING_H
5#include "simdjson/generic/base.h"
6#include "simdjson/generic/jsoncharutils.h"
7#include "simdjson/internal/numberparsing_tables.h"
8#endif // SIMDJSON_CONDITIONAL_INCLUDE
9
10#include <limits>
11#include <ostream>
12#include <cstring>
13
14namespace simdjson {
15namespace SIMDJSON_IMPLEMENTATION {
16namespace numberparsing {
17
// Helper macros used by the number parser to report errors and to emit values.
// In regular builds they expand to the bare error code or writer call. When
// JSON_TEST_NUMBERS is defined, each one first notifies a found_* test hook
// with the value and/or the source pointer, then does the same thing.
#ifndef JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
#define BIGINT_NUMBER(SRC) (BIGINT_ERROR)
#else // JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
#define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR)
#endif // JSON_TEST_NUMBERS
31
32namespace {
33
34// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
35// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
36// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed.
37simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
38 double d;
39 mantissa &= ~(1ULL << 52);
40 mantissa |= real_exponent << 52;
41 mantissa |= ((static_cast<uint64_t>(negative)) << 63);
42 std::memcpy(&d, &mantissa, sizeof(d));
43 return d;
44}
45
46// Attempts to compute i * 10^(power) exactly; and if "negative" is
47// true, negate the result.
48// This function will only work in some cases, when it does not work, success is
49// set to false. This should work *most of the time* (like 99% of the time).
50// We assume that power is in the [smallest_power,
51// largest_power] interval: the caller is responsible for this check.
52simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
53 // we start with a fast path
54 // It was described in
55 // Clinger WD. How to read floating point numbers accurately.
56 // ACM SIGPLAN Notices. 1990
57#ifndef FLT_EVAL_METHOD
58#error "FLT_EVAL_METHOD should be defined, please include cfloat."
59#endif
60#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
61 // We cannot be certain that x/y is rounded to nearest.
62 if (0 <= power && power <= 22 && i <= 9007199254740991)
63#else
64 if (-22 <= power && power <= 22 && i <= 9007199254740991)
65#endif
66 {
67 // convert the integer into a double. This is lossless since
68 // 0 <= i <= 2^53 - 1.
69 d = double(i);
70 //
71 // The general idea is as follows.
72 // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
73 // 1) Both s and p can be represented exactly as 64-bit floating-point
74 // values
75 // (binary64).
76 // 2) Because s and p can be represented exactly as floating-point values,
77 // then s * p
78 // and s / p will produce correctly rounded values.
79 //
80 if (power < 0) {
81 d = d / simdjson::internal::power_of_ten[-power];
82 } else {
83 d = d * simdjson::internal::power_of_ten[power];
84 }
85 if (negative) {
86 d = -d;
87 }
88 return true;
89 }
90 // When 22 < power && power < 22 + 16, we could
91 // hope for another, secondary fast path. It was
92 // described by David M. Gay in "Correctly rounded
93 // binary-decimal and decimal-binary conversions." (1990)
94 // If you need to compute i * 10^(22 + x) for x < 16,
95 // first compute i * 10^x, if you know that result is exact
96 // (e.g., when i * 10^x < 2^53),
97 // then you can still proceed and do (i * 10^x) * 10^22.
98 // Is this worth your time?
99 // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
100 // for this second fast path to work.
101 // If you you have 22 < power *and* power < 22 + 16, and then you
102 // optimistically compute "i * 10^(x-22)", there is still a chance that you
103 // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
104 // this optimization maybe less common than we would like. Source:
105 // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
106 // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
107
108 // The fast path has now failed, so we are failing back on the slower path.
109
110 // In the slow path, we need to adjust i so that it is > 1<<63 which is always
111 // possible, except if i == 0, so we handle i == 0 separately.
112 if(i == 0) {
113 d = negative ? -0.0 : 0.0;
114 return true;
115 }
116
117
118 // The exponent is 1024 + 63 + power
119 // + floor(log(5**power)/log(2)).
120 // The 1024 comes from the ieee64 standard.
121 // The 63 comes from the fact that we use a 64-bit word.
122 //
123 // Computing floor(log(5**power)/log(2)) could be
124 // slow. Instead we use a fast function.
125 //
126 // For power in (-400,350), we have that
127 // (((152170 + 65536) * power ) >> 16);
128 // is equal to
129 // floor(log(5**power)/log(2)) + power when power >= 0
130 // and it is equal to
131 // ceil(log(5**-power)/log(2)) + power when power < 0
132 //
133 // The 65536 is (1<<16) and corresponds to
134 // (65536 * power) >> 16 ---> power
135 //
136 // ((152170 * power ) >> 16) is equal to
137 // floor(log(5**power)/log(2))
138 //
139 // Note that this is not magic: 152170/(1<<16) is
140 // approximately equal to log(5)/log(2).
141 // The 1<<16 value is a power of two; we could use a
142 // larger power of 2 if we wanted to.
143 //
144 int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
145
146
147 // We want the most significant bit of i to be 1. Shift if needed.
148 int lz = leading_zeroes(i);
149 i <<= lz;
150
151
152 // We are going to need to do some 64-bit arithmetic to get a precise product.
153 // We use a table lookup approach.
154 // It is safe because
155 // power >= smallest_power
156 // and power <= largest_power
157 // We recover the mantissa of the power, it has a leading 1. It is always
158 // rounded down.
159 //
160 // We want the most significant 64 bits of the product. We know
161 // this will be non-zero because the most significant bit of i is
162 // 1.
163 const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
164 // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
165 //
166 // The full_multiplication function computes the 128-bit product of two 64-bit words
167 // with a returned value of type value128 with a "low component" corresponding to the
168 // 64-bit least significant bits of the product and with a "high component" corresponding
169 // to the 64-bit most significant bits of the product.
170 simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]);
171 // Both i and power_of_five_128[index] have their most significant bit set to 1 which
172 // implies that the either the most or the second most significant bit of the product
173 // is 1. We pack values in this manner for efficiency reasons: it maximizes the use
174 // we make of the product. It also makes it easy to reason about the product: there
175 // is 0 or 1 leading zero in the product.
176
177 // Unless the least significant 9 bits of the high (64-bit) part of the full
178 // product are all 1s, then we know that the most significant 55 bits are
179 // exact and no further work is needed. Having 55 bits is necessary because
180 // we need 53 bits for the mantissa but we have to have one rounding bit and
181 // we can waste a bit if the most significant bit of the product is zero.
182 if((firstproduct.high & 0x1FF) == 0x1FF) {
183 // We want to compute i * 5^q, but only care about the top 55 bits at most.
184 // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
185 // the full computation is wasteful. So we do what is called a "truncated
186 // multiplication".
187 // We take the most significant 64-bits, and we put them in
188 // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
189 // to the desired approximation using one multiplication. Sometimes it does not suffice.
190 // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
191 // then we get a better approximation to i * 5^q.
192 //
193 // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
194 // more complicated.
195 //
196 // There is an extra layer of complexity in that we need more than 55 bits of
197 // accuracy in the round-to-even scenario.
198 //
199 // The full_multiplication function computes the 128-bit product of two 64-bit words
200 // with a returned value of type value128 with a "low component" corresponding to the
201 // 64-bit least significant bits of the product and with a "high component" corresponding
202 // to the 64-bit most significant bits of the product.
203 simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
204 firstproduct.low += secondproduct.high;
205 if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
206 // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
207 // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
208 // is sufficiently accurate, and more computation is not needed.
209 }
210 uint64_t lower = firstproduct.low;
211 uint64_t upper = firstproduct.high;
212 // The final mantissa should be 53 bits with a leading 1.
213 // We shift it so that it occupies 54 bits with a leading 1.
215 uint64_t upperbit = upper >> 63;
216 uint64_t mantissa = upper >> (upperbit + 9);
217 lz += int(1 ^ upperbit);
218
219 // Here we have mantissa < (1<<54).
220 int64_t real_exponent = exponent - lz;
221 if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
222 // Here have that real_exponent <= 0 so -real_exponent >= 0
223 if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
224 d = negative ? -0.0 : 0.0;
225 return true;
226 }
227 // next line is safe because -real_exponent + 1 < 0
228 mantissa >>= -real_exponent + 1;
229 // Thankfully, we can't have both "round-to-even" and subnormals because
230 // "round-to-even" only occurs for powers close to 0.
231 mantissa += (mantissa & 1); // round up
232 mantissa >>= 1;
233 // There is a weird scenario where we don't have a subnormal but just.
234 // Suppose we start with 2.2250738585072013e-308, we end up
235 // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
236 // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
237 // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
238 // subnormal, but we can only know this after rounding.
239 // So we only declare a subnormal if we are smaller than the threshold.
240 real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
241 d = to_double(mantissa, real_exponent, negative);
242 return true;
243 }
244 // We have to round to even. The "to even" part
245 // is only a problem when we are right in between two floats
246 // which we guard against.
247 // If we have lots of trailing zeros, we may fall right between two
248 // floating-point values.
249 //
250 // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
251 // times a power of two. That is, it is right between a number with binary significand
252 // m and another number with binary significand m+1; and it must be the case
253 // that it cannot be represented by a float itself.
254 //
255 // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
256 // Recall that 10^q = 5^q * 2^q.
257 // When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that
258 // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
259 // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
260 // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
261 // 2^{53} x 5^{-q} < 2^{64}.
262 // Hence we have 5^{-q} < 2^{11}$ or q>= -4.
263 //
264 // We require lower <= 1 and not lower == 0 because we could not prove that
265 // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
266 if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
267 if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
268 mantissa &= ~1; // flip it so that we do not round up
269 }
270 }
271
272 mantissa += mantissa & 1;
273 mantissa >>= 1;
274
275 // Here we have mantissa < (1<<53), unless there was an overflow
276 if (mantissa >= (1ULL << 53)) {
278 // This will happen when parsing values such as 7.2057594037927933e+16
280 mantissa = (1ULL << 52);
281 real_exponent++;
282 }
283 mantissa &= ~(1ULL << 52);
284 // we have to check that real_exponent is in range, otherwise we bail out
285 if (simdjson_unlikely(real_exponent > 2046)) {
286 // We have an infinite value!!! We could actually throw an error here if we could.
287 return false;
288 }
289 d = to_double(mantissa, real_exponent, negative);
290 return true;
291}
292
293// We call a fallback floating-point parser that might be slow. Note
294// it will accept JSON numbers, but the JSON spec. is more restrictive so
295// before you call parse_float_fallback, you need to have validated the input
296// string with the JSON grammar.
297// It will return an error (false) if the parsed number is infinite.
298// The string parsing itself always succeeds. We know that there is at least
299// one digit.
300static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
301 *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
302 // We do not accept infinite values.
303
304 // Detecting finite values in a portable manner is ridiculously hard, ideally
305 // we would want to do:
306 // return !std::isfinite(*outDouble);
307 // but that mysteriously fails under legacy/old libc++ libraries, see
308 // https://github.com/simdjson/simdjson/issues/1286
309 //
310 // Therefore, fall back to this solution (the extra parens are there
311 // to handle that max may be a macro on windows).
312 return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
313}
314
315static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
316 *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr), reinterpret_cast<const char *>(end_ptr));
317 // We do not accept infinite values.
318
319 // Detecting finite values in a portable manner is ridiculously hard, ideally
320 // we would want to do:
321 // return !std::isfinite(*outDouble);
322 // but that mysteriously fails under legacy/old libc++ libraries, see
323 // https://github.com/simdjson/simdjson/issues/1286
324 //
325 // Therefore, fall back to this solution (the extra parens are there
326 // to handle that max may be a macro on windows).
327 return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
328}
329
330// check quickly whether the next 8 chars are made of digits
331// at a glance, it looks better than Mula's
332// http://0x80.pl/articles/swar-digits-validate.html
333simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
334 uint64_t val;
335 // this can read up to 7 bytes beyond the buffer size, but we require
336 // SIMDJSON_PADDING of padding
337 static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
338 std::memcpy(&val, chars, 8);
339 // a branchy method might be faster:
340 // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
341 // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
342 // 0x3030303030303030);
343 return (((val & 0xF0F0F0F0F0F0F0F0) |
344 (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
345 0x3333333333333333);
346}
347
348template<typename I>
349SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
350simdjson_inline bool parse_digit(const uint8_t c, I &i) {
351 const uint8_t digit = static_cast<uint8_t>(c - '0');
352 if (digit > 9) {
353 return false;
354 }
355 // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
356 i = 10 * i + digit; // might overflow, we will handle the overflow later
357 return true;
358}
359
360simdjson_inline bool is_digit(const uint8_t c) {
361 return static_cast<uint8_t>(c - '0') <= 9;
362}
363
364simdjson_warn_unused simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
365 // we continue with the fiction that we have an integer. If the
366 // floating point number is representable as x * 10^z for some integer
367 // z that fits in 53 bits, then we will be able to convert back the
368 // the integer into a float in a lossless manner.
369 const uint8_t *const first_after_period = p;
370
371#ifdef SIMDJSON_SWAR_NUMBER_PARSING
372#if SIMDJSON_SWAR_NUMBER_PARSING
373 // this helps if we have lots of decimals!
374 // this turns out to be frequent enough.
375 if (is_made_of_eight_digits_fast(p)) {
376 i = i * 100000000 + parse_eight_digits_unrolled(p);
377 p += 8;
378 }
379#endif // SIMDJSON_SWAR_NUMBER_PARSING
380#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING
381 // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
382 if (parse_digit(*p, i)) { ++p; }
383 while (parse_digit(*p, i)) { p++; }
384 exponent = first_after_period - p;
385 // Decimal without digits (123.) is illegal
386 if (exponent == 0) {
387 return INVALID_NUMBER(src);
388 }
389 return SUCCESS;
390}
391
392simdjson_warn_unused simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
393 // Exp Sign: -123.456e[-]78
394 bool neg_exp = ('-' == *p);
395 if (neg_exp || '+' == *p) { p++; } // Skip + as well
396
397 // Exponent: -123.456e-[78]
398 auto start_exp = p;
399 int64_t exp_number = 0;
400 while (parse_digit(*p, exp_number)) { ++p; }
401 // It is possible for parse_digit to overflow.
402 // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
403 // Thus we *must* check for possible overflow before we negate exp_number.
404
405 // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
406 // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
407 // not oblige and may, in fact, generate two distinct paths in any case. It might be
408 // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
409 // instructions for a simdjson_likely branch, an unconclusive gain.
410
411 // If there were no digits, it's an error.
412 if (simdjson_unlikely(p == start_exp)) {
413 return INVALID_NUMBER(src);
414 }
415 // We have a valid positive exponent in exp_number at this point, except that
416 // it may have overflowed.
417
418 // If there were more than 18 digits, we may have overflowed the integer. We have to do
419 // something!!!!
420 if (simdjson_unlikely(p > start_exp+18)) {
421 // Skip leading zeroes: 1e000000000000000000001 is technically valid and does not overflow
422 while (*start_exp == '0') { start_exp++; }
423 // 19 digits could overflow int64_t and is kind of absurd anyway. We don't
424 // support exponents smaller than -999,999,999,999,999,999 and bigger
425 // than 999,999,999,999,999,999.
426 // We can truncate.
427 // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before
428 // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could
429 // truncate at 324.
430 // Note that there is no reason to fail per se at this point in time.
431 // E.g., 0e999999999999999999999 is a fine number.
432 if (p > start_exp+18) { exp_number = 999999999999999999; }
433 }
434 // At this point, we know that exp_number is a sane, positive, signed integer.
435 // It is <= 999,999,999,999,999,999. As long as 'exponent' is in
436 // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent'
437 // is bounded in magnitude by the size of the JSON input, we are fine in this universe.
438 // To sum it up: the next line should never overflow.
439 exponent += (neg_exp ? -exp_number : exp_number);
440 return SUCCESS;
441}
442
443simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) {
444 const uint8_t *const srcend = src + max_length;
445 bool negative = (*src == '-'); // we can always read at least one character after the '-'
446 const uint8_t *p = src + uint8_t(negative);
447 if(p == srcend) { return false; }
448 if(*p == '0') {
449 ++p;
450 if(p == srcend) { return true; }
451 if(jsoncharutils::is_not_structural_or_whitespace(*p)) { return false; }
452 return true;
453 }
454 while(p != srcend && is_digit(*p)) { ++p; }
455 if(p == srcend) { return true; }
456 if(jsoncharutils::is_not_structural_or_whitespace(*p)) { return false; }
457 return true;
458}
459
460simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
461 // It is possible that the integer had an overflow.
462 // We have to handle the case where we have 0.0000somenumber.
463 const uint8_t *start = start_digits;
464 while ((*start == '0') || (*start == '.')) { ++start; }
465 // we over-decrement by one when there is a '.'
466 return digit_count - size_t(start - start_digits);
467}
468
469} // unnamed namespace
470
472static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) {
473 if (parse_float_fallback(src, answer)) {
474 return SUCCESS;
475 }
476 return INVALID_NUMBER(src);
477}
478
480template<typename W>
481simdjson_warn_unused simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
482 // If we frequently had to deal with long strings of digits,
483 // we could extend our code by using a 128-bit integer instead
484 // of a 64-bit integer. However, this is uncommon in practice.
485 //
486 // 9999999999999999999 < 2**64 so we can accommodate 19 digits.
487 // If we have a decimal separator, then digit_count - 1 is the number of digits, but we
488 // may not have a decimal separator!
489 if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
490 // Ok, chances are good that we had an overflow!
491 // this is almost never going to get called!!!
492 // we start anew, going slowly!!!
493 // This will happen in the following examples:
494 // 10000000000000000000000000000000000000000000e+308
495 // 3.1415926535897932384626433832795028841971693993751
496 //
497 // NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer
498 // reference to it, it would force it to be stored in memory, preventing the compiler from
499 // picking it apart and putting into registers. i.e. if we pass it as reference,
500 // it gets slow.
501 double d;
502 error_code error = slow_float_parsing(src, &d);
503 writer.append_double(d);
504 return error;
505 }
506 // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
507 // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
508 // To future reader: we'd love if someone found a better way, or at least could explain this result!
509 if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
510 //
511 // Important: smallest_power is such that it leads to a zero value.
512 // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
513 // so something x 10^-343 goes to zero, but not so with something x 10^-342.
514 static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
515 //
516 if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
517 // E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero
518 WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer);
519 return SUCCESS;
520 } else { // (exponent > largest_power) and (i != 0)
521 // We have, for sure, an infinite value and simdjson refuses to parse infinite values.
522 return INVALID_NUMBER(src);
523 }
524 }
525 double d;
526 if (!compute_float_64(exponent, i, negative, d)) {
527 // we are almost never going to get here.
528 if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
529 }
530 WRITE_DOUBLE(d, src, writer);
531 return SUCCESS;
532}
533
534// parse the number at src
535// define JSON_TEST_NUMBERS for unit testing
536//
537// It is assumed that the number is followed by a structural ({,},],[) character
538// or a white space character. If that is not the case (e.g., when the JSON
539// document is made of a single number), then it is necessary to copy the
540// content and append a space before calling this function.
541//
542// Our objective is accurate parsing (ULP of 0) at high speed.
543template<typename W>
544simdjson_warn_unused simdjson_inline error_code parse_number(const uint8_t *const src, W &writer);
545
546// for performance analysis, it is sometimes useful to skip parsing
547#ifdef SIMDJSON_SKIPNUMBERPARSING
548
549template<typename W>
550simdjson_warn_unused simdjson_inline error_code parse_number(const uint8_t *const, W &writer) {
551 writer.append_s64(0); // always write zero
552 return SUCCESS; // always succeeds
553}
554
555simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept { return 0; }
556simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept { return 0; }
557simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept { return 0; }
558simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; }
559simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const src) noexcept { return 0; }
560simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const src) noexcept { return 0; }
561simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; }
562simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept { return false; }
563simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept { return number_type::signed_integer; }
564#else
565
566// parse the number at src
567// define JSON_TEST_NUMBERS for unit testing
568//
569// It is assumed that the number is followed by a structural ({,},],[) character
570// or a white space character. If that is not the case (e.g., when the JSON
571// document is made of a single number), then it is necessary to copy the
572// content and append a space before calling this function.
573//
574// Our objective is accurate parsing (ULP of 0) at high speed.
575template<typename W>
576simdjson_warn_unused simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) {
577 //
578 // Check for minus sign
579 //
580 bool negative = (*src == '-');
581 const uint8_t *p = src + uint8_t(negative);
582
583 //
584 // Parse the integer part.
585 //
586 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
587 const uint8_t *const start_digits = p;
588 uint64_t i = 0;
589 while (parse_digit(*p, i)) { p++; }
590
591 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
592 // Optimization note: size_t is expected to be unsigned.
593 size_t digit_count = size_t(p - start_digits);
594 if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
595
596 //
597 // Handle floats if there is a . or e (or both)
598 //
599 int64_t exponent = 0;
600 bool is_float = false;
601 if ('.' == *p) {
602 is_float = true;
603 ++p;
604 SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) );
605 digit_count = int(p - start_digits); // used later to guard against overflows
606 }
607 if (('e' == *p) || ('E' == *p)) {
608 is_float = true;
609 ++p;
610 SIMDJSON_TRY( parse_exponent(src, p, exponent) );
611 }
612 if (is_float) {
613 const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
614 SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
615 if (dirty_end) { return INVALID_NUMBER(src); }
616 return SUCCESS;
617 }
618
619 // The longest negative 64-bit number is 19 digits.
620 // The longest positive 64-bit number is 20 digits.
621 // We do it this way so we don't trigger this branch unless we must.
622 size_t longest_digit_count = negative ? 19 : 20;
623 if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); }
624 if (digit_count == longest_digit_count) {
625 if (negative) {
626 // Anything negative above INT64_MAX+1 is invalid
627 if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); }
628 WRITE_INTEGER(~i+1, src, writer);
629 if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
630 return SUCCESS;
631 // Positive overflow check:
632 // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
633 // biggest uint64_t.
634 // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
635 // If we got here, it's a 20 digit number starting with the digit "1".
636 // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
637 // than 1,553,255,926,290,448,384.
638 // - That is smaller than the smallest possible 20-digit number the user could write:
639 // 10,000,000,000,000,000,000.
640 // - Therefore, if the number is positive and lower than that, it's overflow.
641 // - The value we are looking at is less than or equal to INT64_MAX.
642 //
643 } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
644 }
645
646 // Write unsigned if it does not fit in a signed integer.
647 if (i > uint64_t(INT64_MAX)) {
648 WRITE_UNSIGNED(i, src, writer);
649 } else {
650#if SIMDJSON_MINUS_ZERO_AS_FLOAT
651 if(i == 0 && negative) {
652 // We have to write -0.0 instead of 0
653 WRITE_DOUBLE(-0.0, src, writer);
654 } else {
655 WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
656 }
657#else
658 WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
659#endif
660 }
661 if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
662 return SUCCESS;
663}
664
665// Inlineable functions
666namespace {
667
668// This table can be used to characterize the final character of an integer
669// string. For JSON structural character and allowable white space characters,
670// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
671// we return NUMBER_ERROR.
672// Optimization note: we could easily reduce the size of the table by half (to 128)
673// at the cost of an extra branch.
674// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
675static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
676static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
677static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
678
679const uint8_t integer_string_finisher[256] = {
732
733// Parse any number from 0 to 18,446,744,073,709,551,615
734simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
735 const uint8_t *p = src;
736 //
737 // Parse the integer part.
738 //
739 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
740 const uint8_t *const start_digits = p;
741 uint64_t i = 0;
742 while (parse_digit(*p, i)) { p++; }
743
744 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
745 // Optimization note: size_t is expected to be unsigned.
746 size_t digit_count = size_t(p - start_digits);
747 // The longest positive 64-bit number is 20 digits.
748 // We do it this way so we don't trigger this branch unless we must.
749 // Optimization note: the compiler can probably merge
750 // ((digit_count == 0) || (digit_count > 20))
751 // into a single branch since digit_count is unsigned.
752 if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
753 // Here digit_count > 0.
754 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
755 // We can do the following...
756 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
757 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
758 // }
759 // as a single table lookup:
760 if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
761
762 if (digit_count == 20) {
763 // Positive overflow check:
764 // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
765 // biggest uint64_t.
766 // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
767 // If we got here, it's a 20 digit number starting with the digit "1".
768 // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
769 // than 1,553,255,926,290,448,384.
770 // - That is smaller than the smallest possible 20-digit number the user could write:
771 // 10,000,000,000,000,000,000.
772 // - Therefore, if the number is positive and lower than that, it's overflow.
773 // - The value we are looking at is less than or equal to INT64_MAX.
774 //
775 if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
776 }
777
778 return i;
779}
780
781
782// Parse any number from 0 to 18,446,744,073,709,551,615
783// Never read at src_end or beyond
784simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept {
785 const uint8_t *p = src;
786 //
787 // Parse the integer part.
788 //
789 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
790 const uint8_t *const start_digits = p;
791 uint64_t i = 0;
792 while ((p != src_end) && parse_digit(*p, i)) { p++; }
793
794 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
795 // Optimization note: size_t is expected to be unsigned.
796 size_t digit_count = size_t(p - start_digits);
797 // The longest positive 64-bit number is 20 digits.
798 // We do it this way so we don't trigger this branch unless we must.
799 // Optimization note: the compiler can probably merge
800 // ((digit_count == 0) || (digit_count > 20))
801 // into a single branch since digit_count is unsigned.
802 if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
803 // Here digit_count > 0.
804 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
805 // We can do the following...
806 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
807 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
808 // }
809 // as a single table lookup:
810 if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
811
812 if (digit_count == 20) {
813 // Positive overflow check:
814 // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
815 // biggest uint64_t.
816 // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
817 // If we got here, it's a 20 digit number starting with the digit "1".
818 // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
819 // than 1,553,255,926,290,448,384.
820 // - That is smaller than the smallest possible 20-digit number the user could write:
821 // 10,000,000,000,000,000,000.
822 // - Therefore, if the number is positive and lower than that, it's overflow.
823 // - The value we are looking at is less than or equal to INT64_MAX.
824 //
825 if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
826 }
827
828 return i;
829}
830
831// Parse any number from 0 to 18,446,744,073,709,551,615
832simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
833 const uint8_t *p = src + 1;
834 //
835 // Parse the integer part.
836 //
837 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
838 const uint8_t *const start_digits = p;
839 uint64_t i = 0;
840 while (parse_digit(*p, i)) { p++; }
841
842 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
843 // Optimization note: size_t is expected to be unsigned.
844 size_t digit_count = size_t(p - start_digits);
845 // The longest positive 64-bit number is 20 digits.
846 // We do it this way so we don't trigger this branch unless we must.
847 // Optimization note: the compiler can probably merge
848 // ((digit_count == 0) || (digit_count > 20))
849 // into a single branch since digit_count is unsigned.
850 if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
851 // Here digit_count > 0.
852 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
853 // We can do the following...
854 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
855 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
856 // }
857 // as a single table lookup:
858 if (*p != '"') { return NUMBER_ERROR; }
859
860 if (digit_count == 20) {
861 // Positive overflow check:
862 // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
863 // biggest uint64_t.
864 // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
865 // If we got here, it's a 20 digit number starting with the digit "1".
866 // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
867 // than 1,553,255,926,290,448,384.
868 // - That is smaller than the smallest possible 20-digit number the user could write:
869 // 10,000,000,000,000,000,000.
870 // - Therefore, if the number is positive and lower than that, it's overflow.
871 // - The value we are looking at is less than or equal to INT64_MAX.
872 //
873 // Note: we use src[1] and not src[0] because src[0] is the quote character in this
874 // instance.
875 if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
876 }
877
878 return i;
879}
880
881// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
882simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
883 //
884 // Check for minus sign
885 //
886 bool negative = (*src == '-');
887 const uint8_t *p = src + uint8_t(negative);
888
889 //
890 // Parse the integer part.
891 //
892 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
893 const uint8_t *const start_digits = p;
894 uint64_t i = 0;
895 while (parse_digit(*p, i)) { p++; }
896
897 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
898 // Optimization note: size_t is expected to be unsigned.
899 size_t digit_count = size_t(p - start_digits);
900 // We go from
901 // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
902 // so we can never represent numbers that have more than 19 digits.
903 size_t longest_digit_count = 19;
904 // Optimization note: the compiler can probably merge
905 // ((digit_count == 0) || (digit_count > longest_digit_count))
906 // into a single branch since digit_count is unsigned.
907 if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
908 // Here digit_count > 0.
909 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
910 // We can do the following...
911 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
912 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
913 // }
914 // as a single table lookup:
915 if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
916 // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
917 // Performance note: This check is only needed when digit_count == longest_digit_count but it is
918 // so cheap that we might as well always make it.
919 if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
920 return negative ? (~i+1) : i;
921}
922
923// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
924// Never read at src_end or beyond
925simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept {
926 //
927 // Check for minus sign
928 //
929 if(src == src_end) { return NUMBER_ERROR; }
930 bool negative = (*src == '-');
931 const uint8_t *p = src + uint8_t(negative);
932
933 //
934 // Parse the integer part.
935 //
936 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
937 const uint8_t *const start_digits = p;
938 uint64_t i = 0;
939 while ((p != src_end) && parse_digit(*p, i)) { p++; }
940
941 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
942 // Optimization note: size_t is expected to be unsigned.
943 size_t digit_count = size_t(p - start_digits);
944 // We go from
945 // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
946 // so we can never represent numbers that have more than 19 digits.
947 size_t longest_digit_count = 19;
948 // Optimization note: the compiler can probably merge
949 // ((digit_count == 0) || (digit_count > longest_digit_count))
950 // into a single branch since digit_count is unsigned.
951 if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
952 // Here digit_count > 0.
953 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
954 // We can do the following...
955 // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
956 // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
957 // }
958 // as a single table lookup:
959 if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
960 // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
961 // Performance note: This check is only needed when digit_count == longest_digit_count but it is
962 // so cheap that we might as well always make it.
963 if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
964 return negative ? (~i+1) : i;
965}
966
967// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
968simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept {
969 //
970 // Check for minus sign
971 //
972 bool negative = (*(src + 1) == '-');
973 src += uint8_t(negative) + 1;
974
975 //
976 // Parse the integer part.
977 //
978 // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
979 const uint8_t *const start_digits = src;
980 uint64_t i = 0;
981 while (parse_digit(*src, i)) { src++; }
982
983 // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
984 // Optimization note: size_t is expected to be unsigned.
985 size_t digit_count = size_t(src - start_digits);
986 // We go from
987 // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
988 // so we can never represent numbers that have more than 19 digits.
989 size_t longest_digit_count = 19;
990 // Optimization note: the compiler can probably merge
991 // ((digit_count == 0) || (digit_count > longest_digit_count))
992 // into a single branch since digit_count is unsigned.
993 if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
994 // Here digit_count > 0.
995 if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
996 // We can do the following...
997 // if (!jsoncharutils::is_structural_or_whitespace(*src)) {
998 // return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
999 // }
1000 // as a single table lookup:
1001 if(*src != '"') { return NUMBER_ERROR; }
1002 // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
1003 // Performance note: This check is only needed when digit_count == longest_digit_count but it is
1004 // so cheap that we might as well always make it.
1005 if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
1006 return negative ? (~i+1) : i;
1007}
1008
1009simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
1010 //
1011 // Check for minus sign
1012 //
1013 bool negative = (*src == '-');
1014 src += uint8_t(negative);
1015
1016 //
1017 // Parse the integer part.
1018 //
1019 uint64_t i = 0;
1020 const uint8_t *p = src;
1021 p += parse_digit(*p, i);
1022 bool leading_zero = (i == 0);
1023 while (parse_digit(*p, i)) { p++; }
1024 // no integer digits, or 0123 (zero must be solo)
1025 if ( p == src ) { return INCORRECT_TYPE; }
1026 if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
1027
1028 //
1029 // Parse the decimal part.
1030 //
1031 int64_t exponent = 0;
1032 bool overflow;
1033 if (simdjson_likely(*p == '.')) {
1034 p++;
1035 const uint8_t *start_decimal_digits = p;
1036 if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
1037 p++;
1038 while (parse_digit(*p, i)) { p++; }
1039 exponent = -(p - start_decimal_digits);
1040
1041 // Overflow check. More than 19 digits (minus the decimal) may be overflow.
1042 overflow = p-src-1 > 19;
1043 if (simdjson_unlikely(overflow && leading_zero)) {
1044 // Skip leading 0.00000 and see if it still overflows
1045 const uint8_t *start_digits = src + 2;
1046 while (*start_digits == '0') { start_digits++; }
1047 overflow = p-start_digits > 19;
1048 }
1049 } else {
1050 overflow = p-src > 19;
1051 }
1052
1053 //
1054 // Parse the exponent
1055 //
1056 if (*p == 'e' || *p == 'E') {
1057 p++;
1058 bool exp_neg = *p == '-';
1059 p += exp_neg || *p == '+';
1060
1061 uint64_t exp = 0;
1062 const uint8_t *start_exp_digits = p;
1063 while (parse_digit(*p, exp)) { p++; }
1064 // no exp digits, or 20+ exp digits
1065 if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
1066
1067 exponent += exp_neg ? 0-exp : exp;
1068 }
1069
1070 if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
1071
1072 overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
1073
1074 //
1075 // Assemble (or slow-parse) the float
1076 //
1077 double d;
1078 if (simdjson_likely(!overflow)) {
1079 if (compute_float_64(exponent, i, negative, d)) { return d; }
1080 }
1081 if (!parse_float_fallback(src - uint8_t(negative), &d)) {
1082 return NUMBER_ERROR;
1083 }
1084 return d;
1085}
1086
1087simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
1088 return (*src == '-');
1089}
1090
1091simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
1092 bool negative = (*src == '-');
1093 src += uint8_t(negative);
1094 const uint8_t *p = src;
1095 while(static_cast<uint8_t>(*p - '0') <= 9) { p++; }
1096 if ( p == src ) { return NUMBER_ERROR; }
1097 if (jsoncharutils::is_structural_or_whitespace(*p)) { return true; }
1098 return false;
1099}
1100
1101simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
1102 bool negative = (*src == '-');
1103 src += uint8_t(negative);
1104 const uint8_t *p = src;
1105 while(static_cast<uint8_t>(*p - '0') <= 9) { p++; }
1106 size_t digit_count = size_t(p - src);
1107 if ( p == src ) { return NUMBER_ERROR; }
1108 if (jsoncharutils::is_structural_or_whitespace(*p)) {
1109 static const uint8_t * smaller_big_integer = reinterpret_cast<const uint8_t *>("9223372036854775808");
1110 // We have an integer.
1111 if(simdjson_unlikely(digit_count > 20)) {
1112 return number_type::big_integer;
1113 }
1114 // If the number is negative and valid, it must be a signed integer.
1115 if(negative) {
1116 if (simdjson_unlikely(digit_count > 19)) return number_type::big_integer;
1117 if (simdjson_unlikely(digit_count == 19 && memcmp(src, smaller_big_integer, 19) > 0)) {
1118 return number_type::big_integer;
1119 }
1120#if SIMDJSON_MINUS_ZERO_AS_FLOAT
1121 if(digit_count == 1 && src[0] == '0') {
1122 // We have to write -0.0 instead of 0
1123 return number_type::floating_point_number;
1124 }
1125#endif
1126 return number_type::signed_integer;
1127 }
1128 // Let us check if we have a big integer (>=2**64).
1129 static const uint8_t * two_to_sixtyfour = reinterpret_cast<const uint8_t *>("18446744073709551616");
1130 if((digit_count > 20) || (digit_count == 20 && memcmp(src, two_to_sixtyfour, 20) >= 0)) {
1131 return number_type::big_integer;
1132 }
1133 // The number is positive and smaller than 18446744073709551616 (or 2**64).
1134 // We want values larger or equal to 9223372036854775808 to be unsigned
1135 // integers, and the other values to be signed integers.
1136 if((digit_count == 20) || (digit_count >= 19 && memcmp(src, smaller_big_integer, 19) >= 0)) {
1137 return number_type::unsigned_integer;
1138 }
1139 return number_type::signed_integer;
1140 }
1141 // Hopefully, we have 'e' or 'E' or '.'.
1142 return number_type::floating_point_number;
1143}
1144
1145// Never read at src_end or beyond
1146simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept {
1147 if(src == src_end) { return NUMBER_ERROR; }
1148 //
1149 // Check for minus sign
1150 //
1151 bool negative = (*src == '-');
1152 src += uint8_t(negative);
1153
1154 //
1155 // Parse the integer part.
1156 //
1157 uint64_t i = 0;
1158 const uint8_t *p = src;
1159 if(p == src_end) { return NUMBER_ERROR; }
1160 p += parse_digit(*p, i);
1161 bool leading_zero = (i == 0);
1162 while ((p != src_end) && parse_digit(*p, i)) { p++; }
1163 // no integer digits, or 0123 (zero must be solo)
1164 if ( p == src ) { return INCORRECT_TYPE; }
1165 if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
1166
1167 //
1168 // Parse the decimal part.
1169 //
1170 int64_t exponent = 0;
1171 bool overflow;
1172 if (simdjson_likely((p != src_end) && (*p == '.'))) {
1173 p++;
1174 const uint8_t *start_decimal_digits = p;
1175 if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
1176 p++;
1177 while ((p != src_end) && parse_digit(*p, i)) { p++; }
1178 exponent = -(p - start_decimal_digits);
1179
1180 // Overflow check. More than 19 digits (minus the decimal) may be overflow.
1181 overflow = p-src-1 > 19;
1182 if (simdjson_unlikely(overflow && leading_zero)) {
1183 // Skip leading 0.00000 and see if it still overflows
1184 const uint8_t *start_digits = src + 2;
1185 while (*start_digits == '0') { start_digits++; }
1186 overflow = start_digits-src > 19;
1187 }
1188 } else {
1189 overflow = p-src > 19;
1190 }
1191
1192 //
1193 // Parse the exponent
1194 //
1195 if ((p != src_end) && (*p == 'e' || *p == 'E')) {
1196 p++;
1197 if(p == src_end) { return NUMBER_ERROR; }
1198 bool exp_neg = *p == '-';
1199 p += exp_neg || *p == '+';
1200
1201 uint64_t exp = 0;
1202 const uint8_t *start_exp_digits = p;
1203 while ((p != src_end) && parse_digit(*p, exp)) { p++; }
1204 // no exp digits, or 20+ exp digits
1205 if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
1206
1207 exponent += exp_neg ? 0-exp : exp;
1208 }
1209
1210 if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
1211
1212 overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
1213
1214 //
1215 // Assemble (or slow-parse) the float
1216 //
1217 double d;
1218 if (simdjson_likely(!overflow)) {
1219 if (compute_float_64(exponent, i, negative, d)) { return d; }
1220 }
1221 if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) {
1222 return NUMBER_ERROR;
1223 }
1224 return d;
1225}
1226
1227simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept {
1228 //
1229 // Check for minus sign
1230 //
1231 bool negative = (*(src + 1) == '-');
1232 src += uint8_t(negative) + 1;
1233
1234 //
1235 // Parse the integer part.
1236 //
1237 uint64_t i = 0;
1238 const uint8_t *p = src;
1239 p += parse_digit(*p, i);
1240 bool leading_zero = (i == 0);
1241 while (parse_digit(*p, i)) { p++; }
1242 // no integer digits, or 0123 (zero must be solo)
1243 if ( p == src ) { return INCORRECT_TYPE; }
1244 if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
1245
1246 //
1247 // Parse the decimal part.
1248 //
1249 int64_t exponent = 0;
1250 bool overflow;
1251 if (simdjson_likely(*p == '.')) {
1252 p++;
1253 const uint8_t *start_decimal_digits = p;
1254 if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
1255 p++;
1256 while (parse_digit(*p, i)) { p++; }
1257 exponent = -(p - start_decimal_digits);
1258
1259 // Overflow check. More than 19 digits (minus the decimal) may be overflow.
1260 overflow = p-src-1 > 19;
1261 if (simdjson_unlikely(overflow && leading_zero)) {
1262 // Skip leading 0.00000 and see if it still overflows
1263 const uint8_t *start_digits = src + 2;
1264 while (*start_digits == '0') { start_digits++; }
1265 overflow = p-start_digits > 19;
1266 }
1267 } else {
1268 overflow = p-src > 19;
1269 }
1270
1271 //
1272 // Parse the exponent
1273 //
1274 if (*p == 'e' || *p == 'E') {
1275 p++;
1276 bool exp_neg = *p == '-';
1277 p += exp_neg || *p == '+';
1278
1279 uint64_t exp = 0;
1280 const uint8_t *start_exp_digits = p;
1281 while (parse_digit(*p, exp)) { p++; }
1282 // no exp digits, or 20+ exp digits
1283 if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
1284
1285 exponent += exp_neg ? 0-exp : exp;
1286 }
1287
1288 if (*p != '"') { return NUMBER_ERROR; }
1289
1290 overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
1291
1292 //
1293 // Assemble (or slow-parse) the float
1294 //
1295 double d;
1296 if (simdjson_likely(!overflow)) {
1297 if (compute_float_64(exponent, i, negative, d)) { return d; }
1298 }
1299 if (!parse_float_fallback(src - uint8_t(negative), &d)) {
1300 return NUMBER_ERROR;
1301 }
1302 return d;
1303}
1304
1305} // unnamed namespace
1306#endif // SIMDJSON_SKIPNUMBERPARSING
1307
1308} // namespace numberparsing
1309
1310inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept {
1311 switch (type) {
1312 case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break;
1313 case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break;
1314 case number_type::floating_point_number: out << "floating-point number (binary64)"; break;
1315 case number_type::big_integer: out << "big integer"; break;
1316 default: SIMDJSON_UNREACHABLE();
1317 }
1318 return out;
1319}
1320
1321} // namespace SIMDJSON_IMPLEMENTATION
1322} // namespace simdjson
1323
1324#endif // SIMDJSON_GENERIC_NUMBERPARSING_H
The top level simdjson namespace, containing everything the library provides.
Definition base.h:8
error_code
All possible errors returned by simdjson.
Definition error.h:19
@ INCORRECT_TYPE
JSON element has a different type than user expected.
Definition error.h:37
@ SUCCESS
No error.
Definition error.h:20
@ NUMBER_ERROR
Problem while parsing a number.
Definition error.h:29
constexpr size_t SIMDJSON_PADDING
The amount of padding needed in a buffer to parse JSON.
Definition base.h:33