simdjson  3.11.0
Ridiculously Fast JSON
numberparsing.h
1 #ifndef SIMDJSON_GENERIC_NUMBERPARSING_H
2 
3 #ifndef SIMDJSON_CONDITIONAL_INCLUDE
4 #define SIMDJSON_GENERIC_NUMBERPARSING_H
5 #include "simdjson/generic/base.h"
6 #include "simdjson/generic/jsoncharutils.h"
7 #include "simdjson/internal/numberparsing_tables.h"
8 #endif // SIMDJSON_CONDITIONAL_INCLUDE
9 
10 #include <limits>
11 #include <ostream>
12 #include <cstring>
13 
14 namespace simdjson {
15 namespace SIMDJSON_IMPLEMENTATION {
16 namespace numberparsing {
17 
// Reporting/writing helpers used by the number parsers in this file.
// INVALID_NUMBER and BIGINT_NUMBER evaluate to NUMBER_ERROR and BIGINT_ERROR;
// WRITE_INTEGER, WRITE_UNSIGNED and WRITE_DOUBLE call append_s64, append_u64 and
// append_double on the writer.
// When JSON_TEST_NUMBERS is defined, each macro additionally calls a found_* hook
// (with the value and/or the source pointer) before producing that same result.
// Note that in this mode VALUE appears twice in the expansion of the WRITE_* macros.
18 #ifdef JSON_TEST_NUMBERS
19 #define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
20 #define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
21 #define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
22 #define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
23 #define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR)
24 #else
25 #define INVALID_NUMBER(SRC) (NUMBER_ERROR)
26 #define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
27 #define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
28 #define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
29 #define BIGINT_NUMBER(SRC) (BIGINT_ERROR)
30 #endif
31 
32 namespace {
33 
// Assemble an ieee64 double from its three fields.
//  - mantissa: expected in [0, 1<<53); bit 52 (the implicit leading one) is
//    cleared before the fields are combined.
//  - real_exponent: the biased exponent, expected in [0, 2046]
//    (technically 2047 would be acceptable); it is stored starting at bit 52.
//  - negative: stored as the sign bit (bit 63).
// The 64-bit pattern is copied into the double with memcpy.
simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
  const uint64_t implicit_bit = uint64_t(1) << 52;
  const uint64_t sign_field = static_cast<uint64_t>(negative) << 63;
  const uint64_t exponent_field = real_exponent << 52;
  const uint64_t bits = (mantissa & ~implicit_bit) | exponent_field | sign_field;
  double result;
  std::memcpy(&result, &bits, sizeof(result));
  return result;
}
45 
46 // Attempts to compute i * 10^(power) exactly; and if "negative" is
47 // true, negate the result. On success the value is stored in d and true is returned.
48 // The only failure case (false is returned, d is left untouched) is a result that is
49 // too large to be a finite double, i.e., the computed binary exponent exceeds 2046.
50 // We assume that power is in the [smallest_power,
51 // largest_power] interval: the caller is responsible for this check.
52 simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
53  // we start with a fast path
54  // It was described in
55  // Clinger WD. How to read floating point numbers accurately.
56  // ACM SIGPLAN Notices. 1990
57 #ifndef FLT_EVAL_METHOD
58 #error "FLT_EVAL_METHOD should be defined, please include cfloat."
59 #endif
60 #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
61  // We cannot be certain that x/y is rounded to nearest.
62  if (0 <= power && power <= 22 && i <= 9007199254740991)
63 #else
64  if (-22 <= power && power <= 22 && i <= 9007199254740991)
65 #endif
66  {
67  // convert the integer into a double. This is lossless since
68  // 0 <= i <= 2^53 - 1.
69  d = double(i);
70  //
71  // The general idea is as follows.
72  // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
73  // 1) Both s and p can be represented exactly as 64-bit floating-point
74  // values
75  // (binary64).
76  // 2) Because s and p can be represented exactly as floating-point values,
77  // then s * p
78  // and s / p will produce correctly rounded values.
79  //
80  if (power < 0) {
81  d = d / simdjson::internal::power_of_ten[-power];
82  } else {
83  d = d * simdjson::internal::power_of_ten[power];
84  }
85  if (negative) {
86  d = -d;
87  }
88  return true;
89  }
90  // When 22 < power && power < 22 + 16, we could
91  // hope for another, secondary fast path. It was
92  // described by David M. Gay in "Correctly rounded
93  // binary-decimal and decimal-binary conversions." (1990)
94  // If you need to compute i * 10^(22 + x) for x < 16,
95  // first compute i * 10^x, if you know that result is exact
96  // (e.g., when i * 10^x < 2^53),
97  // then you can still proceed and do (i * 10^x) * 10^22.
98  // Is this worth your time?
99  // You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
100  // for this second fast path to work.
101  // If you have 22 < power *and* power < 22 + 16, and then you
102  // optimistically compute "i * 10^(x-22)", there is still a chance that you
103  // have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
104  // this optimization maybe less common than we would like. Source:
105  // http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
106  // also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
107 
108  // The fast path has now failed, so we are falling back on the slower path.
109 
110  // In the slow path, we need to adjust i so that it is >= 1<<63 which is always
111  // possible, except if i == 0, so we handle i == 0 separately.
112  if(i == 0) {
113  d = negative ? -0.0 : 0.0;
114  return true;
115  }
116 
117 
118  // The exponent is 1024 + 63 + power
119  // + floor(log(5**power)/log(2)).
120  // The 1024 comes from the ieee64 standard.
121  // The 63 comes from the fact that we use a 64-bit word.
122  //
123  // Computing floor(log(5**power)/log(2)) could be
124  // slow. Instead we use a fast function.
125  //
126  // For power in (-400,350), we have that
127  // (((152170 + 65536) * power ) >> 16);
128  // is equal to
129  // floor(log(5**power)/log(2)) + power when power >= 0
130  // and it is equal to
131  // ceil(log(5**-power)/log(2)) + power when power < 0
132  //
133  // The 65536 is (1<<16) and corresponds to
134  // (65536 * power) >> 16 ---> power
135  //
136  // ((152170 * power ) >> 16) is equal to
137  // floor(log(5**power)/log(2))
138  //
139  // Note that this is not magic: 152170/(1<<16) is
140  // approximately equal to log(5)/log(2).
141  // The 1<<16 value is a power of two; we could use a
142  // larger power of 2 if we wanted to.
143  //
144  int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
145 
146 
147  // We want the most significant bit of i to be 1. Shift if needed.
148  int lz = leading_zeroes(i);
149  i <<= lz;
150 
151 
152  // We are going to need to do some 64-bit arithmetic to get a precise product.
153  // We use a table lookup approach.
154  // It is safe because
155  // power >= smallest_power
156  // and power <= largest_power
157  // We recover the mantissa of the power, it has a leading 1. It is always
158  // rounded down.
159  //
160  // We want the most significant 64 bits of the product. We know
161  // this will be non-zero because the most significant bit of i is
162  // 1.
163  const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
164  // Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
165  //
166  // The full_multiplication function computes the 128-bit product of two 64-bit words
167  // with a returned value of type value128 with a "low component" corresponding to the
168  // 64-bit least significant bits of the product and with a "high component" corresponding
169  // to the 64-bit most significant bits of the product.
170  simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]);
171  // Both i and power_of_five_128[index] have their most significant bit set to 1 which
172  // implies that either the most or the second most significant bit of the product
173  // is 1. We pack values in this manner for efficiency reasons: it maximizes the use
174  // we make of the product. It also makes it easy to reason about the product: there
175  // is 0 or 1 leading zero in the product.
176 
177  // Unless the least significant 9 bits of the high (64-bit) part of the full
178  // product are all 1s, then we know that the most significant 55 bits are
179  // exact and no further work is needed. Having 55 bits is necessary because
180  // we need 53 bits for the mantissa but we have to have one rounding bit and
181  // we can waste a bit if the most significant bit of the product is zero.
182  if((firstproduct.high & 0x1FF) == 0x1FF) {
183  // We want to compute i * 5^q, but only care about the top 55 bits at most.
184  // Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
185  // the full computation is wasteful. So we do what is called a "truncated
186  // multiplication".
187  // We take the most significant 64-bits, and we put them in
188  // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
189  // to the desired approximation using one multiplication. Sometimes it does not suffice.
190  // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
191  // then we get a better approximation to i * 5^q.
192  //
193  // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
194  // more complicated.
195  //
196  // There is an extra layer of complexity in that we need more than 55 bits of
197  // accuracy in the round-to-even scenario.
198  //
199  // The full_multiplication function computes the 128-bit product of two 64-bit words
200  // with a returned value of type value128 with a "low component" corresponding to the
201  // 64-bit least significant bits of the product and with a "high component" corresponding
202  // to the 64-bit most significant bits of the product.
203  simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
204  firstproduct.low += secondproduct.high;
205  if(secondproduct.high > firstproduct.low) { firstproduct.high++; } // the addition above wrapped around: carry into the high word
206  // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
207  // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
208  // is sufficiently accurate, and more computation is not needed.
209  }
210  uint64_t lower = firstproduct.low;
211  uint64_t upper = firstproduct.high;
212  // The final mantissa should be 53 bits with a leading 1.
213  // We shift it so that it occupies 54 bits with a leading 1.
215  uint64_t upperbit = upper >> 63;
216  uint64_t mantissa = upper >> (upperbit + 9);
217  lz += int(1 ^ upperbit);
218 
219  // Here we have mantissa < (1<<54).
220  int64_t real_exponent = exponent - lz;
221  if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
222  // Here have that real_exponent <= 0 so -real_exponent >= 0
223  if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
224  d = negative ? -0.0 : 0.0;
225  return true;
226  }
227  // next line is safe because 0 < -real_exponent + 1 < 64
228  mantissa >>= -real_exponent + 1;
229  // Thankfully, we can't have both "round-to-even" and subnormals because
230  // "round-to-even" only occurs for powers close to 0.
231  mantissa += (mantissa & 1); // round up
232  mantissa >>= 1;
233  // There is a weird scenario where we don't have a subnormal but just.
234  // Suppose we start with 2.2250738585072013e-308, we end up
235  // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
236  // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
237  // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
238  // subnormal, but we can only know this after rounding.
239  // So we only declare a subnormal if we are smaller than the threshold.
240  real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
241  d = to_double(mantissa, real_exponent, negative);
242  return true;
243  }
244  // We have to round to even. The "to even" part
245  // is only a problem when we are right in between two floats
246  // which we guard against.
247  // If we have lots of trailing zeros, we may fall right between two
248  // floating-point values.
249  //
250  // The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
251  // times a power of two. That is, it is right between a number with binary significand
252  // m and another number with binary significand m+1; and it must be the case
253  // that it cannot be represented by a float itself.
254  //
255  // We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
256  // Recall that 10^q = 5^q * 2^q.
257  // When q >= 0, we must have that (2m+1) is divisible by 5^q, so 5^q <= 2^54. We have that
258  // 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
259  // When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
260  // (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
261  // 2^{53} x 5^{-q} < 2^{64}.
262  // Hence we have 5^{-q} < 2^{11} or q>= -4.
263  //
264  // We require lower <= 1 and not lower == 0 because we could not prove that
265  // that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
266  if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
267  if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
268  mantissa &= ~1; // flip it so that we do not round up
269  }
270  }
271 
272  mantissa += mantissa & 1;
273  mantissa >>= 1;
274 
275  // Here we have mantissa < (1<<53), unless there was an overflow
276  if (mantissa >= (1ULL << 53)) {
278  // This will happen when parsing values such as 7.2057594037927933e+16
280  mantissa = (1ULL << 52);
281  real_exponent++;
282  }
283  mantissa &= ~(1ULL << 52);
284  // we have to check that real_exponent is in range, otherwise we bail out
285  if (simdjson_unlikely(real_exponent > 2046)) {
286  // We have an infinite value!!! We could actually throw an error here if we could.
287  return false;
288  }
289  d = to_double(mantissa, real_exponent, negative);
290  return true;
291 }
292 
293 // We call a fallback floating-point parser that might be slow. Note
294 // it will accept JSON numbers, but the JSON spec. is more restrictive so
295 // before you call parse_float_fallback, you need to have validated the input
296 // string with the JSON grammar.
297 // It will return an error (false) if the parsed number is infinite.
298 // The string parsing itself always succeeds. We know that there is at least
299 // one digit.
300 static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
301  *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
302  // We do not accept infinite values.
303 
304  // Detecting finite values in a portable manner is ridiculously hard, ideally
305  // we would want to do:
306  // return !std::isfinite(*outDouble);
307  // but that mysteriously fails under legacy/old libc++ libraries, see
308  // https://github.com/simdjson/simdjson/issues/1286
309  //
310  // Therefore, fall back to this solution (the extra parens are there
311  // to handle that max may be a macro on windows).
312  return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
313 }
314 
315 static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
316  *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr), reinterpret_cast<const char *>(end_ptr));
317  // We do not accept infinite values.
318 
319  // Detecting finite values in a portable manner is ridiculously hard, ideally
320  // we would want to do:
321  // return !std::isfinite(*outDouble);
322  // but that mysteriously fails under legacy/old libc++ libraries, see
323  // https://github.com/simdjson/simdjson/issues/1286
324  //
325  // Therefore, fall back to this solution (the extra parens are there
326  // to handle that max may be a macro on windows).
327  return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
328 }
329 
// Check quickly whether the 8 bytes starting at chars are all ASCII digits.
// At a glance, it looks better than Mula's
// http://0x80.pl/articles/swar-digits-validate.html
//
// This can read up to 7 bytes beyond the buffer size, but we require
// SIMDJSON_PADDING of padding.
simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
  static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
  const uint64_t high_nibbles = 0xF0F0F0F0F0F0F0F0;
  uint64_t word;
  std::memcpy(&word, chars, sizeof(word));
  // A branchy method might be faster:
  //   ((word & high_nibbles) == 0x3030303030303030)
  //   && (((word + 0x0606060606060606) & high_nibbles) == 0x3030303030303030)
  // Instead, both high-nibble checks are packed into one word and compared once:
  // each byte must have high nibble 3 both before and after adding 6.
  const uint64_t before_add = word & high_nibbles;
  const uint64_t after_add = ((word + 0x0606060606060606) & high_nibbles) >> 4;
  return (before_add | after_add) == 0x3333333333333333;
}
347 
// If c is an ASCII digit, append it to the accumulator (i = 10 * i + digit)
// and return true; otherwise leave i untouched and return false.
// The accumulation might overflow: we deliberately allow it here and the
// callers handle the overflow later.
template<typename I>
SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
simdjson_inline bool parse_digit(const uint8_t c, I &i) {
  // For non-digits the subtraction wraps around to a value above 9.
  const uint8_t value = static_cast<uint8_t>(c - '0');
  const bool accepted = (value <= 9);
  if (accepted) {
    // PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
    i = 10 * i + value;
  }
  return accepted;
}
359 
// Returns true exactly when c is one of the ASCII digits '0'..'9'.
simdjson_inline bool is_digit(const uint8_t c) {
  return (c >= '0') && (c <= '9');
}
363 
364 simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
365  // we continue with the fiction that we have an integer. If the
366  // floating point number is representable as x * 10^z for some integer
367  // z that fits in 53 bits, then we will be able to convert back the
368  // the integer into a float in a lossless manner.
369  const uint8_t *const first_after_period = p;
370 
371 #ifdef SIMDJSON_SWAR_NUMBER_PARSING
372 #if SIMDJSON_SWAR_NUMBER_PARSING
373  // this helps if we have lots of decimals!
374  // this turns out to be frequent enough.
375  if (is_made_of_eight_digits_fast(p)) {
376  i = i * 100000000 + parse_eight_digits_unrolled(p);
377  p += 8;
378  }
379 #endif // SIMDJSON_SWAR_NUMBER_PARSING
380 #endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING
381  // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
382  if (parse_digit(*p, i)) { ++p; }
383  while (parse_digit(*p, i)) { p++; }
384  exponent = first_after_period - p;
385  // Decimal without digits (123.) is illegal
386  if (exponent == 0) {
387  return INVALID_NUMBER(src);
388  }
389  return SUCCESS;
390 }
391 
392 simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
393  // Exp Sign: -123.456e[-]78
394  bool neg_exp = ('-' == *p);
395  if (neg_exp || '+' == *p) { p++; } // Skip + as well
396 
397  // Exponent: -123.456e-[78]
398  auto start_exp = p;
399  int64_t exp_number = 0;
400  while (parse_digit(*p, exp_number)) { ++p; }
401  // It is possible for parse_digit to overflow.
402  // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
403  // Thus we *must* check for possible overflow before we negate exp_number.
404 
405  // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
406  // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
407  // not oblige and may, in fact, generate two distinct paths in any case. It might be
408  // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
409  // instructions for a simdjson_likely branch, an unconclusive gain.
410 
411  // If there were no digits, it's an error.
412  if (simdjson_unlikely(p == start_exp)) {
413  return INVALID_NUMBER(src);
414  }
415  // We have a valid positive exponent in exp_number at this point, except that
416  // it may have overflowed.
417 
418  // If there were more than 18 digits, we may have overflowed the integer. We have to do
419  // something!!!!
420  if (simdjson_unlikely(p > start_exp+18)) {
421  // Skip leading zeroes: 1e000000000000000000001 is technically valid and does not overflow
422  while (*start_exp == '0') { start_exp++; }
423  // 19 digits could overflow int64_t and is kind of absurd anyway. We don't
424  // support exponents smaller than -999,999,999,999,999,999 and bigger
425  // than 999,999,999,999,999,999.
426  // We can truncate.
427  // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before
428  // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could
429  // truncate at 324.
430  // Note that there is no reason to fail per se at this point in time.
431  // E.g., 0e999999999999999999999 is a fine number.
432  if (p > start_exp+18) { exp_number = 999999999999999999; }
433  }
434  // At this point, we know that exp_number is a sane, positive, signed integer.
435  // It is <= 999,999,999,999,999,999. As long as 'exponent' is in
436  // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent'
437  // is bounded in magnitude by the size of the JSON input, we are fine in this universe.
438  // To sum it up: the next line should never overflow.
439  exponent += (neg_exp ? -exp_number : exp_number);
440  return SUCCESS;
441 }
442 
443 simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) {
444  const uint8_t *const srcend = src + max_length;
445  bool negative = (*src == '-'); // we can always read at least one character after the '-'
446  const uint8_t *p = src + uint8_t(negative);
447  if(p == srcend) { return false; }
448  if(*p == '0') {
449  ++p;
450  if(p == srcend) { return true; }
451  if(jsoncharutils::is_not_structural_or_whitespace(*p)) { return false; }
452  return true;
453  }
454  while(p != srcend && is_digit(*p)) { ++p; }
455  if(p == srcend) { return true; }
456  if(jsoncharutils::is_not_structural_or_whitespace(*p)) { return false; }
457  return true;
458 }
459 
// Returns digit_count minus the number of leading '0' and '.' characters found
// at start_digits.
// It is possible that the integer had an overflow, and we have to handle the
// case where we have 0.0000somenumber.
// Note: we over-decrement by one when there is a '.'.
simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
  size_t skipped = 0;
  while ((start_digits[skipped] == '0') || (start_digits[skipped] == '.')) { ++skipped; }
  return digit_count - skipped;
}
468 
469 } // unnamed namespace
470 
472 static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) {
473  if (parse_float_fallback(src, answer)) {
474  return SUCCESS;
475  }
476  return INVALID_NUMBER(src);
477 }
478 
480 template<typename W>
481 simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
482  // If we frequently had to deal with long strings of digits,
483  // we could extend our code by using a 128-bit integer instead
484  // of a 64-bit integer. However, this is uncommon in practice.
485  //
486  // 9999999999999999999 < 2**64 so we can accommodate 19 digits.
487  // If we have a decimal separator, then digit_count - 1 is the number of digits, but we
488  // may not have a decimal separator!
489  if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
490  // Ok, chances are good that we had an overflow!
491  // this is almost never going to get called!!!
492  // we start anew, going slowly!!!
493  // This will happen in the following examples:
494  // 10000000000000000000000000000000000000000000e+308
495  // 3.1415926535897932384626433832795028841971693993751
496  //
497  // NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer
498  // reference to it, it would force it to be stored in memory, preventing the compiler from
499  // picking it apart and putting into registers. i.e. if we pass it as reference,
500  // it gets slow.
501  double d;
502  error_code error = slow_float_parsing(src, &d);
503  writer.append_double(d);
504  return error;
505  }
506  // NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
507  // way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
508  // To future reader: we'd love if someone found a better way, or at least could explain this result!
509  if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
510  //
511  // Important: smallest_power is such that it leads to a zero value.
512  // Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
513  // so something x 10^-343 goes to zero, but not so with something x 10^-342.
514  static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
515  //
516  if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
517  // E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero
518  WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer);
519  return SUCCESS;
520  } else { // (exponent > largest_power) and (i != 0)
521  // We have, for sure, an infinite value and simdjson refuses to parse infinite values.
522  return INVALID_NUMBER(src);
523  }
524  }
525  double d;
526  if (!compute_float_64(exponent, i, negative, d)) {
527  // we are almost never going to get here.
528  if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
529  }
530  WRITE_DOUBLE(d, src, writer);
531  return SUCCESS;
532 }
533 
534 // Forward declaration: parse the number at src and append the result to writer.
535 // Define JSON_TEST_NUMBERS for unit testing.
536 //
537 // It is assumed that the number is followed by a structural ({,},],[) character
538 // or a white space character. If that is not the case (e.g., when the JSON
539 // document is made of a single number), then it is necessary to copy the
540 // content and append a space before calling this function.
541 //
542 // Our objective is accurate parsing (ULP of 0) at high speed.
543 template<typename W>
544 simdjson_inline error_code parse_number(const uint8_t *const src, W &writer);
545 
546 // for performance analysis, it is sometimes useful to skip parsing
547 #ifdef SIMDJSON_SKIPNUMBERPARSING
548 
549 template<typename W>
550 simdjson_inline error_code parse_number(const uint8_t *const, W &writer) {
551  writer.append_s64(0); // always write zero
552  return SUCCESS; // always succeeds
553 }
554 
555 simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept { return 0; }
556 simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept { return 0; }
557 simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept { return 0; }
558 simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; }
559 simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const src) noexcept { return 0; }
560 simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const src) noexcept { return 0; }
561 simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; }
562 simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept { return false; }
563 simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept { return number_type::signed_integer; }
564 #else
565 
566 // parse the number at src
567 // define JSON_TEST_NUMBERS for unit testing
568 //
569 // It is assumed that the number is followed by a structural ({,},],[) character
570 // or a white space character. If that is not the case (e.g., when the JSON
571 // document is made of a single number), then it is necessary to copy the
572 // content and append a space before calling this function.
573 //
574 // Our objective is accurate parsing (ULP of 0) at high speed.
575 template<typename W>
576 simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) {
577  //
578  // Check for minus sign
579  //
580  bool negative = (*src == '-');
581  const uint8_t *p = src + uint8_t(negative);
582 
583  //
584  // Parse the integer part.
585  //
586  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
587  const uint8_t *const start_digits = p;
588  uint64_t i = 0;
589  while (parse_digit(*p, i)) { p++; }
590 
591  // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
592  // Optimization note: size_t is expected to be unsigned.
593  size_t digit_count = size_t(p - start_digits);
594  if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
595 
596  //
597  // Handle floats if there is a . or e (or both)
598  //
599  int64_t exponent = 0;
600  bool is_float = false;
601  if ('.' == *p) {
602  is_float = true;
603  ++p;
604  SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) );
605  digit_count = int(p - start_digits); // used later to guard against overflows
606  }
607  if (('e' == *p) || ('E' == *p)) {
608  is_float = true;
609  ++p;
610  SIMDJSON_TRY( parse_exponent(src, p, exponent) );
611  }
612  if (is_float) {
613  const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
614  SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
615  if (dirty_end) { return INVALID_NUMBER(src); }
616  return SUCCESS;
617  }
618 
619  // The longest negative 64-bit number is 19 digits.
620  // The longest positive 64-bit number is 20 digits.
621  // We do it this way so we don't trigger this branch unless we must.
622  size_t longest_digit_count = negative ? 19 : 20;
623  if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); }
624  if (digit_count == longest_digit_count) {
625  if (negative) {
626  // Anything negative above INT64_MAX+1 is invalid
627  if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); }
628  WRITE_INTEGER(~i+1, src, writer);
629  if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
630  return SUCCESS;
631  // Positive overflow check:
632  // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
633  // biggest uint64_t.
634  // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
635  // If we got here, it's a 20 digit number starting with the digit "1".
636  // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
637  // than 1,553,255,926,290,448,384.
638  // - That is smaller than the smallest possible 20-digit number the user could write:
639  // 10,000,000,000,000,000,000.
640  // - Therefore, if the number is positive and lower than that, it's overflow.
641  // - The value we are looking at is less than or equal to INT64_MAX.
642  //
643  } else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
644  }
645 
646  // Write unsigned if it does not fit in a signed integer.
647  if (i > uint64_t(INT64_MAX)) {
648  WRITE_UNSIGNED(i, src, writer);
649  } else {
650  WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
651  }
652  if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
653  return SUCCESS;
654 }
655 
656 // Inlineable functions
657 namespace {
658 
659 // This table can be used to characterize the final character of an integer
660 // string. For JSON structural character and allowable white space characters,
661 // we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
662 // we return NUMBER_ERROR.
663 // Optimization note: we could easily reduce the size of the table by half (to 128)
664 // at the cost of an extra branch.
665 // Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
666 static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
667 static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
668 static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
669 
670 const uint8_t integer_string_finisher[256] = {
722  NUMBER_ERROR};
723 
724 // Parse any number from 0 to 18,446,744,073,709,551,615
725 simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
726  const uint8_t *p = src;
727  //
728  // Parse the integer part.
729  //
730  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
731  const uint8_t *const start_digits = p;
732  uint64_t i = 0;
733  while (parse_digit(*p, i)) { p++; }
734 
735  // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
736  // Optimization note: size_t is expected to be unsigned.
737  size_t digit_count = size_t(p - start_digits);
738  // The longest positive 64-bit number is 20 digits.
739  // We do it this way so we don't trigger this branch unless we must.
740  // Optimization note: the compiler can probably merge
741  // ((digit_count == 0) || (digit_count > 20))
742  // into a single branch since digit_count is unsigned.
743  if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
744  // Here digit_count > 0.
745  if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
746  // We can do the following...
747  // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
748  // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
749  // }
750  // as a single table lookup:
751  if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
752 
753  if (digit_count == 20) {
754  // Positive overflow check:
755  // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
756  // biggest uint64_t.
757  // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
758  // If we got here, it's a 20 digit number starting with the digit "1".
759  // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
760  // than 1,553,255,926,290,448,384.
761  // - That is smaller than the smallest possible 20-digit number the user could write:
762  // 10,000,000,000,000,000,000.
763  // - Therefore, if the number is positive and lower than that, it's overflow.
764  // - The value we are looking at is less than or equal to INT64_MAX.
765  //
766  if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
767  }
768 
769  return i;
770 }
771 
772 
773 // Parse any number from 0 to 18,446,744,073,709,551,615
774 // Never read at src_end or beyond
775 simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept {
776  const uint8_t *p = src;
777  //
778  // Parse the integer part.
779  //
780  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
781  const uint8_t *const start_digits = p;
782  uint64_t i = 0;
783  while ((p != src_end) && parse_digit(*p, i)) { p++; }
784 
785  // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
786  // Optimization note: size_t is expected to be unsigned.
787  size_t digit_count = size_t(p - start_digits);
788  // The longest positive 64-bit number is 20 digits.
789  // We do it this way so we don't trigger this branch unless we must.
790  // Optimization note: the compiler can probably merge
791  // ((digit_count == 0) || (digit_count > 20))
792  // into a single branch since digit_count is unsigned.
793  if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
794  // Here digit_count > 0.
795  if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
796  // We can do the following...
797  // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
798  // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
799  // }
800  // as a single table lookup:
801  if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
802 
803  if (digit_count == 20) {
804  // Positive overflow check:
805  // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
806  // biggest uint64_t.
807  // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
808  // If we got here, it's a 20 digit number starting with the digit "1".
809  // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
810  // than 1,553,255,926,290,448,384.
811  // - That is smaller than the smallest possible 20-digit number the user could write:
812  // 10,000,000,000,000,000,000.
813  // - Therefore, if the number is positive and lower than that, it's overflow.
814  // - The value we are looking at is less than or equal to INT64_MAX.
815  //
816  if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
817  }
818 
819  return i;
820 }
821 
822 // Parse any number from 0 to 18,446,744,073,709,551,615
823 simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
824  const uint8_t *p = src + 1;
825  //
826  // Parse the integer part.
827  //
828  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
829  const uint8_t *const start_digits = p;
830  uint64_t i = 0;
831  while (parse_digit(*p, i)) { p++; }
832 
833  // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
834  // Optimization note: size_t is expected to be unsigned.
835  size_t digit_count = size_t(p - start_digits);
836  // The longest positive 64-bit number is 20 digits.
837  // We do it this way so we don't trigger this branch unless we must.
838  // Optimization note: the compiler can probably merge
839  // ((digit_count == 0) || (digit_count > 20))
840  // into a single branch since digit_count is unsigned.
841  if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
842  // Here digit_count > 0.
843  if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
844  // We can do the following...
845  // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
846  // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
847  // }
848  // as a single table lookup:
849  if (*p != '"') { return NUMBER_ERROR; }
850 
851  if (digit_count == 20) {
852  // Positive overflow check:
853  // - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
854  // biggest uint64_t.
855  // - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
856  // If we got here, it's a 20 digit number starting with the digit "1".
857  // - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
858  // than 1,553,255,926,290,448,384.
859  // - That is smaller than the smallest possible 20-digit number the user could write:
860  // 10,000,000,000,000,000,000.
861  // - Therefore, if the number is positive and lower than that, it's overflow.
862  // - The value we are looking at is less than or equal to INT64_MAX.
863  //
864  // Note: we use src[1] and not src[0] because src[0] is the quote character in this
865  // instance.
866  if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
867  }
868 
869  return i;
870 }
871 
872 // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
873 simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
874  //
875  // Check for minus sign
876  //
877  bool negative = (*src == '-');
878  const uint8_t *p = src + uint8_t(negative);
879 
880  //
881  // Parse the integer part.
882  //
883  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
884  const uint8_t *const start_digits = p;
885  uint64_t i = 0;
886  while (parse_digit(*p, i)) { p++; }
887 
888  // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
889  // Optimization note: size_t is expected to be unsigned.
890  size_t digit_count = size_t(p - start_digits);
891  // We go from
892  // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
893  // so we can never represent numbers that have more than 19 digits.
894  size_t longest_digit_count = 19;
895  // Optimization note: the compiler can probably merge
896  // ((digit_count == 0) || (digit_count > longest_digit_count))
897  // into a single branch since digit_count is unsigned.
898  if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
899  // Here digit_count > 0.
900  if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
901  // We can do the following...
902  // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
903  // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
904  // }
905  // as a single table lookup:
906  if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
907  // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
908  // Performance note: This check is only needed when digit_count == longest_digit_count but it is
909  // so cheap that we might as well always make it.
910  if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
911  return negative ? (~i+1) : i;
912 }
913 
914 // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
915 // Never read at src_end or beyond
916 simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept {
917  //
918  // Check for minus sign
919  //
920  if(src == src_end) { return NUMBER_ERROR; }
921  bool negative = (*src == '-');
922  const uint8_t *p = src + uint8_t(negative);
923 
924  //
925  // Parse the integer part.
926  //
927  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
928  const uint8_t *const start_digits = p;
929  uint64_t i = 0;
930  while ((p != src_end) && parse_digit(*p, i)) { p++; }
931 
932  // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
933  // Optimization note: size_t is expected to be unsigned.
934  size_t digit_count = size_t(p - start_digits);
935  // We go from
936  // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
937  // so we can never represent numbers that have more than 19 digits.
938  size_t longest_digit_count = 19;
939  // Optimization note: the compiler can probably merge
940  // ((digit_count == 0) || (digit_count > longest_digit_count))
941  // into a single branch since digit_count is unsigned.
942  if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
943  // Here digit_count > 0.
944  if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
945  // We can do the following...
946  // if (!jsoncharutils::is_structural_or_whitespace(*p)) {
947  // return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
948  // }
949  // as a single table lookup:
950  if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
951  // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
952  // Performance note: This check is only needed when digit_count == longest_digit_count but it is
953  // so cheap that we might as well always make it.
954  if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
955  return negative ? (~i+1) : i;
956 }
957 
958 // Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
959 simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept {
960  //
961  // Check for minus sign
962  //
963  bool negative = (*(src + 1) == '-');
964  src += uint8_t(negative) + 1;
965 
966  //
967  // Parse the integer part.
968  //
969  // PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
970  const uint8_t *const start_digits = src;
971  uint64_t i = 0;
972  while (parse_digit(*src, i)) { src++; }
973 
974  // If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
975  // Optimization note: size_t is expected to be unsigned.
976  size_t digit_count = size_t(src - start_digits);
977  // We go from
978  // -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
979  // so we can never represent numbers that have more than 19 digits.
980  size_t longest_digit_count = 19;
981  // Optimization note: the compiler can probably merge
982  // ((digit_count == 0) || (digit_count > longest_digit_count))
983  // into a single branch since digit_count is unsigned.
984  if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
985  // Here digit_count > 0.
986  if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
987  // We can do the following...
988  // if (!jsoncharutils::is_structural_or_whitespace(*src)) {
989  // return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
990  // }
991  // as a single table lookup:
992  if(*src != '"') { return NUMBER_ERROR; }
993  // Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
994  // Performance note: This check is only needed when digit_count == longest_digit_count but it is
995  // so cheap that we might as well always make it.
996  if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
997  return negative ? (~i+1) : i;
998 }
999 
1000 simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
1001  //
1002  // Check for minus sign
1003  //
1004  bool negative = (*src == '-');
1005  src += uint8_t(negative);
1006 
1007  //
1008  // Parse the integer part.
1009  //
1010  uint64_t i = 0;
1011  const uint8_t *p = src;
1012  p += parse_digit(*p, i);
1013  bool leading_zero = (i == 0);
1014  while (parse_digit(*p, i)) { p++; }
1015  // no integer digits, or 0123 (zero must be solo)
1016  if ( p == src ) { return INCORRECT_TYPE; }
1017  if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
1018 
1019  //
1020  // Parse the decimal part.
1021  //
1022  int64_t exponent = 0;
1023  bool overflow;
1024  if (simdjson_likely(*p == '.')) {
1025  p++;
1026  const uint8_t *start_decimal_digits = p;
1027  if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
1028  p++;
1029  while (parse_digit(*p, i)) { p++; }
1030  exponent = -(p - start_decimal_digits);
1031 
1032  // Overflow check. More than 19 digits (minus the decimal) may be overflow.
1033  overflow = p-src-1 > 19;
1034  if (simdjson_unlikely(overflow && leading_zero)) {
1035  // Skip leading 0.00000 and see if it still overflows
1036  const uint8_t *start_digits = src + 2;
1037  while (*start_digits == '0') { start_digits++; }
1038  overflow = p-start_digits > 19;
1039  }
1040  } else {
1041  overflow = p-src > 19;
1042  }
1043 
1044  //
1045  // Parse the exponent
1046  //
1047  if (*p == 'e' || *p == 'E') {
1048  p++;
1049  bool exp_neg = *p == '-';
1050  p += exp_neg || *p == '+';
1051 
1052  uint64_t exp = 0;
1053  const uint8_t *start_exp_digits = p;
1054  while (parse_digit(*p, exp)) { p++; }
1055  // no exp digits, or 20+ exp digits
1056  if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
1057 
1058  exponent += exp_neg ? 0-exp : exp;
1059  }
1060 
1061  if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
1062 
1063  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
1064 
1065  //
1066  // Assemble (or slow-parse) the float
1067  //
1068  double d;
1069  if (simdjson_likely(!overflow)) {
1070  if (compute_float_64(exponent, i, negative, d)) { return d; }
1071  }
1072  if (!parse_float_fallback(src - uint8_t(negative), &d)) {
1073  return NUMBER_ERROR;
1074  }
1075  return d;
1076 }
1077 
1078 simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
1079  return (*src == '-');
1080 }
1081 
1082 simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
1083  bool negative = (*src == '-');
1084  src += uint8_t(negative);
1085  const uint8_t *p = src;
1086  while(static_cast<uint8_t>(*p - '0') <= 9) { p++; }
1087  if ( p == src ) { return NUMBER_ERROR; }
1088  if (jsoncharutils::is_structural_or_whitespace(*p)) { return true; }
1089  return false;
1090 }
1091 
1092 simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
1093  bool negative = (*src == '-');
1094  src += uint8_t(negative);
1095  const uint8_t *p = src;
1096  while(static_cast<uint8_t>(*p - '0') <= 9) { p++; }
1097  size_t digit_count = size_t(p - src);
1098  if ( p == src ) { return NUMBER_ERROR; }
1099  if (jsoncharutils::is_structural_or_whitespace(*p)) {
1100  static const uint8_t * smaller_big_integer = reinterpret_cast<const uint8_t *>("9223372036854775808");
1101  // We have an integer.
1102  if(simdjson_unlikely(digit_count > 20)) {
1103  return number_type::big_integer;
1104  }
1105  // If the number is negative and valid, it must be a signed integer.
1106  if(negative) {
1107  if (simdjson_unlikely(digit_count > 19)) return number_type::big_integer;
1108  if (simdjson_unlikely(digit_count == 19 && memcmp(src, smaller_big_integer, 19) > 0)) {
1109  return number_type::big_integer;
1110  }
1111  return number_type::signed_integer;
1112  }
1113  // Let us check if we have a big integer (>=2**64).
1114  static const uint8_t * two_to_sixtyfour = reinterpret_cast<const uint8_t *>("18446744073709551616");
1115  if((digit_count > 20) || (digit_count == 20 && memcmp(src, two_to_sixtyfour, 20) >= 0)) {
1116  return number_type::big_integer;
1117  }
1118  // The number is positive and smaller than 18446744073709551616 (or 2**64).
1119  // We want values larger or equal to 9223372036854775808 to be unsigned
1120  // integers, and the other values to be signed integers.
1121  if((digit_count == 20) || (digit_count >= 19 && memcmp(src, smaller_big_integer, 19) >= 0)) {
1122  return number_type::unsigned_integer;
1123  }
1124  return number_type::signed_integer;
1125  }
1126  // Hopefully, we have 'e' or 'E' or '.'.
1127  return number_type::floating_point_number;
1128 }
1129 
1130 // Never read at src_end or beyond
1131 simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept {
1132  if(src == src_end) { return NUMBER_ERROR; }
1133  //
1134  // Check for minus sign
1135  //
1136  bool negative = (*src == '-');
1137  src += uint8_t(negative);
1138 
1139  //
1140  // Parse the integer part.
1141  //
1142  uint64_t i = 0;
1143  const uint8_t *p = src;
1144  if(p == src_end) { return NUMBER_ERROR; }
1145  p += parse_digit(*p, i);
1146  bool leading_zero = (i == 0);
1147  while ((p != src_end) && parse_digit(*p, i)) { p++; }
1148  // no integer digits, or 0123 (zero must be solo)
1149  if ( p == src ) { return INCORRECT_TYPE; }
1150  if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
1151 
1152  //
1153  // Parse the decimal part.
1154  //
1155  int64_t exponent = 0;
1156  bool overflow;
1157  if (simdjson_likely((p != src_end) && (*p == '.'))) {
1158  p++;
1159  const uint8_t *start_decimal_digits = p;
1160  if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
1161  p++;
1162  while ((p != src_end) && parse_digit(*p, i)) { p++; }
1163  exponent = -(p - start_decimal_digits);
1164 
1165  // Overflow check. More than 19 digits (minus the decimal) may be overflow.
1166  overflow = p-src-1 > 19;
1167  if (simdjson_unlikely(overflow && leading_zero)) {
1168  // Skip leading 0.00000 and see if it still overflows
1169  const uint8_t *start_digits = src + 2;
1170  while (*start_digits == '0') { start_digits++; }
1171  overflow = start_digits-src > 19;
1172  }
1173  } else {
1174  overflow = p-src > 19;
1175  }
1176 
1177  //
1178  // Parse the exponent
1179  //
1180  if ((p != src_end) && (*p == 'e' || *p == 'E')) {
1181  p++;
1182  if(p == src_end) { return NUMBER_ERROR; }
1183  bool exp_neg = *p == '-';
1184  p += exp_neg || *p == '+';
1185 
1186  uint64_t exp = 0;
1187  const uint8_t *start_exp_digits = p;
1188  while ((p != src_end) && parse_digit(*p, exp)) { p++; }
1189  // no exp digits, or 20+ exp digits
1190  if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
1191 
1192  exponent += exp_neg ? 0-exp : exp;
1193  }
1194 
1195  if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
1196 
1197  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
1198 
1199  //
1200  // Assemble (or slow-parse) the float
1201  //
1202  double d;
1203  if (simdjson_likely(!overflow)) {
1204  if (compute_float_64(exponent, i, negative, d)) { return d; }
1205  }
1206  if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) {
1207  return NUMBER_ERROR;
1208  }
1209  return d;
1210 }
1211 
1212 simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept {
1213  //
1214  // Check for minus sign
1215  //
1216  bool negative = (*(src + 1) == '-');
1217  src += uint8_t(negative) + 1;
1218 
1219  //
1220  // Parse the integer part.
1221  //
1222  uint64_t i = 0;
1223  const uint8_t *p = src;
1224  p += parse_digit(*p, i);
1225  bool leading_zero = (i == 0);
1226  while (parse_digit(*p, i)) { p++; }
1227  // no integer digits, or 0123 (zero must be solo)
1228  if ( p == src ) { return INCORRECT_TYPE; }
1229  if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
1230 
1231  //
1232  // Parse the decimal part.
1233  //
1234  int64_t exponent = 0;
1235  bool overflow;
1236  if (simdjson_likely(*p == '.')) {
1237  p++;
1238  const uint8_t *start_decimal_digits = p;
1239  if (!parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
1240  p++;
1241  while (parse_digit(*p, i)) { p++; }
1242  exponent = -(p - start_decimal_digits);
1243 
1244  // Overflow check. More than 19 digits (minus the decimal) may be overflow.
1245  overflow = p-src-1 > 19;
1246  if (simdjson_unlikely(overflow && leading_zero)) {
1247  // Skip leading 0.00000 and see if it still overflows
1248  const uint8_t *start_digits = src + 2;
1249  while (*start_digits == '0') { start_digits++; }
1250  overflow = p-start_digits > 19;
1251  }
1252  } else {
1253  overflow = p-src > 19;
1254  }
1255 
1256  //
1257  // Parse the exponent
1258  //
1259  if (*p == 'e' || *p == 'E') {
1260  p++;
1261  bool exp_neg = *p == '-';
1262  p += exp_neg || *p == '+';
1263 
1264  uint64_t exp = 0;
1265  const uint8_t *start_exp_digits = p;
1266  while (parse_digit(*p, exp)) { p++; }
1267  // no exp digits, or 20+ exp digits
1268  if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
1269 
1270  exponent += exp_neg ? 0-exp : exp;
1271  }
1272 
1273  if (*p != '"') { return NUMBER_ERROR; }
1274 
1275  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
1276 
1277  //
1278  // Assemble (or slow-parse) the float
1279  //
1280  double d;
1281  if (simdjson_likely(!overflow)) {
1282  if (compute_float_64(exponent, i, negative, d)) { return d; }
1283  }
1284  if (!parse_float_fallback(src - uint8_t(negative), &d)) {
1285  return NUMBER_ERROR;
1286  }
1287  return d;
1288 }
1289 
1290 } // unnamed namespace
1291 #endif // SIMDJSON_SKIPNUMBERPARSING
1292 
1293 } // namespace numberparsing
1294 
1295 inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept {
1296  switch (type) {
1297  case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break;
1298  case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break;
1299  case number_type::floating_point_number: out << "floating-point number (binary64)"; break;
1300  case number_type::big_integer: out << "big integer"; break;
1301  default: SIMDJSON_UNREACHABLE();
1302  }
1303  return out;
1304 }
1305 
1306 } // namespace SIMDJSON_IMPLEMENTATION
1307 } // namespace simdjson
1308 
1309 #endif // SIMDJSON_GENERIC_NUMBERPARSING_H
The top level simdjson namespace, containing everything the library provides.
Definition: base.h:8
std::ostream & operator<<(std::ostream &out, error_code error) noexcept
Write the error message to the output stream.
Definition: error-inl.h:35
error_code
All possible errors returned by simdjson.
Definition: error.h:19
@ INCORRECT_TYPE
JSON element has a different type than user expected.
Definition: error.h:37
@ SUCCESS
No error.
Definition: error.h:20
@ NUMBER_ERROR
Problem while parsing a number.
Definition: error.h:29
constexpr size_t SIMDJSON_PADDING
The amount of padding needed in a buffer to parse JSON.
Definition: base.h:32