blob: 8b11868c887af3ce9eb1732285a23336a97b8f95 [file] [log] [blame]
Austin Schuh36244a12019-09-21 17:52:38 -07001// Copyright 2018 The Abseil Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "absl/strings/internal/charconv_parse.h"
16#include "absl/strings/charconv.h"
17
18#include <cassert>
19#include <cstdint>
20#include <limits>
21
22#include "absl/strings/internal/memutil.h"
23
24namespace absl {
Austin Schuhb4691e92020-12-31 12:37:18 -080025ABSL_NAMESPACE_BEGIN
Austin Schuh36244a12019-09-21 17:52:38 -070026namespace {
27
28// ParseFloat<10> will read the first 19 significant digits of the mantissa.
29// This number was chosen for multiple reasons.
30//
31// (a) First, for whatever integer type we choose to represent the mantissa, we
32// want to choose the largest possible number of decimal digits for that integer
33// type. We are using uint64_t, which can express any 19-digit unsigned
34// integer.
35//
36// (b) Second, we need to parse enough digits that the binary value of any
37// mantissa we capture has more bits of resolution than the mantissa
38// representation in the target float. Our algorithm requires at least 3 bits
39// of headway, but 19 decimal digits give a little more than that.
40//
41// The following static assertions verify the above comments:
42constexpr int kDecimalMantissaDigitsMax = 19;
43
44static_assert(std::numeric_limits<uint64_t>::digits10 ==
45 kDecimalMantissaDigitsMax,
46 "(a) above");
47
48// IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa.
49static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed");
50static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact");
51static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact");
52
53// The lowest valued 19-digit decimal mantissa we can read still contains
54// sufficient information to reconstruct a binary mantissa.
55static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above");
56
57// ParseFloat<16> will read the first 15 significant digits of the mantissa.
58//
59// Because a base-16-to-base-2 conversion can be done exactly, we do not need
60// to maximize the number of scanned hex digits to improve our conversion. What
61// is required is to scan two more bits than the mantissa can represent, so that
62// we always round correctly.
63//
64// (One extra bit does not suffice to perform correct rounding, since a number
65// exactly halfway between two representable floats has unique rounding rules,
66// so we need to differentiate between a "halfway between" number and a "closer
67// to the larger value" number.)
68constexpr int kHexadecimalMantissaDigitsMax = 15;
69
70// The minimum number of significant bits that will be read from
71// kHexadecimalMantissaDigitsMax hex digits. We must subtract by three, since
72// the most significant digit can be a "1", which only contributes a single
73// significant bit.
74constexpr int kGuaranteedHexadecimalMantissaBitPrecision =
75 4 * kHexadecimalMantissaDigitsMax - 3;
76
77static_assert(kGuaranteedHexadecimalMantissaBitPrecision >
78 std::numeric_limits<double>::digits + 2,
79 "kHexadecimalMantissaDigitsMax too small");
80
81// We also impose a limit on the number of significant digits we will read from
82// an exponent, to avoid having to deal with integer overflow. We use 9 for
83// this purpose.
84//
85// If we read a 9 digit exponent, the end result of the conversion will
86// necessarily be infinity or zero, depending on the sign of the exponent.
87// Therefore we can just drop extra digits on the floor without any extra
88// logic.
89constexpr int kDecimalExponentDigitsMax = 9;
90static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax,
91 "int type too small");
92
93// To avoid incredibly large inputs causing integer overflow for our exponent,
94// we impose an arbitrary but very large limit on the number of significant
95// digits we will accept. The implementation refuses to match a string with
96// more consecutive significant mantissa digits than this.
97constexpr int kDecimalDigitLimit = 50000000;
98
99// Corresponding limit for hexadecimal digit inputs. This is one fourth the
100// amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires
101// a binary exponent adjustment of 4.
102constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4;
103
104// The largest exponent we can read is 999999999 (per
105// kDecimalExponentDigitsMax), and the largest exponent adjustment we can get
106// from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these
107// comfortably fits in an integer.
108//
109// We count kDecimalDigitLimit twice because there are independent limits for
110// numbers before and after the decimal point. (In the case where there are no
111// significant digits before the decimal point, there are independent limits for
112// post-decimal-point leading zeroes and for significant digits.)
113static_assert(999999999 + 2 * kDecimalDigitLimit <
114 std::numeric_limits<int>::max(),
115 "int type too small");
116static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) <
117 std::numeric_limits<int>::max(),
118 "int type too small");
119
120// Returns true if the provided bitfield allows parsing an exponent value
121// (e.g., "1.5e100").
122bool AllowExponent(chars_format flags) {
123 bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
124 bool scientific =
125 (flags & chars_format::scientific) == chars_format::scientific;
126 return scientific || !fixed;
127}
128
129// Returns true if the provided bitfield requires an exponent value be present.
130bool RequireExponent(chars_format flags) {
131 bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
132 bool scientific =
133 (flags & chars_format::scientific) == chars_format::scientific;
134 return scientific && !fixed;
135}
136
137const int8_t kAsciiToInt[256] = {
138 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
139 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
140 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
141 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
142 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
143 -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
144 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
145 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
146 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
147 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
148 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
149 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
150 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
151 -1, -1, -1, -1, -1, -1, -1, -1, -1};
152
153// Returns true if `ch` is a digit in the given base
154template <int base>
155bool IsDigit(char ch);
156
157// Converts a valid `ch` to its digit value in the given base.
158template <int base>
159unsigned ToDigit(char ch);
160
161// Returns true if `ch` is the exponent delimiter for the given base.
162template <int base>
163bool IsExponentCharacter(char ch);
164
165// Returns the maximum number of significant digits we will read for a float
166// in the given base.
167template <int base>
168constexpr int MantissaDigitsMax();
169
170// Returns the largest consecutive run of digits we will accept when parsing a
171// number in the given base.
172template <int base>
173constexpr int DigitLimit();
174
175// Returns the amount the exponent must be adjusted by for each dropped digit.
176// (For decimal this is 1, since the digits are in base 10 and the exponent base
177// is also 10, but for hexadecimal this is 4, since the digits are base 16 but
178// the exponent base is 2.)
179template <int base>
180constexpr int DigitMagnitude();
181
182template <>
183bool IsDigit<10>(char ch) {
184 return ch >= '0' && ch <= '9';
185}
186template <>
187bool IsDigit<16>(char ch) {
188 return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0;
189}
190
191template <>
192unsigned ToDigit<10>(char ch) {
193 return ch - '0';
194}
195template <>
196unsigned ToDigit<16>(char ch) {
197 return kAsciiToInt[static_cast<unsigned char>(ch)];
198}
199
200template <>
201bool IsExponentCharacter<10>(char ch) {
202 return ch == 'e' || ch == 'E';
203}
204
205template <>
206bool IsExponentCharacter<16>(char ch) {
207 return ch == 'p' || ch == 'P';
208}
209
210template <>
211constexpr int MantissaDigitsMax<10>() {
212 return kDecimalMantissaDigitsMax;
213}
214template <>
215constexpr int MantissaDigitsMax<16>() {
216 return kHexadecimalMantissaDigitsMax;
217}
218
219template <>
220constexpr int DigitLimit<10>() {
221 return kDecimalDigitLimit;
222}
223template <>
224constexpr int DigitLimit<16>() {
225 return kHexadecimalDigitLimit;
226}
227
228template <>
229constexpr int DigitMagnitude<10>() {
230 return 1;
231}
232template <>
233constexpr int DigitMagnitude<16>() {
234 return 4;
235}
236
237// Reads decimal digits from [begin, end) into *out. Returns the number of
238// digits consumed.
239//
240// After max_digits has been read, keeps consuming characters, but no longer
241// adjusts *out. If a nonzero digit is dropped this way, *dropped_nonzero_digit
242// is set; otherwise, it is left unmodified.
243//
244// If no digits are matched, returns 0 and leaves *out unchanged.
245//
246// ConsumeDigits does not protect against overflow on *out; max_digits must
247// be chosen with respect to type T to avoid the possibility of overflow.
248template <int base, typename T>
Austin Schuhb4691e92020-12-31 12:37:18 -0800249int ConsumeDigits(const char* begin, const char* end, int max_digits, T* out,
250 bool* dropped_nonzero_digit) {
Austin Schuh36244a12019-09-21 17:52:38 -0700251 if (base == 10) {
252 assert(max_digits <= std::numeric_limits<T>::digits10);
253 } else if (base == 16) {
254 assert(max_digits * 4 <= std::numeric_limits<T>::digits);
255 }
256 const char* const original_begin = begin;
Austin Schuhb4691e92020-12-31 12:37:18 -0800257
258 // Skip leading zeros, but only if *out is zero.
259 // They don't cause an overflow so we don't have to count them for
260 // `max_digits`.
261 while (!*out && end != begin && *begin == '0') ++begin;
262
Austin Schuh36244a12019-09-21 17:52:38 -0700263 T accumulator = *out;
264 const char* significant_digits_end =
265 (end - begin > max_digits) ? begin + max_digits : end;
266 while (begin < significant_digits_end && IsDigit<base>(*begin)) {
267 // Do not guard against *out overflow; max_digits was chosen to avoid this.
268 // Do assert against it, to detect problems in debug builds.
269 auto digit = static_cast<T>(ToDigit<base>(*begin));
270 assert(accumulator * base >= accumulator);
271 accumulator *= base;
272 assert(accumulator + digit >= accumulator);
273 accumulator += digit;
274 ++begin;
275 }
276 bool dropped_nonzero = false;
277 while (begin < end && IsDigit<base>(*begin)) {
278 dropped_nonzero = dropped_nonzero || (*begin != '0');
279 ++begin;
280 }
281 if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
282 *dropped_nonzero_digit = true;
283 }
284 *out = accumulator;
Austin Schuhb4691e92020-12-31 12:37:18 -0800285 return static_cast<int>(begin - original_begin);
Austin Schuh36244a12019-09-21 17:52:38 -0700286}
287
288// Returns true if `v` is one of the chars allowed inside parentheses following
289// a NaN.
290bool IsNanChar(char v) {
291 return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') ||
292 (v >= 'A' && v <= 'Z');
293}
294
295// Checks the range [begin, end) for a strtod()-formatted infinity or NaN. If
296// one is found, sets `out` appropriately and returns true.
297bool ParseInfinityOrNan(const char* begin, const char* end,
298 strings_internal::ParsedFloat* out) {
299 if (end - begin < 3) {
300 return false;
301 }
302 switch (*begin) {
303 case 'i':
304 case 'I': {
Austin Schuhb4691e92020-12-31 12:37:18 -0800305 // An infinity string consists of the characters "inf" or "infinity",
Austin Schuh36244a12019-09-21 17:52:38 -0700306 // case insensitive.
307 if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) {
308 return false;
309 }
310 out->type = strings_internal::FloatType::kInfinity;
311 if (end - begin >= 8 &&
312 strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) {
313 out->end = begin + 8;
314 } else {
315 out->end = begin + 3;
316 }
317 return true;
318 }
319 case 'n':
320 case 'N': {
321 // A NaN consists of the characters "nan", case insensitive, optionally
322 // followed by a parenthesized sequence of zero or more alphanumeric
323 // characters and/or underscores.
324 if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) {
325 return false;
326 }
327 out->type = strings_internal::FloatType::kNan;
328 out->end = begin + 3;
Austin Schuhb4691e92020-12-31 12:37:18 -0800329 // NaN is allowed to be followed by a parenthesized string, consisting of
Austin Schuh36244a12019-09-21 17:52:38 -0700330 // only the characters [a-zA-Z0-9_]. Match that if it's present.
331 begin += 3;
332 if (begin < end && *begin == '(') {
333 const char* nan_begin = begin + 1;
334 while (nan_begin < end && IsNanChar(*nan_begin)) {
335 ++nan_begin;
336 }
337 if (nan_begin < end && *nan_begin == ')') {
338 // We found an extra NaN specifier range
339 out->subrange_begin = begin + 1;
340 out->subrange_end = nan_begin;
341 out->end = nan_begin + 1;
342 }
343 }
344 return true;
345 }
346 default:
347 return false;
348 }
349}
350} // namespace
351
352namespace strings_internal {
353
354template <int base>
355strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end,
356 chars_format format_flags) {
357 strings_internal::ParsedFloat result;
358
359 // Exit early if we're given an empty range.
360 if (begin == end) return result;
361
362 // Handle the infinity and NaN cases.
363 if (ParseInfinityOrNan(begin, end, &result)) {
364 return result;
365 }
366
367 const char* const mantissa_begin = begin;
368 while (begin < end && *begin == '0') {
369 ++begin; // skip leading zeros
370 }
371 uint64_t mantissa = 0;
372
373 int exponent_adjustment = 0;
374 bool mantissa_is_inexact = false;
Austin Schuhb4691e92020-12-31 12:37:18 -0800375 int pre_decimal_digits = ConsumeDigits<base>(
Austin Schuh36244a12019-09-21 17:52:38 -0700376 begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
377 begin += pre_decimal_digits;
378 int digits_left;
379 if (pre_decimal_digits >= DigitLimit<base>()) {
380 // refuse to parse pathological inputs
381 return result;
382 } else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
383 // We dropped some non-fraction digits on the floor. Adjust our exponent
384 // to compensate.
385 exponent_adjustment =
386 static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
387 digits_left = 0;
388 } else {
389 digits_left =
390 static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
391 }
392 if (begin < end && *begin == '.') {
393 ++begin;
394 if (mantissa == 0) {
395 // If we haven't seen any nonzero digits yet, keep skipping zeros. We
396 // have to adjust the exponent to reflect the changed place value.
397 const char* begin_zeros = begin;
398 while (begin < end && *begin == '0') {
399 ++begin;
400 }
Austin Schuhb4691e92020-12-31 12:37:18 -0800401 int zeros_skipped = static_cast<int>(begin - begin_zeros);
Austin Schuh36244a12019-09-21 17:52:38 -0700402 if (zeros_skipped >= DigitLimit<base>()) {
403 // refuse to parse pathological inputs
404 return result;
405 }
406 exponent_adjustment -= static_cast<int>(zeros_skipped);
407 }
Austin Schuhb4691e92020-12-31 12:37:18 -0800408 int post_decimal_digits = ConsumeDigits<base>(
Austin Schuh36244a12019-09-21 17:52:38 -0700409 begin, end, digits_left, &mantissa, &mantissa_is_inexact);
410 begin += post_decimal_digits;
411
412 // Since `mantissa` is an integer, each significant digit we read after
413 // the decimal point requires an adjustment to the exponent. "1.23e0" will
414 // be stored as `mantissa` == 123 and `exponent` == -2 (that is,
415 // "123e-2").
416 if (post_decimal_digits >= DigitLimit<base>()) {
417 // refuse to parse pathological inputs
418 return result;
419 } else if (post_decimal_digits > digits_left) {
420 exponent_adjustment -= digits_left;
421 } else {
422 exponent_adjustment -= post_decimal_digits;
423 }
424 }
425 // If we've found no mantissa whatsoever, this isn't a number.
426 if (mantissa_begin == begin) {
427 return result;
428 }
429 // A bare "." doesn't count as a mantissa either.
430 if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {
431 return result;
432 }
433
434 if (mantissa_is_inexact) {
435 // We dropped significant digits on the floor. Handle this appropriately.
436 if (base == 10) {
437 // If we truncated significant decimal digits, store the full range of the
438 // mantissa for future big integer math for exact rounding.
439 result.subrange_begin = mantissa_begin;
440 result.subrange_end = begin;
441 } else if (base == 16) {
442 // If we truncated hex digits, reflect this fact by setting the low
443 // ("sticky") bit. This allows for correct rounding in all cases.
444 mantissa |= 1;
445 }
446 }
447 result.mantissa = mantissa;
448
449 const char* const exponent_begin = begin;
450 result.literal_exponent = 0;
451 bool found_exponent = false;
452 if (AllowExponent(format_flags) && begin < end &&
453 IsExponentCharacter<base>(*begin)) {
454 bool negative_exponent = false;
455 ++begin;
456 if (begin < end && *begin == '-') {
457 negative_exponent = true;
458 ++begin;
459 } else if (begin < end && *begin == '+') {
460 ++begin;
461 }
462 const char* const exponent_digits_begin = begin;
463 // Exponent is always expressed in decimal, even for hexadecimal floats.
464 begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,
465 &result.literal_exponent, nullptr);
466 if (begin == exponent_digits_begin) {
467 // there were no digits where we expected an exponent. We failed to read
468 // an exponent and should not consume the 'e' after all. Rewind 'begin'.
469 found_exponent = false;
470 begin = exponent_begin;
471 } else {
472 found_exponent = true;
473 if (negative_exponent) {
474 result.literal_exponent = -result.literal_exponent;
475 }
476 }
477 }
478
479 if (!found_exponent && RequireExponent(format_flags)) {
480 // Provided flags required an exponent, but none was found. This results
481 // in a failure to scan.
482 return result;
483 }
484
485 // Success!
486 result.type = strings_internal::FloatType::kNumber;
487 if (result.mantissa > 0) {
488 result.exponent = result.literal_exponent +
489 (DigitMagnitude<base>() * exponent_adjustment);
490 } else {
491 result.exponent = 0;
492 }
493 result.end = begin;
494 return result;
495}
496
497template ParsedFloat ParseFloat<10>(const char* begin, const char* end,
498 chars_format format_flags);
499template ParsedFloat ParseFloat<16>(const char* begin, const char* end,
500 chars_format format_flags);
501
502} // namespace strings_internal
Austin Schuhb4691e92020-12-31 12:37:18 -0800503ABSL_NAMESPACE_END
Austin Schuh36244a12019-09-21 17:52:38 -0700504} // namespace absl