Blame - absl/strings/internal/charconv_parse.cc - RealtimeRoboticsGroup/test

blob: 8b11868c887af3ce9eb1732285a23336a97b8f95 [file] [log] [blame]

Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	1	// Copyright 2018 The Abseil Authors.
				2	//
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6	//
				7	// https://www.apache.org/licenses/LICENSE-2.0
				8	//
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14
				15	#include "absl/strings/internal/charconv_parse.h"
				16	#include "absl/strings/charconv.h"
				17
				18	#include <cassert>
				19	#include <cstdint>
				20	#include <limits>
				21
				22	#include "absl/strings/internal/memutil.h"
				23
				24	namespace absl {
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	25	ABSL_NAMESPACE_BEGIN
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	26	namespace {
				27
				28	// ParseFloat<10> will read the first 19 significant digits of the mantissa.
				29	// This number was chosen for multiple reasons.
				30	//
				31	// (a) First, for whatever integer type we choose to represent the mantissa, we
				32	// want to choose the largest possible number of decimal digits for that integer
				33	// type. We are using uint64_t, which can express any 19-digit unsigned
				34	// integer.
				35	//
				36	// (b) Second, we need to parse enough digits that the binary value of any
				37	// mantissa we capture has more bits of resolution than the mantissa
				38	// representation in the target float. Our algorithm requires at least 3 bits
				39	// of headway, but 19 decimal digits give a little more than that.
				40	//
				41	// The following static assertions verify the above comments:
				42	constexpr int kDecimalMantissaDigitsMax = 19;
				43
				44	static_assert(std::numeric_limits<uint64_t>::digits10 ==
				45	kDecimalMantissaDigitsMax,
				46	"(a) above");
				47
				48	// IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa.
				49	static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed");
				50	static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact");
				51	static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact");
				52
				53	// The lowest valued 19-digit decimal mantissa we can read still contains
				54	// sufficient information to reconstruct a binary mantissa.
				55	static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above");
				56
				57	// ParseFloat<16> will read the first 15 significant digits of the mantissa.
				58	//
				59	// Because a base-16-to-base-2 conversion can be done exactly, we do not need
				60	// to maximize the number of scanned hex digits to improve our conversion. What
				61	// is required is to scan two more bits than the mantissa can represent, so that
				62	// we always round correctly.
				63	//
				64	// (One extra bit does not suffice to perform correct rounding, since a number
				65	// exactly halfway between two representable floats has unique rounding rules,
				66	// so we need to differentiate between a "halfway between" number and a "closer
				67	// to the larger value" number.)
				68	constexpr int kHexadecimalMantissaDigitsMax = 15;
				69
				70	// The minimum number of significant bits that will be read from
				71	// kHexadecimalMantissaDigitsMax hex digits. We must subtract by three, since
				72	// the most significant digit can be a "1", which only contributes a single
				73	// significant bit.
				74	constexpr int kGuaranteedHexadecimalMantissaBitPrecision =
				75	4 * kHexadecimalMantissaDigitsMax - 3;
				76
				77	static_assert(kGuaranteedHexadecimalMantissaBitPrecision >
				78	std::numeric_limits<double>::digits + 2,
				79	"kHexadecimalMantissaDigitsMax too small");
				80
				81	// We also impose a limit on the number of significant digits we will read from
				82	// an exponent, to avoid having to deal with integer overflow. We use 9 for
				83	// this purpose.
				84	//
				85	// If we read a 9 digit exponent, the end result of the conversion will
				86	// necessarily be infinity or zero, depending on the sign of the exponent.
				87	// Therefore we can just drop extra digits on the floor without any extra
				88	// logic.
				89	constexpr int kDecimalExponentDigitsMax = 9;
				90	static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax,
				91	"int type too small");
				92
				93	// To avoid incredibly large inputs causing integer overflow for our exponent,
				94	// we impose an arbitrary but very large limit on the number of significant
				95	// digits we will accept. The implementation refuses to match a string with
				96	// more consecutive significant mantissa digits than this.
				97	constexpr int kDecimalDigitLimit = 50000000;
				98
				99	// Corresponding limit for hexadecimal digit inputs. This is one fourth the
				100	// amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires
				101	// a binary exponent adjustment of 4.
				102	constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4;
				103
				104	// The largest exponent we can read is 999999999 (per
				105	// kDecimalExponentDigitsMax), and the largest exponent adjustment we can get
				106	// from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these
				107	// comfortably fits in an integer.
				108	//
				109	// We count kDecimalDigitLimit twice because there are independent limits for
				110	// numbers before and after the decimal point. (In the case where there are no
				111	// significant digits before the decimal point, there are independent limits for
				112	// post-decimal-point leading zeroes and for significant digits.)
				113	static_assert(999999999 + 2 * kDecimalDigitLimit <
				114	std::numeric_limits<int>::max(),
				115	"int type too small");
				116	static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) <
				117	std::numeric_limits<int>::max(),
				118	"int type too small");
				119
				120	// Returns true if the provided bitfield allows parsing an exponent value
				121	// (e.g., "1.5e100").
				122	bool AllowExponent(chars_format flags) {
				123	bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
				124	bool scientific =
				125	(flags & chars_format::scientific) == chars_format::scientific;
				126	return scientific \|\| !fixed;
				127	}
				128
				129	// Returns true if the provided bitfield requires an exponent value be present.
				130	bool RequireExponent(chars_format flags) {
				131	bool fixed = (flags & chars_format::fixed) == chars_format::fixed;
				132	bool scientific =
				133	(flags & chars_format::scientific) == chars_format::scientific;
				134	return scientific && !fixed;
				135	}
				136
				137	const int8_t kAsciiToInt[256] = {
				138	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				139	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				140	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,
				141	9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1,
				142	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				143	-1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				144	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				145	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				146	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				147	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				148	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				149	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				150	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				151	-1, -1, -1, -1, -1, -1, -1, -1, -1};
				152
				153	// Returns true if `ch` is a digit in the given base
				154	template <int base>
				155	bool IsDigit(char ch);
				156
				157	// Converts a valid `ch` to its digit value in the given base.
				158	template <int base>
				159	unsigned ToDigit(char ch);
				160
				161	// Returns true if `ch` is the exponent delimiter for the given base.
				162	template <int base>
				163	bool IsExponentCharacter(char ch);
				164
				165	// Returns the maximum number of significant digits we will read for a float
				166	// in the given base.
				167	template <int base>
				168	constexpr int MantissaDigitsMax();
				169
				170	// Returns the largest consecutive run of digits we will accept when parsing a
				171	// number in the given base.
				172	template <int base>
				173	constexpr int DigitLimit();
				174
				175	// Returns the amount the exponent must be adjusted by for each dropped digit.
				176	// (For decimal this is 1, since the digits are in base 10 and the exponent base
				177	// is also 10, but for hexadecimal this is 4, since the digits are base 16 but
				178	// the exponent base is 2.)
				179	template <int base>
				180	constexpr int DigitMagnitude();
				181
				182	template <>
				183	bool IsDigit<10>(char ch) {
				184	return ch >= '0' && ch <= '9';
				185	}
				186	template <>
				187	bool IsDigit<16>(char ch) {
				188	return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0;
				189	}
				190
				191	template <>
				192	unsigned ToDigit<10>(char ch) {
				193	return ch - '0';
				194	}
				195	template <>
				196	unsigned ToDigit<16>(char ch) {
				197	return kAsciiToInt[static_cast<unsigned char>(ch)];
				198	}
				199
				200	template <>
				201	bool IsExponentCharacter<10>(char ch) {
				202	return ch == 'e' \|\| ch == 'E';
				203	}
				204
				205	template <>
				206	bool IsExponentCharacter<16>(char ch) {
				207	return ch == 'p' \|\| ch == 'P';
				208	}
				209
				210	template <>
				211	constexpr int MantissaDigitsMax<10>() {
				212	return kDecimalMantissaDigitsMax;
				213	}
				214	template <>
				215	constexpr int MantissaDigitsMax<16>() {
				216	return kHexadecimalMantissaDigitsMax;
				217	}
				218
				219	template <>
				220	constexpr int DigitLimit<10>() {
				221	return kDecimalDigitLimit;
				222	}
				223	template <>
				224	constexpr int DigitLimit<16>() {
				225	return kHexadecimalDigitLimit;
				226	}
				227
				228	template <>
				229	constexpr int DigitMagnitude<10>() {
				230	return 1;
				231	}
				232	template <>
				233	constexpr int DigitMagnitude<16>() {
				234	return 4;
				235	}
				236
				237	// Reads decimal digits from [begin, end) into *out. Returns the number of
				238	// digits consumed.
				239	//
				240	// After max_digits has been read, keeps consuming characters, but no longer
				241	// adjusts out. If a nonzero digit is dropped this way, dropped_nonzero_digit
				242	// is set; otherwise, it is left unmodified.
				243	//
				244	// If no digits are matched, returns 0 and leaves *out unchanged.
				245	//
				246	// ConsumeDigits does not protect against overflow on *out; max_digits must
				247	// be chosen with respect to type T to avoid the possibility of overflow.
				248	template <int base, typename T>
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	249	int ConsumeDigits(const char* begin, const char* end, int max_digits, T* out,
				250	bool* dropped_nonzero_digit) {
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	251	if (base == 10) {
				252	assert(max_digits <= std::numeric_limits<T>::digits10);
				253	} else if (base == 16) {
				254	assert(max_digits * 4 <= std::numeric_limits<T>::digits);
				255	}
				256	const char* const original_begin = begin;
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	257
				258	// Skip leading zeros, but only if *out is zero.
				259	// They don't cause an overflow so we don't have to count them for
				260	// `max_digits`.
				261	while (!out && end != begin && begin == '0') ++begin;
				262
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	263	T accumulator = *out;
				264	const char* significant_digits_end =
				265	(end - begin > max_digits) ? begin + max_digits : end;
				266	while (begin < significant_digits_end && IsDigit<base>(*begin)) {
				267	// Do not guard against *out overflow; max_digits was chosen to avoid this.
				268	// Do assert against it, to detect problems in debug builds.
				269	auto digit = static_cast<T>(ToDigit<base>(*begin));
				270	assert(accumulator * base >= accumulator);
				271	accumulator *= base;
				272	assert(accumulator + digit >= accumulator);
				273	accumulator += digit;
				274	++begin;
				275	}
				276	bool dropped_nonzero = false;
				277	while (begin < end && IsDigit<base>(*begin)) {
				278	dropped_nonzero = dropped_nonzero \|\| (*begin != '0');
				279	++begin;
				280	}
				281	if (dropped_nonzero && dropped_nonzero_digit != nullptr) {
				282	*dropped_nonzero_digit = true;
				283	}
				284	*out = accumulator;
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	285	return static_cast<int>(begin - original_begin);
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	286	}
				287
				288	// Returns true if `v` is one of the chars allowed inside parentheses following
				289	// a NaN.
				290	bool IsNanChar(char v) {
				291	return (v == '_') \|\| (v >= '0' && v <= '9') \|\| (v >= 'a' && v <= 'z') \|\|
				292	(v >= 'A' && v <= 'Z');
				293	}
				294
				295	// Checks the range [begin, end) for a strtod()-formatted infinity or NaN. If
				296	// one is found, sets `out` appropriately and returns true.
				297	bool ParseInfinityOrNan(const char* begin, const char* end,
				298	strings_internal::ParsedFloat* out) {
				299	if (end - begin < 3) {
				300	return false;
				301	}
				302	switch (*begin) {
				303	case 'i':
				304	case 'I': {
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	305	// An infinity string consists of the characters "inf" or "infinity",
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	306	// case insensitive.
				307	if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) {
				308	return false;
				309	}
				310	out->type = strings_internal::FloatType::kInfinity;
				311	if (end - begin >= 8 &&
				312	strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) {
				313	out->end = begin + 8;
				314	} else {
				315	out->end = begin + 3;
				316	}
				317	return true;
				318	}
				319	case 'n':
				320	case 'N': {
				321	// A NaN consists of the characters "nan", case insensitive, optionally
				322	// followed by a parenthesized sequence of zero or more alphanumeric
				323	// characters and/or underscores.
				324	if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) {
				325	return false;
				326	}
				327	out->type = strings_internal::FloatType::kNan;
				328	out->end = begin + 3;
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	329	// NaN is allowed to be followed by a parenthesized string, consisting of
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	330	// only the characters [a-zA-Z0-9_]. Match that if it's present.
				331	begin += 3;
				332	if (begin < end && *begin == '(') {
				333	const char* nan_begin = begin + 1;
				334	while (nan_begin < end && IsNanChar(*nan_begin)) {
				335	++nan_begin;
				336	}
				337	if (nan_begin < end && *nan_begin == ')') {
				338	// We found an extra NaN specifier range
				339	out->subrange_begin = begin + 1;
				340	out->subrange_end = nan_begin;
				341	out->end = nan_begin + 1;
				342	}
				343	}
				344	return true;
				345	}
				346	default:
				347	return false;
				348	}
				349	}
				350	} // namespace
				351
				352	namespace strings_internal {
				353
				354	template <int base>
				355	strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end,
				356	chars_format format_flags) {
				357	strings_internal::ParsedFloat result;
				358
				359	// Exit early if we're given an empty range.
				360	if (begin == end) return result;
				361
				362	// Handle the infinity and NaN cases.
				363	if (ParseInfinityOrNan(begin, end, &result)) {
				364	return result;
				365	}
				366
				367	const char* const mantissa_begin = begin;
				368	while (begin < end && *begin == '0') {
				369	++begin; // skip leading zeros
				370	}
				371	uint64_t mantissa = 0;
				372
				373	int exponent_adjustment = 0;
				374	bool mantissa_is_inexact = false;
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	375	int pre_decimal_digits = ConsumeDigits<base>(
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	376	begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact);
				377	begin += pre_decimal_digits;
				378	int digits_left;
				379	if (pre_decimal_digits >= DigitLimit<base>()) {
				380	// refuse to parse pathological inputs
				381	return result;
				382	} else if (pre_decimal_digits > MantissaDigitsMax<base>()) {
				383	// We dropped some non-fraction digits on the floor. Adjust our exponent
				384	// to compensate.
				385	exponent_adjustment =
				386	static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>());
				387	digits_left = 0;
				388	} else {
				389	digits_left =
				390	static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits);
				391	}
				392	if (begin < end && *begin == '.') {
				393	++begin;
				394	if (mantissa == 0) {
				395	// If we haven't seen any nonzero digits yet, keep skipping zeros. We
				396	// have to adjust the exponent to reflect the changed place value.
				397	const char* begin_zeros = begin;
				398	while (begin < end && *begin == '0') {
				399	++begin;
				400	}
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	401	int zeros_skipped = static_cast<int>(begin - begin_zeros);
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	402	if (zeros_skipped >= DigitLimit<base>()) {
				403	// refuse to parse pathological inputs
				404	return result;
				405	}
				406	exponent_adjustment -= static_cast<int>(zeros_skipped);
				407	}
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	408	int post_decimal_digits = ConsumeDigits<base>(
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	409	begin, end, digits_left, &mantissa, &mantissa_is_inexact);
				410	begin += post_decimal_digits;
				411
				412	// Since `mantissa` is an integer, each significant digit we read after
				413	// the decimal point requires an adjustment to the exponent. "1.23e0" will
				414	// be stored as `mantissa` == 123 and `exponent` == -2 (that is,
				415	// "123e-2").
				416	if (post_decimal_digits >= DigitLimit<base>()) {
				417	// refuse to parse pathological inputs
				418	return result;
				419	} else if (post_decimal_digits > digits_left) {
				420	exponent_adjustment -= digits_left;
				421	} else {
				422	exponent_adjustment -= post_decimal_digits;
				423	}
				424	}
				425	// If we've found no mantissa whatsoever, this isn't a number.
				426	if (mantissa_begin == begin) {
				427	return result;
				428	}
				429	// A bare "." doesn't count as a mantissa either.
				430	if (begin - mantissa_begin == 1 && *mantissa_begin == '.') {
				431	return result;
				432	}
				433
				434	if (mantissa_is_inexact) {
				435	// We dropped significant digits on the floor. Handle this appropriately.
				436	if (base == 10) {
				437	// If we truncated significant decimal digits, store the full range of the
				438	// mantissa for future big integer math for exact rounding.
				439	result.subrange_begin = mantissa_begin;
				440	result.subrange_end = begin;
				441	} else if (base == 16) {
				442	// If we truncated hex digits, reflect this fact by setting the low
				443	// ("sticky") bit. This allows for correct rounding in all cases.
				444	mantissa \|= 1;
				445	}
				446	}
				447	result.mantissa = mantissa;
				448
				449	const char* const exponent_begin = begin;
				450	result.literal_exponent = 0;
				451	bool found_exponent = false;
				452	if (AllowExponent(format_flags) && begin < end &&
				453	IsExponentCharacter<base>(*begin)) {
				454	bool negative_exponent = false;
				455	++begin;
				456	if (begin < end && *begin == '-') {
				457	negative_exponent = true;
				458	++begin;
				459	} else if (begin < end && *begin == '+') {
				460	++begin;
				461	}
				462	const char* const exponent_digits_begin = begin;
				463	// Exponent is always expressed in decimal, even for hexadecimal floats.
				464	begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax,
				465	&result.literal_exponent, nullptr);
				466	if (begin == exponent_digits_begin) {
				467	// there were no digits where we expected an exponent. We failed to read
				468	// an exponent and should not consume the 'e' after all. Rewind 'begin'.
				469	found_exponent = false;
				470	begin = exponent_begin;
				471	} else {
				472	found_exponent = true;
				473	if (negative_exponent) {
				474	result.literal_exponent = -result.literal_exponent;
				475	}
				476	}
				477	}
				478
				479	if (!found_exponent && RequireExponent(format_flags)) {
				480	// Provided flags required an exponent, but none was found. This results
				481	// in a failure to scan.
				482	return result;
				483	}
				484
				485	// Success!
				486	result.type = strings_internal::FloatType::kNumber;
				487	if (result.mantissa > 0) {
				488	result.exponent = result.literal_exponent +
				489	(DigitMagnitude<base>() * exponent_adjustment);
				490	} else {
				491	result.exponent = 0;
				492	}
				493	result.end = begin;
				494	return result;
				495	}
				496
				497	template ParsedFloat ParseFloat<10>(const char* begin, const char* end,
				498	chars_format format_flags);
				499	template ParsedFloat ParseFloat<16>(const char* begin, const char* end,
				500	chars_format format_flags);
				501
				502	} // namespace strings_internal
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	503	ABSL_NAMESPACE_END
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	504	} // namespace absl