Blame - absl/strings/escaping.cc - RealtimeRoboticsGroup/test

blob: 18b20b83fd3604a23d324f7926f429d52259cb29 [file] [log] [blame]

Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	1	// Copyright 2017 The Abseil Authors.
				2	//
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6	//
				7	// https://www.apache.org/licenses/LICENSE-2.0
				8	//
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14
				15	#include "absl/strings/escaping.h"
				16
				17	#include <algorithm>
				18	#include <cassert>
				19	#include <cstdint>
				20	#include <cstring>
				21	#include <iterator>
				22	#include <limits>
				23	#include <string>
				24
				25	#include "absl/base/internal/endian.h"
				26	#include "absl/base/internal/raw_logging.h"
				27	#include "absl/base/internal/unaligned_access.h"
				28	#include "absl/strings/internal/char_map.h"
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	29	#include "absl/strings/internal/escaping.h"
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	30	#include "absl/strings/internal/resize_uninitialized.h"
				31	#include "absl/strings/internal/utf8.h"
				32	#include "absl/strings/str_cat.h"
				33	#include "absl/strings/str_join.h"
				34	#include "absl/strings/string_view.h"
				35
				36	namespace absl {
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	37	ABSL_NAMESPACE_BEGIN
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	38	namespace {
				39
// Value passed as the 'leave_nulls_escaped' argument of CUnescapeInternal()
// when escaped NUL characters (e.g. "\0", "\x00", "\u0000") should be decoded
// into a real '\0' byte instead of being copied through in escaped form.
constexpr bool kUnescapeNulls = false;
				42
				43	inline bool is_octal_digit(char c) { return ('0' <= c) && (c <= '7'); }
				44
				45	inline int hex_digit_to_int(char c) {
				46	static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
				47	"Character set must be ASCII.");
				48	assert(absl::ascii_isxdigit(c));
				49	int x = static_cast<unsigned char>(c);
				50	if (x > '9') {
				51	x += 9;
				52	}
				53	return x & 0xf;
				54	}
				55
				56	inline bool IsSurrogate(char32_t c, absl::string_view src, std::string* error) {
				57	if (c >= 0xD800 && c <= 0xDFFF) {
				58	if (error) {
				59	*error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
				60	src);
				61	}
				62	return true;
				63	}
				64	return false;
				65	}
				66
				67	// ----------------------------------------------------------------------
				68	// CUnescapeInternal()
				69	// Implements both CUnescape() and CUnescapeForNullTerminatedString().
				70	//
				71	// Unescapes C escape sequences and is the reverse of CEscape().
				72	//
				73	// If 'source' is valid, stores the unescaped string and its size in
				74	// 'dest' and 'dest_len' respectively, and returns true. Otherwise
				75	// returns false and optionally stores the error description in
				76	// 'error'. Set 'error' to nullptr to disable error reporting.
				77	//
				78	// 'dest' should point to a buffer that is at least as big as 'source'.
				79	// 'source' and 'dest' may be the same.
				80	//
				81	// NOTE: any changes to this function must also be reflected in the older
				82	// UnescapeCEscapeSequences().
				83	// ----------------------------------------------------------------------
				84	bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
				85	char* dest, ptrdiff_t* dest_len, std::string* error) {
				86	char* d = dest;
				87	const char* p = source.data();
				88	const char* end = p + source.size();
				89	const char* last_byte = end - 1;
				90
				91	// Small optimization for case where source = dest and there's no escaping
				92	while (p == d && p < end && *p != '\\') p++, d++;
				93
				94	while (p < end) {
				95	if (*p != '\\') {
				96	d++ = p++;
				97	} else {
				98	if (++p > last_byte) { // skip past the '\\'
				99	if (error) *error = "String cannot end with \\";
				100	return false;
				101	}
				102	switch (*p) {
				103	case 'a': *d++ = '\a'; break;
				104	case 'b': *d++ = '\b'; break;
				105	case 'f': *d++ = '\f'; break;
				106	case 'n': *d++ = '\n'; break;
				107	case 'r': *d++ = '\r'; break;
				108	case 't': *d++ = '\t'; break;
				109	case 'v': *d++ = '\v'; break;
				110	case '\\': *d++ = '\\'; break;
				111	case '?': *d++ = '\?'; break; // \? Who knew?
				112	case '\'': *d++ = '\''; break;
				113	case '"': *d++ = '\"'; break;
				114	case '0':
				115	case '1':
				116	case '2':
				117	case '3':
				118	case '4':
				119	case '5':
				120	case '6':
				121	case '7': {
				122	// octal digit: 1 to 3 digits
				123	const char* octal_start = p;
				124	unsigned int ch = *p - '0';
				125	if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0';
				126	if (p < last_byte && is_octal_digit(p[1]))
				127	ch = ch * 8 + *++p - '0'; // now points at last digit
				128	if (ch > 0xff) {
				129	if (error) {
				130	*error = "Value of \\" +
				131	std::string(octal_start, p + 1 - octal_start) +
				132	" exceeds 0xff";
				133	}
				134	return false;
				135	}
				136	if ((ch == 0) && leave_nulls_escaped) {
				137	// Copy the escape sequence for the null character
				138	const ptrdiff_t octal_size = p + 1 - octal_start;
				139	*d++ = '\\';
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	140	memmove(d, octal_start, octal_size);
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	141	d += octal_size;
				142	break;
				143	}
				144	*d++ = ch;
				145	break;
				146	}
				147	case 'x':
				148	case 'X': {
				149	if (p >= last_byte) {
				150	if (error) *error = "String cannot end with \\x";
				151	return false;
				152	} else if (!absl::ascii_isxdigit(p[1])) {
				153	if (error) *error = "\\x cannot be followed by a non-hex digit";
				154	return false;
				155	}
				156	unsigned int ch = 0;
				157	const char* hex_start = p;
				158	while (p < last_byte && absl::ascii_isxdigit(p[1]))
				159	// Arbitrarily many hex digits
				160	ch = (ch << 4) + hex_digit_to_int(*++p);
				161	if (ch > 0xFF) {
				162	if (error) {
				163	*error = "Value of \\" +
				164	std::string(hex_start, p + 1 - hex_start) +
				165	" exceeds 0xff";
				166	}
				167	return false;
				168	}
				169	if ((ch == 0) && leave_nulls_escaped) {
				170	// Copy the escape sequence for the null character
				171	const ptrdiff_t hex_size = p + 1 - hex_start;
				172	*d++ = '\\';
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	173	memmove(d, hex_start, hex_size);
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	174	d += hex_size;
				175	break;
				176	}
				177	*d++ = ch;
				178	break;
				179	}
				180	case 'u': {
				181	// \uhhhh => convert 4 hex digits to UTF-8
				182	char32_t rune = 0;
				183	const char* hex_start = p;
				184	if (p + 4 >= end) {
				185	if (error) {
				186	*error = "\\u must be followed by 4 hex digits: \\" +
				187	std::string(hex_start, p + 1 - hex_start);
				188	}
				189	return false;
				190	}
				191	for (int i = 0; i < 4; ++i) {
				192	// Look one char ahead.
				193	if (absl::ascii_isxdigit(p[1])) {
				194	rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
				195	} else {
				196	if (error) {
				197	*error = "\\u must be followed by 4 hex digits: \\" +
				198	std::string(hex_start, p + 1 - hex_start);
				199	}
				200	return false;
				201	}
				202	}
				203	if ((rune == 0) && leave_nulls_escaped) {
				204	// Copy the escape sequence for the null character
				205	*d++ = '\\';
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	206	memmove(d, hex_start, 5); // u0000
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	207	d += 5;
				208	break;
				209	}
				210	if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
				211	return false;
				212	}
				213	d += strings_internal::EncodeUTF8Char(d, rune);
				214	break;
				215	}
				216	case 'U': {
				217	// \Uhhhhhhhh => convert 8 hex digits to UTF-8
				218	char32_t rune = 0;
				219	const char* hex_start = p;
				220	if (p + 8 >= end) {
				221	if (error) {
				222	*error = "\\U must be followed by 8 hex digits: \\" +
				223	std::string(hex_start, p + 1 - hex_start);
				224	}
				225	return false;
				226	}
				227	for (int i = 0; i < 8; ++i) {
				228	// Look one char ahead.
				229	if (absl::ascii_isxdigit(p[1])) {
				230	// Don't change rune until we're sure this
				231	// is within the Unicode limit, but do advance p.
				232	uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
				233	if (newrune > 0x10FFFF) {
				234	if (error) {
				235	*error = "Value of \\" +
				236	std::string(hex_start, p + 1 - hex_start) +
				237	" exceeds Unicode limit (0x10FFFF)";
				238	}
				239	return false;
				240	} else {
				241	rune = newrune;
				242	}
				243	} else {
				244	if (error) {
				245	*error = "\\U must be followed by 8 hex digits: \\" +
				246	std::string(hex_start, p + 1 - hex_start);
				247	}
				248	return false;
				249	}
				250	}
				251	if ((rune == 0) && leave_nulls_escaped) {
				252	// Copy the escape sequence for the null character
				253	*d++ = '\\';
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	254	memmove(d, hex_start, 9); // U00000000
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	255	d += 9;
				256	break;
				257	}
				258	if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
				259	return false;
				260	}
				261	d += strings_internal::EncodeUTF8Char(d, rune);
				262	break;
				263	}
				264	default: {
				265	if (error) error = std::string("Unknown escape sequence: \\") + p;
				266	return false;
				267	}
				268	}
				269	p++; // read past letter we escaped
				270	}
				271	}
				272	*dest_len = d - dest;
				273	return true;
				274	}
				275
				276	// ----------------------------------------------------------------------
				277	// CUnescapeInternal()
				278	//
				279	// Same as above but uses a std::string for output. 'source' and 'dest'
				280	// may be the same.
				281	// ----------------------------------------------------------------------
				282	bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
				283	std::string* dest, std::string* error) {
				284	strings_internal::STLStringResizeUninitialized(dest, source.size());
				285
				286	ptrdiff_t dest_size;
				287	if (!CUnescapeInternal(source,
				288	leave_nulls_escaped,
				289	&(*dest)[0],
				290	&dest_size,
				291	error)) {
				292	return false;
				293	}
				294	dest->erase(dest_size);
				295	return true;
				296	}
				297
				298	// ----------------------------------------------------------------------
				299	// CEscape()
				300	// CHexEscape()
				301	// Utf8SafeCEscape()
				302	// Utf8SafeCHexEscape()
				303	// Escapes 'src' using C-style escape sequences. This is useful for
				304	// preparing query flags. The 'Hex' version uses hexadecimal rather than
				305	// octal sequences. The 'Utf8Safe' version does not touch UTF-8 bytes.
				306	//
				307	// Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
				308	// ----------------------------------------------------------------------
				309	std::string CEscapeInternal(absl::string_view src, bool use_hex,
				310	bool utf8_safe) {
				311	std::string dest;
				312	bool last_hex_escape = false; // true if last output char was \xNN.
				313
				314	for (unsigned char c : src) {
				315	bool is_hex_escape = false;
				316	switch (c) {
				317	case '\n': dest.append("\\" "n"); break;
				318	case '\r': dest.append("\\" "r"); break;
				319	case '\t': dest.append("\\" "t"); break;
				320	case '\"': dest.append("\\" "\""); break;
				321	case '\'': dest.append("\\" "'"); break;
				322	case '\\': dest.append("\\" "\\"); break;
				323	default:
				324	// Note that if we emit \xNN and the src character after that is a hex
				325	// digit then that digit must be escaped too to prevent it being
				326	// interpreted as part of the character code by C.
				327	if ((!utf8_safe \|\| c < 0x80) &&
				328	(!absl::ascii_isprint(c) \|\|
				329	(last_hex_escape && absl::ascii_isxdigit(c)))) {
				330	if (use_hex) {
				331	dest.append("\\" "x");
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	332	dest.push_back(numbers_internal::kHexChar[c / 16]);
				333	dest.push_back(numbers_internal::kHexChar[c % 16]);
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	334	is_hex_escape = true;
				335	} else {
				336	dest.append("\\");
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	337	dest.push_back(numbers_internal::kHexChar[c / 64]);
				338	dest.push_back(numbers_internal::kHexChar[(c % 64) / 8]);
				339	dest.push_back(numbers_internal::kHexChar[c % 8]);
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	340	}
				341	} else {
				342	dest.push_back(c);
				343	break;
				344	}
				345	}
				346	last_hex_escape = is_hex_escape;
				347	}
				348
				349	return dest;
				350	}
				351
				352	/* clang-format off */
				353	constexpr char c_escaped_len[256] = {
				354	4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, // \t, \n, \r
				355	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
				356	1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // ", '
				357	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // '0'..'9'
				358	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'A'..'O'
				359	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, // 'P'..'Z', '\'
				360	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'a'..'o'
				361	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, // 'p'..'z', DEL
				362	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
				363	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
				364	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
				365	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
				366	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
				367	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
				368	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
				369	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
				370	};
				371	/* clang-format on */
				372
				373	// Calculates the length of the C-style escaped version of 'src'.
				374	// Assumes that non-printable characters are escaped using octal sequences, and
				375	// that UTF-8 bytes are not handled specially.
				376	inline size_t CEscapedLength(absl::string_view src) {
				377	size_t escaped_len = 0;
				378	for (unsigned char c : src) escaped_len += c_escaped_len[c];
				379	return escaped_len;
				380	}
				381
				382	void CEscapeAndAppendInternal(absl::string_view src, std::string* dest) {
				383	size_t escaped_len = CEscapedLength(src);
				384	if (escaped_len == src.size()) {
				385	dest->append(src.data(), src.size());
				386	return;
				387	}
				388
				389	size_t cur_dest_len = dest->size();
				390	strings_internal::STLStringResizeUninitialized(dest,
				391	cur_dest_len + escaped_len);
				392	char* append_ptr = &(*dest)[cur_dest_len];
				393
				394	for (unsigned char c : src) {
				395	int char_len = c_escaped_len[c];
				396	if (char_len == 1) {
				397	*append_ptr++ = c;
				398	} else if (char_len == 2) {
				399	switch (c) {
				400	case '\n':
				401	*append_ptr++ = '\\';
				402	*append_ptr++ = 'n';
				403	break;
				404	case '\r':
				405	*append_ptr++ = '\\';
				406	*append_ptr++ = 'r';
				407	break;
				408	case '\t':
				409	*append_ptr++ = '\\';
				410	*append_ptr++ = 't';
				411	break;
				412	case '\"':
				413	*append_ptr++ = '\\';
				414	*append_ptr++ = '\"';
				415	break;
				416	case '\'':
				417	*append_ptr++ = '\\';
				418	*append_ptr++ = '\'';
				419	break;
				420	case '\\':
				421	*append_ptr++ = '\\';
				422	*append_ptr++ = '\\';
				423	break;
				424	}
				425	} else {
				426	*append_ptr++ = '\\';
				427	*append_ptr++ = '0' + c / 64;
				428	*append_ptr++ = '0' + (c % 64) / 8;
				429	*append_ptr++ = '0' + c % 8;
				430	}
				431	}
				432	}
				433
				434	bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
				435	size_t szdest, const signed char* unbase64,
				436	size_t* len) {
				437	static const char kPad64Equals = '=';
				438	static const char kPad64Dot = '.';
				439
				440	size_t destidx = 0;
				441	int decode = 0;
				442	int state = 0;
				443	unsigned int ch = 0;
				444	unsigned int temp = 0;
				445
				446	// If "char" is signed by default, using *src as an array index results in
				447	// accessing negative array elements. Treat the input as a pointer to
				448	// unsigned char to avoid this.
				449	const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);
				450
				451	// The GET_INPUT macro gets the next input character, skipping
				452	// over any whitespace, and stopping when we reach the end of the
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	453	// string or when we read any non-data character. The arguments are
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	454	// an arbitrary identifier (used as a label for goto) and the number
				455	// of data bytes that must remain in the input to avoid aborting the
				456	// loop.
				457	#define GET_INPUT(label, remain) \
				458	label: \
				459	--szsrc; \
				460	ch = *src++; \
				461	decode = unbase64[ch]; \
				462	if (decode < 0) { \
				463	if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
				464	state = 4 - remain; \
				465	break; \
				466	}
				467
				468	// if dest is null, we're just checking to see if it's legal input
				469	// rather than producing output. (I suspect this could just be done
				470	// with a regexp...). We duplicate the loop so this test can be
				471	// outside it instead of in every iteration.
				472
				473	if (dest) {
				474	// This loop consumes 4 input bytes and produces 3 output bytes
				475	// per iteration. We can't know at the start that there is enough
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	476	// data left in the string for a full iteration, so the loop may
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	477	// break out in the middle; if so 'state' will be set to the
				478	// number of input bytes read.
				479
				480	while (szsrc >= 4) {
				481	// We'll start by optimistically assuming that the next four
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	482	// bytes of the string (src[0..3]) are four good data bytes
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	483	// (that is, no nulls, whitespace, padding chars, or illegal
				484	// chars). We need to test src[0..2] for nulls individually
				485	// before constructing temp to preserve the property that we
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	486	// never read past a null in the string (no matter how long
				487	// szsrc claims the string is).
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	488
				489	if (!src[0] \|\| !src[1] \|\| !src[2] \|\|
				490	((temp = ((unsigned(unbase64[src[0]]) << 18) \|
				491	(unsigned(unbase64[src[1]]) << 12) \|
				492	(unsigned(unbase64[src[2]]) << 6) \|
				493	(unsigned(unbase64[src[3]])))) &
				494	0x80000000)) {
				495	// Iff any of those four characters was bad (null, illegal,
				496	// whitespace, padding), then temp's high bit will be set
				497	// (because unbase64[] is -1 for all bad characters).
				498	//
				499	// We'll back up and resort to the slower decoder, which knows
				500	// how to handle those cases.
				501
				502	GET_INPUT(first, 4);
				503	temp = decode;
				504	GET_INPUT(second, 3);
				505	temp = (temp << 6) \| decode;
				506	GET_INPUT(third, 2);
				507	temp = (temp << 6) \| decode;
				508	GET_INPUT(fourth, 1);
				509	temp = (temp << 6) \| decode;
				510	} else {
				511	// We really did have four good data bytes, so advance four
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	512	// characters in the string.
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	513
				514	szsrc -= 4;
				515	src += 4;
				516	}
				517
				518	// temp has 24 bits of input, so write that out as three bytes.
				519
				520	if (destidx + 3 > szdest) return false;
				521	dest[destidx + 2] = temp;
				522	temp >>= 8;
				523	dest[destidx + 1] = temp;
				524	temp >>= 8;
				525	dest[destidx] = temp;
				526	destidx += 3;
				527	}
				528	} else {
				529	while (szsrc >= 4) {
				530	if (!src[0] \|\| !src[1] \|\| !src[2] \|\|
				531	((temp = ((unsigned(unbase64[src[0]]) << 18) \|
				532	(unsigned(unbase64[src[1]]) << 12) \|
				533	(unsigned(unbase64[src[2]]) << 6) \|
				534	(unsigned(unbase64[src[3]])))) &
				535	0x80000000)) {
				536	GET_INPUT(first_no_dest, 4);
				537	GET_INPUT(second_no_dest, 3);
				538	GET_INPUT(third_no_dest, 2);
				539	GET_INPUT(fourth_no_dest, 1);
				540	} else {
				541	szsrc -= 4;
				542	src += 4;
				543	}
				544	destidx += 3;
				545	}
				546	}
				547
				548	#undef GET_INPUT
				549
				550	// if the loop terminated because we read a bad character, return
				551	// now.
				552	if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
				553	!absl::ascii_isspace(ch))
				554	return false;
				555
				556	if (ch == kPad64Equals \|\| ch == kPad64Dot) {
				557	// if we stopped by hitting an '=' or '.', un-read that character -- we'll
				558	// look at it again when we count to check for the proper number of
				559	// equals signs at the end.
				560	++szsrc;
				561	--src;
				562	} else {
				563	// This loop consumes 1 input byte per iteration. It's used to
				564	// clean up the 0-3 input bytes remaining when the first, faster
				565	// loop finishes. 'temp' contains the data from 'state' input
				566	// characters read by the first loop.
				567	while (szsrc > 0) {
				568	--szsrc;
				569	ch = *src++;
				570	decode = unbase64[ch];
				571	if (decode < 0) {
				572	if (absl::ascii_isspace(ch)) {
				573	continue;
				574	} else if (ch == kPad64Equals \|\| ch == kPad64Dot) {
				575	// back up one character; we'll read it again when we check
				576	// for the correct number of pad characters at the end.
				577	++szsrc;
				578	--src;
				579	break;
				580	} else {
				581	return false;
				582	}
				583	}
				584
				585	// Each input character gives us six bits of output.
				586	temp = (temp << 6) \| decode;
				587	++state;
				588	if (state == 4) {
				589	// If we've accumulated 24 bits of output, write that out as
				590	// three bytes.
				591	if (dest) {
				592	if (destidx + 3 > szdest) return false;
				593	dest[destidx + 2] = temp;
				594	temp >>= 8;
				595	dest[destidx + 1] = temp;
				596	temp >>= 8;
				597	dest[destidx] = temp;
				598	}
				599	destidx += 3;
				600	state = 0;
				601	temp = 0;
				602	}
				603	}
				604	}
				605
				606	// Process the leftover data contained in 'temp' at the end of the input.
				607	int expected_equals = 0;
				608	switch (state) {
				609	case 0:
				610	// Nothing left over; output is a multiple of 3 bytes.
				611	break;
				612
				613	case 1:
				614	// Bad input; we have 6 bits left over.
				615	return false;
				616
				617	case 2:
				618	// Produce one more output byte from the 12 input bits we have left.
				619	if (dest) {
				620	if (destidx + 1 > szdest) return false;
				621	temp >>= 4;
				622	dest[destidx] = temp;
				623	}
				624	++destidx;
				625	expected_equals = 2;
				626	break;
				627
				628	case 3:
				629	// Produce two more output bytes from the 18 input bits we have left.
				630	if (dest) {
				631	if (destidx + 2 > szdest) return false;
				632	temp >>= 2;
				633	dest[destidx + 1] = temp;
				634	temp >>= 8;
				635	dest[destidx] = temp;
				636	}
				637	destidx += 2;
				638	expected_equals = 1;
				639	break;
				640
				641	default:
				642	// state should have no other values at this point.
				643	ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
				644	state);
				645	}
				646
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	647	// The remainder of the string should be all whitespace, mixed with
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	648	// exactly 0 equals signs, or exactly 'expected_equals' equals
				649	// signs. (Always accepting 0 equals signs is an Abseil extension
				650	// not covered in the RFC, as is accepting dot as the pad character.)
				651
				652	int equals = 0;
				653	while (szsrc > 0) {
				654	if (src == kPad64Equals \|\| src == kPad64Dot)
				655	++equals;
				656	else if (!absl::ascii_isspace(*src))
				657	return false;
				658	--szsrc;
				659	++src;
				660	}
				661
				662	const bool ok = (equals == 0 \|\| equals == expected_equals);
				663	if (ok) *len = destidx;
				664	return ok;
				665	}
				666
				667	// The arrays below were generated by the following code
				668	// #include <sys/time.h>
				669	// #include <stdlib.h>
				670	// #include <string.h>
				671	// main()
				672	// {
				673	// static const char Base64[] =
				674	// "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
				675	// char* pos;
				676	// int idx, i, j;
//       if ((pos == nullptr) || (j == 0))
				678	// for (i = 0; i < 255; i += 8) {
				679	// for (j = i; j < i + 8; j++) {
				680	// pos = strchr(Base64, j);
				681	// if ((pos == nullptr) \|\| (j == 0))
				682	// idx = -1;
				683	// else
				684	// idx = pos - Base64;
				685	// if (idx == -1)
				686	// printf(" %2d, ", idx);
				687	// else
//         printf(" %2d/*%c*/,", idx, j);
				689	// }
				690	// printf("\n ");
				691	// }
				692	// }
				693	//
				694	// where the value of "Base64[]" was replaced by one of the base-64 conversion
				695	// tables from the functions below.
				696	/* clang-format off */
				697	constexpr signed char kUnBase64[] = {
				698	-1, -1, -1, -1, -1, -1, -1, -1,
				699	-1, -1, -1, -1, -1, -1, -1, -1,
				700	-1, -1, -1, -1, -1, -1, -1, -1,
				701	-1, -1, -1, -1, -1, -1, -1, -1,
				702	-1, -1, -1, -1, -1, -1, -1, -1,
				703	-1, -1, -1, 62/+/, -1, -1, -1, 63// /,
				704	52/0/, 53/1/, 54/2/, 55/3/, 56/4/, 57/5/, 58/6/, 59/7/,
				705	60/8/, 61/9/, -1, -1, -1, -1, -1, -1,
				706	-1, 0/A/, 1/B/, 2/C/, 3/D/, 4/E/, 5/F/, 6/G/,
				707	07/H/, 8/I/, 9/J/, 10/K/, 11/L/, 12/M/, 13/N/, 14/O/,
				708	15/P/, 16/Q/, 17/R/, 18/S/, 19/T/, 20/U/, 21/V/, 22/W/,
				709	23/X/, 24/Y/, 25/Z/, -1, -1, -1, -1, -1,
				710	-1, 26/a/, 27/b/, 28/c/, 29/d/, 30/e/, 31/f/, 32/g/,
				711	33/h/, 34/i/, 35/j/, 36/k/, 37/l/, 38/m/, 39/n/, 40/o/,
				712	41/p/, 42/q/, 43/r/, 44/s/, 45/t/, 46/u/, 47/v/, 48/w/,
				713	49/x/, 50/y/, 51/z/, -1, -1, -1, -1, -1,
				714	-1, -1, -1, -1, -1, -1, -1, -1,
				715	-1, -1, -1, -1, -1, -1, -1, -1,
				716	-1, -1, -1, -1, -1, -1, -1, -1,
				717	-1, -1, -1, -1, -1, -1, -1, -1,
				718	-1, -1, -1, -1, -1, -1, -1, -1,
				719	-1, -1, -1, -1, -1, -1, -1, -1,
				720	-1, -1, -1, -1, -1, -1, -1, -1,
				721	-1, -1, -1, -1, -1, -1, -1, -1,
				722	-1, -1, -1, -1, -1, -1, -1, -1,
				723	-1, -1, -1, -1, -1, -1, -1, -1,
				724	-1, -1, -1, -1, -1, -1, -1, -1,
				725	-1, -1, -1, -1, -1, -1, -1, -1,
				726	-1, -1, -1, -1, -1, -1, -1, -1,
				727	-1, -1, -1, -1, -1, -1, -1, -1,
				728	-1, -1, -1, -1, -1, -1, -1, -1,
				729	-1, -1, -1, -1, -1, -1, -1, -1
				730	};
				731
				732	constexpr signed char kUnWebSafeBase64[] = {
				733	-1, -1, -1, -1, -1, -1, -1, -1,
				734	-1, -1, -1, -1, -1, -1, -1, -1,
				735	-1, -1, -1, -1, -1, -1, -1, -1,
				736	-1, -1, -1, -1, -1, -1, -1, -1,
				737	-1, -1, -1, -1, -1, -1, -1, -1,
				738	-1, -1, -1, -1, -1, 62/-/, -1, -1,
				739	52/0/, 53/1/, 54/2/, 55/3/, 56/4/, 57/5/, 58/6/, 59/7/,
				740	60/8/, 61/9/, -1, -1, -1, -1, -1, -1,
				741	-1, 0/A/, 1/B/, 2/C/, 3/D/, 4/E/, 5/F/, 6/G/,
				742	07/H/, 8/I/, 9/J/, 10/K/, 11/L/, 12/M/, 13/N/, 14/O/,
				743	15/P/, 16/Q/, 17/R/, 18/S/, 19/T/, 20/U/, 21/V/, 22/W/,
				744	23/X/, 24/Y/, 25/Z/, -1, -1, -1, -1, 63/_/,
				745	-1, 26/a/, 27/b/, 28/c/, 29/d/, 30/e/, 31/f/, 32/g/,
				746	33/h/, 34/i/, 35/j/, 36/k/, 37/l/, 38/m/, 39/n/, 40/o/,
				747	41/p/, 42/q/, 43/r/, 44/s/, 45/t/, 46/u/, 47/v/, 48/w/,
				748	49/x/, 50/y/, 51/z/, -1, -1, -1, -1, -1,
				749	-1, -1, -1, -1, -1, -1, -1, -1,
				750	-1, -1, -1, -1, -1, -1, -1, -1,
				751	-1, -1, -1, -1, -1, -1, -1, -1,
				752	-1, -1, -1, -1, -1, -1, -1, -1,
				753	-1, -1, -1, -1, -1, -1, -1, -1,
				754	-1, -1, -1, -1, -1, -1, -1, -1,
				755	-1, -1, -1, -1, -1, -1, -1, -1,
				756	-1, -1, -1, -1, -1, -1, -1, -1,
				757	-1, -1, -1, -1, -1, -1, -1, -1,
				758	-1, -1, -1, -1, -1, -1, -1, -1,
				759	-1, -1, -1, -1, -1, -1, -1, -1,
				760	-1, -1, -1, -1, -1, -1, -1, -1,
				761	-1, -1, -1, -1, -1, -1, -1, -1,
				762	-1, -1, -1, -1, -1, -1, -1, -1,
				763	-1, -1, -1, -1, -1, -1, -1, -1,
				764	-1, -1, -1, -1, -1, -1, -1, -1
				765	};
				766	/* clang-format on */
				767
// The 64 characters of the web-safe base64 alphabet, indexed by 6-bit value.
// It differs from the standard alphabet in its last two characters, '-' and
// '_', which replace '+' and '/'.
constexpr char kWebSafeBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
				770
				771	template <typename String>
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	772	bool Base64UnescapeInternal(const char* src, size_t slen, String* dest,
				773	const signed char* unbase64) {
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	774	// Determine the size of the output string. Base64 encodes every 3 bytes into
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	775	// 4 characters. any leftover chars are added directly for good measure.
				776	// This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
				777	const size_t dest_len = 3 * (slen / 4) + (slen % 4);
				778
				779	strings_internal::STLStringResizeUninitialized(dest, dest_len);
				780
				781	// We are getting the destination buffer by getting the beginning of the
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	782	// string and converting it into a char *.
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	783	size_t len;
				784	const bool ok =
				785	Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
				786	if (!ok) {
				787	dest->clear();
				788	return false;
				789	}
				790
				791	// could be shorter if there was padding
				792	assert(len <= dest_len);
				793	dest->erase(len);
				794
				795	return true;
				796	}
				797
				798	/* clang-format off */
// Lookup table indexed by byte value. Entries for the ASCII hex digits
// '0'..'9', 'A'..'F' and 'a'..'f' hold the digit's numeric value (0..15);
// every other entry is 0, so non-hex input is tolerated rather than rejected.
constexpr char kHexValueLenient[256] = {
    // 0x00 - 0x2F: no hex digits.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x30 - 0x3F: '0'..'9' start at 0x30.
    0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0, 0, 0, 0, 0, 0,
    // 0x40 - 0x4F: 'A'..'F' start at 0x41.
    0, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x50 - 0x5F: no hex digits.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x60 - 0x6F: 'a'..'f' start at 0x61.
    0, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x70 - 0xFF: no hex digits.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	817
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	818	/* clang-format on */
				819
				820	// This is a templated function so that T can be either a char*
				821	// or a string. This works because we use the [] operator to access
				822	// individual characters at a time.
				823	template <typename T>
				824	void HexStringToBytesInternal(const char* from, T to, ptrdiff_t num) {
				825	for (int i = 0; i < num; i++) {
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	826	to[i] = (kHexValueLenient[from[i * 2] & 0xFF] << 4) +
				827	(kHexValueLenient[from[i * 2 + 1] & 0xFF]);
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	828	}
				829	}
				830
				831	// This is a templated function so that T can be either a char* or a
				832	// std::string.
				833	template <typename T>
				834	void BytesToHexStringInternal(const unsigned char* src, T dest, ptrdiff_t num) {
				835	auto dest_ptr = &dest[0];
				836	for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	837	const char* hex_p = &numbers_internal::kHexTable[src_ptr 2];
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	838	std::copy(hex_p, hex_p + 2, dest_ptr);
				839	}
				840	}
				841
				842	} // namespace
				843
				844	// ----------------------------------------------------------------------
				845	// CUnescape()
				846	//
				847	// See CUnescapeInternal() for implementation details.
				848	// ----------------------------------------------------------------------
				849	bool CUnescape(absl::string_view source, std::string* dest,
				850	std::string* error) {
				851	return CUnescapeInternal(source, kUnescapeNulls, dest, error);
				852	}
				853
				854	std::string CEscape(absl::string_view src) {
				855	std::string dest;
				856	CEscapeAndAppendInternal(src, &dest);
				857	return dest;
				858	}
				859
				860	std::string CHexEscape(absl::string_view src) {
				861	return CEscapeInternal(src, true, false);
				862	}
				863
				864	std::string Utf8SafeCEscape(absl::string_view src) {
				865	return CEscapeInternal(src, false, true);
				866	}
				867
				868	std::string Utf8SafeCHexEscape(absl::string_view src) {
				869	return CEscapeInternal(src, true, true);
				870	}
				871
				872	// ----------------------------------------------------------------------
				873	// Base64Unescape() - base64 decoder
				874	// Base64Escape() - base64 encoder
				875	// WebSafeBase64Unescape() - Google's variation of base64 decoder
				876	// WebSafeBase64Escape() - Google's variation of base64 encoder
				877	//
				878	// Check out
				879	// http://tools.ietf.org/html/rfc2045 for formal description, but what we
				880	// care about is that...
				881	// Take the encoded stuff in groups of 4 characters and turn each
				882	// character into a code 0 to 63 thus:
				883	// A-Z map to 0 to 25
				884	// a-z map to 26 to 51
				885	// 0-9 map to 52 to 61
				886	// +(- for WebSafe) maps to 62
				887	// /(_ for WebSafe) maps to 63
				888	// There will be four numbers, all less than 64 which can be represented
				889	// by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
				890	// Arrange the 6 digit binary numbers into three bytes as such:
				891	// aaaaaabb bbbbcccc ccdddddd
				892	// Equals signs (one or two) are used at the end of the encoded block to
				893	// indicate that the text was not an integer multiple of three bytes long.
				894	// ----------------------------------------------------------------------
				895
				896	bool Base64Unescape(absl::string_view src, std::string* dest) {
				897	return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
				898	}
				899
				900	bool WebSafeBase64Unescape(absl::string_view src, std::string* dest) {
				901	return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
				902	}
				903
				904	void Base64Escape(absl::string_view src, std::string* dest) {
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	905	strings_internal::Base64EscapeInternal(
				906	reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
				907	true, strings_internal::kBase64Chars);
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	908	}
				909
				910	void WebSafeBase64Escape(absl::string_view src, std::string* dest) {
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	911	strings_internal::Base64EscapeInternal(
				912	reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
				913	false, kWebSafeBase64Chars);
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	914	}
				915
				916	std::string Base64Escape(absl::string_view src) {
				917	std::string dest;
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	918	strings_internal::Base64EscapeInternal(
				919	reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
				920	true, strings_internal::kBase64Chars);
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	921	return dest;
				922	}
				923
				924	std::string WebSafeBase64Escape(absl::string_view src) {
				925	std::string dest;
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	926	strings_internal::Base64EscapeInternal(
				927	reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
				928	false, kWebSafeBase64Chars);
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	929	return dest;
				930	}
				931
				932	std::string HexStringToBytes(absl::string_view from) {
				933	std::string result;
				934	const auto num = from.size() / 2;
				935	strings_internal::STLStringResizeUninitialized(&result, num);
				936	absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
				937	return result;
				938	}
				939
				940	std::string BytesToHexString(absl::string_view from) {
				941	std::string result;
				942	strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
				943	absl::BytesToHexStringInternal<std::string&>(
				944	reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
				945	return result;
				946	}
				947
Austin Schuh	b4691e9	2020-12-31 12:37:18 -0800	[diff] [blame^]	948	ABSL_NAMESPACE_END
Austin Schuh	36244a1	2019-09-21 17:52:38 -0700	[diff] [blame]	949	} // namespace absl