blob: 93bb03e95815b4dd1ee8dac47f0aca45b02bc8ef [file] [log] [blame]
Austin Schuh36244a12019-09-21 17:52:38 -07001// Copyright 2017 The Abseil Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "absl/strings/ascii.h"
16
17namespace absl {
Austin Schuhb4691e92020-12-31 12:37:18 -080018ABSL_NAMESPACE_BEGIN
Austin Schuh36244a12019-09-21 17:52:38 -070019namespace ascii_internal {
20
21// # Table generated by this Python code (bit 0x02 is currently unused):
22// TODO(mbar) Move Python code for generation of table to BUILD and link here.
23
// NOTE: The kPropertyBits table defined below was generated by Python 2 code
// of the following form. (Bit 0x02 is currently unused and available.)
//
// def Hex2(n):
//   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
// def IsPunct(ch):
//   return (ord(ch) >= 32 and ord(ch) < 127 and
//           not ch.isspace() and not ch.isalnum())
// def IsBlank(ch):
//   return ch in ' \t'
// def IsCntrl(ch):
//   return ord(ch) < 32 or ord(ch) == 127
// def IsXDigit(ch):
//   return ch.isdigit() or ch.lower() in 'abcdef'
// for i in range(128):
//   ch = chr(i)
//   mask = ((ch.isalpha() and 0x01 or 0) |
//           (ch.isalnum() and 0x04 or 0) |
//           (ch.isspace() and 0x08 or 0) |
//           (IsPunct(ch) and 0x10 or 0) |
//           (IsBlank(ch) and 0x20 or 0) |
//           (IsCntrl(ch) and 0x40 or 0) |
//           (IsXDigit(ch) and 0x80 or 0))
//   print Hex2(mask) + ',',
//   if i % 16 == 7:
//     print ' //', Hex2(i & 0x78)
//   elif i % 16 == 15:
//     print
53
54// clang-format off
55// Array of bitfields holding character information. Each bit value corresponds
56// to a particular character feature. For readability, and because the value
57// of these bits is tightly coupled to this implementation, the individual bits
58// are not named. Note that bitfields for all characters above ASCII 127 are
59// zero-initialized.
Austin Schuhb4691e92020-12-31 12:37:18 -080060ABSL_DLL const unsigned char kPropertyBits[256] = {
Austin Schuh36244a12019-09-21 17:52:38 -070061 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00
62 0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
63 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x10
64 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
65 0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0x20
66 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
67 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, // 0x30
68 0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
69 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x40
70 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
71 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x50
72 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
73 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x60
74 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
75 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x70
76 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
77};
78
79// Array of characters for the ascii_tolower() function. For values 'A'
80// through 'Z', return the lower-case character; otherwise, return the
81// identity of the passed character.
Austin Schuhb4691e92020-12-31 12:37:18 -080082ABSL_DLL const char kToLower[256] = {
Austin Schuh36244a12019-09-21 17:52:38 -070083 '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
84 '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
85 '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
86 '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
87 '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
88 '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
89 '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
90 '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
91 '\x40', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
92 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
93 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
94 'x', 'y', 'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
95 '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
96 '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
97 '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
98 '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
99 '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
100 '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
101 '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
102 '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
103 '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
104 '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
105 '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
106 '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
107 '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
108 '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
109 '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
110 '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
111 '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
112 '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
113 '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
114 '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
115};
116
117// Array of characters for the ascii_toupper() function. For values 'a'
118// through 'z', return the upper-case character; otherwise, return the
119// identity of the passed character.
Austin Schuhb4691e92020-12-31 12:37:18 -0800120ABSL_DLL const char kToUpper[256] = {
Austin Schuh36244a12019-09-21 17:52:38 -0700121 '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
122 '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
123 '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
124 '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
125 '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
126 '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
127 '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
128 '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
129 '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
130 '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
131 '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
132 '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
133 '\x60', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
134 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
135 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
136 'X', 'Y', 'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
137 '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
138 '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
139 '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
140 '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
141 '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
142 '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
143 '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
144 '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
145 '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
146 '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
147 '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
148 '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
149 '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
150 '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
151 '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
152 '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
153};
154// clang-format on
155
156} // namespace ascii_internal
157
158void AsciiStrToLower(std::string* s) {
159 for (auto& ch : *s) {
160 ch = absl::ascii_tolower(ch);
161 }
162}
163
164void AsciiStrToUpper(std::string* s) {
165 for (auto& ch : *s) {
166 ch = absl::ascii_toupper(ch);
167 }
168}
169
170void RemoveExtraAsciiWhitespace(std::string* str) {
171 auto stripped = StripAsciiWhitespace(*str);
172
173 if (stripped.empty()) {
174 str->clear();
175 return;
176 }
177
178 auto input_it = stripped.begin();
179 auto input_end = stripped.end();
180 auto output_it = &(*str)[0];
181 bool is_ws = false;
182
183 for (; input_it < input_end; ++input_it) {
184 if (is_ws) {
185 // Consecutive whitespace? Keep only the last.
186 is_ws = absl::ascii_isspace(*input_it);
187 if (is_ws) --output_it;
188 } else {
189 is_ws = absl::ascii_isspace(*input_it);
190 }
191
192 *output_it = *input_it;
193 ++output_it;
194 }
195
196 str->erase(output_it - &(*str)[0]);
197}
198
Austin Schuhb4691e92020-12-31 12:37:18 -0800199ABSL_NAMESPACE_END
Austin Schuh36244a12019-09-21 17:52:38 -0700200} // namespace absl