Austin Schuh | 36244a1 | 2019-09-21 17:52:38 -0700 | [diff] [blame] | 1 | // Copyright 2017 The Abseil Authors. |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // https://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | #include "absl/strings/ascii.h" |
| 16 | |
| 17 | namespace absl { |
| 18 | namespace ascii_internal { |
| 19 | |
| 20 | // # Table generated by this Python code (bit 0x02 is currently unused): |
| 21 | // TODO(mbar) Move Python code for generation of table to BUILD and link here. |
| 22 | |
// NOTE: The kPropertyBits table defined below was generated by Python 2 code
// of the following form. (Bit 0x02 is currently unused and available.)
| 26 | // |
| 27 | // def Hex2(n): |
| 28 | // return '0x' + hex(n/16)[2:] + hex(n%16)[2:] |
| 29 | // def IsPunct(ch): |
| 30 | // return (ord(ch) >= 32 and ord(ch) < 127 and |
| 31 | // not ch.isspace() and not ch.isalnum()) |
| 32 | // def IsBlank(ch): |
| 33 | // return ch in ' \t' |
| 34 | // def IsCntrl(ch): |
| 35 | // return ord(ch) < 32 or ord(ch) == 127 |
| 36 | // def IsXDigit(ch): |
| 37 | // return ch.isdigit() or ch.lower() in 'abcdef' |
| 38 | // for i in range(128): |
| 39 | // ch = chr(i) |
| 40 | // mask = ((ch.isalpha() and 0x01 or 0) | |
| 41 | // (ch.isalnum() and 0x04 or 0) | |
| 42 | // (ch.isspace() and 0x08 or 0) | |
| 43 | // (IsPunct(ch) and 0x10 or 0) | |
| 44 | // (IsBlank(ch) and 0x20 or 0) | |
| 45 | // (IsCntrl(ch) and 0x40 or 0) | |
| 46 | // (IsXDigit(ch) and 0x80 or 0)) |
| 47 | // print Hex2(mask) + ',', |
| 48 | // if i % 16 == 7: |
| 49 | // print ' //', Hex2(i & 0x78) |
| 50 | // elif i % 16 == 15: |
| 51 | // print |
| 52 | |
| 53 | // clang-format off |
// Array of bitfields holding character information. Each bit value corresponds
// to a particular character feature. For readability, and because the value
// of these bits is tightly coupled to this implementation, the individual bits
// are not named. Note that bitfields for all characters above ASCII 127 are
// zero-initialized.
//
// Bit meanings, as assigned by the Python generator shown earlier in this
// file:
//   0x01  alphabetic        0x10  punctuation
//   0x02  (unused)          0x20  blank (space or tab)
//   0x04  alphanumeric      0x40  control
//   0x08  whitespace        0x80  hexadecimal digit
//
// Only the first 128 entries are spelled out; the remaining 128 are left to
// aggregate initialization and are therefore zero. The trailing comment on
// each row is the index of that row's first entry.
const unsigned char kPropertyBits[256] = {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,  // 0x08
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x18
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x28
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x38
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x48
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x58
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x68
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,  // 0x78
};
| 77 | |
// Array of characters for the ascii_tolower() function. For values 'A'
// through 'Z', return the lower-case character; otherwise, return the
// identity of the passed character.
//
// The table has one entry per 8-bit value, eight entries per row. Every entry
// equals its own index except the 26 entries at 0x41-0x5a ('A'-'Z'), which
// hold 'a'-'z'. Entries from 0x80 upward are identity mappings, so non-ASCII
// bytes are left untouched.
// NOTE(review): presumably indexed by the character converted to
// unsigned char -- confirm against the lookup in ascii.h.
const char kToLower[256] = {
    '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
    '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
    '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
    '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
    '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
    '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
    '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
    '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
    '\x40', 'a', 'b', 'c', 'd', 'e', 'f', 'g',  // 0x40: '@', then 'A'-'G'
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',  // 0x48: 'H'-'O'
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',  // 0x50: 'P'-'W'
    'x', 'y', 'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',  // 0x58: 'X'-'Z', then 0x5b-0x5f unchanged
    '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
    '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
    '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
    '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
    '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
    '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
    '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
    '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
    '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
    '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
    '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
    '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
    '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
    '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
    '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
    '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
    '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
    '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
    '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
    '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
};
| 115 | |
// Array of characters for the ascii_toupper() function. For values 'a'
// through 'z', return the upper-case character; otherwise, return the
// identity of the passed character.
//
// The table has one entry per 8-bit value, eight entries per row. Every entry
// equals its own index except the 26 entries at 0x61-0x7a ('a'-'z'), which
// hold 'A'-'Z'. Entries from 0x80 upward are identity mappings, so non-ASCII
// bytes are left untouched.
// NOTE(review): presumably indexed by the character converted to
// unsigned char -- confirm against the lookup in ascii.h.
const char kToUpper[256] = {
    '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
    '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
    '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
    '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
    '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
    '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
    '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
    '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
    '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
    '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
    '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
    '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
    '\x60', 'A', 'B', 'C', 'D', 'E', 'F', 'G',  // 0x60: '`', then 'a'-'g'
    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',  // 0x68: 'h'-'o'
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',  // 0x70: 'p'-'w'
    'X', 'Y', 'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',  // 0x78: 'x'-'z', then 0x7b-0x7f unchanged
    '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
    '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
    '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
    '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
    '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
    '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
    '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
    '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
    '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
    '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
    '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
    '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
    '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
    '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
    '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
    '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
};
| 153 | // clang-format on |
| 154 | |
| 155 | } // namespace ascii_internal |
| 156 | |
| 157 | void AsciiStrToLower(std::string* s) { |
| 158 | for (auto& ch : *s) { |
| 159 | ch = absl::ascii_tolower(ch); |
| 160 | } |
| 161 | } |
| 162 | |
| 163 | void AsciiStrToUpper(std::string* s) { |
| 164 | for (auto& ch : *s) { |
| 165 | ch = absl::ascii_toupper(ch); |
| 166 | } |
| 167 | } |
| 168 | |
| 169 | void RemoveExtraAsciiWhitespace(std::string* str) { |
| 170 | auto stripped = StripAsciiWhitespace(*str); |
| 171 | |
| 172 | if (stripped.empty()) { |
| 173 | str->clear(); |
| 174 | return; |
| 175 | } |
| 176 | |
| 177 | auto input_it = stripped.begin(); |
| 178 | auto input_end = stripped.end(); |
| 179 | auto output_it = &(*str)[0]; |
| 180 | bool is_ws = false; |
| 181 | |
| 182 | for (; input_it < input_end; ++input_it) { |
| 183 | if (is_ws) { |
| 184 | // Consecutive whitespace? Keep only the last. |
| 185 | is_ws = absl::ascii_isspace(*input_it); |
| 186 | if (is_ws) --output_it; |
| 187 | } else { |
| 188 | is_ws = absl::ascii_isspace(*input_it); |
| 189 | } |
| 190 | |
| 191 | *output_it = *input_it; |
| 192 | ++output_it; |
| 193 | } |
| 194 | |
| 195 | str->erase(output_it - &(*str)[0]); |
| 196 | } |
| 197 | |
| 198 | } // namespace absl |