Brian Silverman | 70325d6 | 2015-09-20 17:00:43 -0400 | [diff] [blame^] | 1 | /* |
| 2 | * Copyright (c) 2007, Google Inc. |
| 3 | * All rights reserved. |
| 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions are |
| 7 | * met: |
| 8 | * |
| 9 | * * Redistributions of source code must retain the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer. |
| 11 | * * Redistributions in binary form must reproduce the above |
| 12 | * copyright notice, this list of conditions and the following disclaimer |
| 13 | * in the documentation and/or other materials provided with the |
| 14 | * distribution. |
| 15 | * * Neither the name of Google Inc. nor the names of its |
| 16 | * contributors may be used to endorse or promote products derived from |
| 17 | * this software without specific prior written permission. |
| 18 | * |
| 19 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 20 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 21 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 22 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 23 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 24 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 25 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 26 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 27 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 28 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 29 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 30 | * --- |
| 31 | * |
| 32 | * Author: falmeida@google.com (Filipe Almeida) |
| 33 | */ |
| 34 | |
| 35 | #ifndef SECURITY_STREAMHTMLPARSER_HTMLPARSER_H |
| 36 | #define SECURITY_STREAMHTMLPARSER_HTMLPARSER_H |
| 37 | |
| 38 | #include <config.h> |
| 39 | #include "htmlparser/statemachine.h" |
| 40 | #include "htmlparser/jsparser.h" |
| 41 | |
| 42 | // Annoying stuff for windows in opensource-land -- make sure clients |
| 43 | // (in this case unittests) can import the functions. |
| 44 | #ifndef CTEMPLATE_DLL_DECL |
| 45 | # ifdef _MSC_VER |
| 46 | # define CTEMPLATE_DLL_DECL __declspec(dllimport) |
| 47 | # else |
| 48 | # define CTEMPLATE_DLL_DECL /* should be the empty string for non-windows */ |
| 49 | # endif |
| 50 | #endif |
| 51 | |
| 52 | #ifdef __cplusplus |
| 53 | namespace ctemplate_htmlparser { |
| 54 | #endif |
| 55 | |
| 56 | /* entity filter */ |
| 57 | |
| 58 | /* String sizes used in htmlparser and entityfilter structures including the |
| 59 | * NULL terminator. |
| 60 | */ |
| 61 | #define HTMLPARSER_MAX_STRING STATEMACHINE_RECORD_BUFFER_SIZE |
| 62 | #define HTMLPARSER_MAX_ENTITY_SIZE 10 |
| 63 | |
| 64 | |
| 65 | enum htmlparser_state_external_enum { |
| 66 | HTMLPARSER_STATE_TEXT, |
| 67 | HTMLPARSER_STATE_TAG, |
| 68 | HTMLPARSER_STATE_ATTR, |
| 69 | HTMLPARSER_STATE_VALUE, |
| 70 | HTMLPARSER_STATE_COMMENT, |
| 71 | HTMLPARSER_STATE_JS_FILE, |
| 72 | HTMLPARSER_STATE_CSS_FILE, |
| 73 | HTMLPARSER_STATE_ERROR |
| 74 | }; |
| 75 | |
| 76 | enum htmlparser_mode { |
| 77 | HTMLPARSER_MODE_HTML, |
| 78 | HTMLPARSER_MODE_JS, |
| 79 | HTMLPARSER_MODE_CSS, |
| 80 | HTMLPARSER_MODE_HTML_IN_TAG |
| 81 | }; |
| 82 | |
| 83 | enum htmlparser_attr_type { |
| 84 | HTMLPARSER_ATTR_NONE, |
| 85 | HTMLPARSER_ATTR_REGULAR, |
| 86 | HTMLPARSER_ATTR_URI, |
| 87 | HTMLPARSER_ATTR_JS, |
| 88 | HTMLPARSER_ATTR_STYLE |
| 89 | }; |
| 90 | |
| 91 | |
| 92 | /* TODO(falmeida): Maybe move some of these declaration to the .c and only keep |
| 93 | * a forward declaration in here, since these structures are private. |
| 94 | */ |
| 95 | |
| 96 | /* entityfilter context structure. |
| 97 | * |
| 98 | * The entity filter collection of routines provide a way to decode html |
| 99 | * entities from an html document in a streaming way. |
| 100 | * |
| 101 | * The entityfilter_process() function receives a character at a time from the input |
| 102 | * stream and returns 0 or more characters which should be appended to the |
| 103 | * resulting decoded document. |
| 104 | * |
| 105 | * Currently this collection of functions is only exported for testing purposes |
| 106 | * and shouldn't be called from outside of htmlparser.c. |
| 107 | * |
| 108 | * Since we really only use these functions with the very specific purpose of |
| 109 | * decoding html entities for javascript attributes, only a small subset of |
| 110 | * entities are supported: &lt;, &gt;, &quot;, &amp;, &apos;, and the numeric |
| 111 | * character references for both decimal and hexadecimal. |
| 112 | */ |
| 113 | typedef struct entityfilter_ctx_s { |
| 114 | |
| 115 | /* Current position into the buffer. */ |
| 116 | int buffer_pos; |
| 117 | |
| 118 | /* True if currently processing an html entity. */ |
| 119 | int in_entity; |
| 120 | |
| 121 | /* Temporary character buffer that is used while processing html entities. |
| 122 | */ |
| 123 | char buffer[HTMLPARSER_MAX_ENTITY_SIZE]; |
| 124 | |
| 125 | /* String buffer returned to the application after we decoded an html |
| 126 | * entity. |
| 127 | */ |
| 128 | char output[HTMLPARSER_MAX_ENTITY_SIZE]; |
| 129 | } entityfilter_ctx; |
| 130 | |
| 131 | /* Resets the entityfilter to its initial state so it can be reused. |
| 132 | */ |
| 133 | void entityfilter_reset(entityfilter_ctx *ctx); |
| 134 | |
| 135 | /* Initializes a new entity filter object. |
| 136 | */ |
| 137 | entityfilter_ctx *entityfilter_new(void); |
| 138 | |
| 139 | /* Deallocates an entity filter object. |
| 140 | */ |
| 141 | void entityfilter_delete(entityfilter_ctx *ctx); |
| 142 | |
| 143 | /* Copies the context of the entityfilter pointed to by src to the entityfilter |
| 144 | * dst. |
| 145 | */ |
| 146 | void entityfilter_copy(entityfilter_ctx *dst, entityfilter_ctx *src); |
| 147 | |
| 148 | /* Processes a character from the input stream and decodes any html entities |
| 149 | * in the accumulated buffer. |
| 150 | * |
| 151 | * Returns a reference to a string that points to an internal buffer. This |
| 152 | * buffer will be changed after every call to entityfilter_process(). As |
| 153 | * such this string should be duplicated before subsequent calls to |
| 154 | * entityfilter_process(). |
| 155 | */ |
| 156 | const char *entityfilter_process(entityfilter_ctx *ctx, char c); |
| 157 | |
| 158 | |
| 159 | /* html parser */ |
| 160 | |
| 161 | /* Stores the context of the html parser. |
| 162 | * If this structure is changed, htmlparser_new(), htmlparser_copy() and |
| 163 | * htmlparser_reset() should be updated accordingly. |
| 164 | */ |
| 165 | typedef struct htmlparser_ctx_s { |
| 166 | |
| 167 | /* Holds a reference to the statemachine context. */ |
| 168 | statemachine_ctx *statemachine; |
| 169 | |
| 170 | /* Holds a reference to the statemachine definition in use. Right now this is |
| 171 | * only used so we can deallocate it at the end. |
| 172 | * |
| 173 | * It should be readonly and contain the same values across jsparser |
| 174 | * instances. |
| 175 | */ |
| 176 | /* TODO(falmeida): Change statemachine_def to const. */ |
| 177 | statemachine_definition *statemachine_def; |
| 178 | |
| 179 | /* Holds a reference to the javascript parser. */ |
| 180 | jsparser_ctx *jsparser; |
| 181 | |
| 182 | /* Holds a reference to the entity filter. Used for decoding html entities |
| 183 | * inside javascript attributes. */ |
| 184 | entityfilter_ctx *entityfilter; |
| 185 | |
| 186 | /* Offset into the current attribute value where 0 is the first character in |
| 187 | * the value. */ |
| 188 | int value_index; |
| 189 | |
| 190 | /* True if currently processing javascript. */ |
| 191 | int in_js; |
| 192 | |
| 193 | /* Current tag name. */ |
| 194 | char tag[HTMLPARSER_MAX_STRING]; |
| 195 | |
| 196 | /* Current attribute name. */ |
| 197 | char attr[HTMLPARSER_MAX_STRING]; |
| 198 | |
| 199 | /* Contents of the current value capped to HTMLPARSER_MAX_STRING. */ |
| 200 | char value[HTMLPARSER_MAX_STRING]; |
| 201 | |
| 202 | } htmlparser_ctx; |
| 203 | |
| 204 | /* Resets the parser to its initial state and to the default mode, which |
| 205 | * is MODE_HTML. |
| 206 | * |
| 207 | * All internal context like tag name, attribute name or the state of the |
| 208 | * statemachine are reset to their original values as if the object was just |
| 209 | * created. |
| 210 | */ |
| 211 | extern CTEMPLATE_DLL_DECL |
| 212 | void htmlparser_reset(htmlparser_ctx *ctx); |
| 213 | |
| 214 | /* Resets the parser to its initial state and changes the parser mode. |
| 215 | * All internal context like tag name, attribute name or the state of the |
| 216 | * statemachine are reset to their original values as if the object was just |
| 217 | * created. |
| 218 | * |
| 219 | * Available modes: |
| 220 | * HTMLPARSER_MODE_HTML - Parses html text |
| 221 | * HTMLPARSER_MODE_JS - Parses javascript files |
| 222 | * HTMLPARSER_MODE_CSS - Parses CSS files. No actual parsing is done |
| 223 | * but htmlparser_in_css() always returns true. |
| 224 | * HTMLPARSER_MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To |
| 225 | * be used in a template expanded in the |
| 226 | * following context: <a $template> |
| 227 | * |
| 228 | */ |
| 229 | extern CTEMPLATE_DLL_DECL |
| 230 | void htmlparser_reset_mode(htmlparser_ctx *ctx, int mode); |
| 231 | |
| 232 | /* Initializes a new htmlparser instance. |
| 233 | * |
| 234 | * Returns a pointer to the new instance or NULL if the initialization fails. |
| 235 | * Initialization failure is fatal, and if this function fails it may not |
| 236 | * deallocate all previously allocated memory. |
| 237 | */ |
| 238 | extern CTEMPLATE_DLL_DECL |
| 239 | htmlparser_ctx *htmlparser_new(void); |
| 240 | |
| 241 | /* Copies the context of the htmlparser pointed to by src to the htmlparser dst. |
| 242 | * |
| 243 | * Also copies over the instances of the state machine, the jsparser and the |
| 244 | * entity filter but not the statemachine definition since this one is read |
| 245 | * only. |
| 246 | */ |
| 247 | extern CTEMPLATE_DLL_DECL |
| 248 | void htmlparser_copy(htmlparser_ctx *dst, const htmlparser_ctx *src); |
| 249 | |
| 250 | /* Receives an htmlparser context and returns the current html state. |
| 251 | * |
| 252 | * The return value will be one of the states of htmlparser_state_external_enum. |
| 253 | */ |
| 254 | extern CTEMPLATE_DLL_DECL |
| 255 | int htmlparser_state(htmlparser_ctx *ctx); |
| 256 | |
| 257 | /* Parses the input html stream and returns the finishing state. |
| 258 | * |
| 259 | * Returns HTMLPARSER_STATE_ERROR if unable to parse the input. If htmlparser_parse() |
| 260 | * is called after an error situation was encountered the behaviour is |
| 261 | * unspecified. At this point, htmlparser_reset() or htmlparser_reset_mode() |
| 262 | * can be called to reset the state. |
| 263 | */ |
| 264 | extern CTEMPLATE_DLL_DECL |
| 265 | int htmlparser_parse(htmlparser_ctx *ctx, const char *str, int size); |
| 266 | |
| 267 | /* Returns true if the parser is inside an attribute value and the value is |
| 268 | * surrounded by single or double quotes. */ |
| 269 | extern CTEMPLATE_DLL_DECL |
| 270 | int htmlparser_is_attr_quoted(htmlparser_ctx *ctx); |
| 271 | |
| 272 | /* Returns true if the parser is currently in javascript. This can be |
| 273 | * an attribute that takes javascript, a javascript block or the parser |
| 274 | * can just be in MODE_JS. */ |
| 275 | extern CTEMPLATE_DLL_DECL |
| 276 | int htmlparser_in_js(htmlparser_ctx *ctx); |
| 277 | |
| 278 | /* Returns the current tag or NULL if not available or we haven't seen the |
| 279 | * entire tag yet. |
| 280 | * |
| 281 | * There is no stack implemented because we currently don't have a need for |
| 282 | * it, which means tag names are tracked only one level deep. |
| 283 | * |
| 284 | * This is better understood by looking at the following example: |
| 285 | * |
| 286 | * <b [tag=b]> |
| 287 | * [tag=b] |
| 288 | * <i> |
| 289 | * [tag=i] |
| 290 | * </i> |
| 291 | * [tag=NULL] |
| 292 | * </b> |
| 293 | * |
| 294 | * The tag is correctly filled inside the tag itself and before any new inner |
| 295 | * tag is closed, at which point the tag will be null. |
| 296 | * |
| 297 | * For our current purposes this is not a problem, but we may implement a tag |
| 298 | * tracking stack in the future for completeness. |
| 299 | * |
| 300 | */ |
| 301 | extern CTEMPLATE_DLL_DECL |
| 302 | const char *htmlparser_tag(htmlparser_ctx *ctx); |
| 303 | |
| 304 | /* Returns the current attribute name if after an attribute name or in an |
| 305 | * attribute value. Returns NULL otherwise. */ |
| 306 | extern CTEMPLATE_DLL_DECL |
| 307 | const char *htmlparser_attr(htmlparser_ctx *ctx); |
| 308 | |
| 309 | /* Returns the contents of the current attribute value. |
| 310 | * |
| 311 | * Returns NULL if not inside an attribute value. |
| 312 | */ |
| 313 | extern CTEMPLATE_DLL_DECL |
| 314 | const char *htmlparser_value(htmlparser_ctx *ctx); |
| 315 | |
| 316 | /* Returns true if the parser is currently inside a CSS construct. |
| 317 | * |
| 318 | * Currently this can be either a STYLE tag, a STYLE attribute or the fact that |
| 319 | * the parser was reset in HTMLPARSER_MODE_CSS using |
| 320 | * htmlparser_reset_mode(). |
| 321 | */ |
| 322 | extern CTEMPLATE_DLL_DECL |
| 323 | int htmlparser_in_css(htmlparser_ctx *ctx); |
| 324 | |
| 325 | /* Returns the current state of the javascript state machine. |
| 326 | * |
| 327 | * Currently only present for testing purposes. |
| 328 | */ |
| 329 | extern CTEMPLATE_DLL_DECL |
| 330 | int htmlparser_js_state(htmlparser_ctx *ctx); |
| 331 | |
| 332 | /* Returns non-zero if currently inside a javascript string literal and zero |
| 333 | * otherwise. |
| 334 | */ |
| 335 | extern CTEMPLATE_DLL_DECL |
| 336 | int htmlparser_is_js_quoted(htmlparser_ctx *ctx); |
| 337 | |
| 338 | /* Returns the offset into the current attribute value (0 is its first character). |
| 339 | */ |
| 340 | extern CTEMPLATE_DLL_DECL |
| 341 | int htmlparser_value_index(htmlparser_ctx *ctx); |
| 342 | |
| 343 | /* Returns true if this is the first character of a url inside an attribute. |
| 344 | * |
| 345 | * This function can be used by an html sanitizer or auto escaping system as a |
| 346 | * hint that it should validate the url for a whitelist of protocol handlers and |
| 347 | * for well-formedness, or that it should just escape a component of it. |
| 348 | * |
| 349 | * For attributes that expect a URL, this will return true if we are at the |
| 350 | * first character of the URL, false otherwise. |
| 351 | * For most attributes, this is the same as checking that we are at the first |
| 352 | * character of the attribute value but it also works correctly for the |
| 353 | * "content" attribute of the "meta" tag where the URL follows some earlier |
| 354 | * content. |
| 355 | * e.g: <meta http-equiv="refresh" content="5; URL=http://bla."> |
| 356 | * |
| 357 | * For any other attributes, the result will always be false. |
| 358 | */ |
| 359 | extern CTEMPLATE_DLL_DECL |
| 360 | int htmlparser_is_url_start(htmlparser_ctx *ctx); |
| 361 | |
| 362 | /* Returns the current attribute type. |
| 363 | * |
| 364 | * The attribute type can be one of: |
| 365 | * HTMLPARSER_ATTR_NONE - not inside an attribute. |
| 366 | * HTMLPARSER_ATTR_REGULAR - Inside a normal attribute. |
| 367 | * HTMLPARSER_ATTR_URI - Inside an attribute that accepts a uri. |
| 368 | * HTMLPARSER_ATTR_JS - Inside a javascript attribute. |
| 369 | * HTMLPARSER_ATTR_STYLE - Inside a css style attribute. |
| 370 | */ |
| 371 | extern CTEMPLATE_DLL_DECL |
| 372 | int htmlparser_attr_type(htmlparser_ctx *ctx); |
| 373 | |
| 374 | /* Return the current line number. */ |
| 375 | extern CTEMPLATE_DLL_DECL |
| 376 | int htmlparser_get_line_number(htmlparser_ctx *ctx); |
| 377 | |
| 378 | /* Set the current line number. */ |
| 379 | extern CTEMPLATE_DLL_DECL |
| 380 | void htmlparser_set_line_number(htmlparser_ctx *ctx, int line); |
| 381 | |
| 382 | /* Return the current column number. */ |
| 383 | extern CTEMPLATE_DLL_DECL |
| 384 | int htmlparser_get_column_number(htmlparser_ctx *ctx); |
| 385 | |
| 386 | /* Set the current column number. */ |
| 387 | extern CTEMPLATE_DLL_DECL |
| 388 | void htmlparser_set_column_number(htmlparser_ctx *ctx, int column); |
| 389 | |
| 390 | /* Retrieve a human readable error message in case an error occurred. |
| 391 | * |
| 392 | * NULL is returned if the parser didn't encounter an error. |
| 393 | */ |
| 394 | extern CTEMPLATE_DLL_DECL |
| 395 | const char *htmlparser_get_error_msg(htmlparser_ctx *ctx); |
| 396 | |
| 397 | /* Invoked by the caller when text is expanded by the caller. |
| 398 | * |
| 399 | * Should be invoked when a template directive that expands to content is |
| 400 | * executed but we don't provide this content to the parser itself. This changes |
| 401 | * the current state by following the default rule, ensuring we stay in sync |
| 402 | * with the template. |
| 403 | * |
| 404 | * Returns 1 if template directives are accepted for this state and 0 if they |
| 405 | * are not, which should result in an error condition. |
| 406 | * |
| 407 | * Right now the only case being handled are unquoted attribute values and it |
| 408 | * always returns 1. When insert_text() is called after the equals sign, we |
| 409 | * assume some text was consumed and we are now in the middle of the attribute |
| 410 | * value itself. Example: |
| 411 | * |
| 412 | * <a href=$HREF_VALUE alt=alternate_text> |
| 413 | * |
| 414 | * The template calls insert_text() when it encounters $HREF_VALUE. If it didn't |
| 415 | * the parser would only have seen the following html: |
| 416 | * |
| 417 | * <a href= alt=alternate_text> |
| 418 | * |
| 419 | * and would interpret alt=alternate_text as the value of the href attribute. |
| 420 | */ |
| 421 | extern CTEMPLATE_DLL_DECL |
| 422 | int htmlparser_insert_text(htmlparser_ctx *ctx); |
| 423 | |
| 424 | /* Deallocates an htmlparser context object. |
| 425 | */ |
| 426 | extern CTEMPLATE_DLL_DECL |
| 427 | void htmlparser_delete(htmlparser_ctx *ctx); |
| 428 | |
/* Convenience wrappers around htmlparser_parse().
 *
 * htmlparser_parse_chr(a, b) feeds the single character b to the parser a.
 * b must be an lvalue because its address is taken.
 *
 * htmlparser_parse_str(a, b) feeds the NUL-terminated string b to the parser
 * a. b is evaluated twice (once as the data pointer and once by strlen()), so
 * it must not have side effects. The includer is responsible for having
 * strlen() in scope (<string.h>).
 *
 * The expansions carry no trailing semicolon and parenthesize their arguments
 * so that they behave like ordinary function-call expressions: they can be
 * used in conditions, assignments, return statements and unbraced if/else.
 */
#define htmlparser_parse_chr(a,b) htmlparser_parse((a), &(b), 1)
#ifdef __cplusplus
#define htmlparser_parse_str(a,b) htmlparser_parse((a), (b), \
                                                   static_cast<int>(strlen(b)))
#else
#define htmlparser_parse_str(a,b) htmlparser_parse((a), (b), (int)strlen(b))
#endif
| 436 | |
| 437 | #ifdef __cplusplus |
| 438 | } /* namespace ctemplate_htmlparser */ |
| 439 | #endif |
| 440 | |
| 441 | #endif /* SECURITY_STREAMHTMLPARSER_HTMLPARSER_H */ |