| /* Copyright (c) 2007, Google Inc. |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following disclaimer |
| * in the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Google Inc. nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * --- |
| * |
| * Author: falmeida@google.com (Filipe Almeida) |
| */ |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <assert.h> |
| |
| #include "htmlparser/statemachine.h" |
| #include "htmlparser/jsparser.h" |
| |
| /* So we can support both C and C++ compilers, we use the CAST() macro instead |
| * of using C style casts or static_cast<>() directly. |
| */ |
| #ifdef __cplusplus |
| #define CAST(type, expression) (static_cast<type>(expression)) |
| #else |
| #define CAST(type, expression) ((type)(expression)) |
| #endif |
| |
| #ifdef __cplusplus |
| namespace ctemplate_htmlparser { |
| #endif /* __cplusplus */ |
| |
| /* Generated state machine definition. */ |
| #include "htmlparser/jsparser_fsm.h" |
| |
| /* List of keywords that can precede a regular expression literal. Taken from: |
| * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html |
| * |
| * 'void' was added to this list. |
| * The list is used as input to a binary search, so it must be kept in a sorted |
| * form. |
| * There are a large number of keywords in here that don't exist in |
| * ecmascript 3, either because they are reserved or because they are part of |
| * ecmascript 4. However they weren't removed in order to keep the list in sync |
| * with the previous document. |
| */ |
| static const char *regexp_token_prefix[] = { |
| "abstract", |
| "break", |
| "case", |
| "catch", |
| "class", |
| "const", |
| "continue", |
| "debugger", |
| "default", |
| "delete", |
| "do", |
| "else", |
| "enum", |
| "eval", |
| "export", |
| "extends", |
| "field", |
| "final", |
| "finally", |
| "for", |
| "function", |
| "goto", |
| "if", |
| "implements", |
| "import", |
| "in", |
| "instanceof", |
| "native", |
| "new", |
| "package", |
| "private", |
| "protected", |
| "public", |
| "return", |
| "static", |
| "switch", |
| "synchronized", |
| "throw", |
| "throws", |
| "transient", |
| "try", |
| "typeof", |
| "var", |
| "void", |
| "volatile", |
| "while", |
| "with" |
| }; |
| |
| /* Utility functions */ |
| |
| /* Converts the internal state into the external superstate. |
| */ |
| static inline int state_external(int state) |
| { |
| assert(state < JSPARSER_NUM_STATES); |
| assert(state >= 0); |
| return jsparser_states_external[state]; |
| } |
| |
| /* Returns true if the character is an ecmascript whitespace or line terminator |
| * character. Includes the characters in sections 7.2 and 7.3 of ECMAScript 3 |
| * with the exception of unicode space and line terminators: |
| * http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf |
| */ |
| static inline int js_is_whitespace(char c) |
| { |
| return c == '\t' || /* Tab 0x09 */ |
| c == '\v' || /* Vertical Tab 0x0B */ |
| c == '\f' || /* Form Feed 0x0C */ |
| c == ' ' || /* Space 0x20 */ |
| c == '\xa0' || /* No-Break Space 0xA0 */ |
| c == '\n' || /* line Feed 0x0A */ |
| c == '\r'; /* Carriage Return 0x0D */ |
| } |
| |
| /* Returns true if the character is part of a javascript identifier. The rules |
| * are pretty relaxed here since we don't accept unicode and don't care if the |
| * first character doesn't contain numbers or not. |
| * |
| * For more detail on the limitations of having this relaxed set of characters |
| * please see the comments in_state_js_text(). |
| */ |
| static inline int js_is_identifier(char c) { |
| return (c >= 'a' && c <= 'z') || |
| (c >= 'A' && c <= 'Z') || |
| (c >= '0' && c <= '9') || |
| c == '_' || |
| c == '$'; |
| } |
| |
| /* Appends a character to the ring buffer. Sequences of whitespace and newlines |
| * are folded into one. |
| * |
| * js->buffer_start points to the first character in the buffer and |
| * js->buffer_end points to the position of the next character to be written, |
| * or one plus the last character written. If the buffer is full there will be |
| * an empty slot position so we can diferentiate an empty buffer from a full |
| * buffer. |
| * |
| * If the buffer is empty, then: |
| * js->buffer_start == js->buffer_end. |
| * If the buffer is full, then: |
| * (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE == js->buffer_start. |
| * |
| */ |
| void jsparser_buffer_append_chr(jsparser_ctx *js, char chr) |
| { |
| /* Fold whitespace so we have enough space in the buffer. */ |
| if (js_is_whitespace(chr) && |
| js_is_whitespace(jsparser_buffer_get(js, -1))) { |
| return; |
| } |
| |
| js->buffer[js->buffer_end] = chr; |
| js->buffer_end = (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE; |
| if (js->buffer_end == js->buffer_start) { |
| js->buffer_start = (js->buffer_end + 1) % |
| JSPARSER_RING_BUFFER_SIZE; |
| } |
| } |
| |
| /* Appends a string to the ring buffer. Sequences of whitespace and newlines |
| * are folded into one. |
| */ |
| void jsparser_buffer_append_str(jsparser_ctx *js, const char *str) |
| { |
| assert(js != NULL); |
| assert(str != NULL); |
| |
| while(*str != '\0') { |
| jsparser_buffer_append_chr(js, *str++); |
| } |
| } |
| |
| /* Returns the position relative to the start of the buffer or -1 if past the |
| * size of the buffer.. |
| */ |
| static inline int jsparser_buffer_absolute_pos(jsparser_ctx *js, int pos) |
| { |
| int absolute_pos; |
| int buffer_len; |
| assert(pos < 0); |
| |
| if(pos <= -JSPARSER_RING_BUFFER_SIZE) { |
| return -1; |
| } |
| |
| buffer_len = js->buffer_end - js->buffer_start; |
| if (buffer_len < 0) { |
| buffer_len += JSPARSER_RING_BUFFER_SIZE; |
| } |
| |
| if (pos < -buffer_len) { |
| return -1; |
| } |
| |
| absolute_pos = (pos + js->buffer_end) % JSPARSER_RING_BUFFER_SIZE; |
| if (absolute_pos < 0) { |
| absolute_pos += JSPARSER_RING_BUFFER_SIZE; |
| } |
| |
| return absolute_pos; |
| } |
| |
| /* Returns the last appended character and removes it from the buffer. If the |
| * buffer is empty, then it returns ASCII 0 ('\0'). |
| */ |
| char jsparser_buffer_pop(jsparser_ctx *js) |
| { |
| if (js->buffer_start == js->buffer_end) { |
| return '\0'; |
| } |
| |
| js->buffer_end--; |
| if (js->buffer_end < 0) { |
| js->buffer_end += JSPARSER_RING_BUFFER_SIZE; |
| } |
| |
| return js->buffer[js->buffer_end]; |
| } |
| |
| /* Returns the value of the character at a certain index in the buffer or an |
| * ASCII 0 ('\0') character if the index is outside the buffer boundaries. |
| * |
| * Index positions are negative, were -1 is the last character appended to the |
| * buffer. Using positive integers for the index will result in undefined |
| * behaviour. |
| */ |
| char jsparser_buffer_get(jsparser_ctx *js, int pos) |
| { |
| int absolute_pos; |
| assert(pos < 0); |
| |
| absolute_pos = jsparser_buffer_absolute_pos(js, pos); |
| if (absolute_pos < 0) { |
| return '\0'; |
| } |
| |
| return js->buffer[absolute_pos]; |
| } |
| |
| /* Sets the value of the character at a certain index in the buffer. Returns |
| * true if the write was successful or false if there was an attempt to write |
| * outside of the buffer boundaries. |
| * |
| * Index positions are negative, were -1 is the last character appended to the |
| * buffer. Using positive integers for the index will result in undefined |
| * behaviour. |
| */ |
| int jsparser_buffer_set(jsparser_ctx *js, int pos, char value) |
| { |
| int absolute_pos; |
| assert(pos < 0); |
| |
| absolute_pos = jsparser_buffer_absolute_pos(js, pos); |
| if (absolute_pos < 0) { |
| return 0; |
| } |
| |
| js->buffer[absolute_pos] = value; |
| return 1; |
| } |
| |
| /* Copies a slice of the buffer to the string pointed to by output. start and |
| * end are the indexes of the sliced region. If start extends beyond the |
| * beginning of the buffer, the slice will only contain character from the |
| * beginning of the buffer. |
| */ |
| void jsparser_buffer_slice(jsparser_ctx *js, char *output, int start, int end) |
| { |
| int pos; |
| |
| assert(start <= end); |
| assert(start < 0); |
| assert(end < 0); |
| |
| for (pos = start; pos <= end; ++pos) { |
| char c; |
| c = jsparser_buffer_get(js, pos); |
| if (c != '\0') { |
| *output++ = jsparser_buffer_get(js, pos); |
| } |
| } |
| *output++ = '\0'; |
| } |
| |
| /* Copy the last javascript identifier or keyword found in the buffer to the |
| * string pointed by identifier. |
| * |
| * For simplicity, we consider an identifier to be a sequence of alphanumeric |
| * characters (as opposed to a digit followed by an alphanumeric character. |
| * |
| * Returns 0 if no identifier was matched, in which case the identifier |
| * argument is replaced with an empty string, or non 0 if the identifier was |
| * found. |
| */ |
| int jsparser_buffer_last_identifier(jsparser_ctx *js, char *identifier) |
| { |
| int end; |
| int pos; |
| |
| assert(identifier != NULL); |
| |
| end = -1; |
| /* Ignore the optional whitespace delimiter */ |
| if (js_is_whitespace(jsparser_buffer_get(js, -1))) { |
| --end; |
| } |
| |
| /* Find the beginning of the identifier. This loop ends either when we find a |
| * character that doesn't belong to an identifier, or when we find a '\0' |
| * character, which means we reached the end of the buffer. */ |
| for(pos = end; js_is_identifier(jsparser_buffer_get(js, pos)); --pos) { |
| } |
| if (pos + 1 >= end) { |
| identifier[0] = '\0'; |
| return 0; |
| } |
| |
| jsparser_buffer_slice(js, identifier, pos + 1, end); |
| return 1; |
| } |
| |
| /* Callback used in bsearch() for comparing a string against an array of |
| * strings. |
| */ |
| static int bsearch_strcmp(const void *a, const void *b) |
| { |
| return strcmp(CAST(const char*, a), *CAST(const char * const *, b)); |
| } |
| |
| /* Returns true if the token argument can be a token prefix to a javascript |
| * regular expression. |
| * |
| * The token argument is compared against a list of identifiers that can |
| * precede a regular expression in the javascript grammar, and returns true if |
| * the argument is found on that list. |
| */ |
| static inline int is_regexp_token_prefix(char *token) |
| { |
| assert(token != NULL); |
| |
| return bsearch(token, |
| regexp_token_prefix, |
| sizeof(regexp_token_prefix) / sizeof(char *), |
| sizeof(char *), bsearch_strcmp) != NULL; |
| } |
| |
| /* Called for every character in state text. |
| * |
| * We copy every character we find when we are in state text to the ring |
| * buffer. This has the side effect of also pushing slash characters that are |
| * part of comments into the buffer, although for parsing purposes these should |
| * be treated as whitespace. This issue is addressed in |
| * enter_state_js_comment_ml_after(). |
| */ |
| static void in_state_js_text(statemachine_ctx *ctx, int start, char chr, |
| int end) |
| { |
| jsparser_ctx *js = CAST(jsparser_ctx *, ctx->user); |
| assert(js != NULL); |
| |
| jsparser_buffer_append_chr(js, chr); |
| } |
| |
| /* This function is called every time we find a slash ('/') character in the |
| * javascript text (except for slashes that close comments or regexp literals). |
| * |
| * Implements the logic to figure out if this slash character is a division |
| * operator or if it opens a regular expression literal. This is heavily |
| * inspired by the syntactic resynchronization for javascript 2.0: |
| * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html |
| * |
| * When we receive a '/', we look at the previous non space character to figure |
| * out if it's the ending of a punctuator that can precede a regexp literal, in |
| * which case we assume the current '/' is part of a regular expression literal |
| * (or the opening of a javascript comment, but that part is dealt with in the |
| * state machine). The exceptions to this are unary operators, so we look back |
| * a second character to rule out '++' and '--'. Although it is not |
| * straightforward to figure out if the binary operator is a postfix of the |
| * previous expression or a prefix of the regular expression, we rule out the |
| * later as it is an uncommon practice. |
| * |
| * If we ruled out the previous token to be a valid regexp preceding |
| * punctuator, we extract the last identifier in the buffer and match against a |
| * list of keywords that are known to precede expressions in the grammar. If we |
| * get a match on any of these keywords, then we are opening a regular |
| * expression, if not, then we have a division operator. |
| * |
| * Known cases that are accepted by the grammar but we handle differently, |
| * although I don't believe there is a legitimate usage for those: |
| * |
| * Division of a regular expression: |
| * var result = /test/ / 5; |
| * |
| * Prefix unary increment of a regular expression: |
| * var result = ++/test/; |
| * |
| * Division of an object literal: |
| * { a: 1 } /x/.exec('x'); |
| * |
| * We only support ascii right now, so unicode characters in identifiers will |
| * be treated as delimiters, effectively breaking the identifier name where |
| * they appear, and this may cause issues in very specific situations. Namely, |
| * if we have a unicode character in an identifier directly preceding a suffix |
| * that matches one of the keywords in regexp_token_prefix[], if this |
| * identifier precedes a / (slash) character: |
| * |
| * var x = test<unicode char>return / 5; |
| * |
| * We will interpret that slash as the start of a regular expression, when in |
| * reality it is a division operator. |
| */ |
| static void enter_state_js_slash(statemachine_ctx *ctx, int start, char chr, |
| int end) |
| { |
| jsparser_ctx *js; |
| char buffer[JSPARSER_RING_BUFFER_SIZE]; |
| int pos; |
| |
| assert(ctx != NULL); |
| assert(ctx->user != NULL); |
| |
| js = CAST(jsparser_ctx *, ctx->user); |
| assert(js != NULL); |
| |
| pos = -1; |
| /* Consume the last whitespace. */ |
| if (js_is_whitespace(jsparser_buffer_get(js, pos))) { |
| --pos; |
| } |
| |
| switch (jsparser_buffer_get(js, pos)) { |
| /* Ignore unary increment */ |
| case '+': |
| if (jsparser_buffer_get(js, pos - 1) != '+') { |
| ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH; |
| } |
| break; |
| |
| /* Ignore unary decrement */ |
| case '-': |
| if (jsparser_buffer_get(js, pos - 1) != '-') { |
| ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH; |
| } |
| break; |
| |
| /* List of punctuator endings except ), ], }, + and - */ |
| case '=': |
| case '<': |
| case '>': |
| case '&': |
| case '|': |
| case '!': |
| case '%': |
| case '*': |
| case '/': |
| case ',': |
| case ';': |
| case '?': |
| case ':': |
| case '^': |
| case '~': |
| case '{': |
| case '(': |
| case '[': |
| case '}': |
| case '\0': |
| ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH; |
| break; |
| |
| default: |
| if (jsparser_buffer_last_identifier(js, buffer) && |
| is_regexp_token_prefix(buffer)) { |
| ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH; |
| } |
| } |
| |
| jsparser_buffer_append_chr(js, chr); |
| } |
| |
| /* Called at the end of a javascript comment. |
| * |
| * When we open a comment, the initial '/' was inserted into the ring buffer, |
| * but it is not a token and should be considered whitespace for parsing |
| * purposes. |
| * |
| * When we first saw the '/' character, we didn't yet know if it was the |
| * beginning of a comment, a division operator, or a regexp. |
| * |
| * In this function we just replace the inital '/' with a whitespace character, |
| * unless we had a preceding whitespace character, in which case we just remove |
| * the '/'. This is needed to ensure all spaces in the buffer are correctly |
| * folded. |
| */ |
| static void enter_state_js_comment_after(statemachine_ctx *ctx, int start, |
| char chr, int end) |
| { |
| jsparser_ctx *js; |
| |
| assert(ctx != NULL); |
| assert(ctx->user != NULL); |
| |
| js = CAST(jsparser_ctx *, ctx->user); |
| |
| if (js_is_whitespace(jsparser_buffer_get(js, -2))) { |
| (void)jsparser_buffer_pop(js); |
| } else { |
| jsparser_buffer_set(js, -1, ' '); |
| } |
| } |
| |
| static statemachine_definition *create_statemachine_definition() |
| { |
| statemachine_definition *def; |
| def = statemachine_definition_new(JSPARSER_NUM_STATES); |
| if (def == NULL) |
| return NULL; |
| |
| /* TODO(falmeida): Check return value. */ |
| statemachine_definition_populate(def, jsparser_state_transitions, |
| jsparser_states_internal_names); |
| |
| statemachine_in_state(def, JSPARSER_STATE_INT_JS_TEXT, |
| in_state_js_text); |
| statemachine_enter_state(def, JSPARSER_STATE_INT_JS_SLASH, |
| enter_state_js_slash); |
| statemachine_enter_state(def, JSPARSER_STATE_INT_JS_COMMENT_AFTER, |
| enter_state_js_comment_after); |
| |
| return def; |
| } |
| |
| |
| /* Initializes a new jsparser instance. |
| * |
| * Returns a pointer to the new instance or NULL if the initialization |
| * fails. |
| * |
| * Initialization failure is fatal, and if this function fails it may not |
| * deallocate all previsouly allocated memory. |
| */ |
| |
| jsparser_ctx *jsparser_new() |
| { |
| jsparser_ctx *js; |
| |
| js = CAST(jsparser_ctx *, calloc(1, sizeof(jsparser_ctx))); |
| if (js == NULL) |
| return NULL; |
| |
| js->statemachine_def = create_statemachine_definition(); |
| if (js->statemachine_def == NULL) |
| return NULL; |
| |
| js->statemachine = statemachine_new(js->statemachine_def, js); |
| if (js->statemachine == NULL) |
| return NULL; |
| |
| jsparser_reset(js); |
| |
| return js; |
| } |
| |
| /* Returns a pointer to a context which is a duplicate of the jsparser src. |
| */ |
| jsparser_ctx *jsparser_duplicate(jsparser_ctx *src) |
| { |
| jsparser_ctx *dst; |
| assert(src != NULL); |
| |
| dst = jsparser_new(); |
| if (dst == NULL) |
| return NULL; |
| |
| jsparser_copy(dst, src); |
| |
| return dst; |
| } |
| |
| /* Copies the context of the jsparser pointed to by src to the jsparser dst. |
| * |
| * The state machine definition is preserved since it is read only. |
| */ |
| void jsparser_copy(jsparser_ctx *dst, jsparser_ctx *src) |
| { |
| |
| dst->buffer_start = src->buffer_start; |
| dst->buffer_end = src->buffer_end; |
| memcpy(dst->buffer, src->buffer, sizeof(src->buffer)); |
| |
| statemachine_copy(dst->statemachine, |
| src->statemachine, |
| dst->statemachine_def, |
| dst); |
| |
| } |
| |
| void jsparser_reset(jsparser_ctx *ctx) |
| { |
| assert(ctx != NULL); |
| ctx->statemachine->current_state = 0; |
| ctx->buffer_start = 0; |
| ctx->buffer_end = 0; |
| } |
| |
| int jsparser_state(jsparser_ctx *ctx) |
| { |
| return state_external(ctx->statemachine->current_state); |
| } |
| |
| int jsparser_parse(jsparser_ctx *ctx, const char *str, int size) |
| { |
| int internal_state; |
| internal_state = statemachine_parse(ctx->statemachine, str, size); |
| return state_external(internal_state); |
| } |
| |
| void jsparser_delete(jsparser_ctx *ctx) |
| { |
| assert(ctx != NULL); |
| statemachine_delete(ctx->statemachine); |
| statemachine_definition_delete(ctx->statemachine_def); |
| free(ctx); |
| } |
| |
| #ifdef __cplusplus |
| } /* namespace security_streamhtmlparser */ |
| #endif /* __cplusplus */ |