blob: 15be35d2e7ee9a20090e4448a6754abd312bb316 [file] [log] [blame]
/* Copyright (c) 2007, Google Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ---
*
* Author: falmeida@google.com (Filipe Almeida)
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "htmlparser/statemachine.h"
#include "htmlparser/jsparser.h"
/* So we can support both C and C++ compilers, we use the CAST() macro instead
* of using C style casts or static_cast<>() directly.
*/
#ifdef __cplusplus
#define CAST(type, expression) (static_cast<type>(expression))
#else
#define CAST(type, expression) ((type)(expression))
#endif
#ifdef __cplusplus
namespace ctemplate_htmlparser {
#endif /* __cplusplus */
/* Generated state machine definition. */
#include "htmlparser/jsparser_fsm.h"
/* List of keywords that can precede a regular expression literal. Taken from:
* http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
*
* 'void' was added to this list.
* The list is used as input to a binary search, so it must be kept in a sorted
* form.
* There are a large number of keywords in here that don't exist in
* ecmascript 3, either because they are reserved or because they are part of
* ecmascript 4. However they weren't removed in order to keep the list in sync
* with the previous document.
*/
static const char *regexp_token_prefix[] = {
"abstract",
"break",
"case",
"catch",
"class",
"const",
"continue",
"debugger",
"default",
"delete",
"do",
"else",
"enum",
"eval",
"export",
"extends",
"field",
"final",
"finally",
"for",
"function",
"goto",
"if",
"implements",
"import",
"in",
"instanceof",
"native",
"new",
"package",
"private",
"protected",
"public",
"return",
"static",
"switch",
"synchronized",
"throw",
"throws",
"transient",
"try",
"typeof",
"var",
"void",
"volatile",
"while",
"with"
};
/* Utility functions */
/* Converts the internal state into the external superstate.
*/
static inline int state_external(int state)
{
assert(state < JSPARSER_NUM_STATES);
assert(state >= 0);
return jsparser_states_external[state];
}
/* Returns true if the character is an ecmascript whitespace or line terminator
* character. Includes the characters in sections 7.2 and 7.3 of ECMAScript 3
* with the exception of unicode space and line terminators:
* http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
*/
static inline int js_is_whitespace(char c)
{
return c == '\t' || /* Tab 0x09 */
c == '\v' || /* Vertical Tab 0x0B */
c == '\f' || /* Form Feed 0x0C */
c == ' ' || /* Space 0x20 */
c == '\xa0' || /* No-Break Space 0xA0 */
c == '\n' || /* line Feed 0x0A */
c == '\r'; /* Carriage Return 0x0D */
}
/* Returns true if the character is part of a javascript identifier. The rules
* are pretty relaxed here since we don't accept unicode and don't care if the
* first character doesn't contain numbers or not.
*
* For more detail on the limitations of having this relaxed set of characters
* please see the comments in_state_js_text().
*/
static inline int js_is_identifier(char c) {
return (c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') ||
c == '_' ||
c == '$';
}
/* Appends a character to the ring buffer. Sequences of whitespace and newlines
* are folded into one.
*
* js->buffer_start points to the first character in the buffer and
* js->buffer_end points to the position of the next character to be written,
* or one plus the last character written. If the buffer is full there will be
* an empty slot position so we can diferentiate an empty buffer from a full
* buffer.
*
* If the buffer is empty, then:
* js->buffer_start == js->buffer_end.
* If the buffer is full, then:
* (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE == js->buffer_start.
*
*/
void jsparser_buffer_append_chr(jsparser_ctx *js, char chr)
{
/* Fold whitespace so we have enough space in the buffer. */
if (js_is_whitespace(chr) &&
js_is_whitespace(jsparser_buffer_get(js, -1))) {
return;
}
js->buffer[js->buffer_end] = chr;
js->buffer_end = (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE;
if (js->buffer_end == js->buffer_start) {
js->buffer_start = (js->buffer_end + 1) %
JSPARSER_RING_BUFFER_SIZE;
}
}
/* Appends a string to the ring buffer. Sequences of whitespace and newlines
* are folded into one.
*/
void jsparser_buffer_append_str(jsparser_ctx *js, const char *str)
{
assert(js != NULL);
assert(str != NULL);
while(*str != '\0') {
jsparser_buffer_append_chr(js, *str++);
}
}
/* Returns the position relative to the start of the buffer or -1 if past the
* size of the buffer..
*/
static inline int jsparser_buffer_absolute_pos(jsparser_ctx *js, int pos)
{
int absolute_pos;
int buffer_len;
assert(pos < 0);
if(pos <= -JSPARSER_RING_BUFFER_SIZE) {
return -1;
}
buffer_len = js->buffer_end - js->buffer_start;
if (buffer_len < 0) {
buffer_len += JSPARSER_RING_BUFFER_SIZE;
}
if (pos < -buffer_len) {
return -1;
}
absolute_pos = (pos + js->buffer_end) % JSPARSER_RING_BUFFER_SIZE;
if (absolute_pos < 0) {
absolute_pos += JSPARSER_RING_BUFFER_SIZE;
}
return absolute_pos;
}
/* Returns the last appended character and removes it from the buffer. If the
* buffer is empty, then it returns ASCII 0 ('\0').
*/
char jsparser_buffer_pop(jsparser_ctx *js)
{
if (js->buffer_start == js->buffer_end) {
return '\0';
}
js->buffer_end--;
if (js->buffer_end < 0) {
js->buffer_end += JSPARSER_RING_BUFFER_SIZE;
}
return js->buffer[js->buffer_end];
}
/* Returns the value of the character at a certain index in the buffer or an
* ASCII 0 ('\0') character if the index is outside the buffer boundaries.
*
* Index positions are negative, were -1 is the last character appended to the
* buffer. Using positive integers for the index will result in undefined
* behaviour.
*/
char jsparser_buffer_get(jsparser_ctx *js, int pos)
{
int absolute_pos;
assert(pos < 0);
absolute_pos = jsparser_buffer_absolute_pos(js, pos);
if (absolute_pos < 0) {
return '\0';
}
return js->buffer[absolute_pos];
}
/* Sets the value of the character at a certain index in the buffer. Returns
* true if the write was successful or false if there was an attempt to write
* outside of the buffer boundaries.
*
* Index positions are negative, were -1 is the last character appended to the
* buffer. Using positive integers for the index will result in undefined
* behaviour.
*/
int jsparser_buffer_set(jsparser_ctx *js, int pos, char value)
{
int absolute_pos;
assert(pos < 0);
absolute_pos = jsparser_buffer_absolute_pos(js, pos);
if (absolute_pos < 0) {
return 0;
}
js->buffer[absolute_pos] = value;
return 1;
}
/* Copies a slice of the buffer to the string pointed to by output. start and
* end are the indexes of the sliced region. If start extends beyond the
* beginning of the buffer, the slice will only contain character from the
* beginning of the buffer.
*/
void jsparser_buffer_slice(jsparser_ctx *js, char *output, int start, int end)
{
int pos;
assert(start <= end);
assert(start < 0);
assert(end < 0);
for (pos = start; pos <= end; ++pos) {
char c;
c = jsparser_buffer_get(js, pos);
if (c != '\0') {
*output++ = jsparser_buffer_get(js, pos);
}
}
*output++ = '\0';
}
/* Copy the last javascript identifier or keyword found in the buffer to the
* string pointed by identifier.
*
* For simplicity, we consider an identifier to be a sequence of alphanumeric
* characters (as opposed to a digit followed by an alphanumeric character.
*
* Returns 0 if no identifier was matched, in which case the identifier
* argument is replaced with an empty string, or non 0 if the identifier was
* found.
*/
int jsparser_buffer_last_identifier(jsparser_ctx *js, char *identifier)
{
int end;
int pos;
assert(identifier != NULL);
end = -1;
/* Ignore the optional whitespace delimiter */
if (js_is_whitespace(jsparser_buffer_get(js, -1))) {
--end;
}
/* Find the beginning of the identifier. This loop ends either when we find a
* character that doesn't belong to an identifier, or when we find a '\0'
* character, which means we reached the end of the buffer. */
for(pos = end; js_is_identifier(jsparser_buffer_get(js, pos)); --pos) {
}
if (pos + 1 >= end) {
identifier[0] = '\0';
return 0;
}
jsparser_buffer_slice(js, identifier, pos + 1, end);
return 1;
}
/* Callback used in bsearch() for comparing a string against an array of
* strings.
*/
static int bsearch_strcmp(const void *a, const void *b)
{
return strcmp(CAST(const char*, a), *CAST(const char * const *, b));
}
/* Returns true if the token argument can be a token prefix to a javascript
* regular expression.
*
* The token argument is compared against a list of identifiers that can
* precede a regular expression in the javascript grammar, and returns true if
* the argument is found on that list.
*/
static inline int is_regexp_token_prefix(char *token)
{
assert(token != NULL);
return bsearch(token,
regexp_token_prefix,
sizeof(regexp_token_prefix) / sizeof(char *),
sizeof(char *), bsearch_strcmp) != NULL;
}
/* Called for every character in state text.
*
* We copy every character we find when we are in state text to the ring
* buffer. This has the side effect of also pushing slash characters that are
* part of comments into the buffer, although for parsing purposes these should
* be treated as whitespace. This issue is addressed in
* enter_state_js_comment_ml_after().
*/
static void in_state_js_text(statemachine_ctx *ctx, int start, char chr,
int end)
{
jsparser_ctx *js = CAST(jsparser_ctx *, ctx->user);
assert(js != NULL);
jsparser_buffer_append_chr(js, chr);
}
/* This function is called every time we find a slash ('/') character in the
* javascript text (except for slashes that close comments or regexp literals).
*
* Implements the logic to figure out if this slash character is a division
* operator or if it opens a regular expression literal. This is heavily
* inspired by the syntactic resynchronization for javascript 2.0:
* http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
*
* When we receive a '/', we look at the previous non space character to figure
* out if it's the ending of a punctuator that can precede a regexp literal, in
* which case we assume the current '/' is part of a regular expression literal
* (or the opening of a javascript comment, but that part is dealt with in the
* state machine). The exceptions to this are unary operators, so we look back
* a second character to rule out '++' and '--'. Although it is not
* straightforward to figure out if the binary operator is a postfix of the
* previous expression or a prefix of the regular expression, we rule out the
* later as it is an uncommon practice.
*
* If we ruled out the previous token to be a valid regexp preceding
* punctuator, we extract the last identifier in the buffer and match against a
* list of keywords that are known to precede expressions in the grammar. If we
* get a match on any of these keywords, then we are opening a regular
* expression, if not, then we have a division operator.
*
* Known cases that are accepted by the grammar but we handle differently,
* although I don't believe there is a legitimate usage for those:
*
* Division of a regular expression:
* var result = /test/ / 5;
*
* Prefix unary increment of a regular expression:
* var result = ++/test/;
*
* Division of an object literal:
* { a: 1 } /x/.exec('x');
*
* We only support ascii right now, so unicode characters in identifiers will
* be treated as delimiters, effectively breaking the identifier name where
* they appear, and this may cause issues in very specific situations. Namely,
* if we have a unicode character in an identifier directly preceding a suffix
* that matches one of the keywords in regexp_token_prefix[], if this
* identifier precedes a / (slash) character:
*
* var x = test<unicode char>return / 5;
*
* We will interpret that slash as the start of a regular expression, when in
* reality it is a division operator.
*/
static void enter_state_js_slash(statemachine_ctx *ctx, int start, char chr,
int end)
{
jsparser_ctx *js;
char buffer[JSPARSER_RING_BUFFER_SIZE];
int pos;
assert(ctx != NULL);
assert(ctx->user != NULL);
js = CAST(jsparser_ctx *, ctx->user);
assert(js != NULL);
pos = -1;
/* Consume the last whitespace. */
if (js_is_whitespace(jsparser_buffer_get(js, pos))) {
--pos;
}
switch (jsparser_buffer_get(js, pos)) {
/* Ignore unary increment */
case '+':
if (jsparser_buffer_get(js, pos - 1) != '+') {
ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
}
break;
/* Ignore unary decrement */
case '-':
if (jsparser_buffer_get(js, pos - 1) != '-') {
ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
}
break;
/* List of punctuator endings except ), ], }, + and - */
case '=':
case '<':
case '>':
case '&':
case '|':
case '!':
case '%':
case '*':
case '/':
case ',':
case ';':
case '?':
case ':':
case '^':
case '~':
case '{':
case '(':
case '[':
case '}':
case '\0':
ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
break;
default:
if (jsparser_buffer_last_identifier(js, buffer) &&
is_regexp_token_prefix(buffer)) {
ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
}
}
jsparser_buffer_append_chr(js, chr);
}
/* Called at the end of a javascript comment.
*
* When we open a comment, the initial '/' was inserted into the ring buffer,
* but it is not a token and should be considered whitespace for parsing
* purposes.
*
* When we first saw the '/' character, we didn't yet know if it was the
* beginning of a comment, a division operator, or a regexp.
*
* In this function we just replace the inital '/' with a whitespace character,
* unless we had a preceding whitespace character, in which case we just remove
* the '/'. This is needed to ensure all spaces in the buffer are correctly
* folded.
*/
static void enter_state_js_comment_after(statemachine_ctx *ctx, int start,
char chr, int end)
{
jsparser_ctx *js;
assert(ctx != NULL);
assert(ctx->user != NULL);
js = CAST(jsparser_ctx *, ctx->user);
if (js_is_whitespace(jsparser_buffer_get(js, -2))) {
(void)jsparser_buffer_pop(js);
} else {
jsparser_buffer_set(js, -1, ' ');
}
}
static statemachine_definition *create_statemachine_definition()
{
statemachine_definition *def;
def = statemachine_definition_new(JSPARSER_NUM_STATES);
if (def == NULL)
return NULL;
/* TODO(falmeida): Check return value. */
statemachine_definition_populate(def, jsparser_state_transitions,
jsparser_states_internal_names);
statemachine_in_state(def, JSPARSER_STATE_INT_JS_TEXT,
in_state_js_text);
statemachine_enter_state(def, JSPARSER_STATE_INT_JS_SLASH,
enter_state_js_slash);
statemachine_enter_state(def, JSPARSER_STATE_INT_JS_COMMENT_AFTER,
enter_state_js_comment_after);
return def;
}
/* Initializes a new jsparser instance.
*
* Returns a pointer to the new instance or NULL if the initialization
* fails.
*
* Initialization failure is fatal, and if this function fails it may not
* deallocate all previsouly allocated memory.
*/
jsparser_ctx *jsparser_new()
{
jsparser_ctx *js;
js = CAST(jsparser_ctx *, calloc(1, sizeof(jsparser_ctx)));
if (js == NULL)
return NULL;
js->statemachine_def = create_statemachine_definition();
if (js->statemachine_def == NULL)
return NULL;
js->statemachine = statemachine_new(js->statemachine_def, js);
if (js->statemachine == NULL)
return NULL;
jsparser_reset(js);
return js;
}
/* Returns a pointer to a context which is a duplicate of the jsparser src.
*/
jsparser_ctx *jsparser_duplicate(jsparser_ctx *src)
{
jsparser_ctx *dst;
assert(src != NULL);
dst = jsparser_new();
if (dst == NULL)
return NULL;
jsparser_copy(dst, src);
return dst;
}
/* Copies the context of the jsparser pointed to by src to the jsparser dst.
*
* The state machine definition is preserved since it is read only.
*/
void jsparser_copy(jsparser_ctx *dst, jsparser_ctx *src)
{
dst->buffer_start = src->buffer_start;
dst->buffer_end = src->buffer_end;
memcpy(dst->buffer, src->buffer, sizeof(src->buffer));
statemachine_copy(dst->statemachine,
src->statemachine,
dst->statemachine_def,
dst);
}
void jsparser_reset(jsparser_ctx *ctx)
{
assert(ctx != NULL);
ctx->statemachine->current_state = 0;
ctx->buffer_start = 0;
ctx->buffer_end = 0;
}
int jsparser_state(jsparser_ctx *ctx)
{
return state_external(ctx->statemachine->current_state);
}
int jsparser_parse(jsparser_ctx *ctx, const char *str, int size)
{
int internal_state;
internal_state = statemachine_parse(ctx->statemachine, str, size);
return state_external(internal_state);
}
void jsparser_delete(jsparser_ctx *ctx)
{
assert(ctx != NULL);
statemachine_delete(ctx->statemachine);
statemachine_definition_delete(ctx->statemachine_def);
free(ctx);
}
#ifdef __cplusplus
} /* namespace security_streamhtmlparser */
#endif /* __cplusplus */