| /* |
| * Copyright (c) 2007, Google Inc. |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following disclaimer |
| * in the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Google Inc. nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * --- |
| * |
| * Author: falmeida@google.com (Filipe Almeida) |
| */ |
| |
| /* TODO(falmeida): Breaks on NULL characters in the stream. fix. |
| */ |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <ctype.h> |
| #include <assert.h> |
| |
| #include "htmlparser/statemachine.h" |
| #include "htmlparser/htmlparser.h" |
| #include "htmlparser/jsparser.h" |
| |
| /* So we can support both C and C++ compilers, we use the CAST() macro instead |
| * of using C style casts or static_cast<>() directly. |
| */ |
| #ifdef __cplusplus |
| #define CAST(type, expression) (static_cast<type>(expression)) |
| #else |
| #define CAST(type, expression) ((type)(expression)) |
| #endif |
| |
| #ifdef __cplusplus |
| namespace ctemplate_htmlparser { |
| #endif |
| |
| /* Generated state machine definition. */ |
| #include "htmlparser/htmlparser_fsm.h" |
| |
| #define is_js_attribute(attr) ((attr)[0] == 'o' && (attr)[1] == 'n') |
| #define is_style_attribute(attr) (strcmp((attr), "style") == 0) |
| |
| /* html entity filter */ |
| static struct entityfilter_table_s { |
| const char *entity; |
| const char *value; |
| } entityfilter_table[] = { |
| { "lt", "<" }, |
| { "gt", ">" }, |
| { "quot", "\"" }, |
| { "amp", "&" }, |
| { "apos", "\'" }, |
| { NULL, NULL } |
| }; |
| |
| /* Utility functions */ |
| |
| /* Similar to strncpy() but avoids the NULL padding. */ |
| static inline void nopad_strncpy(char *dst, const char *src, size_t dst_size, |
| size_t src_size) |
| { |
| size_t size; |
| |
| /* size = min(dst_size, src_size) */ |
| size = dst_size > src_size ? src_size : dst_size; |
| strncpy(dst, src, size); |
| if (size > 0) |
| dst[size - 1] = '\0'; |
| } |
| |
| /* Converts the internal state into the external superstate. |
| */ |
| static int state_external(int st) |
| { |
| if (st == STATEMACHINE_ERROR) |
| return HTMLPARSER_STATE_ERROR; |
| else |
| return htmlparser_states_external[st]; |
| } |
| |
| /* Returns true if the character is considered an html whitespace character. |
| * |
| * From: http://www.w3.org/TR/html401/struct/text.html#h-9.1 |
| */ |
| static inline int html_isspace(char chr) |
| { |
| if (chr == ' ' || chr == '\t' || chr == '\n' || chr == '\r') { |
| return 1; |
| } else { |
| return 0; |
| } |
| } |
| |
| /* Returns true if the attribute is expected to contain a url |
| * This list was taken from: http://www.w3.org/TR/html4/index/attributes.html |
| */ |
| static int is_uri_attribute(char *attr) |
| { |
| if (attr == NULL) |
| return 0; |
| |
| switch (attr[0]) { |
| case 'a': |
| if (strcmp(attr, "action") == 0) |
| return 1; |
| /* TODO(falmeida): This is a uri list. Should we treat it diferently? */ |
| if (strcmp(attr, "archive") == 0) /* This is a uri list */ |
| return 1; |
| break; |
| |
| case 'b': |
| if (strcmp(attr, "background") == 0) |
| return 1; |
| break; |
| |
| case 'c': |
| if (strcmp(attr, "cite") == 0) |
| return 1; |
| if (strcmp(attr, "classid") == 0) |
| return 1; |
| if (strcmp(attr, "codebase") == 0) |
| return 1; |
| break; |
| |
| case 'd': |
| if (strcmp(attr, "data") == 0) |
| return 1; |
| if (strcmp(attr, "dynsrc") == 0) /* from msdn */ |
| return 1; |
| break; |
| |
| case 'h': |
| if (strcmp(attr, "href") == 0) |
| return 1; |
| break; |
| |
| case 'l': |
| if (strcmp(attr, "longdesc") == 0) |
| return 1; |
| break; |
| |
| case 's': |
| if (strcmp(attr, "src") == 0) |
| return 1; |
| break; |
| |
| case 'u': |
| if (strcmp(attr, "usemap") == 0) |
| return 1; |
| break; |
| } |
| |
| return 0; |
| |
| } |
| |
| /* Convert a string to lower case characters inplace. |
| */ |
| static void tolower_str(char *s) |
| { |
| while (*s != '\0') { |
| *s = CAST(char, tolower(CAST(unsigned char,*s))); |
| s++; |
| } |
| } |
| |
| static const char *ignore_spaces_or_digits(const char *value) { |
| while (html_isspace(*value) || ((*value >= '0' && *value <= '9'))) |
| value++; |
| |
| return value; |
| } |
| |
| static const char *ignore_spaces(const char *value) { |
| while (html_isspace(*value)) |
| value++; |
| |
| return value; |
| } |
| |
| /* Return type of the function meta_redirect_type. |
| */ |
| enum meta_redirect_type_enum { |
| META_REDIRECT_TYPE_NONE, |
| META_REDIRECT_TYPE_URL_START, |
| META_REDIRECT_TYPE_URL |
| }; |
| |
| /* Analyzes a string for the presence of a meta refresh type url. |
| * |
| * This function receives the value of the content attribute of a meta tag and |
| * parses it in order to identify if a url is going to be present. This is the |
| * format of such tag: |
| * |
| * <meta http-equiv="refresh" content="5; URL=http://www.google.com"> |
| * |
| * Using a regular expression library would be the most obvious way to implement |
| * this functionality, but introducing such a dependency is undesirable. We |
| * opted instead to parse programmaticly since the expression is simple enough. |
| * |
| * For reference, this is the spec on the meta http refresh tag: |
| * http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh |
| * |
| * If the value has no content after the expression, we know we are at the start |
| * of the URL. Otherwise we are past the start of the URL. |
| * |
| * |
| * Returns: |
| * |
| * This functions returns one of the following values: |
| * META_REDIRECT_TYPE_NONE - A url was not identified in the input string. |
| * META_REDIRECT_TYPE_URL_START - The input string ends exactly at the start |
| * of the url. |
| * META_REDIRECT_TYPE_URL - The input string ends somewhere in the middle or |
| * the end of the url. |
| * |
| * A few examples: |
| * "5" |
| * Returns META_REDIRECT_TYPE_NONE since we don't expect a url to follow. |
| * |
| * "5; URL = " |
| * The function returns META_REDIRECT_TYPE_URL_START since we expect a url to |
| * follow. |
| * |
| * "5; URL = http://www.google.com/?" |
| * Returns META_REDIRECT_TYPE_URL since the input value terminates in the |
| * middle or end of a url. |
| * |
| * |
| * Caveats: We are only recording up to 256 characters of attribute values, so |
| * our analysis is limited to that. This shouldn't be an issue in practice |
| * though as it would be unexpected for the part of the string that we are |
| * matching to be so long. |
| */ |
| enum meta_redirect_type_enum meta_redirect_type(const char *value) { |
| |
| if (value == NULL) |
| return META_REDIRECT_TYPE_NONE; |
| |
| /* Match while [ \t\r\n0-9]* */ |
| value = ignore_spaces_or_digits(value); |
| |
| /* Verify that we got a semi-colon character */ |
| if (*value != ';') |
| return META_REDIRECT_TYPE_NONE; |
| value++; |
| |
| /* Match while [ \t\r\n]* */ |
| value = ignore_spaces(value); |
| |
| /* Validate that we have 'URL' */ |
| if (strncasecmp(value, "url", strlen("url")) != 0) |
| return META_REDIRECT_TYPE_NONE; |
| |
| value += strlen("url"); |
| |
| /* Match while [ \t\r\n]* */ |
| value = ignore_spaces(value); |
| |
| if (*value != '=') |
| return META_REDIRECT_TYPE_NONE; |
| value++; |
| |
| /* Match while [ \t\r\n]* */ |
| value = ignore_spaces(value); |
| |
| /* The HTML5 spec allows for the url to be quoted, so we skip a single or |
| * double quote if we find one. |
| */ |
| if (*value == '"' || *value == '\'') |
| value++; |
| |
| if (*value == '\0') |
| return META_REDIRECT_TYPE_URL_START; |
| else |
| return META_REDIRECT_TYPE_URL; |
| } |
| |
| |
| /* Resets the entityfilter to it's initial state so it can be reused. |
| */ |
| void entityfilter_reset(entityfilter_ctx *ctx) |
| { |
| ctx->buffer[0] = 0; |
| ctx->buffer_pos = 0; |
| ctx->in_entity = 0; |
| } |
| |
| /* Initializes a new entity filter object. |
| */ |
| entityfilter_ctx *entityfilter_new() |
| { |
| entityfilter_ctx *ctx; |
| ctx = CAST(entityfilter_ctx *, |
| malloc(sizeof(entityfilter_ctx))); |
| |
| if (ctx == NULL) |
| return NULL; |
| ctx->buffer[0] = 0; |
| ctx->buffer_pos = 0; |
| ctx->in_entity = 0; |
| |
| return ctx; |
| } |
| |
| /* Copies the context of the entityfilter pointed to by src to the entityfilter |
| * dst. |
| */ |
| void entityfilter_copy(entityfilter_ctx *dst, entityfilter_ctx *src) |
| { |
| assert(src != NULL); |
| assert(dst != NULL); |
| assert(src != dst); |
| memcpy(dst, src, sizeof(entityfilter_ctx)); |
| } |
| |
| |
| /* Deallocates an entity filter object. |
| */ |
| void entityfilter_delete(entityfilter_ctx *ctx) |
| { |
| free(ctx); |
| } |
| |
| /* Converts a string containing an hexadecimal number to a string containing |
| * one character with the corresponding ascii value. |
| * |
| * The provided output char array must be at least 2 chars long. |
| */ |
| static const char *parse_hex(const char *s, char *output) |
| { |
| int n; |
| n = strtol(s, NULL, 16); |
| output[0] = n; |
| output[1] = 0; |
| /* TODO(falmeida): Make this function return void */ |
| return output; |
| } |
| |
| /* Converts a string containing a decimal number to a string containing one |
| * character with the corresponding ascii value. |
| * |
| * The provided output char array must be at least 2 chars long. |
| */ |
| static const char *parse_dec(const char *s, char *output) |
| { |
| int n; |
| n = strtol(s, NULL, 10); |
| output[0] = n; |
| output[1] = 0; |
| return output; |
| } |
| |
| /* Converts a string with an html entity to it's encoded form, which is written |
| * to the output string. |
| */ |
| static const char *entity_convert(const char *s, char *output, char terminator) |
| { |
| /* TODO(falmeida): Handle wide char encodings */ |
| struct entityfilter_table_s *t = entityfilter_table; |
| |
| if (s[0] == '#') { |
| if (s[1] == 'x' || s[1] == 'X') { /* hex */ |
| return parse_hex(s + 2, output); |
| } else { /* decimal */ |
| return parse_dec(s + 1, output); |
| } |
| } |
| |
| while (t->entity != NULL) { |
| if (strcasecmp(t->entity, s) == 0) |
| return t->value; |
| t++; |
| } |
| |
| snprintf(output, HTMLPARSER_MAX_ENTITY_SIZE, "&%s%c", s, terminator); |
| output[HTMLPARSER_MAX_ENTITY_SIZE - 1] = '\0'; |
| |
| return output; |
| } |
| |
| |
| /* Processes a character from the input stream and decodes any html entities |
| * in the processed input stream. |
| */ |
| const char *entityfilter_process(entityfilter_ctx *ctx, char c) |
| { |
| if (ctx->in_entity) { |
| if (c == ';' || html_isspace(c)) { |
| ctx->in_entity = 0; |
| ctx->buffer[ctx->buffer_pos] = '\0'; |
| ctx->buffer_pos = 0; |
| return entity_convert(ctx->buffer, ctx->output, c); |
| } else { |
| ctx->buffer[ctx->buffer_pos++] = c; |
| if (ctx->buffer_pos >= HTMLPARSER_MAX_ENTITY_SIZE - 2) { |
| /* No more buffer to use, finalize and return. |
| * We need two characters left, one for the '&' character and |
| * another for the NULL termination. */ |
| ctx->buffer[ctx->buffer_pos] = '\0'; |
| ctx->in_entity=0; |
| ctx->buffer_pos = 0; |
| snprintf(ctx->output, HTMLPARSER_MAX_ENTITY_SIZE, "&%s", |
| ctx->buffer); |
| ctx->output[HTMLPARSER_MAX_ENTITY_SIZE - 1] = '\0'; |
| return ctx->output; |
| } |
| } |
| } else { |
| if (c == '&') { |
| ctx->in_entity = 1; |
| ctx->buffer_pos = 0; |
| } else { |
| ctx->output[0] = c; |
| ctx->output[1] = 0; |
| return ctx->output; |
| } |
| } |
| return ""; |
| } |
| |
| /* Called when the parser enters a new tag. Starts recording it's name into |
| * html->tag. |
| */ |
| static void enter_tag_name(statemachine_ctx *ctx, int start, char chr, int end) |
| { |
| htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user); |
| assert(html != NULL); |
| |
| html->tag[0] = '\0'; |
| statemachine_start_record(ctx); |
| } |
| |
| /* Called when the parser exits the tag name in order to finalize the recording. |
| * |
| * It converts the tag name to lowercase, and if the tag was closed, just |
| * clears html->tag. |
| */ |
| static void exit_tag_name(statemachine_ctx *ctx, int start, char chr, int end) |
| { |
| htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user); |
| assert(html != NULL); |
| |
| nopad_strncpy(html->tag, statemachine_stop_record(ctx), |
| HTMLPARSER_MAX_STRING, statemachine_record_length(ctx)); |
| |
| tolower_str(html->tag); |
| |
| if (html->tag[0] == '/') |
| html->tag[0] = '\0'; |
| } |
| |
| /* Called when the parser enters a new tag. Starts recording it's name into |
| * html->attr |
| */ |
| static void enter_attr(statemachine_ctx *ctx, int start, char chr, int end) |
| { |
| htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user); |
| assert(html != NULL); |
| |
| html->attr[0] = '\0'; |
| statemachine_start_record(ctx); |
| } |
| |
| /* Called when the parser exits the attribute name in order to finalize the |
| * recording. |
| * |
| * It converts the tag name to lowercase. |
| */ |
| static void exit_attr(statemachine_ctx *ctx, int start, char chr, int end) |
| { |
| htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user); |
| assert(html != NULL); |
| |
| nopad_strncpy(html->attr, statemachine_stop_record(ctx), |
| HTMLPARSER_MAX_STRING, statemachine_record_length(ctx)); |
| |
| tolower_str(html->attr); |
| } |
| |
| /* Called when we enter an attribute value. |
| * |
| * Keeps track of a position index inside the value and initializes the |
| * javascript state machine for attributes that accept javascript. |
| */ |
| static void enter_value(statemachine_ctx *ctx, int start, char chr, int end) |
| { |
| htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user); |
| assert(html != NULL); |
| |
| html->value_index = 0; |
| |
| if (is_js_attribute(html->attr)) { |
| entityfilter_reset(html->entityfilter); |
| jsparser_reset(html->jsparser); |
| html->in_js = 1; |
| } else { |
| html->in_js = 0; |
| } |
| } |
| |
| /* Called when we enter the contents of an attribute value. |
| * |
| * Initializes the recording of the contents of the value. |
| */ |
| static void enter_value_content(statemachine_ctx *ctx, int start, char chr, |
| int end) |
| { |
| htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user); |
| assert(html != NULL); |
| |
| html->value[0] = '\0'; |
| statemachine_start_record(ctx); |
| } |
| |
| /* Called when we exit the contents of an attribute value. |
| * |
| * Finalizes the recording of the contents of the value. |
| */ |
| static void exit_value_content(statemachine_ctx *ctx, int start, char chr, |
| int end) |
| { |
| htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user); |
| assert(html != NULL); |
| |
| nopad_strncpy(html->value, statemachine_stop_record(ctx), |
| HTMLPARSER_MAX_STRING, statemachine_record_length(ctx)); |
| |
| html->in_js = 0; |
| } |
| |
| /* Called for every character inside an attribute value. |
| * |
| * Used to process javascript and keep track of the position index inside the |
| * attribute value. |
| */ |
| static void in_state_value(statemachine_ctx *ctx, int start, char chr, int end) |
| { |
| htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user); |
| assert(html != NULL); |
| |
| html->value_index++; |
| |
| if (html->in_js == 1) { |
| const char *output; |
| output = entityfilter_process(html->entityfilter, chr); |
| jsparser_parse_str(html->jsparser, output); |
| } |
| } |
| |
| /* Called everytime the parser leaves a tag definition. |
| * |
| * When we encounter a script tag, we initialize the js parser and switch the |
| * state to cdata. We also switch to the cdata state when we encounter any |
| * other CDATA/RCDATA tag (style, title or textarea) except that we do not |
| * initialize the js parser. |
| * |
| * To simplify the code, we treat RCDATA and CDATA sections the same since the |
| * differences between them don't affect the context we are in. |
| */ |
| static void tag_close(statemachine_ctx *ctx, int start, char chr, int end) |
| { |
| htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user); |
| assert(html != NULL); |
| |
| if (strcmp(html->tag, "script") == 0) { |
| ctx->next_state = HTMLPARSER_STATE_INT_CDATA_TEXT; |
| jsparser_reset(html->jsparser); |
| html->in_js = 1; |
| } else if (strcmp(html->tag, "style") == 0 || |
| strcmp(html->tag, "title") == 0 || |
| strcmp(html->tag, "textarea") == 0) { |
| ctx->next_state = HTMLPARSER_STATE_INT_CDATA_TEXT; |
| html->in_js = 0; |
| } |
| } |
| |
| /* Called inside cdata blocks in order to parse the javascript. |
| * |
| * Calls the javascript parser if currently in a script tag. |
| */ |
| static void in_state_cdata(statemachine_ctx *ctx, int start, char chr, int end) |
| { |
| htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user); |
| assert(html != NULL); |
| |
| if (html->in_js) |
| jsparser_parse_chr(html->jsparser, chr); |
| } |
| |
| /* Called if we encounter a '<' character in a cdata section. |
| * |
| * When encountering a '<' character inside cdata, we need to find the closing |
| * tag name in order to know if the tag is going to be closed or not, so we |
| * start recording the name of what could be the closing tag. |
| */ |
| static void enter_state_cdata_may_close(statemachine_ctx *ctx, int start, |
| char chr, int end) |
| { |
| statemachine_start_record(ctx); |
| } |
| |
| /* Called when we finish reading what could be a closing cdata tag. |
| * |
| * Checks if the closing tag name matches the current entity, and if so closes |
| * the element. |
| */ |
| static void exit_state_cdata_may_close(statemachine_ctx *ctx, int start, |
| char chr, int end) |
| { |
| htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user); |
| const char *cdata_close_tag; |
| assert(html != NULL); |
| |
| cdata_close_tag = statemachine_stop_record(ctx); |
| assert(cdata_close_tag[0] == '/'); |
| |
| if (strcasecmp(&cdata_close_tag[1], html->tag) == 0 && |
| (chr == '>' || html_isspace(chr))) { /* Make sure we have a delimiter */ |
| html->tag[0] = '\0'; /* Empty tag mimicking exit_tag_name(). */ |
| html->in_js = 0; /* In case this was a script tag. */ |
| } else { |
| /* Does not close the CDATA section. Go back to CDATA. */ |
| ctx->next_state = HTMLPARSER_STATE_INT_CDATA_TEXT; |
| } |
| } |
| |
| /* Resets the parser to it's initial state and changes the parser mode. |
| */ |
| void htmlparser_reset_mode(htmlparser_ctx *ctx, int mode) |
| { |
| assert(ctx != NULL); |
| statemachine_reset(ctx->statemachine); |
| ctx->in_js = 0; |
| ctx->tag[0] = '\0'; |
| ctx->attr[0] = '\0'; |
| ctx->value[0] = '\0'; |
| |
| jsparser_reset(ctx->jsparser); |
| |
| switch (mode) { |
| case HTMLPARSER_MODE_HTML: |
| ctx->statemachine->current_state = HTMLPARSER_STATE_INT_TEXT; |
| break; |
| case HTMLPARSER_MODE_JS: |
| ctx->statemachine->current_state = HTMLPARSER_STATE_INT_JS_FILE; |
| ctx->in_js = 1; |
| break; |
| case HTMLPARSER_MODE_CSS: |
| ctx->statemachine->current_state = HTMLPARSER_STATE_INT_CSS_FILE; |
| break; |
| case HTMLPARSER_MODE_HTML_IN_TAG: |
| ctx->statemachine->current_state = HTMLPARSER_STATE_INT_TAG_SPACE; |
| break; |
| default: |
| assert("Invalid mode in htmlparser_reset_mode()." && 0); |
| } |
| } |
| |
| /* Resets the parser to it's initial state and to the default mode, which |
| * is MODE_HTML. |
| */ |
| void htmlparser_reset(htmlparser_ctx *ctx) |
| { |
| assert(ctx != NULL); |
| htmlparser_reset_mode(ctx, HTMLPARSER_MODE_HTML); |
| } |
| |
| /* Creates a new state machine definition and initializes the events for the |
| * state transitions. |
| * |
| * Although each instance of the parser has it's own private instance of a |
| * statemachine definition, they are still identical across html parser objects |
| * and are never modified after creation. As such, changes to this definition |
| * should not occur outside this function and should not depend on properties |
| * of this particular parser instance as in the future we may opt to use a |
| * single public definition across parser objects. |
| */ |
| static statemachine_definition *create_statemachine_definition() |
| { |
| statemachine_definition *def; |
| def = statemachine_definition_new(HTMLPARSER_NUM_STATES); |
| if (def == NULL) |
| return NULL; |
| |
| statemachine_definition_populate(def, htmlparser_state_transitions, |
| htmlparser_states_internal_names); |
| |
| statemachine_enter_state(def, HTMLPARSER_STATE_INT_TAG_NAME, |
| enter_tag_name); |
| statemachine_exit_state(def, HTMLPARSER_STATE_INT_TAG_NAME, exit_tag_name); |
| |
| statemachine_enter_state(def, HTMLPARSER_STATE_INT_ATTR, enter_attr); |
| statemachine_exit_state(def, HTMLPARSER_STATE_INT_ATTR, exit_attr); |
| |
| statemachine_enter_state(def, HTMLPARSER_STATE_INT_TAG_CLOSE, tag_close); |
| |
| /* CDATA states. We must list all cdata and javascript states here. */ |
| /* TODO(falmeida): Declare this list in htmlparser_fsm.config so it doesn't |
| * go out of sync. |
| */ |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_TEXT, in_state_cdata); |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_START, |
| in_state_cdata); |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_START_DASH, |
| in_state_cdata); |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_BODY, |
| in_state_cdata); |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH, |
| in_state_cdata); |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH_DASH, |
| in_state_cdata); |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_LT, in_state_cdata); |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE, |
| in_state_cdata); |
| |
| /* For simplification, we treat the javascript mode as if it were cdata. */ |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_JS_FILE, in_state_cdata); |
| |
| statemachine_enter_state(def, HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE, |
| enter_state_cdata_may_close); |
| statemachine_exit_state(def, HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE, |
| exit_state_cdata_may_close); |
| /* value states */ |
| statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE, enter_value); |
| |
| /* Called when we enter the content of the value */ |
| statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE_TEXT, |
| enter_value_content); |
| statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE_Q, |
| enter_value_content); |
| statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE_DQ, |
| enter_value_content); |
| |
| /* Called when we exit the content of the value */ |
| statemachine_exit_state(def, HTMLPARSER_STATE_INT_VALUE_TEXT, |
| exit_value_content); |
| statemachine_exit_state(def, HTMLPARSER_STATE_INT_VALUE_Q, |
| exit_value_content); |
| statemachine_exit_state(def, HTMLPARSER_STATE_INT_VALUE_DQ, |
| exit_value_content); |
| |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_VALUE_TEXT, in_state_value); |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_VALUE_Q, in_state_value); |
| statemachine_in_state(def, HTMLPARSER_STATE_INT_VALUE_DQ, in_state_value); |
| |
| return def; |
| } |
| |
| |
| /* Initializes a new htmlparser instance. |
| * |
| * Returns a pointer to the new instance or NULL if the initialization fails. |
| * Initialization failure is fatal, and if this function fails it may not |
| * deallocate all previsouly allocated memory. |
| */ |
| htmlparser_ctx *htmlparser_new() |
| { |
| htmlparser_ctx *html; |
| |
| html = CAST(htmlparser_ctx *, calloc(1, sizeof(htmlparser_ctx))); |
| if (html == NULL) |
| return NULL; |
| |
| html->statemachine_def = create_statemachine_definition(); |
| if (html->statemachine_def == NULL) |
| return NULL; |
| |
| html->statemachine = statemachine_new(html->statemachine_def, html); |
| if (html->statemachine == NULL) |
| return NULL; |
| |
| html->jsparser = jsparser_new(); |
| if (html->jsparser == NULL) |
| return NULL; |
| |
| html->entityfilter = entityfilter_new(); |
| if (html->entityfilter == NULL) |
| return NULL; |
| |
| htmlparser_reset(html); |
| |
| return html; |
| } |
| |
| /* Copies the context of the htmlparser pointed to by src to the htmlparser dst. |
| */ |
| void htmlparser_copy(htmlparser_ctx *dst, const htmlparser_ctx *src) |
| { |
| dst->value_index = src->value_index; |
| dst->in_js = src->in_js; |
| strcpy(dst->tag, src->tag); |
| strcpy(dst->attr, src->attr); |
| strcpy(dst->value, src->value); |
| |
| statemachine_copy(dst->statemachine, |
| src->statemachine, |
| dst->statemachine_def, |
| dst); |
| |
| jsparser_copy(dst->jsparser, src->jsparser); |
| |
| entityfilter_copy(dst->entityfilter, src->entityfilter); |
| |
| } |
| |
| /* Receives an htmlparser context and Returns the current html state. |
| */ |
| int htmlparser_state(htmlparser_ctx *ctx) |
| { |
| return state_external(ctx->statemachine->current_state); |
| } |
| |
| /* Parses the input html stream and returns the finishing state. |
| */ |
| int htmlparser_parse(htmlparser_ctx *ctx, const char *str, int size) |
| { |
| int internal_state; |
| internal_state = statemachine_parse(ctx->statemachine, str, size); |
| return state_external(internal_state); |
| } |
| |
| |
| /* Returns true if the parser is inside an attribute value and the value is |
| * surrounded by single or double quotes. */ |
| int htmlparser_is_attr_quoted(htmlparser_ctx *ctx) { |
| int st = statemachine_get_state(ctx->statemachine); |
| if (st == HTMLPARSER_STATE_INT_VALUE_Q_START || |
| st == HTMLPARSER_STATE_INT_VALUE_Q || |
| st == HTMLPARSER_STATE_INT_VALUE_DQ_START || |
| st == HTMLPARSER_STATE_INT_VALUE_DQ) |
| return 1; |
| else |
| return 0; |
| } |
| |
| /* Returns true if the parser is currently in javascript. |
| */ |
| int htmlparser_in_js(htmlparser_ctx *ctx) { |
| int st = statemachine_get_state(ctx->statemachine); |
| |
| /* CDATA states plus JS_FILE. We must list all cdata and javascript states |
| * here. */ |
| /* TODO(falmeida): Declare this list in htmlparser_fsm.config so it doesn't go |
| * out of sync. */ |
| if (ctx->in_js && |
| (st == HTMLPARSER_STATE_INT_CDATA_TEXT || |
| st == HTMLPARSER_STATE_INT_CDATA_COMMENT_START || |
| st == HTMLPARSER_STATE_INT_CDATA_COMMENT_START_DASH || |
| st == HTMLPARSER_STATE_INT_CDATA_COMMENT_BODY || |
| st == HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH || |
| st == HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH_DASH || |
| st == HTMLPARSER_STATE_INT_CDATA_LT || |
| st == HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE || |
| st == HTMLPARSER_STATE_INT_JS_FILE)) |
| return 1; |
| |
| if (state_external(st) == HTMLPARSER_STATE_VALUE && ctx->in_js) |
| return 1; |
| else |
| return 0; |
| } |
| |
| /* Returns the current tag or NULL if not available or we haven't seen the |
| * entire tag yet. |
| */ |
| const char *htmlparser_tag(htmlparser_ctx *ctx) |
| { |
| if (ctx->tag[0] != '\0') |
| return ctx->tag; |
| else |
| return NULL; |
| } |
| |
| /* Returns true if inside an attribute or a value */ |
| int htmlparser_in_attr(htmlparser_ctx *ctx) |
| { |
| int ext_state = state_external(statemachine_get_state(ctx->statemachine)); |
| return ext_state == HTMLPARSER_STATE_ATTR || |
| ext_state == HTMLPARSER_STATE_VALUE; |
| } |
| |
| /* Returns the current attribute name if after an attribute name or in an |
| * attribute value. Returns NULL otherwise. */ |
| const char *htmlparser_attr(htmlparser_ctx *ctx) |
| { |
| if (htmlparser_in_attr(ctx)) |
| return ctx->attr; |
| else |
| return NULL; |
| } |
| |
| /* Returns true if the parser is currently inside a CSS construct. |
| */ |
| int htmlparser_in_css(htmlparser_ctx *ctx) { |
| int state = statemachine_get_state(ctx->statemachine); |
| const char *tag = htmlparser_tag(ctx); |
| int external_state = state_external(state); |
| |
| if (state == HTMLPARSER_STATE_INT_CSS_FILE || |
| (external_state == HTMLPARSER_STATE_VALUE && |
| htmlparser_attr_type(ctx) == HTMLPARSER_ATTR_STYLE) || |
| (tag && strcmp(tag, "style") == 0)) { |
| return 1; |
| } else { |
| return 0; |
| } |
| } |
| |
| /* Returns the contents of the current attribute value. |
| */ |
| const char *htmlparser_value(htmlparser_ctx *ctx) |
| { |
| int ext_state = state_external(statemachine_get_state(ctx->statemachine)); |
| if (ext_state == HTMLPARSER_STATE_VALUE) { |
| strncpy(ctx->value, statemachine_record_buffer(ctx->statemachine), |
| HTMLPARSER_MAX_STRING); |
| ctx->value[HTMLPARSER_MAX_STRING - 1] = '\0'; |
| return ctx->value; |
| } else { |
| return NULL; |
| } |
| } |
| |
| |
| /* Returns the current state of the javascript state machine |
| * |
| * Currently only present for testing purposes. |
| */ |
| int htmlparser_js_state(htmlparser_ctx *ctx) |
| { |
| return jsparser_state(ctx->jsparser); |
| } |
| |
| /* True is currently inside a javascript string literal |
| */ |
| int htmlparser_is_js_quoted(htmlparser_ctx *ctx) |
| { |
| if (htmlparser_in_js(ctx)) { |
| int st = jsparser_state(ctx->jsparser); |
| if (st == JSPARSER_STATE_Q || |
| st == JSPARSER_STATE_DQ) |
| return 1; |
| } |
| return 0; |
| } |
| |
| /* True if currently inside an attribute value |
| */ |
| int htmlparser_in_value(htmlparser_ctx *ctx) |
| { |
| int ext_state = state_external(statemachine_get_state(ctx->statemachine)); |
| return ext_state == HTMLPARSER_STATE_VALUE; |
| } |
| |
| /* Returns the position inside the current attribute value |
| */ |
| int htmlparser_value_index(htmlparser_ctx *ctx) |
| { |
| if (htmlparser_in_value(ctx)) |
| return ctx->value_index; |
| |
| return -1; |
| } |
| |
| /* Returns true if this is the first character of a url inside an attribute. |
| */ |
| int htmlparser_is_url_start(htmlparser_ctx *ctx) |
| { |
| if (htmlparser_attr_type(ctx) == HTMLPARSER_ATTR_URI) { |
| const char* tag = htmlparser_tag(ctx); |
| /*const char* attr =*/ htmlparser_attr(ctx); |
| |
| if ((tag && strcmp(tag, "meta") == 0 && |
| meta_redirect_type(htmlparser_value(ctx)) == |
| META_REDIRECT_TYPE_URL_START) || |
| htmlparser_value_index(ctx) == 0) |
| return 1; |
| |
| } |
| return 0; |
| } |
| |
| /* Returns the current attribute type. |
| */ |
| int htmlparser_attr_type(htmlparser_ctx *ctx) |
| { |
| if (!htmlparser_in_attr(ctx)) |
| return HTMLPARSER_ATTR_NONE; |
| |
| if (is_js_attribute(ctx->attr)) |
| return HTMLPARSER_ATTR_JS; |
| |
| if (is_uri_attribute(ctx->attr)) |
| return HTMLPARSER_ATTR_URI; |
| |
| if (is_style_attribute(ctx->attr)) |
| return HTMLPARSER_ATTR_STYLE; |
| |
| const char* tag = htmlparser_tag(ctx); |
| const char* attr = htmlparser_attr(ctx); |
| |
| /* Special logic to handle meta redirect type tags. */ |
| if (tag && strcmp(tag, "meta") == 0 && |
| attr && strcmp(attr, "content") == 0) { |
| |
| const char* value = htmlparser_value(ctx); |
| meta_redirect_type_enum redirect_type = meta_redirect_type(value); |
| |
| if (redirect_type == META_REDIRECT_TYPE_URL || |
| redirect_type == META_REDIRECT_TYPE_URL_START) |
| return HTMLPARSER_ATTR_URI; |
| } |
| |
| return HTMLPARSER_ATTR_REGULAR; |
| } |
| |
| /* Return the current line number. */ |
| int htmlparser_get_line_number(htmlparser_ctx *ctx) { |
| return statemachine_get_line_number(ctx->statemachine); |
| } |
| |
| /* Set the current line number. */ |
| void htmlparser_set_line_number(htmlparser_ctx *ctx, int line) { |
| statemachine_set_line_number(ctx->statemachine, line); |
| } |
| |
| /* Return the current column number. */ |
| int htmlparser_get_column_number(htmlparser_ctx *ctx) { |
| return statemachine_get_column_number(ctx->statemachine); |
| } |
| |
| /* Set the current column number. */ |
| void htmlparser_set_column_number(htmlparser_ctx *ctx, int column) { |
| statemachine_set_column_number(ctx->statemachine, column); |
| } |
| |
| /* Retrieve a human readable error message in case an error occurred. |
| * |
| * NULL is returned if the parser didn't encounter an error. |
| */ |
| const char *htmlparser_get_error_msg(htmlparser_ctx *ctx) { |
| return statemachine_get_error_msg(ctx->statemachine); |
| } |
| |
| /* Invoked by the caller when text is expanded by the caller. |
| */ |
| int htmlparser_insert_text(htmlparser_ctx *ctx) |
| { |
| /* TODO(falmeida): Generalize and use a table for these values. */ |
| |
| if (statemachine_get_state(ctx->statemachine) == HTMLPARSER_STATE_INT_VALUE) { |
| statemachine_set_state(ctx->statemachine, HTMLPARSER_STATE_INT_VALUE_TEXT); |
| } |
| return 1; |
| } |
| |
| /* Deallocates an htmlparser context object. |
| */ |
| void htmlparser_delete(htmlparser_ctx *ctx) |
| { |
| assert(ctx != NULL); |
| statemachine_definition_delete(ctx->statemachine_def); |
| statemachine_delete(ctx->statemachine); |
| jsparser_delete(ctx->jsparser); |
| entityfilter_delete(ctx->entityfilter); |
| free(ctx); |
| } |
| |
| #ifdef __cplusplus |
| } /* namespace security_streamhtmlparser */ |
| #endif |