| // Copyright (c) 2007, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| // --- |
| // Author: falmeida@google.com (Filipe Almeida) |
| // |
| // c++ bindings for htmlparser. |
| |
| #ifndef SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__ |
| #define SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__ |
| |
| #include <config.h> |
| #include <string> |
| #include "htmlparser/htmlparser.h" |
| #include "htmlparser/jsparser.h" |
| #include "base/util.h" |
| |
| namespace ctemplate_htmlparser { |
| |
| class JavascriptParser { |
| public: |
| enum State { |
| STATE_TEXT = JSPARSER_STATE_TEXT, |
| STATE_Q = JSPARSER_STATE_Q, |
| STATE_DQ = JSPARSER_STATE_DQ, |
| STATE_REGEXP = JSPARSER_STATE_REGEXP, |
| STATE_COMMENT = JSPARSER_STATE_COMMENT, |
| }; |
| }; |
| |
| class HtmlParser { |
| public: |
| |
| /* html states */ |
| enum State { |
| STATE_TEXT = HTMLPARSER_STATE_TEXT, |
| STATE_TAG = HTMLPARSER_STATE_TAG, |
| STATE_ATTR = HTMLPARSER_STATE_ATTR, |
| STATE_VALUE = HTMLPARSER_STATE_VALUE, |
| STATE_COMMENT = HTMLPARSER_STATE_COMMENT, |
| STATE_JS_FILE = HTMLPARSER_STATE_JS_FILE, |
| STATE_CSS_FILE = HTMLPARSER_STATE_CSS_FILE, |
| STATE_ERROR = HTMLPARSER_STATE_ERROR |
| }; |
| |
| /* attribute types */ |
| enum AttributeType { |
| ATTR_NONE = HTMLPARSER_ATTR_NONE, |
| ATTR_REGULAR = HTMLPARSER_ATTR_REGULAR, |
| ATTR_URI = HTMLPARSER_ATTR_URI, |
| ATTR_JS = HTMLPARSER_ATTR_JS, |
| ATTR_STYLE = HTMLPARSER_ATTR_STYLE |
| }; |
| |
| /* Parser modes */ |
| enum Mode { |
| MODE_HTML = HTMLPARSER_MODE_HTML, |
| MODE_JS = HTMLPARSER_MODE_JS, |
| MODE_CSS = HTMLPARSER_MODE_CSS, |
| MODE_HTML_IN_TAG = HTMLPARSER_MODE_HTML_IN_TAG |
| }; |
| |
| HtmlParser() { |
| parser_ = htmlparser_new(); |
| CHECK(parser_ != NULL); |
| }; |
| |
| /* Parses the input html stream and returns the finishing state. |
| * |
| * Returns HtmlParser::STATE_ERROR if unable to parse the input. If |
| * htmlparser_parse() is called after an error situation was encountered |
| * the behaviour is unspecified. At this point, Reset() or ResetMode() |
| * can be called to reset the state so it can be used to parse a new file. |
| */ |
| int Parse(const char *str, int len) { |
| return htmlparser_parse(parser_, str, len); |
| }; |
| |
| int Parse(const std::string &str) { |
| return Parse(str.c_str(), static_cast<int>(str.length())); |
| }; |
| |
| /* Returns the current state the parser is in */ |
| int state() const { |
| return htmlparser_state(parser_); |
| }; |
| |
| /* Returns the current tag or NULL if not available. |
| * |
| * There is no stack implemented because we currently don't have a need for |
| * it, which means tag names are tracked only one level deep. |
| * |
| * This is better understood by looking at the following example: |
| * |
| * <b [tag=b]> |
| * [tag=b] |
| * <i> |
| * [tag=i] |
| * </i> |
| * [tag=NULL] |
| * </b> |
| * |
| * The tag is correctly filled inside the tag itself and before any new |
| * inner tag is closed, at which point the tag will be set to NULL. |
| * |
| * For our current purposes this is not a problem, but we may implement a |
| * tag tracking stack in the future for completeness. |
| */ |
| const char *tag() const { |
| return htmlparser_tag(parser_); |
| } |
| |
| /* Returns the current attribute name if inside an attribute name or an |
| * attribute value. Returns NULL otherwise. */ |
| const char *attribute() const { |
| return htmlparser_attr(parser_); |
| } |
| |
| /* Returns the contents of the current attribute value. */ |
| const char *value() const { |
| return htmlparser_value(parser_); |
| } |
| |
| /* Returns true if inside javascript. This can be a javascript block, a |
| * javascript attribute value or the parser may just be in javascript mode |
| * (HtmlParser::MODE_JS) */ |
| bool InJavascript() const { |
| return static_cast<bool>(htmlparser_in_js(parser_)); |
| } |
| |
| /* Returns true if the parser is currently inside a CSS construct. |
| * |
| * Currently this can be either a STYLE tag, a STYLE attribute or the fact |
| * that the parser was reset using MODE_CSS using ResetMode(). |
| */ |
| bool InCss() const { |
| return static_cast<bool>(htmlparser_in_css(parser_)); |
| } |
| |
| /* Returns true if the current attribute is quoted */ |
| bool IsAttributeQuoted() const { |
| return static_cast<bool>(htmlparser_is_attr_quoted(parser_)); |
| } |
| |
| /* Returns true if the parser is inside a js string literal. |
| */ |
| bool IsJavascriptQuoted() const { |
| return static_cast<bool>(htmlparser_is_js_quoted(parser_)); |
| } |
| |
| /* Returns the index within the current value or -1 if the parser is not |
| * inside an attribute value */ |
| int ValueIndex() const { |
| return htmlparser_value_index(parser_); |
| } |
| |
| /* Returns true if this is the first character of a url inside an attribute. |
| * |
| * This function can be used by an html sanitizer or auto escaping system as |
| * a hint that it should validate the url for a whitelist of protocol |
| * handlers and for well-formedness, or that it should just escape a |
| * component of it. |
| * |
| * For attributes that expect a url this will return true if we are at the |
| * first character of the attribute, but for the special case of a meta |
| * redirect tag some analysis is made in order to verify if we are at the |
| * start of a url or not. |
| * |
| * For any other attributes, the result will always be false. |
| * |
| */ |
| bool IsUrlStart() const { |
| return htmlparser_is_url_start(parser_); |
| } |
| |
| /* Returns the current attribute type. |
| * |
| * The attribute type can be one of: |
| * ATTR_NONE - not inside an attribute |
| * ATTR_REGULAR - Inside a normal attribute |
| * ATTR_URI - Inside an attribute that accepts a uri |
| * ATTR_JS - Inside a javascript attribute |
| * ATTR_STYLE - Inside a css style attribute |
| * */ |
| int AttributeType() const { |
| return htmlparser_attr_type(parser_); |
| } |
| |
| /* Return the current line number. */ |
| int line_number() const { |
| return htmlparser_get_line_number(parser_); |
| } |
| |
| /* Set the current line number. */ |
| void set_line_number(int line) { |
| return htmlparser_set_line_number(parser_, line); |
| } |
| |
| /* Return the current column number. */ |
| int column_number() const { |
| return htmlparser_get_column_number(parser_); |
| } |
| |
| /* Set the current line number. */ |
| void set_column_number(int column) { |
| return htmlparser_set_column_number(parser_, column); |
| } |
| |
| /* Retrieve a human readable error message in case an error occurred. |
| * |
| * NULL is returned if the parser didn't encounter an error. |
| */ |
| const char *GetErrorMessage() { |
| return htmlparser_get_error_msg(parser_); |
| } |
| |
| /* Returns the current state the javascript parser is in. |
| * |
| * Should only be used for testing. |
| */ |
| int javascript_state() const { |
| return htmlparser_js_state(parser_); |
| }; |
| |
| /* Resets the parser to it's initial state and changes the parser mode. |
| * |
| * Internal state (tag name, attribute name, state of statemachine) is |
| * reset as * though the object was just created. |
| * |
| * Available modes: |
| * MODE_HTML - Parses html text |
| * MODE_JS - Parses javascript files |
| * MODE_CSS - Parses CSS files. No actual parsing is actually done |
| * but InCss() always returns true. |
| * MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To |
| * be used in a template expanded in the |
| * following context: <a $template> |
| */ |
| void ResetMode(enum Mode mode) { |
| return htmlparser_reset_mode(parser_, mode); |
| } |
| |
| /* Resets the parser to it's initial state and to the default mode, which is |
| * MODE_HTML. |
| * |
| * All internal context like tag name, attribute name or the state of the |
| * statemachine are reset to it's original values as if the object was just |
| * created. |
| */ |
| void Reset() { |
| return htmlparser_reset(parser_); |
| } |
| |
| /* Invoked when text is inserted by the caller. |
| * |
| * Should be called before a template directive that expands to content is |
| * found. This changes the current state by following the default rule, |
| * ensuring we stay in sync with template. |
| * |
| * Returns true if template directives are accepted for this state and |
| * false if they are not, which should result in an error condition. |
| * |
| * Right now the only case being handled are unquoted attribute values and |
| * it always returns true. In the future we can handle more cases and |
| * restrict the states were we allow template directives by returning false |
| * for those. |
| */ |
| bool InsertText() { |
| return static_cast<bool>(htmlparser_insert_text(parser_)); |
| } |
| |
| /* Copies the context of the HtmlParser object referenced in source to the |
| * current object. |
| */ |
| void CopyFrom(const HtmlParser *source) { |
| CHECK(this != source); |
| CHECK(source != NULL); |
| htmlparser_copy(parser_, source->parser_); |
| } |
| |
| ~HtmlParser() { |
| htmlparser_delete(parser_); |
| }; |
| |
| |
| private: |
| htmlparser_ctx *parser_; |
| DISALLOW_COPY_AND_ASSIGN(HtmlParser); |
| }; |
| |
}  // namespace ctemplate_htmlparser
| |
| #endif // SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__ |