// blob: 0557783ff72ef5ed5ae917146393f2fcd41f05f6 [file] [log] [blame]
// Copyright (c) 2007, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// ---
// Author: falmeida@google.com (Filipe Almeida)
//
// c++ bindings for htmlparser.
#ifndef SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__
#define SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__
#include <config.h>
#include <string>
#include "htmlparser/htmlparser.h"
#include "htmlparser/jsparser.h"
#include "base/util.h"
namespace ctemplate_htmlparser {
/* Namespace-like holder for the javascript parser state constants.
 *
 * Each enumerator is an alias for the C-level JSPARSER_STATE_* constant of the
 * same name, so the values can be compared directly with what the C parser
 * reports.
 *
 * NOTE(review): presumably these are the values returned by
 * HtmlParser::javascript_state(), with STATE_Q / STATE_DQ meaning single- and
 * double-quoted string literals - confirm against jsparser.h.
 */
class JavascriptParser {
 public:
  enum State {
    STATE_TEXT = JSPARSER_STATE_TEXT,
    STATE_Q = JSPARSER_STATE_Q,
    STATE_DQ = JSPARSER_STATE_DQ,
    STATE_REGEXP = JSPARSER_STATE_REGEXP,
    /* No trailing comma: C++98/03 does not allow one at the end of an
     * enumerator list, and the enums in HtmlParser do not use one either. */
    STATE_COMMENT = JSPARSER_STATE_COMMENT
  };
};
class HtmlParser {
public:
/* html states */
enum State {
STATE_TEXT = HTMLPARSER_STATE_TEXT,
STATE_TAG = HTMLPARSER_STATE_TAG,
STATE_ATTR = HTMLPARSER_STATE_ATTR,
STATE_VALUE = HTMLPARSER_STATE_VALUE,
STATE_COMMENT = HTMLPARSER_STATE_COMMENT,
STATE_JS_FILE = HTMLPARSER_STATE_JS_FILE,
STATE_CSS_FILE = HTMLPARSER_STATE_CSS_FILE,
STATE_ERROR = HTMLPARSER_STATE_ERROR
};
/* attribute types */
enum AttributeType {
ATTR_NONE = HTMLPARSER_ATTR_NONE,
ATTR_REGULAR = HTMLPARSER_ATTR_REGULAR,
ATTR_URI = HTMLPARSER_ATTR_URI,
ATTR_JS = HTMLPARSER_ATTR_JS,
ATTR_STYLE = HTMLPARSER_ATTR_STYLE
};
/* Parser modes */
enum Mode {
MODE_HTML = HTMLPARSER_MODE_HTML,
MODE_JS = HTMLPARSER_MODE_JS,
MODE_CSS = HTMLPARSER_MODE_CSS,
MODE_HTML_IN_TAG = HTMLPARSER_MODE_HTML_IN_TAG
};
HtmlParser() {
parser_ = htmlparser_new();
CHECK(parser_ != NULL);
};
/* Parses the input html stream and returns the finishing state.
*
* Returns HtmlParser::STATE_ERROR if unable to parse the input. If
* htmlparser_parse() is called after an error situation was encountered
* the behaviour is unspecified. At this point, Reset() or ResetMode()
* can be called to reset the state so it can be used to parse a new file.
*/
int Parse(const char *str, int len) {
return htmlparser_parse(parser_, str, len);
};
int Parse(const std::string &str) {
return Parse(str.c_str(), static_cast<int>(str.length()));
};
/* Returns the current state the parser is in */
int state() const {
return htmlparser_state(parser_);
};
/* Returns the current tag or NULL if not available.
*
* There is no stack implemented because we currently don't have a need for
* it, which means tag names are tracked only one level deep.
*
* This is better understood by looking at the following example:
*
* <b [tag=b]>
* [tag=b]
* <i>
* [tag=i]
* </i>
* [tag=NULL]
* </b>
*
* The tag is correctly filled inside the tag itself and before any new
* inner tag is closed, at which point the tag will be set to NULL.
*
* For our current purposes this is not a problem, but we may implement a
* tag tracking stack in the future for completeness.
*/
const char *tag() const {
return htmlparser_tag(parser_);
}
/* Returns the current attribute name if inside an attribute name or an
* attribute value. Returns NULL otherwise. */
const char *attribute() const {
return htmlparser_attr(parser_);
}
/* Returns the contents of the current attribute value. */
const char *value() const {
return htmlparser_value(parser_);
}
/* Returns true if inside javascript. This can be a javascript block, a
* javascript attribute value or the parser may just be in javascript mode
* (HtmlParser::MODE_JS) */
bool InJavascript() const {
return static_cast<bool>(htmlparser_in_js(parser_));
}
/* Returns true if the parser is currently inside a CSS construct.
*
* Currently this can be either a STYLE tag, a STYLE attribute or the fact
* that the parser was reset using MODE_CSS using ResetMode().
*/
bool InCss() const {
return static_cast<bool>(htmlparser_in_css(parser_));
}
/* Returns true if the current attribute is quoted */
bool IsAttributeQuoted() const {
return static_cast<bool>(htmlparser_is_attr_quoted(parser_));
}
/* Returns true if the parser is inside a js string literal.
*/
bool IsJavascriptQuoted() const {
return static_cast<bool>(htmlparser_is_js_quoted(parser_));
}
/* Returns the index within the current value or -1 if the parser is not
* inside an attribute value */
int ValueIndex() const {
return htmlparser_value_index(parser_);
}
/* Returns true if this is the first character of a url inside an attribute.
*
* This function can be used by an html sanitizer or auto escaping system as
* a hint that it should validate the url for a whitelist of protocol
* handlers and for well-formedness, or that it should just escape a
* component of it.
*
* For attributes that expect a url this will return true if we are at the
* first character of the attribute, but for the special case of a meta
* redirect tag some analysis is made in order to verify if we are at the
* start of a url or not.
*
* For any other attributes, the result will always be false.
*
*/
bool IsUrlStart() const {
return htmlparser_is_url_start(parser_);
}
/* Returns the current attribute type.
*
* The attribute type can be one of:
* ATTR_NONE - not inside an attribute
* ATTR_REGULAR - Inside a normal attribute
* ATTR_URI - Inside an attribute that accepts a uri
* ATTR_JS - Inside a javascript attribute
* ATTR_STYLE - Inside a css style attribute
* */
int AttributeType() const {
return htmlparser_attr_type(parser_);
}
/* Return the current line number. */
int line_number() const {
return htmlparser_get_line_number(parser_);
}
/* Set the current line number. */
void set_line_number(int line) {
return htmlparser_set_line_number(parser_, line);
}
/* Return the current column number. */
int column_number() const {
return htmlparser_get_column_number(parser_);
}
/* Set the current line number. */
void set_column_number(int column) {
return htmlparser_set_column_number(parser_, column);
}
/* Retrieve a human readable error message in case an error occurred.
*
* NULL is returned if the parser didn't encounter an error.
*/
const char *GetErrorMessage() {
return htmlparser_get_error_msg(parser_);
}
/* Returns the current state the javascript parser is in.
*
* Should only be used for testing.
*/
int javascript_state() const {
return htmlparser_js_state(parser_);
};
/* Resets the parser to it's initial state and changes the parser mode.
*
* Internal state (tag name, attribute name, state of statemachine) is
* reset as * though the object was just created.
*
* Available modes:
* MODE_HTML - Parses html text
* MODE_JS - Parses javascript files
* MODE_CSS - Parses CSS files. No actual parsing is actually done
* but InCss() always returns true.
* MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To
* be used in a template expanded in the
* following context: <a $template>
*/
void ResetMode(enum Mode mode) {
return htmlparser_reset_mode(parser_, mode);
}
/* Resets the parser to it's initial state and to the default mode, which is
* MODE_HTML.
*
* All internal context like tag name, attribute name or the state of the
* statemachine are reset to it's original values as if the object was just
* created.
*/
void Reset() {
return htmlparser_reset(parser_);
}
/* Invoked when text is inserted by the caller.
*
* Should be called before a template directive that expands to content is
* found. This changes the current state by following the default rule,
* ensuring we stay in sync with template.
*
* Returns true if template directives are accepted for this state and
* false if they are not, which should result in an error condition.
*
* Right now the only case being handled are unquoted attribute values and
* it always returns true. In the future we can handle more cases and
* restrict the states were we allow template directives by returning false
* for those.
*/
bool InsertText() {
return static_cast<bool>(htmlparser_insert_text(parser_));
}
/* Copies the context of the HtmlParser object referenced in source to the
* current object.
*/
void CopyFrom(const HtmlParser *source) {
CHECK(this != source);
CHECK(source != NULL);
htmlparser_copy(parser_, source->parser_);
}
~HtmlParser() {
htmlparser_delete(parser_);
};
private:
htmlparser_ctx *parser_;
DISALLOW_COPY_AND_ASSIGN(HtmlParser);
};
} // namespace ctemplate_htmlparser
#endif // SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__