Brian Silverman | 70325d6 | 2015-09-20 17:00:43 -0400 | [diff] [blame] | 1 | // Copyright (c) 2007, Google Inc. |
| 2 | // All rights reserved. |
| 3 | // |
| 4 | // Redistribution and use in source and binary forms, with or without |
| 5 | // modification, are permitted provided that the following conditions are |
| 6 | // met: |
| 7 | // |
| 8 | // * Redistributions of source code must retain the above copyright |
| 9 | // notice, this list of conditions and the following disclaimer. |
| 10 | // * Redistributions in binary form must reproduce the above |
| 11 | // copyright notice, this list of conditions and the following disclaimer |
| 12 | // in the documentation and/or other materials provided with the |
| 13 | // distribution. |
| 14 | // * Neither the name of Google Inc. nor the names of its |
| 15 | // contributors may be used to endorse or promote products derived from |
| 16 | // this software without specific prior written permission. |
| 17 | // |
| 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 | // --- |
| 30 | // Author: falmeida@google.com (Filipe Almeida) |
| 31 | // |
| 32 | // c++ bindings for htmlparser. |
| 33 | |
| 34 | #ifndef SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__ |
| 35 | #define SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__ |
| 36 | |
| 37 | #include <config.h> |
| 38 | #include <string> |
| 39 | #include "htmlparser/htmlparser.h" |
| 40 | #include "htmlparser/jsparser.h" |
| 41 | #include "base/util.h" |
| 42 | |
| 43 | namespace ctemplate_htmlparser { |
| 44 | |
| 45 | class JavascriptParser { |
| 46 | public: |
| 47 | enum State { |
| 48 | STATE_TEXT = JSPARSER_STATE_TEXT, |
| 49 | STATE_Q = JSPARSER_STATE_Q, |
| 50 | STATE_DQ = JSPARSER_STATE_DQ, |
| 51 | STATE_REGEXP = JSPARSER_STATE_REGEXP, |
| 52 | STATE_COMMENT = JSPARSER_STATE_COMMENT, |
| 53 | }; |
| 54 | }; |
| 55 | |
| 56 | class HtmlParser { |
| 57 | public: |
| 58 | |
| 59 | /* html states */ |
| 60 | enum State { |
| 61 | STATE_TEXT = HTMLPARSER_STATE_TEXT, |
| 62 | STATE_TAG = HTMLPARSER_STATE_TAG, |
| 63 | STATE_ATTR = HTMLPARSER_STATE_ATTR, |
| 64 | STATE_VALUE = HTMLPARSER_STATE_VALUE, |
| 65 | STATE_COMMENT = HTMLPARSER_STATE_COMMENT, |
| 66 | STATE_JS_FILE = HTMLPARSER_STATE_JS_FILE, |
| 67 | STATE_CSS_FILE = HTMLPARSER_STATE_CSS_FILE, |
| 68 | STATE_ERROR = HTMLPARSER_STATE_ERROR |
| 69 | }; |
| 70 | |
| 71 | /* attribute types */ |
| 72 | enum AttributeType { |
| 73 | ATTR_NONE = HTMLPARSER_ATTR_NONE, |
| 74 | ATTR_REGULAR = HTMLPARSER_ATTR_REGULAR, |
| 75 | ATTR_URI = HTMLPARSER_ATTR_URI, |
| 76 | ATTR_JS = HTMLPARSER_ATTR_JS, |
| 77 | ATTR_STYLE = HTMLPARSER_ATTR_STYLE |
| 78 | }; |
| 79 | |
| 80 | /* Parser modes */ |
| 81 | enum Mode { |
| 82 | MODE_HTML = HTMLPARSER_MODE_HTML, |
| 83 | MODE_JS = HTMLPARSER_MODE_JS, |
| 84 | MODE_CSS = HTMLPARSER_MODE_CSS, |
| 85 | MODE_HTML_IN_TAG = HTMLPARSER_MODE_HTML_IN_TAG |
| 86 | }; |
| 87 | |
| 88 | HtmlParser() { |
| 89 | parser_ = htmlparser_new(); |
| 90 | CHECK(parser_ != NULL); |
| 91 | }; |
| 92 | |
| 93 | /* Parses the input html stream and returns the finishing state. |
| 94 | * |
| 95 | * Returns HtmlParser::STATE_ERROR if unable to parse the input. If |
| 96 | * htmlparser_parse() is called after an error situation was encountered |
| 97 | * the behaviour is unspecified. At this point, Reset() or ResetMode() |
| 98 | * can be called to reset the state so it can be used to parse a new file. |
| 99 | */ |
| 100 | int Parse(const char *str, int len) { |
| 101 | return htmlparser_parse(parser_, str, len); |
| 102 | }; |
| 103 | |
| 104 | int Parse(const std::string &str) { |
| 105 | return Parse(str.c_str(), static_cast<int>(str.length())); |
| 106 | }; |
| 107 | |
| 108 | /* Returns the current state the parser is in */ |
| 109 | int state() const { |
| 110 | return htmlparser_state(parser_); |
| 111 | }; |
| 112 | |
| 113 | /* Returns the current tag or NULL if not available. |
| 114 | * |
| 115 | * There is no stack implemented because we currently don't have a need for |
| 116 | * it, which means tag names are tracked only one level deep. |
| 117 | * |
| 118 | * This is better understood by looking at the following example: |
| 119 | * |
| 120 | * <b [tag=b]> |
| 121 | * [tag=b] |
| 122 | * <i> |
| 123 | * [tag=i] |
| 124 | * </i> |
| 125 | * [tag=NULL] |
| 126 | * </b> |
| 127 | * |
| 128 | * The tag is correctly filled inside the tag itself and before any new |
| 129 | * inner tag is closed, at which point the tag will be set to NULL. |
| 130 | * |
| 131 | * For our current purposes this is not a problem, but we may implement a |
| 132 | * tag tracking stack in the future for completeness. |
| 133 | */ |
| 134 | const char *tag() const { |
| 135 | return htmlparser_tag(parser_); |
| 136 | } |
| 137 | |
| 138 | /* Returns the current attribute name if inside an attribute name or an |
| 139 | * attribute value. Returns NULL otherwise. */ |
| 140 | const char *attribute() const { |
| 141 | return htmlparser_attr(parser_); |
| 142 | } |
| 143 | |
| 144 | /* Returns the contents of the current attribute value. */ |
| 145 | const char *value() const { |
| 146 | return htmlparser_value(parser_); |
| 147 | } |
| 148 | |
| 149 | /* Returns true if inside javascript. This can be a javascript block, a |
| 150 | * javascript attribute value or the parser may just be in javascript mode |
| 151 | * (HtmlParser::MODE_JS) */ |
| 152 | bool InJavascript() const { |
| 153 | return static_cast<bool>(htmlparser_in_js(parser_)); |
| 154 | } |
| 155 | |
| 156 | /* Returns true if the parser is currently inside a CSS construct. |
| 157 | * |
| 158 | * Currently this can be either a STYLE tag, a STYLE attribute or the fact |
| 159 | * that the parser was reset using MODE_CSS using ResetMode(). |
| 160 | */ |
| 161 | bool InCss() const { |
| 162 | return static_cast<bool>(htmlparser_in_css(parser_)); |
| 163 | } |
| 164 | |
| 165 | /* Returns true if the current attribute is quoted */ |
| 166 | bool IsAttributeQuoted() const { |
| 167 | return static_cast<bool>(htmlparser_is_attr_quoted(parser_)); |
| 168 | } |
| 169 | |
| 170 | /* Returns true if the parser is inside a js string literal. |
| 171 | */ |
| 172 | bool IsJavascriptQuoted() const { |
| 173 | return static_cast<bool>(htmlparser_is_js_quoted(parser_)); |
| 174 | } |
| 175 | |
| 176 | /* Returns the index within the current value or -1 if the parser is not |
| 177 | * inside an attribute value */ |
| 178 | int ValueIndex() const { |
| 179 | return htmlparser_value_index(parser_); |
| 180 | } |
| 181 | |
| 182 | /* Returns true if this is the first character of a url inside an attribute. |
| 183 | * |
| 184 | * This function can be used by an html sanitizer or auto escaping system as |
| 185 | * a hint that it should validate the url for a whitelist of protocol |
| 186 | * handlers and for well-formedness, or that it should just escape a |
| 187 | * component of it. |
| 188 | * |
| 189 | * For attributes that expect a url this will return true if we are at the |
| 190 | * first character of the attribute, but for the special case of a meta |
| 191 | * redirect tag some analysis is made in order to verify if we are at the |
| 192 | * start of a url or not. |
| 193 | * |
| 194 | * For any other attributes, the result will always be false. |
| 195 | * |
| 196 | */ |
| 197 | bool IsUrlStart() const { |
| 198 | return htmlparser_is_url_start(parser_); |
| 199 | } |
| 200 | |
| 201 | /* Returns the current attribute type. |
| 202 | * |
| 203 | * The attribute type can be one of: |
| 204 | * ATTR_NONE - not inside an attribute |
| 205 | * ATTR_REGULAR - Inside a normal attribute |
| 206 | * ATTR_URI - Inside an attribute that accepts a uri |
| 207 | * ATTR_JS - Inside a javascript attribute |
| 208 | * ATTR_STYLE - Inside a css style attribute |
| 209 | * */ |
| 210 | int AttributeType() const { |
| 211 | return htmlparser_attr_type(parser_); |
| 212 | } |
| 213 | |
| 214 | /* Return the current line number. */ |
| 215 | int line_number() const { |
| 216 | return htmlparser_get_line_number(parser_); |
| 217 | } |
| 218 | |
| 219 | /* Set the current line number. */ |
| 220 | void set_line_number(int line) { |
| 221 | return htmlparser_set_line_number(parser_, line); |
| 222 | } |
| 223 | |
| 224 | /* Return the current column number. */ |
| 225 | int column_number() const { |
| 226 | return htmlparser_get_column_number(parser_); |
| 227 | } |
| 228 | |
| 229 | /* Set the current line number. */ |
| 230 | void set_column_number(int column) { |
| 231 | return htmlparser_set_column_number(parser_, column); |
| 232 | } |
| 233 | |
| 234 | /* Retrieve a human readable error message in case an error occurred. |
| 235 | * |
| 236 | * NULL is returned if the parser didn't encounter an error. |
| 237 | */ |
| 238 | const char *GetErrorMessage() { |
| 239 | return htmlparser_get_error_msg(parser_); |
| 240 | } |
| 241 | |
| 242 | /* Returns the current state the javascript parser is in. |
| 243 | * |
| 244 | * Should only be used for testing. |
| 245 | */ |
| 246 | int javascript_state() const { |
| 247 | return htmlparser_js_state(parser_); |
| 248 | }; |
| 249 | |
| 250 | /* Resets the parser to it's initial state and changes the parser mode. |
| 251 | * |
| 252 | * Internal state (tag name, attribute name, state of statemachine) is |
| 253 | * reset as * though the object was just created. |
| 254 | * |
| 255 | * Available modes: |
| 256 | * MODE_HTML - Parses html text |
| 257 | * MODE_JS - Parses javascript files |
| 258 | * MODE_CSS - Parses CSS files. No actual parsing is actually done |
| 259 | * but InCss() always returns true. |
| 260 | * MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To |
| 261 | * be used in a template expanded in the |
| 262 | * following context: <a $template> |
| 263 | */ |
| 264 | void ResetMode(enum Mode mode) { |
| 265 | return htmlparser_reset_mode(parser_, mode); |
| 266 | } |
| 267 | |
| 268 | /* Resets the parser to it's initial state and to the default mode, which is |
| 269 | * MODE_HTML. |
| 270 | * |
| 271 | * All internal context like tag name, attribute name or the state of the |
| 272 | * statemachine are reset to it's original values as if the object was just |
| 273 | * created. |
| 274 | */ |
| 275 | void Reset() { |
| 276 | return htmlparser_reset(parser_); |
| 277 | } |
| 278 | |
| 279 | /* Invoked when text is inserted by the caller. |
| 280 | * |
| 281 | * Should be called before a template directive that expands to content is |
| 282 | * found. This changes the current state by following the default rule, |
| 283 | * ensuring we stay in sync with template. |
| 284 | * |
| 285 | * Returns true if template directives are accepted for this state and |
| 286 | * false if they are not, which should result in an error condition. |
| 287 | * |
| 288 | * Right now the only case being handled are unquoted attribute values and |
| 289 | * it always returns true. In the future we can handle more cases and |
| 290 | * restrict the states were we allow template directives by returning false |
| 291 | * for those. |
| 292 | */ |
| 293 | bool InsertText() { |
| 294 | return static_cast<bool>(htmlparser_insert_text(parser_)); |
| 295 | } |
| 296 | |
| 297 | /* Copies the context of the HtmlParser object referenced in source to the |
| 298 | * current object. |
| 299 | */ |
| 300 | void CopyFrom(const HtmlParser *source) { |
| 301 | CHECK(this != source); |
| 302 | CHECK(source != NULL); |
| 303 | htmlparser_copy(parser_, source->parser_); |
| 304 | } |
| 305 | |
| 306 | ~HtmlParser() { |
| 307 | htmlparser_delete(parser_); |
| 308 | }; |
| 309 | |
| 310 | |
| 311 | private: |
| 312 | htmlparser_ctx *parser_; |
| 313 | DISALLOW_COPY_AND_ASSIGN(HtmlParser); |
| 314 | }; |
| 315 | |
| 316 | } // namespace ctemplate_htmlparser |
| 317 | |
| 318 | #endif // SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__ |