// Copyright (c) 2007, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// ---
// Author: falmeida@google.com (Filipe Almeida)
//
// c++ bindings for htmlparser.
33
34#ifndef SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__
35#define SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__
36
37#include <config.h>
38#include <string>
39#include "htmlparser/htmlparser.h"
40#include "htmlparser/jsparser.h"
41#include "base/util.h"
42
43namespace ctemplate_htmlparser {
44
45class JavascriptParser {
46 public:
47 enum State {
48 STATE_TEXT = JSPARSER_STATE_TEXT,
49 STATE_Q = JSPARSER_STATE_Q,
50 STATE_DQ = JSPARSER_STATE_DQ,
51 STATE_REGEXP = JSPARSER_STATE_REGEXP,
52 STATE_COMMENT = JSPARSER_STATE_COMMENT,
53 };
54};
55
56class HtmlParser {
57 public:
58
59 /* html states */
60 enum State {
61 STATE_TEXT = HTMLPARSER_STATE_TEXT,
62 STATE_TAG = HTMLPARSER_STATE_TAG,
63 STATE_ATTR = HTMLPARSER_STATE_ATTR,
64 STATE_VALUE = HTMLPARSER_STATE_VALUE,
65 STATE_COMMENT = HTMLPARSER_STATE_COMMENT,
66 STATE_JS_FILE = HTMLPARSER_STATE_JS_FILE,
67 STATE_CSS_FILE = HTMLPARSER_STATE_CSS_FILE,
68 STATE_ERROR = HTMLPARSER_STATE_ERROR
69 };
70
71 /* attribute types */
72 enum AttributeType {
73 ATTR_NONE = HTMLPARSER_ATTR_NONE,
74 ATTR_REGULAR = HTMLPARSER_ATTR_REGULAR,
75 ATTR_URI = HTMLPARSER_ATTR_URI,
76 ATTR_JS = HTMLPARSER_ATTR_JS,
77 ATTR_STYLE = HTMLPARSER_ATTR_STYLE
78 };
79
80 /* Parser modes */
81 enum Mode {
82 MODE_HTML = HTMLPARSER_MODE_HTML,
83 MODE_JS = HTMLPARSER_MODE_JS,
84 MODE_CSS = HTMLPARSER_MODE_CSS,
85 MODE_HTML_IN_TAG = HTMLPARSER_MODE_HTML_IN_TAG
86 };
87
88 HtmlParser() {
89 parser_ = htmlparser_new();
90 CHECK(parser_ != NULL);
91 };
92
93 /* Parses the input html stream and returns the finishing state.
94 *
95 * Returns HtmlParser::STATE_ERROR if unable to parse the input. If
96 * htmlparser_parse() is called after an error situation was encountered
97 * the behaviour is unspecified. At this point, Reset() or ResetMode()
98 * can be called to reset the state so it can be used to parse a new file.
99 */
100 int Parse(const char *str, int len) {
101 return htmlparser_parse(parser_, str, len);
102 };
103
104 int Parse(const std::string &str) {
105 return Parse(str.c_str(), static_cast<int>(str.length()));
106 };
107
108 /* Returns the current state the parser is in */
109 int state() const {
110 return htmlparser_state(parser_);
111 };
112
113 /* Returns the current tag or NULL if not available.
114 *
115 * There is no stack implemented because we currently don't have a need for
116 * it, which means tag names are tracked only one level deep.
117 *
118 * This is better understood by looking at the following example:
119 *
120 * <b [tag=b]>
121 * [tag=b]
122 * <i>
123 * [tag=i]
124 * </i>
125 * [tag=NULL]
126 * </b>
127 *
128 * The tag is correctly filled inside the tag itself and before any new
129 * inner tag is closed, at which point the tag will be set to NULL.
130 *
131 * For our current purposes this is not a problem, but we may implement a
132 * tag tracking stack in the future for completeness.
133 */
134 const char *tag() const {
135 return htmlparser_tag(parser_);
136 }
137
138 /* Returns the current attribute name if inside an attribute name or an
139 * attribute value. Returns NULL otherwise. */
140 const char *attribute() const {
141 return htmlparser_attr(parser_);
142 }
143
144 /* Returns the contents of the current attribute value. */
145 const char *value() const {
146 return htmlparser_value(parser_);
147 }
148
149 /* Returns true if inside javascript. This can be a javascript block, a
150 * javascript attribute value or the parser may just be in javascript mode
151 * (HtmlParser::MODE_JS) */
152 bool InJavascript() const {
153 return static_cast<bool>(htmlparser_in_js(parser_));
154 }
155
156 /* Returns true if the parser is currently inside a CSS construct.
157 *
158 * Currently this can be either a STYLE tag, a STYLE attribute or the fact
159 * that the parser was reset using MODE_CSS using ResetMode().
160 */
161 bool InCss() const {
162 return static_cast<bool>(htmlparser_in_css(parser_));
163 }
164
165 /* Returns true if the current attribute is quoted */
166 bool IsAttributeQuoted() const {
167 return static_cast<bool>(htmlparser_is_attr_quoted(parser_));
168 }
169
170 /* Returns true if the parser is inside a js string literal.
171 */
172 bool IsJavascriptQuoted() const {
173 return static_cast<bool>(htmlparser_is_js_quoted(parser_));
174 }
175
176 /* Returns the index within the current value or -1 if the parser is not
177 * inside an attribute value */
178 int ValueIndex() const {
179 return htmlparser_value_index(parser_);
180 }
181
182 /* Returns true if this is the first character of a url inside an attribute.
183 *
184 * This function can be used by an html sanitizer or auto escaping system as
185 * a hint that it should validate the url for a whitelist of protocol
186 * handlers and for well-formedness, or that it should just escape a
187 * component of it.
188 *
189 * For attributes that expect a url this will return true if we are at the
190 * first character of the attribute, but for the special case of a meta
191 * redirect tag some analysis is made in order to verify if we are at the
192 * start of a url or not.
193 *
194 * For any other attributes, the result will always be false.
195 *
196 */
197 bool IsUrlStart() const {
198 return htmlparser_is_url_start(parser_);
199 }
200
201 /* Returns the current attribute type.
202 *
203 * The attribute type can be one of:
204 * ATTR_NONE - not inside an attribute
205 * ATTR_REGULAR - Inside a normal attribute
206 * ATTR_URI - Inside an attribute that accepts a uri
207 * ATTR_JS - Inside a javascript attribute
208 * ATTR_STYLE - Inside a css style attribute
209 * */
210 int AttributeType() const {
211 return htmlparser_attr_type(parser_);
212 }
213
214 /* Return the current line number. */
215 int line_number() const {
216 return htmlparser_get_line_number(parser_);
217 }
218
219 /* Set the current line number. */
220 void set_line_number(int line) {
221 return htmlparser_set_line_number(parser_, line);
222 }
223
224 /* Return the current column number. */
225 int column_number() const {
226 return htmlparser_get_column_number(parser_);
227 }
228
229 /* Set the current line number. */
230 void set_column_number(int column) {
231 return htmlparser_set_column_number(parser_, column);
232 }
233
234 /* Retrieve a human readable error message in case an error occurred.
235 *
236 * NULL is returned if the parser didn't encounter an error.
237 */
238 const char *GetErrorMessage() {
239 return htmlparser_get_error_msg(parser_);
240 }
241
242 /* Returns the current state the javascript parser is in.
243 *
244 * Should only be used for testing.
245 */
246 int javascript_state() const {
247 return htmlparser_js_state(parser_);
248 };
249
250 /* Resets the parser to it's initial state and changes the parser mode.
251 *
252 * Internal state (tag name, attribute name, state of statemachine) is
253 * reset as * though the object was just created.
254 *
255 * Available modes:
256 * MODE_HTML - Parses html text
257 * MODE_JS - Parses javascript files
258 * MODE_CSS - Parses CSS files. No actual parsing is actually done
259 * but InCss() always returns true.
260 * MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To
261 * be used in a template expanded in the
262 * following context: <a $template>
263 */
264 void ResetMode(enum Mode mode) {
265 return htmlparser_reset_mode(parser_, mode);
266 }
267
268 /* Resets the parser to it's initial state and to the default mode, which is
269 * MODE_HTML.
270 *
271 * All internal context like tag name, attribute name or the state of the
272 * statemachine are reset to it's original values as if the object was just
273 * created.
274 */
275 void Reset() {
276 return htmlparser_reset(parser_);
277 }
278
279 /* Invoked when text is inserted by the caller.
280 *
281 * Should be called before a template directive that expands to content is
282 * found. This changes the current state by following the default rule,
283 * ensuring we stay in sync with template.
284 *
285 * Returns true if template directives are accepted for this state and
286 * false if they are not, which should result in an error condition.
287 *
288 * Right now the only case being handled are unquoted attribute values and
289 * it always returns true. In the future we can handle more cases and
290 * restrict the states were we allow template directives by returning false
291 * for those.
292 */
293 bool InsertText() {
294 return static_cast<bool>(htmlparser_insert_text(parser_));
295 }
296
297 /* Copies the context of the HtmlParser object referenced in source to the
298 * current object.
299 */
300 void CopyFrom(const HtmlParser *source) {
301 CHECK(this != source);
302 CHECK(source != NULL);
303 htmlparser_copy(parser_, source->parser_);
304 }
305
306 ~HtmlParser() {
307 htmlparser_delete(parser_);
308 };
309
310
311 private:
312 htmlparser_ctx *parser_;
313 DISALLOW_COPY_AND_ASSIGN(HtmlParser);
314};
315
}  // namespace ctemplate_htmlparser
317
318#endif // SECURITY_STREAMHTMLPARSER_HTMLPARSER_CPP_H__