/*
 * Copyright (c) 2007, Google Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 * * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ---
 *
 * Author: falmeida@google.com (Filipe Almeida)
 */

#ifndef SECURITY_STREAMHTMLPARSER_HTMLPARSER_H
#define SECURITY_STREAMHTMLPARSER_HTMLPARSER_H

#include <config.h>
#include <string.h>  /* strlen(), used by htmlparser_parse_str() */
#include "htmlparser/statemachine.h"
#include "htmlparser/jsparser.h"

/* Annoying stuff for windows in opensource-land -- make sure clients
 * (in this case unittests) can import the functions.
 *
 * If the includer has not already defined CTEMPLATE_DLL_DECL, it defaults to
 * __declspec(dllimport) under MSVC and to nothing everywhere else.
 */
#ifndef CTEMPLATE_DLL_DECL
# ifdef _MSC_VER
#   define CTEMPLATE_DLL_DECL  __declspec(dllimport)
# else
#   define CTEMPLATE_DLL_DECL  /* should be the empty string for non-windows */
# endif
#endif

#ifdef __cplusplus
namespace ctemplate_htmlparser {
#endif

/* entity filter */

/* String sizes used in htmlparser and entityfilter structures including the
 * NUL terminator.
 *
 * HTMLPARSER_MAX_STRING sizes the tag, attr and value buffers of
 * htmlparser_ctx; HTMLPARSER_MAX_ENTITY_SIZE sizes the buffer and output
 * arrays of entityfilter_ctx.
 */
#define HTMLPARSER_MAX_STRING STATEMACHINE_RECORD_BUFFER_SIZE
#define HTMLPARSER_MAX_ENTITY_SIZE 10


/* External parser states, i.e. the values reported by htmlparser_state().
 * Enumerators are numbered consecutively starting at 0.
 */
enum htmlparser_state_external_enum {
  HTMLPARSER_STATE_TEXT,
  HTMLPARSER_STATE_TAG,
  HTMLPARSER_STATE_ATTR,
  HTMLPARSER_STATE_VALUE,
  HTMLPARSER_STATE_COMMENT,
  HTMLPARSER_STATE_JS_FILE,
  HTMLPARSER_STATE_CSS_FILE,
  HTMLPARSER_STATE_ERROR
};

/* Parser modes that can be selected with htmlparser_reset_mode(). */
enum htmlparser_mode {
  HTMLPARSER_MODE_HTML,        /* Parses html text. */
  HTMLPARSER_MODE_JS,          /* Parses javascript files. */
  HTMLPARSER_MODE_CSS,         /* Parses CSS files. */
  HTMLPARSER_MODE_HTML_IN_TAG  /* Parses an attribute list inside a tag. */
};

/* Attribute types, i.e. the values reported by htmlparser_attr_type(). */
enum htmlparser_attr_type {
  HTMLPARSER_ATTR_NONE,     /* Not inside an attribute. */
  HTMLPARSER_ATTR_REGULAR,  /* Inside a normal attribute. */
  HTMLPARSER_ATTR_URI,      /* Inside an attribute that accepts a uri. */
  HTMLPARSER_ATTR_JS,       /* Inside a javascript attribute. */
  HTMLPARSER_ATTR_STYLE     /* Inside a css style attribute. */
};


92/* TODO(falmeida): Maybe move some of these declaration to the .c and only keep
93 * a forward declaration in here, since these structures are private.
94 */
95
96/* entityfilter context structure.
97 *
98 * The entity filter collection of routines provide a way to decode html
99 * entities from an html document in a streaming way.
100 *
101 * The html_process() function receives a character at a time from the input
102 * stream and returns 0 or more characters which should be appended to the
103 * resulting decoded document.
104 *
105 * Currently this collection of functions are only exported for testing purposes
106 * and shouldn't be called from outside of htmlparser.c.
107 *
108 * Since we really only use these functions with the very specific purpose of
109 * decoding html entities for javascript attributes, only a small subset of
110 * entities are supported: &lt;, &gt;, &quote;, &amp;, &apos, and the numeric
111 * character references for both decimal and hexadecimal.
112 */
113typedef struct entityfilter_ctx_s {
114
115 /* Current position into the buffer. */
116 int buffer_pos;
117
118 /* True if currently processing an html entity. */
119 int in_entity;
120
121 /* Temporary character buffer that is used while processing html entities.
122 */
123 char buffer[HTMLPARSER_MAX_ENTITY_SIZE];
124
125 /* String buffer returned to the application after we decoded an html
126 * entity.
127 */
128 char output[HTMLPARSER_MAX_ENTITY_SIZE];
129} entityfilter_ctx;
130
131/* Resets the entityfilter to its initial state so it can be reused.
132 */
133void entityfilter_reset(entityfilter_ctx *ctx);
134
135/* Initializes a new entity filter object.
136 */
137entityfilter_ctx *entityfilter_new(void);
138
139/* Deallocates an entity filter object.
140 */
141void entityfilter_delete(entityfilter_ctx *ctx);
142
143/* Copies the context of the entityfilter pointed to by src to the entityfilter
144 * dst.
145 */
146void entityfilter_copy(entityfilter_ctx *dst, entityfilter_ctx *src);
147
148/* Processes a character from the input stream and decodes any html entities
149 * in the accumulated buffer.
150 *
151 * Returns a reference to a string that points to an internal buffer. This
152 * buffer will be changed after every call to entityfilter_process(). As
153 * such this string should be duplicated before subsequent calls to
154 * entityfilter_process().
155 */
156const char *entityfilter_process(entityfilter_ctx *ctx, char c);
157
158
159/* html parser */
160
161/* Stores the context of the html parser.
162 * If this structure is changed, htmlparser_new(), htmlparser_copy() and
163 * htmlparser_reset() should be updated accordingly.
164 */
165typedef struct htmlparser_ctx_s {
166
167 /* Holds a reference to the statemachine context. */
168 statemachine_ctx *statemachine;
169
170 /* Holds a reference to the statemachine definition in use. Right now this is
171 * only used so we can deallocate it at the end.
172 *
173 * It should be readonly and contain the same values across jsparser
174 * instances.
175 */
176 /* TODO(falmeida): Change statemachine_def to const. */
177 statemachine_definition *statemachine_def;
178
179 /* Holds a reference to the javascript parser. */
180 jsparser_ctx *jsparser;
181
182 /* Holds a reference to the entity filter. Used for decoding html entities
183 * inside javascript attributes. */
184 entityfilter_ctx *entityfilter;
185
186 /* Offset into the current attribute value where 0 is the first character in
187 * the value. */
188 int value_index;
189
190 /* True if currently processing javascript. */
191 int in_js;
192
193 /* Current tag name. */
194 char tag[HTMLPARSER_MAX_STRING];
195
196 /* Current attribute name. */
197 char attr[HTMLPARSER_MAX_STRING];
198
199 /* Contents of the current value capped to HTMLPARSER_MAX_STRING. */
200 char value[HTMLPARSER_MAX_STRING];
201
202} htmlparser_ctx;
203
204/* Resets the parser to its initial state and to the default mode, which
205 * is MODE_HTML.
206 *
207 * All internal context like tag name, attribute name or the state of the
208 * statemachine are reset to its original values as if the object was just
209 * created.
210 */
211extern CTEMPLATE_DLL_DECL
212void htmlparser_reset(htmlparser_ctx *ctx);
213
214/* Resets the parser to its initial state and changes the parser mode.
215 * All internal context like tag name, attribute name or the state of the
216 * statemachine are reset to their original values as if the object was just
217 * created.
218 *
219 * Available modes:
220 * HTMLPARSER_MODE_HTML - Parses html text
221 * HTMLPARSER_MODE_JS - Parses javascript files
222 * HTMLPARSER_MODE_CSS - Parses CSS files. No actual parsing is actually done
223 * but htmlparser_in_css() always returns true.
224 * HTMLPARSER_MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To
225 * be used in a template expanded in the
226 * following context: <a $template>
227 *
228 */
229extern CTEMPLATE_DLL_DECL
230void htmlparser_reset_mode(htmlparser_ctx *ctx, int mode);
231
232/* Initializes a new htmlparser instance.
233 *
234 * Returns a pointer to the new instance or NULL if the initialization fails.
235 * Initialization failure is fatal, and if this function fails it may not
236 * deallocate all previsouly allocated memory.
237 */
238extern CTEMPLATE_DLL_DECL
239htmlparser_ctx *htmlparser_new(void);
240
241/* Copies the context of the htmlparser pointed to by src to the htmlparser dst.
242 *
243 * Also copies over the instances of the state machine, the jsparser and the
244 * entity filter but not the statemachine definition since this one is read
245 * only.
246 */
247extern CTEMPLATE_DLL_DECL
248void htmlparser_copy(htmlparser_ctx *dst, const htmlparser_ctx *src);
249
250/* Receives an htmlparser context and returns the current html state.
251 *
252 * The return value will be one of the states of htmlparser_state_external_enum.
253 */
254extern CTEMPLATE_DLL_DECL
255int htmlparser_state(htmlparser_ctx *ctx);
256
257/* Parses the input html stream and returns the finishing state.
258 *
259 * Returns HTMLPARSER_ERROR if unable to parse the input. If htmlparser_parse()
260 * is called after an error situation was encountered the behaviour is
261 * unspecified. At this point, htmlparser_reset() or htmlparser_reset_mode()
262 * can be called to reset the state.
263 */
264extern CTEMPLATE_DLL_DECL
265int htmlparser_parse(htmlparser_ctx *ctx, const char *str, int size);
266
267/* Returns true if the parser is inside an attribute value and the value is
268 * surrounded by single or double quotes. */
269extern CTEMPLATE_DLL_DECL
270int htmlparser_is_attr_quoted(htmlparser_ctx *ctx);
271
272/* Returns true if the parser is currently in javascript. This can be a
273 * an attribute that takes javascript, a javascript block or the parser
274 * can just be in MODE_JS. */
275extern CTEMPLATE_DLL_DECL
276int htmlparser_in_js(htmlparser_ctx *ctx);
277
278/* Returns the current tag or NULL if not available or we haven't seen the
279 * entire tag yet.
280 *
281 * There is no stack implemented because we currently don't have a need for
282 * it, which means tag names are tracked only one level deep.
283 *
284 * This is better understood by looking at the following example:
285 *
286 * <b [tag=b]>
287 * [tag=b]
288 * <i>
289 * [tag=i]
290 * </i>
291 * [tag=NULL]
292 * </b>
293 *
294 * The tag is correctly filled inside the tag itself and before any new inner
295 * tag is closed, at which point the tag will be null.
296 *
297 * For our current purposes this is not a problem, but we may implement a tag
298 * tracking stack in the future for completeness.
299 *
300 */
301extern CTEMPLATE_DLL_DECL
302const char *htmlparser_tag(htmlparser_ctx *ctx);
303
304/* Returns the current attribute name if after an attribute name or in an
305 * attribute value. Returns NULL otherwise. */
306extern CTEMPLATE_DLL_DECL
307const char *htmlparser_attr(htmlparser_ctx *ctx);
308
309/* Returns the contents of the current attribute value.
310 *
311 * Returns NULL if not inside an attribute value.
312 */
313extern CTEMPLATE_DLL_DECL
314const char *htmlparser_value(htmlparser_ctx *ctx);
315
316/* Returns true if the parser is currently inside a CSS construct.
317 *
318 * Currently this can be either a STYLE tag, a STYLE attribute or the fact that
319 * the parser was reset in HTMLPARSER_MODE_CSS using
320 * htmlparser_reset_mode().
321 */
322extern CTEMPLATE_DLL_DECL
323int htmlparser_in_css(htmlparser_ctx *ctx);
324
325/* Returns the current state of the javascript state machine.
326 *
327 * Currently only present for testing purposes.
328 */
329extern CTEMPLATE_DLL_DECL
330int htmlparser_js_state(htmlparser_ctx *ctx);
331
332/* Returns non-zero if currently inside a javascript string literal and zero
333 * otherwise.
334 */
335extern CTEMPLATE_DLL_DECL
336int htmlparser_is_js_quoted(htmlparser_ctx *ctx);
337
338/* Returns non-zero if currently inside an attribute value and zero otherwise.
339 */
340extern CTEMPLATE_DLL_DECL
341int htmlparser_value_index(htmlparser_ctx *ctx);
342
343/* Returns true if this is the first character of a url inside an attribute.
344 *
345 * This function can be used by an html sanitizer or auto escaping system as a
346 * hint that it should validate the url for a whitelist of protocol handlers and
347 * for well-formedness, or that it should just escape a component of it.
348 *
349 * For attributes that expect a URL, this will return true if we are at the
350 * first character of the URL, false otherwise.
351 * For most attributes, this is the same as checking that we are at the first
352 * character of the attribute value but it also works correctly for the
353 * "content" attribute of the "meta" tag where the URL follows some earlier
354 * content.
355 * e.g: <meta http-equiv="refresh" * content="5; URL=http://bla.">
356 *
357 * For any other attributes, the result will always be false.
358 */
359extern CTEMPLATE_DLL_DECL
360int htmlparser_is_url_start(htmlparser_ctx *ctx);
361
362/* Returns the current attribute type.
363 *
364 * The attribute type can be one of:
365 * HTMLPARSER_ATTR_NONE - not inside an attribute.
366 * HTMLPARSER_ATTR_REGULAR - Inside a normal attribute.
367 * HTMLPARSER_ATTR_URI - Inside an attribute that accepts a uri.
368 * HTMLPARSER_ATTR_JS - Inside a javascript attribute.
369 * HTMLPARSER_ATTR_STYLE - Inside a css style attribute.
370 */
371extern CTEMPLATE_DLL_DECL
372int htmlparser_attr_type(htmlparser_ctx *ctx);
373
374/* Return the current line number. */
375extern CTEMPLATE_DLL_DECL
376int htmlparser_get_line_number(htmlparser_ctx *ctx);
377
378/* Set the current line number. */
379extern CTEMPLATE_DLL_DECL
380void htmlparser_set_line_number(htmlparser_ctx *ctx, int line);
381
382/* Return the current column number. */
383extern CTEMPLATE_DLL_DECL
384int htmlparser_get_column_number(htmlparser_ctx *ctx);
385
386/* Set the current column number. */
387extern CTEMPLATE_DLL_DECL
388void htmlparser_set_column_number(htmlparser_ctx *ctx, int column);
389
390/* Retrieve a human readable error message in case an error occurred.
391 *
392 * NULL is returned if the parser didn't encounter an error.
393 */
394extern CTEMPLATE_DLL_DECL
395const char *htmlparser_get_error_msg(htmlparser_ctx *ctx);
396
397/* Invoked by the caller when text is expanded by the caller.
398 *
399 * Should be invoked when a template directive that expands to content is
400 * executed but we don't provide this content to the parser itself. This changes
401 * the current state by following the default rule, ensuring we stay in sync
402 * with the template.
403 *
404 * Returns 1 if template directives are accepted for this state and 0 if they
405 * are not, which should result in an error condition.
406 *
407 * Right now the only case being handled are unquoted attribute values and it
408 * always returns 1. When insert_text() is called after the equals sign, we
409 * assume some text was consumed and we are now in the middle of the attribute
410 * value itself. Example:
411 *
412 * <a href=$HREF_VALUE alt=alternate_text>
413 *
414 * The template calls insert_text() when it encounters $HREF_VALUE. If it didn't
415 * the parser would only have seen the following html:
416 *
417 * <a href= alt=alternate_text>
418 *
419 * and would interpret alt=alternate_text as the value of the href attribute.
420 */
421extern CTEMPLATE_DLL_DECL
422int htmlparser_insert_text(htmlparser_ctx *ctx);
423
424/* Deallocates an htmlparser context object.
425 */
426extern CTEMPLATE_DLL_DECL
427void htmlparser_delete(htmlparser_ctx *ctx);
428
/* Convenience wrappers around htmlparser_parse().
 *
 * htmlparser_parse_chr(a, b) - parses the single character b; b must be an
 *                              lvalue because its address is taken.
 * htmlparser_parse_str(a, b) - parses the NUL terminated string b; b is
 *                              evaluated twice, so it must not have side
 *                              effects. Requires strlen() from <string.h>.
 *
 * Both expand to a plain expression yielding the result of htmlparser_parse().
 * The expansions deliberately carry no trailing semicolon so they can be used
 * inside larger expressions and in if/else statements.
 */
#define htmlparser_parse_chr(a,b) htmlparser_parse((a), &(b), 1)
#ifdef __cplusplus
#define htmlparser_parse_str(a,b) htmlparser_parse((a), (b), \
                                                   static_cast<int>(strlen(b)))
#else
#define htmlparser_parse_str(a,b) htmlparser_parse((a), (b), (int)strlen(b))
#endif

#ifdef __cplusplus
}  /* namespace ctemplate_htmlparser */
#endif

#endif  /* SECURITY_STREAMHTMLPARSER_HTMLPARSER_H */