Blame - src/htmlparser/htmlparser.cc - RealtimeRoboticsGroup/test

blob: 749a74eca830a3a4bb6dbfa2aee6949ba77507ed [file] [log] [blame]

Brian Silverman	70325d6	2015-09-20 17:00:43 -0400	[diff] [blame^]	1	/*
				2	* Copyright (c) 2007, Google Inc.
				3	* All rights reserved.
				4	*
				5	* Redistribution and use in source and binary forms, with or without
				6	* modification, are permitted provided that the following conditions are
				7	* met:
				8	*
				9	* * Redistributions of source code must retain the above copyright
				10	* notice, this list of conditions and the following disclaimer.
				11	* * Redistributions in binary form must reproduce the above
				12	* copyright notice, this list of conditions and the following disclaimer
				13	* in the documentation and/or other materials provided with the
				14	* distribution.
				15	* * Neither the name of Google Inc. nor the names of its
				16	* contributors may be used to endorse or promote products derived from
				17	* this software without specific prior written permission.
				18	*
				19	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				20	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				21	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				22	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				23	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				24	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				25	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				26	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				27	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				28	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				29	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				30	* ---
				31	*
				32	* Author: falmeida@google.com (Filipe Almeida)
				33	*/
				34
				35	/* TODO(falmeida): Breaks on NULL characters in the stream. fix.
				36	*/
				37
				38	#include <stdio.h>
				39	#include <stdlib.h>
				40	#include <string.h>
				41	#include <ctype.h>
				42	#include <assert.h>
				43
				44	#include "htmlparser/statemachine.h"
				45	#include "htmlparser/htmlparser.h"
				46	#include "htmlparser/jsparser.h"
				47
				48	/* So we can support both C and C++ compilers, we use the CAST() macro instead
				49	* of using C style casts or static_cast<>() directly.
				50	*/
				51	#ifdef __cplusplus
				52	#define CAST(type, expression) (static_cast<type>(expression))
				53	#else
				54	#define CAST(type, expression) ((type)(expression))
				55	#endif
				56
				57	#ifdef __cplusplus
				58	namespace ctemplate_htmlparser {
				59	#endif
				60
				61	/* Generated state machine definition. */
				62	#include "htmlparser/htmlparser_fsm.h"
				63
				64	#define is_js_attribute(attr) ((attr)[0] == 'o' && (attr)[1] == 'n')
				65	#define is_style_attribute(attr) (strcmp((attr), "style") == 0)
				66
				67	/* html entity filter */
				68	static struct entityfilter_table_s {
				69	const char *entity;
				70	const char *value;
				71	} entityfilter_table[] = {
				72	{ "lt", "<" },
				73	{ "gt", ">" },
				74	{ "quot", "\"" },
				75	{ "amp", "&" },
				76	{ "apos", "\'" },
				77	{ NULL, NULL }
				78	};
				79
				80	/* Utility functions */
				81
				82	/* Similar to strncpy() but avoids the NULL padding. */
				83	static inline void nopad_strncpy(char dst, const char src, size_t dst_size,
				84	size_t src_size)
				85	{
				86	size_t size;
				87
				88	/* size = min(dst_size, src_size) */
				89	size = dst_size > src_size ? src_size : dst_size;
				90	strncpy(dst, src, size);
				91	if (size > 0)
				92	dst[size - 1] = '\0';
				93	}
				94
				95	/* Converts the internal state into the external superstate.
				96	*/
				97	static int state_external(int st)
				98	{
				99	if (st == STATEMACHINE_ERROR)
				100	return HTMLPARSER_STATE_ERROR;
				101	else
				102	return htmlparser_states_external[st];
				103	}
				104
				105	/* Returns true if the character is considered an html whitespace character.
				106	*
				107	* From: http://www.w3.org/TR/html401/struct/text.html#h-9.1
				108	*/
				109	static inline int html_isspace(char chr)
				110	{
				111	if (chr == ' ' \|\| chr == '\t' \|\| chr == '\n' \|\| chr == '\r') {
				112	return 1;
				113	} else {
				114	return 0;
				115	}
				116	}
				117
				118	/* Returns true if the attribute is expected to contain a url
				119	* This list was taken from: http://www.w3.org/TR/html4/index/attributes.html
				120	*/
				121	static int is_uri_attribute(char *attr)
				122	{
				123	if (attr == NULL)
				124	return 0;
				125
				126	switch (attr[0]) {
				127	case 'a':
				128	if (strcmp(attr, "action") == 0)
				129	return 1;
				130	/* TODO(falmeida): This is a uri list. Should we treat it diferently? */
				131	if (strcmp(attr, "archive") == 0) /* This is a uri list */
				132	return 1;
				133	break;
				134
				135	case 'b':
				136	if (strcmp(attr, "background") == 0)
				137	return 1;
				138	break;
				139
				140	case 'c':
				141	if (strcmp(attr, "cite") == 0)
				142	return 1;
				143	if (strcmp(attr, "classid") == 0)
				144	return 1;
				145	if (strcmp(attr, "codebase") == 0)
				146	return 1;
				147	break;
				148
				149	case 'd':
				150	if (strcmp(attr, "data") == 0)
				151	return 1;
				152	if (strcmp(attr, "dynsrc") == 0) /* from msdn */
				153	return 1;
				154	break;
				155
				156	case 'h':
				157	if (strcmp(attr, "href") == 0)
				158	return 1;
				159	break;
				160
				161	case 'l':
				162	if (strcmp(attr, "longdesc") == 0)
				163	return 1;
				164	break;
				165
				166	case 's':
				167	if (strcmp(attr, "src") == 0)
				168	return 1;
				169	break;
				170
				171	case 'u':
				172	if (strcmp(attr, "usemap") == 0)
				173	return 1;
				174	break;
				175	}
				176
				177	return 0;
				178
				179	}
				180
				181	/* Convert a string to lower case characters inplace.
				182	*/
				183	static void tolower_str(char *s)
				184	{
				185	while (*s != '\0') {
				186	s = CAST(char, tolower(CAST(unsigned char,s)));
				187	s++;
				188	}
				189	}
				190
				191	static const char ignore_spaces_or_digits(const char value) {
				192	while (html_isspace(value) \|\| ((value >= '0' && *value <= '9')))
				193	value++;
				194
				195	return value;
				196	}
				197
				198	static const char ignore_spaces(const char value) {
				199	while (html_isspace(*value))
				200	value++;
				201
				202	return value;
				203	}
				204
				205	/* Return type of the function meta_redirect_type.
				206	*/
				207	enum meta_redirect_type_enum {
				208	META_REDIRECT_TYPE_NONE,
				209	META_REDIRECT_TYPE_URL_START,
				210	META_REDIRECT_TYPE_URL
				211	};
				212
				213	/* Analyzes a string for the presence of a meta refresh type url.
				214	*
				215	* This function receives the value of the content attribute of a meta tag and
				216	* parses it in order to identify if a url is going to be present. This is the
				217	* format of such tag:
				218	*
				219	* <meta http-equiv="refresh" content="5; URL=http://www.google.com">
				220	*
				221	* Using a regular expression library would be the most obvious way to implement
				222	* this functionality, but introducing such a dependency is undesirable. We
				223	* opted instead to parse programmaticly since the expression is simple enough.
				224	*
				225	* For reference, this is the spec on the meta http refresh tag:
				226	* http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh
				227	*
				228	* If the value has no content after the expression, we know we are at the start
				229	* of the URL. Otherwise we are past the start of the URL.
				230	*
				231	*
				232	* Returns:
				233	*
				234	* This functions returns one of the following values:
				235	* META_REDIRECT_TYPE_NONE - A url was not identified in the input string.
				236	* META_REDIRECT_TYPE_URL_START - The input string ends exactly at the start
				237	* of the url.
				238	* META_REDIRECT_TYPE_URL - The input string ends somewhere in the middle or
				239	* the end of the url.
				240	*
				241	* A few examples:
				242	* "5"
				243	* Returns META_REDIRECT_TYPE_NONE since we don't expect a url to follow.
				244	*
				245	* "5; URL = "
				246	* The function returns META_REDIRECT_TYPE_URL_START since we expect a url to
				247	* follow.
				248	*
				249	* "5; URL = http://www.google.com/?"
				250	* Returns META_REDIRECT_TYPE_URL since the input value terminates in the
				251	* middle or end of a url.
				252	*
				253	*
				254	* Caveats: We are only recording up to 256 characters of attribute values, so
				255	* our analysis is limited to that. This shouldn't be an issue in practice
				256	* though as it would be unexpected for the part of the string that we are
				257	* matching to be so long.
				258	*/
				259	enum meta_redirect_type_enum meta_redirect_type(const char *value) {
				260
				261	if (value == NULL)
				262	return META_REDIRECT_TYPE_NONE;
				263
				264	/* Match while [ \t\r\n0-9]* */
				265	value = ignore_spaces_or_digits(value);
				266
				267	/* Verify that we got a semi-colon character */
				268	if (*value != ';')
				269	return META_REDIRECT_TYPE_NONE;
				270	value++;
				271
				272	/* Match while [ \t\r\n]* */
				273	value = ignore_spaces(value);
				274
				275	/* Validate that we have 'URL' */
				276	if (strncasecmp(value, "url", strlen("url")) != 0)
				277	return META_REDIRECT_TYPE_NONE;
				278
				279	value += strlen("url");
				280
				281	/* Match while [ \t\r\n]* */
				282	value = ignore_spaces(value);
				283
				284	if (*value != '=')
				285	return META_REDIRECT_TYPE_NONE;
				286	value++;
				287
				288	/* Match while [ \t\r\n]* */
				289	value = ignore_spaces(value);
				290
				291	/* The HTML5 spec allows for the url to be quoted, so we skip a single or
				292	* double quote if we find one.
				293	*/
				294	if (value == '"' \|\| value == '\'')
				295	value++;
				296
				297	if (*value == '\0')
				298	return META_REDIRECT_TYPE_URL_START;
				299	else
				300	return META_REDIRECT_TYPE_URL;
				301	}
				302
				303
				304	/* Resets the entityfilter to it's initial state so it can be reused.
				305	*/
				306	void entityfilter_reset(entityfilter_ctx *ctx)
				307	{
				308	ctx->buffer[0] = 0;
				309	ctx->buffer_pos = 0;
				310	ctx->in_entity = 0;
				311	}
				312
				313	/* Initializes a new entity filter object.
				314	*/
				315	entityfilter_ctx *entityfilter_new()
				316	{
				317	entityfilter_ctx *ctx;
				318	ctx = CAST(entityfilter_ctx *,
				319	malloc(sizeof(entityfilter_ctx)));
				320
				321	if (ctx == NULL)
				322	return NULL;
				323	ctx->buffer[0] = 0;
				324	ctx->buffer_pos = 0;
				325	ctx->in_entity = 0;
				326
				327	return ctx;
				328	}
				329
				330	/* Copies the context of the entityfilter pointed to by src to the entityfilter
				331	* dst.
				332	*/
				333	void entityfilter_copy(entityfilter_ctx dst, entityfilter_ctx src)
				334	{
				335	assert(src != NULL);
				336	assert(dst != NULL);
				337	assert(src != dst);
				338	memcpy(dst, src, sizeof(entityfilter_ctx));
				339	}
				340
				341
				342	/* Deallocates an entity filter object.
				343	*/
				344	void entityfilter_delete(entityfilter_ctx *ctx)
				345	{
				346	free(ctx);
				347	}
				348
				349	/* Converts a string containing an hexadecimal number to a string containing
				350	* one character with the corresponding ascii value.
				351	*
				352	* The provided output char array must be at least 2 chars long.
				353	*/
				354	static const char parse_hex(const char s, char *output)
				355	{
				356	int n;
				357	n = strtol(s, NULL, 16);
				358	output[0] = n;
				359	output[1] = 0;
				360	/* TODO(falmeida): Make this function return void */
				361	return output;
				362	}
				363
				364	/* Converts a string containing a decimal number to a string containing one
				365	* character with the corresponding ascii value.
				366	*
				367	* The provided output char array must be at least 2 chars long.
				368	*/
				369	static const char parse_dec(const char s, char *output)
				370	{
				371	int n;
				372	n = strtol(s, NULL, 10);
				373	output[0] = n;
				374	output[1] = 0;
				375	return output;
				376	}
				377
				378	/* Converts a string with an html entity to it's encoded form, which is written
				379	* to the output string.
				380	*/
				381	static const char entity_convert(const char s, char *output, char terminator)
				382	{
				383	/* TODO(falmeida): Handle wide char encodings */
				384	struct entityfilter_table_s *t = entityfilter_table;
				385
				386	if (s[0] == '#') {
				387	if (s[1] == 'x' \|\| s[1] == 'X') { /* hex */
				388	return parse_hex(s + 2, output);
				389	} else { /* decimal */
				390	return parse_dec(s + 1, output);
				391	}
				392	}
				393
				394	while (t->entity != NULL) {
				395	if (strcasecmp(t->entity, s) == 0)
				396	return t->value;
				397	t++;
				398	}
				399
				400	snprintf(output, HTMLPARSER_MAX_ENTITY_SIZE, "&%s%c", s, terminator);
				401	output[HTMLPARSER_MAX_ENTITY_SIZE - 1] = '\0';
				402
				403	return output;
				404	}
				405
				406
				407	/* Processes a character from the input stream and decodes any html entities
				408	* in the processed input stream.
				409	*/
				410	const char entityfilter_process(entityfilter_ctx ctx, char c)
				411	{
				412	if (ctx->in_entity) {
				413	if (c == ';' \|\| html_isspace(c)) {
				414	ctx->in_entity = 0;
				415	ctx->buffer[ctx->buffer_pos] = '\0';
				416	ctx->buffer_pos = 0;
				417	return entity_convert(ctx->buffer, ctx->output, c);
				418	} else {
				419	ctx->buffer[ctx->buffer_pos++] = c;
				420	if (ctx->buffer_pos >= HTMLPARSER_MAX_ENTITY_SIZE - 2) {
				421	/* No more buffer to use, finalize and return.
				422	* We need two characters left, one for the '&' character and
				423	* another for the NULL termination. */
				424	ctx->buffer[ctx->buffer_pos] = '\0';
				425	ctx->in_entity=0;
				426	ctx->buffer_pos = 0;
				427	snprintf(ctx->output, HTMLPARSER_MAX_ENTITY_SIZE, "&%s",
				428	ctx->buffer);
				429	ctx->output[HTMLPARSER_MAX_ENTITY_SIZE - 1] = '\0';
				430	return ctx->output;
				431	}
				432	}
				433	} else {
				434	if (c == '&') {
				435	ctx->in_entity = 1;
				436	ctx->buffer_pos = 0;
				437	} else {
				438	ctx->output[0] = c;
				439	ctx->output[1] = 0;
				440	return ctx->output;
				441	}
				442	}
				443	return "";
				444	}
				445
				446	/* Called when the parser enters a new tag. Starts recording it's name into
				447	* html->tag.
				448	*/
				449	static void enter_tag_name(statemachine_ctx *ctx, int start, char chr, int end)
				450	{
				451	htmlparser_ctx html = CAST(htmlparser_ctx , ctx->user);
				452	assert(html != NULL);
				453
				454	html->tag[0] = '\0';
				455	statemachine_start_record(ctx);
				456	}
				457
				458	/* Called when the parser exits the tag name in order to finalize the recording.
				459	*
				460	* It converts the tag name to lowercase, and if the tag was closed, just
				461	* clears html->tag.
				462	*/
				463	static void exit_tag_name(statemachine_ctx *ctx, int start, char chr, int end)
				464	{
				465	htmlparser_ctx html = CAST(htmlparser_ctx , ctx->user);
				466	assert(html != NULL);
				467
				468	nopad_strncpy(html->tag, statemachine_stop_record(ctx),
				469	HTMLPARSER_MAX_STRING, statemachine_record_length(ctx));
				470
				471	tolower_str(html->tag);
				472
				473	if (html->tag[0] == '/')
				474	html->tag[0] = '\0';
				475	}
				476
				477	/* Called when the parser enters a new tag. Starts recording it's name into
				478	* html->attr
				479	*/
				480	static void enter_attr(statemachine_ctx *ctx, int start, char chr, int end)
				481	{
				482	htmlparser_ctx html = CAST(htmlparser_ctx , ctx->user);
				483	assert(html != NULL);
				484
				485	html->attr[0] = '\0';
				486	statemachine_start_record(ctx);
				487	}
				488
				489	/* Called when the parser exits the attribute name in order to finalize the
				490	* recording.
				491	*
				492	* It converts the tag name to lowercase.
				493	*/
				494	static void exit_attr(statemachine_ctx *ctx, int start, char chr, int end)
				495	{
				496	htmlparser_ctx html = CAST(htmlparser_ctx , ctx->user);
				497	assert(html != NULL);
				498
				499	nopad_strncpy(html->attr, statemachine_stop_record(ctx),
				500	HTMLPARSER_MAX_STRING, statemachine_record_length(ctx));
				501
				502	tolower_str(html->attr);
				503	}
				504
				505	/* Called when we enter an attribute value.
				506	*
				507	* Keeps track of a position index inside the value and initializes the
				508	* javascript state machine for attributes that accept javascript.
				509	*/
				510	static void enter_value(statemachine_ctx *ctx, int start, char chr, int end)
				511	{
				512	htmlparser_ctx html = CAST(htmlparser_ctx , ctx->user);
				513	assert(html != NULL);
				514
				515	html->value_index = 0;
				516
				517	if (is_js_attribute(html->attr)) {
				518	entityfilter_reset(html->entityfilter);
				519	jsparser_reset(html->jsparser);
				520	html->in_js = 1;
				521	} else {
				522	html->in_js = 0;
				523	}
				524	}
				525
				526	/* Called when we enter the contents of an attribute value.
				527	*
				528	* Initializes the recording of the contents of the value.
				529	*/
				530	static void enter_value_content(statemachine_ctx *ctx, int start, char chr,
				531	int end)
				532	{
				533	htmlparser_ctx html = CAST(htmlparser_ctx , ctx->user);
				534	assert(html != NULL);
				535
				536	html->value[0] = '\0';
				537	statemachine_start_record(ctx);
				538	}
				539
				540	/* Called when we exit the contents of an attribute value.
				541	*
				542	* Finalizes the recording of the contents of the value.
				543	*/
				544	static void exit_value_content(statemachine_ctx *ctx, int start, char chr,
				545	int end)
				546	{
				547	htmlparser_ctx html = CAST(htmlparser_ctx , ctx->user);
				548	assert(html != NULL);
				549
				550	nopad_strncpy(html->value, statemachine_stop_record(ctx),
				551	HTMLPARSER_MAX_STRING, statemachine_record_length(ctx));
				552
				553	html->in_js = 0;
				554	}
				555
				556	/* Called for every character inside an attribute value.
				557	*
				558	* Used to process javascript and keep track of the position index inside the
				559	* attribute value.
				560	*/
				561	static void in_state_value(statemachine_ctx *ctx, int start, char chr, int end)
				562	{
				563	htmlparser_ctx html = CAST(htmlparser_ctx , ctx->user);
				564	assert(html != NULL);
				565
				566	html->value_index++;
				567
				568	if (html->in_js == 1) {
				569	const char *output;
				570	output = entityfilter_process(html->entityfilter, chr);
				571	jsparser_parse_str(html->jsparser, output);
				572	}
				573	}
				574
				575	/* Called everytime the parser leaves a tag definition.
				576	*
				577	* When we encounter a script tag, we initialize the js parser and switch the
				578	* state to cdata. We also switch to the cdata state when we encounter any
				579	* other CDATA/RCDATA tag (style, title or textarea) except that we do not
				580	* initialize the js parser.
				581	*
				582	* To simplify the code, we treat RCDATA and CDATA sections the same since the
				583	* differences between them don't affect the context we are in.
				584	*/
				585	static void tag_close(statemachine_ctx *ctx, int start, char chr, int end)
				586	{
				587	htmlparser_ctx html = CAST(htmlparser_ctx , ctx->user);
				588	assert(html != NULL);
				589
				590	if (strcmp(html->tag, "script") == 0) {
				591	ctx->next_state = HTMLPARSER_STATE_INT_CDATA_TEXT;
				592	jsparser_reset(html->jsparser);
				593	html->in_js = 1;
				594	} else if (strcmp(html->tag, "style") == 0 \|\|
				595	strcmp(html->tag, "title") == 0 \|\|
				596	strcmp(html->tag, "textarea") == 0) {
				597	ctx->next_state = HTMLPARSER_STATE_INT_CDATA_TEXT;
				598	html->in_js = 0;
				599	}
				600	}
				601
				602	/* Called inside cdata blocks in order to parse the javascript.
				603	*
				604	* Calls the javascript parser if currently in a script tag.
				605	*/
				606	static void in_state_cdata(statemachine_ctx *ctx, int start, char chr, int end)
				607	{
				608	htmlparser_ctx html = CAST(htmlparser_ctx , ctx->user);
				609	assert(html != NULL);
				610
				611	if (html->in_js)
				612	jsparser_parse_chr(html->jsparser, chr);
				613	}
				614
				615	/* Called if we encounter a '<' character in a cdata section.
				616	*
				617	* When encountering a '<' character inside cdata, we need to find the closing
				618	* tag name in order to know if the tag is going to be closed or not, so we
				619	* start recording the name of what could be the closing tag.
				620	*/
				621	static void enter_state_cdata_may_close(statemachine_ctx *ctx, int start,
				622	char chr, int end)
				623	{
				624	statemachine_start_record(ctx);
				625	}
				626
				627	/* Called when we finish reading what could be a closing cdata tag.
				628	*
				629	* Checks if the closing tag name matches the current entity, and if so closes
				630	* the element.
				631	*/
				632	static void exit_state_cdata_may_close(statemachine_ctx *ctx, int start,
				633	char chr, int end)
				634	{
				635	htmlparser_ctx html = CAST(htmlparser_ctx , ctx->user);
				636	const char *cdata_close_tag;
				637	assert(html != NULL);
				638
				639	cdata_close_tag = statemachine_stop_record(ctx);
				640	assert(cdata_close_tag[0] == '/');
				641
				642	if (strcasecmp(&cdata_close_tag[1], html->tag) == 0 &&
				643	(chr == '>' \|\| html_isspace(chr))) { /* Make sure we have a delimiter */
				644	html->tag[0] = '\0'; /* Empty tag mimicking exit_tag_name(). */
				645	html->in_js = 0; /* In case this was a script tag. */
				646	} else {
				647	/* Does not close the CDATA section. Go back to CDATA. */
				648	ctx->next_state = HTMLPARSER_STATE_INT_CDATA_TEXT;
				649	}
				650	}
				651
				652	/* Resets the parser to it's initial state and changes the parser mode.
				653	*/
				654	void htmlparser_reset_mode(htmlparser_ctx *ctx, int mode)
				655	{
				656	assert(ctx != NULL);
				657	statemachine_reset(ctx->statemachine);
				658	ctx->in_js = 0;
				659	ctx->tag[0] = '\0';
				660	ctx->attr[0] = '\0';
				661	ctx->value[0] = '\0';
				662
				663	jsparser_reset(ctx->jsparser);
				664
				665	switch (mode) {
				666	case HTMLPARSER_MODE_HTML:
				667	ctx->statemachine->current_state = HTMLPARSER_STATE_INT_TEXT;
				668	break;
				669	case HTMLPARSER_MODE_JS:
				670	ctx->statemachine->current_state = HTMLPARSER_STATE_INT_JS_FILE;
				671	ctx->in_js = 1;
				672	break;
				673	case HTMLPARSER_MODE_CSS:
				674	ctx->statemachine->current_state = HTMLPARSER_STATE_INT_CSS_FILE;
				675	break;
				676	case HTMLPARSER_MODE_HTML_IN_TAG:
				677	ctx->statemachine->current_state = HTMLPARSER_STATE_INT_TAG_SPACE;
				678	break;
				679	default:
				680	assert("Invalid mode in htmlparser_reset_mode()." && 0);
				681	}
				682	}
				683
				684	/* Resets the parser to it's initial state and to the default mode, which
				685	* is MODE_HTML.
				686	*/
				687	void htmlparser_reset(htmlparser_ctx *ctx)
				688	{
				689	assert(ctx != NULL);
				690	htmlparser_reset_mode(ctx, HTMLPARSER_MODE_HTML);
				691	}
				692
				693	/* Creates a new state machine definition and initializes the events for the
				694	* state transitions.
				695	*
				696	* Although each instance of the parser has it's own private instance of a
				697	* statemachine definition, they are still identical across html parser objects
				698	* and are never modified after creation. As such, changes to this definition
				699	* should not occur outside this function and should not depend on properties
				700	* of this particular parser instance as in the future we may opt to use a
				701	* single public definition across parser objects.
				702	*/
				703	static statemachine_definition *create_statemachine_definition()
				704	{
				705	statemachine_definition *def;
				706	def = statemachine_definition_new(HTMLPARSER_NUM_STATES);
				707	if (def == NULL)
				708	return NULL;
				709
				710	statemachine_definition_populate(def, htmlparser_state_transitions,
				711	htmlparser_states_internal_names);
				712
				713	statemachine_enter_state(def, HTMLPARSER_STATE_INT_TAG_NAME,
				714	enter_tag_name);
				715	statemachine_exit_state(def, HTMLPARSER_STATE_INT_TAG_NAME, exit_tag_name);
				716
				717	statemachine_enter_state(def, HTMLPARSER_STATE_INT_ATTR, enter_attr);
				718	statemachine_exit_state(def, HTMLPARSER_STATE_INT_ATTR, exit_attr);
				719
				720	statemachine_enter_state(def, HTMLPARSER_STATE_INT_TAG_CLOSE, tag_close);
				721
				722	/* CDATA states. We must list all cdata and javascript states here. */
				723	/* TODO(falmeida): Declare this list in htmlparser_fsm.config so it doesn't
				724	* go out of sync.
				725	*/
				726	statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_TEXT, in_state_cdata);
				727	statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_START,
				728	in_state_cdata);
				729	statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_START_DASH,
				730	in_state_cdata);
				731	statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_BODY,
				732	in_state_cdata);
				733	statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH,
				734	in_state_cdata);
				735	statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH_DASH,
				736	in_state_cdata);
				737	statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_LT, in_state_cdata);
				738	statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE,
				739	in_state_cdata);
				740
				741	/* For simplification, we treat the javascript mode as if it were cdata. */
				742	statemachine_in_state(def, HTMLPARSER_STATE_INT_JS_FILE, in_state_cdata);
				743
				744	statemachine_enter_state(def, HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE,
				745	enter_state_cdata_may_close);
				746	statemachine_exit_state(def, HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE,
				747	exit_state_cdata_may_close);
				748	/* value states */
				749	statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE, enter_value);
				750
				751	/* Called when we enter the content of the value */
				752	statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE_TEXT,
				753	enter_value_content);
				754	statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE_Q,
				755	enter_value_content);
				756	statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE_DQ,
				757	enter_value_content);
				758
				759	/* Called when we exit the content of the value */
				760	statemachine_exit_state(def, HTMLPARSER_STATE_INT_VALUE_TEXT,
				761	exit_value_content);
				762	statemachine_exit_state(def, HTMLPARSER_STATE_INT_VALUE_Q,
				763	exit_value_content);
				764	statemachine_exit_state(def, HTMLPARSER_STATE_INT_VALUE_DQ,
				765	exit_value_content);
				766
				767	statemachine_in_state(def, HTMLPARSER_STATE_INT_VALUE_TEXT, in_state_value);
				768	statemachine_in_state(def, HTMLPARSER_STATE_INT_VALUE_Q, in_state_value);
				769	statemachine_in_state(def, HTMLPARSER_STATE_INT_VALUE_DQ, in_state_value);
				770
				771	return def;
				772	}
				773
				774
				775	/* Initializes a new htmlparser instance.
				776	*
				777	* Returns a pointer to the new instance or NULL if the initialization fails.
				778	* Initialization failure is fatal, and if this function fails it may not
				779	* deallocate all previsouly allocated memory.
				780	*/
				781	htmlparser_ctx *htmlparser_new()
				782	{
				783	htmlparser_ctx *html;
				784
				785	html = CAST(htmlparser_ctx *, calloc(1, sizeof(htmlparser_ctx)));
				786	if (html == NULL)
				787	return NULL;
				788
				789	html->statemachine_def = create_statemachine_definition();
				790	if (html->statemachine_def == NULL)
				791	return NULL;
				792
				793	html->statemachine = statemachine_new(html->statemachine_def, html);
				794	if (html->statemachine == NULL)
				795	return NULL;
				796
				797	html->jsparser = jsparser_new();
				798	if (html->jsparser == NULL)
				799	return NULL;
				800
				801	html->entityfilter = entityfilter_new();
				802	if (html->entityfilter == NULL)
				803	return NULL;
				804
				805	htmlparser_reset(html);
				806
				807	return html;
				808	}
				809
				810	/* Copies the context of the htmlparser pointed to by src to the htmlparser dst.
				811	*/
				812	void htmlparser_copy(htmlparser_ctx dst, const htmlparser_ctx src)
				813	{
				814	dst->value_index = src->value_index;
				815	dst->in_js = src->in_js;
				816	strcpy(dst->tag, src->tag);
				817	strcpy(dst->attr, src->attr);
				818	strcpy(dst->value, src->value);
				819
				820	statemachine_copy(dst->statemachine,
				821	src->statemachine,
				822	dst->statemachine_def,
				823	dst);
				824
				825	jsparser_copy(dst->jsparser, src->jsparser);
				826
				827	entityfilter_copy(dst->entityfilter, src->entityfilter);
				828
				829	}
				830
				831	/* Receives an htmlparser context and Returns the current html state.
				832	*/
				833	int htmlparser_state(htmlparser_ctx *ctx)
				834	{
				835	return state_external(ctx->statemachine->current_state);
				836	}
				837
				838	/* Parses the input html stream and returns the finishing state.
				839	*/
				840	int htmlparser_parse(htmlparser_ctx ctx, const char str, int size)
				841	{
				842	int internal_state;
				843	internal_state = statemachine_parse(ctx->statemachine, str, size);
				844	return state_external(internal_state);
				845	}
				846
				847
				848	/* Returns true if the parser is inside an attribute value and the value is
				849	* surrounded by single or double quotes. */
				850	int htmlparser_is_attr_quoted(htmlparser_ctx *ctx) {
				851	int st = statemachine_get_state(ctx->statemachine);
				852	if (st == HTMLPARSER_STATE_INT_VALUE_Q_START \|\|
				853	st == HTMLPARSER_STATE_INT_VALUE_Q \|\|
				854	st == HTMLPARSER_STATE_INT_VALUE_DQ_START \|\|
				855	st == HTMLPARSER_STATE_INT_VALUE_DQ)
				856	return 1;
				857	else
				858	return 0;
				859	}
				860
				861	/* Returns true if the parser is currently in javascript.
				862	*/
				863	int htmlparser_in_js(htmlparser_ctx *ctx) {
				864	int st = statemachine_get_state(ctx->statemachine);
				865
				866	/* CDATA states plus JS_FILE. We must list all cdata and javascript states
				867	* here. */
				868	/* TODO(falmeida): Declare this list in htmlparser_fsm.config so it doesn't go
				869	* out of sync. */
				870	if (ctx->in_js &&
				871	(st == HTMLPARSER_STATE_INT_CDATA_TEXT \|\|
				872	st == HTMLPARSER_STATE_INT_CDATA_COMMENT_START \|\|
				873	st == HTMLPARSER_STATE_INT_CDATA_COMMENT_START_DASH \|\|
				874	st == HTMLPARSER_STATE_INT_CDATA_COMMENT_BODY \|\|
				875	st == HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH \|\|
				876	st == HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH_DASH \|\|
				877	st == HTMLPARSER_STATE_INT_CDATA_LT \|\|
				878	st == HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE \|\|
				879	st == HTMLPARSER_STATE_INT_JS_FILE))
				880	return 1;
				881
				882	if (state_external(st) == HTMLPARSER_STATE_VALUE && ctx->in_js)
				883	return 1;
				884	else
				885	return 0;
				886	}
				887
				888	/* Returns the current tag or NULL if not available or we haven't seen the
				889	* entire tag yet.
				890	*/
				891	const char htmlparser_tag(htmlparser_ctx ctx)
				892	{
				893	if (ctx->tag[0] != '\0')
				894	return ctx->tag;
				895	else
				896	return NULL;
				897	}
				898
				899	/* Returns true if inside an attribute or a value */
				900	int htmlparser_in_attr(htmlparser_ctx *ctx)
				901	{
				902	int ext_state = state_external(statemachine_get_state(ctx->statemachine));
				903	return ext_state == HTMLPARSER_STATE_ATTR \|\|
				904	ext_state == HTMLPARSER_STATE_VALUE;
				905	}
				906
				907	/* Returns the current attribute name if after an attribute name or in an
				908	* attribute value. Returns NULL otherwise. */
				909	const char htmlparser_attr(htmlparser_ctx ctx)
				910	{
				911	if (htmlparser_in_attr(ctx))
				912	return ctx->attr;
				913	else
				914	return NULL;
				915	}
				916
				917	/* Returns true if the parser is currently inside a CSS construct.
				918	*/
				919	int htmlparser_in_css(htmlparser_ctx *ctx) {
				920	int state = statemachine_get_state(ctx->statemachine);
				921	const char *tag = htmlparser_tag(ctx);
				922	int external_state = state_external(state);
				923
				924	if (state == HTMLPARSER_STATE_INT_CSS_FILE \|\|
				925	(external_state == HTMLPARSER_STATE_VALUE &&
				926	htmlparser_attr_type(ctx) == HTMLPARSER_ATTR_STYLE) \|\|
				927	(tag && strcmp(tag, "style") == 0)) {
				928	return 1;
				929	} else {
				930	return 0;
				931	}
				932	}
				933
				934	/* Returns the contents of the current attribute value.
				935	*/
				936	const char htmlparser_value(htmlparser_ctx ctx)
				937	{
				938	int ext_state = state_external(statemachine_get_state(ctx->statemachine));
				939	if (ext_state == HTMLPARSER_STATE_VALUE) {
				940	strncpy(ctx->value, statemachine_record_buffer(ctx->statemachine),
				941	HTMLPARSER_MAX_STRING);
				942	ctx->value[HTMLPARSER_MAX_STRING - 1] = '\0';
				943	return ctx->value;
				944	} else {
				945	return NULL;
				946	}
				947	}
				948
				949
				950	/* Returns the current state of the javascript state machine
				951	*
				952	* Currently only present for testing purposes.
				953	*/
				954	int htmlparser_js_state(htmlparser_ctx *ctx)
				955	{
				956	return jsparser_state(ctx->jsparser);
				957	}
				958
				959	/* True is currently inside a javascript string literal
				960	*/
				961	int htmlparser_is_js_quoted(htmlparser_ctx *ctx)
				962	{
				963	if (htmlparser_in_js(ctx)) {
				964	int st = jsparser_state(ctx->jsparser);
				965	if (st == JSPARSER_STATE_Q \|\|
				966	st == JSPARSER_STATE_DQ)
				967	return 1;
				968	}
				969	return 0;
				970	}
				971
				972	/* True if currently inside an attribute value
				973	*/
				974	int htmlparser_in_value(htmlparser_ctx *ctx)
				975	{
				976	int ext_state = state_external(statemachine_get_state(ctx->statemachine));
				977	return ext_state == HTMLPARSER_STATE_VALUE;
				978	}
				979
				980	/* Returns the position inside the current attribute value
				981	*/
				982	int htmlparser_value_index(htmlparser_ctx *ctx)
				983	{
				984	if (htmlparser_in_value(ctx))
				985	return ctx->value_index;
				986
				987	return -1;
				988	}
				989
				990	/* Returns true if this is the first character of a url inside an attribute.
				991	*/
				992	int htmlparser_is_url_start(htmlparser_ctx *ctx)
				993	{
				994	if (htmlparser_attr_type(ctx) == HTMLPARSER_ATTR_URI) {
				995	const char* tag = htmlparser_tag(ctx);
				996	/const char attr =*/ htmlparser_attr(ctx);
				997
				998	if ((tag && strcmp(tag, "meta") == 0 &&
				999	meta_redirect_type(htmlparser_value(ctx)) ==
				1000	META_REDIRECT_TYPE_URL_START) \|\|
				1001	htmlparser_value_index(ctx) == 0)
				1002	return 1;
				1003
				1004	}
				1005	return 0;
				1006	}
				1007
				1008	/* Returns the current attribute type.
				1009	*/
				1010	int htmlparser_attr_type(htmlparser_ctx *ctx)
				1011	{
				1012	if (!htmlparser_in_attr(ctx))
				1013	return HTMLPARSER_ATTR_NONE;
				1014
				1015	if (is_js_attribute(ctx->attr))
				1016	return HTMLPARSER_ATTR_JS;
				1017
				1018	if (is_uri_attribute(ctx->attr))
				1019	return HTMLPARSER_ATTR_URI;
				1020
				1021	if (is_style_attribute(ctx->attr))
				1022	return HTMLPARSER_ATTR_STYLE;
				1023
				1024	const char* tag = htmlparser_tag(ctx);
				1025	const char* attr = htmlparser_attr(ctx);
				1026
				1027	/* Special logic to handle meta redirect type tags. */
				1028	if (tag && strcmp(tag, "meta") == 0 &&
				1029	attr && strcmp(attr, "content") == 0) {
				1030
				1031	const char* value = htmlparser_value(ctx);
				1032	meta_redirect_type_enum redirect_type = meta_redirect_type(value);
				1033
				1034	if (redirect_type == META_REDIRECT_TYPE_URL \|\|
				1035	redirect_type == META_REDIRECT_TYPE_URL_START)
				1036	return HTMLPARSER_ATTR_URI;
				1037	}
				1038
				1039	return HTMLPARSER_ATTR_REGULAR;
				1040	}
				1041
				1042	/* Return the current line number. */
				1043	int htmlparser_get_line_number(htmlparser_ctx *ctx) {
				1044	return statemachine_get_line_number(ctx->statemachine);
				1045	}
				1046
				1047	/* Set the current line number. */
				1048	void htmlparser_set_line_number(htmlparser_ctx *ctx, int line) {
				1049	statemachine_set_line_number(ctx->statemachine, line);
				1050	}
				1051
				1052	/* Return the current column number. */
				1053	int htmlparser_get_column_number(htmlparser_ctx *ctx) {
				1054	return statemachine_get_column_number(ctx->statemachine);
				1055	}
				1056
				1057	/* Set the current column number. */
				1058	void htmlparser_set_column_number(htmlparser_ctx *ctx, int column) {
				1059	statemachine_set_column_number(ctx->statemachine, column);
				1060	}
				1061
				1062	/* Retrieve a human readable error message in case an error occurred.
				1063	*
				1064	* NULL is returned if the parser didn't encounter an error.
				1065	*/
				1066	const char htmlparser_get_error_msg(htmlparser_ctx ctx) {
				1067	return statemachine_get_error_msg(ctx->statemachine);
				1068	}
				1069
				1070	/* Invoked by the caller when text is expanded by the caller.
				1071	*/
				1072	int htmlparser_insert_text(htmlparser_ctx *ctx)
				1073	{
				1074	/* TODO(falmeida): Generalize and use a table for these values. */
				1075
				1076	if (statemachine_get_state(ctx->statemachine) == HTMLPARSER_STATE_INT_VALUE) {
				1077	statemachine_set_state(ctx->statemachine, HTMLPARSER_STATE_INT_VALUE_TEXT);
				1078	}
				1079	return 1;
				1080	}
				1081
				1082	/* Deallocates an htmlparser context object.
				1083	*/
				1084	void htmlparser_delete(htmlparser_ctx *ctx)
				1085	{
				1086	assert(ctx != NULL);
				1087	statemachine_definition_delete(ctx->statemachine_def);
				1088	statemachine_delete(ctx->statemachine);
				1089	jsparser_delete(ctx->jsparser);
				1090	entityfilter_delete(ctx->entityfilter);
				1091	free(ctx);
				1092	}
				1093
				1094	#ifdef __cplusplus
				1095	} /* namespace security_streamhtmlparser */
				1096	#endif