src/htmlparser/htmlparser_fsm.config - RealtimeRoboticsGroup/test - Gitiles

 # Copyright (c) 2008, Google Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are
 # met:
 #
 #     * Redistributions of source code must retain the above copyright
 # notice, this list of conditions and the following disclaimer.
 #     * Redistributions in binary form must reproduce the above
 # copyright notice, this list of conditions and the following disclaimer
 # in the documentation and/or other materials provided with the
 # distribution.
 #     * Neither the name of Google Inc. nor the names of its
 # contributors may be used to endorse or promote products derived from
 # this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # ---
 #
 # Author: falmeida@google.com (Filipe Almeida)

 # TODO(falmeida): Add more descriptive names to the states and drop the
 # abbreviations.
 # TODO(falmeida): Reorder the states so that it's easier to read.
 # TODO(falmeida): Support CDATA blocks in the form: <![CDATA[.

 # Name of the state machine defined by this file.
 name = 'htmlparser'

 # Human-readable description of the state machine. The states below describe
 # HTML markup (and the whitespace condition cites the HTML 4.01 spec), so the
 # description names HTML 4.01 rather than the non-existent "HTTP 4.1".
 comment = 'Definition of a finite state machine for a subset of HTML 4.01'

 # Named input conditions. Each call pairs a condition name (first argument)
 # with the characters it stands for (second argument); the transition tables
 # of the states below refer to conditions by these names.
 condition('<', '<')
 condition('>', '>')
 condition('=', '=')

 # TODO(falmeida): This is not the correct expression. Tag and attribute names
 # can only consist of alphabetic characters.
 # 'idtag' is 'id' plus the '/' character.
 condition('id', 'A-Za-z0-9_:.-')
 condition('idtag', 'A-Za-z0-9/_:.-')

 # Whitespace according to: http://www.w3.org/TR/html401/struct/text.html#h-9.1
 condition('space', ' \t\n\r')
 condition('!', '!')
 # 'q' is the single quote and 'dq' the double quote.
 condition('q', '\'')
 condition('dq', '\"')
 condition('/', '/')
 condition('*', '*')
 condition('-', '-')
 condition('?', '?')
 # NOTE(review): '*', 'lf' and 'quote' are not referenced by any state visible
 # in this file -- presumably kept for other users of these definitions; confirm.
 condition('lf', '\n')
 # Despite its name, 'quote' stands for the backslash character.
 condition('quote', '\\')

 # TODO(falmeida): This default rule is a hack and shouldn't be here.
 condition('default', '[:default:]')

 # Ordinary text outside of any markup. Only '<' leaves this state.
 state(
     name='text',
     external='text',
     transitions=[
         ['<', 'tag_start'],
         ['default', 'text'],
     ],
 )

 # Just read a '<' while in text.
 # The HTML5 draft describes tag opening here:
 # http://www.whatwg.org/specs/web-apps/current-work/#tag-open-state
 # We intentionally do not follow it to the letter; we are much more lenient
 # so that we mimic the way the major browsers behave.
 state(
     name='tag_start',
     external='tag',
     transitions=[
         ['idtag', 'tag_name'],
         ['?', 'pi'],
         ['!', 'declaration_start'],
         ['<', 'tag_start'],
         ['default', 'text'],
     ],
 )

 # Reading the tag name. The '/' of a closing tag counts as part of the name.
 state(
     name='tag_name',
     external='tag',
     transitions=[
         ['idtag', 'tag_name'],
         ['space', 'tag_space'],
         ['>', 'tag_close'],
     ],
 )

 # HTML declaration and comment parsing
 #
 # The declaration states are not exposed externally (they report 'text'): for
 # now we only need to parse declarations correctly so that we don't get out
 # of sync. This is written specifically for DOCTYPE declarations and won't
 # work if DTDs are defined inside the declaration.
 # The HTML5 spec says we should specifically look for the string
 # '<!DOCTYPE HTML', but that would add a lot of unnecessary states, and unless
 # we build a simple declarative way to unfold a string match into multiple
 # states, it isn't worth worrying about for now.

 # Seen '<!'. The next character decides whether this opens a declaration or
 # a comment.
 state(
     name='declaration_start',
     external='text',
     transitions=[
         ['-', 'comment_open'],
         ['>', 'text'],
         ['default', 'declaration_body'],
     ],
 )

 # Inside a declaration, e.g. <!DOCTYPE. A '>' closes it.
 state(
     name='declaration_body',
     external='text',
     transitions=[
         ['>', 'text'],
         ['default', 'declaration_body'],
     ],
 )

 # Seen '<!-'.
 state(
     name='comment_open',
     external='text',
     transitions=[
         ['-', 'comment_body'],
         ['default', 'text'],
     ],
 )

 # Inside a comment. Only '-->' closes it.
 state(
     name='comment_body',
     external='comment',
     transitions=[
         ['-', 'comment_dash'],
         ['default', 'comment_body'],
     ],
 )

 # Seen a single '-' inside a comment.
 state(
     name='comment_dash',
     external='comment',
     transitions=[
         ['-', 'comment_dash_dash'],
         ['default', 'comment_body'],
     ],
 )

 # Seen '--' inside a comment. Further dashes keep us here; '>' ends the
 # comment.
 state(
     name='comment_dash_dash',
     external='comment',
     transitions=[
         ['-', 'comment_dash_dash'],
         ['>', 'text'],
         ['default', 'comment_body'],
     ],
 )

 # XML Processing instruction parsing according to:
 # http://www.w3.org/TR/REC-xml/#sec-pi
 #
 # Everything between the characters <? and ?> is considered to be part of the
 # processing instruction.
 state(name = 'pi',
       external = 'text',
       transitions = [
         ['?', 'pi_may_end'],
         ['default', 'pi']
       ])

 # Got '?' inside a processing instruction; a following '>' ends it.
 # A further '?' keeps us in this state so that a run such as '??>' still
 # closes the instruction (the last '?' together with '>' forms the '?>'
 # terminator). Without that transition the second '?' would fall through to
 # 'default' and the closing '>' would be missed. This mirrors how
 # comment_dash_dash handles repeated '-' characters.
 state(name = 'pi_may_end',
       external = 'text',
       transitions = [
         ['?', 'pi_may_end'],
         ['>', 'text'],
         ['default', 'pi']
       ])

 # Whitespace separating the tag name and the attributes.
 state(
     name='tag_space',
     external='tag',
     transitions=[
         ['>', 'tag_close'],
         ['space', 'tag_space'],
         ['id', 'attr'],
         ['/', 'tag_space'],
     ],
 )

 # Entered on the '>' that ends a tag. Reported externally as text; a '<'
 # starts the next tag right away.
 state(
     name='tag_close',
     external='text',
     transitions=[
         ['<', 'tag_start'],
         ['default', 'text'],
     ],
 )

 # Reading an attribute name.
 state(
     name='attr',
     external='attr',
     transitions=[
         ['id', 'attr'],
         ['>', 'tag_close'],
         ['/', 'tag_space'],
         ['=', 'value'],
         ['space', 'attr_space'],
     ],
 )

 # Whitespace following an attribute name.
 state(
     name='attr_space',
     external='attr',
     transitions=[
         ['>', 'tag_close'],
         ['space', 'attr_space'],
         ['id', 'attr'],
         ['/', 'tag_space'],
         ['=', 'value'],
     ],
 )

 # Right after 'attribute=': a value is expected.
 state(
     name='value',
     external='value',
     transitions=[
         ['q', 'value_q_start'],
         ['dq', 'value_dq_start'],
         ['space', 'value'],
         ['>', 'tag_close'],
         ['default', 'value_text'],
     ],
 )

 # Attribute value written without quotes; whitespace or '>' ends it.
 state(
     name='value_text',
     external='value',
     transitions=[
         ['>', 'tag_close'],
         ['space', 'tag_space'],
         ['default', 'value_text'],
     ],
 )

 # At the first character of a single quoted attribute value.
 state(
     name='value_q_start',
     external='value',
     transitions=[
         ['q', 'tag_space'],
         ['default', 'value_q'],
     ],
 )

 # Somewhere past the first character of a single quoted attribute value.
 state(
     name='value_q',
     external='value',
     transitions=[
         ['q', 'tag_space'],
         ['default', 'value_q'],
     ],
 )

 # At the first character of a double quoted attribute value.
 state(
     name='value_dq_start',
     external='value',
     transitions=[
         ['dq', 'tag_space'],
         ['default', 'value_dq'],
     ],
 )

 # Somewhere past the first character of a double quoted attribute value.
 state(
     name='value_dq',
     external='value',
     transitions=[
         ['dq', 'tag_space'],
         ['default', 'value_dq'],
     ],
 )

 # CDATA escaping text spans ('<!-- ... -->' inside a CDATA section). While
 # inside such a span '<' is not examined, so the section cannot be closed
 # from there.
 # TODO(falmeida): These states should go after cdata_text.

 # Seen '<!'.
 state(
     name='cdata_comment_start',
     external='text',
     transitions=[
         ['-', 'cdata_comment_start_dash'],
         ['default', 'cdata_text'],
     ],
 )

 # Seen '<!-'.
 state(
     name='cdata_comment_start_dash',
     external='text',
     transitions=[
         ['-', 'cdata_comment_body'],
         ['default', 'cdata_text'],
     ],
 )

 # Within the comment.
 state(
     name='cdata_comment_body',
     external='text',
     transitions=[
         ['-', 'cdata_comment_dash'],
         ['default', 'cdata_comment_body'],
     ],
 )

 # Seen a single '-' within the comment.
 state(
     name='cdata_comment_dash',
     external='text',
     transitions=[
         ['-', 'cdata_comment_dash_dash'],
         ['default', 'cdata_comment_body'],
     ],
 )

 # Seen '--' within the comment; '>' returns to the CDATA text.
 state(
     name='cdata_comment_dash_dash',
     external='text',
     transitions=[
         ['-', 'cdata_comment_dash_dash'],
         ['>', 'cdata_text'],
         ['default', 'cdata_comment_body'],
     ],
 )

 # CDATA processing
 #
 # RCDATA and CDATA sections are treated alike to keep the code simple: the
 # differences between them don't affect the context we are in.
 state(
     name='cdata_text',
     external='text',
     transitions=[
         ['<', 'cdata_lt'],
         ['default', 'cdata_text'],
     ],
 )

 # Seen '<': this may be the start of the closing tag.
 state(
     name='cdata_lt',
     external='text',
     transitions=[
         ['/', 'cdata_may_close'],
         ['!', 'cdata_comment_start'],
         ['default', 'cdata_text'],
     ],
 )

 # On '</tag', where tag matches the last opened tag, we leave the CDATA
 # section. Part of this logic is handled in the code.
 state(
     name='cdata_may_close',
     external='text',
     transitions=[
         ['idtag', 'cdata_may_close'],
         ['>', 'text'],
         ['space', 'tag_space'],
         ['default', 'cdata_text'],
     ],
 )

 # The remaining states serve specialized parser modes. Each of them loops on
 # itself for every input character.
 state(
     name='js_file',
     external='js_file',
     transitions=[
         ['default', 'js_file'],
     ],
 )

 # TODO(falmeida): Having css_file and js_file as the external name doesn't make
 #                 sense. This should instead be text and the js/css state be
 #                 returned by in_js() and in_css().
 state(
     name='css_file',
     external='css_file',
     transitions=[
         ['default', 'css_file'],
     ],
 )

 # Reported externally as text; no input ever leaves this state.
 state(
     name='null',
     external='text',
     transitions=[
         ['default', 'null'],
     ],
 )
	# Copyright (c) 2008, Google Inc.
	# All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions are
	# met:
	#
	# * Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	# * Redistributions in binary form must reproduce the above
	# copyright notice, this list of conditions and the following disclaimer
	# in the documentation and/or other materials provided with the
	# distribution.
	# * Neither the name of Google Inc. nor the names of its
	# contributors may be used to endorse or promote products derived from
	# this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	# ---
	#
	# Author: falmeida@google.com (Filipe Almeida)

	# TODO(falmeida): Add more descriptive names to the states and drop the
	# abbreviations.
	# TODO(falmeida): Reorder the states so that it's easier to read.
	# TODO(falmeida): Support CDATA blocks in the form: <![CDATA[.

	# Name of the state machine defined by this file.
	name = 'htmlparser'

	# Human-readable description of the state machine. The states below describe
	# HTML markup (and the whitespace condition cites the HTML 4.01 spec), so the
	# description names HTML 4.01 rather than the non-existent "HTTP 4.1".
	comment = 'Definition of a finite state machine for a subset of HTML 4.01'

	# Named input conditions. Each call pairs a condition name (first argument)
	# with the characters it stands for (second argument); the transition tables
	# of the states below refer to conditions by these names.
	condition('<', '<')
	condition('>', '>')
	condition('=', '=')

	# TODO(falmeida): This is not the correct expression. Tag and attribute names
	# can only consist of alphabetic characters.
	# 'idtag' is 'id' plus the '/' character.
	condition('id', 'A-Za-z0-9_:.-')
	condition('idtag', 'A-Za-z0-9/_:.-')

	# Whitespace according to: http://www.w3.org/TR/html401/struct/text.html#h-9.1
	condition('space', ' \t\n\r')
	condition('!', '!')
	# 'q' is the single quote and 'dq' the double quote.
	condition('q', '\'')
	condition('dq', '\"')
	condition('/', '/')
	# The asterisk condition. Its name and character had been lost, leaving an
	# empty name paired with an empty character set; like the neighbouring
	# single-character conditions, it is named after the character it stands for.
	condition('*', '*')
	condition('-', '-')
	condition('?', '?')
	condition('lf', '\n')
	# Despite its name, 'quote' stands for the backslash character.
	condition('quote', '\\')

	# TODO(falmeida): This default rule is a hack and shouldn't be here.
	condition('default', '[:default:]')

	# Ordinary text outside of any markup. Only '<' leaves this state.
	state(
		name='text',
		external='text',
		transitions=[
			['<', 'tag_start'],
			['default', 'text'],
		],
	)

	# Just read a '<' while in text.
	# The HTML5 draft describes tag opening here:
	# http://www.whatwg.org/specs/web-apps/current-work/#tag-open-state
	# We intentionally do not follow it to the letter; we are much more lenient
	# so that we mimic the way the major browsers behave.
	state(
		name='tag_start',
		external='tag',
		transitions=[
			['idtag', 'tag_name'],
			['?', 'pi'],
			['!', 'declaration_start'],
			['<', 'tag_start'],
			['default', 'text'],
		],
	)

	# Reading the tag name. The '/' of a closing tag counts as part of the name.
	state(
		name='tag_name',
		external='tag',
		transitions=[
			['idtag', 'tag_name'],
			['space', 'tag_space'],
			['>', 'tag_close'],
		],
	)

	# HTML declaration and comment parsing
	#
	# The declaration states are not exposed externally (they report 'text'): for
	# now we only need to parse declarations correctly so that we don't get out
	# of sync. This is written specifically for DOCTYPE declarations and won't
	# work if DTDs are defined inside the declaration.
	# The HTML5 spec says we should specifically look for the string
	# '<!DOCTYPE HTML', but that would add a lot of unnecessary states, and unless
	# we build a simple declarative way to unfold a string match into multiple
	# states, it isn't worth worrying about for now.

	# Seen '<!'. The next character decides whether this opens a declaration or
	# a comment.
	state(
		name='declaration_start',
		external='text',
		transitions=[
			['-', 'comment_open'],
			['>', 'text'],
			['default', 'declaration_body'],
		],
	)

	# Inside a declaration, e.g. <!DOCTYPE. A '>' closes it.
	state(
		name='declaration_body',
		external='text',
		transitions=[
			['>', 'text'],
			['default', 'declaration_body'],
		],
	)

	# Seen '<!-'.
	state(
		name='comment_open',
		external='text',
		transitions=[
			['-', 'comment_body'],
			['default', 'text'],
		],
	)

	# Inside a comment. Only '-->' closes it.
	state(
		name='comment_body',
		external='comment',
		transitions=[
			['-', 'comment_dash'],
			['default', 'comment_body'],
		],
	)

	# Seen a single '-' inside a comment.
	state(
		name='comment_dash',
		external='comment',
		transitions=[
			['-', 'comment_dash_dash'],
			['default', 'comment_body'],
		],
	)

	# Seen '--' inside a comment. Further dashes keep us here; '>' ends the
	# comment.
	state(
		name='comment_dash_dash',
		external='comment',
		transitions=[
			['-', 'comment_dash_dash'],
			['>', 'text'],
			['default', 'comment_body'],
		],
	)

	# XML Processing instruction parsing according to:
	# http://www.w3.org/TR/REC-xml/#sec-pi
	#
	# Everything between the characters <? and ?> is considered to be part of the
	# processing instruction.
	state(name = 'pi',
		external = 'text',
		transitions = [
			['?', 'pi_may_end'],
			['default', 'pi']
		])

	# Got '?' inside a processing instruction; a following '>' ends it.
	# A further '?' keeps us in this state so that a run such as '??>' still
	# closes the instruction (the last '?' together with '>' forms the '?>'
	# terminator). Without that transition the second '?' would fall through to
	# 'default' and the closing '>' would be missed. This mirrors how
	# comment_dash_dash handles repeated '-' characters.
	state(name = 'pi_may_end',
		external = 'text',
		transitions = [
			['?', 'pi_may_end'],
			['>', 'text'],
			['default', 'pi']
		])

	# Whitespace separating the tag name and the attributes.
	state(
		name='tag_space',
		external='tag',
		transitions=[
			['>', 'tag_close'],
			['space', 'tag_space'],
			['id', 'attr'],
			['/', 'tag_space'],
		],
	)

	# Entered on the '>' that ends a tag. Reported externally as text; a '<'
	# starts the next tag right away.
	state(
		name='tag_close',
		external='text',
		transitions=[
			['<', 'tag_start'],
			['default', 'text'],
		],
	)

	# Reading an attribute name.
	state(
		name='attr',
		external='attr',
		transitions=[
			['id', 'attr'],
			['>', 'tag_close'],
			['/', 'tag_space'],
			['=', 'value'],
			['space', 'attr_space'],
		],
	)

	# Whitespace following an attribute name.
	state(
		name='attr_space',
		external='attr',
		transitions=[
			['>', 'tag_close'],
			['space', 'attr_space'],
			['id', 'attr'],
			['/', 'tag_space'],
			['=', 'value'],
		],
	)

	# Right after 'attribute=': a value is expected.
	state(
		name='value',
		external='value',
		transitions=[
			['q', 'value_q_start'],
			['dq', 'value_dq_start'],
			['space', 'value'],
			['>', 'tag_close'],
			['default', 'value_text'],
		],
	)

	# Attribute value written without quotes; whitespace or '>' ends it.
	state(
		name='value_text',
		external='value',
		transitions=[
			['>', 'tag_close'],
			['space', 'tag_space'],
			['default', 'value_text'],
		],
	)

	# At the first character of a single quoted attribute value.
	state(
		name='value_q_start',
		external='value',
		transitions=[
			['q', 'tag_space'],
			['default', 'value_q'],
		],
	)

	# Somewhere past the first character of a single quoted attribute value.
	state(
		name='value_q',
		external='value',
		transitions=[
			['q', 'tag_space'],
			['default', 'value_q'],
		],
	)

	# At the first character of a double quoted attribute value.
	state(
		name='value_dq_start',
		external='value',
		transitions=[
			['dq', 'tag_space'],
			['default', 'value_dq'],
		],
	)

	# Somewhere past the first character of a double quoted attribute value.
	state(
		name='value_dq',
		external='value',
		transitions=[
			['dq', 'tag_space'],
			['default', 'value_dq'],
		],
	)

	# CDATA escaping text spans ('<!-- ... -->' inside a CDATA section). While
	# inside such a span '<' is not examined, so the section cannot be closed
	# from there.
	# TODO(falmeida): These states should go after cdata_text.

	# Seen '<!'.
	state(
		name='cdata_comment_start',
		external='text',
		transitions=[
			['-', 'cdata_comment_start_dash'],
			['default', 'cdata_text'],
		],
	)

	# Seen '<!-'.
	state(
		name='cdata_comment_start_dash',
		external='text',
		transitions=[
			['-', 'cdata_comment_body'],
			['default', 'cdata_text'],
		],
	)

	# Within the comment.
	state(
		name='cdata_comment_body',
		external='text',
		transitions=[
			['-', 'cdata_comment_dash'],
			['default', 'cdata_comment_body'],
		],
	)

	# Seen a single '-' within the comment.
	state(
		name='cdata_comment_dash',
		external='text',
		transitions=[
			['-', 'cdata_comment_dash_dash'],
			['default', 'cdata_comment_body'],
		],
	)

	# Seen '--' within the comment; '>' returns to the CDATA text.
	state(
		name='cdata_comment_dash_dash',
		external='text',
		transitions=[
			['-', 'cdata_comment_dash_dash'],
			['>', 'cdata_text'],
			['default', 'cdata_comment_body'],
		],
	)

	# CDATA processing
	#
	# RCDATA and CDATA sections are treated alike to keep the code simple: the
	# differences between them don't affect the context we are in.
	state(
		name='cdata_text',
		external='text',
		transitions=[
			['<', 'cdata_lt'],
			['default', 'cdata_text'],
		],
	)

	# Seen '<': this may be the start of the closing tag.
	state(
		name='cdata_lt',
		external='text',
		transitions=[
			['/', 'cdata_may_close'],
			['!', 'cdata_comment_start'],
			['default', 'cdata_text'],
		],
	)

	# On '</tag', where tag matches the last opened tag, we leave the CDATA
	# section. Part of this logic is handled in the code.
	state(
		name='cdata_may_close',
		external='text',
		transitions=[
			['idtag', 'cdata_may_close'],
			['>', 'text'],
			['space', 'tag_space'],
			['default', 'cdata_text'],
		],
	)

	# The remaining states serve specialized parser modes. Each of them loops on
	# itself for every input character.
	state(
		name='js_file',
		external='js_file',
		transitions=[
			['default', 'js_file'],
		],
	)

	# TODO(falmeida): Having css_file and js_file as the external name doesn't make
	# sense. This should instead be text and the js/css state be
	# returned by in_js() and in_css().
	state(
		name='css_file',
		external='css_file',
		transitions=[
			['default', 'css_file'],
		],
	)

	# Reported externally as text; no input ever leaves this state.
	state(
		name='null',
		external='text',
		transitions=[
			['default', 'null'],
		],
	)