# blob: 8ca4b7b8412d9c2073cdc2430138ccf7bbd9f2c6 [file] [log] [blame]
# Copyright (c) 2008, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ---
#
# Author: falmeida@google.com (Filipe Almeida)
# TODO(falmeida): Add more descriptive names to the states and drop the
# abbreviations.
# TODO(falmeida): Reorder the states so that it's easier to read.
# TODO(falmeida): Support CDATA blocks in the form: <![CDATA[.
# Name of the state machine defined by this file.
name = 'htmlparser'
# Free-form description of the state machine. The markup language handled
# here is HTML (the whitespace condition in this file cites the W3C HTML 4.01
# specification); the previous text read "HTTP 4.1", which is not an HTML
# version.
comment = 'Definition of a finite state machine for a subset of HTML 4.01'
# Named conditions referenced by the first element of each state transition.
# Each call pairs a condition name with the expression it stands for.
# NOTE(review): several expressions overlap (for example 'id' contains '-',
# which is also declared on its own further down); whether declaration order
# matters depends on the consumer of this file -- confirm before reordering.
condition('<', '<')
condition('>', '>')
condition('=', '=')
# TODO(falmeida): This is not the correct expression. Tag and attribute names
# can only consist of alpha characters.
condition('id', 'A-Za-z0-9_:.-')
# Same expression as 'id' with '/' added.
condition('idtag', 'A-Za-z0-9/_:.-')
# Whitespace according to: http://www.w3.org/TR/html401/struct/text.html#h-9.1
condition('space', ' \t\n\r')
condition('!', '!')
# Single quote.
condition('q', '\'')
# Double quote.
condition('dq', '\"')
condition('/', '/')
condition('*', '*')
condition('-', '-')
condition('?', '?')
condition('lf', '\n')
# NOTE(review): despite its name, 'quote' is defined as a backslash rather
# than a quote character -- confirm the name is intended.
condition('quote', '\\')
# TODO(falmeida): This default rule is a hack and shouldn't be here.
condition('default', '[:default:]')
# Plain text: a '<' moves to tag_start, while the 'default' rule keeps the
# machine in this state.
state(
    name='text',
    external='text',
    transitions=[
        ['<', 'tag_start'],
        ['default', 'text'],
    ])
# Entered when a '<' character is found.
# The HTML5 draft defines tag opening here:
#   http://www.whatwg.org/specs/web-apps/current-work/#tag-open-state
# This definition deliberately does not follow the draft exactly; it is
# looser in order to mimic the way the major browsers behave.
state(
    name='tag_start',
    external='tag',
    transitions=[
        ['idtag', 'tag_name'],
        ['?', 'pi'],
        ['!', 'declaration_start'],
        ['<', 'tag_start'],
        ['default', 'text'],
    ])
# Inside the tag name. The 'idtag' condition contains '/', so the closing
# tag character is treated as part of the name.
state(
    name='tag_name',
    external='tag',
    transitions=[
        ['idtag', 'tag_name'],
        ['space', 'tag_space'],
        ['>', 'tag_close'],
    ])
# HTML declaration and comment parsing
#
# We don't expose declaration state because at this point we only want to
# ensure that we are parsing them correctly so we don't get out of sync.
# This is specifically made for DOCTYPE declarations and won't work if DTDs
# are defined inside the declaration.
# The HTML5 spec says we should specifically look for the string
# '<!DOCTYPE HTML' but that will add a lot of unnecessary states, and unless
# we build a simple declarative way to unfold a string match into multiple
# states, I don't think it's worth worrying about for now.
# Reached after '<!'. The character that follows decides what is opened:
# '-' heads towards a comment, '>' returns straight to text, and the
# 'default' rule enters the declaration body.
state(
    name='declaration_start',
    external='text',
    transitions=[
        ['-', 'comment_open'],
        ['>', 'text'],
        ['default', 'declaration_body'],
    ])
# Body of a declaration (for example <!DOCTYPE). A '>' closes it and
# returns to text.
state(
    name='declaration_body',
    external='text',
    transitions=[
        ['>', 'text'],
        ['default', 'declaration_body'],
    ])
# Reached after '<!-'. A second '-' enters the comment body; the 'default'
# rule goes back to text.
state(
    name='comment_open',
    external='text',
    transitions=[
        ['-', 'comment_body'],
        ['default', 'text'],
    ])
# Body of a comment, which is only closed by the sequence '-->'. A '-' may
# be the start of that sequence.
state(
    name='comment_body',
    external='comment',
    transitions=[
        ['-', 'comment_dash'],
        ['default', 'comment_body'],
    ])
# A single '-' has been seen inside a comment. Another '-' advances towards
# the closing sequence; the 'default' rule resumes the comment body.
state(
    name='comment_dash',
    external='comment',
    transitions=[
        ['-', 'comment_dash_dash'],
        ['default', 'comment_body'],
    ])
# '--' has been seen inside a comment. A '>' completes '-->' and returns to
# text, further '-' characters stay in this state, and the 'default' rule
# resumes the comment body.
state(
    name='comment_dash_dash',
    external='comment',
    transitions=[
        ['-', 'comment_dash_dash'],
        ['>', 'text'],
        ['default', 'comment_body'],
    ])
# XML processing instructions, parsed according to:
#   http://www.w3.org/TR/REC-xml/#sec-pi
#
# Everything between the characters <? and ?> is treated as part of the
# processing instruction. A '?' may be the start of the closing '?>'.
state(
    name='pi',
    external='text',
    transitions=[
        ['?', 'pi_may_end'],
        ['default', 'pi'],
    ])
# A '?' has been seen inside a processing instruction. A '>' returns to
# text; the 'default' rule goes back to the pi state.
state(
    name='pi_may_end',
    external='text',
    transitions=[
        ['>', 'text'],
        ['default', 'pi'],
    ])
# Whitespace separating the tag name and the attributes. An 'id' character
# begins an attribute name; '/' and further whitespace stay in this state.
state(
    name='tag_space',
    external='tag',
    transitions=[
        ['>', 'tag_close'],
        ['space', 'tag_space'],
        ['id', 'attr'],
        ['/', 'tag_space'],
    ])
# Target of the '>' transitions of the tag, attribute and value states.
# From here a '<' opens another tag and the 'default' rule returns to text.
state(
    name='tag_close',
    external='text',
    transitions=[
        ['<', 'tag_start'],
        ['default', 'text'],
    ])
# Inside an attribute name. '=' leads to the attribute value, whitespace to
# attr_space, '/' back to tag_space and '>' closes the tag.
state(
    name='attr',
    external='attr',
    transitions=[
        ['id', 'attr'],
        ['>', 'tag_close'],
        ['/', 'tag_space'],
        ['=', 'value'],
        ['space', 'attr_space'],
    ])
# Whitespace following an attribute name. An 'id' character starts another
# attribute name, while '=' leads to the value state.
state(
    name='attr_space',
    external='attr',
    transitions=[
        ['>', 'tag_close'],
        ['space', 'attr_space'],
        ['id', 'attr'],
        ['/', 'tag_space'],
        ['=', 'value'],
    ])
# Reached after 'attribute='; a value is expected next. A single or double
# quote opens a quoted value, whitespace stays here, '>' closes the tag and
# the 'default' rule begins an unquoted value.
state(
    name='value',
    external='value',
    transitions=[
        ['q', 'value_q_start'],
        ['dq', 'value_dq_start'],
        ['space', 'value'],
        ['>', 'tag_close'],
        ['default', 'value_text'],
    ])
# Attribute value written without quotes. It ends at whitespace (back to
# tag_space) or at '>' (tag_close).
state(
    name='value_text',
    external='value',
    transitions=[
        ['>', 'tag_close'],
        ['space', 'tag_space'],
        ['default', 'value_text'],
    ])
# Position of the first character of a single quoted attribute value. A
# single quote here ends the value at once; the 'default' rule moves into
# value_q.
state(
    name='value_q_start',
    external='value',
    transitions=[
        ['q', 'tag_space'],
        ['default', 'value_q'],
    ])
# Middle of a single quoted attribute value. A single quote ends the value
# and returns to tag_space.
state(
    name='value_q',
    external='value',
    transitions=[
        ['q', 'tag_space'],
        ['default', 'value_q'],
    ])
# Position of the first character of a double quoted attribute value. A
# double quote here ends the value at once; the 'default' rule moves into
# value_dq.
state(
    name='value_dq_start',
    external='value',
    transitions=[
        ['dq', 'tag_space'],
        ['default', 'value_dq'],
    ])
# Middle of a double quoted attribute value. A double quote ends the value
# and returns to tag_space.
state(
    name='value_dq',
    external='value',
    transitions=[
        ['dq', 'tag_space'],
        ['default', 'value_dq'],
    ])
# CDATA escaping text spans.
# TODO(falmeida): These states should go after cdata_text.
# Reached after '<!' inside a CDATA text span. A '-' heads towards a
# comment; the 'default' rule returns to cdata_text.
state(
    name='cdata_comment_start',
    external='text',
    transitions=[
        ['-', 'cdata_comment_start_dash'],
        ['default', 'cdata_text'],
    ])
# Reached after '<!-' inside a CDATA text span. A second '-' enters the
# comment body; the 'default' rule returns to cdata_text.
state(
    name='cdata_comment_start_dash',
    external='text',
    transitions=[
        ['-', 'cdata_comment_body'],
        ['default', 'cdata_text'],
    ])
# Body of a comment found inside a CDATA text span. A '-' may be the start
# of the closing sequence.
state(
    name='cdata_comment_body',
    external='text',
    transitions=[
        ['-', 'cdata_comment_dash'],
        ['default', 'cdata_comment_body'],
    ])
# A single '-' has been seen inside a CDATA comment. Another '-' advances
# to cdata_comment_dash_dash; the 'default' rule resumes the comment body.
state(
    name='cdata_comment_dash',
    external='text',
    transitions=[
        ['-', 'cdata_comment_dash_dash'],
        ['default', 'cdata_comment_body'],
    ])
# '--' has been seen inside a CDATA comment. A '>' ends the comment and
# returns to cdata_text, further '-' characters stay in this state, and the
# 'default' rule resumes the comment body.
state(
    name='cdata_comment_dash_dash',
    external='text',
    transitions=[
        ['-', 'cdata_comment_dash_dash'],
        ['>', 'cdata_text'],
        ['default', 'cdata_comment_body'],
    ])
# CDATA processing
#
# RCDATA and CDATA sections are handled identically to keep the code simple:
# the differences between them don't affect the context we are in.
# A '<' may start either the closing tag or a comment (see cdata_lt).
state(
    name='cdata_text',
    external='text',
    transitions=[
        ['<', 'cdata_lt'],
        ['default', 'cdata_text'],
    ])
# A '<' was seen inside CDATA text: this is possibly the beginning of the
# closing tag ('/') or of a comment ('!'). The 'default' rule returns to
# cdata_text.
state(
    name='cdata_lt',
    external='text',
    transitions=[
        ['/', 'cdata_may_close'],
        ['!', 'cdata_comment_start'],
        ['default', 'cdata_text'],
    ])
# The CDATA section is exited when </tag is encountered and tag matches the
# last opened tag. Part of this logic is handled in the code rather than in
# this state definition.
state(
    name='cdata_may_close',
    external='text',
    transitions=[
        ['idtag', 'cdata_may_close'],
        ['>', 'text'],
        ['space', 'tag_space'],
        ['default', 'cdata_text'],
    ])
# The states from here on are used for specialized parser modes.
# js_file has a single transition: 'default' loops back to js_file.
state(
    name='js_file',
    external='js_file',
    transitions=[
        ['default', 'js_file'],
    ])
# TODO(falmeida): Having css_file and js_file as the external name doesn't make
# sense. This should instead be text, and the js/css state should be returned
# by in_js() and in_css().
# css_file has a single transition: 'default' loops back to css_file.
state(
    name='css_file',
    external='css_file',
    transitions=[
        ['default', 'css_file'],
    ])
# null has a single transition: 'default' loops back to null. Its external
# name is 'text'.
state(
    name='null',
    external='text',
    transitions=[
        ['default', 'null'],
    ])