Squashed 'third_party/ctemplate/' content from commit 6742f62 Change-Id: I828e4e4c906f13ba19944d78a8a78652b62949af git-subtree-dir: third_party/ctemplate git-subtree-split: 6742f6233db12f545e90baa8f34f5c29c4eb396a

commit: 70325d6fab299f4f0f2ffd0f862aa73d3c001d67 [log] [tgz]
author: Brian Silverman <brian@peloton-tech.com> Sun Sep 20 17:00:43 2015 -0400
committer: Brian Silverman <brian@peloton-tech.com> Sun Sep 20 17:00:43 2015 -0400
tree: cdcbecab12b7fc611da2925be3deaec537555deb
diff --git a/src/htmlparser/htmlparser_fsm.config b/src/htmlparser/htmlparser_fsm.config
new file mode 100644
index 0000000..8ca4b7b
--- /dev/null
+++ b/src/htmlparser/htmlparser_fsm.config

@@ -0,0 +1,365 @@
+# Copyright (c) 2008, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ---
+#
+# Author: falmeida@google.com (Filipe Almeida)
+
+# TODO(falmeida): Add more descriptive names to the states and drop the
+# abbreviations.
+# TODO(falmeida): Reorder the states so that it's easier to read.
+# TODO(falmeida): Support CDATA blocks in the form: <![CDATA[.
+
+# Identifier of the state machine described by this file.
+name = 'htmlparser'
+
+# Human-readable description of the state machine.
+comment = 'Definition of a finite state machine for a subset of HTML 4.01'
+
+# Conditions pair a name, used in the transition tables below, with the
+# characters that satisfy it.
+condition('<', '<')
+condition('>', '>')
+condition('=', '=')
+
+# TODO(falmeida): This is not the correct expression. Tag and attribute names
+# can only consist of alpha characters.
+condition('id', 'A-Za-z0-9_:.-')
+# Same as 'id' plus '/', so that the '/' of a closing tag is part of the name.
+condition('idtag', 'A-Za-z0-9/_:.-')
+
+# Whitespace according to: http://www.w3.org/TR/html401/struct/text.html#h-9.1
+condition('space', ' \t\n\r')
+condition('!', '!')
+condition('q', '\'')
+condition('dq', '\"')
+condition('/', '/')
+condition('*', '*')
+condition('-', '-')
+condition('?', '?')
+condition('lf', '\n')
+# NOTE(review): despite its name, 'quote' matches a backslash. Like '*' and
+# 'lf', it is not referenced by any transition in this file.
+condition('quote', '\\')
+
+# TODO(falmeida): This default rule is a hack and shouldn't be here.
+condition('default', '[:default:]')
+
+state(name = 'text',  # Document text outside of any markup.
+      external = 'text',
+      transitions = [
+        ['<', 'tag_start'],  # '<' may begin a tag, declaration or PI.
+        ['default', 'text']  # Anything else is still text.
+      ])
+
+# Reached when a '<' character is found in text.
+# Tag opening is defined in the HTML5 draft here:
+# http://www.whatwg.org/specs/web-apps/current-work/#tag-open-state
+# We don't follow it exactly and are much looser, in order to mimic the way
+# the major browsers behave.
+state(name = 'tag_start',
+      external = 'tag',
+      transitions = [
+        ['idtag', 'tag_name'],  # First character of a tag name (or '/').
+        ['?', 'pi'],  # '<?' opens a processing instruction.
+        ['!', 'declaration_start'],  # '<!' opens a declaration or comment.
+        ['<', 'tag_start'],  # Another '<': start over from this one.
+        ['default', 'text']  # Not a tag after all; back to text.
+      ])
+
+# Name of the tag. Includes the closing tag character '/'.
+state(name = 'tag_name',
+      external = 'tag',
+      transitions = [  # NOTE(review): no 'default' rule in this state.
+        ['idtag', 'tag_name'],  # Keep consuming name characters.
+        ['space', 'tag_space'],  # Whitespace ends the name.
+        ['>', 'tag_close']  # '>' ends the tag.
+      ])
+
+# HTML declaration and comment parsing
+#
+# We don't expose declaration state because at this point we only want to
+# ensure that we are parsing them correctly so we don't get out of sync.
+# This is specifically made for DOCTYPE declarations and won't work if DTDs
+# are defined inside the declaration.
+# The HTML5 spec says we should specifically look for the string
+# '<!DOCTYPE HTML', but that would add a lot of unnecessary states, and unless
+# we build a simple declarative way to unfold a string match into multiple
+# states, I don't think it's worth worrying about for now.
+
+# Got '<!'. The next character decides whether we open a declaration or a
+# comment.
+state(name = 'declaration_start',
+      external = 'text',
+      transitions = [
+        ['-', 'comment_open'],  # '<!-' may be the start of a comment.
+        ['>', 'text'],  # '<!>' closes immediately; back to text.
+        ['default', 'declaration_body']  # Anything else: a declaration.
+      ])
+
+# Inside a declaration, e.g. <!DOCTYPE. We leave it when we see a '>'.
+state(name = 'declaration_body',
+      external = 'text',
+      transitions = [
+        ['>', 'text'],  # End of the declaration.
+        ['default', 'declaration_body']  # Everything else is skipped.
+      ])
+
+# Got '<!-'. One more '-' is needed to open a comment.
+state(name = 'comment_open',
+      external = 'text',
+      transitions = [
+        ['-', 'comment_body'],  # '<!--' opens a comment.
+        ['default', 'text']  # Not a comment; treated as text.
+      ])
+
+# Inside a comment. We only close when we see '-->'.
+state(name = 'comment_body',
+      external = 'comment',
+      transitions = [
+        ['-', 'comment_dash'],  # Possible start of the closing '-->'.
+        ['default', 'comment_body']  # Everything else is comment text.
+      ])
+
+# Got '-' inside a comment.
+state(name = 'comment_dash',
+      external = 'comment',
+      transitions = [
+        ['-', 'comment_dash_dash'],  # Second '-': '--' seen.
+        ['default', 'comment_body']  # Lone '-'; still inside the comment.
+      ])
+
+# Got '--' inside a comment.
+state(name = 'comment_dash_dash',
+      external = 'comment',
+      transitions = [
+        ['-', 'comment_dash_dash'],  # Extra dashes keep '--' pending.
+        ['>', 'text'],  # '-->' closes the comment.
+        ['default', 'comment_body']  # Not a close; back inside the comment.
+      ])
+
+# XML Processing instruction parsing according to:
+# http://www.w3.org/TR/REC-xml/#sec-pi
+#
+# Everything between the characters <? and ?> is considered to be part of the
+# processing instruction.
+state(name = 'pi',  # Inside a processing instruction, after '<?'.
+      external = 'text',
+      transitions = [
+        ['?', 'pi_may_end'],  # Possible start of the closing '?>'.
+        ['default', 'pi']  # Everything else is part of the instruction.
+      ])
+
+# Got '?' inside a processing instruction; a following '>' ends it.
+state(name = 'pi_may_end',
+      external = 'text',
+      transitions = [
+        ['>', 'text'],  # '?>' ends the instruction.
+        # A run of '?' keeps the possible '?>' pending, so that '??>' still
+        # ends the instruction (same idea as '-' in 'comment_dash_dash').
+        ['?', 'pi_may_end'],
+        ['default', 'pi']  # Not the end; still inside the instruction.
+      ])
+
+# Whitespace between the tag name and attributes.
+state(name = 'tag_space',
+      external = 'tag',
+      transitions = [  # NOTE(review): no 'default' rule in this state.
+        ['>', 'tag_close'],  # End of the tag.
+        ['space', 'tag_space'],  # Skip further whitespace.
+        ['id', 'attr'],  # First character of an attribute name.
+        ['/', 'tag_space']  # '/' (e.g. in '<br />') is skipped.
+      ])
+
+state(name = 'tag_close',  # Just consumed the '>' that ends a tag.
+      external = 'text',
+      transitions = [
+        ['<', 'tag_start'],  # Another tag may start immediately.
+        ['default', 'text']  # Anything else is text.
+      ])
+
+# Name of the attribute.
+state(name = 'attr',
+      external = 'attr',
+      transitions = [  # NOTE(review): no 'default' rule in this state.
+        ['id', 'attr'],  # Keep consuming name characters.
+        ['>', 'tag_close'],  # Tag ends right after the attribute name.
+        ['/', 'tag_space'],  # '/' ends the name; back to tag spacing.
+        ['=', 'value'],  # '=' introduces the attribute value.
+        ['space', 'attr_space']  # Whitespace ends the name.
+      ])
+
+# Whitespace after the attribute name.
+state(name = 'attr_space',
+      external = 'attr',
+      transitions = [  # NOTE(review): no 'default' rule in this state.
+        ['>', 'tag_close'],  # Tag ends; the attribute has no value.
+        ['space', 'attr_space'],  # Skip further whitespace.
+        ['id', 'attr'],  # A new attribute name starts.
+        ['/', 'tag_space'],  # '/' is skipped; back to tag spacing.
+        ['=', 'value']  # '=' introduces the value of the attribute.
+      ])
+
+# Expecting a value, right after 'attribute='.
+state(name = 'value',
+      external = 'value',
+      transitions = [
+        ['q', 'value_q_start'],  # Opening single quote.
+        ['dq', 'value_dq_start'],  # Opening double quote.
+        ['space', 'value'],  # Whitespace after '=' is skipped.
+        ['>', 'tag_close'],  # Tag ends without a value.
+        ['default', 'value_text']  # Anything else starts an unquoted value.
+      ])
+
+# Unquoted attribute value.
+state(name = 'value_text',
+      external = 'value',
+      transitions = [
+        ['>', 'tag_close'],  # '>' ends both the value and the tag.
+        ['space', 'tag_space'],  # Whitespace ends the value.
+        ['default', 'value_text']  # Anything else is part of the value.
+      ])
+
+# First character of a single-quoted attribute value.
+state(name = 'value_q_start',
+      external = 'value',
+      transitions = [
+        ['q', 'tag_space'],  # Closing quote right away: empty value.
+        ['default', 'value_q']  # First character of the value.
+      ])
+
+# In the middle of a single-quoted attribute value.
+state(name = 'value_q',
+      external = 'value',
+      transitions = [
+        ['q', 'tag_space'],  # Closing quote ends the value.
+        ['default', 'value_q']  # Anything else, including '>', is value.
+      ])
+
+# First character of a double-quoted attribute value.
+state(name = 'value_dq_start',
+      external = 'value',
+      transitions = [
+        ['dq', 'tag_space'],  # Closing quote right away: empty value.
+        ['default', 'value_dq']  # First character of the value.
+      ])
+
+# In the middle of a double-quoted attribute value.
+state(name = 'value_dq',
+      external = 'value',
+      transitions = [
+        ['dq', 'tag_space'],  # Closing quote ends the value.
+        ['default', 'value_dq']  # Anything else, including '>', is value.
+      ])
+
+# CDATA escaping text spans.
+# TODO(falmeida): These states should go after cdata_text.
+
+# Got '<!' inside a CDATA section.
+state(name = 'cdata_comment_start',
+      external = 'text',
+      transitions = [
+        ['-', 'cdata_comment_start_dash'],  # '<!-' may start a comment.
+        ['default', 'cdata_text'],  # Not a comment; back to CDATA text.
+      ])
+
+# Got '<!-' inside a CDATA section.
+state(name = 'cdata_comment_start_dash',
+      external = 'text',
+      transitions = [
+        ['-', 'cdata_comment_body'],  # '<!--' opens a comment.
+        ['default', 'cdata_text']  # Not a comment; back to CDATA text.
+      ])
+
+# Inside a comment within a CDATA section.
+state(name = 'cdata_comment_body',
+      external = 'text',
+      transitions = [
+        ['-', 'cdata_comment_dash'],  # Possible start of the closing '-->'.
+        ['default', 'cdata_comment_body']  # Everything else is skipped.
+      ])
+
+# Got '-' inside a comment within a CDATA section.
+state(name = 'cdata_comment_dash',
+      external = 'text',
+      transitions = [
+        ['-', 'cdata_comment_dash_dash'],  # Second '-': '--' seen.
+        ['default', 'cdata_comment_body']  # Lone '-'; still in the comment.
+      ])
+
+# Got '--' inside a comment within a CDATA section.
+state(name = 'cdata_comment_dash_dash',
+      external = 'text',
+      transitions = [
+        ['-', 'cdata_comment_dash_dash'],  # Extra dashes keep '--' pending.
+        ['>', 'cdata_text'],  # '-->' closes the comment; back to CDATA text.
+        ['default', 'cdata_comment_body']  # Not a close; still in comment.
+      ])
+
+# CDATA processing
+#
+# To simplify the code, we treat RCDATA and CDATA sections the same since the
+# differences between them don't affect the context we are in.
+state(name = 'cdata_text',  # Text inside a CDATA/RCDATA section.
+      external = 'text',
+      transitions = [
+        ['<', 'cdata_lt'],  # May begin the closing tag or a comment.
+        ['default', 'cdata_text']  # Everything else is CDATA text.
+      ])
+
+# Got '<' inside a CDATA section: possible beginning of the closing tag.
+state(name = 'cdata_lt',
+      external = 'text',
+      transitions = [
+        ['/', 'cdata_may_close'],  # '</' may start the closing tag.
+        ['!', 'cdata_comment_start'],  # '<!' may start an escaping comment.
+        # A second '<' may itself begin the closing tag, so keep waiting for
+        # '/' or '!' instead of swallowing it as text (same as 'tag_start').
+        ['<', 'cdata_lt'],
+        ['default', 'cdata_text']  # Plain '<'; back to CDATA text.
+      ])
+
+# If we encounter </tag where tag matches the last opened tag, we exit the
+# CDATA section. Part of this logic is handled in the code.
+state(name = 'cdata_may_close',
+      external = 'text',
+      transitions = [
+        ['idtag', 'cdata_may_close'],  # Consume the candidate tag name.
+        ['>', 'text'],  # '</name>': leave CDATA (name check is in code).
+        ['space', 'tag_space'],  # '</name ': continue as inside a tag.
+        ['default', 'cdata_text']  # Not a closing tag; back to CDATA text.
+      ])
+
+# The next states are used for specialized parser modes.
+state(name = 'js_file',  # No state in this file transitions into it.
+      external = 'js_file',
+      transitions = [
+        ['default', 'js_file']  # The only rule loops back to this state.
+      ])
+
+# TODO(falmeida): Having css_file and js_file as the external name doesn't make
+#                 sense. This should instead be text and the js/css state be
+#                 returned by in_js() and in_css().
+state(name = 'css_file',  # No state in this file transitions into it.
+      external = 'css_file',
+      transitions = [
+        ['default', 'css_file']  # The only rule loops back to this state.
+      ])
+
+state(name = 'null',  # No state in this file transitions into it.
+      external = 'text',
+      transitions = [
+        ['default', 'null']  # The only rule loops back to this state.
+      ])
+
commit	70325d6fab299f4f0f2ffd0f862aa73d3c001d67	[log] [tgz]
author	Brian Silverman <brian@peloton-tech.com>	Sun Sep 20 17:00:43 2015 -0400
committer	Brian Silverman <brian@peloton-tech.com>	Sun Sep 20 17:00:43 2015 -0400
tree	cdcbecab12b7fc611da2925be3deaec537555deb