| # Copyright (c) 2008, Google Inc. |
| # All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions are |
| # met: |
| # |
| # * Redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer. |
| # * Redistributions in binary form must reproduce the above |
| # copyright notice, this list of conditions and the following disclaimer |
| # in the documentation and/or other materials provided with the |
| # distribution. |
| # * Neither the name of Google Inc. nor the names of its |
| # contributors may be used to endorse or promote products derived from |
| # this software without specific prior written permission. |
| # |
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| # --- |
| # |
| # Author: falmeida@google.com (Filipe Almeida) |
| |
| # TODO(falmeida): Add more descriptive names to the states and drop the |
| # abbreviations. |
| # TODO(falmeida): Reorder the states so that it's easier to read. |
| # TODO(falmeida): Support CDATA blocks in the form: <![CDATA[. |
| |
| name = 'htmlparser' |
| |
| comment = 'Definition of a finite state machine for a subset of HTTP 4.1' |
| |
| # Named conditions used as transition triggers by the states in this file. |
| # Each condition() call takes a condition name followed by an expression for |
| # the characters it matches. |
| condition('<', '<') |
| condition('>', '>') |
| condition('=', '=') |
| |
| # TODO(falmeida): This is not the correct expression. tag and attribute names |
| # can only consist of alpha characters. |
| condition('id', 'A-Za-z0-9_:.-') |
| # Same expression as 'id' with '/' added, so that the '/' of a closing tag is |
| # accepted as part of the tag name. |
| condition('idtag', 'A-Za-z0-9/_:.-') |
| |
| # Whitespace according to: http://www.w3.org/TR/html401/struct/text.html#h-9.1 |
| condition('space', ' \t\n\r') |
| condition('!', '!') |
| # Single quote. |
| condition('q', '\'') |
| # Double quote. |
| condition('dq', '\"') |
| condition('/', '/') |
| # NOTE(review): '*', 'lf' and 'quote' are not referenced by any state visible |
| # in this file -- confirm whether they are still needed. |
| condition('*', '*') |
| condition('-', '-') |
| condition('?', '?') |
| condition('lf', '\n') |
| # NOTE(review): despite its name, 'quote' matches a backslash character. |
| condition('quote', '\\') |
| |
| # TODO(falmeida): This default rule is a hack and shouldn't be here. |
| # 'default' is the catch-all condition used as the fallback transition in the |
| # states below. |
| condition('default', '[:default:]') |
| |
| # Plain text outside of any tag. |
| state(name = 'text', |
| external = 'text', |
| transitions = [ |
| # '<' may open a tag, a declaration, a comment or a processing instruction. |
| ['<', 'tag_start'], |
| # Everything else is still text. |
| ['default', 'text'] |
| ]) |
| |
| # Entered when we find the '<' character in text. |
| # Tag opening is defined in the HTML5 draft here: |
| # http://www.whatwg.org/specs/web-apps/current-work/#tag-open-state |
| # We don't exactly follow this and are much more loose in order to mimic the way |
| # the major browsers behave. |
| state(name = 'tag_start', |
| external = 'tag', |
| transitions = [ |
| # First character of a tag name (includes '/', so closing tags land here too). |
| ['idtag', 'tag_name'], |
| # '<?' starts a processing instruction. |
| ['?', 'pi'], |
| # '<!' starts a declaration or a comment. |
| ['!', 'declaration_start'], |
| # A second '<' restarts the tag-open check. |
| ['<', 'tag_start'], |
| # Anything else means the '<' did not open a tag; go back to text. |
| ['default', 'text'] |
| ]) |
| |
| # Name of the tag. Includes the closing tag character '/'. |
| # NOTE(review): no 'default' transition is listed here; how other characters |
| # are handled is decided by the FSM generator -- presumably an error, confirm. |
| state(name = 'tag_name', |
| external = 'tag', |
| transitions = [ |
| # Keep consuming tag name characters. |
| ['idtag', 'tag_name'], |
| # Whitespace ends the name; attributes may follow. |
| ['space', 'tag_space'], |
| # '>' ends the tag. |
| ['>', 'tag_close'] |
| ]) |
| |
| # HTML declaration and comment parsing |
| # |
| # We don't expose declaration state because at this point we only want to |
| # ensure that we are parsing them correctly so we don't get out of sync. |
| # This is specifically made for DOCTYPE declarations and won't work if DTDs |
| # are defined inside the declaration. |
| # The HTML5 spec says we should specifically look for the string '<!DOCTYPE HTML' |
| # but that will add a lot of unnecessary states, and unless we build a simple |
| # declarative way to unfold a string match into multiple states, I don't |
| # think it's worth worrying about for now. |
| |
| # Got '<!'. The next character will decide if we open a declaration or a |
| # comment. |
| state(name = 'declaration_start', |
| external = 'text', |
| transitions = [ |
| # '<!-' may be the beginning of a comment. |
| ['-', 'comment_open'], |
| # '<!>' is an empty declaration; return to text. |
| ['>', 'text'], |
| # Any other character starts the body of a declaration. |
| ['default', 'declaration_body'] |
| ]) |
| |
| # Inside a declaration, e.g. <!DOCTYPE. We close when we see a '>'. |
| state(name = 'declaration_body', |
| external = 'text', |
| transitions = [ |
| # End of the declaration. |
| ['>', 'text'], |
| # Everything else is skipped as part of the declaration. |
| ['default', 'declaration_body'] |
| ]) |
| |
| # Got '<!-'. A second '-' is required to actually open a comment. |
| state(name = 'comment_open', |
| external = 'text', |
| transitions = [ |
| # '<!--' seen: we are now inside a comment. |
| ['-', 'comment_body'], |
| # Not a comment after all; fall back to text. |
| ['default', 'text'] |
| ]) |
| |
| # Inside a comment. We only close when we see '-->' |
| state(name = 'comment_body', |
| external = 'comment', |
| transitions = [ |
| # Possible first dash of the closing '-->'. |
| ['-', 'comment_dash'], |
| # Any other character is comment content. |
| ['default', 'comment_body'] |
| ]) |
| |
| # Got '-' inside a comment. |
| state(name = 'comment_dash', |
| external = 'comment', |
| transitions = [ |
| # Second consecutive dash. |
| ['-', 'comment_dash_dash'], |
| # A lone dash is ordinary comment content. |
| ['default', 'comment_body'] |
| ]) |
| |
| # Got '--' inside a comment. |
| state(name = 'comment_dash_dash', |
| external = 'comment', |
| transitions = [ |
| # Additional dashes keep us here, so '--->' also closes the comment. |
| ['-', 'comment_dash_dash'], |
| # '-->' closes the comment. |
| ['>', 'text'], |
| # Anything else means the dashes were comment content. |
| ['default', 'comment_body'] |
| ]) |
| |
| # XML Processing instruction parsing according to: |
| # http://www.w3.org/TR/REC-xml/#sec-pi |
| # |
| # Everything between the characters <? and ?> is considered to be part of the |
| # processing instruction. |
| # Inside a processing instruction, after '<?'. |
| state(name = 'pi', |
| external = 'text', |
| transitions = [ |
| # '?' may be the first character of the closing '?>'. |
| ['?', 'pi_may_end'], |
| # Everything else is part of the processing instruction. |
| ['default', 'pi'] |
| ]) |
| |
| # Got '?' inside a processing instruction. |
| state(name = 'pi_may_end', |
| external = 'text', |
| transitions = [ |
| # '?>' closes the processing instruction. |
| ['>', 'text'], |
| # Any other character, including another '?', returns to the 'pi' state. |
| # NOTE(review): this means '??>' does not close the instruction at the '>' -- |
| # confirm that this is intended. |
| ['default', 'pi'] |
| ]) |
| |
| # Whitespace between tag name, attributes. |
| # Also the state reached after an attribute value ends. |
| # NOTE(review): no 'default' transition is listed here; how other characters |
| # are handled is decided by the FSM generator -- confirm. |
| state(name = 'tag_space', |
| external = 'tag', |
| transitions = [ |
| # End of the tag. |
| ['>', 'tag_close'], |
| # Skip further whitespace. |
| ['space', 'tag_space'], |
| # First character of an attribute name. |
| ['id', 'attr'], |
| # A '/' (as in '<br />') is skipped. |
| ['/', 'tag_space'] |
| ]) |
| |
| # Just consumed the '>' that ends a tag. The transitions are the same as in |
| # the 'text' state. |
| state(name = 'tag_close', |
| external = 'text', |
| transitions = [ |
| # Another tag (or declaration/comment/pi) may start right away. |
| ['<', 'tag_start'], |
| # Otherwise we are back in plain text. |
| ['default', 'text'] |
| ]) |
| |
| # Name of the attribute. |
| # NOTE(review): no 'default' transition is listed here; how other characters |
| # are handled is decided by the FSM generator -- confirm. |
| state(name = 'attr', |
| external = 'attr', |
| transitions = [ |
| # Keep consuming attribute name characters. |
| ['id', 'attr'], |
| # Tag ends right after a value-less attribute. |
| ['>', 'tag_close'], |
| # '/' ends the attribute name and is handled like tag whitespace. |
| ['/', 'tag_space'], |
| # '=' introduces the attribute value. |
| ['=', 'value'], |
| # Whitespace after the name; a '=' may still follow. |
| ['space', 'attr_space'] |
| ]) |
| |
| # After the attribute name, in the whitespace that follows it. |
| # NOTE(review): no 'default' transition is listed here; how other characters |
| # are handled is decided by the FSM generator -- confirm. |
| state(name = 'attr_space', |
| external = 'attr', |
| transitions = [ |
| # Tag ends; the attribute had no value. |
| ['>', 'tag_close'], |
| # Skip further whitespace. |
| ['space', 'attr_space'], |
| # A new attribute name starts; the previous attribute had no value. |
| ['id', 'attr'], |
| ['/', 'tag_space'], |
| # 'name =' with whitespace before the equals sign. |
| ['=', 'value'] |
| ]) |
| |
| # Expecting a value, after attribute= |
| state(name = 'value', |
| external = 'value', |
| transitions = [ |
| # Opening single quote. |
| ['q', 'value_q_start'], |
| # Opening double quote. |
| ['dq', 'value_dq_start'], |
| # Whitespace between '=' and the value is skipped. |
| ['space', 'value'], |
| # Tag ends before any value was given. |
| ['>', 'tag_close'], |
| # Any other character begins an unquoted value. |
| ['default', 'value_text'] |
| ]) |
| |
| # Unquoted attribute value. Ends at whitespace or at the end of the tag. |
| state(name = 'value_text', |
| external = 'value', |
| transitions = [ |
| ['>', 'tag_close'], |
| ['space', 'tag_space'], |
| # Every other character, quotes included, is part of the value. |
| ['default', 'value_text'] |
| ]) |
| |
| # First character of a single quoted attribute value. |
| # The transitions mirror 'value_q'; presumably this is a separate state so |
| # that the start of the value can be told apart -- verify against the parser code. |
| state(name = 'value_q_start', |
| external = 'value', |
| transitions = [ |
| # Closing quote right away: the value is empty. |
| ['q', 'tag_space'], |
| ['default', 'value_q'] |
| ]) |
| |
| # In the middle of a single quoted attribute value. |
| state(name = 'value_q', |
| external = 'value', |
| transitions = [ |
| # The closing single quote ends the value. |
| ['q', 'tag_space'], |
| # Everything else, including '>' and double quotes, is part of the value. |
| ['default', 'value_q'] |
| ]) |
| |
| # First character of a double quoted attribute value. |
| # The transitions mirror 'value_dq'; presumably this is a separate state so |
| # that the start of the value can be told apart -- verify against the parser code. |
| state(name = 'value_dq_start', |
| external = 'value', |
| transitions = [ |
| # Closing quote right away: the value is empty. |
| ['dq', 'tag_space'], |
| ['default', 'value_dq'] |
| ]) |
| |
| # In the middle of a double quoted attribute value. |
| state(name = 'value_dq', |
| external = 'value', |
| transitions = [ |
| # The closing double quote ends the value. |
| ['dq', 'tag_space'], |
| # Everything else, including '>' and single quotes, is part of the value. |
| ['default', 'value_dq'] |
| ]) |
| |
| # CDATA escaping text spans. |
| # TODO(falmeida): These states should go after cdata_text. |
| |
| # Got '<!' inside a CDATA section (reached from 'cdata_lt'). |
| state(name = 'cdata_comment_start', |
| external = 'text', |
| transitions = [ |
| # '<!-' may be the beginning of a comment. |
| ['-', 'cdata_comment_start_dash'], |
| # Not a comment; continue as CDATA text. |
| ['default', 'cdata_text'], |
| ]) |
| |
| # Got '<!-' inside a CDATA section. |
| state(name = 'cdata_comment_start_dash', |
| external = 'text', |
| transitions = [ |
| # '<!--' seen: we are now inside a comment within the CDATA section. |
| ['-', 'cdata_comment_body'], |
| # Not a comment; continue as CDATA text. |
| ['default', 'cdata_text'] |
| ]) |
| |
| # Inside a comment within a CDATA section. Note that '<' has no special |
| # meaning in this state, so a closing tag is not looked for here. |
| state(name = 'cdata_comment_body', |
| external = 'text', |
| transitions = [ |
| # Possible first dash of the closing '-->'. |
| ['-', 'cdata_comment_dash'], |
| ['default', 'cdata_comment_body'] |
| ]) |
| |
| # Got '-' inside a comment within a CDATA section. |
| state(name = 'cdata_comment_dash', |
| external = 'text', |
| transitions = [ |
| # Second consecutive dash. |
| ['-', 'cdata_comment_dash_dash'], |
| # A lone dash is ordinary comment content. |
| ['default', 'cdata_comment_body'] |
| ]) |
| |
| # Got '--' inside a comment within a CDATA section. |
| state(name = 'cdata_comment_dash_dash', |
| external = 'text', |
| transitions = [ |
| # Additional dashes keep us here. |
| ['-', 'cdata_comment_dash_dash'], |
| # '-->' closes the comment; we are back in CDATA text. |
| ['>', 'cdata_text'], |
| # Anything else means the dashes were comment content. |
| ['default', 'cdata_comment_body'] |
| ]) |
| |
| # CDATA processing |
| # |
| # To simplify the code, we treat RCDATA and CDATA sections the same since the |
| # differences between them don't affect the context we are in. |
| # NOTE(review): no state in this file transitions into 'cdata_text' from |
| # outside the cdata_* states; presumably the parser code switches to it -- |
| # confirm. |
| state(name = 'cdata_text', |
| external = 'text', |
| transitions = [ |
| # '<' may start the closing tag or a comment. |
| ['<', 'cdata_lt'], |
| ['default', 'cdata_text'] |
| ]) |
| |
| # Got '<' inside a CDATA section: possible beginning of the closing tag. |
| state(name = 'cdata_lt', |
| external = 'text', |
| transitions = [ |
| # '</' may be the closing tag. |
| ['/', 'cdata_may_close'], |
| # '<!' may open a comment. |
| ['!', 'cdata_comment_start'], |
| # Anything else is plain CDATA text. |
| ['default', 'cdata_text'] |
| ]) |
| |
| # If we encounter </tag where tag matches the last opened tag, we exit the |
| # CDATA section. Part of this logic is handled in the code. |
| state(name = 'cdata_may_close', |
| external = 'text', |
| transitions = [ |
| # Keep consuming the name of the candidate closing tag. |
| ['idtag', 'cdata_may_close'], |
| # '>' right after the name: leave the CDATA section. |
| ['>', 'text'], |
| # Whitespace after the name: continue as inside a regular tag. |
| ['space', 'tag_space'], |
| # Anything else: this was not a closing tag. |
| ['default', 'cdata_text'] |
| ]) |
| |
| # The next states are used for specialized parser modes. |
| # |
| # 'js_file' only transitions to itself: once entered, the state machine never |
| # leaves it. |
| state(name = 'js_file', |
| external = 'js_file', |
| transitions = [ |
| ['default', 'js_file'] |
| ]) |
| |
| # TODO(falmeida): Having css_file and js_file as the external name doesn't make |
| # sense. This should instead be text and the js/css state be |
| # returned by in_js() and in_css(). |
| # |
| # 'css_file' only transitions to itself: once entered, the state machine never |
| # leaves it. |
| state(name = 'css_file', |
| external = 'css_file', |
| transitions = [ |
| ['default', 'css_file'] |
| ]) |
| |
| # Sink state: every character keeps the machine in 'null'. Its external |
| # name is 'text'. |
| # NOTE(review): no state in this file transitions into 'null'; presumably it |
| # is selected by the parser code -- confirm its intended use. |
| state(name = 'null', |
| external = 'text', |
| transitions = [ |
| ['default', 'null'] |
| ]) |
| |