Handle text script content

2019-12-27 21:52:49 +11:00 · 2019-12-27 21:52:49 +11:00 · e15381c1cb
parent a76c1f3cd5
commit e15381c1cb
14 changed files with 257 additions and 54 deletions
--- a/src/spec/tag/child/blacklist.c
+++ b/src/spec/tag/child/blacklist.c
--- a/src/spec/tag/child/whitelist.c
+++ b/src/spec/tag/child/whitelist.c
--- a/src/spec/tag/parent/blacklist.c
+++ b/src/spec/tag/parent/blacklist.c
--- a/src/spec/tag/parent/whitelist.c
+++ b/src/spec/tag/parent/whitelist.c
--- a/notes/Processing.md
+++ b/notes/Processing.md
@ -2,16 +2,18 @@

 ## Redundant requires

-Sometimes the code will look like it duplicates matching logic. For example:
+Sometimes the code will look like it performs redundant matching. For example:

 ```rust
-fn process_comment(proc: &mut Proc) -> () {
-	proc.matches("<!--").require_reason("comment").skip();
+pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
+    chain!(proc.match_seq(b"<!--").expect().discard());

-	proc.while_not_matches("-->").skip();
+    chain!(proc.match_while_not_seq(&SinglePattern::new(b"-->")).discard());

-	proc.matches("-->").require_reason("comment end").skip();
+    chain!(proc.match_seq(b"-->").require_with_reason("comment end")?.discard());
+
+    Ok(())
 }
 ```

-At first glance, it might appear that the second call `while_not_matches` makes it redundant to require it again immediately afterwards. However, it's possible that the `while_not_matches` actually stops for some other reason, such as reaching EOF. Even if it's guaranteed, it's still nice to have a declared invariant, like an assertion statement.
+At first glance, it might appear that the second call, `match_while_not_seq`, makes it redundant to require `-->` again immediately afterwards. However, it's possible that `match_while_not_seq` actually stops for some other reason, such as reaching EOF. Even if the match is guaranteed, it's still nice to have a declared invariant, like an assertion statement.
--- a/omission.md
+++ b/omission.md
@ -1,19 +0,0 @@
-# Tag omission
-
-|Tag name|Details|
-|---|---|
-|`li`| The end tag can be omitted if the list item is immediately followed by another `<li>` element, or if there is no more content in its parent element.|
-|`p`| The start tag is required. The end tag may be omitted if the `<p>` element is immediately followed by an `<address>`, `<article>`, `<aside>`, `<blockquote>`, `<div>`, `<dl>`, `<fieldset>`, `<footer>`, `<form>`, `<h1>`, `<h2>`, `<h3>`, `<h4>`, `<h5>`, `<h6>`, `<header>`, `<hr>`, `<menu>`, `<nav>`, `<ol>`, `<pre>`, `<section>`, `<table>`, `<ul>` or another `<p>` element, or if there is no more content in the parent element and the parent element is not an `<a>` element.|
-|`rp`| The end tag can be omitted if the element is immediately followed by an `<rt>` or another `<rp>` element, or if there is no more content in the parent element.|
-|`rt`| The end tag may be omitted if the `<rt>` element is immediately followed by an `<rt>` or `<rp>` element, or if there is no more content in the parent element.|
-|`rtc`| The closing tag can be omitted if it is immediately followed by a `<rb>`, `<rtc>` or `<rt>` element opening tag or by its parent closing tag.|
-|`caption`| The end tag can be omitted if the element is not immediately followed by ASCII whitespace or a comment.|
-|`colgroup`| The start tag may be omitted, if it has a `<col>` element as its first child and if it is not preceded by a `<colgroup>` whose end tag has been omitted. The end tag may be omitted, if it is not followed by a space or a comment.|
-|`tbody`| The `<tbody>` element is not a required child element for a parent `<table>` element to graphically render. However, it must not be present, if its parent `<table>` element has a `<tr>` element as a child.|
-|`td`| The start tag is mandatory. The end tag may be omitted, if it is immediately followed by a `<th>` or `<td>` element or if there are no more data in its parent element.|
-|`tfoot`|The start tag is mandatory. The end tag may be omitted if there is no more content in the parent `<table>` element.|
-|`th`|The start tag is mandatory. The end tag may be omitted, if it is immediately followed by a `<th>` or `<td>` element or if there are no more data in its parent element.|
-|`thead`|The start tag is mandatory. The end tag may be omitted if the `<thead>` element is immediately followed by a `<tbody>` or `<tfoot>` element.|
-|`tr`|Start tag is mandatory. End tag may be omitted if the `<tr>` element is immediately followed by a `<tr>` element, or if the row is the last element in its parent table group (`<thead>`, `<tbody>` or `<tfoot>`) element.|
-|`optgroup`|The start tag is mandatory. The end tag is optional if this element is immediately followed by another `<optgroup>` element, or if the parent element has no more content.|
-|`option`|The start tag is mandatory. The end tag is optional if this element is immediately followed by another `<option>` element or an `<optgroup>`, or if the parent element has no more content.|
--- a/content.md
+++ b/content.md
@ -0,0 +1,125 @@
+# Text script content
+
+```html
+<script type="text/html">
+  <script>
+    exec1 = true;
+  </script>
+  <script>
+    exec2 = true;
+  </script>
+</script>
+```
+These are true about the above snippet:
+- `document.querySelectorAll('script').length === 2`.
+- `!exec1 && exec2`.
+- `document.querySelector('script[type="text/html"]')` has exactly one child node and it's a text node.
+
+## Comments
+
+If one or more `<script>` tags appear inside an HTML comment before any `</script>`, the first `</script>` will not end the main script.
+
+### Examples
+
+Ending tag inside comment works because there are no nested script tags.
+
+```html
+<script type="text/plain"><!--
+</script>
+```
+
+There is a nested tag but it doesn't need to be closed because it's treated as text.
+
+```html
+<script type="text/plain">
+  <script>
+</script>
+```
+
+There is a nested tag but it doesn't need to be closed because it's not inside an HTML comment.
+
+```html
+<script type="text/plain">
+  <script><!--
+</script>
+```
+
+There is a nested tag but it doesn't need to be closed because the main closing tag is not inside an HTML comment.
+
+```html
+<script type="text/plain"><!--
+  <script>-->
+</script>
+```
+
+The main closing tag is in an HTML comment but still works because there is no `<script>` before it in its comment (there is one in the previous comment).
+
+```html
+<script type="text/plain">
+    <!--<script>--><!--
+</script>
+```
+
+There is a nested tag and it's in an HTML comment but it doesn't need to be closed because it's not `script`.
+
+```html
+<script type="text/plain"><!--
+  <div>
+</script>
+```
+
+First closing tag is inside a comment with one or more previous opening tags so it doesn't close main script tag.
+
+```html
+<script type="text/plain"><!--
+  <script>alert();</script>
+</script>
+```
+
+There is a nested tag and it needs to be closed because it's a `script` inside a comment. The number of `<script>` tags doesn't matter.
+
+```html
+<script type="text/plain"><!--
+  <script><script><script><script><script>alert();</script>
+</script>
+```
+
+First and second closing tags close their respective previous 1+ opening tags.
+
+```html
+<script type="text/plain"><!--
+  <script><script><script>alert();</script>
+  <script></script>
+</script>
+```
+
+Main closing tag works because it is in a separate comment.
+
+```html
+<script type="text/plain">
+  <!--<script><script><script>--><!--
+</script>
+```
+
+Main closing tag works because it is not in a comment.
+
+```html
+<script type="text/plain">
+  <!--<script>-->
+</script>
+```
+
+Figure this out:
+
+```html
+<script type="text/plain"><!--
+  <script>
+    alert();
+  </script>
+  </script
+  <script>
+    alert();
+  </script>
+</script>
+<h1>Test</h1>
+```
--- a/src/proc.rs
+++ b/src/proc.rs
@ -48,11 +48,25 @@ pub struct ProcessorRange {
    end: usize,
 }

+impl ProcessorRange {
+    /// Length of the range, i.e. `end - start`.
+    ///
+    /// `empty()` treats a range whose `start` is at or past its `end` as
+    /// empty, so such a range reports a length of zero here rather than
+    /// underflowing the subtraction.
+    pub fn len(&self) -> usize {
+        self.end.saturating_sub(self.start)
+    }
+    /// Whether the range covers nothing, i.e. `start` is at or past `end`.
+    pub fn empty(&self) -> bool {
+        self.start >= self.end
+    }
+}
+
 // Processing state of a file. Most fields are used internally and set during
 // processing. Single use only; create one per processing.
 pub struct Processor<'d> {
    code: &'d mut [u8],

+    // Index of the next character to read.
+    read_next: usize,
+    // Index of the next unwritten space.
+    write_next: usize,
+
    // Match.
    // Need to record start as we might get slice after keeping or skipping.
    match_start: usize,
@ -63,11 +77,6 @@ pub struct Processor<'d> {
    // Character matched, if any. Only exists for single-character matches and if matched.
    match_char: Option<u8>,
    match_reason: RequireReason,
-
-    // Index of the next character to read.
-    read_next: usize,
-    // Index of the next unwritten space.
-    write_next: usize,
 }

 impl<'d> Index<ProcessorRange> for Processor<'d> {
@ -314,6 +323,10 @@ impl<'d> Processor<'d> {
    pub fn erase_written(&mut self, checkpoint: Checkpoint) -> () {
        self.write_next = checkpoint.write_next;
    }
+    /// Range spanning from the checkpoint's write position to the current write position.
+    pub fn written_range(&self, checkpoint: Checkpoint) -> ProcessorRange {
+        let start = checkpoint.write_next;
+        let end = self.write_next;
+        ProcessorRange { start, end }
+    }
    /// Get amount of source characters consumed since checkpoint.
    pub fn consumed_count(&self, checkpoint: Checkpoint) -> usize {
        self.read_next - checkpoint.read_next
--- a/src/unit/attr/mod.rs
+++ b/src/unit/attr/mod.rs
@ -1,7 +1,7 @@
 use phf::{phf_set, Set};

 use crate::err::ProcessingResult;
-use crate::proc::Processor;
+use crate::proc::{Processor, ProcessorRange};
 use crate::spec::codepoint::is_control;
 use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue};

@ -18,6 +18,12 @@ pub enum AttrType {
    NoValue,
 }

+pub struct ProcessedAttr { // Result returned by `process_attr` for a single attribute.
+    pub name: ProcessorRange, // Range of the kept attribute name.
+    pub typ: AttrType, // `NoValue`, `Unquoted` or `Quoted`, depending on how the value was emitted.
+    pub value: Option<ProcessorRange>, // Range of the processed value; `None` when there is no value or it is empty.
+}
+
 // Characters allowed in an attribute name.
 // NOTE: Unicode noncharacters not tested.
 // See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
@ -28,26 +34,28 @@ fn is_name_char(c: u8) -> bool {
    }
 }

-pub fn process_attr(proc: &mut Processor) -> ProcessingResult<AttrType> {
+pub fn process_attr(proc: &mut Processor) -> ProcessingResult<ProcessedAttr> {
    // Expect `process_attr` to be called at an attribute.
-    let name = chain!(proc.match_while_pred(is_name_char).expect().keep().slice());
+    let name = chain!(proc.match_while_pred(is_name_char).expect().keep().range());
    let after_name = proc.checkpoint();

    // TODO DOC Attr must be case sensitive
-    let should_collapse_and_trim_value_ws = COLLAPSIBLE_AND_TRIMMABLE_ATTRS.contains(name);
+    let should_collapse_and_trim_value_ws = COLLAPSIBLE_AND_TRIMMABLE_ATTRS.contains(&proc[name]);
    let has_value = chain!(proc.match_char(b'=').keep().matched());

-    if !has_value {
-        Ok(AttrType::NoValue)
+    let (typ, value) = if !has_value {
+        (AttrType::NoValue, None)
    } else {
        match process_attr_value(proc, should_collapse_and_trim_value_ws)? {
-            ProcessedAttrValue { empty: true, .. } => {
+            ProcessedAttrValue { value: None, .. } => {
                // Value is empty, which is equivalent to no value, so discard `=` and any quotes.
                proc.erase_written(after_name);
-                Ok(AttrType::NoValue)
+                (AttrType::NoValue, None)
            }
-            ProcessedAttrValue { delimiter: DelimiterType::Unquoted, .. } => Ok(AttrType::Unquoted),
-            ProcessedAttrValue { delimiter: DelimiterType::Double, .. } | ProcessedAttrValue { delimiter: DelimiterType::Single, .. } => Ok(AttrType::Quoted),
+            ProcessedAttrValue { delimiter: DelimiterType::Unquoted, value } => (AttrType::Unquoted, value),
+            ProcessedAttrValue { delimiter: DelimiterType::Double, value } | ProcessedAttrValue { delimiter: DelimiterType::Single, value } => (AttrType::Quoted, value),
        }
-    }
+    };
+
+    Ok(ProcessedAttr { name, typ, value })
 }
--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@ -1,7 +1,7 @@
 use phf::{Map, phf_map};

 use crate::err::ProcessingResult;
-use crate::proc::Processor;
+use crate::proc::{Processor, ProcessorRange};
 use crate::spec::codepoint::is_whitespace;
 use crate::unit::entity::{EntityType, maybe_process_entity, ParsedEntity};

@ -216,7 +216,7 @@ macro_rules! consume_attr_value_chars {

 pub struct ProcessedAttrValue {
    pub delimiter: DelimiterType,
-    pub empty: bool,
+    pub value: Option<ProcessorRange>,
 }

 pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
@ -229,7 +229,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
    };

    // Stage 1: read and collect metrics on attribute value characters.
-    let value_start_checkpoint = proc.checkpoint();
+    let src_value_checkpoint = proc.checkpoint();
    let mut metrics = Metrics {
        count_double_quotation: 0,
        count_single_quotation: 0,
@ -245,7 +245,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
    });

    // Stage 2: optimally minify attribute value using metrics.
-    proc.restore(value_start_checkpoint);
+    proc.restore(src_value_checkpoint);
    let optimal_delimiter = metrics.get_optimal_delimiter_type();
    let optimal_delimiter_char = match optimal_delimiter {
        DelimiterType::Double => Some(b'"'),
@ -259,6 +259,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
    let mut char_type;
    // Used to determine first and last characters.
    let mut char_no = 0usize;
+    let processed_value_checkpoint = proc.checkpoint();
    consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, char_type, {
        match char_type {
            // This should never happen.
@ -293,6 +294,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
        };
        char_no += 1;
    });
+    let processed_value_range = proc.written_range(processed_value_checkpoint);
    // Ensure closing delimiter in src has been matched and discarded, if any.
    if let Some(c) = src_delimiter {
        chain!(proc.match_char(c).expect().discard());
@ -304,6 +306,6 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo

    Ok(ProcessedAttrValue {
        delimiter: optimal_delimiter,
-        empty: metrics.collected_count == 0,
+        value: Some(processed_value_range).filter(|r| !r.empty()),
    })
 }
--- a/src/unit/script/js.rs
+++ b/src/unit/script/js.rs
@ -9,7 +9,7 @@ fn parse_comment_single(proc: &mut Processor) -> ProcessingResult<()> {
    chain!(proc.match_seq(b"//").expect().keep());

    // Comment can end at closing </script>.
-    // WARNING: Closing tag must not contain whitespace.
+    // TODO WARNING: Closing tag must not contain whitespace.
    // TODO Optimise
    while !chain!(proc.match_line_terminator().keep().matched()) {
        if chain!(proc.match_seq(b"</script>").matched()) {
@ -26,7 +26,7 @@ fn parse_comment_multi(proc: &mut Processor) -> ProcessingResult<()> {
    chain!(proc.match_seq(b"/*").expect().keep());

    // Comment can end at closing </script>.
-    // WARNING: Closing tag must not contain whitespace.
+    // TODO WARNING: Closing tag must not contain whitespace.
    // TODO Optimise
    while !chain!(proc.match_seq(b"*/").keep().matched()) {
        if chain!(proc.match_seq(b"</script>").matched()) {
@ -91,7 +91,7 @@ fn parse_template(proc: &mut Processor) -> ProcessingResult<()> {
    Ok(())
 }

-pub fn process_script(proc: &mut Processor) -> ProcessingResult<()> {
+pub fn process_js_script(proc: &mut Processor) -> ProcessingResult<()> {
    while !chain!(proc.match_seq(b"</").matched()) {
        if chain!(proc.match_seq(b"//").matched()) {
            parse_comment_single(proc)?;
--- a/src/unit/script/mod.rs
+++ b/src/unit/script/mod.rs
@ -0,0 +1,2 @@
+pub mod js;
+pub mod text;
--- a/src/unit/script/text.rs
+++ b/src/unit/script/text.rs
@ -0,0 +1,37 @@
+use crate::proc::Processor;
+use crate::err::ProcessingResult;
+use crate::spec::codepoint::is_whitespace;
+
+/// Processes the content of a `<script>` element that is handled as text.
+///
+/// Content is kept until a `</script` is matched that does not pair with a
+/// `<script>` seen inside the current HTML comment; the function then returns
+/// without keeping or discarding that match.
+/// NOTE: See "notes/Text script content.md".
+pub fn process_text_script(proc: &mut Processor) -> ProcessingResult<()> {
+    // Set by `<!--`, cleared by `-->`.
+    let mut inside_comment = false;
+    // Set when a `<script>` is seen inside a comment; cleared by `-->` or a `</script>`.
+    let mut nested_script_open = false;
+    loop {
+        // TODO Optimise
+        if chain!(proc.match_seq(b"<!--").keep().matched()) {
+            // NOTE: May already be inside a comment, so `nested_script_open` is left untouched.
+            inside_comment = true;
+        } else if chain!(proc.match_seq(b"-->").keep().matched()) {
+            inside_comment = false;
+            nested_script_open = false;
+        } else if inside_comment && chain!(proc.match_seq(b"<script").keep().matched()) {
+            // TODO DOC Case sensitive, no space before tag name, nothing else in tag.
+            // TODO Opening tag can have attributes, whitespace, etc.
+            chain!(proc.match_char(b'>').require().keep());
+            nested_script_open = true;
+        } else if chain!(proc.match_seq(b"</script").matched()) {
+            // TODO DOC Case sensitive, no space before tag name, nothing else in tag.
+            if !nested_script_open {
+                // Nothing nested is pending, so this closing tag ends the main script.
+                return Ok(());
+            }
+            // The closing tag pairs with the nested script(s): keep the matched tag start.
+            proc.keep();
+            // TODO Close tag can have whitespace.
+            chain!(proc.match_char(b'>').require().keep());
+            nested_script_open = false;
+        } else {
+            proc.accept()?;
+        }
+    }
+}
--- a/src/unit/tag.rs
+++ b/src/unit/tag.rs
@ -1,11 +1,32 @@
 use crate::err::{ErrorType, ProcessingResult};
-use crate::proc::Processor;
+use crate::proc::{Processor, ProcessorRange};
 use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
 use crate::spec::tag::void::VOID_TAGS;
-use crate::unit::attr::{AttrType, process_attr};
+use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
 use crate::unit::content::process_content;
-use crate::unit::script::process_script;
 use crate::unit::style::process_style;
+use phf::{Set, phf_set};
+use crate::unit::script::text::process_text_script;
+use crate::unit::script::js::process_js_script;
+
+pub static JAVASCRIPT_MIME_TYPES: Set<&'static [u8]> = phf_set! { // `type` attribute values for which a `<script>` body is processed as JavaScript.
+    b"application/ecmascript",
+    b"application/javascript",
+    b"application/x-ecmascript",
+    b"application/x-javascript",
+    b"text/ecmascript",
+    b"text/javascript",
+    b"text/javascript1.0",
+    b"text/javascript1.1",
+    b"text/javascript1.2",
+    b"text/javascript1.3",
+    b"text/javascript1.4",
+    b"text/javascript1.5",
+    b"text/jscript",
+    b"text/livescript",
+    b"text/x-ecmascript",
+    b"text/x-javascript",
+}; // NOTE(review): lookups are byte-exact and `module` is not listed — confirm both are intended.

 // Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
 // See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
@ -36,6 +57,8 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {

    let mut last_attr_type: Option<AttrType> = None;
    let mut self_closing = false;
+    // Value of any "type" attribute; if multiple, last kept.
+    let mut attr_type_value: Option<ProcessorRange> = None;

    loop {
        // At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
@ -57,12 +80,18 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
        }

        // Write space after tag name or unquoted/valueless attribute.
+        // Don't write after quoted.
        match last_attr_type {
            Some(AttrType::Unquoted) | Some(AttrType::NoValue) | None => proc.write(b' '),
            _ => {}
        };

-        last_attr_type = Some(process_attr(proc)?);
+        let ProcessedAttr { name, typ, value } = process_attr(proc)?;
+        match &proc[name] {
+            b"type" => attr_type_value = value,
+            _ => {},
+        };
+        last_attr_type = Some(typ);
    };

    if self_closing || VOID_TAGS.contains(&proc[opening_name_range]) {
@ -70,7 +99,11 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
    };

    match tag_type {
-        TagType::Script => process_script(proc)?,
+        TagType::Script => if attr_type_value.is_none() || attr_type_value.filter(|n| JAVASCRIPT_MIME_TYPES.contains(&proc[*n])).is_some() {
+            process_js_script(proc)?;
+        } else {
+            process_text_script(proc)?;
+        },
        TagType::Style => process_style(proc)?,
        _ => process_content(proc, Some(opening_name_range))?,
    };