diff --git a/ExampleApp/Program.cs b/ExampleApp/Program.cs index ac211f9..f3e15f7 100644 --- a/ExampleApp/Program.cs +++ b/ExampleApp/Program.cs @@ -17,9 +17,11 @@ var tokenizer = new HTMLTokenizer(new ByteBuffer(html)); +var tokens = new List(); + while (tokenizer.NextToken() is { } token) { - + tokens.Add(token); } sw.Stop(); diff --git a/HTML.NET.sln.DotSettings.user b/HTML.NET.sln.DotSettings.user index c63b261..cef036a 100644 --- a/HTML.NET.sln.DotSettings.user +++ b/HTML.NET.sln.DotSettings.user @@ -1,4 +1,4 @@  - <SessionState ContinuousTestingMode="0" IsActive="True" Name="All tests from &lt;UnitTests&gt;" xmlns="urn:schemas-jetbrains-com:jetbrains-ut-session"> - <Project Location="E:\projects\LibHtmlNet\UnitTests" Presentation="&lt;UnitTests&gt;" /> + <SessionState ContinuousTestingMode="0" IsActive="True" Name="All tests from &lt;UnitTests&gt;" xmlns="urn:schemas-jetbrains-com:jetbrains-ut-session"> + <Project Location="E:\projects\LibHtmlNet\UnitTests" Presentation="&lt;UnitTests&gt;" /> </SessionState> \ No newline at end of file diff --git a/HTMLParser/src/ByteBuffer.cs b/HTMLParser/src/ByteBuffer.cs index 4b24d6b..a55e905 100644 --- a/HTMLParser/src/ByteBuffer.cs +++ b/HTMLParser/src/ByteBuffer.cs @@ -20,7 +20,7 @@ public ByteBuffer(byte[] data) [Pure] public bool IsEndOfBuffer() { - return !CanPeekByte(1); + return Position >= Length; } [Pure] @@ -43,46 +43,28 @@ public bool MatchCaseInsensitiveString(string word) [Pure] public byte PeekByte(int offset = 0) { - AssertRead(); - return _data[Position + offset]; + return _data[(int)Position + offset]; } [Pure] private Span PeekBytes(int count) { - AssertRead(count); return new Span(_data, (int)Position, count); } public void UnreadByte() { - if (Position <= 0) - throw new ArgumentOutOfRangeException(nameof(Position), "Cannot unread byte under position 0"); Position--; } public void Skip(int count) { - AssertRead(count); Position += count; } [Pure] public byte ReadByte() { - AssertRead(); return _data[Position++]; } - - [Pure] - private bool CanPeekByte(int count) - { - return Position + count <= Length; - } - - private void AssertRead(int count = 1) - { - if (!CanPeekByte(count)) - throw new ArgumentOutOfRangeException(nameof(Position), "Cannot read past the end of the buffer"); - } } \ No newline at end of file diff --git a/HTMLParser/src/Parser/HTMLTokenizer.States.cs b/HTMLParser/src/Parser/HTMLTokenizer.States.cs index 46d33bb..0f9e434 100644 --- a/HTMLParser/src/Parser/HTMLTokenizer.States.cs +++ b/HTMLParser/src/Parser/HTMLTokenizer.States.cs @@ -67,15 +67,15 @@ private void DataState(char currentInputCharacter) // Emit the current input character as a character token case '\0': - var token = CurrentToken(); + var token = CurrentToken(HTMLTokenType.Character); LogParseError("unexpected-null-character", token); - EmitToken(currentInputCharacter.ToString()); + EmitToken(HTMLTokenType.Character, currentInputCharacter.ToString()); break; // Anything else default: // Emit the current input character as a character token - EmitToken(currentInputCharacter); + EmitToken(HTMLTokenType.Character, currentInputCharacter); break; } @@ -102,23 +102,23 @@ private void TagOpenState(char currentInputCharacter) // Create a new start tag token, set its tag name to the empty string. Reconsume in the tag name state. case >= 'A' and <= 'Z': // A-Z case >= 'a' and <= 'z': // a-z - CurrentToken(); + CurrentToken(HTMLTokenType.StartTag); SwitchState(HtmlTokenizerState.TagName, true); break; // This is an unexpected-question-mark-instead-of-tag-name parse error. // Create a comment token whose data is the empty string. Reconsume in the bogus comment state. case '?': // ? - LogParseError("unexpected-question-mark-instead-of-tag-name", CurrentToken()); - EmitToken(); + LogParseError("unexpected-question-mark-instead-of-tag-name", CurrentToken(HTMLTokenType.Comment)); + EmitToken(HTMLTokenType.Comment); SwitchState(HtmlTokenizerState.BogusComment, true); break; // This is an invalid-first-character-of-tag-name parse error. // Emit a U+003C LESS-THAN SIGN character token. Reconsume in the data state. default: - LogParseError("invalid-first-character-of-tag-name", CurrentToken()); - EmitToken('<'); + LogParseError("invalid-first-character-of-tag-name", CurrentToken(HTMLTokenType.Character)); + EmitToken(HTMLTokenType.Character, '<'); SwitchState(HtmlTokenizerState.Data, true); break; } @@ -136,14 +136,14 @@ private void EndTagOpenState(char currentInputCharacter) // Reconsume in the tag name state. case >= 'A' and <= 'Z': // A-Z case >= 'a' and <= 'z': // a-z - CurrentToken(); + CurrentToken(HTMLTokenType.EndTag); SwitchState(HtmlTokenizerState.TagName, true); break; // This is a missing-end-tag-name parse error. // Switch to the data state. case '>': // > - LogParseError("missing-end-tag-name", CurrentToken()); + LogParseError("missing-end-tag-name", CurrentToken(HTMLTokenType.Character)); SwitchState(HtmlTokenizerState.Data); break; @@ -151,8 +151,8 @@ private void EndTagOpenState(char currentInputCharacter) // Create a comment token whose data is the empty string. // Reconsume in the bogus comment state. default: - LogParseError("invalid-first-character-of-tag-name", CurrentToken()); - EmitToken(); + LogParseError("invalid-first-character-of-tag-name", CurrentToken(HTMLTokenType.Comment)); + EmitToken(HTMLTokenType.Comment); SwitchState(HtmlTokenizerState.BogusComment); break; } @@ -181,20 +181,20 @@ private void TagNameState(char currentInputCharacter) // Switch to the data state. Emit the current tag token. case '>': // > - EmitToken(); + EmitToken(HTMLTokenType.Tag); SwitchState(HtmlTokenizerState.Data); break; // ASCII upper alpha character // Append the lowercase version of the current input character (add 0x0020 to the character's code point) to the current tag token's tag name. case >= 'A' and <= 'Z': // A-Z - CurrentToken().TagName += currentInputCharacter + 0x20; + CurrentToken(HTMLTokenType.Tag).TagName += currentInputCharacter + 0x20; break; // Anything else // Append the current input character to the current tag token's tag name. default: - CurrentToken().TagName += currentInputCharacter; + CurrentToken(HTMLTokenType.Tag).TagName += currentInputCharacter; break; } @@ -224,8 +224,8 @@ private void BeforeAttributeNameState(char currentInputCharacter) // Start a new attribute in the current tag token. Set that attribute's name to the current input character, // and its value to the empty string. Switch to the attribute name state. case '=': // = - LogParseError("unexpected-equals-sign-before-attribute-name", CurrentToken()); - CurrentToken().NewAttribute(currentInputCharacter); + LogParseError("unexpected-equals-sign-before-attribute-name", CurrentToken(HTMLTokenType.Character)); + CurrentToken(HTMLTokenType.StartTag).NewAttribute(currentInputCharacter); SwitchState(HtmlTokenizerState.AttributeName); break; @@ -233,7 +233,7 @@ private void BeforeAttributeNameState(char currentInputCharacter) // Start a new attribute in the current tag token. // Set that attribute's name and value to the empty string. default: - CurrentToken().NewAttribute(); + CurrentToken(HTMLTokenType.StartTag).NewAttribute(); SwitchState(HtmlTokenizerState.AttributeName, true); break; } @@ -265,14 +265,14 @@ private void AttributeNameState(char currentInputCharacter) // ASCII upper alpha character // Append the lowercase version of the current input character (add 0x0020 to the character's code point) to the current attribute's name. case >= 'A' and <= 'Z': // A-Z - CurrentToken().AddAttributeName((char)(currentInputCharacter + 0x20)); + CurrentToken(HTMLTokenType.StartTag).AddAttributeName((char)(currentInputCharacter + 0x20)); break; // This is an unexpected-null-character parse error. // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's name. case '\0': // NULL - LogParseError("unexpected-null-character", CurrentToken()); - CurrentToken().AddAttributeName("\uFFFD"); + LogParseError("unexpected-null-character", CurrentToken(HTMLTokenType.Character)); + CurrentToken(HTMLTokenType.StartTag).AddAttributeName("\uFFFD"); break; // This is an unexpected-character-in-attribute-name parse error. @@ -280,14 +280,14 @@ private void AttributeNameState(char currentInputCharacter) case '"': // " case '\'': // ' case '<': // < - LogParseError("unexpected-character-in-attribute-name", CurrentToken()); - CurrentToken().AddAttributeName(currentInputCharacter); + LogParseError("unexpected-character-in-attribute-name", CurrentToken(HTMLTokenType.Character)); + CurrentToken(HTMLTokenType.StartTag).AddAttributeName(currentInputCharacter); break; // Anything else // Append the current input character to the current attribute's name. default: - CurrentToken().AddAttributeName(currentInputCharacter); + CurrentToken(HTMLTokenType.StartTag).AddAttributeName(currentInputCharacter); break; } } @@ -301,7 +301,7 @@ private void MarkupDeclarationOpenState(char currentInputCharacter) if (currentInputCharacter == '-' && _buffer.PeekByte() == '-') { Skip(1); - CurrentToken(); + CurrentToken(HTMLTokenType.Comment); SwitchState(HtmlTokenizerState.CommentStart); } @@ -328,8 +328,8 @@ private void MarkupDeclarationOpenState(char currentInputCharacter) // Switch to the bogus comment state (don't consume anything in the current state). else { - LogParseError("incorrectly-opened-comment", CurrentToken()); - CurrentToken(); + LogParseError("incorrectly-opened-comment", CurrentToken(HTMLTokenType.Comment)); + CurrentToken(HTMLTokenType.Comment); SwitchState(HtmlTokenizerState.BogusComment); } } @@ -357,7 +357,7 @@ private void DocTypeState(char currentInputCharacter) // This is a missing-whitespace-before-doctype-name parse error. // Reconsume in the before DOCTYPE name state. default: - LogParseError("missing-whitespace-before-doctype-name", CurrentToken()); + LogParseError("missing-whitespace-before-doctype-name", CurrentToken(HTMLTokenType.Character)); SwitchState(HtmlTokenizerState.BeforeDocTypeName, true); break; } @@ -382,7 +382,7 @@ private void BeforeDocTypeName(char currentInputCharacter) // Create a new DOCTYPE token. Set its name to the lowercase version of the current input character (add 0x0020 to the character's code point). // Switch to the DOCTYPE name state. case >= 'A' and <= 'Z': // A-Z - CurrentToken().Name = + CurrentToken(HTMLTokenType.DOCTYPE).Name = Encoding.UTF8.GetString(new[] { (byte)(currentInputCharacter + 0x20) }); SwitchState(HtmlTokenizerState.DocTypeName); break; @@ -391,8 +391,8 @@ private void BeforeDocTypeName(char currentInputCharacter) // Create a new DOCTYPE token. Set its name to a U+FFFD REPLACEMENT CHARACTER character. // Switch to the DOCTYPE name state. case '\0': // NULL - CurrentToken().ForceQuirks = true; - EmitToken(); + CurrentToken(HTMLTokenType.DOCTYPE).ForceQuirks = true; + EmitToken(HTMLTokenType.DOCTYPE); SwitchState(HtmlTokenizerState.Data); break; @@ -401,8 +401,8 @@ private void BeforeDocTypeName(char currentInputCharacter) // Emit the token. // Switch to the data state. case '>': // > - CurrentToken().ForceQuirks = true; - EmitToken(); + CurrentToken(HTMLTokenType.DOCTYPE).ForceQuirks = true; + EmitToken(HTMLTokenType.DOCTYPE); SwitchState(HtmlTokenizerState.Data); break; @@ -410,7 +410,7 @@ private void BeforeDocTypeName(char currentInputCharacter) // Create a new DOCTYPE token. Set its name to the current input character. // Switch to the DOCTYPE name state. default: - CurrentToken().Name = currentInputCharacter.ToString(); + CurrentToken(HTMLTokenType.DOCTYPE).Name = currentInputCharacter.ToString(); SwitchState(HtmlTokenizerState.DocTypeName); break; } @@ -432,27 +432,27 @@ private void DocTypeNameState(char currentInputCharacter) // Switch to the data state. // Emit the current DOCTYPE token. case '>': // > - EmitToken(); + EmitToken(HTMLTokenType.DOCTYPE); SwitchState(HtmlTokenizerState.Data); break; // ASCII upper alpha character // Append the lowercase version of the current input character (add 0x0020 to the character's code point) to the current DOCTYPE token's name. case >= 'A' and <= 'Z': // A-Z - CurrentToken().Name += (char)(currentInputCharacter + 0x20); + CurrentToken(HTMLTokenType.DOCTYPE).Name += (char)(currentInputCharacter + 0x20); break; // This is an unexpected-null-character parse error. // Append a U+FFFD REPLACEMENT CHARACTER character to the current DOCTYPE token's name. case '\0': // NULL - LogParseError("unexpected-null-character", CurrentToken()); - CurrentToken().Name += "\uFFFD"; + LogParseError("unexpected-null-character", CurrentToken(HTMLTokenType.Character)); + CurrentToken(HTMLTokenType.DOCTYPE).Name += "\uFFFD"; break; // Anything else // Append the current input character to the current DOCTYPE token's name. default: - CurrentToken().Name += currentInputCharacter; + CurrentToken(HTMLTokenType.DOCTYPE).Name += currentInputCharacter; break; } @@ -486,9 +486,9 @@ private void BeforeAttributeValueState(char currentInputCharacter) // This is a missing-attribute-value parse error. // Switch to the data state. // Emit the current tag token. - LogParseError("missing-attribute-value", CurrentToken()); + LogParseError("missing-attribute-value", CurrentToken(HTMLTokenType.Tag)); SwitchState(HtmlTokenizerState.Data); - EmitToken(); + EmitToken(HTMLTokenType.Tag); break; // Reconsume in the attribute value (unquoted) state. @@ -514,7 +514,7 @@ private void CommentStartState(char currentInputCharacter) // Emit the comment token. case '>': // > SwitchState(HtmlTokenizerState.Data); - EmitToken(); + EmitToken(HTMLTokenType.Comment); break; // Anything else @@ -592,7 +592,7 @@ private void HexadecimalCharacterReferenceStartState(char currentInputCharacter) // This is a missing-semicolon-after-character-reference parse error. // Reconsume in the numeric character reference end state. default: - LogParseError("missing-semicolon-after-character-reference", CurrentToken()); + LogParseError("missing-semicolon-after-character-reference", CurrentToken(HTMLTokenType.Character)); SwitchState(HtmlTokenizerState.NumericCharacterReferenceEnd, true); break; } @@ -615,7 +615,7 @@ private void DecimalCharacterReferenceStartState(char currentInputCharacter) // Flush code points consumed as a character reference. // Reconsume in the return state. default: - LogParseError("absence-of-digits-in-numeric-character-reference", CurrentToken()); + LogParseError("absence-of-digits-in-numeric-character-reference", CurrentToken(HTMLTokenType.Character)); FlushCodePointsConsumedAsCharacterReference(); SwitchState(_returnState, true); break; @@ -630,7 +630,7 @@ private void NumericCharacterReferenceEndState(char currentInputCharacter) // Set the character reference code to 0xFFFD. if (_characterReferenceCode == 0x00) { - LogParseError("null-character-reference", CurrentToken()); + LogParseError("null-character-reference", CurrentToken(HTMLTokenType.Character)); _characterReferenceCode = 0xFFFD; } @@ -638,7 +638,7 @@ private void NumericCharacterReferenceEndState(char currentInputCharacter) // Set the character reference code to 0xFFFD. else if (_characterReferenceCode > 0x10FFFF) { - LogParseError("character-reference-outside-unicode-range", CurrentToken()); + LogParseError("character-reference-outside-unicode-range", CurrentToken(HTMLTokenType.Character)); _characterReferenceCode = 0xFFFD; } @@ -646,21 +646,21 @@ private void NumericCharacterReferenceEndState(char currentInputCharacter) // Set the character reference code to 0xFFFD. else if (IsSurrogate(_characterReferenceCode)) { - LogParseError("surrogate-character-reference", CurrentToken()); + LogParseError("surrogate-character-reference", CurrentToken(HTMLTokenType.Character)); _characterReferenceCode = 0xFFFD; } // If the number is a noncharacter, then this is a noncharacter-character-reference parse error. else if (IsNonCharacter(_characterReferenceCode)) { - LogParseError("noncharacter-character-reference", CurrentToken()); + LogParseError("noncharacter-character-reference", CurrentToken(HTMLTokenType.Character)); } // If the number is 0x0D, or a control that's not ASCII whitespace, then this is a control-character-reference parse error. else if (_characterReferenceCode == 0x0D || (IsControl(_characterReferenceCode) && !IsWhiteSpace(_characterReferenceCode))) { - LogParseError("control-character-reference", CurrentToken()); + LogParseError("control-character-reference", CurrentToken(HTMLTokenType.Character)); // If the number is one of the numbers in the first column of the following table, // then find the row with that number in the first column, @@ -741,7 +741,7 @@ private void DecimalCharacterReferenceState(char currentInputCharacter) // This is a missing-semicolon-after-character-reference parse error. // Reconsume in the numeric character reference end state. default: - LogParseError("missing-semicolon-after-character-reference", CurrentToken()); + LogParseError("missing-semicolon-after-character-reference", CurrentToken(HTMLTokenType.Character)); SwitchState(HtmlTokenizerState.NumericCharacterReferenceEnd, true); break; } @@ -756,7 +756,7 @@ private void CommentState(char currentInputCharacter) // Append the current input character to the comment token's data. // Switch to the comment less-than sign state. case '<': // < - CurrentToken().Data.Append(currentInputCharacter); + CurrentToken(HTMLTokenType.Comment).Data.Append(currentInputCharacter); SwitchState(HtmlTokenizerState.CommentLessThanSign); break; @@ -768,13 +768,13 @@ private void CommentState(char currentInputCharacter) // This is an unexpected-null-character parse error. // Append a U+FFFD REPLACEMENT CHARACTER character to the comment token's data. case '\0': // NULL - CurrentToken().Data.Append('\uFFFD'); + CurrentToken(HTMLTokenType.Comment).Data.Append('\uFFFD'); break; // Anything else // Append the current input character to the comment token's data. default: - CurrentToken().Data.Append(currentInputCharacter); + CurrentToken(HTMLTokenType.Comment).Data.Append(currentInputCharacter); break; } @@ -806,14 +806,14 @@ private void AttributeValueUnquotedState(char currentInputCharacter) // Emit the current tag token. case '>': SwitchState(HtmlTokenizerState.Data); - EmitToken(); + EmitToken(HTMLTokenType.Tag); break; // This is an unexpected-null-character parse error. // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value. case '\0': - LogParseError("unexpected-null-character", CurrentToken()); - CurrentToken().AddAttributeValue("\uFFFD"); + LogParseError("unexpected-null-character", CurrentToken(HTMLTokenType.StartTag)); + CurrentToken(HTMLTokenType.StartTag).AddAttributeValue("\uFFFD"); break; // This is an unexpected-character-in-unquoted-attribute-value parse error. @@ -824,14 +824,14 @@ private void AttributeValueUnquotedState(char currentInputCharacter) case '<': case '=': case '`': - LogParseError("unexpected-character-in-unquoted-attribute-value", CurrentToken()); - CurrentToken().AddAttributeValue(currentInputCharacter); + LogParseError("unexpected-character-in-unquoted-attribute-value", CurrentToken(HTMLTokenType.StartTag)); + CurrentToken(HTMLTokenType.StartTag).AddAttributeValue(currentInputCharacter); break; // Anything else // Append the current input character to the current attribute's value. default: - CurrentToken().AddAttributeValue(currentInputCharacter); + CurrentToken(HTMLTokenType.StartTag).AddAttributeValue(currentInputCharacter); break; } @@ -859,14 +859,14 @@ private void AttributeValueSingleQuotedState(char currentInputCharacter) // This is an unexpected-null-character parse error. // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value. case '\0': - LogParseError("unexpected-null-character", CurrentToken()); - CurrentToken().AddAttributeValue("\uFFFD"); + LogParseError("unexpected-null-character", CurrentToken(HTMLTokenType.StartTag)); + CurrentToken(HTMLTokenType.StartTag).AddAttributeValue("\uFFFD"); break; // Anything else // Append the current input character to the current attribute's value. default: - CurrentToken().AddAttributeValue(currentInputCharacter); + CurrentToken(HTMLTokenType.StartTag).AddAttributeValue(currentInputCharacter); break; } } @@ -892,13 +892,13 @@ private void AttributeValueDoubleQuotedState(char currentInputCharacter) case '\0': // NULL // This is an unexpected-null-character parse error. // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute value. - CurrentToken().AddAttributeValue("\uFFFD"); + CurrentToken(HTMLTokenType.StartTag).AddAttributeValue("\uFFFD"); break; // Anything else // Append the current input character to the current attribute value. default: - CurrentToken().AddAttributeValue(currentInputCharacter); + CurrentToken(HTMLTokenType.StartTag).AddAttributeValue(currentInputCharacter); break; } } @@ -907,7 +907,7 @@ private void AttributeValueDoubleQuotedState(char currentInputCharacter) // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state private void AfterAttributeValueQuotedState(char currentInputCharacter) { - CurrentToken().FinishAttribute(); + CurrentToken(HTMLTokenType.Tag).FinishAttribute(); switch (currentInputCharacter) { @@ -927,7 +927,7 @@ private void AfterAttributeValueQuotedState(char currentInputCharacter) // Switch to the data state. case '>': // > SwitchState(HtmlTokenizerState.Data); - EmitToken(); + EmitToken(HTMLTokenType.Tag); break; // Anything else @@ -956,7 +956,7 @@ private void CommentEndDashState(char currentInputCharacter) // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data. // Reconsume in the comment state. default: - CurrentToken().Data.Append('-'); + CurrentToken(HTMLTokenType.Comment).Data.Append('-'); SwitchState(HtmlTokenizerState.Comment, true); break; } @@ -974,7 +974,7 @@ private void CommentEndState(char currentInputCharacter) // Emit the current comment token. case '>': // > SwitchState(HtmlTokenizerState.Data); - EmitToken(CurrentToken().Data.ToString()); + EmitToken(HTMLTokenType.Comment, CurrentToken(HTMLTokenType.Comment).Data.ToString()); break; // Switch to the comment end bang state. @@ -984,13 +984,13 @@ private void CommentEndState(char currentInputCharacter) // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data. case '-': // - - CurrentToken().Data.Append('-'); + CurrentToken(HTMLTokenType.Comment).Data.Append('-'); break; // Append two U+002D HYPHEN-MINUS characters (-) to the comment token's data. // Reconsume in the comment state. default: - CurrentToken().Data.Append("--"); + CurrentToken(HTMLTokenType.Comment).Data.Append("--"); SwitchState(HtmlTokenizerState.Comment, true); break; } @@ -1005,7 +1005,7 @@ private void CommentEndBangState(char currentInputCharacter) // Append two U+002D HYPHEN-MINUS characters (-) and a U+0021 EXCLAMATION MARK character (!) to the comment token's data. // Switch to the comment end dash state. case '-': // - - CurrentToken().Data.Append("--!"); + CurrentToken(HTMLTokenType.Comment).Data.Append("--!"); SwitchState(HtmlTokenizerState.CommentEndDash); break; @@ -1013,16 +1013,16 @@ private void CommentEndBangState(char currentInputCharacter) // Switch to the data state. // Emit the current comment token. case '>': // > - LogParseError("incorrectly-closed-comment", CurrentToken()); + LogParseError("incorrectly-closed-comment", CurrentToken(HTMLTokenType.Comment)); SwitchState(HtmlTokenizerState.Data); - EmitToken(); + EmitToken(HTMLTokenType.Comment); break; // Anything else // Append two U+002D HYPHEN-MINUS characters (-) and a U+0021 EXCLAMATION MARK character (!) to the comment token's data. // Reconsume in the comment state. default: - CurrentToken().Data.Append("--!"); + CurrentToken(HTMLTokenType.Comment).Data.Append("--!"); SwitchState(HtmlTokenizerState.Comment, true); break; } @@ -1045,15 +1045,15 @@ private void CommentStartDashState(char currentInputCharacter) // Switch to the data state. // Emit the current comment token. case '>': // > - LogParseError("abrupt-closing-of-empty-comment", CurrentToken()); + LogParseError("abrupt-closing-of-empty-comment", CurrentToken(HTMLTokenType.Comment)); SwitchState(HtmlTokenizerState.Data); - EmitToken(); + EmitToken(HTMLTokenType.Comment); break; // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data. // Reconsume in the comment state. default: - CurrentToken().Data.Append('-'); + CurrentToken(HTMLTokenType.Comment).Data.Append('-'); SwitchState(HtmlTokenizerState.Comment, true); break; } @@ -1069,20 +1069,20 @@ private void BogusCommentState(char currentInputCharacter) // Emit the current comment token. case '>': // > SwitchState(HtmlTokenizerState.Data); - EmitToken(); + EmitToken(HTMLTokenType.Comment); break; // This is an unexpected-null-character parse error. // Append a U+FFFD REPLACEMENT CHARACTER character to the comment token's data. case '\0': // NULL - LogParseError("unexpected-null-character", CurrentToken()); - CurrentToken().Data.Append('\uFFFD'); + LogParseError("unexpected-null-character", CurrentToken(HTMLTokenType.Comment)); + CurrentToken(HTMLTokenType.Comment).Data.Append('\uFFFD'); break; // Anything else // Append the current input character to the comment token's data. default: - CurrentToken().Data.Append(currentInputCharacter); + CurrentToken(HTMLTokenType.Comment).Data.Append(currentInputCharacter); break; } @@ -1098,13 +1098,13 @@ private void CommentLessThanSignState(char currentInputCharacter) // Append the current input character to the comment token's data. // Switch to the comment less-than sign bang state. case '!': // ! - CurrentToken().Data.Append(currentInputCharacter); + CurrentToken(HTMLTokenType.Comment).Data.Append(currentInputCharacter); SwitchState(HtmlTokenizerState.CommentLessThanSignBang); break; // Append the current input character to the comment token's data. case '<': // < - CurrentToken().Data.Append(currentInputCharacter); + CurrentToken(HTMLTokenType.Comment).Data.Append(currentInputCharacter); break; // Anything else @@ -1205,19 +1205,19 @@ private void AmbiguousAmpersandState(char currentInputCharacter) case >= '0' and <= '9': // 0-9 if (ConsumedAsPartOfAnAttribute()) { - CurrentToken().AddAttributeValue(currentInputCharacter); + CurrentToken(HTMLTokenType.Tag).AddAttributeValue(currentInputCharacter); } else { - CurrentToken().Data.Append(currentInputCharacter); - EmitToken(CurrentToken().Data.ToString()); + CurrentToken(HTMLTokenType.Character).Data.Append(currentInputCharacter); + EmitToken(HTMLTokenType.Character, CurrentToken(HTMLTokenType.Character).Data.ToString()); } break; // This is an unknown-named-character-reference parse error. Reconsume in the return state. case ';': - LogParseError("unknown-named-character-reference", CurrentToken()); + LogParseError("unknown-named-character-reference", CurrentToken(HTMLTokenType.Character)); SwitchState(_returnState, true); break; @@ -1262,7 +1262,7 @@ private void CommentLessThanSignBangDashDashState(char currentInputCharacter) // This is a nested-comment parse error. // Reconsume in the comment end state. default: - LogParseError("nested-comment", CurrentToken()); + LogParseError("nested-comment", CurrentToken(HTMLTokenType.Comment)); SwitchState(HtmlTokenizerState.CommentEnd, true); break; } @@ -1280,16 +1280,16 @@ private void SelfClosingStartTagState(char currentInputCharacter) // Switch to the data state. // Emit the current tag token. case '>': // > - CurrentToken().SelfClosing = true; + CurrentToken(HTMLTokenType.StartTag).SelfClosing = true; SwitchState(HtmlTokenizerState.Data); - EmitToken(); + EmitToken(HTMLTokenType.Tag); break; // Anything else // This is an unexpected-solidus-in-tag parse error. // Reconsume in the before attribute name state. default: - LogParseError("unexpected-solidus-in-tag", CurrentToken()); + LogParseError("unexpected-solidus-in-tag", CurrentToken(HTMLTokenType.Tag)); SwitchState(HtmlTokenizerState.BeforeAttributeName, true); break; } @@ -1299,7 +1299,7 @@ private void SelfClosingStartTagState(char currentInputCharacter) // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state private void AfterAttributeNameState(char currentInputCharacter) { - CurrentToken().FinishAttribute(); + CurrentToken(HTMLTokenType.Tag).FinishAttribute(); switch (currentInputCharacter) { @@ -1324,7 +1324,7 @@ private void AfterAttributeNameState(char currentInputCharacter) // Emit the current tag token. case '>': // > SwitchState(HtmlTokenizerState.Data); - EmitToken(); + EmitToken(HTMLTokenType.Tag); break; // Anything else @@ -1332,7 +1332,7 @@ private void AfterAttributeNameState(char currentInputCharacter) // Set that attribute name and value to the empty string. // Reconsume in the attribute name state. default: - CurrentToken().NewAttribute(); + CurrentToken(HTMLTokenType.Tag).NewAttribute(); SwitchState(HtmlTokenizerState.AttributeName, true); break; } @@ -1439,9 +1439,9 @@ private void FlushCodePointsConsumedAsCharacterReference() // user agent must append to the code point from the buffer to the current attribute's value // if the character reference was consumed as part of an attribute, or emit the code point as a character token otherwise. if (ConsumedAsPartOfAnAttribute()) - CurrentToken().AddAttributeValue(_temporaryBuffer.ToString()); + CurrentToken(HTMLTokenType.Tag).AddAttributeValue(_temporaryBuffer.ToString()); else - EmitToken(_temporaryBuffer.ToString()); + EmitToken(HTMLTokenType.Character, _temporaryBuffer.ToString()); _temporaryBuffer.Clear(); } diff --git a/HTMLParser/src/Parser/HTMLTokenizer.cs b/HTMLParser/src/Parser/HTMLTokenizer.cs index badd2e1..8ff31f9 100644 --- a/HTMLParser/src/Parser/HTMLTokenizer.cs +++ b/HTMLParser/src/Parser/HTMLTokenizer.cs @@ -6,7 +6,7 @@ namespace HTML_NET.Parser; public partial class HTMLTokenizer { private readonly ByteBuffer _buffer; - private readonly Dictionary _currentTokens; + private readonly HTMLToken _currentToken; private readonly StringBuilder _temporaryBuffer; private int _characterReferenceCode; @@ -20,7 +20,7 @@ public partial class HTMLTokenizer public HTMLTokenizer(ByteBuffer buffer) { _buffer = buffer; - _currentTokens = new Dictionary(); + _currentToken = new HTMLToken(HTMLTokenType.DOCTYPE); _currentState = HtmlTokenizerState.Data; _returnState = HtmlTokenizerState.Data; _temporaryBuffer = new StringBuilder(); @@ -219,55 +219,40 @@ private bool HasNextToken() return _nextToken != null; } - private void EmitToken() where T : HTMLToken, new() + private void EmitToken(HTMLTokenType type) { - EmitToken(string.Empty); + EmitToken(type, string.Empty); } - private void EmitToken(char currentInputCharacter) where T : HTMLToken, new() + private void EmitToken(HTMLTokenType type, char currentInputCharacter) { - EmitToken(currentInputCharacter.ToString()); + EmitToken(type, currentInputCharacter.ToString()); } - private void EmitToken(string data) where T : HTMLToken, new() + private void EmitToken(HTMLTokenType type, string data) { - var token = CurrentToken(); + var token = CurrentToken(type); token.Data.Append(data); - _nextToken = token; - _currentTokens.Remove(typeof(T)); + } - - private T CurrentToken() where T : HTMLToken, new() - { - // if parent class of token is TagToken, we set type to TagToken - if (typeof(T).IsSubclassOf(typeof(TagToken))) return CurrentToken(typeof(TagToken)); - - // if we already have a token of the specified type, we return it - if (_currentTokens.TryGetValue(typeof(T), out var value)) return (T)value; - - // otherwise we create a new token of the specified type - var token = new T(); - - // FIXME: I think Position - 1 is correct here, but I'm not sure - token.Position = _buffer.Position; - _currentTokens.Add(typeof(T), token); - - return token; - } - - private T CurrentToken(Type parentType) where T : HTMLToken, new() + + private HTMLToken CurrentToken(HTMLTokenType type) { - if (_currentTokens.TryGetValue(parentType, out var value)) - { - var token = (TagToken)value; - if (token is T typedToken) return typedToken; - } + if (_currentToken.Type == type) return _currentToken; + + // reset current token + _currentToken.Data.Clear(); + _currentToken.Name = string.Empty; + _currentToken.TagName = string.Empty; + _currentToken.ForceQuirks = false; + _currentToken.SelfClosing = false; + _currentToken.Attributes.Clear(); + _currentToken.NewAttribute(string.Empty); + _currentToken.Type = type; - var newToken = new T(); - _currentTokens.Add(parentType, newToken); - return newToken; + return _currentToken; } } \ No newline at end of file diff --git a/HTMLParser/src/Parser/Tokens/CharacterToken.cs b/HTMLParser/src/Parser/Tokens/CharacterToken.cs deleted file mode 100644 index 3ee49f6..0000000 --- a/HTMLParser/src/Parser/Tokens/CharacterToken.cs +++ /dev/null @@ -1,13 +0,0 @@ -namespace HTML_NET.Parser.Tokens; - -public class CharacterToken : HTMLToken -{ - public CharacterToken() : base(HTMLTokenType.Character) - { - } - - public override int GetLength() - { - return Data.Length; - } -} \ No newline at end of file diff --git a/HTMLParser/src/Parser/Tokens/CommentToken.cs b/HTMLParser/src/Parser/Tokens/CommentToken.cs deleted file mode 100644 index 5980c5b..0000000 --- a/HTMLParser/src/Parser/Tokens/CommentToken.cs +++ /dev/null @@ -1,13 +0,0 @@ -namespace HTML_NET.Parser.Tokens; - -public class CommentToken : HTMLToken -{ - public CommentToken() : base(HTMLTokenType.Comment) - { - } - - public override int GetLength() - { - throw new NotImplementedException("CommentToken.GetLength() is not implemented."); - } -} \ No newline at end of file diff --git a/HTMLParser/src/Parser/Tokens/DOCTYPEToken.cs b/HTMLParser/src/Parser/Tokens/DOCTYPEToken.cs deleted file mode 100644 index 925f109..0000000 --- a/HTMLParser/src/Parser/Tokens/DOCTYPEToken.cs +++ /dev/null @@ -1,18 +0,0 @@ -namespace HTML_NET.Parser.Tokens; - -public class DOCTYPEToken : HTMLToken -{ - public DOCTYPEToken() : base(HTMLTokenType.DOCTYPE) - { - Name = ""; - ForceQuirks = false; - } - - public string Name { get; set; } - public bool ForceQuirks { get; set; } - - public override int GetLength() - { - throw new NotImplementedException("DOCTYPEToken.GetLength() is not implemented."); - } -} \ No newline at end of file diff --git a/HTMLParser/src/Parser/Tokens/EndTagToken.cs b/HTMLParser/src/Parser/Tokens/EndTagToken.cs deleted file mode 100644 index 938b5be..0000000 --- a/HTMLParser/src/Parser/Tokens/EndTagToken.cs +++ /dev/null @@ -1,8 +0,0 @@ -namespace HTML_NET.Parser.Tokens; - -public class EndTagToken : TagToken -{ - public EndTagToken() : base(HTMLTokenType.EndTag) - { - } -} \ No newline at end of file diff --git a/HTMLParser/src/Parser/Tokens/HTMLToken.cs b/HTMLParser/src/Parser/Tokens/HTMLToken.cs index bb29566..9d62590 100644 --- a/HTMLParser/src/Parser/Tokens/HTMLToken.cs +++ b/HTMLParser/src/Parser/Tokens/HTMLToken.cs @@ -2,21 +2,79 @@ namespace HTML_NET.Parser.Tokens; -public abstract class HTMLToken +public sealed class HTMLToken { - private HTMLToken(HTMLTokenType type, string data) + private string _currentAttributeName; + private string _currentAttributeValue; + + public HTMLToken(HTMLTokenType type, string data) { Type = type; Data = new StringBuilder(data); + Attributes = new Dictionary(); } - protected HTMLToken(HTMLTokenType type) : this(type, string.Empty) + public HTMLToken(HTMLTokenType type) : this(type, string.Empty) { } - public HTMLTokenType Type { get; protected set; } + public HTMLTokenType Type { get; set; } public long Position { get; set; } public StringBuilder Data { get; set; } + + public string Name { get; set; } + public bool ForceQuirks { get; set; } + public string TagName { get; set; } + + public bool SelfClosing { get; set; } + public int GetLength() + { + return 0; + } + + public void NewAttribute(char name) + { + NewAttribute(new string(name, 1)); + } + + public void NewAttribute(string name = "") + + { + _currentAttributeName = name; + _currentAttributeValue = ""; + } + + public void AddAttributeName(string value) + { + _currentAttributeValue += value; + } + + public void AddAttributeName(char value) + { + AddAttributeName(new string(value, 1)); + } + + public void AddAttributeValue(string value) + { + _currentAttributeValue += value; + } + + public void AddAttributeValue(char value) + { + AddAttributeValue(new string(value, 1)); + } + + public void FinishAttribute() + { + if(string.IsNullOrWhiteSpace(_currentAttributeName)) + return; + + if (Attributes.ContainsKey(_currentAttributeName)) + Attributes[_currentAttributeName] = _currentAttributeValue; + + _currentAttributeName = ""; + _currentAttributeValue = ""; + } - public abstract int GetLength(); + public Dictionary Attributes { get; set; } } \ No newline at end of file diff --git a/HTMLParser/src/Parser/Tokens/StartTagToken.cs b/HTMLParser/src/Parser/Tokens/StartTagToken.cs deleted file mode 100644 index 8409152..0000000 --- a/HTMLParser/src/Parser/Tokens/StartTagToken.cs +++ /dev/null @@ -1,8 +0,0 @@ -namespace HTML_NET.Parser.Tokens; - -public class StartTagToken : TagToken -{ - public StartTagToken() : base(HTMLTokenType.StartTag) - { - } -} \ No newline at end of file diff --git a/HTMLParser/src/Parser/Tokens/TagToken.cs b/HTMLParser/src/Parser/Tokens/TagToken.cs deleted file mode 100644 index 90e5c17..0000000 --- a/HTMLParser/src/Parser/Tokens/TagToken.cs +++ /dev/null @@ -1,73 +0,0 @@ -namespace HTML_NET.Parser.Tokens; - -public class TagToken : HTMLToken -{ - private string _currentAttributeName; - private string _currentAttributeValue; - - protected TagToken(HTMLTokenType type) : base(type) - { - TagName = ""; - _currentAttributeName = ""; - _currentAttributeValue = ""; - } - - public TagToken() : base(HTMLTokenType.StartTag) - { - TagName = ""; - _currentAttributeName = ""; - _currentAttributeValue = ""; - } - - public string TagName { get; set; } - private Dictionary Attributes { get; } = new(); - public bool SelfClosing { get; set; } - - public void NewAttribute(string name = "") - { - _currentAttributeName = name; - _currentAttributeValue = ""; - } - - public void NewAttribute(char name) - { - NewAttribute(new string(name, 1)); - } - - public void AddAttributeName(string value) - { - _currentAttributeValue += value; - } - - public void AddAttributeName(char value) - { - AddAttributeName(new string(value, 1)); - } - - public void AddAttributeValue(string value) - { - _currentAttributeValue += value; - } - - public void AddAttributeValue(char value) - { - AddAttributeValue(new string(value, 1)); - } - - public void FinishAttribute() - { - if(string.IsNullOrWhiteSpace(_currentAttributeName)) - return; - - if (Attributes.ContainsKey(_currentAttributeName)) - Attributes[_currentAttributeName] = _currentAttributeValue; - - _currentAttributeName = ""; - _currentAttributeValue = ""; - } - - public override int GetLength() - { - throw new NotImplementedException("TagToken.GetLength() is not implemented."); - } -} \ No newline at end of file