From 3316f627b24e02f04b7ac6d86ceee1658c33b46c Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 16 May 2024 14:36:10 +0900
Subject: [PATCH 001/101] Bump version

---
 lib/rexml/rexml.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index 191932b8..d317e666 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.2.8"
+  VERSION = "3.2.9"
   REVISION = ""
 
   Copyright = COPYRIGHT

From f1df7d13b3e57a5e059273d2f0870163c08d7420 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Mon, 20 May 2024 12:17:27 +0900
Subject: [PATCH 002/101] Add support for old strscan

Fix GH-132

If we support old strscan, users can also use strscan installed as a
default gem.

Reported by Adam. Thanks!!!
---
 .github/workflows/test.yml      | 32 ++++++++++++++++++++++----------
 lib/rexml/parsers/baseparser.rb | 11 +++++++++++
 rexml.gemspec                   |  2 +-
 3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index fd26b9ab..f977de60 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -3,14 +3,14 @@ on:
   - push
   - pull_request
 jobs:
-  ruby-versions:
+  ruby-versions-inplace:
     uses: ruby/actions/.github/workflows/ruby_versions.yml@master
     with:
       engine: cruby-jruby
       min_version: 2.5
 
   inplace:
-    needs: ruby-versions
+    needs: ruby-versions-inplace
     name: "Inplace: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}"
     runs-on: ${{ matrix.runs-on }}
     strategy:
@@ -20,7 +20,7 @@ jobs:
           - ubuntu-latest
           - macos-latest
           - windows-latest
-        ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }}
+        ruby-version: ${{ fromJson(needs.ruby-versions-inplace.outputs.versions) }}
         exclude:
           - {runs-on: macos-latest, ruby-version: 2.5}
         # include:
@@ -47,7 +47,14 @@ jobs:
       - name: Test
         run: bundle exec rake test RUBYOPT="--enable-frozen-string-literal"
 
+  ruby-versions-gem:
+    uses: ruby/actions/.github/workflows/ruby_versions.yml@master
+    with:
+      engine: cruby-jruby
+      min_version: 3.0
+
   gem:
+    needs: ruby-versions-gem
     name: "Gem: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}"
     runs-on: ${{ matrix.runs-on }}
     strategy:
@@ -57,21 +64,26 @@ jobs:
           - ubuntu-latest
           - macos-latest
           - windows-latest
-        ruby-version:
-          - "3.0"
-          - head
+        ruby-version: ${{ fromJson(needs.ruby-versions-gem.outputs.versions) }}
     steps:
       - uses: actions/checkout@v4
       - uses: ruby/setup-ruby@v1
         with:
           ruby-version: ${{ matrix.ruby-version }}
       - name: Install as gem
-        env:
-          BUNDLE_PATH__SYSTEM: "true"
-          BUNDLE_WITHOUT: "benchmark:development"
         run: |
           rake install
-          bundle install
+      - name: Install test dependencies on non-Windows
+        if: matrix.runs-on != 'windows-latest'
+        run: |
+          for gem in $(ruby -e 'puts ARGF.read[/^group :test do(.*)^end/m, 1].scan(/"(.+?)"/)' Gemfile); do
+            gem install ${gem}
+          done
+      - name: Install test dependencies on Windows
+        if: matrix.runs-on == 'windows-latest'
+        run: |
+          gem install test-unit
+          gem install test-unit-ruby-core
       - name: Test
         run: |
           ruby -run -e mkdir -- tmp
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index d09237c5..da051a76 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -7,6 +7,17 @@
 
 module REXML
   module Parsers
+    if StringScanner::Version < "3.0.8"
+      module StringScannerCaptures
+        refine StringScanner do
+          def captures
+            values_at(*(1...size))
+          end
+        end
+      end
+      using StringScannerCaptures
+    end
+
     # = Using the Pull Parser
     # <em>This API is experimental, and subject to change.</em>
     #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
diff --git a/rexml.gemspec b/rexml.gemspec
index 97eac657..169e49dc 100644
--- a/rexml.gemspec
+++ b/rexml.gemspec
@@ -55,5 +55,5 @@ Gem::Specification.new do |spec|
 
   spec.required_ruby_version = '>= 2.5.0'
 
-  spec.add_runtime_dependency("strscan", ">= 3.0.9")
+  spec.add_runtime_dependency("strscan")
 end

From f525ef79367e70b041763c2a6c332628b3f85e48 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 30 May 2024 20:56:26 +0900
Subject: [PATCH 003/101] Use /#{Regexp.escape}/ instead of Regexp.union

It's for readability.
---
 lib/rexml/source.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 0f3c5011..4483aecc 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -69,7 +69,7 @@ def read(term = nil)
     end
 
     def read_until(term)
-      @scanner.scan_until(Regexp.union(term)) or @scanner.rest
+      @scanner.scan_until(/#{Regexp.escape(term)}/) or @scanner.rest
     end
 
     def ensure_buffer
@@ -173,7 +173,7 @@ def read(term = nil)
     end
 
     def read_until(term)
-      pattern = Regexp.union(term)
+      pattern = /#{Regexp.escape(term)}/
       begin
         until str = @scanner.scan_until(pattern)
           @scanner << readline(term)

From f59790b0caa8966a68be3353b132634f35aefbe6 Mon Sep 17 00:00:00 2001
From: Andrii Konchyn <andry.konchin@gmail.com>
Date: Fri, 31 May 2024 23:18:44 +0300
Subject: [PATCH 004/101] Fix the NEWS.md and change PR reference that fixes
 CVE-2024-35176 (#133)

It seems to me that mentioned in the NEWS.md and in the release notes PR
#124 ("Move development dependencies to Gemfile") isn't a correct one
and not related to CVE-2024-35176:

```
- Improved parse performance when an attribute has many <s.
  - GH-124
```

#126 looks like fixes the issue with attribute value that contains
multiple '>' characters. At least it adds a proper test.
---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 013409e6..7bfe3b9a 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -30,7 +30,7 @@
 
   * Improved parse performance when an attribute has many `<`s.
 
-    * GH-124
+    * GH-126
 
 ### Fixes
 

From 4444a04ece4c02a7bd51e8c75623f22dc12d882b Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sun, 2 Jun 2024 16:59:16 +0900
Subject: [PATCH 005/101] Add missing encode for custom term

---
 lib/rexml/source.rb | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 4483aecc..999f4671 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -163,6 +163,7 @@ def initialize(arg, block_size=500, encoding=nil)
     end
 
     def read(term = nil)
+      term = encode(term) if term
       begin
         @scanner << readline(term)
         true
@@ -174,6 +175,7 @@ def read(term = nil)
 
     def read_until(term)
       pattern = /#{Regexp.escape(term)}/
+      term = encode(term)
       begin
         until str = @scanner.scan_until(pattern)
           @scanner << readline(term)

From 3e3893d48357c04c4f3a7088819880905a64742d Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sun, 2 Jun 2024 17:07:04 +0900
Subject: [PATCH 006/101] Source#read_until: Add missing position move on all
 read

---
 lib/rexml/parsers/baseparser.rb |  2 ++
 lib/rexml/source.rb             | 11 +++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index da051a76..82575685 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -644,8 +644,10 @@ def parse_attributes(prefixes, curr_ns)
               raise REXML::ParseException.new(message, @source)
             end
             quote = match[1]
+            start_position = @source.position
             value = @source.read_until(quote)
             unless value.chomp!(quote)
+              @source.position = start_position
               message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
               raise REXML::ParseException.new(message, @source)
             end
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 999f4671..3be3f846 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -69,7 +69,12 @@ def read(term = nil)
     end
 
     def read_until(term)
-      @scanner.scan_until(/#{Regexp.escape(term)}/) or @scanner.rest
+      data = @scanner.scan_until(/#{Regexp.escape(term)}/)
+      unless data
+        data = @scanner.rest
+        @scanner.pos = @scanner.string.bytesize
+      end
+      data
     end
 
     def ensure_buffer
@@ -181,7 +186,9 @@ def read_until(term)
           @scanner << readline(term)
         end
       rescue EOFError
-        @scanner.rest
+        rest = @scanner.rest
+        @scanner.pos = @scanner.string.bytesize
+        rest
       else
         read if @scanner.eos? and !@source.eof?
         str

From 037c16a5768d25d69570ccce73b2eb78b559a9b4 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Mon, 3 Jun 2024 10:24:24 +0900
Subject: [PATCH 007/101] Optimize Source#read_until method (#135)

Optimize `Source#read_until` method.

## Benchmark
```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom      9.877       9.992        15.605       17.559 i/s -     100.000 times in 10.124592s 10.008017s 6.408031s 5.695167s
                 sax     22.903      25.151        39.482       50.846 i/s -     100.000 times in 4.366300s 3.975922s 2.532822s 1.966706s
                pull     25.940      30.474        44.685       61.450 i/s -     100.000 times in 3.855070s 3.281511s 2.237879s 1.627346s
              stream     25.255      29.500        41.819       53.605 i/s -     100.000 times in 3.959539s 3.389825s 2.391256s 1.865505s

Comparison:
                              dom
         after(YJIT):        17.6 i/s
        before(YJIT):        15.6 i/s - 1.13x  slower
               after:        10.0 i/s - 1.76x  slower
              before:         9.9 i/s - 1.78x  slower

                              sax
         after(YJIT):        50.8 i/s
        before(YJIT):        39.5 i/s - 1.29x  slower
               after:        25.2 i/s - 2.02x  slower
              before:        22.9 i/s - 2.22x  slower

                             pull
         after(YJIT):        61.4 i/s
        before(YJIT):        44.7 i/s - 1.38x  slower
               after:        30.5 i/s - 2.02x  slower
              before:        25.9 i/s - 2.37x  slower

                           stream
         after(YJIT):        53.6 i/s
        before(YJIT):        41.8 i/s - 1.28x  slower
               after:        29.5 i/s - 1.82x  slower
              before:        25.3 i/s - 2.12x  slower

```

- YJIT=ON : 1.13x - 1.38x faster
- YJIT=OFF : 1.01x - 1.17x faster

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
---
 lib/rexml/source.rb | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 3be3f846..542b76a6 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -34,6 +34,16 @@ class Source
     attr_reader :line
     attr_reader :encoding
 
+    module Private
+      PRE_DEFINED_TERM_PATTERNS = {}
+      pre_defined_terms = ["'", '"']
+      pre_defined_terms.each do |term|
+        PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
+      end
+    end
+    private_constant :Private
+    include Private
+
     # Constructor
     # @param arg must be a String, and should be a valid XML document
     # @param encoding if non-null, sets the encoding of the source to this
@@ -69,7 +79,8 @@ def read(term = nil)
     end
 
     def read_until(term)
-      data = @scanner.scan_until(/#{Regexp.escape(term)}/)
+      pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
+      data = @scanner.scan_until(pattern)
       unless data
         data = @scanner.rest
         @scanner.pos = @scanner.string.bytesize
@@ -179,7 +190,7 @@ def read(term = nil)
     end
 
     def read_until(term)
-      pattern = /#{Regexp.escape(term)}/
+      pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
       term = encode(term)
       begin
         until str = @scanner.scan_until(pattern)

From d5ddbff19ca8b96c8fdf66fde4654c1c8c5e377b Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Mon, 3 Jun 2024 10:26:19 +0900
Subject: [PATCH 008/101] benchmark: Remove non-parsing operations from the DOM
 case (#136)

## Why?
`.elements.each("root/child") {|_|}` is not a parsing operation.

## Result
```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     16.254      16.412        27.189       28.940 i/s -     100.000 times in 6.152343s 6.093050s 3.677924s 3.455456s
                 sax     22.909      23.194        39.481       40.099 i/s -     100.000 times in 4.365165s 4.311414s 2.532840s 2.493807s
                pull     26.281      25.918        44.465       45.733 i/s -     100.000 times in 3.805063s 3.858328s 2.248968s 2.186621s
              stream     25.196      25.185        41.674       40.947 i/s -     100.000 times in 3.968828s 3.970585s 2.399554s 2.442158s

Comparison:
                              dom
         after(YJIT):        28.9 i/s
        before(YJIT):        27.2 i/s - 1.06x  slower
               after:        16.4 i/s - 1.76x  slower
              before:        16.3 i/s - 1.78x  slower

                              sax
         after(YJIT):        40.1 i/s
        before(YJIT):        39.5 i/s - 1.02x  slower
               after:        23.2 i/s - 1.73x  slower
              before:        22.9 i/s - 1.75x  slower

                             pull
         after(YJIT):        45.7 i/s
        before(YJIT):        44.5 i/s - 1.03x  slower
              before:        26.3 i/s - 1.74x  slower
               after:        25.9 i/s - 1.76x  slower

                           stream
        before(YJIT):        41.7 i/s
         after(YJIT):        40.9 i/s - 1.02x  slower
              before:        25.2 i/s - 1.65x  slower
               after:        25.2 i/s - 1.65x  slower

```

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
---
 benchmark/parse.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml
index e7066fcb..f2c7d336 100644
--- a/benchmark/parse.yaml
+++ b/benchmark/parse.yaml
@@ -47,7 +47,7 @@ prelude: |
   end
 
 benchmark:
-  'dom'        : REXML::Document.new(xml).elements.each("root/child") {|_|}
+  'dom'        : REXML::Document.new(xml)
   'sax'        : REXML::Parsers::SAX2Parser.new(xml).parse
   'pull'       : |
     parser = REXML::Parsers::PullParser.new(xml)

From 2fc3f79e63b9673e2703b3f03d1a8fe47ca149f0 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 6 Jun 2024 10:54:05 +0900
Subject: [PATCH 009/101] test: improve name

---
 test/test_document.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_document.rb b/test/test_document.rb
index f96bfd5d..78d9d7de 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -202,7 +202,7 @@ def test_xml_declaration_standalone
       assert_equal('no', doc.stand_alone?, bug2539)
     end
 
-    def test_gt_linear_performance
+    def test_gt_linear_performance_attribute_value
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq) do |n|
         REXML::Document.new('<test testing="' + ">" * n + '"></test>')

From da67561afb2a5f6910c69d5e0e73bea8d457f303 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 6 Jun 2024 10:54:13 +0900
Subject: [PATCH 010/101] test: reduce the number of rehearsal executions

It reduces test execution time.
---
 test/test_document.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_document.rb b/test/test_document.rb
index 78d9d7de..4bf3f55d 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -204,7 +204,7 @@ def test_xml_declaration_standalone
 
     def test_gt_linear_performance_attribute_value
       seq = [10000, 50000, 100000, 150000, 200000]
-      assert_linear_performance(seq) do |n|
+      assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<test testing="' + ">" * n + '"></test>')
       end
     end

From dab80658b684a093f4ef8b2c0b154df58aa710c9 Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Fri, 7 Jun 2024 11:23:14 +0900
Subject: [PATCH 011/101] Improve `Node#each_recursive` performance (#139)

Fix #134

## Summary

This PR does:

- Add `benchmark/each_recursive.yaml`
- Rewrite `Node#each_recursive` implementation for performance
- Add a test for `Node#each_recursive`

The performance of `Node#each_recursive` is improved 60~80x faster.

## Details

`each_recursive` is too much slow as I described in #134.
I improved this performance by rewriting its implementation in this PR.

Also, I added a benchmark in `benchmark/each_recursive.yaml` and the
following is a result on my laptop:

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/makenowjust/Projects/github.com/makenowjust/simple-dotfiles/.asdf/installs/ruby/3.3.2/bin/ruby -v -S benchmark-driver /Users/makenowjust/Projects/github.com/ruby/rexml/benchmark/each_recursive.yaml
ruby 3.3.2 (2024-05-30 revision e5a195edf6) [arm64-darwin23]
Calculating -------------------------------------
                     rexml 3.2.6      master  3.2.6(YJIT)  master(YJIT)
      each_recursive      11.279     686.502       17.926        1.470k i/s -     100.000 times in 8.866303s 0.145666s 5.578360s 0.068018s

Comparison:
                   each_recursive
        master(YJIT):      1470.2 i/s
              master:       686.5 i/s - 2.14x  slower
         3.2.6(YJIT):        17.9 i/s - 82.01x  slower
         rexml 3.2.6:        11.3 i/s - 130.35x  slower

```

We can see that the performance is improved 60~80x faster.

Additionally, I added a new test for `Node#each_recursive`.
It was missing, but we need it to confirm not to break the previous
behavior.

Thank you.

---------

Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
---
 benchmark/each_recursive.yaml | 40 +++++++++++++++++++++++++++++++++++
 lib/rexml/node.rb             | 12 +++++++----
 test/test_document.rb         | 36 +++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 4 deletions(-)
 create mode 100644 benchmark/each_recursive.yaml

diff --git a/benchmark/each_recursive.yaml b/benchmark/each_recursive.yaml
new file mode 100644
index 00000000..c745f8ce
--- /dev/null
+++ b/benchmark/each_recursive.yaml
@@ -0,0 +1,40 @@
+loop_count: 100
+contexts:
+  - gems:
+      rexml: 3.2.6
+    require: false
+    prelude: require 'rexml'
+  - name: master
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path("lib"))
+      require 'rexml'
+  - name: 3.2.6(YJIT)
+    gems:
+      rexml: 3.2.6
+    require: false
+    prelude: |
+      require 'rexml'
+      RubyVM::YJIT.enable
+  - name: master(YJIT)
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path("lib"))
+      require 'rexml'
+      RubyVM::YJIT.enable
+
+prelude: |
+  require 'rexml/document'
+
+  xml_source = +"<root>"
+  100.times do
+    x_node_source = ""
+    100.times do
+      x_node_source = "<x>#{x_node_source}</x>"
+    end
+    xml_source << x_node_source
+  end
+  xml_source << "</root>"
+
+  document = REXML::Document.new(xml_source)
+
+benchmark:
+  each_recursive: document.each_recursive { |_| }
diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb
index 081caba6..c771db70 100644
--- a/lib/rexml/node.rb
+++ b/lib/rexml/node.rb
@@ -52,10 +52,14 @@ def parent?
 
     # Visit all subnodes of +self+ recursively
     def each_recursive(&block) # :yields: node
-      self.elements.each {|node|
-        block.call(node)
-        node.each_recursive(&block)
-      }
+      stack = []
+      each { |child| stack.unshift child if child.node_type == :element }
+      until stack.empty?
+        child = stack.pop
+        yield child
+        n = stack.size
+        child.each { |grandchild| stack.insert n, grandchild if grandchild.node_type == :element }
+      end
     end
 
     # Find (and return) first subnode (recursively) for which the block
diff --git a/test/test_document.rb b/test/test_document.rb
index 4bf3f55d..7fccbacb 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -209,6 +209,42 @@ def test_gt_linear_performance_attribute_value
       end
     end
 
+    def test_each_recursive
+      xml_source = <<~XML
+        <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+        <root name="root">
+          <x name="1_1">
+            <x name="1_2">
+              <x name="1_3" />
+            </x>
+          </x>
+          <x name="2_1">
+            <x name="2_2">
+              <x name="2_3" />
+            </x>
+          </x>
+          <!-- comment -->
+          <![CDATA[ cdata ]]>
+        </root>
+      XML
+
+      expected_names = %w[
+        root
+        1_1 1_2 1_3
+        2_1 2_2 2_3
+      ]
+
+      document = REXML::Document.new(xml_source)
+
+      # Node#each_recursive iterates elements only.
+      # This does not iterate XML declerations, comments, attributes, CDATA sections, etc.
+      actual_names = []
+      document.each_recursive do |element|
+        actual_names << element.attributes["name"]
+      end
+      assert_equal(expected_names, actual_names)
+    end
+
     class WriteTest < Test::Unit::TestCase
       def setup
         @document = REXML::Document.new(<<-EOX)

From e06b3fb2660c682423e10d59b92d192c42e9825d Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Fri, 7 Jun 2024 14:34:25 +0900
Subject: [PATCH 012/101] Improve text parse performance

If there are many ">"s in text, parsing is very slow.

    Calculating -------------------------------------
                         rexml 3.2.6      master  3.2.6(YJIT)  master(YJIT)
               attribute       1.116      3.618k        1.117        1.941k i/s -      10.000 times in 8.957748s 0.002764s 8.951665s 0.005152s
                    text      27.089      2.262k       42.632        1.033k i/s -      10.000 times in 0.369147s 0.004421s 0.234566s 0.009683s

    Comparison:
                            attribute
                  master:      3617.6 i/s
            master(YJIT):      1941.1 i/s - 1.86x  slower
             3.2.6(YJIT):         1.1 i/s - 3238.31x  slower
             rexml 3.2.6:         1.1 i/s - 3240.51x  slower

                                 text
                  master:      2261.8 i/s
            master(YJIT):      1032.7 i/s - 2.19x  slower
             3.2.6(YJIT):        42.6 i/s - 53.05x  slower
             rexml 3.2.6:        27.1 i/s - 83.49x  slower
---
 benchmark/gt.yaml               | 34 +++++++++++++++++++++++++++++++++
 lib/rexml/parsers/baseparser.rb | 10 ++++++++--
 lib/rexml/source.rb             | 19 +++++++++---------
 3 files changed, 52 insertions(+), 11 deletions(-)
 create mode 100644 benchmark/gt.yaml

diff --git a/benchmark/gt.yaml b/benchmark/gt.yaml
new file mode 100644
index 00000000..3f6af739
--- /dev/null
+++ b/benchmark/gt.yaml
@@ -0,0 +1,34 @@
+loop_count: 10
+contexts:
+  - gems:
+      rexml: 3.2.6
+    require: false
+    prelude: require "rexml"
+  - name: master
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path("lib"))
+      require "rexml"
+  - name: 3.2.6(YJIT)
+    gems:
+      rexml: 3.2.6
+    require: false
+    prelude: |
+      require "rexml"
+      RubyVM::YJIT.enable
+  - name: master(YJIT)
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path("lib"))
+      require "rexml"
+      RubyVM::YJIT.enable
+
+prelude: |
+  require "rexml/document"
+
+  n = 10000
+  gts = ">" * n
+  in_attribute = "<test attribute=\"#{gts}\"></test>"
+  in_text = "<test>#{gts}</test>"
+
+benchmark:
+  "attribute": REXML::Document.new(in_attribute)
+  "text":      REXML::Document.new(in_text)
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 82575685..eadc78f7 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -373,6 +373,10 @@ def pull_event
         begin
           start_position = @source.position
           if @source.match("<", true)
+            # :text's read_until may remain only "<" in buffer. In the
+            # case, buffer is empty here. So we need to fill buffer
+            # here explicitly.
+            @source.ensure_buffer
             if @source.match("/", true)
               @nsstack.shift
               last_tag = @tags.pop
@@ -438,8 +442,10 @@ def pull_event
               return [ :start_element, tag, attributes ]
             end
           else
-            md = @source.match(/([^<]*)/um, true)
-            text = md[1]
+            text = @source.read_until("<")
+            if text.chomp!("<")
+              @source.position -= "<".bytesize
+            end
             return [ :text, text ]
           end
         rescue REXML::UndefinedNamespaceException
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 542b76a6..982aa84a 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -36,7 +36,7 @@ class Source
 
     module Private
       PRE_DEFINED_TERM_PATTERNS = {}
-      pre_defined_terms = ["'", '"']
+      pre_defined_terms = ["'", '"', "<"]
       pre_defined_terms.each do |term|
         PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
       end
@@ -192,17 +192,18 @@ def read(term = nil)
     def read_until(term)
       pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
       term = encode(term)
-      begin
-        until str = @scanner.scan_until(pattern)
-          @scanner << readline(term)
-        end
-      rescue EOFError
+      until str = @scanner.scan_until(pattern)
+        break if @source.nil?
+        break if @source.eof?
+        @scanner << readline(term)
+      end
+      if str
+        read if @scanner.eos? and !@source.eof?
+        str
+      else
         rest = @scanner.rest
         @scanner.pos = @scanner.string.bytesize
         rest
-      else
-        read if @scanner.eos? and !@source.eof?
-        str
       end
     end
 

From 964c9dc7896e9a0b8ba012702fb06d6538b6acf1 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sun, 9 Jun 2024 11:31:12 +0900
Subject: [PATCH 013/101] Add 3.2.9 entry

---
 NEWS.md | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 7bfe3b9a..ce33b764 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,32 @@
 # News
 
+## 3.2.9 - 2024-06-19 {#version-3-2-9}
+
+### Improvements
+
+  * Added support for old strscan.
+    * GH-132
+    * Reported by Adam
+
+  * Improved attribute value parse performance.
+    * GH-135
+    * Patch by NAITOH Jun.
+
+  * Improved `REXML::Node#each_recursive` performance.
+    * GH-134
+    * GH-139
+    * Patch by Hiroya Fujinami.
+
+  * Improved text parse performance.
+    * Reported by mprogrammer.
+
+### Thanks
+
+  * Adam
+  * NAITOH Jun
+  * Hiroya Fujinami
+  * mprogrammer
+
 ## 3.2.8 - 2024-05-16 {#version-3-2-8}
 
 ### Fixes
@@ -65,7 +92,6 @@
   * jcavalieri
   * DuKewu
 
-
 ## 3.2.6 - 2023-07-27 {#version-3-2-6}
 
 ### Improvements

From 7ca7ccdfc65f5bb1d61797163ef213774a99cbbb Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sun, 9 Jun 2024 11:32:37 +0900
Subject: [PATCH 014/101] Bump version

---
 lib/rexml/rexml.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index d317e666..3e870822 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.2.9"
+  VERSION = "3.3.0"
   REVISION = ""
 
   Copyright = COPYRIGHT

From 5078c86573002e4dfd8543dba5b313f234f08e95 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 11 Jun 2024 09:49:22 +0900
Subject: [PATCH 015/101] news: fix a typo

Reported by nicholas a. evans. Thanks!!!
---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index ce33b764..473fbf20 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,6 @@
 # News
 
-## 3.2.9 - 2024-06-19 {#version-3-2-9}
+## 3.2.9 - 2024-06-09 {#version-3-2-9}
 
 ### Improvements
 

From a7d66f2d3b9142a5afbfceb921a1b51546aee7ee Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 11 Jun 2024 09:50:27 +0900
Subject: [PATCH 016/101] ci document: use the latest Ruby

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f977de60..f593c1d1 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -98,7 +98,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: ruby/setup-ruby@v1
         with:
-          ruby-version: 2.7
+          ruby-version: ruby
       - name: Install dependencies
         run: |
           bundle install

From 31738ccfc3324f4b32769fa1695c78c06a88c277 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 11 Jun 2024 09:52:35 +0900
Subject: [PATCH 017/101] Add support for strscan 0.7.0 installed with Ruby 2.6

Fix GH-142

Reported by Fernando Trigoso. Thanks!!!
---
 .github/workflows/test.yml | 18 +++++++-----------
 lib/rexml/source.rb        | 20 ++++++++++++++++++++
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f593c1d1..2383d198 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -3,14 +3,14 @@ on:
   - push
   - pull_request
 jobs:
-  ruby-versions-inplace:
+  ruby-versions:
     uses: ruby/actions/.github/workflows/ruby_versions.yml@master
     with:
       engine: cruby-jruby
       min_version: 2.5
 
   inplace:
-    needs: ruby-versions-inplace
+    needs: ruby-versions
     name: "Inplace: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}"
     runs-on: ${{ matrix.runs-on }}
     strategy:
@@ -20,7 +20,7 @@ jobs:
           - ubuntu-latest
           - macos-latest
           - windows-latest
-        ruby-version: ${{ fromJson(needs.ruby-versions-inplace.outputs.versions) }}
+        ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }}
         exclude:
           - {runs-on: macos-latest, ruby-version: 2.5}
         # include:
@@ -47,14 +47,8 @@ jobs:
       - name: Test
         run: bundle exec rake test RUBYOPT="--enable-frozen-string-literal"
 
-  ruby-versions-gem:
-    uses: ruby/actions/.github/workflows/ruby_versions.yml@master
-    with:
-      engine: cruby-jruby
-      min_version: 3.0
-
   gem:
-    needs: ruby-versions-gem
+    needs: ruby-versions
     name: "Gem: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}"
     runs-on: ${{ matrix.runs-on }}
     strategy:
@@ -64,7 +58,9 @@ jobs:
           - ubuntu-latest
           - macos-latest
           - windows-latest
-        ruby-version: ${{ fromJson(needs.ruby-versions-gem.outputs.versions) }}
+        exclude:
+          - {runs-on: macos-latest, ruby-version: 2.5}
+        ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }}
     steps:
       - uses: actions/checkout@v4
       - uses: ruby/setup-ruby@v1
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 982aa84a..67154832 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -1,8 +1,28 @@
 # coding: US-ASCII
 # frozen_string_literal: false
+
+require "strscan"
+
 require_relative 'encoding'
 
 module REXML
+  if StringScanner::Version < "1.0.0"
+    module StringScannerCheckScanString
+      refine StringScanner do
+        def check(pattern)
+          pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
+          super(pattern)
+        end
+
+        def scan(pattern)
+          pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String)
+          super(pattern)
+        end
+      end
+    end
+    using StringScannerCheckScanString
+  end
+
   # Generates Source-s.  USE THIS CLASS.
   class SourceFactory
     # Generates a Source object

From 0d9b98c7f6bd221c362644329c4cee8a2338ddc4 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 11 Jun 2024 14:40:58 +0900
Subject: [PATCH 018/101] ci: don't use Ruby 2.5 for gem test

Because REXML isn't a default gem yet in Ruby 2.5.
---
 .github/workflows/test.yml | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2383d198..0bd43457 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -3,14 +3,14 @@ on:
   - push
   - pull_request
 jobs:
-  ruby-versions:
+  ruby-versions-inplace:
     uses: ruby/actions/.github/workflows/ruby_versions.yml@master
     with:
       engine: cruby-jruby
       min_version: 2.5
 
   inplace:
-    needs: ruby-versions
+    needs: ruby-versions-inplace
     name: "Inplace: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}"
     runs-on: ${{ matrix.runs-on }}
     strategy:
@@ -20,7 +20,7 @@ jobs:
           - ubuntu-latest
           - macos-latest
           - windows-latest
-        ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }}
+        ruby-version: ${{ fromJson(needs.ruby-versions-inplace.outputs.versions) }}
         exclude:
           - {runs-on: macos-latest, ruby-version: 2.5}
         # include:
@@ -47,8 +47,14 @@ jobs:
       - name: Test
         run: bundle exec rake test RUBYOPT="--enable-frozen-string-literal"
 
+  ruby-versions-gems:
+    uses: ruby/actions/.github/workflows/ruby_versions.yml@master
+    with:
+      engine: cruby-jruby
+      min_version: 2.6 # REXML is a default gem since Ruby 2.6
+
   gem:
-    needs: ruby-versions
+    needs: ruby-versions-gems
     name: "Gem: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}"
     runs-on: ${{ matrix.runs-on }}
     strategy:
@@ -58,9 +64,7 @@ jobs:
           - ubuntu-latest
           - macos-latest
           - windows-latest
-        exclude:
-          - {runs-on: macos-latest, ruby-version: 2.5}
-        ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }}
+        ruby-version: ${{ fromJson(needs.ruby-versions-gems.outputs.versions) }}
     steps:
       - uses: actions/checkout@v4
       - uses: ruby/setup-ruby@v1

From 8247bdc55c85073e953fd27687f42e427b6f071b Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 11 Jun 2024 15:10:29 +0900
Subject: [PATCH 019/101] Add 3.3.0 entry

---
 NEWS.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 473fbf20..c8e9ecc0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,12 +1,24 @@
 # News
 
+## 3.3.0 - 2024-06-11 {#version-3-3-0}
+
+### Improvements
+
+  * Added support for strscan 0.7.0 installed with Ruby 2.6.
+    * GH-142
+    * Reported by Fernando Trigoso.
+
+### Thanks
+
+  * Fernando Trigoso
+
 ## 3.2.9 - 2024-06-09 {#version-3-2-9}
 
 ### Improvements
 
   * Added support for old strscan.
     * GH-132
-    * Reported by Adam
+    * Reported by Adam.
 
   * Improved attribute value parse performance.
     * GH-135

From 0274467fdba450388a8d71edbc603b0ffbfd4de3 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 11 Jun 2024 15:11:07 +0900
Subject: [PATCH 020/101] Bump version

---
 lib/rexml/rexml.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index 3e870822..3af03ec7 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.3.0"
+  VERSION = "3.3.1"
   REVISION = ""
 
   Copyright = COPYRIGHT

From 6415113201e0ebc334ff26a585ca7fdab418351b Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Tue, 11 Jun 2024 17:38:32 +0900
Subject: [PATCH 021/101] Remove an unused class var `@@namespaces` (#144)

`@@namespaces` is defined under `REXML`, but it is never used.
At least, `rake test` passes when it is removed.

I guess the comment above `@@namespaces` is also false.
---
 lib/rexml/element.rb | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb
index bf913a82..2899759d 100644
--- a/lib/rexml/element.rb
+++ b/lib/rexml/element.rb
@@ -7,14 +7,6 @@
 require_relative "parseexception"
 
 module REXML
-  # An implementation note about namespaces:
-  # As we parse, when we find namespaces we put them in a hash and assign
-  # them a unique ID.  We then convert the namespace prefix for the node
-  # to the unique ID.  This makes namespace lookup much faster for the
-  # cost of extra memory use.  We save the namespace prefix for the
-  # context node and convert it back when we write it.
-  @@namespaces = {}
-
   # An \REXML::Element object represents an XML element.
   #
   # An element:

From b5bf109a599ea733663150e99c09eb44046b41dd Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Thu, 13 Jun 2024 15:12:32 +0900
Subject: [PATCH 022/101] Add a "malformed comment" check for top-level
 comments (#145)

This check was missing. Therefore, `REXML::Document.new("<!--")` raised
the ``undefined method `[]' for nil`` error, for example.

This PR also adds tests for "malformed comment" checks.

---------

Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
---
 lib/rexml/parsers/baseparser.rb |  9 +++-
 test/parse/test_comment.rb      | 96 +++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 test/parse/test_comment.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index eadc78f7..eae0db8b 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -228,7 +228,14 @@ def pull_event
             return process_instruction(start_position)
           elsif @source.match("<!", true)
             if @source.match("--", true)
-              return [ :comment, @source.match(/(.*?)-->/um, true)[1] ]
+              md = @source.match(/(.*?)-->/um, true)
+              if md.nil?
+                raise REXML::ParseException.new("Unclosed comment", @source)
+              end
+              if /--|-\z/.match?(md[1])
+                raise REXML::ParseException.new("Malformed comment", @source)
+              end
+              return [ :comment, md[1] ]
             elsif @source.match("DOCTYPE", true)
               base_error_message = "Malformed DOCTYPE"
               unless @source.match(/\s+/um, true)
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
new file mode 100644
index 00000000..8f143495
--- /dev/null
+++ b/test/parse/test_comment.rb
@@ -0,0 +1,96 @@
+require "test/unit"
+require "rexml/document"
+
+module REXMLTests
+  class TestParseComment < Test::Unit::TestCase
+    def parse(xml)
+      REXML::Document.new(xml)
+    end
+
+    class TestInvalid < self
+      def test_toplevel_unclosed_comment
+        exception = assert_raise(REXML::ParseException) do
+          parse("<!--")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Unclosed comment
+          Line: 1
+          Position: 4
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_toplevel_malformed_comment_inner
+        exception = assert_raise(REXML::ParseException) do
+          parse("<!-- -- -->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 11
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_toplevel_malformed_comment_end
+        exception = assert_raise(REXML::ParseException) do
+          parse("<!-- --->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 9
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_doctype_malformed_comment_inner
+        exception = assert_raise(REXML::ParseException) do
+          parse("<!DOCTYPE foo [<!-- -- -->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 26
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_doctype_malformed_comment_end
+        exception = assert_raise(REXML::ParseException) do
+          parse("<!DOCTYPE foo [<!-- --->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 24
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_after_doctype_malformed_comment_inner
+        exception = assert_raise(REXML::ParseException) do
+          parse("<a><!-- -- -->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 14
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+
+      def test_after_doctype_malformed_comment_end
+        exception = assert_raise(REXML::ParseException) do
+          parse("<a><!-- --->")
+        end
+        assert_equal(<<~DETAIL, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 12
+          Last 80 unconsumed characters:
+        DETAIL
+      end
+    end
+  end
+end

From 3b026f89b66af7a1e24fe394724e81b06b25d552 Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Thu, 13 Jun 2024 15:55:32 +0900
Subject: [PATCH 023/101] Improve `Element#attribute` implementation as 6500x
 faster (#146)

`Element#namespaces` is heavy method because this method needs to
traverse all ancestors of the element. `Element#attribute` calls
`namespaces` redundantly, so it is much slower.

This PR reduces `namespaces` calls in `Element#attribute`. Also, this PR
removes a redundant `respond_to?` because `namespaces` must return
`Hash` in the current implementation.

Below is the result of a benchmark for this on my laptop.

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/makenowjust/Projects/github.com/makenowjust/simple-dotfiles/.asdf/installs/ruby/3.3.2/bin/ruby -v -S benchmark-driver /Users/makenowjust/Projects/github.com/ruby/rexml/benchmark/attribute.yaml
ruby 3.3.2 (2024-05-30 revision e5a195edf6) [arm64-darwin23]
Calculating -------------------------------------
                     rexml 3.2.6      master  3.2.6(YJIT)  master(YJIT)
   attribute_with_ns     425.420     849.271       5.336k       10.629k i/s -      1.000k times in 2.350620s 1.177481s 0.187416s 0.094084s
attribute_without_ns     834.750      5.587M      10.656k        2.950M i/s -      1.000k times in 1.197963s 0.000179s 0.093846s 0.000339s

Comparison:
                attribute_with_ns
        master(YJIT):     10628.8 i/s
         3.2.6(YJIT):      5335.7 i/s - 1.99x  slower
              master:       849.3 i/s - 12.52x  slower
         rexml 3.2.6:       425.4 i/s - 24.98x  slower

             attribute_without_ns
              master:   5586593.2 i/s
        master(YJIT):   2949854.4 i/s - 1.89x  slower
         3.2.6(YJIT):     10655.8 i/s - 524.28x  slower
         rexml 3.2.6:       834.8 i/s - 6692.53x  slower

```

This result shows that `Element#attribute` is now 6500x faster than the
old implementation if `namespace` is not supplied.

It seems strange that it is slower when YJIT is enabled, but we believe
this is a separate issue.

Thank you.

---------

Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
---
 benchmark/attribute.yaml | 38 ++++++++++++++++++++++++++++++++++++++
 lib/rexml/element.rb     |  9 ++-------
 2 files changed, 40 insertions(+), 7 deletions(-)
 create mode 100644 benchmark/attribute.yaml

diff --git a/benchmark/attribute.yaml b/benchmark/attribute.yaml
new file mode 100644
index 00000000..5dd7fded
--- /dev/null
+++ b/benchmark/attribute.yaml
@@ -0,0 +1,38 @@
+loop_count: 1000
+contexts:
+  - gems:
+      rexml: 3.2.6
+    require: false
+    prelude: require 'rexml'
+  - name: master
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path("lib"))
+      require 'rexml'
+  - name: 3.2.6(YJIT)
+    gems:
+      rexml: 3.2.6
+    require: false
+    prelude: |
+      require 'rexml'
+      RubyVM::YJIT.enable
+  - name: master(YJIT)
+    prelude: |
+      $LOAD_PATH.unshift(File.expand_path("lib"))
+      require 'rexml'
+      RubyVM::YJIT.enable
+
+prelude: |
+  require 'rexml/document'
+
+  xml_source = "<deepest x:with_ns='foo' without_ns='bar'></deepest>"
+  100.times do
+    xml_source = "<nest>#{xml_source}</nest>"
+  end
+  xml_source = "<root xmlns:x='xyz'>#{xml_source}</root>"
+
+  document = REXML::Document.new(xml_source)
+  deepest_node = document.elements["//deepest"]
+
+benchmark:
+  with_ns:    deepest_node.attribute("with_ns", "xyz")
+  without_ns: deepest_node.attribute("without_ns")
diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb
index 2899759d..a5808d7c 100644
--- a/lib/rexml/element.rb
+++ b/lib/rexml/element.rb
@@ -1276,16 +1276,11 @@ def [](name_or_index)
     #   document.root.attribute("x", "a") # => a:x='a:x'
     #
     def attribute( name, namespace=nil )
-      prefix = nil
-      if namespaces.respond_to? :key
-        prefix = namespaces.key(namespace) if namespace
-      else
-        prefix = namespaces.index(namespace) if namespace
-      end
+      prefix = namespaces.key(namespace) if namespace
       prefix = nil if prefix == 'xmlns'
 
       ret_val =
-        attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" )
+        attributes.get_attribute( prefix ? "#{prefix}:#{name}" : name )
 
       return ret_val unless ret_val.nil?
       return nil if prefix.nil?

From 1e31ffc7c9170255c2a62773ac1e1d90c4991a9d Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Thu, 13 Jun 2024 23:29:59 +0900
Subject: [PATCH 024/101] Fix small typos (#148)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I found these typos with using
[`typos-cli`](https://github.com/crate-ci/typos).

Now, we can obtain no typo reports from the `typos` command with this
configuration (`.typos.toml`):

```toml
[files]
extend-exclude = [
  "*.svg",
  "*.xml",
]

[default.extend-words]
# Common variable names in this project.
arry = "arry"
blok = "blok"
eles = "eles"
# Incomplete words in test data.
caf = "caf"
# German words in test data.
abl = "abl" # NOTE: It is a part of "Ablüfe".
alle = "alle"
ist = "ist"
technik = "technik"
```

Thank you.

---------

Co-authored-by: Olle Jonsson <olle.jonsson@gmail.com>
---
 test/test_document.rb   | 2 +-
 test/test_light.rb      | 2 +-
 test/test_sax.rb        | 2 +-
 test/xpath/test_base.rb | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_document.rb b/test/test_document.rb
index 7fccbacb..2b0a8a73 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -237,7 +237,7 @@ def test_each_recursive
       document = REXML::Document.new(xml_source)
 
       # Node#each_recursive iterates elements only.
-      # This does not iterate XML declerations, comments, attributes, CDATA sections, etc.
+      # This does not iterate XML declarations, comments, attributes, CDATA sections, etc.
       actual_names = []
       document.each_recursive do |element|
         actual_names << element.attributes["name"]
diff --git a/test/test_light.rb b/test/test_light.rb
index 54b2c52e..c556c978 100644
--- a/test/test_light.rb
+++ b/test/test_light.rb
@@ -62,7 +62,7 @@ def test_access_child_elements
       assert_equal( 'c', a[1].name )
     end
 
-    def test_itterate_over_children
+    def test_iterate_over_children
       foo = make_small_document
       ctr = 0
       foo[0].each { ctr += 1 }
diff --git a/test/test_sax.rb b/test/test_sax.rb
index c2255bf3..8e905f2e 100644
--- a/test/test_sax.rb
+++ b/test/test_sax.rb
@@ -140,7 +140,7 @@ def test_simple_doctype_listener
 
     # test doctype with missing name, should throw ParseException
     # submitted by Jeff Barczewseki
-    def test_doctype_with_mising_name_throws_exception
+    def test_doctype_with_missing_name_throws_exception
       xml = <<~END
         <?xml version="1.0"?>
         <!DOCTYPE >
diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb
index 68b33ab7..1dacd69d 100644
--- a/test/xpath/test_base.rb
+++ b/test/xpath/test_base.rb
@@ -651,7 +651,7 @@ def test_comparisons
       source = "<a><b id='1'/><b id='2'/><b id='3'/></a>"
       doc = REXML::Document.new(source)
 
-      # NOTE TO SER: check that number() is required
+      # NOTE: check that number() is required
       assert_equal 2, REXML::XPath.match(doc, "//b[number(@id) > 1]").size
       assert_equal 3, REXML::XPath.match(doc, "//b[number(@id) >= 1]").size
       assert_equal 1, REXML::XPath.match(doc, "//b[number(@id) <= 1]").size

From d906ae2f05351ea68e5860be9b8c6e1de57dee9b Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Fri, 14 Jun 2024 06:00:13 +0900
Subject: [PATCH 025/101] Add a "Malformed comment" check for invalid comments
 such as `<!-->` (#147)

`Document.new("<a><!-->")` raises `undefined method '[]' for nil`. This
commit fixes it and adds a test for it.
---
 lib/rexml/parsers/baseparser.rb |  5 ++---
 test/parse/test_comment.rb      | 13 +++++++++++++
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index eae0db8b..272d8a6b 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -406,12 +406,11 @@ def pull_event
               if md[0][0] == ?-
                 md = @source.match(/--(.*?)-->/um, true)
 
-                case md[1]
-                when /--/, /-\z/
+                if md.nil? || /--|-\z/.match?(md[1])
                   raise REXML::ParseException.new("Malformed comment", @source)
                 end
 
-                return [ :comment, md[1] ] if md
+                return [ :comment, md[1] ]
               else
                 md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
                 return [ :cdata, md[1] ] if md
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
index 8f143495..ce6678e8 100644
--- a/test/parse/test_comment.rb
+++ b/test/parse/test_comment.rb
@@ -68,6 +68,19 @@ def test_doctype_malformed_comment_end
         DETAIL
       end
 
+      def test_after_doctype_malformed_comment_short
+        exception = assert_raise(REXML::ParseException) do
+          parse("<a><!-->")
+        end
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed comment
+          Line: 1
+          Position: 8
+          Last 80 unconsumed characters:
+          -->
+        DETAIL
+      end
+
       def test_after_doctype_malformed_comment_inner
         exception = assert_raise(REXML::ParseException) do
           parse("<a><!-- -- -->")

From f7040112601104d71d3254a0834c4932b1b68f04 Mon Sep 17 00:00:00 2001
From: Hiroya Fujinami <make.just.on@gmail.com>
Date: Wed, 19 Jun 2024 14:47:34 +0900
Subject: [PATCH 026/101] Reject unclosed DOCTYPE on parsing (#153)

Fix #152

---------

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
---
 lib/rexml/parsers/baseparser.rb              | 10 ++++-
 lib/rexml/parsers/treeparser.rb              | 23 ++++------
 test/parse/test_document_type_declaration.rb | 45 ++++++++++++++++++++
 3 files changed, 63 insertions(+), 15 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 272d8a6b..5791ab1d 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -216,7 +216,12 @@ def pull_event
           x, @closed = @closed, nil
           return [ :end_element, x ]
         end
-        return [ :end_document ] if empty?
+        if empty?
+          if @document_status == :in_doctype
+            raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
+          end
+          return [ :end_document ]
+        end
         return @stack.shift if @stack.size > 0
         #STDERR.puts @source.encoding
         #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
@@ -373,6 +378,9 @@ def pull_event
             @document_status = :after_doctype
             return [ :end_doctype ]
           end
+          if @document_status == :in_doctype
+            raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
+          end
         end
         if @document_status == :after_doctype
           @source.match(/\s*/um, true)
diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb
index bf9a4254..0cb6f7cc 100644
--- a/lib/rexml/parsers/treeparser.rb
+++ b/lib/rexml/parsers/treeparser.rb
@@ -16,7 +16,6 @@ def add_listener( listener )
 
       def parse
         tag_stack = []
-        in_doctype = false
         entities = nil
         begin
           while true
@@ -39,17 +38,15 @@ def parse
               tag_stack.pop
               @build_context = @build_context.parent
             when :text
-              if not in_doctype
-                if @build_context[-1].instance_of? Text
-                  @build_context[-1] << event[1]
-                else
-                  @build_context.add(
-                    Text.new(event[1], @build_context.whitespace, nil, true)
-                  ) unless (
-                    @build_context.ignore_whitespace_nodes and
-                    event[1].strip.size==0
-                  )
-                end
+              if @build_context[-1].instance_of? Text
+                @build_context[-1] << event[1]
+              else
+                @build_context.add(
+                  Text.new(event[1], @build_context.whitespace, nil, true)
+                ) unless (
+                  @build_context.ignore_whitespace_nodes and
+                  event[1].strip.size==0
+                )
               end
             when :comment
               c = Comment.new( event[1] )
@@ -60,14 +57,12 @@ def parse
             when :processing_instruction
               @build_context.add( Instruction.new( event[1], event[2] ) )
             when :end_doctype
-              in_doctype = false
               entities.each { |k,v| entities[k] = @build_context.entities[k].value }
               @build_context = @build_context.parent
             when :start_doctype
               doctype = DocType.new( event[1..-1], @build_context )
               @build_context = doctype
               entities = {}
-              in_doctype = true
             when :attlistdecl
               n = AttlistDecl.new( event[1..-1] )
               @build_context.add( n )
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 8faa0b78..3ca0b536 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -53,6 +53,51 @@ def test_no_name
       end
     end
 
+    class TestUnclosed < self
+      def test_no_extra_node
+        exception = assert_raise(REXML::ParseException) do
+          REXML::Document.new("<!DOCTYPE foo [")
+        end
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed DOCTYPE: unclosed
+          Line: 1
+          Position: 15
+          Last 80 unconsumed characters:
+
+        DETAIL
+      end
+
+      def test_start_element
+        exception = assert_raise(REXML::ParseException) do
+          REXML::Document.new(<<~DOCTYPE)
+            <!DOCTYPE foo [ <r>
+          DOCTYPE
+        end
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed DOCTYPE: invalid declaration
+          Line: 1
+          Position: 20
+          Last 80 unconsumed characters:
+          <r>#{' '}
+        DETAIL
+      end
+
+      def test_text
+        exception = assert_raise(REXML::ParseException) do
+          REXML::Document.new(<<~DOCTYPE)
+            <!DOCTYPE foo [ text
+          DOCTYPE
+        end
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed DOCTYPE: invalid declaration
+          Line: 1
+          Position: 21
+          Last 80 unconsumed characters:
+          text#{' '}
+        DETAIL
+      end
+    end
+
     class TestExternalID < self
       class TestSystem < self
         def test_left_bracket_in_system_literal

From 4c288086ed1869f224dc534182019c93d0317319 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sat, 22 Jun 2024 10:42:44 +0900
Subject: [PATCH 027/101] Fix a bug that a large XML can't be parsed (#154)

GitHub: fix GH-150

If a parsed XML is later than `2 ** 31 - 1`, we can't parse it. Because
`StringScanner`s position is stored as `int`. We can avoid the
restriction by dropping large parsed content.

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
---
 lib/rexml/parsers/baseparser.rb |  2 ++
 lib/rexml/source.rb             |  7 +++++++
 test/parser/test_base_parser.rb | 27 +++++++++++++++++++++++++++
 3 files changed, 36 insertions(+)
 create mode 100644 test/parser/test_base_parser.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 5791ab1d..a003ac29 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -204,6 +204,8 @@ def peek depth=0
 
       # Returns the next event.  This is a +PullEvent+ object.
       def pull
+        @source.drop_parsed_content
+
         pull_event.tap do |event|
           @listeners.each do |listener|
             listener.receive event
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 67154832..f12ee172 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -55,6 +55,7 @@ class Source
     attr_reader :encoding
 
     module Private
+      SCANNER_RESET_SIZE = 100000
       PRE_DEFINED_TERM_PATTERNS = {}
       pre_defined_terms = ["'", '"', "<"]
       pre_defined_terms.each do |term|
@@ -84,6 +85,12 @@ def buffer
       @scanner.rest
     end
 
+    def drop_parsed_content
+      if @scanner.pos > Private::SCANNER_RESET_SIZE
+        @scanner.string = @scanner.rest
+      end
+    end
+
     def buffer_encoding=(encoding)
       @scanner.string.force_encoding(encoding)
     end
diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb
new file mode 100644
index 00000000..17d01979
--- /dev/null
+++ b/test/parser/test_base_parser.rb
@@ -0,0 +1,27 @@
+# frozen_string_literal: false
+
+require 'rexml/parsers/baseparser'
+
+module REXMLTests
+  class BaseParserTester < Test::Unit::TestCase
+    def test_large_xml
+      large_text = "a" * 100_000
+      xml = <<-XML
+        <?xml version="1.0"?>
+        <root>
+          <child>#{large_text}</child>
+          <child>#{large_text}</child>
+        </root>
+      XML
+
+      parser = REXML::Parsers::BaseParser.new(xml)
+      while parser.has_next?
+        parser.pull
+      end
+
+      assert do
+        parser.position < xml.bytesize
+      end
+    end
+  end
+end

From cfa8dd90077000f21f55a6b7e5f041e2b4fd5e04 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sat, 22 Jun 2024 14:21:28 +0900
Subject: [PATCH 028/101] Don't include private_constant-ed module (#155)

Included constants are not private. So private constants in private
module aren't private.

See also: https://github.com/ruby/rexml/pull/154#discussion_r1649469269
---
 lib/rexml/parsers/baseparser.rb | 13 ++++++-------
 lib/rexml/source.rb             |  1 -
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index a003ac29..c83e7958 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -134,7 +134,6 @@ module Private
         ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
       end
       private_constant :Private
-      include Private
 
       def initialize( source )
         self.stream = source
@@ -302,7 +301,7 @@ def pull_event
               raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
               return [ :elementdecl, "<!ELEMENT" + md[1] ]
             elsif @source.match("ENTITY", true)
-              match = [:entitydecl, *@source.match(ENTITYDECL_PATTERN, true).captures.compact]
+              match = [:entitydecl, *@source.match(Private::ENTITYDECL_PATTERN, true).captures.compact]
               ref = false
               if match[1] == '%'
                 ref = true
@@ -328,7 +327,7 @@ def pull_event
               match << '%' if ref
               return match
             elsif @source.match("ATTLIST", true)
-              md = @source.match(ATTLISTDECL_END, true)
+              md = @source.match(Private::ATTLISTDECL_END, true)
               raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
               element = md[1]
               contents = md[0]
@@ -397,7 +396,7 @@ def pull_event
             if @source.match("/", true)
               @nsstack.shift
               last_tag = @tags.pop
-              md = @source.match(CLOSE_PATTERN, true)
+              md = @source.match(Private::CLOSE_PATTERN, true)
               if md and !last_tag
                 message = "Unexpected top-level end tag (got '#{md[1]}')"
                 raise REXML::ParseException.new(message, @source)
@@ -431,7 +430,7 @@ def pull_event
               return process_instruction(start_position)
             else
               # Get the next tag
-              md = @source.match(TAG_PATTERN, true)
+              md = @source.match(Private::TAG_PATTERN, true)
               unless md
                 @source.position = start_position
                 raise REXML::ParseException.new("malformed XML: missing tag start", @source)
@@ -539,7 +538,7 @@ def need_source_encoding_update?(xml_declaration_encoding)
       end
 
       def parse_name(base_error_message)
-        md = @source.match(NAME_PATTERN, true)
+        md = @source.match(Private::NAME_PATTERN, true)
         unless md
           if @source.match(/\s*\S/um)
             message = "#{base_error_message}: invalid name"
@@ -618,7 +617,7 @@ def parse_id_invalid_details(accept_external_id:,
       end
 
       def process_instruction(start_position)
-        match_data = @source.match(INSTRUCTION_END, true)
+        match_data = @source.match(Private::INSTRUCTION_END, true)
         unless match_data
           message = "Invalid processing instruction node"
           @source.position = start_position
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index f12ee172..5715c352 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -63,7 +63,6 @@ module Private
       end
     end
     private_constant :Private
-    include Private
 
     # Constructor
     # @param arg must be a String, and should be a valid XML document

From 22d206a657c9923b19fa3e3c7451800f875ba31d Mon Sep 17 00:00:00 2001
From: fynsta <63241108+fynsta@users.noreply.github.com>
Date: Sun, 23 Jun 2024 00:42:36 +0200
Subject: [PATCH 029/101] Add changelog_uri to gemspec (#156)

Supported here:
https://guides.rubygems.org/specification-reference/#metadata

Useful for running https://github.com/MaximeD/gem_updater
---
 rexml.gemspec | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/rexml.gemspec b/rexml.gemspec
index 169e49dc..0de3e845 100644
--- a/rexml.gemspec
+++ b/rexml.gemspec
@@ -16,6 +16,10 @@ Gem::Specification.new do |spec|
   spec.homepage      = "https://github.com/ruby/rexml"
   spec.license       = "BSD-2-Clause"
 
+  spec.metadata = {
+    "changelog_uri" => "#{spec.homepage}/releases/tag/v#{spec.version}"
+  }
+
   files = [
     "LICENSE.txt",
     "NEWS.md",

From e6e07f27c27a8b0955b61ee43ef73a5c283ad038 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sun, 23 Jun 2024 20:50:25 +0900
Subject: [PATCH 030/101] Reuse of Set.new at prefixes variables (#157)

## Why?
`Set.new()` instances of the prefixes variable can be reused, reducing
initialization costs.

## Result
```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     17.714      17.658        32.898       33.247 i/s -     100.000 times in 5.645176s 5.663160s 3.039707s 3.007755s
                 sax     25.280      25.281        47.483       49.990 i/s -     100.000 times in 3.955694s 3.955534s 2.106006s 2.000389s
                pull     29.048      29.061        59.944       61.498 i/s -     100.000 times in 3.442599s 3.441014s 1.668222s 1.626060s
              stream     28.181      28.440        52.340       55.078 i/s -     100.000 times in 3.548546s 3.516169s 1.910599s 1.815599s

Comparison:
                              dom
         after(YJIT):        33.2 i/s
        before(YJIT):        32.9 i/s - 1.01x  slower
              before:        17.7 i/s - 1.88x  slower
               after:        17.7 i/s - 1.88x  slower

                              sax
         after(YJIT):        50.0 i/s
        before(YJIT):        47.5 i/s - 1.05x  slower
               after:        25.3 i/s - 1.98x  slower
              before:        25.3 i/s - 1.98x  slower

                             pull
         after(YJIT):        61.5 i/s
        before(YJIT):        59.9 i/s - 1.03x  slower
               after:        29.1 i/s - 2.12x  slower
              before:        29.0 i/s - 2.12x  slower

                           stream
         after(YJIT):        55.1 i/s
        before(YJIT):        52.3 i/s - 1.05x  slower
               after:        28.4 i/s - 1.94x  slower
              before:        28.2 i/s - 1.95x  slower

```

YJIT=ON : 1.01x - 1.05x faster
YJIT=OFF : 0.99x - 1.00x faster
---
 lib/rexml/parsers/baseparser.rb | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index c83e7958..2f068e0c 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -138,6 +138,7 @@ module Private
       def initialize( source )
         self.stream = source
         @listeners = []
+        @prefixes = Set.new
       end
 
       def add_listener( listener )
@@ -253,7 +254,7 @@ def pull_event
                 @source.position = start_position
                 raise REXML::ParseException.new(message, @source)
               end
-              @nsstack.unshift(curr_ns=Set.new)
+              @nsstack.unshift(Set.new)
               name = parse_name(base_error_message)
               if @source.match(/\s*\[/um, true)
                 id = [nil, nil, nil]
@@ -437,12 +438,12 @@ def pull_event
               end
               tag = md[1]
               @document_status = :in_element
-              prefixes = Set.new
-              prefixes << md[2] if md[2]
+              @prefixes.clear
+              @prefixes << md[2] if md[2]
               @nsstack.unshift(curr_ns=Set.new)
-              attributes, closed = parse_attributes(prefixes, curr_ns)
+              attributes, closed = parse_attributes(@prefixes, curr_ns)
               # Verify that all of the prefixes have been defined
-              for prefix in prefixes
+              for prefix in @prefixes
                 unless @nsstack.find{|k| k.member?(prefix)}
                   raise UndefinedNamespaceException.new(prefix,@source,self)
                 end

From a579730f25ec7443796495541ec57c071b91805d Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Tue, 25 Jun 2024 09:07:11 +0900
Subject: [PATCH 031/101] Optimize BaseParser#unnormalize method (#158)

## Benchmark
```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     17.704      18.106        34.215       33.806 i/s -     100.000 times in 5.648398s 5.523110s 2.922698s 2.958036s
                 sax     25.664      25.302        48.429       48.602 i/s -     100.000 times in 3.896488s 3.952289s 2.064859s 2.057537s
                pull     28.966      29.215        61.710       62.068 i/s -     100.000 times in 3.452275s 3.422901s 1.620480s 1.611129s
              stream     28.291      28.426        53.860       55.548 i/s -     100.000 times in 3.534716s 3.517884s 1.856667s 1.800247s

Comparison:
                              dom
        before(YJIT):        34.2 i/s
         after(YJIT):        33.8 i/s - 1.01x  slower
               after:        18.1 i/s - 1.89x  slower
              before:        17.7 i/s - 1.93x  slower

                              sax
         after(YJIT):        48.6 i/s
        before(YJIT):        48.4 i/s - 1.00x  slower
              before:        25.7 i/s - 1.89x  slower
               after:        25.3 i/s - 1.92x  slower

                             pull
         after(YJIT):        62.1 i/s
        before(YJIT):        61.7 i/s - 1.01x  slower
               after:        29.2 i/s - 2.12x  slower
              before:        29.0 i/s - 2.14x  slower

                           stream
         after(YJIT):        55.5 i/s
        before(YJIT):        53.9 i/s - 1.03x  slower
               after:        28.4 i/s - 1.95x  slower
              before:        28.3 i/s - 1.96x  slower

```

- YJIT=ON : 1.00x - 1.03x faster
- YJIT=OFF : 0.98x - 1.02x faster
---
 lib/rexml/parsers/baseparser.rb | 15 +++++++++++----
 test/test_pullparser.rb         | 20 ++++++++++++++++++++
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 2f068e0c..275372ee 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -132,6 +132,13 @@ module Private
         GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
         PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
         ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
+        CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
+        CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
+        DEFAULT_ENTITIES_PATTERNS = {}
+        default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
+        default_entities.each do |term|
+          DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
+        end
       end
       private_constant :Private
 
@@ -504,10 +511,10 @@ def normalize( input, entities=nil, entity_filter=nil )
 
       # Unescapes all possible entities
       def unnormalize( string, entities=nil, filter=nil )
-        rv = string.gsub( /\r\n?/, "\n" )
+        rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
         matches = rv.scan( REFERENCE_RE )
         return rv if matches.size == 0
-        rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
+        rv.gsub!( Private::CHARACTER_REFERENCES ) {
           m=$1
           m = "0#{m}" if m[0] == ?x
           [Integer(m)].pack('U*')
@@ -518,7 +525,7 @@ def unnormalize( string, entities=nil, filter=nil )
             unless filter and filter.include?(entity_reference)
               entity_value = entity( entity_reference, entities )
               if entity_value
-                re = /&#{entity_reference};/
+                re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
                 rv.gsub!( re, entity_value )
               else
                 er = DEFAULT_ENTITIES[entity_reference]
@@ -526,7 +533,7 @@ def unnormalize( string, entities=nil, filter=nil )
               end
             end
           end
-          rv.gsub!( /&amp;/, '&' )
+          rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
         end
         rv
       end
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index 53a985ba..b6a48c93 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -62,6 +62,26 @@ def test_entity_replacement
       end
     end
 
+    def test_character_references
+      source = '<a>&#65;</a><b>&#x42;</b>'
+      parser = REXML::Parsers::PullParser.new( source )
+      element_name = ''
+      while parser.has_next?
+        event = parser.pull
+        case event.event_type
+        when :start_element
+          element_name = event[0]
+        when :text
+          case element_name
+          when 'a'
+            assert_equal('A', event[1])
+          when 'b'
+            assert_equal('B', event[1])
+          end
+        end
+      end
+    end
+
     def test_peek_unshift
       source = "<a><b/></a>"
       REXML::Parsers::PullParser.new(source)

From 20017eea807e8fa386aa5c79ae779004d8b366dd Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 25 Jun 2024 11:26:33 +0900
Subject: [PATCH 032/101] Add 3.3.1 entry

---
 NEWS.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index c8e9ecc0..3e406574 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,52 @@
 # News
 
+## 3.3.1 - 2024-06-25 {#version-3-3-1}
+
+### Improvements
+
+  * Added support for detecting malformed top-level comments.
+    * GH-145
+    * Patch by Hiroya Fujinami.
+
+  * Improved `REXML::Element#attribute` performance.
+    * GH-146
+    * Patch by Hiroya Fujinami.
+
+  * Added support for detecting malformed `<!-->` comments.
+    * GH-147
+    * Patch by Hiroya Fujinami.
+
+  * Added support for detecting unclosed `DOCTYPE`.
+    * GH-152
+    * Patch by Hiroya Fujinami.
+
+  * Added `changlog_uri` metadata to gemspec.
+    * GH-156
+    * Patch by fynsta.
+
+  * Improved parse performance.
+    * GH-157
+    * GH-158
+    * Patch by NAITOH Jun.
+
+### Fixes
+
+  * Fixed a bug that large XML can't be parsed.
+    * GH-154
+    * Patch by NAITOH Jun.
+
+  * Fixed a bug that private constants are visible.
+    * GH-155
+    * Patch by NAITOH Jun.
+
+### Thanks
+
+  * Hiroya Fujinami
+
+  * NAITOH Jun
+
+  * fynsta
+
 ## 3.3.0 - 2024-06-11 {#version-3-3-0}
 
 ### Improvements

From 78b29137bf1ee46e7cf028f52cfa16f6e2578cfd Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 25 Jun 2024 11:27:12 +0900
Subject: [PATCH 033/101] Bump version

---
 lib/rexml/rexml.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index 3af03ec7..573d0a13 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.3.1"
+  VERSION = "3.3.2"
   REVISION = ""
 
   Copyright = COPYRIGHT

From face9dd1fdde20351316c6c3b8090a65cd490305 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Thu, 27 Jun 2024 06:43:12 +0900
Subject: [PATCH 034/101] Optimize BaseParser#unnormalize method to replace
 "\r\n" with "\n" only when "\r\n" is included (#160)

## Why?

See: https://github.com/ruby/rexml/pull/158#issuecomment-2187663068

## Benchmark

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     17.674      17.567        32.759       32.316 i/s -     100.000 times in 5.657973s 5.692371s 3.052595s 3.094448s
                 sax     25.261      25.377        48.889       49.911 i/s -     100.000 times in 3.958626s 3.940640s 2.045460s 2.003575s
                pull     28.968      29.121        61.584       61.774 i/s -     100.000 times in 3.452132s 3.433967s 1.623789s 1.618809s
              stream     28.395      28.803        55.289       57.970 i/s -     100.000 times in 3.521761s 3.471812s 1.808673s 1.725029s

Comparison:
                              dom
        before(YJIT):        32.8 i/s
         after(YJIT):        32.3 i/s - 1.01x  slower
              before:        17.7 i/s - 1.85x  slower
               after:        17.6 i/s - 1.86x  slower

                              sax
         after(YJIT):        49.9 i/s
        before(YJIT):        48.9 i/s - 1.02x  slower
               after:        25.4 i/s - 1.97x  slower
              before:        25.3 i/s - 1.98x  slower

                             pull
         after(YJIT):        61.8 i/s
        before(YJIT):        61.6 i/s - 1.00x  slower
               after:        29.1 i/s - 2.12x  slower
              before:        29.0 i/s - 2.13x  slower

                           stream
         after(YJIT):        58.0 i/s
        before(YJIT):        55.3 i/s - 1.05x  slower
               after:        28.8 i/s - 2.01x  slower
              before:        28.4 i/s - 2.04x  slower

```

- YJIT=ON : 0.98x - 1.05x faster
- YJIT=OFF : 0.98x - 1.02x faster

---------

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
---
 lib/rexml/parsers/baseparser.rb |  6 +++++-
 test/test_pullparser.rb         | 21 +++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 275372ee..02759e70 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -511,7 +511,11 @@ def normalize( input, entities=nil, entity_filter=nil )
 
       # Unescapes all possible entities
       def unnormalize( string, entities=nil, filter=nil )
-        rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
+        if string.include?("\r")
+          rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
+        else
+          rv = string.dup
+        end
         matches = rv.scan( REFERENCE_RE )
         return rv if matches.size == 0
         rv.gsub!( Private::CHARACTER_REFERENCES ) {
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index b6a48c93..073d896d 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -82,6 +82,27 @@ def test_character_references
       end
     end
 
+    def test_text_content_with_line_breaks
+      source = "<root><a>A</a><b>B\n</b><c>C\r\n</c></root>"
+      parser = REXML::Parsers::PullParser.new( source )
+
+      events = {}
+      element_name = ''
+      while parser.has_next?
+        event = parser.pull
+        case event.event_type
+        when :start_element
+          element_name = event[0]
+        when :text
+          events[element_name] = event[1]
+        end
+      end
+
+      assert_equal('A', events['a'])
+      assert_equal("B\n", events['b'])
+      assert_equal("C\n", events['c'])
+    end
+
     def test_peek_unshift
       source = "<a><b/></a>"
       REXML::Parsers::PullParser.new(source)

From eb45c8dcca962c04e56f46b0040b2c33278ca3f9 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Mon, 8 Jul 2024 05:52:19 +0900
Subject: [PATCH 035/101] fix: Extra content at the end of the document (#161)

## Why?

XML with additional content at the end of the document is invalid.

https://www.w3.org/TR/2006/REC-xml11-20060816/#document

```
[1]   document   ::=   ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc

```
[27]    Misc       ::=          Comment | PI | S
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI

```
[16]    PI         ::=          '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget

```
[17]    PITarget           ::=          Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
```
---
 lib/rexml/parsers/baseparser.rb           |  9 ++++++
 test/parse/test_comment.rb                | 12 ++++++++
 test/parse/test_element.rb                | 34 +++++++++++++++++++++++
 test/parse/test_processing_instruction.rb | 12 ++++++++
 test/parse/test_text.rb                   | 25 +++++++++++++++++
 test/test_pullparser.rb                   | 14 +++++-----
 6 files changed, 99 insertions(+), 7 deletions(-)
 create mode 100644 test/parse/test_text.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 02759e70..900c19cc 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -460,8 +460,12 @@ def pull_event
                 @closed = tag
                 @nsstack.shift
               else
+                if @tags.empty? and @have_root
+                  raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
+                end
                 @tags.push( tag )
               end
+              @have_root = true
               return [ :start_element, tag, attributes ]
             end
           else
@@ -469,6 +473,11 @@ def pull_event
             if text.chomp!("<")
               @source.position -= "<".bytesize
             end
+            if @tags.empty? and @have_root
+              unless /\A\s*\z/.match?(text)
+                raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
+              end
+            end
             return [ :text, text ]
           end
         rescue REXML::UndefinedNamespaceException
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
index ce6678e8..46a07409 100644
--- a/test/parse/test_comment.rb
+++ b/test/parse/test_comment.rb
@@ -105,5 +105,17 @@ def test_after_doctype_malformed_comment_end
         DETAIL
       end
     end
+
+    def test_after_root
+      parser = REXML::Parsers::BaseParser.new('<a></a><!-- ok comment -->')
+
+      events = {}
+      while parser.has_next?
+        event = parser.pull
+        events[event[0]] = event[1]
+      end
+
+      assert_equal(" ok comment ", events[:comment])
+    end
   end
 end
diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb
index 14d0703a..a65cfa85 100644
--- a/test/parse/test_element.rb
+++ b/test/parse/test_element.rb
@@ -85,6 +85,40 @@ def test_garbage_less_than_slash_before_end_tag_at_line_start
 </ </x>
         DETAIL
       end
+
+      def test_after_root
+        exception = assert_raise(REXML::ParseException) do
+          parser = REXML::Parsers::BaseParser.new('<a></a><b>')
+          while parser.has_next?
+            parser.pull
+          end
+        end
+
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed XML: Extra tag at the end of the document (got '<b')
+          Line: 1
+          Position: 10
+          Last 80 unconsumed characters:
+
+        DETAIL
+      end
+
+      def test_after_empty_element_tag_root
+        exception = assert_raise(REXML::ParseException) do
+          parser = REXML::Parsers::BaseParser.new('<a/><b>')
+          while parser.has_next?
+            parser.pull
+          end
+        end
+
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed XML: Extra tag at the end of the document (got '<b')
+          Line: 1
+          Position: 7
+          Last 80 unconsumed characters:
+
+        DETAIL
+      end
     end
   end
 end
diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index f0c0c24e..40dadd11 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -40,5 +40,17 @@ def test_garbage_text
                      ])
       end
     end
+
+    def test_after_root
+      parser = REXML::Parsers::BaseParser.new('<a></a><?abc version="1.0" ?>')
+
+      events = {}
+      while parser.has_next?
+        event = parser.pull
+        events[event[0]] = event[1]
+      end
+
+      assert_equal("abc", events[:processing_instruction])
+    end
   end
 end
diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb
new file mode 100644
index 00000000..f1622b71
--- /dev/null
+++ b/test/parse/test_text.rb
@@ -0,0 +1,25 @@
+require "test/unit"
+require 'rexml/parsers/baseparser'
+
+module REXMLTests
+  class TestParseText < Test::Unit::TestCase
+    class TestInvalid < self
+      def test_after_root
+        exception = assert_raise(REXML::ParseException) do
+          parser = REXML::Parsers::BaseParser.new('<a></a>c')
+          while parser.has_next?
+            parser.pull
+          end
+        end
+
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed XML: Extra content at the end of the document (got 'c')
+          Line: 1
+          Position: 8
+          Last 80 unconsumed characters:
+
+        DETAIL
+      end
+    end
+  end
+end
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index 073d896d..0aca46be 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -63,8 +63,10 @@ def test_entity_replacement
     end
 
     def test_character_references
-      source = '<a>&#65;</a><b>&#x42;</b>'
+      source = '<root><a>&#65;</a><b>&#x42;</b></root>'
       parser = REXML::Parsers::PullParser.new( source )
+
+      events = {}
       element_name = ''
       while parser.has_next?
         event = parser.pull
@@ -72,14 +74,12 @@ def test_character_references
         when :start_element
           element_name = event[0]
         when :text
-          case element_name
-          when 'a'
-            assert_equal('A', event[1])
-          when 'b'
-            assert_equal('B', event[1])
-          end
+          events[element_name] = event[1]
         end
       end
+
+      assert_equal('A', events['a'])
+      assert_equal("B", events['b'])
     end
 
     def test_text_content_with_line_breaks

From ebc3e85bfa2796fb4922c1932760bec8390ff87c Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Mon, 8 Jul 2024 05:54:06 +0900
Subject: [PATCH 036/101] Add position check for XML declaration (#162)

## Why?
XML declaration must be the first item.

https://www.w3.org/TR/2006/REC-xml11-20060816/#document

```
[1]   document   ::=   ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog

```
[22]   prolog   ::=   	XMLDecl Misc* (doctypedecl Misc*)?
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl

```
[23]   XMLDecl  ::=   '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
```

See: https://github.com/ruby/rexml/pull/161#discussion_r1666118193
---
 lib/rexml/parsers/baseparser.rb           |  5 ++++-
 test/parse/test_processing_instruction.rb | 17 +++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 900c19cc..2a448e13 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -644,7 +644,10 @@ def process_instruction(start_position)
           @source.position = start_position
           raise REXML::ParseException.new(message, @source)
         end
-        if @document_status.nil? and match_data[1] == "xml"
+        if match_data[1] == "xml"
+          if @document_status
+            raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
+          end
           content = match_data[2]
           version = VERSION.match(content)
           version = version[1] unless version.nil?
diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index 40dadd11..13384935 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -39,6 +39,23 @@ def test_garbage_text
                        pi.content,
                      ])
       end
+
+      def test_xml_declaration_not_at_document_start
+        exception = assert_raise(REXML::ParseException) do
+          parser = REXML::Parsers::BaseParser.new('<a><?xml version="1.0" ?></a>')
+          while parser.has_next?
+            parser.pull
+          end
+        end
+
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed XML: XML declaration is not at the start
+          Line: 1
+          Position: 25
+          Last 80 unconsumed characters:
+
+        DETAIL
+      end
     end
 
     def test_after_root

From b2ec329dc1dc7635b224a6d61687c24b1e1db6fd Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Wed, 10 Jul 2024 09:50:12 +0900
Subject: [PATCH 037/101] test: move an attribute value test to
 parse/test_element.rb

---
 test/parse/test_element.rb | 11 +++++++++++
 test/test_document.rb      | 11 -----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb
index a65cfa85..261f25c3 100644
--- a/test/parse/test_element.rb
+++ b/test/parse/test_element.rb
@@ -1,8 +1,12 @@
 require "test/unit"
+require "core_assertions"
+
 require "rexml/document"
 
 module REXMLTests
   class TestParseElement < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
     def parse(xml)
       REXML::Document.new(xml)
     end
@@ -120,5 +124,12 @@ def test_after_empty_element_tag_root
         DETAIL
       end
     end
+
+    def test_gt_linear_performance_attribute_value
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<test testing="' + ">" * n + '"></test>')
+      end
+    end
   end
 end
diff --git a/test/test_document.rb b/test/test_document.rb
index 2b0a8a73..ec0e8a5a 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -1,12 +1,8 @@
 # -*- coding: utf-8 -*-
 # frozen_string_literal: false
 
-require 'core_assertions'
-
 module REXMLTests
   class TestDocument < Test::Unit::TestCase
-    include Test::Unit::CoreAssertions
-
     def test_version_attributes_to_s
       doc = REXML::Document.new(<<~eoxml)
         <?xml version="1.0" encoding="UTF-8" standalone="no"?>
@@ -202,13 +198,6 @@ def test_xml_declaration_standalone
       assert_equal('no', doc.stand_alone?, bug2539)
     end
 
-    def test_gt_linear_performance_attribute_value
-      seq = [10000, 50000, 100000, 150000, 200000]
-      assert_linear_performance(seq, rehearsal: 10) do |n|
-        REXML::Document.new('<test testing="' + ">" * n + '"></test>')
-      end
-    end
-
     def test_each_recursive
       xml_source = <<~XML
         <?xml version="1.0" encoding="UTF-8" standalone="yes"?>

From 5e140edc3051741691e00bf96fa5119b44288a42 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Thu, 11 Jul 2024 09:49:56 +0900
Subject: [PATCH 038/101] Stop adding extra new line after XML declaration with
 pretty format (#164)

If the XML file does not end with a newline, a space is added to the end
of the first line.

```ruby
Failure: test_indent(REXMLTests::TestDocument::WriteTest::ArgumentsTest)
/Users/naitoh/ghq/github.com/naitoh/rexml/test/test_document.rb:270:in `test_indent'
     267:           output = ""
     268:           indent = 2
     269:           @document.write(output, indent)
  => 270:           assert_equal(<<-EOX.chomp, output)
     271: <?xml version='1.0' encoding='UTF-8'?>
     272: <message>
     273:   Hello world!
<"<?xml version='1.0' encoding='UTF-8'?>\n" +
"<message>\n" +
"  Hello world!\n" +
"</message>"> expected but was
<"<?xml version='1.0' encoding='UTF-8'?> \n" +
"<message>\n" +
"  Hello world!\n" +
"</message>">

diff:
? <?xml version='1.0' encoding='UTF-8'?>
  <message>
    Hello world!
  </message>

```

This is happen because `REXML::Formatters::Pretty#write_document` has a
logic that depends on the last text node.

We should ignore all top-level text nodes with pretty format.
---
 lib/rexml/formatters/pretty.rb |  2 +-
 test/test_document.rb          | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/rexml/formatters/pretty.rb b/lib/rexml/formatters/pretty.rb
index a1198b7a..a838d835 100644
--- a/lib/rexml/formatters/pretty.rb
+++ b/lib/rexml/formatters/pretty.rb
@@ -111,7 +111,7 @@ def write_document( node, output )
         # itself, then we don't need a carriage return... which makes this
         # logic more complex.
         node.children.each { |child|
-          next if child == node.children[-1] and child.instance_of?(Text)
+          next if child.instance_of?(Text)
           unless child == node.children[0] or child.instance_of?(Text) or
             (child == node.children[1] and !node.children[0].writethis)
             output << "\n"
diff --git a/test/test_document.rb b/test/test_document.rb
index ec0e8a5a..9cd77c4e 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -236,7 +236,7 @@ def test_each_recursive
 
     class WriteTest < Test::Unit::TestCase
       def setup
-        @document = REXML::Document.new(<<-EOX)
+        @document = REXML::Document.new(<<-EOX.chomp)
 <?xml version="1.0" encoding="UTF-8"?>
 <message>Hello world!</message>
 EOX
@@ -246,7 +246,7 @@ class ArgumentsTest < self
         def test_output
           output = ""
           @document.write(output)
-          assert_equal(<<-EOX, output)
+          assert_equal(<<-EOX.chomp, output)
 <?xml version='1.0' encoding='UTF-8'?>
 <message>Hello world!</message>
 EOX
@@ -269,7 +269,7 @@ def test_transitive
           indent = 2
           transitive = true
           @document.write(output, indent, transitive)
-          assert_equal(<<-EOX, output)
+          assert_equal(<<-EOX.chomp, output)
 <?xml version='1.0' encoding='UTF-8'?>
 <message
 >Hello world!</message
@@ -298,7 +298,7 @@ def test_encoding
           japanese_text = "こんにちは"
           @document.root.text = japanese_text
           @document.write(output, indent, transitive, ie_hack, encoding)
-          assert_equal(<<-EOX.encode(encoding), output)
+          assert_equal(<<-EOX.chomp.encode(encoding), output)
 <?xml version='1.0' encoding='SHIFT_JIS'?>
 <message>#{japanese_text}</message>
 EOX
@@ -309,7 +309,7 @@ class OptionsTest < self
         def test_output
           output = ""
           @document.write(:output => output)
-          assert_equal(<<-EOX, output)
+          assert_equal(<<-EOX.chomp, output)
 <?xml version='1.0' encoding='UTF-8'?>
 <message>Hello world!</message>
 EOX
@@ -329,7 +329,7 @@ def test_indent
         def test_transitive
           output = ""
           @document.write(:output => output, :indent => 2, :transitive => true)
-          assert_equal(<<-EOX, output)
+          assert_equal(<<-EOX.chomp, output)
 <?xml version='1.0' encoding='UTF-8'?>
 <message
 >Hello world!</message
@@ -351,7 +351,7 @@ def test_encoding
           japanese_text = "こんにちは"
           @document.root.text = japanese_text
           @document.write(:output => output, :encoding => encoding)
-          assert_equal(<<-EOX.encode(encoding), output)
+          assert_equal(<<-EOX.chomp.encode(encoding), output)
 <?xml version='1.0' encoding='SHIFT_JIS'?>
 <message>#{japanese_text}</message>
 EOX

From 6d6400cdc03b612c3a3181b9055af87d3d2ddc68 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Thu, 11 Jul 2024 12:13:44 +0900
Subject: [PATCH 039/101] Add tests for REXML::Text.check (#165)

This patch will add missing REXML::Text.check tests.

This is the tests for the part that is checked using a regular
expression:

https://github.com/ruby/rexml/blob/b2ec329dc1dc7635b224a6d61687c24b1e1db6fd/lib/rexml/text.rb#L155-L172
---
 test/test_text_check.rb | 92 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 test/test_text_check.rb

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
new file mode 100644
index 00000000..d4076edf
--- /dev/null
+++ b/test/test_text_check.rb
@@ -0,0 +1,92 @@
+# frozen_string_literal: false
+
+module REXMLTests
+  class TextCheckTester < Test::Unit::TestCase
+
+    def check(string)
+      REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil)
+    end
+
+    def assert_check(string)
+      assert_nothing_raised { check(string) }
+    end
+
+    def assert_check_failed(string, illegal_part)
+      message = "Illegal character #{illegal_part.inspect} in raw string #{string.inspect}"
+      assert_raise(RuntimeError.new(message)) do
+        check(string)
+      end
+    end
+
+    class TestValid < self
+      def test_entity_name_start_char_colon
+        assert_check('&:;')
+      end
+
+      def test_entity_name_start_char_under_score
+        assert_check('&_;')
+      end
+
+      def test_entity_name_mix
+        assert_check('&A.b-0123;')
+      end
+
+      def test_character_reference_decimal
+        assert_check('&#0162;')
+      end
+
+      def test_character_reference_hex
+        assert_check('&#x10FFFF;')
+      end
+
+      def test_entity_name_non_ascii
+        # U+3042 HIRAGANA LETTER A
+        # U+3044 HIRAGANA LETTER I
+        assert_check("&\u3042\u3044;")
+      end
+
+      def test_normal_string
+        assert_check("foo")
+      end
+    end
+
+    class TestInvalid < self
+      def test_lt
+        assert_check_failed('<;', '<')
+      end
+
+      def test_lt_mix
+        assert_check_failed('ab<cd', '<')
+      end
+
+      def test_entity_reference_missing_colon
+        assert_check_failed('&amp', '&')
+      end
+
+      def test_character_reference_decimal_invalid_value
+        # U+0008 BACKSPACE
+        assert_check_failed('&#8;', '&#8;')
+      end
+
+      def test_character_reference_format_hex_0x
+        # U+0041 LATIN CAPITAL LETTER A
+        assert_check_failed('&#0x41;', '&#0x41;')
+      end
+
+      def test_character_reference_format_hex_00x
+        # U+0041 LATIN CAPITAL LETTER A
+        assert_check_failed('&#00x41;', '&#00x41;')
+      end
+
+      def test_character_reference_hex_surrogate_block
+        # U+0D800 SURROGATE PAIR
+        assert_check_failed('&#xD800;', '&#xD800;')
+      end
+
+      def test_entity_name_non_ascii_symbol
+        # U+00BF INVERTED QUESTION MARK
+        assert_check_failed('&\u00BF;', '&')
+      end
+    end
+  end
+end

From 87f7cd0f492dbe4c0963d41e0fbec8f4d29d357a Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Thu, 11 Jul 2024 18:44:54 +0900
Subject: [PATCH 040/101] Fix test for Text.check (#166)

This patch will fix incorrect string in a case where unicode characters.
Because of the use of single quotes, it was simply an ASCII string.
---
 test/test_text_check.rb | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index d4076edf..56d00440 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -20,23 +20,23 @@ def assert_check_failed(string, illegal_part)
 
     class TestValid < self
       def test_entity_name_start_char_colon
-        assert_check('&:;')
+        assert_check("&:;")
       end
 
       def test_entity_name_start_char_under_score
-        assert_check('&_;')
+        assert_check("&_;")
       end
 
       def test_entity_name_mix
-        assert_check('&A.b-0123;')
+        assert_check("&A.b-0123;")
       end
 
       def test_character_reference_decimal
-        assert_check('&#0162;')
+        assert_check("&#0162;")
       end
 
       def test_character_reference_hex
-        assert_check('&#x10FFFF;')
+        assert_check("&#x10FFFF;")
       end
 
       def test_entity_name_non_ascii
@@ -52,40 +52,40 @@ def test_normal_string
 
     class TestInvalid < self
       def test_lt
-        assert_check_failed('<;', '<')
+        assert_check_failed("<;", "<")
       end
 
       def test_lt_mix
-        assert_check_failed('ab<cd', '<')
+        assert_check_failed("ab<cd", "<")
       end
 
       def test_entity_reference_missing_colon
-        assert_check_failed('&amp', '&')
+        assert_check_failed("&amp", "&")
       end
 
       def test_character_reference_decimal_invalid_value
         # U+0008 BACKSPACE
-        assert_check_failed('&#8;', '&#8;')
+        assert_check_failed("&#8;", "&#8;")
       end
 
       def test_character_reference_format_hex_0x
         # U+0041 LATIN CAPITAL LETTER A
-        assert_check_failed('&#0x41;', '&#0x41;')
+        assert_check_failed("&#0x41;", "&#0x41;")
       end
 
       def test_character_reference_format_hex_00x
         # U+0041 LATIN CAPITAL LETTER A
-        assert_check_failed('&#00x41;', '&#00x41;')
+        assert_check_failed("&#00x41;", "&#00x41;")
       end
 
       def test_character_reference_hex_surrogate_block
         # U+0D800 SURROGATE PAIR
-        assert_check_failed('&#xD800;', '&#xD800;')
+        assert_check_failed("&#xD800;", "&#xD800;")
       end
 
       def test_entity_name_non_ascii_symbol
         # U+00BF INVERTED QUESTION MARK
-        assert_check_failed('&\u00BF;', '&')
+        assert_check_failed("&\u00BF;", "&")
       end
     end
   end

From 18214a6460b6e6ba583acec656634edda50b2768 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 11 Jul 2024 20:52:09 +0900
Subject: [PATCH 041/101] test Text.check: add empty reference case

---
 test/test_text_check.rb | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index 56d00440..08cacbdb 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -59,6 +59,10 @@ def test_lt_mix
         assert_check_failed("ab<cd", "<")
       end
 
+      def test_reference_empty
+        assert_check_failed("&;", "&")
+      end
+
       def test_entity_reference_missing_colon
         assert_check_failed("&amp", "&")
       end

From 0832562f31375d33077f6a737cee2e3df8424b28 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 11 Jul 2024 21:00:43 +0900
Subject: [PATCH 042/101] test Text.check: add garbage at the end in character
 reference cases

---
 test/test_text_check.rb | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index 08cacbdb..b2eebe92 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -67,6 +67,11 @@ def test_entity_reference_missing_colon
         assert_check_failed("&amp", "&")
       end
 
+      def test_character_reference_decimal_garbage_at_the_end
+        # U+0030 DIGIT ZERO
+        assert_check_failed("&#48x;", "&")
+      end
+
       def test_character_reference_decimal_invalid_value
         # U+0008 BACKSPACE
         assert_check_failed("&#8;", "&#8;")
@@ -82,6 +87,11 @@ def test_character_reference_format_hex_00x
         assert_check_failed("&#00x41;", "&#00x41;")
       end
 
+      def test_character_reference_hex_garbage_at_the_end
+        # U+0030 DIGIT ZERO
+        assert_check_failed("&#x48x;", "&")
+      end
+
       def test_character_reference_hex_surrogate_block
         # U+0D800 SURROGATE PAIR
         assert_check_failed("&#xD800;", "&#xD800;")

From 704044056df5bd03ffb60303f42999c8780b0770 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 11 Jul 2024 21:03:54 +0900
Subject: [PATCH 043/101] test Text.check: use "why" for test name

---
 test/test_text_check.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index b2eebe92..1ba534fa 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -72,7 +72,7 @@ def test_character_reference_decimal_garbage_at_the_end
         assert_check_failed("&#48x;", "&")
       end
 
-      def test_character_reference_decimal_invalid_value
+      def test_character_reference_decimal_control_character
         # U+0008 BACKSPACE
         assert_check_failed("&#8;", "&#8;")
       end

From ddea83ff7a890b9d341fca1aa031d575aa88d1ac Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 11 Jul 2024 21:06:08 +0900
Subject: [PATCH 044/101] test Text.check: add a space at the start in
 character reference cases

---
 test/test_text_check.rb | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index 1ba534fa..a1cc2149 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -72,6 +72,11 @@ def test_character_reference_decimal_garbage_at_the_end
         assert_check_failed("&#48x;", "&")
       end
 
+      def test_character_reference_decimal_space_at_the_start
+        # U+0030 DIGIT ZERO
+        assert_check_failed("&# 48;", "&")
+      end
+
       def test_character_reference_decimal_control_character
         # U+0008 BACKSPACE
         assert_check_failed("&#8;", "&#8;")
@@ -92,6 +97,11 @@ def test_character_reference_hex_garbage_at_the_end
         assert_check_failed("&#x48x;", "&")
       end
 
+      def test_character_reference_hex_space_at_the_start
+        # U+0030 DIGIT ZERO
+        assert_check_failed("&#x 30;", "&")
+      end
+
       def test_character_reference_hex_surrogate_block
         # U+0D800 SURROGATE PAIR
         assert_check_failed("&#xD800;", "&#xD800;")

From 20f808478c4b5243adb24cae4fcc357db7116853 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 11 Jul 2024 21:08:26 +0900
Subject: [PATCH 045/101] test Text.check: add entity reference with new line
 case

---
 test/test_text_check.rb | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/test_text_check.rb b/test/test_text_check.rb
index a1cc2149..11cf65a3 100644
--- a/test/test_text_check.rb
+++ b/test/test_text_check.rb
@@ -111,6 +111,11 @@ def test_entity_name_non_ascii_symbol
         # U+00BF INVERTED QUESTION MARK
         assert_check_failed("&\u00BF;", "&")
       end
+
+      def test_entity_name_new_line
+        # U+0026 AMPERSAND
+        assert_check_failed("&\namp\nx;", "&")
+      end
     end
   end
 end

From a5075c151d8e700057d7b3e1fd1db571ac2c4c4c Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Fri, 12 Jul 2024 09:33:30 +0900
Subject: [PATCH 046/101] Do not output :text event after the root tag is
 closed (#167)

## Why?
GitHub: fix GH-163

## Change
- sax_test.rb
```
require 'rexml/parsers/sax2parser'
require 'rexml/parsers/pullparser'
require 'rexml/parsers/streamparser'

require 'libxml-ruby'
require 'nokogiri'

xml = <<EOS
<root> a b  c
</root>
<!-- ok comment -->
<?abc version="1.0" ?>
EOS

class Listener
  def method_missing(name, *args)
    p [name, *args]
  end
end

puts "LibXML(SAX)"
parser = LibXML::XML::SaxParser.string(xml)
parser.callbacks = Listener.new
parser.parse

puts ""
puts "Nokogiri(SAX)"
parser = Nokogiri::XML::SAX::Parser.new(Listener.new)
parser.parse(xml)

puts ""
puts "REXML(SAX)"
parser = REXML::Parsers::SAX2Parser.new(xml)
parser.listen(Listener.new)
parser.parse

puts ""
puts "REXML(Pull)"
parser = REXML::Parsers::PullParser.new(xml)
while parser.has_next?
    res = parser.pull
    p res
end

puts ""
puts "REXML(Stream)"
parser = REXML::Parsers::StreamParser.new(xml, Listener.new).parse
```

## Before (rexml 3.3.1)
```
LibXML(SAX)
[:on_start_document]
[:on_start_element_ns, "root", {}, nil, nil, {}]
[:on_characters, " a b  c \n"]
[:on_end_element_ns, "root", nil, nil]
[:on_comment, " ok comment "]
[:on_processing_instruction, "abc", "version=\"1.0\" "]
[:on_end_document]

Nokogiri(SAX)
[:start_document]
[:start_element_namespace, "root", [], nil, nil, []]
[:characters, " a b  c \n"]
[:end_element_namespace, "root", nil, nil]
[:comment, " ok comment "]
[:processing_instruction, "abc", "version=\"1.0\" "]
[:end_document]

REXML(SAX)
[:start_document]
[:start_element, nil, "root", "root", {}]
[:progress, 6]
[:characters, " a b  c \n"]
[:progress, 15]
[:end_element, nil, "root", "root"]
[:progress, 22]
[:characters, "\n"]
[:progress, 23]
[:comment, " ok comment "]
[:progress, 42]
[:characters, "\n"]
[:progress, 43]
[:processing_instruction, "abc", " version=\"1.0\" "]
[:progress, 65]
[:characters, "\n"]
[:progress, 66]
[:end_document]

REXML(Pull)
start_element: ["root", {}]
text: [" a b  c \n", " a b  c \n"]
end_element: ["root"]
text: ["\n", "\n"]
comment: [" ok comment "]
text: ["\n", "\n"]
processing_instruction: ["abc", " version=\"1.0\" "]
text: ["\n", "\n"]

REXML(Stream)
[:tag_start, "root", {}]
[:text, " a b  c \n"]
[:tag_end, "root"]
[:text, "\n"]
[:comment, " ok comment "]
[:text, "\n"]
[:instruction, "abc", " version=\"1.0\" "]
[:text, "\n"]
```

## After(This PR)
```
REXML(SAX)
[:start_document]
[:start_element, nil, "root", "root", {}]
[:progress, 6]
[:characters, " a b  c \n"]
[:progress, 15]
[:end_element, nil, "root", "root"]
[:progress, 22]
[:comment, " ok comment "]
[:progress, 42]
[:processing_instruction, "abc", " version=\"1.0\" "]
[:progress, 65]
[:end_document]

REXML(Pull)
start_element: ["root", {}]
text: [" a b  c \n", " a b  c \n"]
end_element: ["root"]
comment: [" ok comment "]
processing_instruction: ["abc", " version=\"1.0\" "]
end_document: []

REXML(Stream)
[:tag_start, "root", {}]
[:text, " a b  c \n"]
[:tag_end, "root"]
[:comment, " ok comment "]
[:instruction, "abc", " version=\"1.0\" "]
```
---
 lib/rexml/parsers/baseparser.rb |  1 +
 test/parse/test_text.rb         | 15 +++++++++++++++
 test/parser/test_ultra_light.rb |  1 -
 test/test_core.rb               |  2 +-
 test/test_document.rb           |  2 +-
 5 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 2a448e13..5cf1af21 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -477,6 +477,7 @@ def pull_event
               unless /\A\s*\z/.match?(text)
                 raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
               end
+              return pull_event
             end
             return [ :text, text ]
           end
diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb
index f1622b71..1acefc40 100644
--- a/test/parse/test_text.rb
+++ b/test/parse/test_text.rb
@@ -21,5 +21,20 @@ def test_after_root
         DETAIL
       end
     end
+
+    def test_whitespace_characters_after_root
+      parser = REXML::Parsers::BaseParser.new('<a>b</a> ')
+
+      events = []
+      while parser.has_next?
+        event = parser.pull
+        case event[0]
+        when :text
+          events << event[1]
+        end
+      end
+
+      assert_equal(["b"], events)
+    end
   end
 end
diff --git a/test/parser/test_ultra_light.rb b/test/parser/test_ultra_light.rb
index 44fd1d1e..b3f576ff 100644
--- a/test/parser/test_ultra_light.rb
+++ b/test/parser/test_ultra_light.rb
@@ -17,7 +17,6 @@ def test_entity_declaration
                        [:entitydecl, "name", "value"]
                      ],
                      [:start_element, :parent, "root", {}],
-                     [:text, "\n"],
                    ],
                    parse(<<-INTERNAL_SUBSET))
 <!ENTITY name "value">
diff --git a/test/test_core.rb b/test/test_core.rb
index 44e2e7ea..e1fba8a7 100644
--- a/test/test_core.rb
+++ b/test/test_core.rb
@@ -826,7 +826,7 @@ def test_deep_clone
     end
 
     def test_whitespace_before_root
-      a = <<EOL
+      a = <<EOL.chomp
 <?xml version='1.0'?>
   <blo>
     <wak>
diff --git a/test/test_document.rb b/test/test_document.rb
index 9cd77c4e..33cf4002 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -435,7 +435,7 @@ def test_utf_16
 
           actual_xml = ""
           document.write(actual_xml)
-          expected_xml = <<-EOX.encode("UTF-16BE")
+          expected_xml = <<-EOX.chomp.encode("UTF-16BE")
 \ufeff<?xml version='1.0' encoding='UTF-16'?>
 <message>Hello world!</message>
 EOX

From 4ebf21f686654af7254beb3721a5c57990eafc30 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sun, 14 Jul 2024 20:22:00 +0900
Subject: [PATCH 047/101] Fix a bug that SAX2 parser doesn't expand the
 predefined entities for "characters" (#168)

## Why?

SAX2 parser expand user-defined entity references and character
references but doesn't expand predefined entity references.

## Change
- text_unnormalized.rb
```
require 'rexml/document'
require 'rexml/parsers/sax2parser'
require 'rexml/parsers/pullparser'
require 'rexml/parsers/streamparser'

xml = <<EOS
<root>
  <A>&lt;P&gt;&https://github.com/ruby/rexml/pull/13; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;</A>
</root>
EOS

class Listener
  def method_missing(name, *args)
    p [name, *args]
  end
end

puts "REXML(DOM)"
REXML::Document.new(xml).elements.each("/root/A") {|element| puts element.text}

puts ""
puts "REXML(Pull)"
parser = REXML::Parsers::PullParser.new(xml)
while parser.has_next?
    res = parser.pull
    p res
end

puts ""
puts "REXML(Stream)"
parser = REXML::Parsers::StreamParser.new(xml, Listener.new).parse

puts ""
puts "REXML(SAX)"
parser = REXML::Parsers::SAX2Parser.new(xml)
parser.listen(Listener.new)
parser.parse
```

## Before (master)
```
$ ruby text_unnormalized.rb
REXML(DOM)
 <I> <B> Text </B>  </I>

REXML(Pull)
start_element: ["root", {}]
text: ["\n  ", "\n  "]
start_element: ["A", {}]
text: ["&lt;P&gt;&https://github.com/ruby/rexml/pull/13; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;", "<P>\r <I> <B> Text </B>  </I>"]
end_element: ["A"]
text: ["\n", "\n"]
end_element: ["root"]
end_document: []

REXML(Stream)
[:tag_start, "root", {}]
[:text, "\n  "]
[:tag_start, "A", {}]
[:text, "<P>\r <I> <B> Text </B>  </I>"]
[:tag_end, "A"]
[:text, "\n"]
[:tag_end, "root"]

REXML(SAX)
[:start_document]
[:start_element, nil, "root", "root", {}]
[:progress, 6]
[:characters, "\n  "]
[:progress, 9]
[:start_element, nil, "A", "A", {}]
[:progress, 12]
[:characters, "&lt;P&gt;\r &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;"] #<= This
[:progress, 74]
[:end_element, nil, "A", "A"]
[:progress, 78]
[:characters, "\n"]
[:progress, 79]
[:end_element, nil, "root", "root"]
[:progress, 86]
[:end_document]
```

## After(This PR)
```
$ ruby text_unnormalized.rb
REXML(SAX)
[:start_document]
[:start_element, nil, "root", "root", {}]
[:progress, 6]
[:characters, "\n  "]
[:progress, 9]
[:start_element, nil, "A", "A", {}]
[:progress, 12]
[:characters, "<P>\r <I> <B> Text </B>  </I>"]
[:progress, 74]
[:end_element, nil, "A", "A"]
[:progress, 78]
[:characters, "\n"]
[:progress, 79]
[:end_element, nil, "root", "root"]
[:progress, 86]
[:end_document]
```
---
 lib/rexml/parsers/sax2parser.rb   | 21 ++-------------------
 lib/rexml/parsers/streamparser.rb |  4 ++--
 test/test_pullparser.rb           | 16 ++++++++++++++++
 test/test_sax.rb                  | 11 +++++++++++
 4 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb
index 6a24ce22..36f98c2a 100644
--- a/lib/rexml/parsers/sax2parser.rb
+++ b/lib/rexml/parsers/sax2parser.rb
@@ -157,25 +157,8 @@ def parse
               end
             end
           when :text
-            #normalized = @parser.normalize( event[1] )
-            #handle( :characters, normalized )
-            copy = event[1].clone
-
-            esub = proc { |match|
-              if @entities.has_key?($1)
-                @entities[$1].gsub(Text::REFERENCE, &esub)
-              else
-                match
-              end
-            }
-
-            copy.gsub!( Text::REFERENCE, &esub )
-            copy.gsub!( Text::NUMERICENTITY ) {|m|
-              m=$1
-              m = "0#{m}" if m[0] == ?x
-              [Integer(m)].pack('U*')
-            }
-            handle( :characters, copy )
+            unnormalized = @parser.unnormalize( event[1], @entities )
+            handle( :characters, unnormalized )
           when :entitydecl
             handle_entitydecl( event )
           when :processing_instruction, :comment, :attlistdecl,
diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb
index 9e0eb0b3..fa3ac496 100644
--- a/lib/rexml/parsers/streamparser.rb
+++ b/lib/rexml/parsers/streamparser.rb
@@ -36,8 +36,8 @@ def parse
             @listener.tag_end( event[1] )
             @tag_stack.pop
           when :text
-            normalized = @parser.unnormalize( event[1] )
-            @listener.text( normalized )
+            unnormalized = @parser.unnormalize( event[1] )
+            @listener.text( unnormalized )
           when :processing_instruction
             @listener.instruction( *event[1,2] )
           when :start_doctype
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index 0aca46be..096e8b7f 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -82,6 +82,22 @@ def test_character_references
       assert_equal("B", events['b'])
     end
 
+    def test_text_entity_references
+      source = '<root><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;</a></root>'
+      parser = REXML::Parsers::PullParser.new( source )
+
+      events = []
+      while parser.has_next?
+        event = parser.pull
+        case event.event_type
+        when :text
+          events << event[1]
+        end
+      end
+
+      assert_equal(["<P> <I> <B> Text </B>  </I>"], events)
+    end
+
     def test_text_content_with_line_breaks
       source = "<root><a>A</a><b>B\n</b><c>C\r\n</c></root>"
       parser = REXML::Parsers::PullParser.new( source )
diff --git a/test/test_sax.rb b/test/test_sax.rb
index 8e905f2e..5a3f5e4e 100644
--- a/test/test_sax.rb
+++ b/test/test_sax.rb
@@ -31,6 +31,17 @@ def test_entity_replacement
       assert_equal '--1234--', results[1]
     end
 
+    def test_characters_predefined_entities
+      source = '<root><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;</a></root>'
+
+      sax = Parsers::SAX2Parser.new( source )
+      results = []
+      sax.listen(:characters) {|x| results << x }
+      sax.parse
+
+      assert_equal(["<P> <I> <B> Text </B>  </I>"], results)
+    end
+
     def test_sax2
       File.open(fixture_path("documentation.xml")) do |f|
         parser = Parsers::SAX2Parser.new( f )

From b8a5f4cd5c8fe29c65d7a00e67170223d9d2b50e Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 10:48:53 +0900
Subject: [PATCH 048/101] Fix performance issue caused by using repeated `>`
 characters inside `<?xml` (#170)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb           |  3 ++-
 lib/rexml/source.rb                       |  6 +++---
 test/parse/test_processing_instruction.rb | 11 +++++++++++
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 5cf1af21..b117e654 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -125,6 +125,7 @@ class BaseParser
 
       module Private
         INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
+        INSTRUCTION_TERM = "?>"
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -639,7 +640,7 @@ def parse_id_invalid_details(accept_external_id:,
       end
 
       def process_instruction(start_position)
-        match_data = @source.match(Private::INSTRUCTION_END, true)
+        match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM)
         unless match_data
           message = "Invalid processing instruction node"
           @source.position = start_position
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 5715c352..4c30532a 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -117,7 +117,7 @@ def read_until(term)
     def ensure_buffer
     end
 
-    def match(pattern, cons=false)
+    def match(pattern, cons=false, term: nil)
       if cons
         @scanner.scan(pattern).nil? ? nil : @scanner
       else
@@ -240,7 +240,7 @@ def ensure_buffer
     # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats:
     # - ">"
     # - "XXX>" (X is any string excluding '>')
-    def match( pattern, cons=false )
+    def match( pattern, cons=false, term: nil )
       while true
         if cons
           md = @scanner.scan(pattern)
@@ -250,7 +250,7 @@ def match( pattern, cons=false )
         break if md
         return nil if pattern.is_a?(String)
         return nil if @source.nil?
-        return nil unless read
+        return nil unless read(term)
       end
 
       md.nil? ? nil : @scanner
diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index 13384935..ac4c2ff0 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -1,8 +1,12 @@
 require "test/unit"
+require "core_assertions"
+
 require "rexml/document"
 
 module REXMLTests
   class TestParseProcessinInstruction < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
     def parse(xml)
       REXML::Document.new(xml)
     end
@@ -69,5 +73,12 @@ def test_after_root
 
       assert_equal("abc", events[:processing_instruction])
     end
+
+    def test_gt_linear_performance
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<?xml version="1.0" ' + ">" * n + ' ?>')
+      end
+    end
   end
 end

From 0af55fa49d4c9369f90f239a9571edab800ed36e Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 10:57:39 +0900
Subject: [PATCH 049/101] Fix ReDoS caused by very large character references
 using repeated 0s (#169)

This patch will fix the ReDoS that is caused by large string of 0s on a
character reference (like `&#00000000...`).

This is occurred in Ruby 3.1 or earlier.
---
 lib/rexml/text.rb                      | 48 ++++++++++++++++++--------
 test/parse/test_character_reference.rb | 17 +++++++++
 2 files changed, 51 insertions(+), 14 deletions(-)
 create mode 100644 test/parse/test_character_reference.rb

diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb
index b47bad3b..7e0befe9 100644
--- a/lib/rexml/text.rb
+++ b/lib/rexml/text.rb
@@ -151,25 +151,45 @@ def Text.check string, pattern, doctype
         end
       end
 
-      # context sensitive
-      string.scan(pattern) do
-        if $1[-1] != ?;
-          raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
-        elsif $1[0] == ?&
-          if $5 and $5[0] == ?#
-            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
-            when *VALID_CHAR
+      pos = 0
+      while (index = string.index(/<|&/, pos))
+        if string[index] == "<"
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+
+        unless (end_index = string.index(/[^\s];/, index + 1))
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+
+        value = string[(index + 1)..end_index]
+        if /\s/.match?(value)
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
+        end
+
+        if value[0] == "#"
+          character_reference = value[1..-1]
+
+          unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference))
+            if character_reference[0] == "x" || character_reference[-1] == "x"
+              raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
             else
-              raise "Illegal character #{$1.inspect} in raw string #{string.inspect}"
+              raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
             end
-          # FIXME: below can't work but this needs API change.
-          # elsif @parent and $3 and !SUBSTITUTES.include?($1)
-          #   if !doctype or !doctype.entities.has_key?($3)
-          #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
-          #   end
           end
+
+          case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i)
+          when *VALID_CHAR
+          else
+            raise "Illegal character #{string.inspect} in raw string #{string.inspect}"
+          end
+        elsif !(/\A#{Entity::NAME}\z/um.match?(value))
+          raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}"
         end
+
+        pos = end_index + 1
       end
+
+      string
     end
 
     def node_type
diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb
new file mode 100644
index 00000000..8ddeccaa
--- /dev/null
+++ b/test/parse/test_character_reference.rb
@@ -0,0 +1,17 @@
+require "test/unit"
+require "core_assertions"
+
+require "rexml/document"
+
+module REXMLTests
+  class TestParseCharacterReference < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
+    def test_gt_linear_performance_many_preceding_zeros
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<test testing="&#' + "0" * n + '97;"/>')
+      end
+    end
+  end
+end

From c1b64c174ec2e8ca2174c51332670e3be30c865f Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 10:57:50 +0900
Subject: [PATCH 050/101] Fix performance issue caused by using repeated `>`
 characters inside comments (#171)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb |  3 ++-
 test/parse/test_comment.rb      | 11 +++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index b117e654..ba205175 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -126,6 +126,7 @@ class BaseParser
       module Private
         INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
         INSTRUCTION_TERM = "?>"
+        COMMENT_TERM = "-->"
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -243,7 +244,7 @@ def pull_event
             return process_instruction(start_position)
           elsif @source.match("<!", true)
             if @source.match("--", true)
-              md = @source.match(/(.*?)-->/um, true)
+              md = @source.match(/(.*?)-->/um, true, term: Private::COMMENT_TERM)
               if md.nil?
                 raise REXML::ParseException.new("Unclosed comment", @source)
               end
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
index 46a07409..543d9ad8 100644
--- a/test/parse/test_comment.rb
+++ b/test/parse/test_comment.rb
@@ -1,8 +1,12 @@
 require "test/unit"
+require "core_assertions"
+
 require "rexml/document"
 
 module REXMLTests
   class TestParseComment < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
     def parse(xml)
       REXML::Document.new(xml)
     end
@@ -117,5 +121,12 @@ def test_after_root
 
       assert_equal(" ok comment ", events[:comment])
     end
+
+    def test_gt_linear_performance
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<!-- ' + ">" * n + ' -->')
+      end
+    end
   end
 end

From 9f1415a2616c77cad44a176eee90e8457b4774b6 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:04:40 +0900
Subject: [PATCH 051/101] Fix performance issue caused by using repeated `>`
 characters inside `CDATA [ PAYLOAD ]` (#172)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb |  3 ++-
 test/parse/test_cdata.rb        | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)
 create mode 100644 test/parse/test_cdata.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index ba205175..e2c0fd80 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -127,6 +127,7 @@ module Private
         INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
         INSTRUCTION_TERM = "?>"
         COMMENT_TERM = "-->"
+        CDATA_TERM = "]]>"
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -431,7 +432,7 @@ def pull_event
 
                 return [ :comment, md[1] ]
               else
-                md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
+                md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM)
                 return [ :cdata, md[1] ] if md
               end
               raise REXML::ParseException.new( "Declarations can only occur "+
diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb
new file mode 100644
index 00000000..9e8fa8b2
--- /dev/null
+++ b/test/parse/test_cdata.rb
@@ -0,0 +1,17 @@
+require "test/unit"
+require "core_assertions"
+
+require "rexml/document"
+
+module REXMLTests
+  class TestParseCData < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
+    def test_gt_linear_performance
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<description><![CDATA[ ' + ">" * n + ' ]]></description>')
+      end
+    end
+  end
+end

From c33ea498102be65082940e8b7d6d31cb2c6e6ee2 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:11:17 +0900
Subject: [PATCH 052/101] Fix performance issue caused by using repeated `>`
 characters after ` <!DOCTYPE name` (#173)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb              |  3 ++-
 test/parse/test_document_type_declaration.rb | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index e2c0fd80..7fe6c4e8 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -128,6 +128,7 @@ module Private
         INSTRUCTION_TERM = "?>"
         COMMENT_TERM = "-->"
         CDATA_TERM = "]]>"
+        DOCTYPE_TERM = "]>"
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -384,7 +385,7 @@ def pull_event
               end
               return [ :comment, md[1] ] if md
             end
-          elsif match = @source.match(/(%.*?;)\s*/um, true)
+          elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM)
             return [ :externalentity, match[1] ]
           elsif @source.match(/\]\s*>/um, true)
             @document_status = :after_doctype
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 3ca0b536..61c3f04d 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -1,9 +1,13 @@
 # frozen_string_literal: false
 require "test/unit"
+require "core_assertions"
+
 require "rexml/document"
 
 module REXMLTests
   class TestParseDocumentTypeDeclaration < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
     private
     def parse(doctype)
       REXML::Document.new(<<-XML).doctype
@@ -276,6 +280,16 @@ def test_notation_attlist
                      doctype.children.collect(&:class))
       end
 
+      def test_gt_linear_performance_malformed_entity
+        seq = [10000, 50000, 100000, 150000, 200000]
+        assert_linear_performance(seq, rehearsal: 10) do |n|
+          begin
+            REXML::Document.new('<!DOCTYPE root [' + "%>" * n + ']><test/>')
+          rescue
+          end
+        end
+      end
+
       private
       def parse(internal_subset)
         super(<<-DOCTYPE)

From a79ac8b4b42a9efabe33a0be31bd82d33fd50347 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:18:11 +0900
Subject: [PATCH 053/101] Fix performance issue caused by using repeated `>`
 characters inside `<!DOCTYPE root [<!-- PAYLOAD -->]>` (#174)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb              | 2 +-
 test/parse/test_document_type_declaration.rb | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 7fe6c4e8..4fcdaba7 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -378,7 +378,7 @@ def pull_event
                 raise REXML::ParseException.new(message, @source)
               end
               return [:notationdecl, name, *id]
-            elsif md = @source.match(/--(.*?)-->/um, true)
+            elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
               case md[1]
               when /--/, /-\z/
                 raise REXML::ParseException.new("Malformed comment", @source)
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 61c3f04d..3c3371ea 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -290,6 +290,13 @@ def test_gt_linear_performance_malformed_entity
         end
       end
 
+      def test_gt_linear_performance_comment
+        seq = [10000, 50000, 100000, 150000, 200000]
+        assert_linear_performance(seq, rehearsal: 10) do |n|
+          REXML::Document.new('<!DOCTYPE root [<!-- ' + ">" * n + ' -->]>')
+        end
+      end
+
       private
       def parse(internal_subset)
         super(<<-DOCTYPE)

From 67efb5951ed09dbb575c375b130a1e469f437d1f Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:26:57 +0900
Subject: [PATCH 054/101] Fix performance issue caused by using repeated `>`
 characters inside `<!DOCTYPE name [<!ENTITY>]>` (#175)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb       | 8 ++++++--
 test/parse/test_entity_declaration.rb | 7 +++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 4fcdaba7..e8f1a069 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -124,11 +124,15 @@ class BaseParser
       }
 
       module Private
-        INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
+        # Terminal requires two or more letters.
         INSTRUCTION_TERM = "?>"
         COMMENT_TERM = "-->"
         CDATA_TERM = "]]>"
         DOCTYPE_TERM = "]>"
+        # Read to the end of DOCTYPE because there is no proper ENTITY termination
+        ENTITY_TERM = DOCTYPE_TERM
+
+        INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -313,7 +317,7 @@ def pull_event
               raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
               return [ :elementdecl, "<!ELEMENT" + md[1] ]
             elsif @source.match("ENTITY", true)
-              match = [:entitydecl, *@source.match(Private::ENTITYDECL_PATTERN, true).captures.compact]
+              match = [:entitydecl, *@source.match(Private::ENTITYDECL_PATTERN, true, term: Private::ENTITY_TERM).captures.compact]
               ref = false
               if match[1] == '%'
                 ref = true
diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb
index e15deec6..07529016 100644
--- a/test/parse/test_entity_declaration.rb
+++ b/test/parse/test_entity_declaration.rb
@@ -32,5 +32,12 @@ def test_empty
  <!ENTITY>  ]> <r/>
       DETAIL
     end
+
+    def test_gt_linear_performance
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version "' + '>' * n + '">')
+      end
+    end
   end
 end

From 1cc1d9a74ede52f3d9ce774cafb11c57b3905165 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 16 Jul 2024 11:27:57 +0900
Subject: [PATCH 055/101] Suppress have_root not initialized warnings on Ruby <
 3

---
 lib/rexml/parsers/baseparser.rb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index e8f1a069..860be203 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -165,6 +165,7 @@ def add_listener( listener )
       def stream=( source )
         @source = SourceFactory.create_from( source )
         @closed = nil
+        @have_root = false
         @document_status = nil
         @tags = []
         @stack = []

From 1f1e6e9b40bf339894e843dfd679c2fb1a5ddbf2 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:35:41 +0900
Subject: [PATCH 056/101] Fix ReDoS by using repeated space characters inside
 `<!DOCTYPE name [<!ATTLIST>]>` (#176)

Fix performance by removing unnecessary spaces.

This is occurred in Ruby 3.1 or earlier.
---
 lib/rexml/parsers/baseparser.rb |  2 +-
 test/parse/test_attlist.rb      | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 test/parse/test_attlist.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 860be203..47380f0d 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -350,7 +350,7 @@ def pull_event
               contents = md[0]
 
               pairs = {}
-              values = md[0].scan( ATTDEF_RE )
+              values = md[0].strip.scan( ATTDEF_RE )
               values.each do |attdef|
                 unless attdef[3] == "#IMPLIED"
                   attdef.compact!
diff --git a/test/parse/test_attlist.rb b/test/parse/test_attlist.rb
new file mode 100644
index 00000000..eee9309c
--- /dev/null
+++ b/test/parse/test_attlist.rb
@@ -0,0 +1,17 @@
+require "test/unit"
+require "core_assertions"
+
+require "rexml/document"
+
+module REXMLTests
+  class TestParseAttlist < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
+    def test_gt_linear_performance
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<!DOCTYPE schema SYSTEM "foo.dtd" [<!ATTLIST ' + " " * n + ' root v CDATA #FIXED "test">]>')
+      end
+    end
+  end
+end

From 910e5a2b487cb5a30989884a39f9cad2cc499cfc Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Tue, 16 Jul 2024 11:36:05 +0900
Subject: [PATCH 057/101] Fix performance issue caused by using repeated `>`
 characters inside `<xml><!-- --></xml>` (#177)

A `<` is treated as a string delimiter.
In certain cases, if `<` is used in succession, read and match are
repeated, which slows down the process. Therefore, the following is used
to read ahead to a specific part of the string in advance.
---
 lib/rexml/parsers/baseparser.rb | 2 +-
 test/parse/test_comment.rb      | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 47380f0d..5688c773 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -430,7 +430,7 @@ def pull_event
               #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
               raise REXML::ParseException.new("Malformed node", @source) unless md
               if md[0][0] == ?-
-                md = @source.match(/--(.*?)-->/um, true)
+                md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
 
                 if md.nil? || /--|-\z/.match?(md[1])
                   raise REXML::ParseException.new("Malformed comment", @source)
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
index 543d9ad8..50c765f5 100644
--- a/test/parse/test_comment.rb
+++ b/test/parse/test_comment.rb
@@ -128,5 +128,12 @@ def test_gt_linear_performance
         REXML::Document.new('<!-- ' + ">" * n + ' -->')
       end
     end
+
+    def test_gt_linear_performance_in_element
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<xml><!-- ' + '>' * n + ' --></xml>')
+      end
+    end
   end
 end

From 0e33d3adfb5069b20622e5ed9393d10b8cc17b40 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 16 Jul 2024 11:37:45 +0900
Subject: [PATCH 058/101] test: improve linear performance test names

Use "test_linear_performance_XXX" style.
---
 test/parse/test_attlist.rb                   | 2 +-
 test/parse/test_cdata.rb                     | 2 +-
 test/parse/test_character_reference.rb       | 2 +-
 test/parse/test_comment.rb                   | 4 ++--
 test/parse/test_document_type_declaration.rb | 4 ++--
 test/parse/test_element.rb                   | 2 +-
 test/parse/test_entity_declaration.rb        | 2 +-
 test/parse/test_processing_instruction.rb    | 2 +-
 8 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/test/parse/test_attlist.rb b/test/parse/test_attlist.rb
index eee9309c..c1b4376c 100644
--- a/test/parse/test_attlist.rb
+++ b/test/parse/test_attlist.rb
@@ -7,7 +7,7 @@ module REXMLTests
   class TestParseAttlist < Test::Unit::TestCase
     include Test::Unit::CoreAssertions
 
-    def test_gt_linear_performance
+    def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<!DOCTYPE schema SYSTEM "foo.dtd" [<!ATTLIST ' + " " * n + ' root v CDATA #FIXED "test">]>')
diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb
index 9e8fa8b2..b5f1a3bc 100644
--- a/test/parse/test_cdata.rb
+++ b/test/parse/test_cdata.rb
@@ -7,7 +7,7 @@ module REXMLTests
   class TestParseCData < Test::Unit::TestCase
     include Test::Unit::CoreAssertions
 
-    def test_gt_linear_performance
+    def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<description><![CDATA[ ' + ">" * n + ' ]]></description>')
diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb
index 8ddeccaa..bf8d2190 100644
--- a/test/parse/test_character_reference.rb
+++ b/test/parse/test_character_reference.rb
@@ -7,7 +7,7 @@ module REXMLTests
   class TestParseCharacterReference < Test::Unit::TestCase
     include Test::Unit::CoreAssertions
 
-    def test_gt_linear_performance_many_preceding_zeros
+    def test_linear_performance_many_preceding_zeros
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<test testing="&#' + "0" * n + '97;"/>')
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
index 50c765f5..b7892232 100644
--- a/test/parse/test_comment.rb
+++ b/test/parse/test_comment.rb
@@ -122,14 +122,14 @@ def test_after_root
       assert_equal(" ok comment ", events[:comment])
     end
 
-    def test_gt_linear_performance
+    def test_linear_performance_top_level_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<!-- ' + ">" * n + ' -->')
       end
     end
 
-    def test_gt_linear_performance_in_element
+    def test_linear_performance_in_element_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<xml><!-- ' + '>' * n + ' --></xml>')
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 3c3371ea..490a27d4 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -280,7 +280,7 @@ def test_notation_attlist
                      doctype.children.collect(&:class))
       end
 
-      def test_gt_linear_performance_malformed_entity
+      def test_linear_performance_percent_gt
         seq = [10000, 50000, 100000, 150000, 200000]
         assert_linear_performance(seq, rehearsal: 10) do |n|
           begin
@@ -290,7 +290,7 @@ def test_gt_linear_performance_malformed_entity
         end
       end
 
-      def test_gt_linear_performance_comment
+      def test_linear_performance_comment_gt
         seq = [10000, 50000, 100000, 150000, 200000]
         assert_linear_performance(seq, rehearsal: 10) do |n|
           REXML::Document.new('<!DOCTYPE root [<!-- ' + ">" * n + ' -->]>')
diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb
index 261f25c3..2b0746ea 100644
--- a/test/parse/test_element.rb
+++ b/test/parse/test_element.rb
@@ -125,7 +125,7 @@ def test_after_empty_element_tag_root
       end
     end
 
-    def test_gt_linear_performance_attribute_value
+    def test_linear_performance_attribute_value_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<test testing="' + ">" * n + '"></test>')
diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb
index 07529016..7d750b90 100644
--- a/test/parse/test_entity_declaration.rb
+++ b/test/parse/test_entity_declaration.rb
@@ -33,7 +33,7 @@ def test_empty
       DETAIL
     end
 
-    def test_gt_linear_performance
+    def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version "' + '>' * n + '">')
diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index ac4c2ff0..7943cd3c 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -74,7 +74,7 @@ def test_after_root
       assert_equal("abc", events[:processing_instruction])
     end
 
-    def test_gt_linear_performance
+    def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new('<?xml version="1.0" ' + ">" * n + ' ?>')

From 2b285ac0804f2918de642f7ed4646dc6d645a7fc Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 16 Jul 2024 11:38:07 +0900
Subject: [PATCH 059/101] Add 3.3.2 entry

---
 NEWS.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 3e406574..3b62f6aa 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,53 @@
 # News
 
+## 3.3.2 - 2024-07-16 {#version-3-3-2}
+
+### Improvements
+
+  * Improved parse performance.
+    * GH-160
+    * Patch by NAITOH Jun.
+
+  * Improved parse performance.
+    * GH-169
+    * GH-170
+    * GH-171
+    * GH-172
+    * GH-173
+    * GH-174
+    * Patch by Watson.
+
+  * Added support for raising a parse exception when an XML has extra
+    content after the root element.
+    * GH-161
+    * Patch by NAITOH Jun.
+
+  * Added support for raising a parse exception when an XML
+    declaration exists in wrong position.
+    * GH-162
+    * Patch by NAITOH Jun.
+
+  * Removed needless a space after XML declaration in pretty print mode.
+    * GH-164
+    * Patch by NAITOH Jun.
+
+  * Stopped to emit `:text` event after the root element.
+    * GH-167
+    * Patch by NAITOH Jun.
+
+### Fixes
+
+  * Fixed a bug that SAX2 parser doesn't expand predefined entities for
+    `characters` callback.
+    * GH-168
+    * Patch by NAITOH Jun.
+
+### Thanks
+
+  * NAITOH Jun
+
+  * Watson
+
 ## 3.3.1 - 2024-06-25 {#version-3-3-1}
 
 ### Improvements

From 8fed63e18a3ce677dcbb457e4f33b29efad4cf1f Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 16 Jul 2024 11:57:52 +0900
Subject: [PATCH 060/101] Bump version

---
 lib/rexml/rexml.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index 573d0a13..39e92a57 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.3.2"
+  VERSION = "3.3.3"
   REVISION = ""
 
   Copyright = COPYRIGHT

From 7e75de227cf72c86bf1c7d0496933b704e7f97e7 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Tue, 16 Jul 2024 12:04:39 +0900
Subject: [PATCH 061/101] Add missing references in 3.3.2 entry

---
 NEWS.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 3b62f6aa..76355d87 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -15,6 +15,9 @@
     * GH-172
     * GH-173
     * GH-174
+    * GH-175
+    * GH-176
+    * GH-177
     * Patch by Watson.
 
   * Added support for raising a parse exception when an XML has extra

From 2c39c91a65d69357cfbc35dd8079b3606d86bb70 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Fri, 19 Jul 2024 17:15:15 +0900
Subject: [PATCH 062/101] Fix method scope in test in order to invoke the tests
 properly and fix exception message (#182)

This PR includes following two fixes.

1. The `test_empty` and `test_linear_performance_gt` were defined as
private method. Seems that test-unit runner does not invoke private
methods even if the methods have `test_` prefix.
2. When parse malformed entity declaration, the exception might have the
message about `NoMethodError`. The proper exception message will be
contained by this fix.
---
 lib/rexml/parsers/baseparser.rb       |  6 +++++-
 test/parse/test_entity_declaration.rb | 17 +++++++++++------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 5688c773..bbdcfc6c 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -318,7 +318,11 @@ def pull_event
               raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
               return [ :elementdecl, "<!ELEMENT" + md[1] ]
             elsif @source.match("ENTITY", true)
-              match = [:entitydecl, *@source.match(Private::ENTITYDECL_PATTERN, true, term: Private::ENTITY_TERM).captures.compact]
+              match_data = @source.match(Private::ENTITYDECL_PATTERN, true, term: Private::ENTITY_TERM)
+              unless match_data
+                raise REXML::ParseException.new("Malformed entity declaration", @source)
+              end
+              match = [:entitydecl, *match_data.captures.compact]
               ref = false
               if match[1] == '%'
                 ref = true
diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb
index 7d750b90..72f26afe 100644
--- a/test/parse/test_entity_declaration.rb
+++ b/test/parse/test_entity_declaration.rb
@@ -1,9 +1,13 @@
 # frozen_string_literal: false
-require 'test/unit'
-require 'rexml/document'
+require "test/unit"
+require "core_assertions"
+
+require "rexml/document"
 
 module REXMLTests
   class TestParseEntityDeclaration < Test::Unit::TestCase
+    include Test::Unit::CoreAssertions
+
     private
     def xml(internal_subset)
       <<-XML
@@ -18,6 +22,7 @@ def parse(internal_subset)
       REXML::Document.new(xml(internal_subset)).doctype
     end
 
+    public
     def test_empty
       exception = assert_raise(REXML::ParseException) do
         parse(<<-INTERNAL_SUBSET)
@@ -25,18 +30,18 @@ def test_empty
         INTERNAL_SUBSET
       end
       assert_equal(<<-DETAIL.chomp, exception.to_s)
-Malformed notation declaration: name is missing
+Malformed entity declaration
 Line: 5
-Position: 72
+Position: 70
 Last 80 unconsumed characters:
- <!ENTITY>  ]> <r/>
+>  ]> <r/> 
       DETAIL
     end
 
     def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
-        REXML::Document.new('<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version "' + '>' * n + '">')
+        REXML::Document.new('<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version "' + '>' * n + '">]>')
       end
     end
   end

From 2bca7bd84a5cf13af8f5633dd7d3d519fc990d67 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Tue, 23 Jul 2024 05:53:46 +0900
Subject: [PATCH 063/101] Add support for detecting invalid XML that has
 unsupported content before root element (#184)

## Why?

XML with content at the start of the document is invalid.

https://www.w3.org/TR/2006/REC-xml11-20060816/#document

```
[1]   document   ::=   ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog

```
[22]   	prolog	   ::=   	XMLDecl Misc* (doctypedecl Misc*)?
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl
```
[23]   	XMLDecl	   ::=   	'<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc

```
[27]   	Misc	   ::=   	Comment | PI | S
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI

```
[16]   	PI	   ::=   	'<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget

```
[17]   	PITarget	   ::=   	Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
```

https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl
```
[28]   	doctypedecl	   ::=   	'<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
```

See: https://github.com/ruby/rexml/pull/164#discussion_r1683552024
---
 lib/rexml/parsers/baseparser.rb           | 10 ++++--
 test/parse/test_comment.rb                | 12 +++++++
 test/parse/test_processing_instruction.rb | 43 +++++++++++++----------
 test/parse/test_text.rb                   | 17 +++++++++
 4 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index bbdcfc6c..54014e57 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -486,11 +486,15 @@ def pull_event
             if text.chomp!("<")
               @source.position -= "<".bytesize
             end
-            if @tags.empty? and @have_root
+            if @tags.empty?
               unless /\A\s*\z/.match?(text)
-                raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
+                if @have_root
+                  raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
+                else
+                  raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
+                end
               end
-              return pull_event
+              return pull_event if @have_root
             end
             return [ :text, text ]
           end
diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb
index b7892232..4475dca7 100644
--- a/test/parse/test_comment.rb
+++ b/test/parse/test_comment.rb
@@ -110,6 +110,18 @@ def test_after_doctype_malformed_comment_end
       end
     end
 
+    def test_before_root
+      parser = REXML::Parsers::BaseParser.new('<!-- ok comment --><a></a>')
+
+      events = {}
+      while parser.has_next?
+        event = parser.pull
+        events[event[0]] = event[1]
+      end
+
+      assert_equal(" ok comment ", events[:comment])
+    end
+
     def test_after_root
       parser = REXML::Parsers::BaseParser.new('<a></a><!-- ok comment -->')
 
diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index 7943cd3c..8d42e964 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -25,25 +25,6 @@ def test_no_name
         DETAIL
       end
 
-      def test_garbage_text
-        # TODO: This should be parse error.
-        # Create test/parse/test_document.rb or something and move this to it.
-        doc = parse(<<-XML)
-x<?x y
-<!--?><?x -->?>
-<r/>
-        XML
-        pi = doc.children[1]
-        assert_equal([
-                       "x",
-                       "y\n<!--",
-                     ],
-                     [
-                       pi.target,
-                       pi.content,
-                     ])
-      end
-
       def test_xml_declaration_not_at_document_start
         exception = assert_raise(REXML::ParseException) do
           parser = REXML::Parsers::BaseParser.new('<a><?xml version="1.0" ?></a>')
@@ -62,6 +43,30 @@ def test_xml_declaration_not_at_document_start
       end
     end
 
+    def test_comment
+      doc = parse(<<-XML)
+<?x y
+<!--?><?x -->?>
+<r/>
+      XML
+      assert_equal([["x", "y\n<!--"],
+                    ["x", "-->"]],
+                   [[doc.children[0].target, doc.children[0].content],
+                    [doc.children[1].target, doc.children[1].content]])
+    end
+
+    def test_before_root
+      parser = REXML::Parsers::BaseParser.new('<?abc version="1.0" ?><a></a>')
+
+      events = {}
+      while parser.has_next?
+        event = parser.pull
+        events[event[0]] = event[1]
+      end
+
+      assert_equal("abc", events[:processing_instruction])
+    end
+
     def test_after_root
       parser = REXML::Parsers::BaseParser.new('<a></a><?abc version="1.0" ?>')
 
diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb
index 1acefc40..04f553ae 100644
--- a/test/parse/test_text.rb
+++ b/test/parse/test_text.rb
@@ -4,6 +4,23 @@
 module REXMLTests
   class TestParseText < Test::Unit::TestCase
     class TestInvalid < self
+      def test_before_root
+        exception = assert_raise(REXML::ParseException) do
+          parser = REXML::Parsers::BaseParser.new('b<a></a>')
+          while parser.has_next?
+            parser.pull
+          end
+        end
+
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed XML: Content at the start of the document (got 'b')
+          Line: 1
+          Position: 4
+          Last 80 unconsumed characters:
+          <a>
+        DETAIL
+      end
+
       def test_after_root
         exception = assert_raise(REXML::ParseException) do
           parser = REXML::Parsers::BaseParser.new('<a></a>c')

From 086287c37a37d8f36853045b888dc28e05e9c0c2 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Wed, 24 Jul 2024 12:51:08 +0900
Subject: [PATCH 064/101] Add more invalid test cases for  parsing entitly
 declaration (#183)

This patch will add the test cases to verify that it raises an exception
properly when parsing malformed entity declaration.

---------

Co-authored-by: takuya kodama <a.s.takuya1026@gmail.com>
---
 test/parse/test_entity_declaration.rb | 480 ++++++++++++++++++++++++++
 1 file changed, 480 insertions(+)

diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb
index 72f26afe..daaf5ed2 100644
--- a/test/parse/test_entity_declaration.rb
+++ b/test/parse/test_entity_declaration.rb
@@ -23,6 +23,486 @@ def parse(internal_subset)
     end
 
     public
+
+    # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-GEDecl
+    class TestGeneralEntityDeclaration < self
+      # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name
+      class TestName < self
+        def test_prohibited_character
+          exception = assert_raise(REXML::ParseException) do
+            REXML::Document.new("<!DOCTYPE root [<!ENTITY invalid&name \"valid-entity-value\">]>")
+          end
+          assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 61
+Last 80 unconsumed characters:
+ invalid&name "valid-entity-value">]>
+          DETAIL
+        end
+      end
+
+      # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityDef
+      class TestEntityDefinition < self
+        # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue
+        class TestEntityValue < self
+          def test_no_quote
+            exception = assert_raise(REXML::ParseException) do
+              REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name invalid-entity-value>]>")
+            end
+            assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 59
+Last 80 unconsumed characters:
+ valid-name invalid-entity-value>]>
+            DETAIL
+          end
+
+          def test_prohibited_character
+            exception = assert_raise(REXML::ParseException) do
+              REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name \"% &\">]>")
+            end
+            assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 44
+Last 80 unconsumed characters:
+ valid-name "% &">]>
+            DETAIL
+          end
+
+          def test_mixed_quote
+            exception = assert_raise(REXML::ParseException) do
+              REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name \"invalid-entity-value'>]>")
+            end
+            assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 61
+Last 80 unconsumed characters:
+ valid-name "invalid-entity-value'>]>
+            DETAIL
+          end
+        end
+
+        # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ExternalID
+        class TestExternalID < self
+          # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-SystemLiteral
+          class TestSystemLiteral < self
+            def test_no_quote_in_system
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name SYSTEM invalid-system-literal>]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 68
+Last 80 unconsumed characters:
+ valid-name SYSTEM invalid-system-literal>]>
+              DETAIL
+            end
+
+            def test_no_quote_in_public
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name PUBLIC \"valid-pubid-literal\" invalid-system-literal>]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 90
+Last 80 unconsumed characters:
+ valid-name PUBLIC "valid-pubid-literal" invalid-system-literal>]>
+              DETAIL
+            end
+
+            def test_mixed_quote_in_system
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name SYSTEM 'invalid-system-literal\">]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 70
+Last 80 unconsumed characters:
+ valid-name SYSTEM 'invalid-system-literal">]>
+              DETAIL
+            end
+
+            def test_mixed_quote_in_public
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name PUBLIC \"valid-pubid-literal\" \"invalid-system-literal'>]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 92
+Last 80 unconsumed characters:
+ valid-name PUBLIC "valid-pubid-literal" "invalid-system-literal'>]>
+              DETAIL
+            end
+
+            def test_no_literal_in_system
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name SYSTEM>]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 45
+Last 80 unconsumed characters:
+ valid-name SYSTEM>]>
+              DETAIL
+            end
+
+            def test_no_literal_in_public
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name PUBLIC \"valid-pubid-literal\">]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 67
+Last 80 unconsumed characters:
+ valid-name PUBLIC "valid-pubid-literal">]>
+              DETAIL
+            end
+          end
+
+          # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral
+          # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidChar
+          class TestPublicIDLiteral < self
+            def test_no_quote
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name PUBLIC invalid-pubid-literal \"valid-system-literal\">]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 90
+Last 80 unconsumed characters:
+ valid-name PUBLIC invalid-pubid-literal "valid-system-literal">]>
+              DETAIL
+            end
+
+            def test_prohibited_pubid_character
+              exception = assert_raise(REXML::ParseException) do
+                # U+3042 HIRAGANA LETTER A
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name PUBLIC \"\u3042\" \"valid-system-literal\">]>")
+              end
+              assert_equal(<<-DETAIL.force_encoding('utf-8').chomp, exception.to_s.force_encoding('utf-8'))
+Malformed entity declaration
+Line: 1
+Position: 74
+Last 80 unconsumed characters:
+ valid-name PUBLIC "\u3042" "valid-system-literal">]>
+              DETAIL
+            end
+
+            def test_mixed_quote
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name PUBLIC \"invalid-pubid-literal' \"valid-system-literal\">]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 92
+Last 80 unconsumed characters:
+ valid-name PUBLIC "invalid-pubid-literal' "valid-system-literal">]>
+              DETAIL
+            end
+
+            def test_no_literal
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name PUBLIC>]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 45
+Last 80 unconsumed characters:
+ valid-name PUBLIC>]>
+              DETAIL
+            end
+          end
+        end
+
+        # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NDataDecl
+        class TestNotationDataDeclaration < self
+          # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameChar
+          def test_prohibited_character
+            exception = assert_raise(REXML::ParseException) do
+              REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name PUBLIC \"valid-pubid-literal\" \"valid-system-literal\" NDATA invalid&name>]>")
+            end
+            assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 109
+Last 80 unconsumed characters:
+ valid-name PUBLIC "valid-pubid-literal" "valid-system-literal" NDATA invalid&nam
+            DETAIL
+          end
+        end
+
+        def test_entity_value_and_notation_data_declaration
+          exception = assert_raise(REXML::ParseException) do
+            REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-name \"valid-entity-value\" NDATA valid-ndata-value>]>")
+          end
+          assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 83
+Last 80 unconsumed characters:
+ valid-name "valid-entity-value" NDATA valid-ndata-value>]>
+        DETAIL
+        end
+      end
+
+      def test_no_space
+        exception = assert_raise(REXML::ParseException) do
+          REXML::Document.new("<!DOCTYPE root [<!ENTITY valid-namePUBLIC\"valid-pubid-literal\"\"valid-system-literal\"NDATAvalid-name>]>")
+        end
+        assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 102
+Last 80 unconsumed characters:
+ valid-namePUBLIC"valid-pubid-literal""valid-system-literal"NDATAvalid-name>]>
+        DETAIL
+      end
+    end
+
+    # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEDecl
+    class TestParsedEntityDeclaration < self
+      # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name
+      class TestName < self
+        def test_prohibited_character
+          exception = assert_raise(REXML::ParseException) do
+            REXML::Document.new("<!DOCTYPE root [<!ENTITY % invalid&name \"valid-entity-value\">]>")
+          end
+          assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 63
+Last 80 unconsumed characters:
+ % invalid&name "valid-entity-value">]>
+          DETAIL
+        end
+      end
+
+      # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEDef
+      class TestParsedEntityDefinition < self
+        # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue
+        class TestEntityValue < self
+          def test_no_quote
+            exception = assert_raise(REXML::ParseException) do
+              REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name invalid-entity-value>]>")
+            end
+            assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 61
+Last 80 unconsumed characters:
+ % valid-name invalid-entity-value>]>
+            DETAIL
+          end
+
+          def test_prohibited_character
+            exception = assert_raise(REXML::ParseException) do
+              REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name \"% &\">]>")
+            end
+            assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 46
+Last 80 unconsumed characters:
+ % valid-name "% &">]>
+            DETAIL
+          end
+
+          def test_mixed_quote
+            exception = assert_raise(REXML::ParseException) do
+              REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name 'invalid-entity-value\">]>")
+            end
+            assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 63
+Last 80 unconsumed characters:
+ % valid-name 'invalid-entity-value">]>
+            DETAIL
+          end
+        end
+
+        # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ExternalID
+        class TestExternalID < self
+          # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-SystemLiteral
+          class TestSystemLiteral < self
+            def test_no_quote_in_system
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name SYSTEM invalid-system-literal>]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 70
+Last 80 unconsumed characters:
+ % valid-name SYSTEM invalid-system-literal>]>
+              DETAIL
+            end
+
+            def test_no_quote_in_public
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name PUBLIC \"valid-pubid-literal\" invalid-system-literal>]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 92
+Last 80 unconsumed characters:
+ % valid-name PUBLIC "valid-pubid-literal" invalid-system-literal>]>
+              DETAIL
+            end
+
+            def test_mixed_quote_in_system
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name SYSTEM \"invalid-system-literal'>]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 72
+Last 80 unconsumed characters:
+ % valid-name SYSTEM "invalid-system-literal'>]>
+              DETAIL
+            end
+
+            def test_mixed_quote_in_public
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name PUBLIC \"valid-pubid-literal\" 'invalid-system-literal\">]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 94
+Last 80 unconsumed characters:
+ % valid-name PUBLIC "valid-pubid-literal" 'invalid-system-literal">]>
+              DETAIL
+            end
+
+            def test_no_literal_in_system
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name SYSTEM>]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 47
+Last 80 unconsumed characters:
+ % valid-name SYSTEM>]>
+              DETAIL
+            end
+
+            def test_no_literal_in_public
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name PUBLIC \"valid-pubid-literal\">]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 69
+Last 80 unconsumed characters:
+ % valid-name PUBLIC "valid-pubid-literal">]>
+              DETAIL
+            end
+          end
+
+          # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral
+          # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidChar
+          class TestPublicIDLiteral < self
+            def test_no_quote
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name PUBLIC invalid-pubid-literal \"valid-system-literal\">]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 92
+Last 80 unconsumed characters:
+ % valid-name PUBLIC invalid-pubid-literal "valid-system-literal">]>
+              DETAIL
+            end
+
+            def test_prohibited_pubid_character
+              exception = assert_raise(REXML::ParseException) do
+                # U+3042 HIRAGANA LETTER A
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name PUBLIC \"\u3042\" \"valid-system-literal\">]>")
+              end
+              assert_equal(<<-DETAIL.force_encoding('utf-8').chomp, exception.to_s.force_encoding('utf-8'))
+Malformed entity declaration
+Line: 1
+Position: 76
+Last 80 unconsumed characters:
+ % valid-name PUBLIC "\u3042" "valid-system-literal">]>
+              DETAIL
+            end
+
+            def test_mixed_quote
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name PUBLIC 'invalid-pubid-literal\" \"valid-system-literal\">]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 94
+Last 80 unconsumed characters:
+ % valid-name PUBLIC 'invalid-pubid-literal" "valid-system-literal">]>
+              DETAIL
+            end
+
+            def test_no_literal
+              exception = assert_raise(REXML::ParseException) do
+                REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name PUBLIC>]>")
+              end
+              assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 47
+Last 80 unconsumed characters:
+ % valid-name PUBLIC>]>
+              DETAIL
+            end
+          end
+        end
+
+        def test_entity_value_and_notation_data_declaration
+          exception = assert_raise(REXML::ParseException) do
+            REXML::Document.new("<!DOCTYPE root [<!ENTITY % valid-name \"valid-entity-value\" NDATA valid-ndata-value>]>")
+          end
+          assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 85
+Last 80 unconsumed characters:
+ % valid-name "valid-entity-value" NDATA valid-ndata-value>]>
+        DETAIL
+        end
+      end
+
+      def test_no_space
+        exception = assert_raise(REXML::ParseException) do
+          REXML::Document.new("<!DOCTYPE root [<!ENTITY %valid-nameSYSTEM\"valid-system-literal\">]>")
+        end
+        assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed entity declaration
+Line: 1
+Position: 67
+Last 80 unconsumed characters:
+ %valid-nameSYSTEM"valid-system-literal">]>
+        DETAIL
+      end
+    end
+
     def test_empty
       exception = assert_raise(REXML::ParseException) do
         parse(<<-INTERNAL_SUBSET)

From 033d1909a8f259d5a7c53681bcaf14f13bcf0368 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Thu, 1 Aug 2024 09:20:31 +0900
Subject: [PATCH 065/101] Add support for XML entity expansion limitation in
 SAX and pull parsers (#187)

- Supported `REXML::Security.entity_expansion_limit=` in SAX and pull parsers
- Supported `REXML::Security.entity_expansion_text_limit=` in SAX and pull parsers
---
 lib/rexml/parsers/baseparser.rb | 19 ++++++-
 lib/rexml/parsers/pullparser.rb |  4 ++
 lib/rexml/parsers/sax2parser.rb |  4 ++
 test/test_document.rb           | 25 +++++----
 test/test_pullparser.rb         | 96 +++++++++++++++++++++++++++++++++
 test/test_sax.rb                | 86 +++++++++++++++++++++++++++++
 6 files changed, 222 insertions(+), 12 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 54014e57..c4ddee3c 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -154,6 +154,7 @@ def initialize( source )
         self.stream = source
         @listeners = []
         @prefixes = Set.new
+        @entity_expansion_count = 0
       end
 
       def add_listener( listener )
@@ -161,6 +162,7 @@ def add_listener( listener )
       end
 
       attr_reader :source
+      attr_reader :entity_expansion_count
 
       def stream=( source )
         @source = SourceFactory.create_from( source )
@@ -513,7 +515,9 @@ def pull_event
       def entity( reference, entities )
         value = nil
         value = entities[ reference ] if entities
-        if not value
+        if value
+          record_entity_expansion
+        else
           value = DEFAULT_ENTITIES[ reference ]
           value = value[2] if value
         end
@@ -552,12 +556,17 @@ def unnormalize( string, entities=nil, filter=nil )
         }
         matches.collect!{|x|x[0]}.compact!
         if matches.size > 0
+          sum = 0
           matches.each do |entity_reference|
             unless filter and filter.include?(entity_reference)
               entity_value = entity( entity_reference, entities )
               if entity_value
                 re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
                 rv.gsub!( re, entity_value )
+                sum += rv.bytesize
+                if sum > Security.entity_expansion_text_limit
+                  raise "entity expansion has grown too large"
+                end
               else
                 er = DEFAULT_ENTITIES[entity_reference]
                 rv.gsub!( er[0], er[2] ) if er
@@ -570,6 +579,14 @@ def unnormalize( string, entities=nil, filter=nil )
       end
 
       private
+
+      def record_entity_expansion
+        @entity_expansion_count += 1
+        if @entity_expansion_count > Security.entity_expansion_limit
+          raise "number of entity expansions exceeded, processing aborted."
+        end
+      end
+
       def need_source_encoding_update?(xml_declaration_encoding)
         return false if xml_declaration_encoding.nil?
         return false if /\AUTF-16\z/i =~ xml_declaration_encoding
diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb
index f8b232a2..36b45953 100644
--- a/lib/rexml/parsers/pullparser.rb
+++ b/lib/rexml/parsers/pullparser.rb
@@ -47,6 +47,10 @@ def add_listener( listener )
         @listeners << listener
       end
 
+      def entity_expansion_count
+        @parser.entity_expansion_count
+      end
+
       def each
         while has_next?
           yield self.pull
diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb
index 36f98c2a..cec9d2fc 100644
--- a/lib/rexml/parsers/sax2parser.rb
+++ b/lib/rexml/parsers/sax2parser.rb
@@ -22,6 +22,10 @@ def source
         @parser.source
       end
 
+      def entity_expansion_count
+        @parser.entity_expansion_count
+      end
+
       def add_listener( listener )
         @parser.add_listener( listener )
       end
diff --git a/test/test_document.rb b/test/test_document.rb
index 33cf4002..0764631d 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -41,7 +41,7 @@ def teardown
 
       class GeneralEntityTest < self
         def test_have_value
-          xml = <<EOF
+          xml = <<XML
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE member [
   <!ENTITY a "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
@@ -55,23 +55,24 @@ def test_have_value
 <member>
 &a;
 </member>
-EOF
+XML
 
           doc = REXML::Document.new(xml)
-          assert_raise(RuntimeError) do
+          assert_raise(RuntimeError.new("entity expansion has grown too large")) do
             doc.root.children.first.value
           end
+
           REXML::Security.entity_expansion_limit = 100
           assert_equal(100, REXML::Security.entity_expansion_limit)
           doc = REXML::Document.new(xml)
-          assert_raise(RuntimeError) do
+          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
             doc.root.children.first.value
           end
           assert_equal(101, doc.entity_expansion_count)
         end
 
         def test_empty_value
-          xml = <<EOF
+          xml = <<XML
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE member [
   <!ENTITY a "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
@@ -85,23 +86,24 @@ def test_empty_value
 <member>
 &a;
 </member>
-EOF
+XML
 
           doc = REXML::Document.new(xml)
-          assert_raise(RuntimeError) do
+          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
             doc.root.children.first.value
           end
+
           REXML::Security.entity_expansion_limit = 100
           assert_equal(100, REXML::Security.entity_expansion_limit)
           doc = REXML::Document.new(xml)
-          assert_raise(RuntimeError) do
+          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
             doc.root.children.first.value
           end
           assert_equal(101, doc.entity_expansion_count)
         end
 
         def test_with_default_entity
-          xml = <<EOF
+          xml = <<XML
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE member [
   <!ENTITY a "a">
@@ -112,14 +114,15 @@ def test_with_default_entity
 &a2;
 &lt;
 </member>
-EOF
+XML
 
           REXML::Security.entity_expansion_limit = 4
           doc = REXML::Document.new(xml)
           assert_equal("\na\na a\n<\n", doc.root.children.first.value)
+
           REXML::Security.entity_expansion_limit = 3
           doc = REXML::Document.new(xml)
-          assert_raise(RuntimeError) do
+          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
             doc.root.children.first.value
           end
         end
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index 096e8b7f..55205af8 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -155,5 +155,101 @@ def test_peek
       end
       assert_equal( 0, names.length )
     end
+
+    class EntityExpansionLimitTest < Test::Unit::TestCase
+      def setup
+        @default_entity_expansion_limit = REXML::Security.entity_expansion_limit
+      end
+
+      def teardown
+        REXML::Security.entity_expansion_limit = @default_entity_expansion_limit
+      end
+
+      class GeneralEntityTest < self
+        def test_have_value
+          source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE member [
+  <!ENTITY a "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
+  <!ENTITY b "&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;">
+  <!ENTITY c "&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;">
+  <!ENTITY d "&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;">
+  <!ENTITY e "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx">
+]>
+<member>
+&a;
+</member>
+          XML
+
+          parser = REXML::Parsers::PullParser.new(source)
+          assert_raise(RuntimeError.new("entity expansion has grown too large")) do
+            while parser.has_next?
+              parser.pull
+            end
+          end
+        end
+
+        def test_empty_value
+          source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE member [
+  <!ENTITY a "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
+  <!ENTITY b "&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;">
+  <!ENTITY c "&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;">
+  <!ENTITY d "&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;">
+  <!ENTITY e "">
+]>
+<member>
+&a;
+</member>
+          XML
+
+          parser = REXML::Parsers::PullParser.new(source)
+          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
+            while parser.has_next?
+              parser.pull
+            end
+          end
+
+          REXML::Security.entity_expansion_limit = 100
+          parser = REXML::Parsers::PullParser.new(source)
+          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
+            while parser.has_next?
+              parser.pull
+            end
+          end
+          assert_equal(101, parser.entity_expansion_count)
+        end
+
+        def test_with_default_entity
+          source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE member [
+  <!ENTITY a "a">
+  <!ENTITY a2 "&a; &a;">
+]>
+<member>
+&a;
+&a2;
+&lt;
+</member>
+          XML
+
+          REXML::Security.entity_expansion_limit = 4
+          parser = REXML::Parsers::PullParser.new(source)
+          while parser.has_next?
+            parser.pull
+          end
+
+          REXML::Security.entity_expansion_limit = 3
+          parser = REXML::Parsers::PullParser.new(source)
+          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
+            while parser.has_next?
+              parser.pull
+            end
+          end
+        end
+      end
+    end
   end
 end
diff --git a/test/test_sax.rb b/test/test_sax.rb
index 5a3f5e4e..5e3ad75b 100644
--- a/test/test_sax.rb
+++ b/test/test_sax.rb
@@ -99,6 +99,92 @@ def test_sax2
       end
     end
 
+    class EntityExpansionLimitTest < Test::Unit::TestCase
+      def setup
+        @default_entity_expansion_limit = REXML::Security.entity_expansion_limit
+      end
+
+      def teardown
+        REXML::Security.entity_expansion_limit = @default_entity_expansion_limit
+      end
+
+      class GeneralEntityTest < self
+        def test_have_value
+          source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE member [
+  <!ENTITY a "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
+  <!ENTITY b "&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;">
+  <!ENTITY c "&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;">
+  <!ENTITY d "&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;">
+  <!ENTITY e "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx">
+]>
+<member>
+&a;
+</member>
+          XML
+
+          sax = REXML::Parsers::SAX2Parser.new(source)
+          assert_raise(RuntimeError.new("entity expansion has grown too large")) do
+            sax.parse
+          end
+        end
+
+        def test_empty_value
+          source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE member [
+  <!ENTITY a "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
+  <!ENTITY b "&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;">
+  <!ENTITY c "&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;">
+  <!ENTITY d "&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;">
+  <!ENTITY e "">
+]>
+<member>
+&a;
+</member>
+          XML
+
+          sax = REXML::Parsers::SAX2Parser.new(source)
+          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
+            sax.parse
+          end
+
+          REXML::Security.entity_expansion_limit = 100
+          sax = REXML::Parsers::SAX2Parser.new(source)
+          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
+            sax.parse
+          end
+          assert_equal(101, sax.entity_expansion_count)
+        end
+
+        def test_with_default_entity
+          source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE member [
+  <!ENTITY a "a">
+  <!ENTITY a2 "&a; &a;">
+]>
+<member>
+&a;
+&a2;
+&lt;
+</member>
+          XML
+
+          REXML::Security.entity_expansion_limit = 4
+          sax = REXML::Parsers::SAX2Parser.new(source)
+          sax.parse
+
+          REXML::Security.entity_expansion_limit = 3
+          sax = REXML::Parsers::SAX2Parser.new(source)
+          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
+            sax.parse
+          end
+        end
+      end
+    end
+
     # used by test_simple_doctype_listener
     # submitted by Jeff Barczewski
     class SimpleDoctypeListener

From 6cac15d45864c8d70904baa5cbfcc97181000960 Mon Sep 17 00:00:00 2001
From: tomoya ishida <tomoyapenguin@gmail.com>
Date: Thu, 1 Aug 2024 09:21:19 +0900
Subject: [PATCH 066/101] Fix source.match performance without specifying term
 string (#186)

Performance problem of `source.match(regexp)` was recently fixed by
specifying terminator string. However, I think maintaining appropriate
terminator string for a regexp is hard.
I propose solving this performance issue by increasing bytes to read in
each iteration.
---
 lib/rexml/parsers/baseparser.rb | 22 +++++++---------------
 lib/rexml/source.rb             | 26 ++++++++++++++++++--------
 2 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index c4ddee3c..b5df6dbc 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -124,14 +124,6 @@ class BaseParser
       }
 
       module Private
-        # Terminal requires two or more letters.
-        INSTRUCTION_TERM = "?>"
-        COMMENT_TERM = "-->"
-        CDATA_TERM = "]]>"
-        DOCTYPE_TERM = "]>"
-        # Read to the end of DOCTYPE because there is no proper ENTITY termination
-        ENTITY_TERM = DOCTYPE_TERM
-
         INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
@@ -253,7 +245,7 @@ def pull_event
             return process_instruction(start_position)
           elsif @source.match("<!", true)
             if @source.match("--", true)
-              md = @source.match(/(.*?)-->/um, true, term: Private::COMMENT_TERM)
+              md = @source.match(/(.*?)-->/um, true)
               if md.nil?
                 raise REXML::ParseException.new("Unclosed comment", @source)
               end
@@ -320,7 +312,7 @@ def pull_event
               raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
               return [ :elementdecl, "<!ELEMENT" + md[1] ]
             elsif @source.match("ENTITY", true)
-              match_data = @source.match(Private::ENTITYDECL_PATTERN, true, term: Private::ENTITY_TERM)
+              match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
               unless match_data
                 raise REXML::ParseException.new("Malformed entity declaration", @source)
               end
@@ -389,14 +381,14 @@ def pull_event
                 raise REXML::ParseException.new(message, @source)
               end
               return [:notationdecl, name, *id]
-            elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
+            elsif md = @source.match(/--(.*?)-->/um, true)
               case md[1]
               when /--/, /-\z/
                 raise REXML::ParseException.new("Malformed comment", @source)
               end
               return [ :comment, md[1] ] if md
             end
-          elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM)
+          elsif match = @source.match(/(%.*?;)\s*/um, true)
             return [ :externalentity, match[1] ]
           elsif @source.match(/\]\s*>/um, true)
             @document_status = :after_doctype
@@ -436,7 +428,7 @@ def pull_event
               #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
               raise REXML::ParseException.new("Malformed node", @source) unless md
               if md[0][0] == ?-
-                md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
+                md = @source.match(/--(.*?)-->/um, true)
 
                 if md.nil? || /--|-\z/.match?(md[1])
                   raise REXML::ParseException.new("Malformed comment", @source)
@@ -444,7 +436,7 @@ def pull_event
 
                 return [ :comment, md[1] ]
               else
-                md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM)
+                md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
                 return [ :cdata, md[1] ] if md
               end
               raise REXML::ParseException.new( "Declarations can only occur "+
@@ -673,7 +665,7 @@ def parse_id_invalid_details(accept_external_id:,
       end
 
       def process_instruction(start_position)
-        match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM)
+        match_data = @source.match(Private::INSTRUCTION_END, true)
         unless match_data
           message = "Invalid processing instruction node"
           @source.position = start_position
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 4c30532a..ff887fc0 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -117,7 +117,7 @@ def read_until(term)
     def ensure_buffer
     end
 
-    def match(pattern, cons=false, term: nil)
+    def match(pattern, cons=false)
       if cons
         @scanner.scan(pattern).nil? ? nil : @scanner
       else
@@ -204,10 +204,20 @@ def initialize(arg, block_size=500, encoding=nil)
       end
     end
 
-    def read(term = nil)
+    def read(term = nil, min_bytes = 1)
       term = encode(term) if term
       begin
-        @scanner << readline(term)
+        str = readline(term)
+        @scanner << str
+        read_bytes = str.bytesize
+        begin
+          while read_bytes < min_bytes
+            str = readline(term)
+            @scanner << str
+            read_bytes += str.bytesize
+          end
+        rescue IOError
+        end
         true
       rescue Exception, NameError
         @source = nil
@@ -237,10 +247,9 @@ def ensure_buffer
       read if @scanner.eos? && @source
     end
 
-    # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats:
-    # - ">"
-    # - "XXX>" (X is any string excluding '>')
-    def match( pattern, cons=false, term: nil )
+    def match( pattern, cons=false )
+      # To avoid performance issue, we need to increase bytes to read per scan
+      min_bytes = 1
       while true
         if cons
           md = @scanner.scan(pattern)
@@ -250,7 +259,8 @@ def match( pattern, cons=false, term: nil )
         break if md
         return nil if pattern.is_a?(String)
         return nil if @source.nil?
-        return nil unless read(term)
+        return nil unless read(nil, min_bytes)
+        min_bytes *= 2
       end
 
       md.nil? ? nil : @scanner

From 11dc1b1430175d69713284ca936809ca8ca819b4 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 09:51:30 +0900
Subject: [PATCH 067/101] test: fix location

---
 test/parse/test_document_type_declaration.rb | 34 ++++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 490a27d4..d30640b8 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -280,23 +280,6 @@ def test_notation_attlist
                      doctype.children.collect(&:class))
       end
 
-      def test_linear_performance_percent_gt
-        seq = [10000, 50000, 100000, 150000, 200000]
-        assert_linear_performance(seq, rehearsal: 10) do |n|
-          begin
-            REXML::Document.new('<!DOCTYPE root [' + "%>" * n + ']><test/>')
-          rescue
-          end
-        end
-      end
-
-      def test_linear_performance_comment_gt
-        seq = [10000, 50000, 100000, 150000, 200000]
-        assert_linear_performance(seq, rehearsal: 10) do |n|
-          REXML::Document.new('<!DOCTYPE root [<!-- ' + ">" * n + ' -->]>')
-        end
-      end
-
       private
       def parse(internal_subset)
         super(<<-DOCTYPE)
@@ -306,5 +289,22 @@ def parse(internal_subset)
         DOCTYPE
       end
     end
+
+    def test_linear_performance_percent_gt
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        begin
+          REXML::Document.new('<!DOCTYPE root [' + "%>" * n + ']><test/>')
+        rescue
+        end
+      end
+    end
+
+    def test_linear_performance_comment_gt
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new('<!DOCTYPE root [<!-- ' + ">" * n + ' -->]>')
+      end
+    end
   end
 end

From 163d366f21a6d66bf7104f2283eac5b07676c5f8 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 09:52:48 +0900
Subject: [PATCH 068/101] test: use double quote for string literal

---
 test/parse/test_document_type_declaration.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index d30640b8..4f020586 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -294,7 +294,7 @@ def test_linear_performance_percent_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         begin
-          REXML::Document.new('<!DOCTYPE root [' + "%>" * n + ']><test/>')
+          REXML::Document.new("<!DOCTYPE root [" + "%>" * n + "]><test/>")
         rescue
         end
       end
@@ -303,7 +303,7 @@ def test_linear_performance_percent_gt
     def test_linear_performance_comment_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
-        REXML::Document.new('<!DOCTYPE root [<!-- ' + ">" * n + ' -->]>')
+        REXML::Document.new("<!DOCTYPE root [<!-- " + ">" * n + " -->]>")
       end
     end
   end

From 50c725249e434ae89d6286827368af6d0ccea146 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Thu, 1 Aug 2024 09:56:36 +0900
Subject: [PATCH 069/101] test: add a performance test for %...; in document
 declaration

---
 test/parse/test_document_type_declaration.rb | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 4f020586..99c23745 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -306,5 +306,12 @@ def test_linear_performance_comment_gt
         REXML::Document.new("<!DOCTYPE root [<!-- " + ">" * n + " -->]>")
       end
     end
+
+    def test_linear_performance_external_entity_right_bracket_gt
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new("<!DOCTYPE root [%" + "]>" * n + ";]>")
+      end
+    end
   end
 end

From 29027c9ec0afd8d3c2ecc8a80d9af0b24be33920 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 10:34:13 +0900
Subject: [PATCH 070/101] test: use double quote for string literal

---
 test/parse/test_entity_declaration.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb
index daaf5ed2..30aad48a 100644
--- a/test/parse/test_entity_declaration.rb
+++ b/test/parse/test_entity_declaration.rb
@@ -521,7 +521,7 @@ def test_empty
     def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
-        REXML::Document.new('<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version "' + '>' * n + '">]>')
+        REXML::Document.new("<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version \"" + ">" * n + "\">]>")
       end
     end
   end

From 46c6397d5c647a700fb1817d0093471621d92a27 Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Thu, 1 Aug 2024 10:39:02 +0900
Subject: [PATCH 071/101] test: add performance tests for entity declaration

---
 test/parse/test_entity_declaration.rb | 33 +++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb
index 30aad48a..81d95b58 100644
--- a/test/parse/test_entity_declaration.rb
+++ b/test/parse/test_entity_declaration.rb
@@ -518,10 +518,39 @@ def test_empty
       DETAIL
     end
 
-    def test_linear_performance_gt
+    def test_linear_performance_entity_value_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
-        REXML::Document.new("<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version \"" + ">" * n + "\">]>")
+        REXML::Document.new("<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version \"" +
+                            ">" * n +
+                            "\">]>")
+      end
+    end
+
+    def test_linear_performance_entity_value_gt_right_bracket
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new("<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version \"" +
+                            ">]" * n +
+                            "\">]>")
+      end
+    end
+
+    def test_linear_performance_system_literal_in_system_gt_right_bracket
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new("<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version SYSTEM \"" +
+                            ">]" * n +
+                            "\">]>")
+      end
+    end
+
+    def test_linear_performance_system_literal_in_public_gt_right_bracket
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new("<!DOCTYPE rubynet [<!ENTITY rbconfig.ruby_version PUBLIC \"pubid-literal\" \"" +
+                            ">]" * n +
+                            "\">]>")
       end
     end
   end

From 850488abf20f9327ebc00094cd3bb64eea400a59 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 10:43:21 +0900
Subject: [PATCH 072/101] test: use double quote for string literal

---
 test/parse/test_processing_instruction.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index 8d42e964..2273de64 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -82,7 +82,7 @@ def test_after_root
     def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
-        REXML::Document.new('<?xml version="1.0" ' + ">" * n + ' ?>')
+        REXML::Document.new("<?xml version=\"1.0\" " + ">" * n + " ?>")
       end
     end
   end

From 73661ef281f5a829f7fec4ea673d42436c533ded Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 11:03:45 +0900
Subject: [PATCH 073/101] test: fix a typo

---
 test/parse/test_processing_instruction.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index 2273de64..49cf23a5 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -4,7 +4,7 @@
 require "rexml/document"
 
 module REXMLTests
-  class TestParseProcessinInstruction < Test::Unit::TestCase
+  class TestParseProcessingInstruction < Test::Unit::TestCase
     include Test::Unit::CoreAssertions
 
     def parse(xml)

From e2546e6ecade16b04c9ee528e5be8509fe16c2d6 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 11:23:43 +0900
Subject: [PATCH 074/101] parse pi: improve invalid case detection

---
 lib/rexml/parsers/baseparser.rb           | 35 +++++++++++++----------
 test/parse/test_processing_instruction.rb | 35 +++++++++++++++++++++--
 2 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index b5df6dbc..44dc6580 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -124,11 +124,10 @@ class BaseParser
       }
 
       module Private
-        INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
-        NAME_PATTERN = /\s*#{NAME}/um
+        NAME_PATTERN = /#{NAME}/um
         GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
         PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
         ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
@@ -242,7 +241,7 @@ def pull_event
         if @document_status == nil
           start_position = @source.position
           if @source.match("<?", true)
-            return process_instruction(start_position)
+            return process_instruction
           elsif @source.match("<!", true)
             if @source.match("--", true)
               md = @source.match(/(.*?)-->/um, true)
@@ -442,7 +441,7 @@ def pull_event
               raise REXML::ParseException.new( "Declarations can only occur "+
                 "in the doctype declaration.", @source)
             elsif @source.match("?", true)
-              return process_instruction(start_position)
+              return process_instruction
             else
               # Get the next tag
               md = @source.match(Private::TAG_PATTERN, true)
@@ -588,14 +587,14 @@ def need_source_encoding_update?(xml_declaration_encoding)
       def parse_name(base_error_message)
         md = @source.match(Private::NAME_PATTERN, true)
         unless md
-          if @source.match(/\s*\S/um)
+          if @source.match(/\S/um)
             message = "#{base_error_message}: invalid name"
           else
             message = "#{base_error_message}: name is missing"
           end
           raise REXML::ParseException.new(message, @source)
         end
-        md[1]
+        md[0]
       end
 
       def parse_id(base_error_message,
@@ -664,18 +663,24 @@ def parse_id_invalid_details(accept_external_id:,
         end
       end
 
-      def process_instruction(start_position)
-        match_data = @source.match(Private::INSTRUCTION_END, true)
-        unless match_data
-          message = "Invalid processing instruction node"
-          @source.position = start_position
-          raise REXML::ParseException.new(message, @source)
+      def process_instruction
+        name = parse_name("Malformed XML: Invalid processing instruction node")
+        if @source.match(/\s+/um, true)
+          match_data = @source.match(/(.*?)\?>/um, true)
+          unless match_data
+            raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
+          end
+          content = match_data[1]
+        else
+          content = nil
+          unless @source.match("?>", true)
+            raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
+          end
         end
-        if match_data[1] == "xml"
+        if name == "xml"
           if @document_status
             raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
           end
-          content = match_data[2]
           version = VERSION.match(content)
           version = version[1] unless version.nil?
           encoding = ENCODING.match(content)
@@ -690,7 +695,7 @@ def process_instruction(start_position)
           standalone = standalone[1] unless standalone.nil?
           return [ :xmldecl, version, encoding, standalone ]
         end
-        [:processing_instruction, match_data[1], match_data[2]]
+        [:processing_instruction, name, content]
       end
 
       def parse_attributes(prefixes, curr_ns)
diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index 49cf23a5..fba79cea 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -17,11 +17,37 @@ def test_no_name
           parse("<??>")
         end
         assert_equal(<<-DETAIL.chomp, exception.to_s)
-Invalid processing instruction node
+Malformed XML: Invalid processing instruction node: invalid name
 Line: 1
 Position: 4
 Last 80 unconsumed characters:
-<??>
+?>
+        DETAIL
+      end
+
+      def test_unclosed_content
+        exception = assert_raise(REXML::ParseException) do
+          parse("<?name content")
+        end
+        assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed XML: Unclosed processing instruction
+Line: 1
+Position: 14
+Last 80 unconsumed characters:
+content
+        DETAIL
+      end
+
+      def test_unclosed_no_content
+        exception = assert_raise(REXML::ParseException) do
+          parse("<?name")
+        end
+        assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed XML: Unclosed processing instruction
+Line: 1
+Position: 6
+Last 80 unconsumed characters:
+
         DETAIL
       end
 
@@ -79,6 +105,11 @@ def test_after_root
       assert_equal("abc", events[:processing_instruction])
     end
 
+    def test_content_question
+      document = REXML::Document.new("<a><?name con?tent?></a>")
+      assert_equal("con?tent", document.root.children.first.content)
+    end
+
     def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|

From 1599e8785f2d7734169aeb37a0b5d94f8212356d Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Thu, 1 Aug 2024 11:24:22 +0900
Subject: [PATCH 075/101] test: add a performance test for PI with many tabs

---
 test/parse/test_processing_instruction.rb | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb
index fba79cea..ba381dc4 100644
--- a/test/parse/test_processing_instruction.rb
+++ b/test/parse/test_processing_instruction.rb
@@ -116,5 +116,12 @@ def test_linear_performance_gt
         REXML::Document.new("<?xml version=\"1.0\" " + ">" * n + " ?>")
       end
     end
+
+    def test_linear_performance_tab
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new("<?name" + "\t" * n + "version=\"1.0\" > ?>")
+      end
+    end
   end
 end

From 0fbe7d5a0eac8cfaffa6c3b27f3b9a90061a0fbc Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 11:33:46 +0900
Subject: [PATCH 076/101] test: don't use abbreviated name

---
 .../{test_attlist.rb => test_attribute_list_declaration.rb}     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename test/parse/{test_attlist.rb => test_attribute_list_declaration.rb} (86%)

diff --git a/test/parse/test_attlist.rb b/test/parse/test_attribute_list_declaration.rb
similarity index 86%
rename from test/parse/test_attlist.rb
rename to test/parse/test_attribute_list_declaration.rb
index c1b4376c..bf2c1ce3 100644
--- a/test/parse/test_attlist.rb
+++ b/test/parse/test_attribute_list_declaration.rb
@@ -4,7 +4,7 @@
 require "rexml/document"
 
 module REXMLTests
-  class TestParseAttlist < Test::Unit::TestCase
+  class TestParseAttributeListDeclaration < Test::Unit::TestCase
     include Test::Unit::CoreAssertions
 
     def test_linear_performance_gt

From b93d790b36c065a3f7f3e0c3f5b2b71254a4d96d Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 11:34:44 +0900
Subject: [PATCH 077/101] test: use double quote for string literal

---
 test/parse/test_attribute_list_declaration.rb | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/parse/test_attribute_list_declaration.rb b/test/parse/test_attribute_list_declaration.rb
index bf2c1ce3..f9e8cf5d 100644
--- a/test/parse/test_attribute_list_declaration.rb
+++ b/test/parse/test_attribute_list_declaration.rb
@@ -10,7 +10,9 @@ class TestParseAttributeListDeclaration < Test::Unit::TestCase
     def test_linear_performance_gt
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
-        REXML::Document.new('<!DOCTYPE schema SYSTEM "foo.dtd" [<!ATTLIST ' + " " * n + ' root v CDATA #FIXED "test">]>')
+        REXML::Document.new("<!DOCTYPE schema SYSTEM \"foo.dtd\" [<!ATTLIST " +
+                            " " * n +
+                            " root v CDATA #FIXED \"test\">]>")
       end
     end
   end

From be86b3de0aca8394534b715a83a63bf51c5195f5 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 11:35:05 +0900
Subject: [PATCH 078/101] test: fix wrong test name

---
 test/parse/test_attribute_list_declaration.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/parse/test_attribute_list_declaration.rb b/test/parse/test_attribute_list_declaration.rb
index f9e8cf5d..2a8e2639 100644
--- a/test/parse/test_attribute_list_declaration.rb
+++ b/test/parse/test_attribute_list_declaration.rb
@@ -7,7 +7,7 @@ module REXMLTests
   class TestParseAttributeListDeclaration < Test::Unit::TestCase
     include Test::Unit::CoreAssertions
 
-    def test_linear_performance_gt
+    def test_linear_performance_space
       seq = [10000, 50000, 100000, 150000, 200000]
       assert_linear_performance(seq, rehearsal: 10) do |n|
         REXML::Document.new("<!DOCTYPE schema SYSTEM \"foo.dtd\" [<!ATTLIST " +

From 17ff3e78745b31db4e841357e8eed2f6669bea7b Mon Sep 17 00:00:00 2001
From: Watson <watson1978@gmail.com>
Date: Thu, 1 Aug 2024 11:45:51 +0900
Subject: [PATCH 079/101] test: add a performance test for attribute list
 declaration

---
 test/parse/test_attribute_list_declaration.rb | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/test/parse/test_attribute_list_declaration.rb b/test/parse/test_attribute_list_declaration.rb
index 2a8e2639..43882528 100644
--- a/test/parse/test_attribute_list_declaration.rb
+++ b/test/parse/test_attribute_list_declaration.rb
@@ -15,5 +15,16 @@ def test_linear_performance_space
                             " root v CDATA #FIXED \"test\">]>")
       end
     end
+
+    def test_linear_performance_tab_and_gt
+      seq = [10000, 50000, 100000, 150000, 200000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        REXML::Document.new("<!DOCTYPE root [<!ATTLIST " +
+                            "\t" * n +
+                            "root value CDATA \"" +
+                            ">" * n +
+                            "\">]>")
+      end
+    end
   end
 end

From e4a067e11235a2ec7a00616d41350485e384ec05 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 11:51:33 +0900
Subject: [PATCH 080/101] Add 3.3.3 entry

---
 NEWS.md | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 76355d87..72318b7f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,39 @@
 # News
 
+## 3.3.3 - 2024-08-01 {#version-3-3-3}
+
+### Improvements
+
+  * Added support for detecting invalid XML that has unsupported
+    content before root element
+    * GH-184
+    * Patch by NAITOH Jun.
+
+  * Added support for `REXML::Security.entity_expansion_limit=` and
+    `REXML::Security.entity_expansion_text_limit=` in SAX2 and pull
+    parsers
+    * GH-187
+    * Patch by NAITOH Jun.
+
+  * Added more tests for invalid XMLs.
+    * GH-183
+    * Patch by Watson.
+
+  * Added more performance tests.
+    * Patch by Watson.
+
+  * Improved parse performance.
+    * GH-186
+    * Patch by tomoya ishida.
+
+### Thanks
+
+  * NAITOH Jun
+
+  * Watson
+
+  * tomoya ishida
+
 ## 3.3.2 - 2024-07-16 {#version-3-3-2}
 
 ### Improvements

From d65e27c765c1004f07b910c024f856eda549587d Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 11:54:33 +0900
Subject: [PATCH 081/101] Bump version

---
 lib/rexml/rexml.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index 39e92a57..818bd01a 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.3.3"
+  VERSION = "3.3.4"
   REVISION = ""
 
   Copyright = COPYRIGHT

From cb2137880df6e5906f67a0c3701ffac3eded798f Mon Sep 17 00:00:00 2001
From: takuya kodama <otegami@clear-code.com>
Date: Thu, 1 Aug 2024 15:16:46 +0800
Subject: [PATCH 082/101] Add missing rexml/security require in
 rexml/parsers/baseparser.rb (#189)

`REXML::Parser::BaseParser` uses `REXML::Security` since #187. But
`rexml/parsers/baseparser.rb` doesn't require `rexml/security`
explicitly.

This doesn't cause a problem in normal usages because `require "rexml"`
requires `rexml/security` implicitly. If an user requires specific
parser such as `rexml/parsers/streamparser` explicitly, this causes a
problem.

We should require `rexml/security` explicitly in
`rexml/parsers/baseparser.rb` explicitly because
`REXML::Parser::BaseParser` uses `REXML::Security`.

## How to reproduce

When `lib/rexml/parsers/baseparser.rb` is required directly, the
`REXML::Security` module is not required. It causes the following error:

```ruby
require "rexml/parsers/streamparser"
require "rexml/streamlistener"

class Listener
  include REXML::StreamListener
end

REXML::Parsers::StreamParser.new("<root>&gt;</root>", Listener.new).parse
```

```console
$ ruby test.rb
lib/rexml/parsers/baseparser.rb:558:in 'block in REXML::Parsers::BaseParser#unnormalize': uninitialized constant REXML::Parsers::BaseParser::Security (NameError)

                if sum > Security.entity_expansion_text_limit
                         ^^^^^^^^
Did you mean?  SecurityError
	from <internal:array>:54:in 'Array#each'
	from rexml/parsers/baseparser.rb:551:in 'REXML::Parsers::BaseParser#unnormalize'
	from rexml/parsers/streamparser.rb:39:in 'REXML::Parsers::StreamParser#parse'
	from test.rb:8:in '<main>'
```
---
 lib/rexml/parsers/baseparser.rb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 44dc6580..28810bfa 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require_relative '../parseexception'
 require_relative '../undefinednamespaceexception'
+require_relative '../security'
 require_relative '../source'
 require 'set'
 require "strscan"

From 911dca43f2a645bffbfcfb07d57f2aaf52d19733 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 16:19:34 +0900
Subject: [PATCH 083/101] Add 3.3.4 entry

---
 NEWS.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 72318b7f..a924538e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,19 @@
 # News
 
+## 3.3.4 - 2024-08-01 {#version-3-3-4}
+
+### Fixes
+
+  * Fixed a bug that `REXML::Security` isn't defined when
+    `REXML::Parsers::StreamParser` is used and
+    `rexml/parsers/streamparser` is only required.
+    * GH-189
+    * Patch by takuya kodama.
+
+### Thanks
+
+  * takuya kodama
+
 ## 3.3.3 - 2024-08-01 {#version-3-3-3}
 
 ### Improvements

From e3f747fb4fe30f5c890a4bea5b12dd72f595e6b3 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 1 Aug 2024 16:20:26 +0900
Subject: [PATCH 084/101] Bump version

---
 lib/rexml/rexml.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index 818bd01a..bb804b0e 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.3.4"
+  VERSION = "3.3.5"
   REVISION = ""
 
   Copyright = COPYRIGHT

From 1892770f3e32d75368ffad99b8e86d539786c213 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Mon, 12 Aug 2024 09:58:23 +0900
Subject: [PATCH 085/101] Fix calculation of
 Security.entity_expansion_text_limit in SAX/pull parsers (#195)

GitHub: fix #193

## [Why?]
In SAX and pull parsers, the total value of rv.bytesize was checked, but
the summing process was unnecessary.

- Add Log
```patch
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 28810bf..5cfc089 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -556,6 +556,7 @@ module REXML
                 re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
                 rv.gsub!( re, entity_value )
                 sum += rv.bytesize
+puts " rv.bytesize: #{rv.bytesize} sum: #{sum} > Security.entity_expansion_text_limit: #{Security.entity_expansion_text_limit} : #{rv}"
                 if sum > Security.entity_expansion_text_limit
                   raise "entity expansion has grown too large"
                 end
diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb
index 7e0befe..cc68dbf 100644
--- a/lib/rexml/text.rb
+++ b/lib/rexml/text.rb
@@ -415,6 +415,7 @@ module REXML
       sum = 0
       string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
         s = Text.expand($&, doctype, filter)
+puts " s.bytesize: #{s.bytesize} sum + s.bytesize : #{sum + s.bytesize } > Security.entity_expansion_text_limit: #{Security.entity_expansion_text_limit} : #{s}"
         if sum + s.bytesize > Security.entity_expansion_text_limit
           raise "entity expansion has grown too large"
         else
```

- entity_expansion_text_limit.rb
```ruby
$LOAD_PATH.unshift(File.expand_path("lib"))

require 'rexml'
require 'rexml/parsers/sax2parser'
require 'rexml/parsers/pullparser'

def dom_entity_expansion_count_check(xml)
  doc = REXML::Document.new(xml)
  doc.root.children.first.value
  puts "DOM: entity_expansion_count: #{doc.entity_expansion_count}"
end

def sax_entity_expansion_count_check(xml)
  sax = REXML::Parsers::SAX2Parser.new(xml)
  sax.parse
  puts "SAX: entity_expansion_count: #{sax.entity_expansion_count}"
end

def pull_entity_expansion_count_check(xml)
  parser = REXML::Parsers::PullParser.new(xml)
  while parser.has_next?
    parser.pull
  end
  puts "Pull: entity_expansion_count: #{parser.entity_expansion_count}"
end

xml = <<XML
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE member [
  <!ENTITY a "&b;&b;&b;">
  <!ENTITY b "&c;&d;&e;">
  <!ENTITY c "xxxxxxxxxx">
  <!ENTITY d "yyyyyyyyyy">
  <!ENTITY e "zzzzzzzzzz">
]>
<member>&a;</member>
XML

dom_entity_expansion_count_check(xml)
sax_entity_expansion_count_check(xml)
pull_entity_expansion_count_check(xml)
```

```
$ ruby entity_expansion_text_limit.rb
 s.bytesize: 10 sum + s.bytesize : 10 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx
 s.bytesize: 10 sum + s.bytesize : 20 > Security.entity_expansion_text_limit: 10240 : yyyyyyyyyy
 s.bytesize: 10 sum + s.bytesize : 30 > Security.entity_expansion_text_limit: 10240 : zzzzzzzzzz
 s.bytesize: 30 sum + s.bytesize : 30 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 s.bytesize: 10 sum + s.bytesize : 10 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx
 s.bytesize: 10 sum + s.bytesize : 20 > Security.entity_expansion_text_limit: 10240 : yyyyyyyyyy
 s.bytesize: 10 sum + s.bytesize : 30 > Security.entity_expansion_text_limit: 10240 : zzzzzzzzzz
 s.bytesize: 30 sum + s.bytesize : 60 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 s.bytesize: 10 sum + s.bytesize : 10 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx
 s.bytesize: 10 sum + s.bytesize : 20 > Security.entity_expansion_text_limit: 10240 : yyyyyyyyyy
 s.bytesize: 10 sum + s.bytesize : 30 > Security.entity_expansion_text_limit: 10240 : zzzzzzzzzz
 s.bytesize: 30 sum + s.bytesize : 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 s.bytesize: 90 sum + s.bytesize : 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
DOM: entity_expansion_count: 13
 rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e;
 rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e;
 rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 90 sum: 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e;
 rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e;
 rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 90 sum: 180 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e;
 rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e;
 rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 90 sum: 270 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 90 sum: 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
SAX: entity_expansion_count: 13
 rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e;
 rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e;
 rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 90 sum: 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e;
 rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e;
 rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 90 sum: 180 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 16 sum: 16 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxx&d;&e;
 rv.bytesize: 23 sum: 39 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyy&e;
 rv.bytesize: 30 sum: 69 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 90 sum: 270 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
 rv.bytesize: 90 sum: 90 > Security.entity_expansion_text_limit: 10240 : xxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzzxxxxxxxxxxyyyyyyyyyyzzzzzzzzzz
Pull: entity_expansion_count: 13
```

90 bytes is the expected value, but SAX and Pull exceed 90 bytes due to
unnecessary total processing.
---
 lib/rexml/parsers/baseparser.rb |  4 +---
 test/test_document.rb           | 20 ++++++++++++++++++++
 test/test_pullparser.rb         | 30 ++++++++++++++++++++++++++++++
 test/test_sax.rb                | 24 ++++++++++++++++++++++++
 4 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 28810bfa..342f9482 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -548,15 +548,13 @@ def unnormalize( string, entities=nil, filter=nil )
         }
         matches.collect!{|x|x[0]}.compact!
         if matches.size > 0
-          sum = 0
           matches.each do |entity_reference|
             unless filter and filter.include?(entity_reference)
               entity_value = entity( entity_reference, entities )
               if entity_value
                 re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
                 rv.gsub!( re, entity_value )
-                sum += rv.bytesize
-                if sum > Security.entity_expansion_text_limit
+                if rv.bytesize > Security.entity_expansion_text_limit
                   raise "entity expansion has grown too large"
                 end
               else
diff --git a/test/test_document.rb b/test/test_document.rb
index 0764631d..72ec3579 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -33,10 +33,12 @@ def test_new
     class EntityExpansionLimitTest < Test::Unit::TestCase
       def setup
         @default_entity_expansion_limit = REXML::Security.entity_expansion_limit
+        @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit
       end
 
       def teardown
         REXML::Security.entity_expansion_limit = @default_entity_expansion_limit
+        REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit
       end
 
       class GeneralEntityTest < self
@@ -126,6 +128,24 @@ def test_with_default_entity
             doc.root.children.first.value
           end
         end
+
+        def test_entity_expansion_text_limit
+          xml = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE member [
+  <!ENTITY a "&b;&b;&b;">
+  <!ENTITY b "&c;&d;&e;">
+  <!ENTITY c "xxxxxxxxxx">
+  <!ENTITY d "yyyyyyyyyy">
+  <!ENTITY e "zzzzzzzzzz">
+]>
+<member>&a;</member>
+          XML
+
+          REXML::Security.entity_expansion_text_limit = 90
+          doc = REXML::Document.new(xml)
+          assert_equal(90, doc.root.children.first.value.bytesize)
+        end
       end
 
       class ParameterEntityTest < self
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index 55205af8..827fad1d 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -159,10 +159,12 @@ def test_peek
     class EntityExpansionLimitTest < Test::Unit::TestCase
       def setup
         @default_entity_expansion_limit = REXML::Security.entity_expansion_limit
+        @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit
       end
 
       def teardown
         REXML::Security.entity_expansion_limit = @default_entity_expansion_limit
+        REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit
       end
 
       class GeneralEntityTest < self
@@ -249,6 +251,34 @@ def test_with_default_entity
             end
           end
         end
+
+        def test_entity_expansion_text_limit
+          source = <<-XML
+<!DOCTYPE member [
+  <!ENTITY a "&b;&b;&b;">
+  <!ENTITY b "&c;&d;&e;">
+  <!ENTITY c "xxxxxxxxxx">
+  <!ENTITY d "yyyyyyyyyy">
+  <!ENTITY e "zzzzzzzzzz">
+]>
+<member>&a;</member>
+          XML
+
+          REXML::Security.entity_expansion_text_limit = 90
+          parser = REXML::Parsers::PullParser.new(source)
+          events = {}
+          element_name = ''
+          while parser.has_next?
+            event = parser.pull
+            case event.event_type
+            when :start_element
+              element_name = event[0]
+            when :text
+              events[element_name] = event[1]
+            end
+          end
+          assert_equal(90, events['member'].size)
+        end
       end
     end
   end
diff --git a/test/test_sax.rb b/test/test_sax.rb
index 5e3ad75b..f452de50 100644
--- a/test/test_sax.rb
+++ b/test/test_sax.rb
@@ -102,10 +102,12 @@ def test_sax2
     class EntityExpansionLimitTest < Test::Unit::TestCase
       def setup
         @default_entity_expansion_limit = REXML::Security.entity_expansion_limit
+        @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit
       end
 
       def teardown
         REXML::Security.entity_expansion_limit = @default_entity_expansion_limit
+        REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit
       end
 
       class GeneralEntityTest < self
@@ -182,6 +184,28 @@ def test_with_default_entity
             sax.parse
           end
         end
+
+        def test_entity_expansion_text_limit
+          source = <<-XML
+<!DOCTYPE member [
+  <!ENTITY a "&b;&b;&b;">
+  <!ENTITY b "&c;&d;&e;">
+  <!ENTITY c "xxxxxxxxxx">
+  <!ENTITY d "yyyyyyyyyy">
+  <!ENTITY e "zzzzzzzzzz">
+]>
+<member>&a;</member>
+          XML
+
+          REXML::Security.entity_expansion_text_limit = 90
+          sax = REXML::Parsers::SAX2Parser.new(source)
+          text_size = nil
+          sax.listen(:characters, ["member"]) do |text|
+            text_size = text.size
+          end
+          sax.parse
+          assert_equal(90, text_size)
+        end
       end
     end
 

From 21d90cbba9a029f85146acbd66c3ce8630b1a608 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Mon, 12 Aug 2024 10:02:56 +0900
Subject: [PATCH 086/101] Add 3.3.5 entry

---
 NEWS.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index a924538e..165b1c76 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,22 @@
 # News
 
+## 3.3.5 - 2024-08-12 {#version-3-3-5}
+
+### Fixes
+
+  * Fixed a bug that `REXML::Security.entity_expansion_text_limit`
+    check has wrong text size calculation in SAX and pull parsers.
+    * GH-193
+    * GH-195
+    * Reported by Viktor Ivarsson.
+    * Patch by NAITOH Jun.
+
+### Thanks
+
+  * Viktor Ivarsson
+
+  * NAITOH Jun
+
 ## 3.3.4 - 2024-08-01 {#version-3-3-4}
 
 ### Fixes

From e14847cee53d26eb162ad786ba12e3cd7a86fce0 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Mon, 12 Aug 2024 10:03:34 +0900
Subject: [PATCH 087/101] Bump version

---
 lib/rexml/rexml.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index bb804b0e..99d574b3 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.3.5"
+  VERSION = "3.3.6"
   REVISION = ""
 
   Copyright = COPYRIGHT

From 2f019f9b33594b561e1ef39c42fab1f2fda51891 Mon Sep 17 00:00:00 2001
From: Viktor Ivarsson <viktor.ivarsson@twingly.com>
Date: Fri, 16 Aug 2024 03:47:25 +0200
Subject: [PATCH 088/101] Improve `BaseParser#unnormalize` (#194)

The current implementation of `#unnormalize` iterates over matched
entity references that already has been substituted. With these changes
we will reduce the number of redundant calls to `rv.gsub!`.

* Reject filtered matches earlier in the loop
* Improve `#unnormalize` by removing redundant calls to `rv.gsub!`
* Improve `entity_expansion_limit` tests

---

Example:

```ruby
require "rexml/parsers/baseparser"

entity_less_than = "&lt;"
entitiy_length = 100

filler_text = "A"
filler_length = 100

feed = "#{entity_less_than * entitiy_length}#{filler_text * filler_length}"

base_parser = REXML::Parsers::BaseParser.new("")
base_parser.unnormalize(feed) # => "<" * 100 + "A" * 100
```

Before this PR, the example above would require 100 iterations. After
this PR, 1 iteration.

---------

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
---
 lib/rexml/parsers/baseparser.rb | 53 ++++++++++++++++++++++++---------
 test/test_pullparser.rb         | 14 +++++----
 test/test_sax.rb                | 12 ++++----
 3 files changed, 54 insertions(+), 25 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 342f9482..b471feff 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -8,6 +8,22 @@
 
 module REXML
   module Parsers
+    unless [].respond_to?(:tally)
+      module EnumerableTally
+        refine Enumerable do
+          def tally
+            counts = {}
+            each do |item|
+              counts[item] ||= 0
+              counts[item] += 1
+            end
+            counts
+          end
+        end
+      end
+      using EnumerableTally
+    end
+
     if StringScanner::Version < "3.0.8"
       module StringScannerCaptures
         refine StringScanner do
@@ -547,20 +563,29 @@ def unnormalize( string, entities=nil, filter=nil )
           [Integer(m)].pack('U*')
         }
         matches.collect!{|x|x[0]}.compact!
+        if filter
+          matches.reject! do |entity_reference|
+            filter.include?(entity_reference)
+          end
+        end
         if matches.size > 0
-          matches.each do |entity_reference|
-            unless filter and filter.include?(entity_reference)
-              entity_value = entity( entity_reference, entities )
-              if entity_value
-                re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
-                rv.gsub!( re, entity_value )
-                if rv.bytesize > Security.entity_expansion_text_limit
-                  raise "entity expansion has grown too large"
-                end
-              else
-                er = DEFAULT_ENTITIES[entity_reference]
-                rv.gsub!( er[0], er[2] ) if er
+          matches.tally.each do |entity_reference, n|
+            entity_expansion_count_before = @entity_expansion_count
+            entity_value = entity( entity_reference, entities )
+            if entity_value
+              if n > 1
+                entity_expansion_count_delta =
+                  @entity_expansion_count - entity_expansion_count_before
+                record_entity_expansion(entity_expansion_count_delta * (n - 1))
+              end
+              re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
+              rv.gsub!( re, entity_value )
+              if rv.bytesize > Security.entity_expansion_text_limit
+                raise "entity expansion has grown too large"
               end
+            else
+              er = DEFAULT_ENTITIES[entity_reference]
+              rv.gsub!( er[0], er[2] ) if er
             end
           end
           rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
@@ -570,8 +595,8 @@ def unnormalize( string, entities=nil, filter=nil )
 
       private
 
-      def record_entity_expansion
-        @entity_expansion_count += 1
+      def record_entity_expansion(delta=1)
+        @entity_expansion_count += delta
         if @entity_expansion_count > Security.entity_expansion_limit
           raise "number of entity expansions exceeded, processing aborted."
         end
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index 827fad1d..dbde8779 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -206,21 +206,23 @@ def test_empty_value
 </member>
           XML
 
+          REXML::Security.entity_expansion_limit = 100000
           parser = REXML::Parsers::PullParser.new(source)
-          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
-            while parser.has_next?
-              parser.pull
-            end
+          while parser.has_next?
+            parser.pull
           end
+          assert_equal(11111, parser.entity_expansion_count)
 
-          REXML::Security.entity_expansion_limit = 100
+          REXML::Security.entity_expansion_limit = @default_entity_expansion_limit
           parser = REXML::Parsers::PullParser.new(source)
           assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
             while parser.has_next?
               parser.pull
             end
           end
-          assert_equal(101, parser.entity_expansion_count)
+          assert do
+            parser.entity_expansion_count > @default_entity_expansion_limit
+          end
         end
 
         def test_with_default_entity
diff --git a/test/test_sax.rb b/test/test_sax.rb
index f452de50..d31de183 100644
--- a/test/test_sax.rb
+++ b/test/test_sax.rb
@@ -147,17 +147,19 @@ def test_empty_value
 </member>
           XML
 
+          REXML::Security.entity_expansion_limit = 100000
           sax = REXML::Parsers::SAX2Parser.new(source)
-          assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
-            sax.parse
-          end
+          sax.parse
+          assert_equal(11111, sax.entity_expansion_count)
 
-          REXML::Security.entity_expansion_limit = 100
+          REXML::Security.entity_expansion_limit = @default_entity_expansion_limit
           sax = REXML::Parsers::SAX2Parser.new(source)
           assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
             sax.parse
           end
-          assert_equal(101, sax.entity_expansion_count)
+          assert do
+            sax.entity_expansion_count > @default_entity_expansion_limit
+          end
         end
 
         def test_with_default_entity

From 1c76dbbb7c5f001f4b931b8055e8e2d7578da760 Mon Sep 17 00:00:00 2001
From: Viktor Ivarsson <viktor.ivarsson@twingly.com>
Date: Sat, 17 Aug 2024 04:09:54 +0200
Subject: [PATCH 089/101] Fix RuntimeError in `REXML::Parsers::BaseParser` for
 valid feeds (#199)

GitHub: fix GH-198

Change `#entity` to not match against default entities

After this change, the following example will not raise a RuntimeError:

```ruby
# rexml/refactor_entity_example.rb

$LOAD_PATH.unshift(File.expand_path("lib"))

require "rexml/parsers/baseparser"

valid_feed = "&lt;p&gt;#{'A' * 10_240}&lt;/p&gt;"

base_parser = REXML::Parsers::BaseParser.new("")
base_parser.unnormalize(valid_feed) # => "<p>" + "A" * 10_240 + "</p>"
```

Default entities now gets substituted by this block instead


https://github.com/ruby/rexml/blob/e14847cee53d26eb162ad786ba12e3cd7a86fce0/lib/rexml/parsers/baseparser.rb#L560-L563

---------

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
Co-authored-by: NAITOH Jun <naitoh@gmail.com>
---
 lib/rexml/parsers/baseparser.rb | 16 +++++++-------
 test/test_pullparser.rb         | 30 ++++++++++++++++++++++++++
 test/test_sax.rb                | 24 +++++++++++++++++++++
 test/test_stream.rb             | 37 +++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 9 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index b471feff..c03f375f 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -521,15 +521,13 @@ def pull_event
       private :pull_event
 
       def entity( reference, entities )
-        value = nil
-        value = entities[ reference ] if entities
-        if value
-          record_entity_expansion
-        else
-          value = DEFAULT_ENTITIES[ reference ]
-          value = value[2] if value
-        end
-        unnormalize( value, entities ) if value
+        return unless entities
+
+        value = entities[ reference ]
+        return if value.nil?
+
+        record_entity_expansion
+        unnormalize( value, entities )
       end
 
       # Escapes all possible entities
diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb
index dbde8779..005a106a 100644
--- a/test/test_pullparser.rb
+++ b/test/test_pullparser.rb
@@ -254,6 +254,36 @@ def test_with_default_entity
           end
         end
 
+        def test_with_only_default_entities
+          member_value = "&lt;p&gt;#{'A' * @default_entity_expansion_text_limit}&lt;/p&gt;"
+          source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<member>
+#{member_value}
+</member>
+          XML
+
+          parser = REXML::Parsers::PullParser.new(source)
+          events = {}
+          element_name = ''
+          while parser.has_next?
+            event = parser.pull
+            case event.event_type
+            when :start_element
+              element_name = event[0]
+            when :text
+              events[element_name] = event[1]
+            end
+          end
+
+          expected_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>"
+          assert_equal(expected_value, events['member'].strip)
+          assert_equal(0, parser.entity_expansion_count)
+          assert do
+            events['member'].bytesize > @default_entity_expansion_text_limit
+          end
+        end
+
         def test_entity_expansion_text_limit
           source = <<-XML
 <!DOCTYPE member [
diff --git a/test/test_sax.rb b/test/test_sax.rb
index d31de183..ae17e364 100644
--- a/test/test_sax.rb
+++ b/test/test_sax.rb
@@ -187,6 +187,30 @@ def test_with_default_entity
           end
         end
 
+        def test_with_only_default_entities
+          member_value = "&lt;p&gt;#{'A' * @default_entity_expansion_text_limit}&lt;/p&gt;"
+          source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<member>
+#{member_value}
+</member>
+          XML
+
+          sax = REXML::Parsers::SAX2Parser.new(source)
+          text_value = nil
+          sax.listen(:characters, ["member"]) do |text|
+            text_value = text
+          end
+          sax.parse
+
+          expected_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>"
+          assert_equal(expected_value, text_value.strip)
+          assert_equal(0, sax.entity_expansion_count)
+          assert do
+            text_value.bytesize > @default_entity_expansion_text_limit
+          end
+        end
+
         def test_entity_expansion_text_limit
           source = <<-XML
 <!DOCTYPE member [
diff --git a/test/test_stream.rb b/test/test_stream.rb
index 545d5349..615d497f 100644
--- a/test/test_stream.rb
+++ b/test/test_stream.rb
@@ -89,6 +89,43 @@ def entity(content)
     end
   end
 
+  class EntityExpansionLimitTest < Test::Unit::TestCase
+    def setup
+      @default_entity_expansion_limit = REXML::Security.entity_expansion_limit
+      @default_entity_expansion_text_limit = REXML::Security.entity_expansion_text_limit
+    end
+
+    def teardown
+      REXML::Security.entity_expansion_limit = @default_entity_expansion_limit
+      REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit
+    end
+
+    def test_with_only_default_entities
+      member_value = "&lt;p&gt;#{'A' * @default_entity_expansion_text_limit}&lt;/p&gt;"
+      source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<member>
+#{member_value}
+</member>
+      XML
+
+      listener = MyListener.new
+      class << listener
+        attr_accessor :text_value
+        def text(text)
+          @text_value << text
+        end
+      end
+      listener.text_value = ""
+      REXML::Document.parse_stream(source, listener)
+
+      expected_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>"
+      assert_equal(expected_value, listener.text_value.strip)
+      assert do
+        listener.text_value.bytesize > @default_entity_expansion_text_limit
+      end
+    end
+  end
 
   # For test_listener
   class RequestReader

From c8110b4830c990453e167ccca934e65858308fa1 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sat, 17 Aug 2024 11:25:28 +0900
Subject: [PATCH 090/101] Fix to not allow parameter entity references at
 internal subsets (#191)

## Why?
In the internal subset of DTD, references to parameter entities are not
allowed within markup declarations.

See: https://www.w3.org/TR/xml/#wfc-PEinInternalSubset

> Well-formedness constraint: PEs in Internal Subset

> In the internal DTD subset, parameter-entity references MUST NOT occur
within markup declarations; they may occur where markup declarations can
occur. (This does not apply to references that occur in external
parameter entities or to the external subset.)
---
 lib/rexml/entity.rb             |  52 ++--------------
 lib/rexml/parsers/baseparser.rb |   3 +
 test/test_document.rb           |  50 ----------------
 test/test_entity.rb             | 102 +++++++++++++++++++++++++-------
 4 files changed, 89 insertions(+), 118 deletions(-)

diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb
index 573db691..12bbad3f 100644
--- a/lib/rexml/entity.rb
+++ b/lib/rexml/entity.rb
@@ -12,6 +12,7 @@ class Entity < Child
     EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
     NDATADECL = "\\s+NDATA\\s+#{NAME}"
     PEREFERENCE = "%#{NAME};"
+    PEREFERENCE_RE = /#{PEREFERENCE}/um
     ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
     PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
     ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
@@ -19,7 +20,7 @@ class Entity < Child
     GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
     ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
 
-    attr_reader :name, :external, :ref, :ndata, :pubid
+    attr_reader :name, :external, :ref, :ndata, :pubid, :value
 
     # Create a new entity.  Simple entities can be constructed by passing a
     # name, value to the constructor; this creates a generic, plain entity
@@ -68,14 +69,11 @@ def Entity::matches? string
     end
 
     # Evaluates to the unnormalized value of this entity; that is, replacing
-    # all entities -- both %ent; and &ent; entities.  This differs from
-    # +value()+ in that +value+ only replaces %ent; entities.
+    # &ent; entities.
     def unnormalized
       document.record_entity_expansion unless document.nil?
-      v = value()
-      return nil if v.nil?
-      @unnormalized = Text::unnormalize(v, parent)
-      @unnormalized
+      return nil if @value.nil?
+      @unnormalized = Text::unnormalize(@value, parent)
     end
 
     #once :unnormalized
@@ -121,46 +119,6 @@ def to_s
       write rv
       rv
     end
-
-    PEREFERENCE_RE = /#{PEREFERENCE}/um
-    # Returns the value of this entity.  At the moment, only internal entities
-    # are processed.  If the value contains internal references (IE,
-    # %blah;), those are replaced with their values.  IE, if the doctype
-    # contains:
-    #  <!ENTITY % foo "bar">
-    #  <!ENTITY yada "nanoo %foo; nanoo>
-    # then:
-    #  doctype.entity('yada').value   #-> "nanoo bar nanoo"
-    def value
-      @resolved_value ||= resolve_value
-    end
-
-    def parent=(other)
-      @resolved_value = nil
-      super
-    end
-
-    private
-    def resolve_value
-      return nil if @value.nil?
-      return @value unless @value.match?(PEREFERENCE_RE)
-
-      matches = @value.scan(PEREFERENCE_RE)
-      rv = @value.clone
-      if @parent
-        sum = 0
-        matches.each do |entity_reference|
-          entity_value = @parent.entity( entity_reference[0] )
-          if sum + entity_value.bytesize > Security.entity_expansion_text_limit
-            raise "entity expansion has grown too large"
-          else
-            sum += entity_value.bytesize
-          end
-          rv.gsub!( /%#{entity_reference.join};/um, entity_value )
-        end
-      end
-      rv
-    end
   end
 
   # This is a set of entity constants -- the ones defined in the XML
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index c03f375f..e38ff86e 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -141,6 +141,7 @@ class BaseParser
       }
 
       module Private
+        PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
         TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
@@ -350,6 +351,8 @@ def pull_event
                 match[4] = match[4][1..-2] # HREF
                 match.delete_at(5) if match.size > 5 # Chop out NDATA decl
                 # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
+              elsif Private::PEREFERENCE_PATTERN.match?(match[2])
+                raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
               else
                 match[2] = match[2][1..-2]
                 match.pop if match.size == 4
diff --git a/test/test_document.rb b/test/test_document.rb
index 72ec3579..25a8828f 100644
--- a/test/test_document.rb
+++ b/test/test_document.rb
@@ -147,56 +147,6 @@ def test_entity_expansion_text_limit
           assert_equal(90, doc.root.children.first.value.bytesize)
         end
       end
-
-      class ParameterEntityTest < self
-        def test_have_value
-          xml = <<EOF
-<!DOCTYPE root [
-  <!ENTITY % a "BOOM.BOOM.BOOM.BOOM.BOOM.BOOM.BOOM.BOOM.BOOM.">
-  <!ENTITY % b "%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;">
-  <!ENTITY % c "%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;">
-  <!ENTITY % d "%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;">
-  <!ENTITY % e "%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;">
-  <!ENTITY % f "%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;">
-  <!ENTITY % g "%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;">
-  <!ENTITY test "test %g;">
-]>
-<cd></cd>
-EOF
-
-          assert_raise(REXML::ParseException) do
-            REXML::Document.new(xml)
-          end
-          REXML::Security.entity_expansion_limit = 100
-          assert_equal(100, REXML::Security.entity_expansion_limit)
-          assert_raise(REXML::ParseException) do
-            REXML::Document.new(xml)
-          end
-        end
-
-        def test_empty_value
-          xml = <<EOF
-<!DOCTYPE root [
-  <!ENTITY % a "">
-  <!ENTITY % b "%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;%a;">
-  <!ENTITY % c "%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;%b;">
-  <!ENTITY % d "%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;%c;">
-  <!ENTITY % e "%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;%d;">
-  <!ENTITY % f "%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;%e;">
-  <!ENTITY % g "%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;%f;">
-  <!ENTITY test "test %g;">
-]>
-<cd></cd>
-EOF
-
-          REXML::Document.new(xml)
-          REXML::Security.entity_expansion_limit = 90
-          assert_equal(90, REXML::Security.entity_expansion_limit)
-          assert_raise(REXML::ParseException) do
-            REXML::Document.new(xml)
-          end
-        end
-      end
     end
 
     def test_tag_in_cdata_with_not_ascii_only_but_ascii8bit_encoding_source
diff --git a/test/test_entity.rb b/test/test_entity.rb
index a2b262f7..89f83894 100644
--- a/test/test_entity.rb
+++ b/test/test_entity.rb
@@ -59,8 +59,7 @@ def test_parse_entity
 
     def test_constructor
       one = [ %q{<!ENTITY % YN '"Yes"'>},
-        %q{<!ENTITY % YN2 "Yes">},
-        %q{<!ENTITY WhatHeSaid "He said %YN;">},
+        %q{<!ENTITY WhatHeSaid 'He said "Yes"'>},
         '<!ENTITY open-hatch
                 SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">',
         '<!ENTITY open-hatch2
@@ -71,8 +70,7 @@ def test_constructor
                 NDATA gif>' ]
       source = %q{<!DOCTYPE foo [
         <!ENTITY % YN '"Yes"'>
-        <!ENTITY % YN2 "Yes">
-        <!ENTITY WhatHeSaid "He said %YN;">
+        <!ENTITY WhatHeSaid 'He said "Yes"'>
         <!ENTITY open-hatch
                 SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">
         <!ENTITY open-hatch2
@@ -104,6 +102,84 @@ def test_replace_entities
       assert_equal source, out
     end
 
+    def test_readers_with_reference
+      entity = REXML::Entity.new([:entitydecl, "a", "B", "%"])
+      assert_equal([
+                     '<!ENTITY % a "B">',
+                     "a",
+                     "B",
+                     "B",
+                     "B",
+                   ],
+                   [
+                     entity.to_s,
+                     entity.name,
+                     entity.value,
+                     entity.normalized,
+                     entity.unnormalized,
+                   ])
+    end
+
+    def test_readers_without_reference
+      entity = REXML::Entity.new([:entitydecl, "a", "&b;"])
+      assert_equal([
+                     '<!ENTITY a "&b;">',
+                     "a",
+                     "&b;",
+                     "&b;",
+                     "&b;",
+                   ],
+                   [
+                     entity.to_s,
+                     entity.name,
+                     entity.value,
+                     entity.normalized,
+                     entity.unnormalized,
+                   ])
+    end
+
+    def test_readers_with_nested_references
+      doctype = REXML::DocType.new('root')
+      doctype.add(REXML::Entity.new([:entitydecl, "a", "&b;"]))
+      doctype.add(REXML::Entity.new([:entitydecl, "b", "X"]))
+      assert_equal([
+                     "a",
+                     "&b;",
+                     "&b;",
+                     "X",
+                     "b",
+                     "X",
+                     "X",
+                     "X",
+                  ],
+                   [
+                     doctype.entities["a"].name,
+                     doctype.entities["a"].value,
+                     doctype.entities["a"].normalized,
+                     doctype.entities["a"].unnormalized,
+                     doctype.entities["b"].name,
+                     doctype.entities["b"].value,
+                     doctype.entities["b"].normalized,
+                     doctype.entities["b"].unnormalized,
+                   ])
+    end
+
+    def test_parameter_entity_reference_forbidden_by_internal_subset_in_parser
+      source = '<!DOCTYPE root [ <!ENTITY % a "B" > <!ENTITY c "%a;" > ]><root/>'
+      parser = REXML::Parsers::BaseParser.new(source)
+      exception = assert_raise(REXML::ParseException) do
+        while parser.has_next?
+          parser.pull
+        end
+      end
+      assert_equal(<<-DETAIL, exception.to_s)
+Parameter entity references forbidden in internal subset: "%a;"
+Line: 1
+Position: 54
+Last 80 unconsumed characters:
+      DETAIL
+    end
+
     def test_entity_string_limit
       template = '<!DOCTYPE bomb [ <!ENTITY a "^" > ]> <bomb>$</bomb>'
       len      = 5120 # 5k per entity
@@ -122,22 +198,6 @@ def test_entity_string_limit
       end
     end
 
-    def test_entity_string_limit_for_parameter_entity
-      template = '<!DOCTYPE bomb [ <!ENTITY % a "^" > <!ENTITY bomb "$" > ]><root/>'
-      len      = 5120 # 5k per entity
-      template.sub!(/\^/, "B" * len)
-
-      # 10k is OK
-      entities = '%a;' * 2 # 5k entity * 2 = 10k
-      REXML::Document.new(template.sub(/\$/, entities))
-
-      # above 10k explodes
-      entities = '%a;' * 3 # 5k entity * 2 = 15k
-      assert_raise(REXML::ParseException) do
-        REXML::Document.new(template.sub(/\$/, entities))
-      end
-    end
-
     def test_raw
       source = '<!DOCTYPE foo [
 <!ENTITY ent "replace">
@@ -161,7 +221,7 @@ def test_lazy_evaluation
     def test_entity_replacement
       source = %q{<!DOCTYPE foo [
       <!ENTITY % YN '"Yes"'>
-      <!ENTITY WhatHeSaid "He said %YN;">]>
+      <!ENTITY WhatHeSaid 'He said "Yes"'>]>
       <a>&WhatHeSaid;</a>}
 
       d = REXML::Document.new( source )

From 790ad5c8530d1b6f6ad7445c085a7403119c5150 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sat, 17 Aug 2024 15:54:14 +0900
Subject: [PATCH 091/101] test: split duplicated attribute case and namespace
 conflict case

---
 test/test_core.rb | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/test/test_core.rb b/test/test_core.rb
index e1fba8a7..b079c203 100644
--- a/test/test_core.rb
+++ b/test/test_core.rb
@@ -114,22 +114,35 @@ def test_attribute
         name4='test4'/>).join(' '), e.to_s
     end
 
-    def test_attribute_namespace_conflict
+    def test_attribute_duplicated
       # https://www.w3.org/TR/xml-names/#uniqAttrs
       message = <<-MESSAGE.chomp
 Duplicate attribute "a"
-Line: 4
-Position: 140
+Line: 2
+Position: 24
 Last 80 unconsumed characters:
 />
       MESSAGE
       assert_raise(REXML::ParseException.new(message)) do
         Document.new(<<-XML)
+<x>
+  <bad a="1" a="2"/>
+</x>
+        XML
+      end
+    end
+
+    def test_attribute_namespace_conflict
+      # https://www.w3.org/TR/xml-names/#uniqAttrs
+      message = <<-MESSAGE.chomp
+Namespace conflict in adding attribute "a": Prefix "n1" = "http://www.w3.org" and prefix "n2" = "http://www.w3.org"
+      MESSAGE
+      assert_raise(REXML::ParseException.new(message)) do
+        Document.new(<<-XML)
 <!-- http://www.w3.org is bound to n1 and n2 -->
 <x xmlns:n1="http://www.w3.org"
-   xmlns:n2="http://www.w3.org" >
-  <bad a="1"     a="2" />
-  <bad n1:a="1"  n2:a="2" />
+   xmlns:n2="http://www.w3.org">
+  <bad n1:a="1" n2:a="2"/>
 </x>
         XML
       end

From 6422fa34494fd4145d7bc68fbbe9525d42becf62 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sat, 17 Aug 2024 16:10:05 +0900
Subject: [PATCH 092/101] Use loop instead of recursive call for Element#root

It's for performance and avoiding stack level too deep.
---
 lib/rexml/element.rb | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb
index a5808d7c..27132926 100644
--- a/lib/rexml/element.rb
+++ b/lib/rexml/element.rb
@@ -441,9 +441,14 @@ def root_node
     # Related: #root_node, #document.
     #
     def root
-      return elements[1] if self.kind_of? Document
-      return self if parent.kind_of? Document or parent.nil?
-      return parent.root
+      target = self
+      while target
+        return target.elements[1] if target.kind_of? Document
+        parent = target.parent
+        return target if parent.kind_of? Document or parent.nil?
+        target = parent
+      end
+      nil
     end
 
     # :call-seq:

From fdbffe744b38811be8b1cf6a9eec3eea4d71c412 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sat, 17 Aug 2024 16:14:19 +0900
Subject: [PATCH 093/101] Use loop instead of recursive call for
 Element#namespace

It's for performance and avoiding stack level too deep.
---
 lib/rexml/element.rb | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb
index 27132926..eb802165 100644
--- a/lib/rexml/element.rb
+++ b/lib/rexml/element.rb
@@ -624,8 +624,12 @@ def namespace(prefix=nil)
       else
         prefix = "xmlns:#{prefix}" unless prefix[0,5] == 'xmlns'
       end
-      ns = attributes[ prefix ]
-      ns = parent.namespace(prefix) if ns.nil? and parent
+      ns = nil
+      target = self
+      while ns.nil? and target
+        ns = target.attributes[prefix]
+        target = target.parent
+      end
       ns = '' if ns.nil? and prefix == 'xmlns'
       return ns
     end

From df3a0cc83013f3cde7b7c2044e3ce00bcad321cb Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sat, 17 Aug 2024 16:18:58 +0900
Subject: [PATCH 094/101] test: fix indent

---
 test/parser/test_sax2.rb        | 276 ++++++++++++++++----------------
 test/parser/test_tree.rb        |  48 +++---
 test/parser/test_ultra_light.rb |  94 +++++------
 3 files changed, 209 insertions(+), 209 deletions(-)

diff --git a/test/parser/test_sax2.rb b/test/parser/test_sax2.rb
index 91d135f5..9cc76ffb 100644
--- a/test/parser/test_sax2.rb
+++ b/test/parser/test_sax2.rb
@@ -4,200 +4,200 @@
 require "rexml/sax2listener"
 
 module REXMLTests
-class TestSAX2Parser < Test::Unit::TestCase
-  class TestDocumentTypeDeclaration < self
-    private
-    def xml(internal_subset)
-      <<-XML
+  class TestSAX2Parser < Test::Unit::TestCase
+    class TestDocumentTypeDeclaration < self
+      private
+      def xml(internal_subset)
+        <<-XML
 <!DOCTYPE r SYSTEM "urn:x-henrikmartensson:test" [
 #{internal_subset}
 ]>
 <r/>
-      XML
-    end
+        XML
+      end
 
-    class TestEntityDeclaration < self
-      class Listener
-        include REXML::SAX2Listener
-        attr_reader :entity_declarations
-        def initialize
-          @entity_declarations = []
-        end
+      class TestEntityDeclaration < self
+        class Listener
+          include REXML::SAX2Listener
+          attr_reader :entity_declarations
+          def initialize
+            @entity_declarations = []
+          end
 
-        def entitydecl(declaration)
-          super
-          @entity_declarations << declaration
+          def entitydecl(declaration)
+            super
+            @entity_declarations << declaration
+          end
         end
-      end
 
-      private
-      def parse(internal_subset)
-        listener = Listener.new
-        parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset))
-        parser.listen(listener)
-        parser.parse
-        listener.entity_declarations
-      end
+        private
+        def parse(internal_subset)
+          listener = Listener.new
+          parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset))
+          parser.listen(listener)
+          parser.parse
+          listener.entity_declarations
+        end
 
-      class TestGeneralEntity < self
-        class TestValue < self
-          def test_double_quote
-            assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET))
+        class TestGeneralEntity < self
+          class TestValue < self
+            def test_double_quote
+              assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET))
 <!ENTITY name "value">
-            INTERNAL_SUBSET
-          end
+              INTERNAL_SUBSET
+            end
 
-          def test_single_quote
-            assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET))
+            def test_single_quote
+              assert_equal([["name", "value"]], parse(<<-INTERNAL_SUBSET))
 <!ENTITY name 'value'>
-            INTERNAL_SUBSET
+              INTERNAL_SUBSET
+            end
           end
-        end
 
-        class TestExternlID < self
-          class TestSystem < self
-            def test_with_ndata
-              declaration = [
-                "name",
-                "SYSTEM", "system-literal",
-                "NDATA", "ndata-name",
-              ]
-              assert_equal([declaration],
-                           parse(<<-INTERNAL_SUBSET))
+          class TestExternlID < self
+            class TestSystem < self
+              def test_with_ndata
+                declaration = [
+                  "name",
+                  "SYSTEM", "system-literal",
+                  "NDATA", "ndata-name",
+                ]
+                assert_equal([declaration],
+                             parse(<<-INTERNAL_SUBSET))
 <!ENTITY name SYSTEM "system-literal" NDATA ndata-name>
+                INTERNAL_SUBSET
+              end
+
+              def test_without_ndata
+                declaration = [
+                  "name",
+                  "SYSTEM", "system-literal",
+                ]
+                assert_equal([declaration],
+                             parse(<<-INTERNAL_SUBSET))
+<!ENTITY name SYSTEM "system-literal">
+                INTERNAL_SUBSET
+              end
+            end
+
+            class TestPublic < self
+              def test_with_ndata
+                declaration = [
+                  "name",
+                  "PUBLIC", "public-literal", "system-literal",
+                  "NDATA", "ndata-name",
+                ]
+                assert_equal([declaration],
+                             parse(<<-INTERNAL_SUBSET))
+<!ENTITY name PUBLIC "public-literal" "system-literal" NDATA ndata-name>
+                INTERNAL_SUBSET
+              end
+
+              def test_without_ndata
+                declaration = [
+                  "name",
+                  "PUBLIC", "public-literal", "system-literal",
+                ]
+                assert_equal([declaration], parse(<<-INTERNAL_SUBSET))
+<!ENTITY name PUBLIC "public-literal" "system-literal">
+                INTERNAL_SUBSET
+              end
+            end
+          end
+        end
+
+        class TestParameterEntity < self
+          class TestValue < self
+            def test_double_quote
+              assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET))
+<!ENTITY % name "value">
               INTERNAL_SUBSET
             end
 
-            def test_without_ndata
-              declaration = [
-                "name",
-                "SYSTEM", "system-literal",
-              ]
-              assert_equal([declaration],
-                           parse(<<-INTERNAL_SUBSET))
-<!ENTITY name SYSTEM "system-literal">
+            def test_single_quote
+              assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET))
+<!ENTITY % name 'value'>
               INTERNAL_SUBSET
             end
           end
 
-          class TestPublic < self
-            def test_with_ndata
+          class TestExternlID < self
+            def test_system
               declaration = [
+                "%",
                 "name",
-                "PUBLIC", "public-literal", "system-literal",
-                "NDATA", "ndata-name",
+                "SYSTEM", "system-literal",
               ]
               assert_equal([declaration],
-                           parse(<<-INTERNAL_SUBSET))
-<!ENTITY name PUBLIC "public-literal" "system-literal" NDATA ndata-name>
+                             parse(<<-INTERNAL_SUBSET))
+<!ENTITY % name SYSTEM "system-literal">
               INTERNAL_SUBSET
             end
 
-            def test_without_ndata
+            def test_public
               declaration = [
+                "%",
                 "name",
                 "PUBLIC", "public-literal", "system-literal",
               ]
               assert_equal([declaration], parse(<<-INTERNAL_SUBSET))
-<!ENTITY name PUBLIC "public-literal" "system-literal">
+<!ENTITY % name PUBLIC "public-literal" "system-literal">
               INTERNAL_SUBSET
             end
           end
         end
       end
 
-      class TestParameterEntity < self
-        class TestValue < self
-          def test_double_quote
-            assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET))
-<!ENTITY % name "value">
-            INTERNAL_SUBSET
+      class TestNotationDeclaration < self
+        class Listener
+          include REXML::SAX2Listener
+          attr_reader :notation_declarations
+          def initialize
+            @notation_declarations = []
           end
 
-          def test_single_quote
-            assert_equal([["%", "name", "value"]], parse(<<-INTERNAL_SUBSET))
-<!ENTITY % name 'value'>
-            INTERNAL_SUBSET
+          def notationdecl(*declaration)
+            super
+            @notation_declarations << declaration
           end
         end
 
+        private
+        def parse(internal_subset)
+          listener = Listener.new
+          parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset))
+          parser.listen(listener)
+          parser.parse
+          listener.notation_declarations
+        end
+
         class TestExternlID < self
           def test_system
-            declaration = [
-              "%",
-              "name",
-              "SYSTEM", "system-literal",
-            ]
+            declaration = ["name", "SYSTEM", nil, "system-literal"]
             assert_equal([declaration],
-                           parse(<<-INTERNAL_SUBSET))
-<!ENTITY % name SYSTEM "system-literal">
-            INTERNAL_SUBSET
-          end
-
-          def test_public
-            declaration = [
-              "%",
-              "name",
-              "PUBLIC", "public-literal", "system-literal",
-            ]
-            assert_equal([declaration], parse(<<-INTERNAL_SUBSET))
-<!ENTITY % name PUBLIC "public-literal" "system-literal">
-            INTERNAL_SUBSET
-          end
-        end
-      end
-    end
-
-    class TestNotationDeclaration < self
-      class Listener
-        include REXML::SAX2Listener
-        attr_reader :notation_declarations
-        def initialize
-          @notation_declarations = []
-        end
-
-        def notationdecl(*declaration)
-          super
-          @notation_declarations << declaration
-        end
-      end
-
-      private
-      def parse(internal_subset)
-        listener = Listener.new
-        parser = REXML::Parsers::SAX2Parser.new(xml(internal_subset))
-        parser.listen(listener)
-        parser.parse
-        listener.notation_declarations
-      end
-
-      class TestExternlID < self
-        def test_system
-          declaration = ["name", "SYSTEM", nil, "system-literal"]
-          assert_equal([declaration],
-                       parse(<<-INTERNAL_SUBSET))
+                         parse(<<-INTERNAL_SUBSET))
 <!NOTATION name SYSTEM "system-literal">
-          INTERNAL_SUBSET
-        end
+           INTERNAL_SUBSET
+         end
 
-        def test_public
-          declaration = ["name", "PUBLIC", "public-literal", "system-literal"]
-          assert_equal([declaration], parse(<<-INTERNAL_SUBSET))
+         def test_public
+           declaration = ["name", "PUBLIC", "public-literal", "system-literal"]
+           assert_equal([declaration], parse(<<-INTERNAL_SUBSET))
 <!NOTATION name PUBLIC "public-literal" "system-literal">
-          INTERNAL_SUBSET
+            INTERNAL_SUBSET
+          end
         end
-      end
 
-      class TestPublicID < self
-        def test_literal
-          declaration = ["name", "PUBLIC", "public-literal", nil]
-          assert_equal([declaration],
-                       parse(<<-INTERNAL_SUBSET))
+        class TestPublicID < self
+          def test_literal
+            declaration = ["name", "PUBLIC", "public-literal", nil]
+            assert_equal([declaration],
+                         parse(<<-INTERNAL_SUBSET))
 <!NOTATION name PUBLIC "public-literal">
-          INTERNAL_SUBSET
+            INTERNAL_SUBSET
+          end
         end
       end
     end
   end
 end
-end
diff --git a/test/parser/test_tree.rb b/test/parser/test_tree.rb
index 8a5d9d12..88bc075c 100644
--- a/test/parser/test_tree.rb
+++ b/test/parser/test_tree.rb
@@ -4,40 +4,40 @@
 require "rexml/parsers/treeparser"
 
 module REXMLTests
-class TestTreeParser < Test::Unit::TestCase
-  class TestInvalid < self
-    def test_unmatched_close_tag
-      xml = "<root></not-root>"
-      exception = assert_raise(REXML::ParseException) do
-        parse(xml)
-      end
-      assert_equal(<<-MESSAGE, exception.to_s)
+  class TestTreeParser < Test::Unit::TestCase
+    class TestInvalid < self
+      def test_unmatched_close_tag
+        xml = "<root></not-root>"
+        exception = assert_raise(REXML::ParseException) do
+          parse(xml)
+        end
+        assert_equal(<<-MESSAGE, exception.to_s)
 Missing end tag for 'root' (got 'not-root')
 Line: 1
 Position: #{xml.bytesize}
 Last 80 unconsumed characters:
-      MESSAGE
-    end
-
-    def test_no_close_tag
-      xml = "<root>"
-      exception = assert_raise(REXML::ParseException) do
-        parse(xml)
+        MESSAGE
       end
-      assert_equal(<<-MESSAGE, exception.to_s)
+
+      def test_no_close_tag
+        xml = "<root>"
+        exception = assert_raise(REXML::ParseException) do
+          parse(xml)
+        end
+        assert_equal(<<-MESSAGE, exception.to_s)
 No close tag for /root
 Line: 1
 Position: #{xml.bytesize}
 Last 80 unconsumed characters:
-      MESSAGE
-    end
+        MESSAGE
+      end
 
-    private
-    def parse(xml)
-      document = REXML::Document.new
-      parser = REXML::Parsers::TreeParser.new(xml, document)
-      parser.parse
+      private
+      def parse(xml)
+        document = REXML::Document.new
+        parser = REXML::Parsers::TreeParser.new(xml, document)
+        parser.parse
+      end
     end
   end
 end
-end
diff --git a/test/parser/test_ultra_light.rb b/test/parser/test_ultra_light.rb
index b3f576ff..d1364d6a 100644
--- a/test/parser/test_ultra_light.rb
+++ b/test/parser/test_ultra_light.rb
@@ -3,66 +3,66 @@
 require "rexml/parsers/ultralightparser"
 
 module REXMLTests
-class TestUltraLightParser < Test::Unit::TestCase
-  class TestDocumentTypeDeclaration < self
-    def test_entity_declaration
-      assert_equal([
-                     [
-                       :start_doctype,
-                       :parent,
-                       "root",
-                       "SYSTEM",
-                       "urn:x-test",
-                       nil,
-                       [:entitydecl, "name", "value"]
+  class TestUltraLightParser < Test::Unit::TestCase
+    class TestDocumentTypeDeclaration < self
+      def test_entity_declaration
+        assert_equal([
+                       [
+                         :start_doctype,
+                         :parent,
+                         "root",
+                         "SYSTEM",
+                         "urn:x-test",
+                         nil,
+                         [:entitydecl, "name", "value"]
+                       ],
+                       [:start_element, :parent, "root", {}],
                      ],
-                     [:start_element, :parent, "root", {}],
-                   ],
-                   parse(<<-INTERNAL_SUBSET))
+                     parse(<<-INTERNAL_SUBSET))
 <!ENTITY name "value">
-      INTERNAL_SUBSET
-    end
+        INTERNAL_SUBSET
+      end
 
-    private
-    def xml(internal_subset)
-      <<-XML
+      private
+      def xml(internal_subset)
+        <<-XML
 <!DOCTYPE root SYSTEM "urn:x-test" [
 #{internal_subset}
 ]>
 <root/>
-      XML
-    end
+        XML
+      end
 
-    def parse(internal_subset)
-      parser = REXML::Parsers::UltraLightParser.new(xml(internal_subset))
-      normalize(parser.parse)
-    end
+      def parse(internal_subset)
+        parser = REXML::Parsers::UltraLightParser.new(xml(internal_subset))
+        normalize(parser.parse)
+      end
 
-    def normalize(root)
-      root.collect do |child|
-        normalize_child(child)
+      def normalize(root)
+        root.collect do |child|
+          normalize_child(child)
+        end
       end
-    end
 
-    def normalize_child(child)
-      tag = child.first
-      case tag
-      when :start_doctype
-        normalized_parent = :parent
-        normalized_doctype = child.dup
-        normalized_doctype[1] = normalized_parent
-        normalized_doctype
-      when :start_element
-        tag, _parent, name, attributes, *children = child
-        normalized_parent = :parent
-        normalized_children = children.collect do |sub_child|
-          normalize_child(sub_child)
+      def normalize_child(child)
+        tag = child.first
+        case tag
+        when :start_doctype
+          normalized_parent = :parent
+          normalized_doctype = child.dup
+          normalized_doctype[1] = normalized_parent
+          normalized_doctype
+        when :start_element
+          tag, _parent, name, attributes, *children = child
+          normalized_parent = :parent
+          normalized_children = children.collect do |sub_child|
+            normalize_child(sub_child)
+          end
+          [tag, normalized_parent, name, attributes, *normalized_children]
+        else
+          child
         end
-        [tag, normalized_parent, name, attributes, *normalized_children]
-      else
-        child
       end
     end
   end
 end
-end

From 6e00a14daf2f901df535eafe96cc94d43a957ffe Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sat, 17 Aug 2024 16:20:50 +0900
Subject: [PATCH 095/101] test: fix indent

---
 test/parser/test_sax2.rb | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/parser/test_sax2.rb b/test/parser/test_sax2.rb
index 9cc76ffb..c2548907 100644
--- a/test/parser/test_sax2.rb
+++ b/test/parser/test_sax2.rb
@@ -177,12 +177,12 @@ def test_system
             assert_equal([declaration],
                          parse(<<-INTERNAL_SUBSET))
 <!NOTATION name SYSTEM "system-literal">
-           INTERNAL_SUBSET
-         end
+            INTERNAL_SUBSET
+          end
 
-         def test_public
-           declaration = ["name", "PUBLIC", "public-literal", "system-literal"]
-           assert_equal([declaration], parse(<<-INTERNAL_SUBSET))
+          def test_public
+            declaration = ["name", "PUBLIC", "public-literal", "system-literal"]
+            assert_equal([declaration], parse(<<-INTERNAL_SUBSET))
 <!NOTATION name PUBLIC "public-literal" "system-literal">
             INTERNAL_SUBSET
           end

From 35e1681a179c28d5b6ec97d4ab1c110e5ac00303 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sat, 17 Aug 2024 16:21:19 +0900
Subject: [PATCH 096/101] test tree-parser: move common method to base class

---
 test/parser/test_tree.rb | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/test/parser/test_tree.rb b/test/parser/test_tree.rb
index 88bc075c..cdd28d2c 100644
--- a/test/parser/test_tree.rb
+++ b/test/parser/test_tree.rb
@@ -5,6 +5,12 @@
 
 module REXMLTests
   class TestTreeParser < Test::Unit::TestCase
+    private def parse(xml)
+      document = REXML::Document.new
+      parser = REXML::Parsers::TreeParser.new(xml, document)
+      parser.parse
+    end
+
     class TestInvalid < self
       def test_unmatched_close_tag
         xml = "<root></not-root>"
@@ -31,13 +37,6 @@ def test_no_close_tag
 Last 80 unconsumed characters:
         MESSAGE
       end
-
-      private
-      def parse(xml)
-        document = REXML::Document.new
-        parser = REXML::Parsers::TreeParser.new(xml, document)
-        parser.parse
-      end
     end
   end
 end

From 2b47b161db19c38c5e45e36c2008c045543e976e Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sat, 17 Aug 2024 17:05:29 +0900
Subject: [PATCH 097/101] parser: move duplicated end tag check to BaseParser

---
 lib/rexml/parsers/baseparser.rb   | 4 ++++
 lib/rexml/parsers/streamparser.rb | 8 --------
 lib/rexml/parsers/treeparser.rb   | 7 -------
 test/parser/test_tree.rb          | 2 +-
 4 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index e38ff86e..093af36a 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -249,6 +249,10 @@ def pull_event
           if @document_status == :in_doctype
             raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
           end
+          unless @tags.empty?
+            path = "/" + @tags.join("/")
+            raise ParseException.new("Missing end tag for '#{path}'", @source)
+          end
           return [ :end_document ]
         end
         return @stack.shift if @stack.size > 0
diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb
index fa3ac496..e2da2a7d 100644
--- a/lib/rexml/parsers/streamparser.rb
+++ b/lib/rexml/parsers/streamparser.rb
@@ -7,7 +7,6 @@ class StreamParser
       def initialize source, listener
         @listener = listener
         @parser = BaseParser.new( source )
-        @tag_stack = []
       end
 
       def add_listener( listener )
@@ -20,21 +19,14 @@ def parse
           event = @parser.pull
           case event[0]
           when :end_document
-            unless @tag_stack.empty?
-              tag_path = "/" + @tag_stack.join("/")
-              raise ParseException.new("Missing end tag for '#{tag_path}'",
-                                       @parser.source)
-            end
             return
           when :start_element
-            @tag_stack << event[1]
             attrs = event[2].each do |n, v|
               event[2][n] = @parser.unnormalize( v )
             end
             @listener.tag_start( event[1], attrs )
           when :end_element
             @listener.tag_end( event[1] )
-            @tag_stack.pop
           when :text
             unnormalized = @parser.unnormalize( event[1] )
             @listener.text( unnormalized )
diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb
index 0cb6f7cc..4565a406 100644
--- a/lib/rexml/parsers/treeparser.rb
+++ b/lib/rexml/parsers/treeparser.rb
@@ -15,7 +15,6 @@ def add_listener( listener )
       end
 
       def parse
-        tag_stack = []
         entities = nil
         begin
           while true
@@ -23,19 +22,13 @@ def parse
             #STDERR.puts "TREEPARSER GOT #{event.inspect}"
             case event[0]
             when :end_document
-              unless tag_stack.empty?
-                raise ParseException.new("No close tag for #{@build_context.xpath}",
-                                         @parser.source, @parser)
-              end
               return
             when :start_element
-              tag_stack.push(event[1])
               el = @build_context = @build_context.add_element( event[1] )
               event[2].each do |key, value|
                 el.attributes[key]=Attribute.new(key,value,self)
               end
             when :end_element
-              tag_stack.pop
               @build_context = @build_context.parent
             when :text
               if @build_context[-1].instance_of? Text
diff --git a/test/parser/test_tree.rb b/test/parser/test_tree.rb
index cdd28d2c..315be9c2 100644
--- a/test/parser/test_tree.rb
+++ b/test/parser/test_tree.rb
@@ -31,7 +31,7 @@ def test_no_close_tag
           parse(xml)
         end
         assert_equal(<<-MESSAGE, exception.to_s)
-No close tag for /root
+Missing end tag for '/root'
 Line: 1
 Position: #{xml.bytesize}
 Last 80 unconsumed characters:

From cb158582f18cebb3bf7b3f21f230e2fb17d435aa Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sat, 17 Aug 2024 17:39:14 +0900
Subject: [PATCH 098/101] parser: keep the current namespaces instead of stack
 of Set

It improves namespace resolution performance for deep element.
---
 lib/rexml/parsers/baseparser.rb | 45 +++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 093af36a..9ed032d3 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -181,7 +181,8 @@ def stream=( source )
         @tags = []
         @stack = []
         @entities = []
-        @nsstack = []
+        @namespaces = {}
+        @namespaces_restore_stack = []
       end
 
       def position
@@ -285,7 +286,6 @@ def pull_event
                 @source.position = start_position
                 raise REXML::ParseException.new(message, @source)
               end
-              @nsstack.unshift(Set.new)
               name = parse_name(base_error_message)
               if @source.match(/\s*\[/um, true)
                 id = [nil, nil, nil]
@@ -379,7 +379,7 @@ def pull_event
                   val = attdef[4] if val == "#FIXED "
                   pairs[attdef[0]] = val
                   if attdef[0] =~ /^xmlns:(.*)/
-                    @nsstack[0] << $1
+                    @namespaces[$1] = val
                   end
                 end
               end
@@ -432,7 +432,7 @@ def pull_event
             # here explicitly.
             @source.ensure_buffer
             if @source.match("/", true)
-              @nsstack.shift
+              @namespaces_restore_stack.pop
               last_tag = @tags.pop
               md = @source.match(Private::CLOSE_PATTERN, true)
               if md and !last_tag
@@ -477,18 +477,18 @@ def pull_event
               @document_status = :in_element
               @prefixes.clear
               @prefixes << md[2] if md[2]
-              @nsstack.unshift(curr_ns=Set.new)
-              attributes, closed = parse_attributes(@prefixes, curr_ns)
+              push_namespaces_restore
+              attributes, closed = parse_attributes(@prefixes)
               # Verify that all of the prefixes have been defined
               for prefix in @prefixes
-                unless @nsstack.find{|k| k.member?(prefix)}
+                unless @namespaces.key?(prefix)
                   raise UndefinedNamespaceException.new(prefix,@source,self)
                 end
               end
 
               if closed
                 @closed = tag
-                @nsstack.shift
+                pop_namespaces_restore
               else
                 if @tags.empty? and @have_root
                   raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
@@ -599,6 +599,31 @@ def unnormalize( string, entities=nil, filter=nil )
       end
 
       private
+      def add_namespace(prefix, uri)
+        @namespaces_restore_stack.last[prefix] = @namespaces[prefix]
+        if uri.nil?
+          @namespaces.delete(prefix)
+        else
+          @namespaces[prefix] = uri
+        end
+      end
+
+      def push_namespaces_restore
+        namespaces_restore = {}
+        @namespaces_restore_stack.push(namespaces_restore)
+        namespaces_restore
+      end
+
+      def pop_namespaces_restore
+        namespaces_restore = @namespaces_restore_stack.pop
+        namespaces_restore.each do |prefix, uri|
+          if uri.nil?
+            @namespaces.delete(prefix)
+          else
+            @namespaces[prefix] = uri
+          end
+        end
+      end
 
       def record_entity_expansion(delta=1)
         @entity_expansion_count += delta
@@ -727,7 +752,7 @@ def process_instruction
         [:processing_instruction, name, content]
       end
 
-      def parse_attributes(prefixes, curr_ns)
+      def parse_attributes(prefixes)
         attributes = {}
         closed = false
         while true
@@ -770,7 +795,7 @@ def parse_attributes(prefixes, curr_ns)
                   "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                 raise REXML::ParseException.new( msg, @source, self)
               end
-              curr_ns << local_part
+              add_namespace(local_part, value)
             elsif prefix
               prefixes << prefix unless prefix == "xml"
             end

From 6109e0183cecf4f8b587d76209716cb1bbcd6bd5 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Wed, 21 Aug 2024 15:23:00 +0900
Subject: [PATCH 099/101] Fix a bug that Stream parser doesn't expand the
 user-defined entity references for "text" (#200)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Why?
Pull parser expands character references and predefined entity
references, but doesn't expand user-defined entity references.

## Change
- text_stream_unnormalize.rb
```
$LOAD_PATH.unshift(File.expand_path("lib"))
require 'rexml/document'
require 'rexml/parsers/sax2parser'
require 'rexml/parsers/pullparser'
require 'rexml/parsers/streamparser'
require 'rexml/streamlistener'

xml = <<EOS
<!DOCTYPE foo [
  <!ENTITY la "1234">
  <!ENTITY lala "--&la;--">
  <!ENTITY lalal "&la;&la;">
]><root><la>&la;</la><lala>&lala;</lala><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt; &lt;/I&gt;</a><b>test&#8482;</b></root>
EOS

class StListener
  include REXML::StreamListener

  def text(text)
    puts text
  end
end

puts "REXML(DOM)"
REXML::Document.new(xml).elements.each("/root/*") {|element| puts element.text}

puts ""
puts "REXML(Pull)"
parser = REXML::Parsers::PullParser.new(xml)
while parser.has_next?
  event = parser.pull
  case event.event_type
  when :text
    puts event[1]
  end
end

puts ""
puts "REXML(Stream)"
parser = REXML::Parsers::StreamParser.new(xml, StListener.new).parse

puts ""
puts "REXML(SAX)"
sax = REXML::Parsers::SAX2Parser.new(xml)
sax.listen(:characters) {|x| puts x }
sax.parse
```

## Before (master)
```
$ ruby text_stream_unnormalize.rb
REXML(DOM)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™

REXML(Pull)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™

REXML(Stream)
&la;      #<= This
&lala;    #<= This
<P> <I> <B> Text </B> </I>
test™

REXML(SAX)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™
```

## After(This PR)

```
$ ruby text_stream_unnormalize.rb
REXML(DOM)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™

REXML(Pull)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™

REXML(Stream)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™

REXML(SAX)
1234
--1234--
<P> <I> <B> Text </B> </I>
test™
```
---
 lib/rexml/parsers/streamparser.rb |   8 +-
 test/test_stream.rb               | 141 +++++++++++++++++++++++++++++-
 2 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb
index e2da2a7d..7781fe44 100644
--- a/lib/rexml/parsers/streamparser.rb
+++ b/lib/rexml/parsers/streamparser.rb
@@ -7,12 +7,17 @@ class StreamParser
       def initialize source, listener
         @listener = listener
         @parser = BaseParser.new( source )
+        @entities = {}
       end
 
       def add_listener( listener )
         @parser.add_listener( listener )
       end
 
+      def entity_expansion_count
+        @parser.entity_expansion_count
+      end
+
       def parse
         # entity string
         while true
@@ -28,7 +33,7 @@ def parse
           when :end_element
             @listener.tag_end( event[1] )
           when :text
-            unnormalized = @parser.unnormalize( event[1] )
+            unnormalized = @parser.unnormalize( event[1], @entities )
             @listener.text( unnormalized )
           when :processing_instruction
             @listener.instruction( *event[1,2] )
@@ -40,6 +45,7 @@ def parse
           when :comment, :attlistdecl, :cdata, :xmldecl, :elementdecl
             @listener.send( event[0].to_s, *event[1..-1] )
           when :entitydecl, :notationdecl
+            @entities[ event[1] ] = event[2] if event.size == 3
             @listener.send( event[0].to_s, event[1..-1] )
           when :externalentity
             entity_reference = event[1]
diff --git a/test/test_stream.rb b/test/test_stream.rb
index 615d497f..782066c2 100644
--- a/test/test_stream.rb
+++ b/test/test_stream.rb
@@ -87,6 +87,42 @@ def entity(content)
 
       assert_equal(["ISOLat2"], listener.entities)
     end
+
+    def test_entity_replacement
+      source = <<-XML
+<!DOCTYPE foo [
+  <!ENTITY la "1234">
+  <!ENTITY lala "--&la;--">
+  <!ENTITY lalal "&la;&la;">
+]><a><la>&la;</la><lala>&lala;</lala></a>
+      XML
+
+      listener = MyListener.new
+      class << listener
+        attr_accessor :text_values
+        def text(text)
+          @text_values << text
+        end
+      end
+      listener.text_values = []
+      REXML::Document.parse_stream(source, listener)
+      assert_equal(["1234", "--1234--"], listener.text_values)
+    end
+
+    def test_characters_predefined_entities
+      source = '<root><a>&lt;P&gt; &lt;I&gt; &lt;B&gt; Text &lt;/B&gt;  &lt;/I&gt;</a></root>'
+
+      listener = MyListener.new
+      class << listener
+        attr_accessor :text_value
+        def text(text)
+          @text_value << text
+        end
+      end
+      listener.text_value = ""
+      REXML::Document.parse_stream(source, listener)
+      assert_equal("<P> <I> <B> Text </B>  </I>", listener.text_value)
+    end
   end
 
   class EntityExpansionLimitTest < Test::Unit::TestCase
@@ -100,6 +136,81 @@ def teardown
       REXML::Security.entity_expansion_text_limit = @default_entity_expansion_text_limit
     end
 
+    def test_have_value
+      source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE member [
+  <!ENTITY a "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
+  <!ENTITY b "&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;">
+  <!ENTITY c "&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;">
+  <!ENTITY d "&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;">
+  <!ENTITY e "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx">
+]>
+<member>
+&a;
+</member>
+      XML
+
+      assert_raise(RuntimeError.new("entity expansion has grown too large")) do
+        REXML::Document.parse_stream(source, MyListener.new)
+      end
+    end
+
+    def test_empty_value
+      source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE member [
+  <!ENTITY a "&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;">
+  <!ENTITY b "&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;">
+  <!ENTITY c "&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;">
+  <!ENTITY d "&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;">
+  <!ENTITY e "">
+]>
+<member>
+&a;
+</member>
+      XML
+
+      listener = MyListener.new
+      REXML::Security.entity_expansion_limit = 100000
+      parser = REXML::Parsers::StreamParser.new( source, listener )
+      parser.parse
+      assert_equal(11111, parser.entity_expansion_count)
+
+      REXML::Security.entity_expansion_limit = @default_entity_expansion_limit
+      parser = REXML::Parsers::StreamParser.new( source, listener )
+      assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
+        parser.parse
+      end
+      assert do
+        parser.entity_expansion_count > @default_entity_expansion_limit
+      end
+    end
+
+    def test_with_default_entity
+      source = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE member [
+  <!ENTITY a "a">
+  <!ENTITY a2 "&a; &a;">
+]>
+<member>
+&a;
+&a2;
+&lt;
+</member>
+      XML
+
+      listener = MyListener.new
+      REXML::Security.entity_expansion_limit = 4
+      REXML::Document.parse_stream(source, listener)
+
+      REXML::Security.entity_expansion_limit = 3
+      assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do
+        REXML::Document.parse_stream(source, listener)
+      end
+    end
+
     def test_with_only_default_entities
       member_value = "&lt;p&gt;#{'A' * @default_entity_expansion_text_limit}&lt;/p&gt;"
       source = <<-XML
@@ -117,14 +228,42 @@ def text(text)
         end
       end
       listener.text_value = ""
-      REXML::Document.parse_stream(source, listener)
+      parser = REXML::Parsers::StreamParser.new( source, listener )
+      parser.parse
 
       expected_value = "<p>#{'A' * @default_entity_expansion_text_limit}</p>"
       assert_equal(expected_value, listener.text_value.strip)
+      assert_equal(0, parser.entity_expansion_count)
       assert do
         listener.text_value.bytesize > @default_entity_expansion_text_limit
       end
     end
+
+    def test_entity_expansion_text_limit
+      source = <<-XML
+<!DOCTYPE member [
+  <!ENTITY a "&b;&b;&b;">
+  <!ENTITY b "&c;&d;&e;">
+  <!ENTITY c "xxxxxxxxxx">
+  <!ENTITY d "yyyyyyyyyy">
+  <!ENTITY e "zzzzzzzzzz">
+]>
+<member>&a;</member>
+      XML
+
+      listener = MyListener.new
+      class << listener
+        attr_accessor :text_value
+        def text(text)
+          @text_value << text
+        end
+      end
+      listener.text_value = ""
+      REXML::Security.entity_expansion_text_limit = 90
+      REXML::Document.parse_stream(source, listener)
+
+      assert_equal(90, listener.text_value.size)
+    end
   end
 
   # For test_listener

From 7cb5eaeb221c322b9912f724183294d8ce96bae3 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Sat, 17 Aug 2024 17:45:52 +0900
Subject: [PATCH 100/101] parser tree: improve namespace conflicted attribute
 check performance

It was slow for deep element.

Reported by l33thaxor. Thanks!!!
---
 lib/rexml/element.rb            | 11 -----------
 lib/rexml/parsers/baseparser.rb | 15 +++++++++++++++
 test/parse/test_element.rb      | 14 ++++++++++++++
 test/test_core.rb               |  4 ++++
 4 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb
index eb802165..4e3a60b9 100644
--- a/lib/rexml/element.rb
+++ b/lib/rexml/element.rb
@@ -2384,17 +2384,6 @@ def []=( name, value )
       elsif old_attr.kind_of? Hash
         old_attr[value.prefix] = value
       elsif old_attr.prefix != value.prefix
-        # Check for conflicting namespaces
-        if value.prefix != "xmlns" and old_attr.prefix != "xmlns"
-          old_namespace = old_attr.namespace
-          new_namespace = value.namespace
-          if old_namespace == new_namespace
-            raise ParseException.new(
-                    "Namespace conflict in adding attribute \"#{value.name}\": "+
-                    "Prefix \"#{old_attr.prefix}\" = \"#{old_namespace}\" and "+
-                    "prefix \"#{value.prefix}\" = \"#{new_namespace}\"")
-          end
-        end
         store value.name, {old_attr.prefix => old_attr,
                            value.prefix    => value}
       else
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 9ed032d3..d11c2766 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -754,6 +754,7 @@ def process_instruction
 
       def parse_attributes(prefixes)
         attributes = {}
+        expanded_names = {}
         closed = false
         while true
           if @source.match(">", true)
@@ -805,6 +806,20 @@ def parse_attributes(prefixes)
               raise REXML::ParseException.new(msg, @source, self)
             end
 
+            unless prefix == "xmlns"
+              uri = @namespaces[prefix]
+              expanded_name = [uri, local_part]
+              existing_prefix = expanded_names[expanded_name]
+              if existing_prefix
+                message = "Namespace conflict in adding attribute " +
+                          "\"#{local_part}\": " +
+                          "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
+                          "prefix \"#{prefix}\" = \"#{uri}\""
+                raise REXML::ParseException.new(message, @source, self)
+              end
+              expanded_names[expanded_name] = prefix
+            end
+
             attributes[name] = value
           else
             message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb
index 2b0746ea..ab4818da 100644
--- a/test/parse/test_element.rb
+++ b/test/parse/test_element.rb
@@ -131,5 +131,19 @@ def test_linear_performance_attribute_value_gt
         REXML::Document.new('<test testing="' + ">" * n + '"></test>')
       end
     end
+
+    def test_linear_performance_deep_same_name_attributes
+      seq = [100, 500, 1000, 1500, 2000]
+      assert_linear_performance(seq, rehearsal: 10) do |n|
+        xml = <<-XML
+<?xml version="1.0"?>
+<root xmlns:ns="ns-uri">
+#{"<x ns:name='ns-value' name='value'>\n" * n}
+#{"</x>\n" * n}
+</root>
+        XML
+        REXML::Document.new(xml)
+      end
+    end
   end
 end
diff --git a/test/test_core.rb b/test/test_core.rb
index b079c203..48666c86 100644
--- a/test/test_core.rb
+++ b/test/test_core.rb
@@ -136,6 +136,10 @@ def test_attribute_namespace_conflict
       # https://www.w3.org/TR/xml-names/#uniqAttrs
       message = <<-MESSAGE.chomp
 Namespace conflict in adding attribute "a": Prefix "n1" = "http://www.w3.org" and prefix "n2" = "http://www.w3.org"
+Line: 4
+Position: 140
+Last 80 unconsumed characters:
+/>
       MESSAGE
       assert_raise(REXML::ParseException.new(message)) do
         Document.new(<<-XML)

From 95871f399eda642a022b03550479b7994895c742 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Thu, 22 Aug 2024 09:54:49 +0900
Subject: [PATCH 101/101] Add 3.3.6 entry

---
 NEWS.md | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 165b1c76..6c290678 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,44 @@
 # News
 
+## 3.3.6 - 2024-08-22 {#version-3-3-6}
+
+### Improvements
+
+  * Removed duplicated entity expansions for performance.
+    * GH-194
+    * Patch by Viktor Ivarsson.
+
+  * Improved namespace conflicted attribute check performance. It was
+    too slow for deep elements.
+    * Reported by l33thaxor.
+
+### Fixes
+
+  * Fixed a bug that default entity expansions are counted for
+    security check. Default entity expansions should not be counted
+    because they don't have a security risk.
+    * GH-198
+    * GH-199
+    * Patch Viktor Ivarsson
+
+  * Fixed a parser bug that parameter entity references in internal
+    subsets are expanded. It's not allowed in the XML specification.
+    * GH-191
+    * Patch by NAITOH Jun.
+
+  * Fixed a stream parser bug that user-defined entity references in
+    text aren't expanded.
+    * GH-200
+    * Patch by NAITOH Jun.
+
+### Thanks
+
+  * Viktor Ivarsson
+
+  * NAITOH Jun
+
+  * l33thaxor
+
 ## 3.3.5 - 2024-08-12 {#version-3-3-5}
 
 ### Fixes